import os
import pickle

import faiss
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
from urllib.parse import parse_qs, quote, unquote, urlparse


class WebSearchHelper:
    def __init__(self, kb_path="web_kb.index", meta_path="web_kb_meta.pkl"):
        self.embedder = SentenceTransformer("all-MiniLM-L6-v2")
        self.kb_path = kb_path
        self.meta_path = meta_path
        self.meta = []
        self.index = None
        self._load_index()

    def _load_index(self):
        # Reload a persisted index and its metadata, or start a fresh one.
        # 384 is the embedding dimension of all-MiniLM-L6-v2.
        if os.path.exists(self.kb_path):
            self.index = faiss.read_index(self.kb_path)
            with open(self.meta_path, "rb") as f:
                self.meta = pickle.load(f)
        else:
            self.index = faiss.IndexFlatL2(384)

    def _save_index(self):
        faiss.write_index(self.index, self.kb_path)
        with open(self.meta_path, "wb") as f:
            pickle.dump(self.meta, f)

    def search_duckduckgo(self, query, num=5):
        # Scrape the DuckDuckGo Lite HTML endpoint and pull the real target
        # URLs out of its /l/?uddg=... redirect links. Filter first, then
        # count, so navigation links don't eat into the result budget.
        results = []
        q = quote(query)
        headers = {"User-Agent": "Mozilla/5.0"}
        url = f"https://lite.duckduckgo.com/lite?q={q}"
        print(url)
        res = requests.get(url, headers=headers, timeout=10)
        soup = BeautifulSoup(res.text, "html.parser")
        for link in soup.find_all("a", href=True):
            parsed = urlparse(link["href"])
            if parsed.path.startswith("/l/"):
                qs = parse_qs(parsed.query)
                actual_url = unquote(qs.get("uddg", [""])[0])
                if actual_url:
                    results.append(actual_url)
            if len(results) >= num:
                break
        return results

    def crawl_and_summarize(self, urls, llm_function):
        # Fetch each page, strip the HTML to plain text, truncate, and let
        # the caller-supplied llm_function produce a summary.
        summaries = []
        for url in urls:
            try:
                print(f"[crawling] {url}")
                html = requests.get(url, timeout=5).text
                text = BeautifulSoup(html, "html.parser").get_text()
                clean = " ".join(text.strip().split()[:1000])  # truncate to ~1000 words
                summary = llm_function(clean)
                summaries.append((url, summary))
            except Exception as e:
                print(f"[crawl error] {url} -> {e}")
        return summaries

    def add_to_kb(self, summaries):
        for url, content in summaries:
            vec = self.embedder.encode([content])
            self.index.add(vec)
            self.meta.append({"url": url, "summary": content})
        self._save_index()

    def query_kb(self, text, top_k=3):
        if self.index.ntotal == 0:
            return [], []
        vec = self.embedder.encode([text])
        D, I = self.index.search(vec, top_k)
        # FAISS pads missing results with -1, so guard the metadata lookup.
        results = [self.meta[i] for i in I[0] if 0 <= i < len(self.meta)]
        return D[0], results

    def should_trigger_search(self, score_threshold=0.7, text=""):
        # Decide whether the KB already covers this query. Scores are L2
        # distances, so smaller means closer; trigger a web search when the
        # nearest stored summary is farther away than the threshold.
        if self.index.ntotal == 0:
            return True
        scores, _ = self.query_kb(text, top_k=1)
        if len(scores) == 0:
            return True
        return scores[0] > score_threshold
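
# A minimal usage sketch of the class above. `dummy_summarize` is a
# hypothetical stand-in for a real LLM call (an API client or local model
# would go there); the question string is illustrative. Assumes network
# access for the search and crawl steps.
if __name__ == "__main__":
    helper = WebSearchHelper()

    def dummy_summarize(text):
        # Placeholder "summarizer": a real setup would prompt an LLM here.
        return text[:300]

    question = "what is retrieval-augmented generation"
    if helper.should_trigger_search(text=question):
        urls = helper.search_duckduckgo(question, num=3)
        summaries = helper.crawl_and_summarize(urls, dummy_summarize)
        helper.add_to_kb(summaries)

    distances, hits = helper.query_kb(question)
    for dist, hit in zip(distances, hits):
        print(f"{dist:.3f} {hit['url']}\n{hit['summary'][:120]}")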