diff --git a/agent.py b/agent.py
index b941c6f..e8dcf10 100644
--- a/agent.py
+++ b/agent.py
@@ -1,108 +1,112 @@
 import time
-begin_time = time.time()
-
 import torch
 from transformers import pipeline, AutoTokenizer
 from memory import Memory
 from web_search_helper import WebSearchHelper
-from llm_wrapper import LlmWrapper
 
-# Initialize components
-memory = Memory()
-searcher = WebSearchHelper()
-summarizer = LlmWrapper(model_name="Qwen/Qwen3-0.6B")  # optional, could summarize search results
+# Initialize clock
+begin_time = time.time()
 
-tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
-# Load your main LLM (Llama 3.2:1B-Instruct)
+# 🔧 Load model and tokenizer (Llama3.2:1B)
 model_id = "meta-llama/Llama-3.2-1B-Instruct"
+tokenizer = AutoTokenizer.from_pretrained(model_id)
 pipe = pipeline(
     "text-generation",
     model=model_id,
     torch_dtype=torch.bfloat16,
     device_map="auto",
+    pad_token_id=128001  # Suppress warnings
 )
 
-# Define system prompt and Kshama's capabilities
+# 🧩 Agent components
+memory = Memory()
+searcher = WebSearchHelper()
+
+# 🧭 System prompt (Kshama's persona + capabilities)
 SYSTEM_PROMPT = """
 You are ক্ষমা, Abu's personal AI assistant.
 You're helpful, respectful, and aligned with his goals and preferences.
-You maintain and query a persistent memory of past interactions and facts via a vector store.
 You can:
-1. Recall relevant knowledge from memory using semantic similarity.
-2. Add new insights to memory when useful.
-3. Perform live web searches and summarize results if memory is insufficient.
-Structure your outputs clearly:
-- Use ##MEM:add(...) to store thoughts to memory.
-- Use ##MEM:recall(...) to request a lookup (already handled externally).
-- Use ##SEARCH:trigger(...) when memory lacks the answer.
-Respond in clear, friendly tone. Actively use what you know about Abu’s past work (e.g., GANs, TensorFlow, Exopid).
+1. Recall relevant information from long-term memory.
+2. Decide whether to perform a web search if the memory lacks necessary detail.
+3. Summarize text clearly when requested.
+
+You use these tags:
+- ##MEM:add("...") to store information in memory.
+- ##SEARCH:yes if a web search is needed.
+- ##SEARCH:no if memory is sufficient.
+
+Be concise but friendly. Don't suggest a search unless it is clearly needed.
 """
 
-def generate_response(user_input: str):
-    # Step 1: Recall relevant memory
-    recalled = memory.query(user_input, top_k=3)
-    memory_context = "\n".join([f"- {item}" for item in recalled])
-
-    # Step 2: Evaluate recall quality
-    should_search = searcher.should_trigger_search(text=user_input)
-    kb_hits = ""
-    if should_search:
-        urls = searcher.search_duckduckgo(user_input)
-        summaries = searcher.crawl_and_summarize(urls, llm_function=summarizer.summarize)
-        searcher.add_to_kb(summaries)
-        _, hits = searcher.query_kb(user_input)
-        kb_hits = "\n".join([f"- {h['summary']}" for h in hits])
-    else:
-        _, hits = searcher.query_kb(user_input)
-        kb_hits = "\n".join([f"- {h['summary']}" for h in hits])
-
-    # Step 3: Compose structured messages
-    context_block = f"""Known facts from memory:
-    {memory_context or '[None]'}
-
-    External knowledge from web:
-    {kb_hits or '[None]'}
-    """
+# 📝 Wrapper: summarize text with Llama
+def summarize_with_llama(text):
+    prompt = f"Summarize the following webpage text:\n\n{text.strip()}\n\nSummary:"
+    output = pipe(prompt, max_new_tokens=256, do_sample=True, temperature=0.7)
+    return output[0]["generated_text"].replace(prompt, "").strip()
+# 🎯 Ask model if it needs web search
+def should_search(user_input, memory_hits, kb_hits):
     messages = [
         {"role": "system", "content": SYSTEM_PROMPT},
-        {"role": "user", "content": f"{context_block}\nUser asked: {user_input}"},
+        {"role": "user", "content": f"User asked: {user_input}"},
+        {"role": "user", "content": f"Known memory:\n{memory_hits or '[None]'}"},
+        {"role": "user", "content": f"Web knowledge:\n{kb_hits or '[None]'}"},
+        {"role": "user", "content": "Do you need more information to answer this? Reply with ##SEARCH:yes or ##SEARCH:no."},
     ]
+    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    output = pipe(prompt, max_new_tokens=32, do_sample=False)
+    reply = output[0]["generated_text"].replace(prompt, "").strip().lower()  # drop the echoed prompt, otherwise the tag in the question always matches
+    return "##search:yes" in reply
 
-    # Convert using chat template
-    prompt = tokenizer.apply_chat_template(
-        messages,
-        tokenize=False,
-        add_generation_prompt=True  # appends assistant tag if needed
-    )
+# 🧠 Core reasoning + memory loop
+def generate_response(user_input: str):
+    # Step 1: recall memory + web KB
+    memory_hits = memory.query(user_input, top_k=3)
+    mem_text = "\n".join([f"- {x}" for x in memory_hits])
 
-    # Step 4: Call the model
-    output = pipe(
-        prompt,
-        max_new_tokens=512,
-        do_sample=True,
-        temperature=0.7,
-    )
+    _, kb = searcher.query_kb(user_input, top_k=3)
+    kb_text = "\n".join([f"- {x['summary']}" for x in kb])
 
-    # Step 5: Process model output (add to memory if marked)
-    response = output[0]["generated_text"].strip()
+    # Step 2: let Kshama decide if she wants to search
+    if should_search(user_input, mem_text, kb_text):
+        urls = searcher.search_duckduckgo(user_input)
+        summaries = searcher.crawl_and_summarize(urls, llm_function=summarize_with_llama)
+        searcher.add_to_kb(summaries)
+        _, kb = searcher.query_kb(user_input)
+        kb_text = "\n".join([f"- {x['summary']}" for x in kb])
+    # Step 3: Compose final answer prompt
+    messages = [
+        {"role": "system", "content": SYSTEM_PROMPT},
+        {"role": "user", "content": f"{user_input}"},
+        {"role": "user", "content": f"Relevant memory:\n{mem_text or '[None]'}"},
+        {"role": "user", "content": f"Web knowledge:\n{kb_text or '[None]'}"}
+    ]
+    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+
+    # Step 4: generate final response
+    start = time.time()
+    output = pipe(prompt, max_new_tokens=512, do_sample=True, temperature=0.7)
+    elapsed = time.time() - start
+    response = output[0]["generated_text"].replace(prompt, "").strip()
+
+    # Step 5: parse memory intent
     if "##MEM:add(" in response:
         try:
-            content = response.split("##MEM:add(")[1].split(")")[0].strip('"\'')
-            memory.add(content)
+            thought = response.split("##MEM:add(")[1].split(")")[0].strip('"\'')
+            memory.add(thought)
             print("[✅ Memory Added]")
         except:
-            print("[⚠️ Couldn't parse memory add]")
+            print("[⚠️ Could not parse memory directive]")
 
-    return response
+    return response, elapsed
 
-
-# 💬 REPL for testing
+# 🧪 Interactive loop
 if __name__ == "__main__":
-    print(f"Time elapsed: {time.time() - begin_time:.2f} seconds")
-    print("👋 Welcome to Kshama. Type 'exit' to leave.")
+    print(f"🚀 Booted in {time.time() - begin_time:.2f}s")
+    print("👋 Welcome to Kshama. Type 'exit' to quit.")
     while True:
         user_input = input("\n🧑 You: ")
         if user_input.strip().lower() in ["exit", "quit"]:
             break
-        response = generate_response(user_input)
-        print(f"\n🤖 ক্ষমা: {response}")
+        response, t = generate_response(user_input)
+        print(f"\n🤖 ক্ষমা [{t:.2f}s]: {response}")
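
Reviewer note: agent.py calls into memory.py and web_search_helper.py, which this diff leaves untouched. The sketch below is a minimal, assumed view of the interfaces those call sites rely on; method names and shapes are inferred only from the calls in this file, so the real modules may differ.

    # Assumed interfaces, inferred from the call sites in agent.py (not from the actual modules).
    class Memory:
        def query(self, text, top_k=3):
            """Return up to top_k stored snippets relevant to `text`."""
            return []

        def add(self, text):
            """Persist a new fact or thought in long-term memory."""

    class WebSearchHelper:
        def query_kb(self, text, top_k=3):
            """Return (scores, hits); each hit is a dict with a 'summary' key."""
            return [], []

        def search_duckduckgo(self, query):
            """Return a list of result URLs for the query."""
            return []

        def crawl_and_summarize(self, urls, llm_function):
            """Fetch each URL and summarize its text via llm_function(text)."""
            return []

        def add_to_kb(self, summaries):
            """Store the summaries in the web knowledge base."""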