From d845c29e81dc661355d95837d6da7bf70d8b577e Mon Sep 17 00:00:00 2001
From: sufian
Date: Sun, 29 Jun 2025 20:52:43 +0600
Subject: [PATCH] Refactor agent initialization and response workflow

Streamlined agent initialization, separating components for clarity.
Improved memory, web search, and response generation logic for better
structure and efficiency. Added elapsed time display and refined
interactive REPL messaging.
---
 agent.py | 144 ++++++++++++++++++++++++++++---------------------------
 1 file changed, 74 insertions(+), 70 deletions(-)

diff --git a/agent.py b/agent.py
index b941c6f..e8dcf10 100644
--- a/agent.py
+++ b/agent.py
@@ -1,108 +1,112 @@
 import time
-begin_time = time.time()
-
 import torch
 from transformers import pipeline, AutoTokenizer
 from memory import Memory
 from web_search_helper import WebSearchHelper
-from llm_wrapper import LlmWrapper
 
-# Initialize components
-memory = Memory()
-searcher = WebSearchHelper()
-summarizer = LlmWrapper(model_name="Qwen/Qwen3-0.6B")  # optional, could summarize search results
+# Initialize clock
+begin_time = time.time()
 
-tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
-# Load your main LLM (Llama 3.2:1B-Instruct)
+# 🔧 Load model and tokenizer (Llama3.2:1B)
 model_id = "meta-llama/Llama-3.2-1B-Instruct"
+tokenizer = AutoTokenizer.from_pretrained(model_id)
 pipe = pipeline(
     "text-generation",
     model=model_id,
     torch_dtype=torch.bfloat16,
     device_map="auto",
+    pad_token_id=128001  # Suppress warnings
 )
 
-# Define system prompt and Kshama's capabilities
+# 🧩 Agent components
+memory = Memory()
+searcher = WebSearchHelper()
+
+# 🧭 System prompt (Kshama's persona + capabilities)
 SYSTEM_PROMPT = """
 You are ক্ষমা, Abu's personal AI assistant. You're helpful, respectful, and aligned with his goals and preferences.
-You maintain and query a persistent memory of past interactions and facts via a vector store.
 You can:
-1. Recall relevant knowledge from memory using semantic similarity.
-2. Add new insights to memory when useful.
-3. Perform live web searches and summarize results if memory is insufficient.
-Structure your outputs clearly:
-- Use ##MEM:add(...) to store thoughts to memory.
-- Use ##MEM:recall(...) to request a lookup (already handled externally).
-- Use ##SEARCH:trigger(...) when memory lacks the answer.
-Respond in clear, friendly tone. Actively use what you know about Abu's past work (e.g., GANs, TensorFlow, Exopid).
+1. Recall relevant information from long-term memory.
+2. Decide whether to perform a web search if the memory lacks necessary detail.
+3. Summarize text clearly when requested.
+
+You use these tags:
+- ##MEM:add("...") to store information in memory.
+- ##SEARCH:yes if a web search is needed.
+- ##SEARCH:no if memory is sufficient.
+
+Be concise but friendly. Don't suggest a search unless it is clearly needed.
 """
 
-def generate_response(user_input: str):
-    # Step 1: Recall relevant memory
-    recalled = memory.query(user_input, top_k=3)
-    memory_context = "\n".join([f"- {item}" for item in recalled])
-
-    # Step 2: Evaluate recall quality
-    should_search = searcher.should_trigger_search(text=user_input)
-    kb_hits = ""
-    if should_search:
-        urls = searcher.search_duckduckgo(user_input)
-        summaries = searcher.crawl_and_summarize(urls, llm_function=summarizer.summarize)
-        searcher.add_to_kb(summaries)
-        _, hits = searcher.query_kb(user_input)
-        kb_hits = "\n".join([f"- {h['summary']}" for h in hits])
-    else:
-        _, hits = searcher.query_kb(user_input)
-        kb_hits = "\n".join([f"- {h['summary']}" for h in hits])
-
-    # Step 3: Compose structured messages
-    context_block = f"""Known facts from memory:
-    {memory_context or '[None]'}
-
-    External knowledge from web:
-    {kb_hits or '[None]'}
-    """
+# 📝 Wrapper: summarize text with Llama
+def summarize_with_llama(text):
+    prompt = f"Summarize the following webpage text:\n\n{text.strip()}\n\nSummary:"
+    output = pipe(prompt, max_new_tokens=256, do_sample=True, temperature=0.7)
+    return output[0]["generated_text"].replace(prompt, "").strip()
 
+# 🎯 Ask model if it needs web search
+def should_search(user_input, memory_hits, kb_hits):
     messages = [
         {"role": "system", "content": SYSTEM_PROMPT},
-        {"role": "user", "content": f"{context_block}\nUser asked: {user_input}"},
+        {"role": "user", "content": f"User asked: {user_input}"},
+        {"role": "user", "content": f"Known memory:\n{memory_hits or '[None]'}"},
+        {"role": "user", "content": f"Web knowledge:\n{kb_hits or '[None]'}"},
+        {"role": "user", "content": "Do you need more information to answer this? Reply with ##SEARCH:yes or ##SEARCH:no."},
     ]
+    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    output = pipe(prompt, max_new_tokens=32, do_sample=False)
+    reply = output[0]["generated_text"].replace(prompt, "").strip().lower()  # drop the echoed prompt, which itself contains "##SEARCH:yes"
+    return "##search:yes" in reply
 
-    # Convert using chat template
-    prompt = tokenizer.apply_chat_template(
-        messages,
-        tokenize=False,
-        add_generation_prompt=True  # appends assistant tag if needed
-    )
+# 🧠 Core reasoning + memory loop
+def generate_response(user_input: str):
+    # Step 1: recall memory + web KB
+    memory_hits = memory.query(user_input, top_k=3)
+    mem_text = "\n".join([f"- {x}" for x in memory_hits])
 
-    # Step 4: Call the model
-    output = pipe(
-        prompt,
-        max_new_tokens=512,
-        do_sample=True,
-        temperature=0.7,
-    )
+    _, kb = searcher.query_kb(user_input, top_k=3)
+    kb_text = "\n".join([f"- {x['summary']}" for x in kb])
 
-    # Step 5: Process model output (add to memory if marked)
-    response = output[0]["generated_text"].strip()
+    # Step 2: let Kshama decide if she wants to search
+    if should_search(user_input, mem_text, kb_text):
+        urls = searcher.search_duckduckgo(user_input)
+        summaries = searcher.crawl_and_summarize(urls, llm_function=summarize_with_llama)
+        searcher.add_to_kb(summaries)
+        _, kb = searcher.query_kb(user_input)
+        kb_text = "\n".join([f"- {x['summary']}" for x in kb])
+    # Step 3: Compose final answer prompt
+    messages = [
+        {"role": "system", "content": SYSTEM_PROMPT},
+        {"role": "user", "content": f"{user_input}"},
+        {"role": "user", "content": f"Relevant memory:\n{mem_text or '[None]'}"},
+        {"role": "user", "content": f"Web knowledge:\n{kb_text or '[None]'}"}
+    ]
+    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+
+    # Step 4: generate final response
+    start = time.time()
+    output = pipe(prompt, max_new_tokens=512, do_sample=True, temperature=0.7)
+    elapsed = time.time() - start
+    response = output[0]["generated_text"].replace(prompt, "").strip()
+
+    # Step 5: parse memory intent
     if "##MEM:add(" in response:
         try:
-            content = response.split("##MEM:add(")[1].split(")")[0].strip('"\'')
-            memory.add(content)
+            thought = response.split("##MEM:add(")[1].split(")")[0].strip('"\'')
+            memory.add(thought)
             print("[✅ Memory Added]")
         except:
-            print("[⚠️ Couldn't parse memory add]")
+            print("[⚠️ Could not parse memory directive]")
 
-    return response
+    return response, elapsed
 
-
-# 💬 REPL for testing
+# 🧪 Interactive loop
 if __name__ == "__main__":
-    print(f"Time elapsed: {time.time() - begin_time:.2f} seconds")
-    print("👋 Welcome to Kshama. Type 'exit' to leave.")
+    print(f"🚀 Booted in {time.time() - begin_time:.2f}s")
+    print("👋 Welcome to Kshama. Type 'exit' to quit.")
     while True:
         user_input = input("\n🧑 You: ")
         if user_input.strip().lower() in ["exit", "quit"]:
             break
-        response = generate_response(user_input)
-        print(f"\n🤖 ক্ষমা: {response}")
+        response, t = generate_response(user_input)
+        print(f"\n🤖 ক্ষমা [{t:.2f}s]: {response}")
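
Reviewer note (not part of the patch): the refactored agent.py depends on memory.Memory and web_search_helper.WebSearchHelper, which are not shown in this diff. The sketch below is a hypothetical, minimal in-memory stand-in inferred only from the call sites above (query/add on Memory; search_duckduckgo, crawl_and_summarize, add_to_kb, query_kb on WebSearchHelper). The real modules presumably use a vector store and DuckDuckGo, so treat these purely as test doubles for running the REPL without those dependencies.

    # stubs.py - hypothetical stand-ins for memory.py and web_search_helper.py,
    # inferred from how agent.py calls them; NOT the real implementations.

    class Memory:
        """Keeps added facts in a plain list; query() does naive keyword overlap."""

        def __init__(self):
            self._items = []

        def add(self, text):
            self._items.append(text)

        def query(self, text, top_k=3):
            words = set(text.lower().split())
            ranked = sorted(self._items,
                            key=lambda s: len(words & set(s.lower().split())),
                            reverse=True)
            return ranked[:top_k]


    class WebSearchHelper:
        """In-memory knowledge base; the web-facing methods are placeholders."""

        def __init__(self):
            self._kb = []

        def search_duckduckgo(self, query):
            return []  # the real helper returns result URLs

        def crawl_and_summarize(self, urls, llm_function):
            # the real helper fetches each URL and summarizes the page text
            return [{"summary": llm_function(url)} for url in urls]

        def add_to_kb(self, summaries):
            self._kb.extend(summaries)

        def query_kb(self, query, top_k=3):
            # agent.py unpacks the result as `_, kb`, so return a (query, hits)
            # pair where each hit is a dict carrying a "summary" key.
            return query, self._kb[:top_k]

With these stubs on the import path, generate_response() can be exercised end to end; only the Llama pipeline itself still needs to load.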