import time

import torch
from transformers import pipeline, AutoTokenizer

from memory import Memory
from web_search_helper import WebSearchHelper

# Initialize clock
begin_time = time.time()

# 🔧 Load model and tokenizer (Llama 3.2 1B)
model_id = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
pipe = pipeline(
    "text-generation",
    model=model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    pad_token_id=128001,  # suppress "pad_token_id not set" warnings
)

# 🧩 Agent components
memory = Memory()
searcher = WebSearchHelper()

# 🧭 System prompt (Kshama's persona + capabilities)
SYSTEM_PROMPT = """
You are ক্ষমা, Abu's personal AI assistant. You're helpful, respectful,
and aligned with his goals and preferences.

You can:
1. Recall relevant information from long-term memory.
2. Decide whether to perform a web search if the memory lacks necessary detail.
3. Summarize text clearly when requested.

You use these tags:
- ##MEM:add("...") to store information in memory.
- ##SEARCH:yes if a web search is needed.
- ##SEARCH:no if memory is sufficient.

Be concise but friendly. Don't suggest a search unless it is clearly needed.
"""

# 📝 Wrapper: summarize text with Llama
def summarize_with_llama(text):
    prompt = f"Summarize the following webpage text:\n\n{text.strip()}\n\nSummary:"
    output = pipe(prompt, max_new_tokens=256, do_sample=True, temperature=0.7)
    # The pipeline returns prompt + completion; keep only the completion.
    return output[0]["generated_text"].replace(prompt, "").strip()

# 🎯 Ask the model whether it needs a web search
def should_search(user_input, memory_hits, kb_hits):
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": f"User asked: {user_input}"},
        {"role": "user", "content": f"Known memory:\n{memory_hits or '[None]'}"},
        {"role": "user", "content": f"Web knowledge:\n{kb_hits or '[None]'}"},
        {"role": "user", "content": "Do you need more information to answer this? Reply with ##SEARCH:yes or ##SEARCH:no."},
    ]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    output = pipe(prompt, max_new_tokens=32, do_sample=False)
    # Strip the prompt before checking: the prompt itself contains the literal
    # tag "##SEARCH:yes" (in the system prompt), so testing the full generated
    # text would always match and force a search on every turn.
    reply = output[0]["generated_text"].replace(prompt, "").strip().lower()
    return "##search:yes" in reply

# 🧠 Core reasoning + memory loop
def generate_response(user_input: str):
    # Step 1: recall memory + web KB
    memory_hits = memory.query(user_input, top_k=3)
    mem_text = "\n".join([f"- {x}" for x in memory_hits])
    _, kb = searcher.query_kb(user_input, top_k=3)
    kb_text = "\n".join([f"- {x['summary']}" for x in kb])

    # Step 2: let Kshama decide if she wants to search
    if should_search(user_input, mem_text, kb_text):
        urls = searcher.search_duckduckgo(user_input)
        summaries = searcher.crawl_and_summarize(urls, llm_function=summarize_with_llama)
        searcher.add_to_kb(summaries)
        _, kb = searcher.query_kb(user_input)
        kb_text = "\n".join([f"- {x['summary']}" for x in kb])

    # Step 3: compose the final answer prompt
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": f"{user_input}"},
        {"role": "user", "content": f"Relevant memory:\n{mem_text or '[None]'}"},
        {"role": "user", "content": f"Web knowledge:\n{kb_text or '[None]'}"},
    ]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Step 4: generate the final response
    start = time.time()
    output = pipe(prompt, max_new_tokens=512, do_sample=True, temperature=0.7)
    elapsed = time.time() - start
    response = output[0]["generated_text"].replace(prompt, "").strip()

    # Step 5: parse memory intent
    if "##MEM:add(" in response:
        try:
            thought = response.split("##MEM:add(")[1].split(")")[0].strip('"\'')
            memory.add(thought)
            print("[✅ Memory Added]")
        except Exception:
            print("[⚠️ Could not parse memory directive]")

    return response, elapsed

# 🧪 Interactive loop
if __name__ == "__main__":
    print(f"🚀 Booted in {time.time() - begin_time:.2f}s")
    print("👋 Welcome to Kshama. Type 'exit' to quit.")
    while True:
        user_input = input("\n🧑 You: ")
        if user_input.strip().lower() in ["exit", "quit"]:
            break
        response, t = generate_response(user_input)
        print(f"\n🤖 ক্ষমা [{t:.2f}s]: {response}")
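
# ---------------------------------------------------------------------------
# Appendix (illustrative sketch): the local `memory` and `web_search_helper`
# modules are not shown in this file. From the calls above, the script assumes
# Memory exposes add(text) and query(text, top_k), and that WebSearchHelper
# exposes query_kb(query, top_k), search_duckduckgo(query),
# crawl_and_summarize(urls, llm_function), and add_to_kb(summaries).
# The class below is a hypothetical stand-in for the Memory interface only;
# it ranks by naive keyword overlap rather than the embedding search the real
# module presumably uses, and it is not wired into the script.

class _MemorySketch:
    """Minimal stand-in mirroring the assumed memory.Memory interface."""

    def __init__(self):
        self._items: list[str] = []

    def add(self, text: str) -> None:
        # Store one fact verbatim in long-term memory.
        self._items.append(text)

    def query(self, question: str, top_k: int = 3) -> list[str]:
        # Rank stored facts by word overlap with the question and
        # return the top_k best matches.
        words = set(question.lower().split())
        ranked = sorted(
            self._items,
            key=lambda item: len(words & set(item.lower().split())),
            reverse=True,
        )
        return ranked[:top_k]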