import time

begin_time = time.time()

import torch
from transformers import pipeline, AutoTokenizer

from memory import Memory
from web_search_helper import WebSearchHelper
from llm_wrapper import LlmWrapper

# Initialize components
memory = Memory()
searcher = WebSearchHelper()
summarizer = LlmWrapper(model_name="Qwen/Qwen3-0.6B")  # optional, used to summarize search results
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")

# Load the main LLM (Llama 3.2 1B Instruct)
model_id = "meta-llama/Llama-3.2-1B-Instruct"
pipe = pipeline(
    "text-generation",
    model=model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# Define system prompt and Kshama's capabilities
SYSTEM_PROMPT = """
You are ক্ষমা, Abu's personal AI assistant. You're helpful, respectful, and aligned with his goals and preferences.
You maintain and query a persistent memory of past interactions and facts via a vector store.

You can:
1. Recall relevant knowledge from memory using semantic similarity.
2. Add new insights to memory when useful.
3. Perform live web searches and summarize results if memory is insufficient.

Structure your outputs clearly:
- Use ##MEM:add(...) to store thoughts to memory.
- Use ##MEM:recall(...) to request a lookup (already handled externally).
- Use ##SEARCH:trigger(...) when memory lacks the answer.

Respond in a clear, friendly tone. Actively use what you know about Abu's past work (e.g., GANs, TensorFlow, Exopid).
"""


def generate_response(user_input: str):
    # Step 1: Recall relevant memory
    recalled = memory.query(user_input, top_k=3)
    memory_context = "\n".join([f"- {item}" for item in recalled])

    # Step 2: Decide whether a live web search is needed, then query the knowledge base
    should_search = searcher.should_trigger_search(text=user_input)
    if should_search:
        urls = searcher.search_duckduckgo(user_input)
        summaries = searcher.crawl_and_summarize(urls, llm_function=summarizer.summarize)
        searcher.add_to_kb(summaries)
    # Query the KB either way; if a search ran, the fresh summaries are already indexed
    _, hits = searcher.query_kb(user_input)
    kb_hits = "\n".join([f"- {h['summary']}" for h in hits])

    # Step 3: Compose structured messages
    context_block = f"""Known facts from memory:
{memory_context or '[None]'}

External knowledge from web:
{kb_hits or '[None]'}
"""

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": f"{context_block}\nUser asked: {user_input}"},
    ]

    # Convert using the model's chat template
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,  # appends the assistant tag if needed
    )

    # Step 4: Call the model
    output = pipe(
        prompt,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        return_full_text=False,  # return only the newly generated text, not the prompt
    )

    # Step 5: Process model output (add to memory if marked)
    response = output[0]["generated_text"].strip()
    if "##MEM:add(" in response:
        try:
            content = response.split("##MEM:add(")[1].split(")")[0].strip('"\'')
            memory.add(content)
            print("[✅ Memory Added]")
        except Exception:
            print("[⚠️ Couldn't parse memory add]")

    return response


# 💬 REPL for testing
if __name__ == "__main__":
    print(f"Time elapsed: {time.time() - begin_time:.2f} seconds")
    print("👋 Welcome to Kshama. Type 'exit' to leave.")
    while True:
        user_input = input("\n🧑 You: ")
        if user_input.strip().lower() in ["exit", "quit"]:
            break
        response = generate_response(user_input)
        print(f"\n🤖 ক্ষমা: {response}")