diff --git a/agent.py b/agent.py
index b941c6f..e8dcf10 100644
--- a/agent.py
+++ b/agent.py
@@ -1,108 +1,112 @@
 import time
-begin_time = time.time()
-
 import torch
 from transformers import pipeline, AutoTokenizer
 from memory import Memory
 from web_search_helper import WebSearchHelper
-from llm_wrapper import LlmWrapper
 
-# Initialize components
-memory = Memory()
-searcher = WebSearchHelper()
-summarizer = LlmWrapper(model_name="Qwen/Qwen3-0.6B")  # optional, could summarize search results
+# Initialize clock
+begin_time = time.time()
 
-tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
-# Load your main LLM (Llama 3.2:1B-Instruct)
+# 🔧 Load model and tokenizer (Llama3.2:1B)
 model_id = "meta-llama/Llama-3.2-1B-Instruct"
+tokenizer = AutoTokenizer.from_pretrained(model_id)
 pipe = pipeline(
     "text-generation",
     model=model_id,
     torch_dtype=torch.bfloat16,
     device_map="auto",
+    pad_token_id=128001  # Suppress warnings
 )
 
-# Define system prompt and Kshama's capabilities
+# 🧩 Agent components
+memory = Memory()
+searcher = WebSearchHelper()
+
+# 🧭 System prompt (Kshama's persona + capabilities)
 SYSTEM_PROMPT = """
 You are ক্ষমা, Abu's personal AI assistant.
 You're helpful, respectful, and aligned with his goals and preferences.
-You maintain and query a persistent memory of past interactions and facts via a vector store.
 You can:
-1. Recall relevant knowledge from memory using semantic similarity.
-2. Add new insights to memory when useful.
-3. Perform live web searches and summarize results if memory is insufficient.
-Structure your outputs clearly:
-- Use ##MEM:add(...) to store thoughts to memory.
-- Use ##MEM:recall(...) to request a lookup (already handled externally).
-- Use ##SEARCH:trigger(...) when memory lacks the answer.
-Respond in clear, friendly tone. Actively use what you know about Abu’s past work (e.g., GANs, TensorFlow, Exopid).
+1. Recall relevant information from long-term memory.
+2. Decide whether to perform a web search if the memory lacks necessary detail.
+3. Summarize text clearly when requested.
+
+You use these tags:
+- ##MEM:add("...") to store information in memory.
+- ##SEARCH:yes if a web search is needed.
+- ##SEARCH:no if memory is sufficient.
+
+Be concise but friendly. Don't suggest a search unless it is clearly needed.
 """
 
-def generate_response(user_input: str):
-    # Step 1: Recall relevant memory
-    recalled = memory.query(user_input, top_k=3)
-    memory_context = "\n".join([f"- {item}" for item in recalled])
-
-    # Step 2: Evaluate recall quality
-    should_search = searcher.should_trigger_search(text=user_input)
-    kb_hits = ""
-    if should_search:
-        urls = searcher.search_duckduckgo(user_input)
-        summaries = searcher.crawl_and_summarize(urls, llm_function=summarizer.summarize)
-        searcher.add_to_kb(summaries)
-        _, hits = searcher.query_kb(user_input)
-        kb_hits = "\n".join([f"- {h['summary']}" for h in hits])
-    else:
-        _, hits = searcher.query_kb(user_input)
-        kb_hits = "\n".join([f"- {h['summary']}" for h in hits])
-
-    # Step 3: Compose structured messages
-    context_block = f"""Known facts from memory:
-    {memory_context or '[None]'}
-
-    External knowledge from web:
-    {kb_hits or '[None]'}
-    """
+# 📝 Wrapper: summarize text with Llama
+def summarize_with_llama(text):
+    prompt = f"Summarize the following webpage text:\n\n{text.strip()}\n\nSummary:"
+    output = pipe(prompt, max_new_tokens=256, do_sample=True, temperature=0.7)
+    return output[0]["generated_text"].replace(prompt, "").strip()
+# 🎯 Ask model if it needs web search
+def should_search(user_input, memory_hits, kb_hits):
     messages = [
         {"role": "system", "content": SYSTEM_PROMPT},
-        {"role": "user", "content": f"{context_block}\nUser asked: {user_input}"},
+        {"role": "user", "content": f"User asked: {user_input}"},
+        {"role": "user", "content": f"Known memory:\n{memory_hits or '[None]'}"},
+        {"role": "user", "content": f"Web knowledge:\n{kb_hits or '[None]'}"},
+        {"role": "user", "content": "Do you need more information to answer this? Reply with ##SEARCH:yes or ##SEARCH:no."},
     ]
+    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    output = pipe(prompt, max_new_tokens=32, do_sample=False)
+    reply = output[0]["generated_text"].replace(prompt, "").strip().lower()  # drop the echoed prompt, otherwise the tag in the question always matches
+    return "##search:yes" in reply
 
-    # Convert using chat template
-    prompt = tokenizer.apply_chat_template(
-        messages,
-        tokenize=False,
-        add_generation_prompt=True  # appends assistant tag if needed
-    )
+# 🧠 Core reasoning + memory loop
+def generate_response(user_input: str):
+    # Step 1: recall memory + web KB
+    memory_hits = memory.query(user_input, top_k=3)
+    mem_text = "\n".join([f"- {x}" for x in memory_hits])
 
-    # Step 4: Call the model
-    output = pipe(
-        prompt,
-        max_new_tokens=512,
-        do_sample=True,
-        temperature=0.7,
-    )
+    _, kb = searcher.query_kb(user_input, top_k=3)
+    kb_text = "\n".join([f"- {x['summary']}" for x in kb])
 
-    # Step 5: Process model output (add to memory if marked)
-    response = output[0]["generated_text"].strip()
+    # Step 2: let Kshama decide if she wants to search
+    if should_search(user_input, mem_text, kb_text):
+        urls = searcher.search_duckduckgo(user_input)
+        summaries = searcher.crawl_and_summarize(urls, llm_function=summarize_with_llama)
+        searcher.add_to_kb(summaries)
+        _, kb = searcher.query_kb(user_input)
+        kb_text = "\n".join([f"- {x['summary']}" for x in kb])
+    # Step 3: Compose final answer prompt
+    messages = [
+        {"role": "system", "content": SYSTEM_PROMPT},
+        {"role": "user", "content": f"{user_input}"},
+        {"role": "user", "content": f"Relevant memory:\n{mem_text or '[None]'}"},
+        {"role": "user", "content": f"Web knowledge:\n{kb_text or '[None]'}"}
+    ]
+    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+
+    # Step 4: generate final response
+    start = time.time()
+    output = pipe(prompt, max_new_tokens=512, do_sample=True, temperature=0.7)
+    elapsed = time.time() - start
+    response = output[0]["generated_text"].replace(prompt, "").strip()
+
+    # Step 5: parse memory intent
     if "##MEM:add(" in response:
         try:
-            content = response.split("##MEM:add(")[1].split(")")[0].strip('"\'')
-            memory.add(content)
+            thought = response.split("##MEM:add(")[1].split(")")[0].strip('"\'')
+            memory.add(thought)
             print("[✅ Memory Added]")
         except:
-            print("[⚠️ Couldn't parse memory add]")
+            print("[⚠️ Could not parse memory directive]")
 
-    return response
+    return response, elapsed
 
-
-# 💬 REPL for testing
+# 🧪 Interactive loop
 if __name__ == "__main__":
-    print(f"Time elapsed: {time.time() - begin_time:.2f} seconds")
-    print("👋 Welcome to Kshama. Type 'exit' to leave.")
+    print(f"🚀 Booted in {time.time() - begin_time:.2f}s")
+    print("👋 Welcome to Kshama. Type 'exit' to quit.")
     while True:
         user_input = input("\n🧑 You: ")
         if user_input.strip().lower() in ["exit", "quit"]:
             break
-        response = generate_response(user_input)
-        print(f"\n🤖 ক্ষমা: {response}")
+        response, t = generate_response(user_input)
+        print(f"\n🤖 ক্ষমা [{t:.2f}s]: {response}")
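
Reviewer note: agent.py calls into memory.py and web_search_helper.py, which this diff leaves untouched. The sketch below is a minimal, assumed view of the interfaces those call sites rely on; method names and shapes are inferred only from the calls in this file, so the real modules may differ.

    # Assumed interfaces, inferred from the call sites in agent.py (not from the actual modules).
    class Memory:
        def query(self, text, top_k=3):
            """Return up to top_k stored snippets relevant to `text`."""
            return []

        def add(self, text):
            """Persist a new fact or thought in long-term memory."""

    class WebSearchHelper:
        def query_kb(self, text, top_k=3):
            """Return (scores, hits); each hit is a dict with a 'summary' key."""
            return [], []

        def search_duckduckgo(self, query):
            """Return a list of result URLs for the query."""
            return []

        def crawl_and_summarize(self, urls, llm_function):
            """Fetch each URL and summarize its text via llm_function(text)."""
            return []

        def add_to_kb(self, summaries):
            """Store the summaries in the web knowledge base."""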