# kshama/agent.py

import re
import time

import torch
from transformers import pipeline, AutoTokenizer

from memory import Memory
from web_search_helper import WebSearchHelper

# Initialize clock
# Captured at import time, before the model is loaded, so the entry point can
# report the total boot duration.
begin_time = time.time()

# 🔧 Load model and tokenizer (Llama3.2:1B)
model_id = "meta-llama/Llama-3.2-1B-Instruct"
# The tokenizer is loaded on its own because the prompt-building code in this
# module calls tokenizer.apply_chat_template() directly.
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Shared text-generation pipeline; every model call in this module goes
# through `pipe`. Constructing it loads the model as an import-time side effect.
pipe = pipeline(
    "text-generation",
    model=model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    # NOTE(review): 128001 is presumably the model's end-of-text token id,
    # reused as padding — confirm against the tokenizer's vocabulary.
    pad_token_id=128001  # Suppress warnings
)

# 🧩 Agent components
# Long-term memory store: read with memory.query() and written with
# memory.add() by generate_response().
memory = Memory()
# Web helper: provides the knowledge-base lookup (query_kb / add_to_kb) and the
# search + crawl steps (search_duckduckgo / crawl_and_summarize).
searcher = WebSearchHelper()

# 🧭 System prompt (Kshama's persona + capabilities)
# Sent as the "system" message of every chat prompt built in this module.
# The tags it defines are matched literally by the code in this file:
#   - "##SEARCH:yes" is looked for (case-insensitively) by should_search();
#   - "##MEM:add(" is looked for by generate_response() to store a memory.
# Changing the tag spelling here therefore requires changing those checks too.
SYSTEM_PROMPT = """
You are ক্ষমা, Abu's personal AI assistant. You're helpful, respectful, and aligned with his goals and preferences.
You can:
1. Recall relevant information from long-term memory.
2. Decide whether to perform a web search if the memory lacks necessary detail.
3. Summarize text clearly when requested.

You use these tags:
- ##MEM:add("...") to store information in memory.
- ##SEARCH:yes if a web search is needed.
- ##SEARCH:no if memory is sufficient.

Be concise but friendly. Don't suggest a search unless it is clearly needed.
"""

# 📝 Wrapper: summarize text with Llama
def summarize_with_llama(text):
    """Summarize *text* with the shared text-generation pipeline.

    The text is whitespace-trimmed, wrapped in a plain (non-chat) summarization
    prompt and sampled at temperature 0.7 for at most 256 new tokens.

    Args:
        text: Raw text to summarize; must provide ``strip()``.

    Returns:
        The generated text with the prompt removed and surrounding
        whitespace stripped.
    """
    body = text.strip()
    request = f"Summarize the following webpage text:\n\n{body}\n\nSummary:"

    generations = pipe(
        request,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
    )

    # The pipeline output contains the prompt as well, so cut it back out.
    full_text = generations[0]["generated_text"]
    summary = full_text.replace(request, "")
    return summary.strip()

# 🎯 Ask model if it needs web search
def should_search(user_input, memory_hits, kb_hits):
    """Ask the model whether a web search is needed to answer *user_input*.

    Args:
        user_input: The user's question.
        memory_hits: Text of recalled memories; a falsy value is shown to the
            model as ``[None]``.
        kb_hits: Text of known web summaries; a falsy value is shown to the
            model as ``[None]``.

    Returns:
        True if the model's reply contains the ``##SEARCH:yes`` tag
        (case-insensitive, ignoring whitespace inside the tag), else False.
    """
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": f"User asked: {user_input}"},
        {"role": "user", "content": f"Known memory:\n{memory_hits or '[None]'}"},
        {"role": "user", "content": f"Web knowledge:\n{kb_hits or '[None]'}"},
        {"role": "user", "content": "Do you need more information to answer this? Reply with ##SEARCH:yes or ##SEARCH:no."},
    ]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Ask for the completion only. By default the pipeline returns the prompt
    # plus the completion, and the prompt itself contains "##SEARCH:yes" (in
    # the system prompt and in the final instruction), so checking the full
    # text would answer "yes" on every call.
    output = pipe(prompt, max_new_tokens=32, do_sample=False, return_full_text=False)
    reply = output[0]["generated_text"].lower()

    # Drop all whitespace so variants such as "##SEARCH: yes" still count.
    compact_reply = "".join(reply.split())
    return "##search:yes" in compact_reply

# 🧠 Core reasoning + memory loop
# Matches one ##MEM:add(...) directive on a single line. A quoted payload runs
# up to the closing quote that is directly followed by ")", so parentheses
# inside the quoted note are preserved; an unquoted payload ends at the first ")".
_MEM_DIRECTIVE_RE = re.compile(
    r"""##MEM:add\(\s*(?:"(.*?)"|'(.*?)'|([^)\n]*?))\s*\)"""
)


def _extract_memory_notes(response):
    """Return the non-empty payload of every ##MEM:add(...) directive in *response*."""
    notes = []
    for match in _MEM_DIRECTIVE_RE.finditer(response):
        # Exactly one alternative of the pattern matched; the others are None.
        payload = next((group for group in match.groups() if group is not None), "")
        payload = payload.strip().strip("\"'").strip()
        if payload:
            notes.append(payload)
    return notes


def generate_response(user_input: str):
    """Answer *user_input* using recalled memory, the web KB and the model.

    Steps:
        1. Recall up to 3 memory hits and up to 3 knowledge-base entries.
        2. Ask should_search(); if it says yes, search, crawl, summarize,
           add the summaries to the KB and re-query it.
        3. Build the chat prompt from the system prompt, the question, the
           memory text and the web text.
        4. Generate the answer (sampled, temperature 0.7, up to 512 tokens).
        5. Store every ``##MEM:add(...)`` directive found in the answer.

    Args:
        user_input: The user's message.

    Returns:
        A ``(response, elapsed)`` tuple: the generated answer text (directive
        tags are left in place) and the wall-clock seconds spent in the final
        generation call only.
    """
    # Step 1: recall memory + web KB
    memory_hits = memory.query(user_input, top_k=3)
    mem_text = "\n".join(f"- {x}" for x in memory_hits)

    _, kb = searcher.query_kb(user_input, top_k=3)
    kb_text = "\n".join(f"- {x['summary']}" for x in kb)

    # Step 2: let Kshama decide if she wants to search
    if should_search(user_input, mem_text, kb_text):
        urls = searcher.search_duckduckgo(user_input)
        summaries = searcher.crawl_and_summarize(urls, llm_function=summarize_with_llama)
        searcher.add_to_kb(summaries)
        # NOTE(review): unlike the first lookup this relies on query_kb's
        # default top_k — confirm that is intended.
        _, kb = searcher.query_kb(user_input)
        kb_text = "\n".join(f"- {x['summary']}" for x in kb)

    # Step 3: Compose final answer prompt
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": f"{user_input}"},
        {"role": "user", "content": f"Relevant memory:\n{mem_text or '[None]'}"},
        {"role": "user", "content": f"Web knowledge:\n{kb_text or '[None]'}"}
    ]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Step 4: generate final response
    start = time.time()
    # return_full_text=False yields only the completion, so the prompt does
    # not have to be cut out of the result afterwards.
    output = pipe(
        prompt,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        return_full_text=False,
    )
    elapsed = time.time() - start
    response = output[0]["generated_text"].strip()

    # Step 5: parse memory intent
    if "##MEM:add(" in response:
        notes = _extract_memory_notes(response)
        if not notes:
            # Tag present but unterminated or empty: nothing usable to store.
            print("[⚠️ Could not parse memory directive]")
        for note in notes:
            try:
                memory.add(note)
            except Exception as exc:
                # Best effort: a failing memory backend must not lose the
                # answer, but Ctrl-C / SystemExit are no longer swallowed.
                print(f"[⚠️ Could not store memory: {exc}]")
            else:
                print("[✅ Memory Added]")

    return response, elapsed

# 🧪 Interactive loop
def main():
    """Run the interactive chat loop until the user quits.

    The loop ends on "exit" / "quit" (case-insensitive) or when input is
    closed or interrupted at the prompt (Ctrl-D / Ctrl-C). Blank lines are
    ignored.
    """
    print(f"🚀 Booted in {time.time() - begin_time:.2f}s")
    print("👋 Welcome to Kshama. Type 'exit' to quit.")
    while True:
        try:
            user_input = input("\n🧑 You: ")
        except (EOFError, KeyboardInterrupt):
            # Leave quietly instead of dumping a traceback on Ctrl-D / Ctrl-C.
            print()
            break

        command = user_input.strip()
        if command.lower() in ("exit", "quit"):
            break
        if not command:
            # Don't spend a model call (and possibly a web search) on nothing.
            continue

        response, t = generate_response(user_input)
        print(f"\n🤖 ক্ষমা [{t:.2f}s]: {response}")


if __name__ == "__main__":
    main()