Streamline system prompts, rename functions for clarity, and improve search query generation. Fix memory query edge cases and enhance robustness when no indexed data exists. Minor wording adjustments and structure improvements for better maintainability.

import time

import torch
from transformers import pipeline, AutoTokenizer

from memory import Memory
from web_search_helper import WebSearchHelper

begin_time = time.time()

# === 🔧 Load model + tokenizer ===
model_id = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
pipe = pipeline(
    "text-generation",
    model=model_id,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    pad_token_id=128001
)
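# Note (assumption): 128001 is Llama 3's <|end_of_text|> token id; pinning
# pad_token_id up front avoids the pipeline's warning about an unset pad token.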

# === 🔌 Core modules ===
memory = Memory()
searcher = WebSearchHelper()
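
# Sketch (assumption): the commit message mentions hardening memory queries
# when no indexed data exists. Memory.query's behavior on an empty index is
# not shown here, so this defensive wrapper only illustrates one way to
# degrade gracefully; it is not the project's actual fix.
def safe_memory_query(query: str, top_k: int = 3) -> list:
    try:
        return memory.query(query, top_k=top_k) or []
    except Exception:
        # An empty or uninitialized index should mean "no recall", not a crash.
        return []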

# === 🧭 System behavior instruction ===
SYSTEM_PROMPT = """
You are a personal AI assistant. You're wise, efficient, and intentional.

You can:
- Recall long-term memory and use it to answer.
- Summarize long documents clearly.
- Perform web search *only if you believe it's necessary*, and clearly state that with ##SEARCH:yes.

You also refine web search queries using what you understand of the user's intent.
Always follow this format:
- ##MEM:add("...") to add memories
- ##SEARCH:yes or ##SEARCH:no on its own line to trigger or skip web search
- After search: generate a clear answer, using memory and the retrieved summaries
"""

# === 📘 Summarization using main model ===
def summarize_with_llama(text: str) -> str:
    prompt = f"Summarize the following:\n\n{text.strip()}\n\nSummary:"
    output = pipe(prompt, max_new_tokens=256)
    # The text-generation pipeline echoes the prompt, so strip it before
    # returning the summary.
    return output[0]["generated_text"].replace(prompt, "").strip()

# === 🔍 Ask if search is needed ===
def ask_should_search(user_input, mem_text, kb_text):
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": f"User asked: {user_input}"},
        {"role": "user", "content": f"Memory:\n{mem_text or '[None]'}"},
        {"role": "user", "content": f"Web Knowledge:\n{kb_text or '[None]'}"},
        {"role": "user", "content": "Do you need to search the web to answer this? Reply ##SEARCH:yes or ##SEARCH:no on the first line only."}
    ]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    output = pipe(prompt, max_new_tokens=16)
    # Strip the echoed prompt before reading the reply; otherwise the first
    # line inspected below is the first line of the prompt, not the answer.
    reply = output[0]["generated_text"].replace(prompt, "").strip().lower()
    if not reply:
        return False
    # The reply is lowercased, so match the lowercase directive; the previous
    # check for "##SEARCH:yes" could never succeed against a lowercased string.
    return "##search:yes" in reply.splitlines()[0]

# === ✍️ Compose better search query ===
def compose_search_query(user_input, mem_text):
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": f"User asked: {user_input}"},
        {"role": "user", "content": f"Relevant memory:\n{mem_text or '[None]'}"},
        {"role": "user", "content": "Rewrite a concise web search query to find useful info. Output only the query string, nothing else."}
    ]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    output = pipe(prompt, max_new_tokens=32)
    # Strip the echoed prompt, then take the first line; fall back to the raw
    # user input if the model produced nothing usable.
    query = output[0]["generated_text"].replace(prompt, "").strip()
    return query.splitlines()[0] if query else user_input

# === 🧠 Main reasoning function ===
def generate_response(user_input: str):
    # Step 1: Recall memory and web KB
    mem_hits = memory.query(user_input, top_k=3)
    mem_text = "\n".join(f"- {x}" for x in mem_hits)

    _, kb_hits = searcher.query_kb(user_input)
    kb_text = "\n".join(f"- {k['summary']}" for k in kb_hits)

    # Step 2: Ask model if search is truly required
    if ask_should_search(user_input, mem_text, kb_text):
        print("[🌐 Search Triggered]")
        search_query = compose_search_query(user_input, mem_text)
        print(f"[🔎 Composed Query] {search_query}")
        urls = searcher.search_duckduckgo(search_query)
        summaries = searcher.crawl_and_summarize(urls, llm_function=summarize_with_llama)
        searcher.add_to_kb(summaries)
        _, kb_hits = searcher.query_kb(user_input)
        kb_text = "\n".join(f"- {k['summary']}" for k in kb_hits)
    else:
        print("[🔒 Search Skipped]")

    # Step 3: Final answer generation
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_input},
        {"role": "user", "content": f"Memory:\n{mem_text or '[None]'}"},
        {"role": "user", "content": f"Web Knowledge:\n{kb_text or '[None]'}"}
    ]
    full_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    start = time.time()
    output = pipe(full_prompt, max_new_tokens=512)
    elapsed = time.time() - start
    response = output[0]["generated_text"].replace(full_prompt, "").strip()

    if "##MEM:add(" in response:
        try:
            content = response.split("##MEM:add(")[1].split(")")[0].strip('"\'')
            memory.add(content)
            print("[✅ Memory Added]")
        except Exception as e:
            print(f"[⚠️ Failed to add memory]: {e}")

    return response, elapsed
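
# Sketch (assumption): the split-based "##MEM:add(" parser in
# generate_response breaks if the stored text itself contains ")". A lazy
# regex bounded by the closing quote is a more robust alternative; it assumes
# the model quotes its argument, as the SYSTEM_PROMPT format suggests.
import re

MEM_DIRECTIVE = re.compile(r'##MEM:add\(\s*(["\'])(.+?)\1\s*\)')

def extract_memory_directive(response: str):
    """Return the quoted ##MEM:add payload, or None if no directive is found."""
    match = MEM_DIRECTIVE.search(response)
    return match.group(2) if match else None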

# === 💬 REPL Loop ===
if __name__ == "__main__":
    print(f"🚀 Kshama ready in {time.time() - begin_time:.2f}s")
    print("👋 Hello, Abu. Type 'exit' to quit.")
    while True:
        user_input = input("\n🧑 You: ")
        if user_input.strip().lower() in ["exit", "quit"]:
            print("👋 Goodbye.")
            break
        response, delay = generate_response(user_input)
        print(f"\n🤖 ক্ষমা [{delay:.2f}s]: {response}")