# kshama/agent.py

import time
import torch
from transformers import pipeline, AutoTokenizer
from memory import Memory
from web_search_helper import WebSearchHelper

begin_time = time.time()  # track startup time for the REPL ready message

# === 🔧 Load model + tokenizer ===
model_id = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
pipe = pipeline(
    "text-generation",
    model=model_id,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    pad_token_id=128001,  # Llama 3 <|end_of_text|> token id; silences pad-token warnings
)

# === 🔌 Core modules ===
memory = Memory()
searcher = WebSearchHelper()
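
# Assumed interfaces for the local helper modules (memory.py and
# web_search_helper.py are not shown in this file; signatures are inferred
# from how they are used below):
#   Memory.query(text, top_k)                -> list[str] of recalled snippets
#   Memory.add(text)                         -> persist a new memory entry
#   WebSearchHelper.query_kb(text)           -> (_, hits), each hit a dict with a "summary" key
#   WebSearchHelper.search_duckduckgo(query) -> list of result URLs
#   WebSearchHelper.crawl_and_summarize(urls, llm_function) -> summaries of page content
#   WebSearchHelper.add_to_kb(summaries)     -> index new summaries for later recall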

# === 🧭 System behavior instruction ===
SYSTEM_PROMPT = """
You are a personal AI assistant. You are wise, efficient, and intentional.
You can:
- Recall long-term memory and use it to answer.
- Summarize long documents clearly.
- Perform a web search *only if you believe it's necessary*, and clearly state that with ##SEARCH:yes.
You also refine web search queries using what you understand of the user's intent.
Always follow this format:
- ##MEM:add("...") to add memories
- ##SEARCH:yes or ##SEARCH:no on its own line to trigger or skip web search
- After a search: generate a clear answer, using memory and the retrieved summaries
"""

# === 📘 Summarization using main model ===
def summarize_with_llama(text: str) -> str:
    prompt = f"Summarize the following:\n\n{text.strip()}\n\nSummary:"
    output = pipe(prompt, max_new_tokens=256)
    # generated_text echoes the prompt, so strip it to keep only the summary
    return output[0]["generated_text"].replace(prompt, "").strip()

# === 🔍 Ask if search is needed ===
def ask_should_search(user_input, mem_text, kb_text):
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": f"User asked: {user_input}"},
        {"role": "user", "content": f"Memory:\n{mem_text or '[None]'}"},
        {"role": "user", "content": f"Web Knowledge:\n{kb_text or '[None]'}"},
        {"role": "user", "content": "Do you need to search the web to answer this? Reply ##SEARCH:yes or ##SEARCH:no on the first line only."},
    ]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    output = pipe(prompt, max_new_tokens=16)
    # generated_text echoes the prompt; strip it so the first line is the model's
    # reply, then compare case-insensitively against the ##SEARCH marker
    reply = output[0]["generated_text"].replace(prompt, "").strip().lower()
    first_line = reply.splitlines()[0].strip() if reply else ""
    return "##search:yes" in first_line

# === ✍️ Compose better search query ===
def compose_search_query(user_input, mem_text):
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": f"User asked: {user_input}"},
        {"role": "user", "content": f"Relevant memory:\n{mem_text or '[None]'}"},
        {"role": "user", "content": "Rewrite a concise web search query to find useful info. Output only the query string, nothing else."},
    ]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    output = pipe(prompt, max_new_tokens=32)
    # Strip the echoed prompt before taking the first line as the query;
    # fall back to the raw user input if the model returns nothing
    reply = output[0]["generated_text"].replace(prompt, "").strip()
    return reply.splitlines()[0] if reply else user_input

# === 🧠 Main reasoning function ===
def generate_response(user_input: str):
    # Step 1: Recall memory and web KB
    mem_hits = memory.query(user_input, top_k=3)
    mem_text = "\n".join([f"- {x}" for x in mem_hits])
    _, kb_hits = searcher.query_kb(user_input)
    kb_text = "\n".join([f"- {k['summary']}" for k in kb_hits])

    # Step 2: Ask model if search is truly required
    if ask_should_search(user_input, mem_text, kb_text):
        print("[🌐 Search Triggered]")
        search_query = compose_search_query(user_input, mem_text)
        print(f"[🔎 Composed Query] {search_query}")
        urls = searcher.search_duckduckgo(search_query)
        summaries = searcher.crawl_and_summarize(urls, llm_function=summarize_with_llama)
        searcher.add_to_kb(summaries)
        # Re-query the KB so the freshly indexed summaries feed the final answer
        _, kb_hits = searcher.query_kb(user_input)
        kb_text = "\n".join([f"- {k['summary']}" for k in kb_hits])
    else:
        print("[🔒 Search Skipped]")

    # Step 3: Final answer generation
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_input},
        {"role": "user", "content": f"Memory:\n{mem_text or '[None]'}"},
        {"role": "user", "content": f"Web Knowledge:\n{kb_text or '[None]'}"},
    ]
    full_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    start = time.time()
    output = pipe(full_prompt, max_new_tokens=512)
    elapsed = time.time() - start
    response = output[0]["generated_text"].replace(full_prompt, "").strip()

    # Honor a ##MEM:add("...") directive if the model emitted one
    if "##MEM:add(" in response:
        try:
            content = response.split("##MEM:add(")[1].split(")")[0].strip('"\'')
            memory.add(content)
            print("[✅ Memory Added]")
        except Exception as e:
            print(f"[⚠️ Failed to add memory]: {e}")
    return response, elapsed

# === 💬 REPL Loop ===
if __name__ == "__main__":
    print(f"🚀 Kshama ready in {time.time() - begin_time:.2f}s")
    print("👋 Hello, Abu. Type 'exit' to quit.")
    while True:
        user_input = input("\n🧑 You: ")
        if user_input.strip().lower() in ["exit", "quit"]:
            print("👋 Goodbye.")
            break
        response, delay = generate_response(user_input)
        print(f"\n🤖 ক্ষমা [{delay:.2f}s]: {response}")