Streamlined agent initialization, separating components for clarity. Improved memory, web search, and response generation logic for better structure and efficiency. Added elapsed time display and refined interactive REPL messaging.
113 lines
4.3 KiB
Python
113 lines
4.3 KiB
Python
import re
import time

import torch
from transformers import pipeline, AutoTokenizer

from memory import Memory
from web_search_helper import WebSearchHelper
|
|
|
|
# Initialize clock
|
|
begin_time = time.time()  # taken before the model load so the REPL's boot time includes it

# 🔧 Load model and tokenizer (Llama3.2:1B)
model_id = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
pipe = pipeline(
    "text-generation",
    model=model_id,
    # Hand over the tokenizer loaded above instead of letting the pipeline
    # load a second copy: one load at boot, and the chat template used to
    # build prompts is guaranteed to belong to the tokenizer that encodes them.
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    pad_token_id=128001,  # Suppress warnings
)

# 🧩 Agent components
memory = Memory()  # long-term memory store, queried and extended per turn
searcher = WebSearchHelper()  # web search, crawling and the web knowledge base
|
|
|
|
# 🧭 System prompt (Kshama's persona + capabilities)
|
|
# The tag spellings below are matched literally elsewhere in this file
# ("##search:yes" after lower-casing, and "##MEM:add("), so keep them in sync
# with that parsing code when editing the prompt.
SYSTEM_PROMPT = """
You are ক্ষমা, Abu's personal AI assistant. You're helpful, respectful, and aligned with his goals and preferences.
You can:
1. Recall relevant information from long-term memory.
2. Decide whether to perform a web search if the memory lacks necessary detail.
3. Summarize text clearly when requested.

You use these tags:
- ##MEM:add("...") to store information in memory.
- ##SEARCH:yes if a web search is needed.
- ##SEARCH:no if memory is sufficient.

Be concise but friendly. Don't suggest a search unless it is clearly needed.
"""
|
|
|
|
# 📝 Wrapper: summarize text with Llama
|
|
def summarize_with_llama(text):
    """Summarize a block of webpage text with the module-level Llama pipeline.

    Args:
        text: Raw page text; surrounding whitespace is trimmed before it is
            placed in the prompt.

    Returns:
        The generated text with the prompt removed and whitespace stripped.
        Sampling is enabled (temperature 0.7), so output varies between calls.
    """
    page_text = text.strip()
    prompt = f"Summarize the following webpage text:\n\n{page_text}\n\nSummary:"

    generations = pipe(prompt, max_new_tokens=256, do_sample=True, temperature=0.7)
    generated = generations[0]["generated_text"]

    # Remove the prompt from the generated text so only the summary remains.
    summary = generated.replace(prompt, "")
    return summary.strip()
|
|
|
|
# 🎯 Ask model if it needs web search
|
|
def should_search(user_input, memory_hits, kb_hits):
    """Ask the model whether a web search is needed to answer the user.

    Args:
        user_input: The user's question.
        memory_hits: Pre-formatted text of recalled memories; a falsy value
            is shown to the model as "[None]".
        kb_hits: Pre-formatted text of web knowledge-base entries; a falsy
            value is shown to the model as "[None]".

    Returns:
        True if the model's reply contains the ##SEARCH:yes tag
        (case-insensitive), otherwise False.
    """
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": f"User asked: {user_input}"},
        {"role": "user", "content": f"Known memory:\n{memory_hits or '[None]'}"},
        {"role": "user", "content": f"Web knowledge:\n{kb_hits or '[None]'}"},
        {"role": "user", "content": "Do you need more information to answer this? Reply with ##SEARCH:yes or ##SEARCH:no."},
    ]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # return_full_text=False is essential: by default the pipeline returns the
    # prompt followed by the completion, and the prompt itself contains
    # "##SEARCH:yes" (in the system prompt and in the final instruction), so
    # checking the full text would answer True on every call.
    output = pipe(prompt, max_new_tokens=32, do_sample=False, return_full_text=False)
    reply = output[0]["generated_text"].strip().lower()
    return "##search:yes" in reply
|
|
|
|
# 🧠 Core reasoning + memory loop
|
|
# Opening tag of the memory directive the system prompt teaches the model.
_MEM_TAG = "##MEM:add("

# A quoted note directly after the tag, e.g. "..." or '...', followed by ")".
# The closing quote must match the opening one and be followed by ")", so a
# ")" inside the quoted note does not end it early.
_QUOTED_NOTE_RE = re.compile(r"""\s*(["'])(.*?)\1\s*\)""", re.DOTALL)


def _format_bullets(items):
    """Render items as newline-separated "- item" lines ("" when empty)."""
    return "\n".join(f"- {item}" for item in items)


def _extract_memory_note(response):
    """Return the note of the first ##MEM:add(...) directive, or None.

    None means the response has no directive or the directive's note is empty.
    """
    tag_at = response.find(_MEM_TAG)
    if tag_at == -1:
        return None
    tail = response[tag_at + len(_MEM_TAG):]

    quoted = _QUOTED_NOTE_RE.match(tail)
    if quoted:
        note = quoted.group(2)
    else:
        # Unquoted or malformed directive: take everything up to the first
        # ")" and drop any stray surrounding quotes.
        note = tail.split(")")[0].strip("\"'")

    note = note.strip()
    return note or None


def generate_response(user_input: str):
    """Answer one user turn using memory, the web knowledge base and the model.

    Steps: recall memory and knowledge-base entries, let the model decide
    whether to run a web search (and refresh the knowledge base if so),
    generate the answer, then store a memory if the answer carries a
    ##MEM:add(...) directive.

    Args:
        user_input: The user's message.

    Returns:
        A (response, elapsed) tuple: the generated answer text (directive tags
        are left in place) and the seconds spent in the final generation call.
    """
    # Step 1: recall memory + web KB
    memory_hits = memory.query(user_input, top_k=3)
    mem_text = _format_bullets(memory_hits)

    _, kb = searcher.query_kb(user_input, top_k=3)
    kb_text = _format_bullets(entry["summary"] for entry in kb)

    # Step 2: let Kshama decide if she wants to search
    if should_search(user_input, mem_text, kb_text):
        urls = searcher.search_duckduckgo(user_input)
        summaries = searcher.crawl_and_summarize(urls, llm_function=summarize_with_llama)
        searcher.add_to_kb(summaries)
        # NOTE(review): unlike the first lookup this one passes no top_k, so
        # it uses query_kb's default — confirm whether top_k=3 was intended.
        _, kb = searcher.query_kb(user_input)
        kb_text = _format_bullets(entry["summary"] for entry in kb)

    # Step 3: compose final answer prompt
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_input},
        {"role": "user", "content": f"Relevant memory:\n{mem_text or '[None]'}"},
        {"role": "user", "content": f"Web knowledge:\n{kb_text or '[None]'}"},
    ]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Step 4: generate final response. return_full_text=False makes the
    # pipeline return only the completion, so the prompt does not have to be
    # removed from the output afterwards.
    start = time.time()
    output = pipe(
        prompt,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        return_full_text=False,
    )
    elapsed = time.time() - start
    response = output[0]["generated_text"].strip()

    # Step 5: parse memory intent
    if _MEM_TAG in response:
        note = _extract_memory_note(response)
        if note is None:
            print("[⚠️ Could not parse memory directive]")
        else:
            # Storing a memory is best-effort: a failing store must not lose
            # the answer, but Ctrl-C / SystemExit must still propagate.
            try:
                memory.add(note)
            except Exception as exc:
                print(f"[⚠️ Could not store memory: {exc}]")
            else:
                print("[✅ Memory Added]")

    return response, elapsed
|
|
|
|
# 🧪 Interactive loop
|
|
if __name__ == "__main__":
    # Interactive REPL: read a line, answer it, repeat until the user leaves.
    print(f"🚀 Booted in {time.time() - begin_time:.2f}s")
    print("👋 Welcome to Kshama. Type 'exit' to quit.")
    while True:
        try:
            user_input = input("\n🧑 You: ")
        except (EOFError, KeyboardInterrupt):
            # Ctrl-D / Ctrl-C at the prompt ends the session without a traceback.
            print()
            break

        command = user_input.strip()
        if command.lower() in ("exit", "quit"):
            break
        if not command:
            # Nothing typed: re-prompt instead of running the model on an
            # empty message.
            continue

        response, t = generate_response(user_input)
        print(f"\n🤖 ক্ষমা [{t:.2f}s]: {response}")
|