Streamline system prompts, rename functions for clarity, and improve search query generation. Fix memory query edge cases and enhance robustness when no indexed data exists. Minor wording adjustments and structure improvements for better maintainability.

import time

import torch
from transformers import pipeline, AutoTokenizer

from memory import Memory
from web_search_helper import WebSearchHelper

begin_time = time.time()

# === 🔧 Load model + tokenizer ===
model_id = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
pipe = pipeline(
    "text-generation",
    model=model_id,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    pad_token_id=128001
)
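# Note (assumption): 128001 is Llama 3's <|end_of_text|> token id; pinning
# pad_token_id up front avoids the pipeline's warning about an unset pad token.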

# === 🔌 Core modules ===
memory = Memory()
searcher = WebSearchHelper()
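
# Sketch (assumption): the commit message mentions hardening memory queries
# when no indexed data exists. Memory.query's behavior on an empty index is
# not shown here, so this defensive wrapper only illustrates one way to
# degrade gracefully; it is not the project's actual fix.
def safe_memory_query(query: str, top_k: int = 3) -> list:
    try:
        return memory.query(query, top_k=top_k) or []
    except Exception:
        # An empty or uninitialized index should mean "no recall", not a crash.
        return []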

# === 🧭 System behavior instruction ===
SYSTEM_PROMPT = """
You are a personal AI assistant. You're wise, efficient, and intentional.

You can:
- Recall long-term memory and use it to answer.
- Summarize long documents clearly.
- Perform web search *only if you believe it's necessary*, and clearly state that with ##SEARCH:yes.

You also refine web search queries using what you understand of the user's intent.
Always follow this format:
- ##MEM:add("...") to add memories
- ##SEARCH:yes or ##SEARCH:no on its own line to trigger or skip web search
- After search: generate a clear answer, using memory and the retrieved summaries
"""

# === 📘 Summarization using main model ===
def summarize_with_llama(text: str) -> str:
    prompt = f"Summarize the following:\n\n{text.strip()}\n\nSummary:"
    output = pipe(prompt, max_new_tokens=256)
    # The text-generation pipeline echoes the prompt, so strip it before
    # returning the summary.
    return output[0]["generated_text"].replace(prompt, "").strip()

# === 🔍 Ask if search is needed ===
def ask_should_search(user_input, mem_text, kb_text):
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": f"User asked: {user_input}"},
        {"role": "user", "content": f"Memory:\n{mem_text or '[None]'}"},
        {"role": "user", "content": f"Web Knowledge:\n{kb_text or '[None]'}"},
        {"role": "user", "content": "Do you need to search the web to answer this? Reply ##SEARCH:yes or ##SEARCH:no on the first line only."}
    ]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    output = pipe(prompt, max_new_tokens=16)
    # Strip the echoed prompt before reading the reply; otherwise the first
    # line inspected below is the first line of the prompt, not the answer.
    reply = output[0]["generated_text"].replace(prompt, "").strip().lower()
    if not reply:
        return False
    # The reply is lowercased, so match the lowercase directive; the previous
    # check for "##SEARCH:yes" could never succeed against a lowercased string.
    return "##search:yes" in reply.splitlines()[0]

# === ✍️ Compose better search query ===
def compose_search_query(user_input, mem_text):
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": f"User asked: {user_input}"},
        {"role": "user", "content": f"Relevant memory:\n{mem_text or '[None]'}"},
        {"role": "user", "content": "Rewrite a concise web search query to find useful info. Output only the query string, nothing else."}
    ]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    output = pipe(prompt, max_new_tokens=32)
    # Strip the echoed prompt, then take the first line; fall back to the raw
    # user input if the model produced nothing usable.
    query = output[0]["generated_text"].replace(prompt, "").strip()
    return query.splitlines()[0] if query else user_input

# === 🧠 Main reasoning function ===
def generate_response(user_input: str):
    # Step 1: Recall memory and web KB
    mem_hits = memory.query(user_input, top_k=3)
    mem_text = "\n".join(f"- {x}" for x in mem_hits)

    _, kb_hits = searcher.query_kb(user_input)
    kb_text = "\n".join(f"- {k['summary']}" for k in kb_hits)

    # Step 2: Ask model if search is truly required
    if ask_should_search(user_input, mem_text, kb_text):
        print("[🌐 Search Triggered]")
        search_query = compose_search_query(user_input, mem_text)
        print(f"[🔎 Composed Query] {search_query}")
        urls = searcher.search_duckduckgo(search_query)
        summaries = searcher.crawl_and_summarize(urls, llm_function=summarize_with_llama)
        searcher.add_to_kb(summaries)
        _, kb_hits = searcher.query_kb(user_input)
        kb_text = "\n".join(f"- {k['summary']}" for k in kb_hits)
    else:
        print("[🔒 Search Skipped]")

    # Step 3: Final answer generation
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_input},
        {"role": "user", "content": f"Memory:\n{mem_text or '[None]'}"},
        {"role": "user", "content": f"Web Knowledge:\n{kb_text or '[None]'}"}
    ]
    full_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    start = time.time()
    output = pipe(full_prompt, max_new_tokens=512)
    elapsed = time.time() - start
    response = output[0]["generated_text"].replace(full_prompt, "").strip()

    if "##MEM:add(" in response:
        try:
            content = response.split("##MEM:add(")[1].split(")")[0].strip('"\'')
            memory.add(content)
            print("[✅ Memory Added]")
        except Exception as e:
            print(f"[⚠️ Failed to add memory]: {e}")

    return response, elapsed
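
# Sketch (assumption): the split-based "##MEM:add(" parser in
# generate_response breaks if the stored text itself contains ")". A lazy
# regex bounded by the closing quote is a more robust alternative; it assumes
# the model quotes its argument, as the SYSTEM_PROMPT format suggests.
import re

MEM_DIRECTIVE = re.compile(r'##MEM:add\(\s*(["\'])(.+?)\1\s*\)')

def extract_memory_directive(response: str):
    """Return the quoted ##MEM:add payload, or None if no directive is found."""
    match = MEM_DIRECTIVE.search(response)
    return match.group(2) if match else None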

# === 💬 REPL Loop ===
if __name__ == "__main__":
    print(f"🚀 Kshama ready in {time.time() - begin_time:.2f}s")
    print("👋 Hello, Abu. Type 'exit' to quit.")
    while True:
        user_input = input("\n🧑 You: ")
        if user_input.strip().lower() in ["exit", "quit"]:
            print("👋 Goodbye.")
            break
        response, delay = generate_response(user_input)
        print(f"\n🤖 ক্ষমা [{delay:.2f}s]: {response}")