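"""Kshama (ক্ষমা): a small personal AI assistant script for Abu.

It combines a persistent vector-store memory, DuckDuckGo web search with
LLM-summarized results, and a local Llama 3.2 1B Instruct model behind a
simple REPL. Memory, WebSearchHelper, and LlmWrapper are local helper
modules and are assumed to expose the methods used below.
"""
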
import time

begin_time = time.time()

import torch
from transformers import pipeline, AutoTokenizer

from memory import Memory
from web_search_helper import WebSearchHelper
from llm_wrapper import LlmWrapper

# Initialize components
memory = Memory()
searcher = WebSearchHelper()
summarizer = LlmWrapper(model_name="Qwen/Qwen3-0.6B")  # optional: used to summarize web search results

# Load the main LLM (Llama 3.2 1B Instruct) and its tokenizer
model_id = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
pipe = pipeline(
    "text-generation",
    model=model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# Define the system prompt and Kshama's capabilities
SYSTEM_PROMPT = """
You are ক্ষমা, Abu's personal AI assistant. You're helpful, respectful, and aligned with his goals and preferences.
You maintain and query a persistent memory of past interactions and facts via a vector store.
You can:
1. Recall relevant knowledge from memory using semantic similarity.
2. Add new insights to memory when useful.
3. Perform live web searches and summarize the results if memory is insufficient.
Structure your outputs clearly:
- Use ##MEM:add(...) to store thoughts to memory.
- Use ##MEM:recall(...) to request a lookup (already handled externally).
- Use ##SEARCH:trigger(...) when memory lacks the answer.
Respond in a clear, friendly tone. Actively use what you know about Abu's past work (e.g., GANs, TensorFlow, Exopid).
"""


def generate_response(user_input: str):
    # Step 1: Recall relevant memory
    recalled = memory.query(user_input, top_k=3)
    memory_context = "\n".join([f"- {item}" for item in recalled])

    # Step 2: Decide whether a live web search is needed, then query the local knowledge base
    should_search = searcher.should_trigger_search(text=user_input)
    if should_search:
        urls = searcher.search_duckduckgo(user_input)
        summaries = searcher.crawl_and_summarize(urls, llm_function=summarizer.summarize)
        searcher.add_to_kb(summaries)
    # Both the search and no-search paths end with the same KB lookup, so it is done once here
    _, hits = searcher.query_kb(user_input)
    kb_hits = "\n".join([f"- {h['summary']}" for h in hits])

    # Step 3: Compose structured messages
    context_block = f"""Known facts from memory:
{memory_context or '[None]'}

External knowledge from web:
{kb_hits or '[None]'}
"""

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": f"{context_block}\nUser asked: {user_input}"},
    ]

    # Convert the messages into a single prompt string using the model's chat template
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,  # appends the assistant header so the model answers as the assistant
    )

    # Step 4: Call the model
    output = pipe(
        prompt,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        return_full_text=False,  # return only the new completion, not the echoed prompt
    )

    # Step 5: Process model output (add to memory if marked)
    response = output[0]["generated_text"].strip()
if "##MEM:add(" in response:
|
||
try:
|
||
content = response.split("##MEM:add(")[1].split(")")[0].strip('"\'')
|
||
memory.add(content)
|
||
print("[✅ Memory Added]")
|
||
except:
|
||
print("[⚠️ Couldn't parse memory add]")
|
||
|
||
return response
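

# Outside the REPL below, generate_response can also be called directly, e.g.:
#     reply = generate_response("Summarize what we know about Exopid.")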


# 💬 REPL for testing
if __name__ == "__main__":
    print(f"Startup time: {time.time() - begin_time:.2f} seconds")
    print("👋 Welcome to Kshama. Type 'exit' to leave.")
    while True:
        user_input = input("\n🧑 You: ")
        if user_input.strip().lower() in ["exit", "quit"]:
            break
        response = generate_response(user_input)
        print(f"\n🤖 ক্ষমা: {response}")