Refactor agent initialization and response workflow
Streamlined agent initialization by separating model loading from the agent components. Restructured the memory, web-search, and response-generation flow: the model now decides via ##SEARCH:yes/no whether a live search is needed, and search results are summarized with the main Llama model instead of a separate Qwen summarizer. Added elapsed-time display and refined the interactive REPL messaging.
parent 4b5921d829
commit d845c29e81

agent.py | 144
@@ -1,108 +1,112 @@
 import time
-begin_time = time.time()
 
 import torch
 from transformers import pipeline, AutoTokenizer
 from memory import Memory
 from web_search_helper import WebSearchHelper
-from llm_wrapper import LlmWrapper
 
-# Initialize components
-memory = Memory()
-searcher = WebSearchHelper()
-summarizer = LlmWrapper(model_name="Qwen/Qwen3-0.6B") # optional, could summarize search results
+# Initialize clock
+begin_time = time.time()
 
-tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
-# Load your main LLM (Llama 3.2:1B-Instruct)
+# 🔧 Load model and tokenizer (Llama3.2:1B)
 model_id = "meta-llama/Llama-3.2-1B-Instruct"
+tokenizer = AutoTokenizer.from_pretrained(model_id)
 pipe = pipeline(
     "text-generation",
     model=model_id,
     torch_dtype=torch.bfloat16,
     device_map="auto",
+    pad_token_id=128001 # Suppress warnings
 )
 
-# Define system prompt and Kshama's capabilities
+# 🧩 Agent components
+memory = Memory()
+searcher = WebSearchHelper()
+
+# 🧭 System prompt (Kshama's persona + capabilities)
 SYSTEM_PROMPT = """
 You are ক্ষমা, Abu's personal AI assistant. You're helpful, respectful, and aligned with his goals and preferences.
-You maintain and query a persistent memory of past interactions and facts via a vector store.
 You can:
-1. Recall relevant knowledge from memory using semantic similarity.
-2. Add new insights to memory when useful.
-3. Perform live web searches and summarize results if memory is insufficient.
-Structure your outputs clearly:
-- Use ##MEM:add(...) to store thoughts to memory.
-- Use ##MEM:recall(...) to request a lookup (already handled externally).
-- Use ##SEARCH:trigger(...) when memory lacks the answer.
-Respond in a clear, friendly tone. Actively use what you know about Abu’s past work (e.g., GANs, TensorFlow, Exopid).
+1. Recall relevant information from long-term memory.
+2. Decide whether to perform a web search if the memory lacks necessary detail.
+3. Summarize text clearly when requested.
+You use these tags:
+- ##MEM:add("...") to store information in memory.
+- ##SEARCH:yes if a web search is needed.
+- ##SEARCH:no if memory is sufficient.
+
+Be concise but friendly. Don't suggest a search unless it is clearly needed.
 """
 
-def generate_response(user_input: str):
-    # Step 1: Recall relevant memory
-    recalled = memory.query(user_input, top_k=3)
-    memory_context = "\n".join([f"- {item}" for item in recalled])
-
-    # Step 2: Evaluate recall quality
-    should_search = searcher.should_trigger_search(text=user_input)
-    kb_hits = ""
-    if should_search:
-        urls = searcher.search_duckduckgo(user_input)
-        summaries = searcher.crawl_and_summarize(urls, llm_function=summarizer.summarize)
-        searcher.add_to_kb(summaries)
-        _, hits = searcher.query_kb(user_input)
-        kb_hits = "\n".join([f"- {h['summary']}" for h in hits])
-    else:
-        _, hits = searcher.query_kb(user_input)
-        kb_hits = "\n".join([f"- {h['summary']}" for h in hits])
-
-    # Step 3: Compose structured messages
-    context_block = f"""Known facts from memory:
-{memory_context or '[None]'}
-
-External knowledge from web:
-{kb_hits or '[None]'}
-"""
-
+# 📝 Wrapper: summarize text with Llama
+def summarize_with_llama(text):
+    prompt = f"Summarize the following webpage text:\n\n{text.strip()}\n\nSummary:"
+    output = pipe(prompt, max_new_tokens=256, do_sample=True, temperature=0.7)
+    return output[0]["generated_text"].replace(prompt, "").strip()
+
+# 🎯 Ask model if it needs web search
+def should_search(user_input, memory_hits, kb_hits):
     messages = [
         {"role": "system", "content": SYSTEM_PROMPT},
-        {"role": "user", "content": f"{context_block}\nUser asked: {user_input}"},
+        {"role": "user", "content": f"User asked: {user_input}"},
+        {"role": "user", "content": f"Known memory:\n{memory_hits or '[None]'}"},
+        {"role": "user", "content": f"Web knowledge:\n{kb_hits or '[None]'}"},
+        {"role": "user", "content": "Do you need more information to answer this? Reply with ##SEARCH:yes or ##SEARCH:no."},
     ]
+    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    output = pipe(prompt, max_new_tokens=32, do_sample=False)
+    reply = output[0]["generated_text"].strip().lower()
+    return "##search:yes" in reply
 
-    # Convert using chat template
-    prompt = tokenizer.apply_chat_template(
-        messages,
-        tokenize=False,
-        add_generation_prompt=True # appends assistant tag if needed
-    )
-
-    # Step 4: Call the model
-    output = pipe(
-        prompt,
-        max_new_tokens=512,
-        do_sample=True,
-        temperature=0.7,
-    )
-
-    # Step 5: Process model output (add to memory if marked)
-    response = output[0]["generated_text"].strip()
+# 🧠 Core reasoning + memory loop
+def generate_response(user_input: str):
+    # Step 1: recall memory + web KB
+    memory_hits = memory.query(user_input, top_k=3)
+    mem_text = "\n".join([f"- {x}" for x in memory_hits])
+
+    _, kb = searcher.query_kb(user_input, top_k=3)
+    kb_text = "\n".join([f"- {x['summary']}" for x in kb])
+
+    # Step 2: let Kshama decide if she wants to search
+    if should_search(user_input, mem_text, kb_text):
+        urls = searcher.search_duckduckgo(user_input)
+        summaries = searcher.crawl_and_summarize(urls, llm_function=summarize_with_llama)
+        searcher.add_to_kb(summaries)
+        _, kb = searcher.query_kb(user_input)
+        kb_text = "\n".join([f"- {x['summary']}" for x in kb])
+
+    # Step 3: Compose final answer prompt
+    messages = [
+        {"role": "system", "content": SYSTEM_PROMPT},
+        {"role": "user", "content": f"{user_input}"},
+        {"role": "user", "content": f"Relevant memory:\n{mem_text or '[None]'}"},
+        {"role": "user", "content": f"Web knowledge:\n{kb_text or '[None]'}"}
+    ]
+    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+
+    # Step 4: generate final response
+    start = time.time()
+    output = pipe(prompt, max_new_tokens=512, do_sample=True, temperature=0.7)
+    elapsed = time.time() - start
+    response = output[0]["generated_text"].replace(prompt, "").strip()
 
+    # Step 5: parse memory intent
     if "##MEM:add(" in response:
         try:
-            content = response.split("##MEM:add(")[1].split(")")[0].strip('"\'')
-            memory.add(content)
+            thought = response.split("##MEM:add(")[1].split(")")[0].strip('"\'')
+            memory.add(thought)
             print("[✅ Memory Added]")
         except:
-            print("[⚠️ Couldn't parse memory add]")
+            print("[⚠️ Could not parse memory directive]")
 
-    return response
+    return response, elapsed
 
-# 💬 REPL for testing
+# 🧪 Interactive loop
 if __name__ == "__main__":
-    print(f"Time elapsed: {time.time() - begin_time:.2f} seconds")
-    print("👋 Welcome to Kshama. Type 'exit' to leave.")
+    print(f"🚀 Booted in {time.time() - begin_time:.2f}s")
+    print("👋 Welcome to Kshama. Type 'exit' to quit.")
     while True:
         user_input = input("\n🧑 You: ")
         if user_input.strip().lower() in ["exit", "quit"]: break
-        response = generate_response(user_input)
-        print(f"\n🤖 ক্ষমা: {response}")
+        response, t = generate_response(user_input)
+        print(f"\n🤖 ক্ষমা [{t:.2f}s]: {response}")
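Note: the rewritten agent.py depends on two local modules, memory.py and web_search_helper.py, that this commit does not touch. As a rough guide to the interface it appears to assume, here is a minimal in-memory sketch; method names and return shapes are inferred from the call sites above, while the keyword matching and placeholder bodies are assumptions, not the real vector-store or DuckDuckGo implementations.

class Memory:
    def __init__(self):
        self._items = []

    def add(self, text: str) -> None:
        # Store a fact verbatim (the real class presumably embeds it first).
        self._items.append(text)

    def query(self, text: str, top_k: int = 3) -> list:
        # Naive keyword overlap as a stand-in for semantic similarity.
        words = set(text.lower().split())
        ranked = sorted(self._items,
                        key=lambda item: len(words & set(item.lower().split())),
                        reverse=True)
        return ranked[:top_k]


class WebSearchHelper:
    def __init__(self):
        self._kb = []  # list of {"summary": ...} dicts

    def search_duckduckgo(self, query: str) -> list:
        # Placeholder: the real helper returns result URLs for the query.
        return []

    def crawl_and_summarize(self, urls: list, llm_function=None) -> list:
        # Placeholder: the real helper presumably fetches each URL and runs
        # llm_function over the page text, returning {"summary": ...} dicts.
        return []

    def add_to_kb(self, summaries: list) -> None:
        self._kb.extend(summaries)

    def query_kb(self, query: str, top_k: int = 3):
        # agent.py unpacks two values and only reads hit["summary"] from the
        # second; the first element is ignored there.
        hits = self._kb[:top_k]
        return query, hits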
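One caveat with should_search() as committed: the text-generation pipeline returns the prompt together with the completion by default, and the prompt itself contains the literal text "##SEARCH:yes", so the substring check can fire on the echoed instructions rather than on the model's answer. A possible tightening, reusing the same .replace(prompt, "") pattern the file already applies in summarize_with_llama() and in the final-response step; this is a sketch, not part of the commit.

# Drop-in variant of should_search(); only the `reply` line changes.
def should_search(user_input, memory_hits, kb_hits):
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": f"User asked: {user_input}"},
        {"role": "user", "content": f"Known memory:\n{memory_hits or '[None]'}"},
        {"role": "user", "content": f"Web knowledge:\n{kb_hits or '[None]'}"},
        {"role": "user", "content": "Do you need more information to answer this? Reply with ##SEARCH:yes or ##SEARCH:no."},
    ]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    output = pipe(prompt, max_new_tokens=32, do_sample=False)
    # Strip the echoed prompt first; it contains the literal "##SEARCH:yes"
    # and would otherwise make the check succeed on every call.
    reply = output[0]["generated_text"].replace(prompt, "").strip().lower()
    return "##search:yes" in reply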
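Similarly, the ##MEM:add(...) parsing splits on the first ")", which truncates any stored thought that itself contains a closing parenthesis, and the bare except swallows every error. A slightly more defensive variant, again only a sketch and assuming the model emits the quoted form documented in SYSTEM_PROMPT:

import re

# Matches the quoted form ##MEM:add("..."); inner parentheses are fine
# because the closing paren is anchored on the closing quote.
MEM_ADD_RE = re.compile(r'##MEM:add\(\s*["\'](.*?)["\']\s*\)', re.DOTALL)

def extract_mem_add(response: str):
    """Return the payload of the first ##MEM:add("...") directive, or None."""
    match = MEM_ADD_RE.search(response)
    return match.group(1).strip() if match else None

# Usage inside generate_response, replacing the split/except block:
#     thought = extract_mem_add(response)
#     if thought:
#         memory.add(thought)
#         print("[✅ Memory Added]")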