import time

begin_time = time.time()

import torch
from transformers import pipeline, AutoTokenizer

from memory import Memory
from web_search_helper import WebSearchHelper
from llm_wrapper import LlmWrapper

# Initialize components
memory = Memory()
searcher = WebSearchHelper()
summarizer = LlmWrapper(model_name="Qwen/Qwen3-0.6B")  # optional, used to summarize search results
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")

# Load the main LLM (Llama 3.2 1B Instruct)
model_id = "meta-llama/Llama-3.2-1B-Instruct"
pipe = pipeline(
    "text-generation",
    model=model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# Define system prompt and Kshama's capabilities
SYSTEM_PROMPT = """
You are ক্ষমা, Abu's personal AI assistant. You're helpful, respectful, and aligned with his goals and preferences.
You maintain and query a persistent memory of past interactions and facts via a vector store.

You can:
1. Recall relevant knowledge from memory using semantic similarity.
2. Add new insights to memory when useful.
3. Perform live web searches and summarize results if memory is insufficient.

Structure your outputs clearly:
- Use ##MEM:add(...) to store thoughts to memory.
- Use ##MEM:recall(...) to request a lookup (already handled externally).
- Use ##SEARCH:trigger(...) when memory lacks the answer.

Respond in a clear, friendly tone. Actively use what you know about Abu's past work (e.g., GANs, TensorFlow, Exopid).
"""


def generate_response(user_input: str):
    # Step 1: Recall relevant memory
    recalled = memory.query(user_input, top_k=3)
    memory_context = "\n".join([f"- {item}" for item in recalled])

    # Step 2: Decide whether a live web search is needed, then query the knowledge base
    should_search = searcher.should_trigger_search(text=user_input)
    if should_search:
        urls = searcher.search_duckduckgo(user_input)
        summaries = searcher.crawl_and_summarize(urls, llm_function=summarizer.summarize)
        searcher.add_to_kb(summaries)
    # Query the KB either way; if a search ran, the fresh summaries are already indexed
    _, hits = searcher.query_kb(user_input)
    kb_hits = "\n".join([f"- {h['summary']}" for h in hits])

    # Step 3: Compose structured messages
    context_block = f"""Known facts from memory:
{memory_context or '[None]'}

External knowledge from web:
{kb_hits or '[None]'}
"""

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": f"{context_block}\nUser asked: {user_input}"},
    ]

    # Convert using the model's chat template
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,  # appends the assistant tag if needed
    )

    # Step 4: Call the model
    output = pipe(
        prompt,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        return_full_text=False,  # return only the newly generated text, not the prompt
    )

    # Step 5: Process model output (add to memory if marked)
    response = output[0]["generated_text"].strip()
    if "##MEM:add(" in response:
        try:
            content = response.split("##MEM:add(")[1].split(")")[0].strip('"\'')
            memory.add(content)
            print("[✅ Memory Added]")
        except Exception:
            print("[⚠️ Couldn't parse memory add]")

    return response


# 💬 REPL for testing
if __name__ == "__main__":
    print(f"Time elapsed: {time.time() - begin_time:.2f} seconds")
    print("👋 Welcome to Kshama. Type 'exit' to leave.")
    while True:
        user_input = input("\n🧑 You: ")
        if user_input.strip().lower() in ["exit", "quit"]:
            break
        response = generate_response(user_input)
        print(f"\n🤖 ক্ষমা: {response}")