import time

load_start_time = time.time()  # start timing before the heavy import

from transformers import pipeline

# Initialize the model
model_id = "meta-llama/Llama-3.2-1B-Instruct"
pipe = pipeline(
    "text-generation",
    model=model_id,
    torch_dtype="auto",   # or torch.bfloat16 if your GPU supports it
    device_map="auto",
    pad_token_id=128001,  # <|end_of_text|>, one of the model's eos tokens
)

# System prompt (optional)
SYSTEM_PROMPT = "You are a helpful assistant. Keep responses brief and clear."

print(f"Model loaded in {time.time() - load_start_time:.2f} seconds")
print("👋 Kshama is listening. Type 'exit' to quit.\n")

while True:
    user_input = input("🧑 You: ")
    if user_input.strip().lower() == "exit":
        print("👋 Goodbye!")
        break

    # Pass chat messages instead of a hand-rolled prompt string: the
    # pipeline applies Llama 3's chat template for us, so we avoid
    # made-up special tokens like <|system|> (which Llama 3 does not
    # use) and fragile string-replace parsing of the output.
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_input},
    ]

    start_time = time.time()
    output = pipe(messages, max_new_tokens=256, do_sample=True, temperature=0.7)
    elapsed = time.time() - start_time

    # With chat input, generated_text is the conversation with the
    # assistant's reply appended as the final message.
    response = output[0]["generated_text"][-1]["content"].strip()

    print(f"🤖 ক্ষমা [{elapsed:.2f}s]: {response}\n")
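As written, the loop is stateless: each turn sends only the newest user message, so the model remembers nothing from earlier exchanges. Below is a minimal sketch of a multi-turn variant, assuming the same `pipe` and `SYSTEM_PROMPT` defined above; the history-carrying `messages` list is an illustrative addition, not something the original script does.

# Sketch (assumes `pipe` and SYSTEM_PROMPT from above): carry history
# across turns by feeding the whole conversation back each time.
messages = [{"role": "system", "content": SYSTEM_PROMPT}]

while True:
    user_input = input("🧑 You: ")
    if user_input.strip().lower() == "exit":
        print("👋 Goodbye!")
        break

    messages.append({"role": "user", "content": user_input})
    output = pipe(messages, max_new_tokens=256, do_sample=True, temperature=0.7)

    # The pipeline returns the conversation with the assistant's reply
    # appended; keep it so the next turn has full context.
    messages = output[0]["generated_text"]
    print(f"🤖 ক্ষমা: {messages[-1]['content'].strip()}\n")

Because the full history is re-sent on every turn, long sessions will eventually hit the model's context window, at which point you would need to trim or summarize older turns.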