From 185d5dc866a476764b6bf3b62a8aabf782c9a2ee Mon Sep 17 00:00:00 2001 From: sufian Date: Wed, 9 Jul 2025 01:14:05 +0600 Subject: [PATCH] million-labs added --- million-labs.ipynb | 950 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 950 insertions(+) create mode 100644 million-labs.ipynb diff --git a/million-labs.ipynb b/million-labs.ipynb new file mode 100644 index 0000000..d38ecfc --- /dev/null +++ b/million-labs.ipynb @@ -0,0 +1,950 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "initial_id", + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } + }, + "outputs": [], + "source": [ + "# PyTorch for implementing LLM (No GPU)\n", + "import torch\n", + "\n", + "# Neural network modules and functions from PyTorch\n", + "from torch import nn\n", + "from torch.nn import functional as F\n", + "\n", + "# NumPy for numerical operations\n", + "import numpy as np\n", + "\n", + "# Matplotlib for plotting Loss etc.\n", + "from matplotlib import pyplot as plt\n", + "\n", + "# Time module for tracking execution time\n", + "import time\n", + "\n", + "# Pandas for data manipulation and analysis\n", + "import pandas as pd\n", + "\n", + "# urllib for handling URL requests (Downloading Dataset)\n", + "import urllib.request" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ded604c1a4e80155", + "metadata": {}, + "outputs": [], + "source": [ + "torch.set_default_device('cuda')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "97afd4d11e5d5391", + "metadata": {}, + "outputs": [], + "source": [ + "# Configuration object for model parameters\n", + "MASTER_CONFIG = {\n", + " # Adding parameters later\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e4c23d3282151461", + "metadata": {}, + "outputs": [], + "source": [ + "# The URL of the raw text file on GitHub\n", + "url = \"https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt\"\n", + "\n", + "# The file name for local storage\n", + "file_name = \"tinyshakespeare.txt\"\n", + "\n", + "# Execute the download\n", + "urllib.request.urlretrieve(url, file_name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1f50a516659363", + "metadata": {}, + "outputs": [], + "source": [ + "# Read the content of the dataset\n", + "lines = open(\"tinyshakespeare.txt\", 'r').read()\n", + "\n", + "# Create a sorted list of unique characters in the dataset\n", + "vocab = sorted(list(set(lines)))\n", + "\n", + "# Display the first 10 characters in the vocabulary list\n", + "print('Printing the first 10 characters of the vocab list:', vocab[:10])\n", + "\n", + "# Output the total number of characters in our dataset (Vocabulary Size)\n", + "print('Total number of characters in our dataset (Vocabulary Size):', len(vocab))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5e898838024633e3", + "metadata": {}, + "outputs": [], + "source": [ + "# Mapping integers to characters (itos)\n", + "itos = {i: ch for i, ch in enumerate(vocab)}\n", + "\n", + "# Mapping characters to integers (stoi)\n", + "stoi = {ch: i for i, ch in enumerate(vocab)}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3a75bf8384dae76a", + "metadata": {}, + "outputs": [], + "source": [ + "# Encode function: Converts a string to a list of integers using the mapping stoi\n", + "def encode(s):\n", + " return [stoi[ch] for ch in s]\n", + "\n", + "# Decode function: Converts 
a list of integers back to a string using the mapping itos\n", + "def decode(l):\n", + " return ''.join([itos[i] for i in l])\n", + "\n", + "# Example: Encode the string \"hello\" and then decode the result\n", + "decode(encode(\"morning\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7179c91e90c32ec6", + "metadata": {}, + "outputs": [], + "source": [ + "# Convert the dataset into a torch tensor with specified data type (dtype)\n", + "dataset = torch.tensor(encode(lines), dtype=torch.int8)\n", + "\n", + "# adding the vocab size\n", + "MASTER_CONFIG = {\n", + " \"vocab_size\": len(vocab),\n", + "}\n", + "\n", + "# Display the shape of the resulting tensor\n", + "print(dataset.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2d6c3c9d114ed620", + "metadata": {}, + "outputs": [], + "source": [ + "# Function to get batches for training, validation, or testing\n", + "def get_batches(data, split, batch_size, context_window, config=MASTER_CONFIG):\n", + " # Split the dataset into training, validation, and test sets\n", + " train = data[:int(.8 * len(data))]\n", + " val = data[int(.8 * len(data)): int(.9 * len(data))]\n", + " test = data[int(.9 * len(data)):]\n", + "\n", + " # Determine which split to use\n", + " batch_data = train\n", + " if split == 'val':\n", + " batch_data = val\n", + " if split == 'test':\n", + " batch_data = test\n", + "\n", + " # Pick random starting points within the data\n", + " ix = torch.randint(0, batch_data.size(0) - context_window - 1, (batch_size,))\n", + "\n", + " # Create input sequences (x) and corresponding target sequences (y)\n", + " x = torch.stack([batch_data[i:i+context_window] for i in ix]).long()\n", + " y = torch.stack([batch_data[i+1:i+context_window+1] for i in ix]).long()\n", + "\n", + " return x, y" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a61c3a0dd3339a42", + "metadata": {}, + "outputs": [], + "source": [ + "# Update the MASTER_CONFIG with batch_size and context_window parameters\n", + "MASTER_CONFIG.update({\n", + " 'batch_size': 8, # Number of batches to be processed at each random split\n", + " 'context_window': 16 # Number of characters in each input (x) and target (y) sequence of each batch\n", + "})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bf9bdacdfc815807", + "metadata": {}, + "outputs": [], + "source": [ + "# Obtain batches for training using the specified batch size and context window\n", + "xs, ys = get_batches(dataset, 'train', MASTER_CONFIG['batch_size'], MASTER_CONFIG['context_window'])\n", + "\n", + "# Decode the sequences to obtain the corresponding text representations\n", + "decoded_samples = [(decode(xs[i].tolist()), decode(ys[i].tolist())) for i in range(len(xs))]\n", + "\n", + "# Print the random sample\n", + "print(decoded_samples)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bfbe5a734e783ac6", + "metadata": {}, + "outputs": [], + "source": [ + "@torch.no_grad() # Don't compute gradients for this function\n", + "def evaluate_loss(model, config=MASTER_CONFIG):\n", + " # Placeholder for the evaluation results\n", + " out = {}\n", + "\n", + " # Set the model to evaluation mode\n", + " model.eval()\n", + "\n", + " # Iterate through training and validation splits\n", + " for split in [\"train\", \"val\"]:\n", + " # Placeholder for individual losses\n", + " losses = []\n", + "\n", + " # Generate 10 batches for evaluation\n", + " for _ in range(10):\n", + " # Get input sequences (xb) and 
target sequences (yb)\n", + " xb, yb = get_batches(dataset, split, config['batch_size'], config['context_window'])\n", + "\n", + " # Perform model inference and calculate the loss\n", + " _, loss = model(xb, yb)\n", + "\n", + " # Append the loss to the list\n", + " losses.append(loss.item())\n", + "\n", + " # Calculate the mean loss for the split and store it in the output dictionary\n", + " out[split] = np.mean(losses)\n", + "\n", + " # Set the model back to training mode\n", + " model.train()\n", + "\n", + " return out" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a606f9ba323d4389", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5a8930d6083f2930", + "metadata": {}, + "outputs": [], + "source": [ + "# Function to perform training\n", + "def train(model, optimizer, scheduler=None, config=MASTER_CONFIG, print_logs=False):\n", + " # Placeholder for storing losses\n", + " losses = []\n", + "\n", + " # Start tracking time\n", + " start_time = time.time()\n", + "\n", + " # Iterate through epochs\n", + " for epoch in range(config['epochs']):\n", + " # Zero out gradients\n", + " optimizer.zero_grad()\n", + "\n", + " # Obtain batches for training\n", + " xs, ys = get_batches(dataset, 'train', config['batch_size'], config['context_window'])\n", + "\n", + " # Forward pass through the model to calculate logits and loss\n", + " logits, loss = model(xs, targets=ys)\n", + "\n", + " # Backward pass and optimization step\n", + " loss.backward()\n", + " optimizer.step()\n", + "\n", + " # If a learning rate scheduler is provided, adjust the learning rate\n", + " if scheduler:\n", + " scheduler.step()\n", + "\n", + " # Log progress every specified interval\n", + " if epoch % config['log_interval'] == 0:\n", + " # Calculate batch time\n", + " batch_time = time.time() - start_time\n", + "\n", + " # Evaluate loss on validation set\n", + " x = evaluate_loss(model)\n", + "\n", + " # Store the validation loss\n", + " losses += [x]\n", + "\n", + " # Print progress logs if specified\n", + " if print_logs:\n", + " print(f\"Epoch {epoch} | val loss {x['val']:.3f} | Time {batch_time:.3f} | ETA in seconds {batch_time * (config['epochs'] - epoch)/config['log_interval'] :.3f}\")\n", + "\n", + " # Reset the timer\n", + " start_time = time.time()\n", + "\n", + " # Print learning rate if a scheduler is provided\n", + " if scheduler:\n", + " print(\"lr: \", scheduler.get_lr())\n", + "\n", + " # Print the final validation loss\n", + " print(\"Validation loss: \", losses[-1]['val'])\n", + "\n", + " # Plot the training and validation loss curves\n", + " return pd.DataFrame(losses).plot()\n", + "\n", + "# Execute the training process\n", + "# train(model, optimizer)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3889f433a82e96e5", + "metadata": {}, + "outputs": [], + "source": [ + "class RMSNorm(nn.Module):\n", + " def __init__(self, layer_shape, eps=1e-8, bias=False):\n", + " super(RMSNorm, self).__init__()\n", + "\n", + " # Registering a learnable parameter 'scale' as a parameter of the module\n", + " self.register_parameter(\"scale\", nn.Parameter(torch.ones(layer_shape)))\n", + "\n", + " def forward(self, x):\n", + " \"\"\"\n", + " Assumes shape is (batch, seq_len, d_model)\n", + " \"\"\"\n", + " # Calculating the Frobenius norm, RMS = 1/sqrt(N) * Frobenius norm\n", + " ff_rms = torch.linalg.norm(x, dim=(1,2)) * x[0].numel() ** -.5\n", + "\n", + " # Normalizing the input tensor 'x' with respect to 
RMS\n", + " raw = x / ff_rms.unsqueeze(-1).unsqueeze(-1)\n", + "\n", + " # Scaling the normalized tensor using the learnable parameter 'scale'\n", + " return self.scale[:x.shape[1], :].unsqueeze(0) * raw" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a232f8fb217d8a09", + "metadata": {}, + "outputs": [], + "source": [ + "def get_rotary_matrix(context_window, embedding_dim):\n", + " # Initialize a tensor for the rotary matrix with zeros\n", + " R = torch.zeros((context_window, embedding_dim, embedding_dim), requires_grad=False)\n", + "\n", + " # Loop through each position in the context window\n", + " for position in range(context_window):\n", + " # Loop through each dimension in the embedding\n", + " for i in range(embedding_dim // 2):\n", + " # Calculate the rotation angle (theta) based on the position and embedding dimension\n", + " theta = 10000. ** (-2. * (i - 1) / embedding_dim)\n", + " # Calculate the rotated matrix elements using sine and cosine functions\n", + " m_theta = position * theta\n", + " R[position, 2 * i, 2 * i] = np.cos(m_theta)\n", + " R[position, 2 * i, 2 * i + 1] = -np.sin(m_theta)\n", + " R[position, 2 * i + 1, 2 * i] = np.sin(m_theta)\n", + " R[position, 2 * i + 1, 2 * i + 1] = np.cos(m_theta)\n", + " return R" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "10aedc140006fa95", + "metadata": {}, + "outputs": [], + "source": [ + "class RoPEMaskedAttentionHead(nn.Module):\n", + " def __init__(self, config):\n", + " super().__init__()\n", + " self.config = config\n", + " # Linear transformation for query\n", + " self.w_q = nn.Linear(config['d_model'], config['d_model'], bias=False)\n", + " # Linear transformation for key\n", + " self.w_k = nn.Linear(config['d_model'], config['d_model'], bias=False)\n", + " # Linear transformation for value\n", + " self.w_v = nn.Linear(config['d_model'], config['d_model'], bias=False)\n", + " # Obtain rotary matrix for positional embeddings\n", + " self.R = get_rotary_matrix(config['context_window'], config['d_model'])\n", + "\n", + " def get_rotary_matrix(context_window, embedding_dim):\n", + " # Initialize a tensor for the rotary matrix with zeros\n", + " R = torch.zeros((context_window, embedding_dim, embedding_dim), requires_grad=False)\n", + "\n", + " # Loop through each position in the context window\n", + " for position in range(context_window):\n", + " # Loop through each dimension in the embedding\n", + " for i in range(embedding_dim // 2):\n", + " # Calculate the rotation angle (theta) based on the position and embedding dimension\n", + " theta = 10000. ** (-2. 
* (i - 1) / embedding_dim)\n",
+    "                # Calculate the rotated matrix elements using sine and cosine functions\n",
+    "                m_theta = position * theta\n",
+    "                R[position, 2 * i, 2 * i] = np.cos(m_theta)\n",
+    "                R[position, 2 * i, 2 * i + 1] = -np.sin(m_theta)\n",
+    "                R[position, 2 * i + 1, 2 * i] = np.sin(m_theta)\n",
+    "                R[position, 2 * i + 1, 2 * i + 1] = np.cos(m_theta)\n",
+    "        return R\n",
+    "\n",
+    "    def forward(self, x, return_attn_weights=False):\n",
+    "        # x: input tensor of shape (batch, sequence length, dimension)\n",
+    "\n",
+    "        b, m, d = x.shape # batch size, sequence length, dimension\n",
+    "\n",
+    "        # Linear transformations for Q, K, and V\n",
+    "        q = self.w_q(x)\n",
+    "        k = self.w_k(x)\n",
+    "        v = self.w_v(x)\n",
+    "\n",
+    "        # Rotate Q and K using the RoPE matrix\n",
+    "        q_rotated = (torch.bmm(q.transpose(0, 1), self.R[:m])).transpose(0, 1)\n",
+    "        k_rotated = (torch.bmm(k.transpose(0, 1), self.R[:m])).transpose(0, 1)\n",
+    "\n",
+    "        # Perform scaled dot-product attention\n",
+    "        activations = F.scaled_dot_product_attention(\n",
+    "            q_rotated, k_rotated, v, dropout_p=0.1, is_causal=True\n",
+    "        )\n",
+    "\n",
+    "        if return_attn_weights:\n",
+    "            # Create a causal attention mask\n",
+    "            attn_mask = torch.tril(torch.ones((m, m)), diagonal=0)\n",
+    "            # Calculate attention weights and add causal mask\n",
+    "            attn_weights = torch.bmm(q_rotated, k_rotated.transpose(1, 2)) / np.sqrt(d) + attn_mask\n",
+    "            attn_weights = F.softmax(attn_weights, dim=-1)\n",
+    "            return activations, attn_weights\n",
+    "\n",
+    "        return activations"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f62b1d235695764c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class RoPEMaskedMultiheadAttention(nn.Module):\n",
+    "    def __init__(self, config):\n",
+    "        super().__init__()\n",
+    "        self.config = config\n",
+    "        # Create a list of RoPEMaskedAttentionHead instances as attention heads\n",
+    "        self.heads = nn.ModuleList([\n",
+    "            RoPEMaskedAttentionHead(config) for _ in range(config['n_heads'])\n",
+    "        ])\n",
+    "        self.linear = nn.Linear(config['n_heads'] * config['d_model'], config['d_model']) # Linear layer after concatenating heads\n",
+    "        self.dropout = nn.Dropout(.1) # Dropout layer\n",
+    "\n",
+    "    def forward(self, x):\n",
+    "        # x: input tensor of shape (batch, sequence length, dimension)\n",
+    "\n",
+    "        # Process each attention head and concatenate the results\n",
+    "        heads = [h(x) for h in self.heads]\n",
+    "        x = torch.cat(heads, dim=-1)\n",
+    "\n",
+    "        # Apply linear transformation to the concatenated output\n",
+    "        x = self.linear(x)\n",
+    "\n",
+    "        # Apply dropout\n",
+    "        x = self.dropout(x)\n",
+    "        return x"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e9236f037781b069",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Update the master configuration with the number of attention heads\n",
+    "MASTER_CONFIG.update({\n",
+    "    'n_heads': 8,\n",
+    "})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ec0ca4d23bdd9fd4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class RopeModel(nn.Module):\n",
+    "    def __init__(self, config):\n",
+    "        super().__init__()\n",
+    "        self.config = config\n",
+    "\n",
+    "        # Embedding layer for input tokens\n",
+    "        self.embedding = nn.Embedding(config['vocab_size'], config['d_model'])\n",
+    "\n",
+    "        # RMSNorm layer for pre-normalization\n",
+    "        self.rms = RMSNorm((config['context_window'], config['d_model']))\n",
+    "\n",
+    "        # RoPEMaskedMultiheadAttention layer\n",
+    "        self.rope_attention = 
RoPEMaskedMultiheadAttention(config)\n", + "\n", + " # Linear layer followed by ReLU activation\n", + " self.linear = nn.Sequential(\n", + " nn.Linear(config['d_model'], config['d_model']),\n", + " nn.ReLU(),\n", + " )\n", + "\n", + " # Final linear layer for prediction\n", + " self.last_linear = nn.Linear(config['d_model'], config['vocab_size'])\n", + "\n", + " print(\"model params:\", sum([m.numel() for m in self.parameters()]))\n", + "\n", + " def forward(self, idx, targets=None):\n", + " # idx: input indices\n", + " x = self.embedding(idx)\n", + "\n", + " # One block of attention\n", + " x = self.rms(x) # RMS pre-normalization\n", + " x = x + self.rope_attention(x)\n", + "\n", + " x = self.rms(x) # RMS pre-normalization\n", + " x = x + self.linear(x)\n", + "\n", + " logits = self.last_linear(x)\n", + "\n", + " if targets is not None:\n", + " loss = F.cross_entropy(logits.view(-1, self.config['vocab_size']), targets.view(-1))\n", + " return logits, loss\n", + "\n", + " else:\n", + " return logits" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "95271a5b0b29b43c", + "metadata": {}, + "outputs": [], + "source": [ + "# Create an instance of RopeModel (RMSNorm, RoPE, Multi-Head)\n", + "model = RopeModel(MASTER_CONFIG)\n", + "\n", + "# Obtain batches for training\n", + "xs, ys = get_batches(dataset, 'train', MASTER_CONFIG['batch_size'], MASTER_CONFIG['context_window'])\n", + "\n", + "# Calculate logits and loss using the model\n", + "logits, loss = model(xs, ys)\n", + "\n", + "# Define the Adam optimizer for model parameters\n", + "optimizer = torch.optim.Adam(model.parameters())\n", + "\n", + "# Train the model\n", + "train(model, optimizer)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5b1afff55705a505", + "metadata": {}, + "outputs": [], + "source": [ + "# Updating training configuration with more epochs and a logging interval\n", + "MASTER_CONFIG.update({\n", + " \"epochs\": 5000,\n", + " \"log_interval\": 10,\n", + "})\n", + "\n", + "# Training the model with the updated configuration\n", + "train(model, optimizer)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "44e3b30f92614023", + "metadata": {}, + "outputs": [], + "source": [ + "class SwiGLU(nn.Module):\n", + " \"\"\"\n", + " Swish-Gated Linear Unit\n", + " https://arxiv.org/pdf/2002.05202v1.pdf\n", + " \"\"\"\n", + " def __init__(self, size):\n", + " super().__init__()\n", + " self.linear_gate = nn.Linear(size, size)\n", + " self.linear = nn.Linear(size, size)\n", + " self.beta = torch.randn(1, requires_grad=True)\n", + "\n", + " self.beta = nn.Parameter(torch.ones(1))\n", + " self.register_parameter(\"beta\", self.beta)\n", + "\n", + " def forward(self, x):\n", + " swish_gate = self.linear_gate(x) * torch.sigmoid(self.beta * self.linear_gate(x))\n", + " out = swish_gate * self.linear(x)\n", + " return out" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3b46c7a9b8e6da8e", + "metadata": {}, + "outputs": [], + "source": [ + "class RopeModel(nn.Module):\n", + " def __init__(self, config):\n", + " super().__init__()\n", + " self.config = config\n", + "\n", + " # Embedding layer for input tokens\n", + " self.embedding = nn.Embedding(config['vocab_size'], config['d_model'])\n", + "\n", + " # RMSNorm layer for pre-normalization\n", + " self.rms = RMSNorm((config['context_window'], config['d_model']))\n", + "\n", + " # Multi-head attention layer with RoPE (Rotary Positional Embeddings)\n", + " self.rope_attention = 
RoPEMaskedMultiheadAttention(config)\n", + "\n", + " # Linear layer followed by SwiGLU activation\n", + " self.linear = nn.Sequential(\n", + " nn.Linear(config['d_model'], config['d_model']),\n", + " SwiGLU(config['d_model']), # Adding SwiGLU activation\n", + " )\n", + "\n", + " # Output linear layer\n", + " self.last_linear = nn.Linear(config['d_model'], config['vocab_size'])\n", + "\n", + " # Printing total model parameters\n", + " print(\"model params:\", sum([m.numel() for m in self.parameters()]))\n", + "\n", + " def forward(self, idx, targets=None):\n", + " x = self.embedding(idx)\n", + "\n", + " # One block of attention\n", + " x = self.rms(x) # RMS pre-normalization\n", + " x = x + self.rope_attention(x)\n", + "\n", + " x = self.rms(x) # RMS pre-normalization\n", + " x = x + self.linear(x) # Applying SwiGLU activation\n", + "\n", + " logits = self.last_linear(x)\n", + "\n", + " if targets is not None:\n", + " # Calculate cross-entropy loss if targets are provided\n", + " loss = F.cross_entropy(logits.view(-1, self.config['vocab_size']), targets.view(-1))\n", + " return logits, loss\n", + "\n", + " else:\n", + " return logits" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f8548a11ec0c40be", + "metadata": {}, + "outputs": [], + "source": [ + "# Create an instance of RopeModel (RMSNorm, RoPE, Multi-Head, SwiGLU)\n", + "model = RopeModel(MASTER_CONFIG)\n", + "\n", + "# Obtain batches for training\n", + "xs, ys = get_batches(dataset, 'train', MASTER_CONFIG['batch_size'], MASTER_CONFIG['context_window'])\n", + "\n", + "# Calculate logits and loss using the model\n", + "logits, loss = model(xs, ys)\n", + "\n", + "# Define the Adam optimizer for model parameters\n", + "optimizer = torch.optim.Adam(model.parameters())\n", + "\n", + "# Train the model\n", + "train(model, optimizer)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2622420706bbc513", + "metadata": {}, + "outputs": [], + "source": [ + "# Update model configurations for the number of layers\n", + "MASTER_CONFIG.update({\n", + " 'n_layers': 4, # Set the number of layers to 4\n", + "})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e7b49bf56c2f8b4b", + "metadata": {}, + "outputs": [], + "source": [ + "# add RMSNorm and residual conncection\n", + "class LlamaBlock(nn.Module):\n", + " def __init__(self, config):\n", + " super().__init__()\n", + " self.config = config\n", + "\n", + " self.rms = RMSNorm((config['context_window'], config['d_model']))\n", + "\n", + " self.attention = RoPEMaskedMultiheadAttention(config)\n", + " self.feedforward = nn.Sequential(\n", + " nn.Linear(config['d_model'], config['d_model']),\n", + " SwiGLU(config['d_model']),\n", + " )\n", + "\n", + " def forward(self, x):\n", + " x = self.rms(x) # rms pre-normalization\n", + " x = x + self.attention(x)\n", + "\n", + " x = self.rms(x) # rms pre-normalization\n", + " x = x + self.feedforward(x)\n", + " return x" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "431beed0a02fc772", + "metadata": {}, + "outputs": [], + "source": [ + "MASTER_CONFIG" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa931e6781403fcc", + "metadata": {}, + "outputs": [], + "source": [ + "block = LlamaBlock(MASTER_CONFIG)\n", + "block(torch.randn(MASTER_CONFIG['batch_size'], MASTER_CONFIG['context_window'], MASTER_CONFIG['d_model']));" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e26360bd91af5bae", + "metadata": {}, + "outputs": [], + 
"source": [ + "# Create an instance of the LlamaBlock class with the provided configuration\n", + "block = LlamaBlock(MASTER_CONFIG)\n", + "\n", + "# Generate a random tensor with the specified batch size, context window, and model dimension\n", + "random_input = torch.randn(MASTER_CONFIG['batch_size'], MASTER_CONFIG['context_window'], MASTER_CONFIG['d_model'])\n", + "\n", + "# Apply the LlamaBlock to the random input tensor\n", + "output = block(random_input)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f84fb6ee09aadea5", + "metadata": {}, + "outputs": [], + "source": [ + "from collections import OrderedDict\n", + "\n", + "MASTER_CONFIG.update({\n", + " 'n_layers': 4,\n", + "})\n", + "\n", + "class Llama(nn.Module):\n", + " def __init__(self, config):\n", + " super().__init__()\n", + " self.config = config\n", + " # Embedding layer for token representations\n", + " self.embeddings = nn.Embedding(config['vocab_size'], config['d_model'])\n", + " # Sequential block of LlamaBlocks based on the specified number of layers\n", + " self.llama_blocks = nn.Sequential(\n", + " OrderedDict([(f\"llama_{i}\", LlamaBlock(config)) for i in range(config['n_layers'])])\n", + " )\n", + " # Feedforward network (FFN) for final output\n", + " self.ffn = nn.Sequential(\n", + " nn.Linear(config['d_model'], config['d_model']),\n", + " SwiGLU(config['d_model']),\n", + " nn.Linear(config['d_model'], config['vocab_size']),\n", + " )\n", + "\n", + " # Print total number of parameters in the model\n", + " print(\"model params:\", sum([m.numel() for m in self.parameters()]))\n", + "\n", + " def forward(self, idx, targets=None):\n", + " # Input token indices are passed through the embedding layer\n", + " x = self.embeddings(idx)\n", + " # Process the input through the LlamaBlocks\n", + " x = self.llama_blocks(x)\n", + " # Pass the processed input through the final FFN for output logits\n", + " logits = self.ffn(x)\n", + "\n", + " # If targets are not provided, return only the logits\n", + " if targets is None:\n", + " return logits\n", + " # If targets are provided, compute and return the cross-entropy loss\n", + " else:\n", + " loss = F.cross_entropy(logits.view(-1, self.config['vocab_size']), targets.view(-1))\n", + " return logits, loss" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a0ff7f505eddaefd", + "metadata": {}, + "outputs": [], + "source": [ + "# Create an instance of RopeModel (RMSNorm, RoPE, Multi-Head, SwiGLU, N_layers)\n", + "llama = Llama(MASTER_CONFIG)\n", + "\n", + "# Obtain batches for training\n", + "xs, ys = get_batches(dataset, 'train', MASTER_CONFIG['batch_size'], MASTER_CONFIG['context_window'])\n", + "\n", + "# Calculate logits and loss using the model\n", + "logits, loss = llama(xs, ys)\n", + "\n", + "# Define the Adam optimizer for model parameters\n", + "optimizer = torch.optim.Adam(llama.parameters())\n", + "\n", + "# Train the model\n", + "train(llama, optimizer)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ca7605a6b36f613d", + "metadata": {}, + "outputs": [], + "source": [ + "# Update the number of epochs in the configuration\n", + "MASTER_CONFIG.update({\n", + " 'epochs': 10000,\n", + "})\n", + "# Train the LLaMA model for the specified number of epochs\n", + "train(llama, optimizer, scheduler=None, config=MASTER_CONFIG)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + 
"name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}