diff --git a/helper_functions.py b/helper_functions.py new file mode 100644 index 0000000..a06b780 --- /dev/null +++ b/helper_functions.py @@ -0,0 +1,170 @@ +import os +import urllib.request +from tqdm import tqdm +import json +import numpy as np +import tensorflow as tf +import torch + +def download_file(url, destination): + # Send a GET request to download the file + + try: + with urllib.request.urlopen(url) as response: + # Get the total file size from headers, defaulting to 0 if not present + file_size = int(response.headers.get("Content-Length", 0)) + + # Check if file exists and has the same size + if os.path.exists(destination): + file_size_local = os.path.getsize(destination) + if file_size == file_size_local: + print(f"File already exists and is up-to-date: {destination}") + return + + # Define the block size for reading the file + block_size = 1024 # 1 Kilobyte + + # Initialize the progress bar with total file size + progress_bar_description = os.path.basename(url) # Extract filename from URL + with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar: + # Open the destination file in binary write mode + with open(destination, "wb") as file: + # Read the file in chunks and write to destination + while True: + chunk = response.read(block_size) + if not chunk: + break + file.write(chunk) + progress_bar.update(len(chunk)) # Update progress bar + except urllib.error.HTTPError: + s = ( + f"The specified URL ({url}) is incorrect, the internet connection cannot be established," + "\nor the requested file is temporarily unavailable.\nPlease visit the following website" + " for help: https://github.com/rasbt/LLMs-from-scratch/discussions/273") + print(s) + +def load_gpt2_params_from_tf_ckpt(ckpt_path, settings): + # Initialize parameters dictionary with empty blocks for each layer + params = {"blocks": [{} for _ in range(settings["n_layer"])]} + + # Iterate over each variable in the checkpoint + for name, _ in 
tf.train.list_variables(ckpt_path): + # Load the variable and remove singleton dimensions + variable_array = np.squeeze(tf.train.load_variable(ckpt_path, name)) + + # Process the variable name to extract relevant parts + variable_name_parts = name.split("/")[1:] # Skip the 'model/' prefix + + # Identify the target dictionary for the variable + target_dict = params + if variable_name_parts[0].startswith("h"): + layer_number = int(variable_name_parts[0][1:]) + target_dict = params["blocks"][layer_number] + + # Recursively access or create nested dictionaries + for key in variable_name_parts[1:-1]: + target_dict = target_dict.setdefault(key, {}) + + # Assign the variable array to the last key + last_key = variable_name_parts[-1] + target_dict[last_key] = variable_array + + return params + + +def download_and_load_gpt2(model_size, models_dir): + # Validate model size + allowed_sizes = ("124M", "355M", "774M", "1558M") + if model_size not in allowed_sizes: + raise ValueError(f"Model size not in {allowed_sizes}") + + # Define paths + model_dir = os.path.join(models_dir, model_size) + base_url = "https://openaipublic.blob.core.windows.net/gpt-2/models" + filenames = [ + "checkpoint", "encoder.json", "hparams.json", + "model.ckpt.data-00000-of-00001", "model.ckpt.index", + "model.ckpt.meta", "vocab.bpe" + ] + + # Download files + os.makedirs(model_dir, exist_ok=True) + for filename in filenames: + file_url = os.path.join(base_url, model_size, filename) + file_path = os.path.join(model_dir, filename) + download_file(file_url, file_path) + + # Load settings and params + tf_ckpt_path = tf.train.latest_checkpoint(model_dir) + settings = json.load(open(os.path.join(model_dir, "hparams.json"))) + params = load_gpt2_params_from_tf_ckpt(tf_ckpt_path, settings) + + return settings, params + + +def assign(left, right): + if left.shape != right.shape: + raise ValueError(f"Shape mismatch. 
Left: {left.shape}, Right: {right.shape}") + return torch.nn.Parameter(torch.tensor(right)) + + +def load_weights_into_gpt(gpt, params): + gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params['wpe']) + gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params['wte']) + + for b in range(len(params["blocks"])): + q_w, k_w, v_w = np.split( + (params["blocks"][b]["attn"]["c_attn"])["w"], 3, axis=-1) + gpt.trf_blocks[b].att.W_query.weight = assign( + gpt.trf_blocks[b].att.W_query.weight, q_w.T) + gpt.trf_blocks[b].att.W_key.weight = assign( + gpt.trf_blocks[b].att.W_key.weight, k_w.T) + gpt.trf_blocks[b].att.W_value.weight = assign( + gpt.trf_blocks[b].att.W_value.weight, v_w.T) + + q_b, k_b, v_b = np.split( + (params["blocks"][b]["attn"]["c_attn"])["b"], 3, axis=-1) + gpt.trf_blocks[b].att.W_query.bias = assign( + gpt.trf_blocks[b].att.W_query.bias, q_b) + gpt.trf_blocks[b].att.W_key.bias = assign( + gpt.trf_blocks[b].att.W_key.bias, k_b) + gpt.trf_blocks[b].att.W_value.bias = assign( + gpt.trf_blocks[b].att.W_value.bias, v_b) + + gpt.trf_blocks[b].att.out_proj.weight = assign( + gpt.trf_blocks[b].att.out_proj.weight, + params["blocks"][b]["attn"]["c_proj"]["w"].T) + gpt.trf_blocks[b].att.out_proj.bias = assign( + gpt.trf_blocks[b].att.out_proj.bias, + params["blocks"][b]["attn"]["c_proj"]["b"]) + + gpt.trf_blocks[b].ff.layers[0].weight = assign( + gpt.trf_blocks[b].ff.layers[0].weight, + params["blocks"][b]["mlp"]["c_fc"]["w"].T) + gpt.trf_blocks[b].ff.layers[0].bias = assign( + gpt.trf_blocks[b].ff.layers[0].bias, + params["blocks"][b]["mlp"]["c_fc"]["b"]) + gpt.trf_blocks[b].ff.layers[2].weight = assign( + gpt.trf_blocks[b].ff.layers[2].weight, + params["blocks"][b]["mlp"]["c_proj"]["w"].T) + gpt.trf_blocks[b].ff.layers[2].bias = assign( + gpt.trf_blocks[b].ff.layers[2].bias, + params["blocks"][b]["mlp"]["c_proj"]["b"]) + + gpt.trf_blocks[b].norm1.scale = assign( + gpt.trf_blocks[b].norm1.scale, + params["blocks"][b]["ln_1"]["g"]) + gpt.trf_blocks[b].norm1.shift = 
assign( + gpt.trf_blocks[b].norm1.shift, + params["blocks"][b]["ln_1"]["b"]) + gpt.trf_blocks[b].norm2.scale = assign( + gpt.trf_blocks[b].norm2.scale, + params["blocks"][b]["ln_2"]["g"]) + gpt.trf_blocks[b].norm2.shift = assign( + gpt.trf_blocks[b].norm2.shift, + params["blocks"][b]["ln_2"]["b"]) + + gpt.final_norm.scale = assign(gpt.final_norm.scale, params["g"]) + gpt.final_norm.shift = assign(gpt.final_norm.shift, params["b"]) + gpt.out_head.weight = assign(gpt.out_head.weight, params["wte"]) + diff --git a/lab6.ipynb b/lab6.ipynb index 869f12c..436ca54 100644 --- a/lab6.ipynb +++ b/lab6.ipynb @@ -2,34 +2,26 @@ "cells": [ { "cell_type": "markdown", - "id": "12e91914-5f51-43fa-b65b-625e73b4d17b", + "id": "c2520ec3-722f-4f44-bdd1-885b13e7afbf", "metadata": { - "id": "12e91914-5f51-43fa-b65b-625e73b4d17b" + "id": "c2520ec3-722f-4f44-bdd1-885b13e7afbf" }, "source": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "
\n", - "\n", - "Supplementary code for the Build a Large Language Model From Scratch book by Sebastian Raschka
\n", - "
Code repository: https://github.com/rasbt/LLMs-from-scratch\n", - "
\n", - "
\n", - "\n", - "
" + "# Lab 6: Finetuning To Follow Instructions" ] }, { "cell_type": "markdown", - "id": "c2520ec3-722f-4f44-bdd1-885b13e7afbf", - "metadata": { - "id": "c2520ec3-722f-4f44-bdd1-885b13e7afbf" - }, + "id": "2626e3f5", + "metadata": {}, "source": [ - "# Chapter 7: Finetuning To Follow Instructions" + "This lab covers:\n", + "- Preparing a dataset with instructions (instruction + response pairs)\n", + "- Finetuning the LLM to follow instructions\n", + "- Evaluation of the LLM\n", + "\n", + "\n", + "" ] }, { @@ -48,11 +40,11 @@ "name": "stdout", "output_type": "stream", "text": [ - "matplotlib version: 3.7.1\n", - "tiktoken version: 0.7.0\n", - "torch version: 2.4.0\n", - "tqdm version: 4.66.4\n", - "tensorflow version: 2.15.0\n" + "matplotlib version: 3.8.2\n", + "tiktoken version: 0.8.0\n", + "torch version: 2.2.0\n", + "tqdm version: 4.66.2\n", + "tensorflow version: 2.18.0\n" ] } ], @@ -77,7 +69,7 @@ "id": "264fca98-2f9a-4193-b435-2abfa3b4142f" }, "source": [ - "" + "" ] }, { @@ -87,7 +79,7 @@ "id": "8bbc68e9-75b3-41f1-ac2c-e071c3cd0813" }, "source": [ - "## 7.1 Introduction to instruction finetuning" + "## 1 Introduction to instruction finetuning" ] }, { @@ -97,9 +89,9 @@ "id": "53dba24a-6805-496c-9a7f-c75e2d3527ab" }, "source": [ - "- In chapter 5, we saw that pretraining an LLM involves a training procedure where it learns to generate one word at a time\n", + "- In lab 4, we saw that pretraining an LLM involves a training procedure where it learns to generate one word at a time\n", "- Hence, a pretrained LLM is good at text completion, but it is not good at following instructions\n", - "- In this chapter, we teach the LLM to follow instructions better" + "- In this lab, we teach the LLM to follow instructions better" ] }, { @@ -109,19 +101,7 @@ "id": "18dc0535-0904-44ed-beaf-9b678292ef35" }, "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "id": "b4698b23-12e0-4bd7-a140-ccb3dd71d4e8", - "metadata": { - "id": "b4698b23-12e0-4bd7-a140-ccb3dd71d4e8" - }, 
- "source": [ - "- The topics covered in this chapter are summarized in the figure below\n", - "\n", - "" + "" ] }, { @@ -131,7 +111,7 @@ "id": "5384f0cf-ef3c-4436-a5fa-59bd25649f86" }, "source": [ - "## 7.2 Preparing a dataset for supervised instruction finetuning" + "## 2 Preparing a dataset for supervised instruction finetuning" ] }, { @@ -141,7 +121,7 @@ "id": "f8b34ff8-619f-4e89-bd03-ce513269760d" }, "source": [ - "- We will work with an instruction dataset I prepared for this chapter" + "- We will work with an instruction dataset that was already prepared in advance" ] }, { @@ -285,7 +265,7 @@ "id": "dffa4f70-44d4-4be4-89a9-2159f4885b10" }, "source": [ - "" + "" ] }, { @@ -295,7 +275,7 @@ "id": "dd79a74e-befb-491c-be49-f777a6a5b6a6" }, "source": [ - "- In this chapter, we use Alpaca-style prompt formatting, which was the original prompt template for instruction finetuning\n", + "- In this lab, we use Alpaca-style prompt formatting, which was the original prompt template for instruction finetuning\n", "- Below, we format the input that we will pass as input to the LLM" ] }, @@ -472,7 +452,7 @@ "id": "fcaaf606-f913-4445-8301-632ae10d387d" }, "source": [ - "## 7.3 Organizing data into training batches" + "## 3 Organizing data into training batches" ] }, { @@ -482,7 +462,7 @@ "id": "233f63bd-9755-4d07-8884-5e2e5345cf27" }, "source": [ - "" + "" ] }, { @@ -494,7 +474,7 @@ "source": [ "- We tackle this dataset batching in several steps, as summarized in the figure below\n", "\n", - "" + "" ] }, { @@ -504,9 +484,9 @@ "id": "b9af423f-aad9-4b3c-bea5-153021c04862" }, "source": [ - "- First, we implement an `InstructionDataset` class that pre-tokenizes all inputs in the dataset, similar to the `SpamDataset` in chapter 6\n", + "- First, we implement an `InstructionDataset` class that pre-tokenizes all inputs in the dataset, similar to the `SpamDataset` in lab 5\n", "\n", - "" + "" ] }, { @@ -550,8 +530,8 @@ "id": "384f0e69-4b22-41c0-a25d-f077527eddd1" }, "source": [ - "- 
Similar to chapter 6, we want to collect multiple training examples in a batch to accelerate training; this requires padding all inputs to a similar length\n", - "- Also similar to the previous chapter, we use the `<|endoftext|>` token as a padding token" + "- Similar to lab 5, we want to collect multiple training examples in a batch to accelerate training; this requires padding all inputs to a similar length\n", + "- Also similar to the previous lab, we use the `<|endoftext|>` token as a padding token" ] }, { @@ -588,7 +568,7 @@ "id": "9e5bd7bc-f347-4cf8-a0c2-94cb8799e427" }, "source": [ - "- In chapter 6, we padded all examples in a dataset to the same length\n", + "- In lab 5, we padded all examples in a dataset to the same length\n", " - Here, we take a more sophisticated approach and develop a custom \"collate\" function that we can pass to the data loader\n", " - This custom collate function pads the training examples in each batch to have the same length (but different batches can have different lengths)" ] @@ -600,7 +580,7 @@ "id": "65c4d943-4aa8-4a44-874e-05bc6831fbd3" }, "source": [ - "" + "" ] }, { @@ -688,7 +668,7 @@ "id": "c46832ab-39b7-45f8-b330-ac9adfa10d1b" }, "source": [ - "" + "" ] }, { @@ -709,7 +689,7 @@ "id": "0386b6fe-3455-4e70-becd-a5a4681ba2ef" }, "source": [ - "" + "" ] }, { @@ -792,7 +772,7 @@ "source": [ "- Next, we introduce an `ignore_index` value to replace all padding token IDs with a new value; the purpose of this `ignore_index` is that we can ignore padding values in the loss function (more on that later)\n", "\n", - "\n", + "\n", "\n", "- Concretely, this means that we replace the token IDs corresponding to `50256` with `-100` as illustrated below" ] @@ -804,7 +784,7 @@ "id": "bd4bed33-956e-4b3f-a09c-586d8203109a" }, "source": [ - "" + "" ] }, { @@ -911,7 +891,7 @@ }, "source": [ "- Let's see what this replacement by -100 accomplishes\n", - "- For illustration purposes, let's assume we have a small classification task with 2 class 
labels, 0 and 1, similar to chapter 6\n", + "- For illustration purposes, let's assume we have a small classification task with 2 class labels, 0 and 1, similar to lab 5\n", "- If we have the following logits values (outputs of the last layer of the model), we calculate the following loss" ] }, @@ -1058,7 +1038,7 @@ "id": "fab8f0ed-80e8-4fd9-bf84-e5d0e0bc0a39" }, "source": [ - "" + "" ] }, { @@ -1068,7 +1048,7 @@ "id": "bccaf048-ec95-498c-9155-d5b3ccba6c96" }, "source": [ - "## 7.4 Creating data loaders for an instruction dataset" + "## 4 Creating data loaders for an instruction dataset" ] }, { @@ -1088,7 +1068,7 @@ "id": "9fffe390-b226-4d5c-983f-9f4da773cb82" }, "source": [ - "" + "" ] }, { @@ -1104,7 +1084,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 36, "id": "etpqqWh8phKc", "metadata": { "colab": { @@ -1118,7 +1098,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Device: cuda\n" + "Device: cpu\n" ] } ], @@ -1130,11 +1110,11 @@ "# which is much faster than on an Apple CPU (as measured on an M3 MacBook Air).\n", "# However, the resulting loss values may be slightly different.\n", "\n", - "#if torch.cuda.is_available():\n", + "# if torch.cuda.is_available():\n", "# device = torch.device(\"cuda\")\n", - "#elif torch.backends.mps.is_available():\n", + "# elif torch.backends.mps.is_available():\n", "# device = torch.device(\"mps\")\n", - "#else:\n", + "# else:\n", "# device = torch.device(\"cpu\")\n", "\n", "print(\"Device:\", device)" @@ -1142,7 +1122,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 37, "id": "4e47fb30-c2c6-4e6d-a64c-76cc65be4a2c", "metadata": { "id": "4e47fb30-c2c6-4e6d-a64c-76cc65be4a2c" @@ -1165,12 +1145,12 @@ "id": "8ff42c29-8b81-45e5-ae8d-b97cd1cf447a" }, "source": [ - "- Next, we instantiate the data loaders similar to previous chapters, except that we now provide our own collate function for the batching process" + "- Next, we instantiate the data loaders similar to previous 
labs, except that we now provide our own collate function for the batching process" ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 38, "id": "BtWkgir6Hlpe", "metadata": { "id": "BtWkgir6Hlpe" @@ -1198,7 +1178,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 39, "id": "1d097dc8-ad34-4f05-b435-e4147965f532", "metadata": { "id": "1d097dc8-ad34-4f05-b435-e4147965f532" @@ -1238,7 +1218,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 40, "id": "GGs1AI3vHpnX", "metadata": { "colab": { @@ -1391,7 +1371,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 41, "id": "21b8fd02-014f-4481-9b71-5bfee8f9dfcd", "metadata": { "colab": { @@ -1411,8 +1391,7 @@ " 985, 576, 13, 198, 198, 21017, 23412, 25, 198, 464,\n", " 5156, 318, 845, 13779, 13, 198, 198, 21017, 18261, 25,\n", " 198, 464, 5156, 318, 355, 13779, 355, 257, 4936, 13,\n", - " 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256],\n", - " device='cuda:0')\n" + " 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256])\n" ] } ], @@ -1432,7 +1411,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 42, "id": "51649ab4-1a7e-4a9e-92c5-950a24fde211", "metadata": { "colab": { @@ -1452,8 +1431,7 @@ " 576, 13, 198, 198, 21017, 23412, 25, 198, 464, 5156,\n", " 318, 845, 13779, 13, 198, 198, 21017, 18261, 25, 198,\n", " 464, 5156, 318, 355, 13779, 355, 257, 4936, 13, 50256,\n", - " -100, -100, -100, -100, -100, -100, -100, -100, -100],\n", - " device='cuda:0')\n" + " -100, -100, -100, -100, -100, -100, -100, -100, -100])\n" ] } ], @@ -1468,7 +1446,7 @@ "id": "d6aad445-8f19-4238-b9bf-db80767fb91a" }, "source": [ - "## 7.5 Loading a pretrained LLM" + "## 5 Loading a pretrained LLM" ] }, { @@ -1478,7 +1456,7 @@ "id": "5a5c07d1-4fc9-4846-94cf-b11a085a667b" }, "source": [ - "- In this section, we load a pretrained GPT model using the same code that we used in section 5.5 of chapter 5 and 
section 6.4 in chapter 6" + "- In this section, we load a pretrained GPT model" ] }, { @@ -1488,7 +1466,7 @@ "id": "8d1b438f-88af-413f-96a9-f059c6c55fc4" }, "source": [ - "" + "" ] }, { @@ -1498,12 +1476,12 @@ "id": "8c68eda7-e02e-4caa-846b-ca6dbd396ca2" }, "source": [ - "- However, instead of loading the smallest 124 million parameter model, we load the medium version with 355 million parameters since the 124 million model is too small for achieving qualitatively reasonable results via instruction finetuning" + "- We load the medium version with 355 million parameters for achieving qualitatively reasonable results via instruction finetuning" ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": null, "id": "0d249d67-5eba-414e-9bd2-972ebf01329d", "metadata": { "colab": { @@ -1512,31 +1490,10 @@ "id": "0d249d67-5eba-414e-9bd2-972ebf01329d", "outputId": "3f08f5e1-ca7c-406d-e2ae-1b5fcafad3f2" }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024-07-25 02:22:49.969483: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. 
To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", - "2024-07-25 02:22:50.023103: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n", - "2024-07-25 02:22:50.023136: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n", - "2024-07-25 02:22:50.024611: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", - "2024-07-25 02:22:50.033304: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", - "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", - "2024-07-25 02:22:51.282247: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n", - "checkpoint: 100%|██████████| 77.0/77.0 [00:00<00:00, 169kiB/s]\n", - "encoder.json: 100%|██████████| 1.04M/1.04M [00:00<00:00, 2.43MiB/s]\n", - "hparams.json: 100%|██████████| 91.0/91.0 [00:00<00:00, 168kiB/s]\n", - "model.ckpt.data-00000-of-00001: 100%|██████████| 1.42G/1.42G [00:56<00:00, 25.0MiB/s]\n", - "model.ckpt.index: 100%|██████████| 10.4k/10.4k [00:00<00:00, 16.5MiB/s]\n", - "model.ckpt.meta: 100%|██████████| 927k/927k [00:00<00:00, 1.96MiB/s]\n", - "vocab.bpe: 100%|██████████| 456k/456k [00:00<00:00, 1.53MiB/s]\n" - ] - } - ], + "outputs": [], "source": [ - "from gpt_download import download_and_load_gpt2\n", - "from previous_chapters import GPTModel, load_weights_into_gpt\n", + "from helper_functions import download_and_load_gpt2, load_weights_into_gpt\n", + 
"from previous_labs import GPTModel\n", "\n", "\n", "BASE_CONFIG = {\n", @@ -1580,7 +1537,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 44, "id": "7bd32b7c-5b44-4d25-a09f-46836802ca74", "metadata": { "colab": { @@ -1610,14 +1567,14 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 45, "id": "2e3e68e0-2627-4c65-b4e7-1e0667e4f6fa", "metadata": { "id": "2e3e68e0-2627-4c65-b4e7-1e0667e4f6fa" }, "outputs": [], "source": [ - "from previous_chapters import (\n", + "from previous_labs import (\n", " generate,\n", " text_to_token_ids,\n", " token_ids_to_text\n", @@ -1646,7 +1603,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 46, "id": "ba4a55bf-a245-48d8-beda-2838a58fb5ba", "metadata": { "colab": { @@ -1694,7 +1651,7 @@ "id": "70d27b9d-a942-4cf5-b797-848c5f01e723" }, "source": [ - "## 7.6 Finetuning the LLM on instruction data" + "## 6 Finetuning the LLM on instruction data" ] }, { @@ -1706,21 +1663,21 @@ "source": [ "- In this section, we finetune the model\n", "\n", - "\n", + "\n", "\n", - "- Note that we can reuse all the loss calculation and training functions that we used in previous chapters" + "- Note that we can reuse all the loss calculation and training functions that we used in previous labs" ] }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 47, "id": "65444865-df87-4d98-9faf-875e1c4be860", "metadata": { "id": "65444865-df87-4d98-9faf-875e1c4be860" }, "outputs": [], "source": [ - "from previous_chapters import (\n", + "from previous_labs import (\n", " calc_loss_loader,\n", " train_model_simple\n", ")" @@ -1733,12 +1690,12 @@ "id": "00083059-aa41-4d37-8a17-1c72d1b1ca00" }, "source": [ - "- Let's calculate the initial training and validation set loss before we start training (as in previous chapters, the goal is to minimize the loss)" + "- Let's calculate the initial training and validation set loss before we start training (as in previous labs, the goal 
is to minimize the loss)" ] }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 48, "id": "d99fc6f8-63b2-43da-adbb-a7b6b92c8dd5", "metadata": { "colab": { @@ -1752,8 +1709,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "Training loss: 3.82590970993042\n", - "Validation loss: 3.761933755874634\n" + "Training loss: 3.8258948802947996\n", + "Validation loss: 3.7619192123413088\n" ] } ], @@ -1777,7 +1734,7 @@ "id": "12a6da8f-15b3-42b0-a136-619b7a35c3e9" }, "source": [ - "- Note that the training is a bit more expensive than in previous chapters since we are using a larger model (355 million instead of 124 million parameters)\n", + "- Note that the training is a bit more expensive than in previous labs since we are using a larger model (355 million parameters)\n", "- The runtimes for various devices are shown for reference below (running this notebook on a compatible GPU device requires no changes to the code)" ] }, @@ -1801,14 +1758,12 @@ "| gpt2-small (124M) | GPU (L4) | 0.69 minutes |\n", "| gpt2-small (124M) | GPU (A100) | 0.39 minutes |\n", "\n", - "\n", - "\n", - "- I ran this notebook using the `\"gpt2-medium (355M)\"` model" + "\n" ] }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 49, "id": "78bcf83a-1fff-4540-97c1-765c4016d5e3", "metadata": { "colab": { @@ -1823,55 +1778,55 @@ "output_type": "stream", "text": [ "Ep 1 (Step 000000): Train loss 2.637, Val loss 2.626\n", - "Ep 1 (Step 000005): Train loss 1.174, Val loss 1.102\n", - "Ep 1 (Step 000010): Train loss 0.872, Val loss 0.944\n", + "Ep 1 (Step 000005): Train loss 1.174, Val loss 1.103\n", + "Ep 1 (Step 000010): Train loss 0.872, Val loss 0.945\n", "Ep 1 (Step 000015): Train loss 0.857, Val loss 0.906\n", "Ep 1 (Step 000020): Train loss 0.776, Val loss 0.881\n", "Ep 1 (Step 000025): Train loss 0.754, Val loss 0.859\n", "Ep 1 (Step 000030): Train loss 0.799, Val loss 0.836\n", "Ep 1 (Step 000035): Train loss 0.714, Val loss 0.808\n", "Ep 1 (Step 
000040): Train loss 0.672, Val loss 0.806\n", - "Ep 1 (Step 000045): Train loss 0.633, Val loss 0.789\n", - "Ep 1 (Step 000050): Train loss 0.663, Val loss 0.783\n", - "Ep 1 (Step 000055): Train loss 0.760, Val loss 0.763\n", + "Ep 1 (Step 000045): Train loss 0.633, Val loss 0.790\n", + "Ep 1 (Step 000050): Train loss 0.662, Val loss 0.783\n", + "Ep 1 (Step 000055): Train loss 0.760, Val loss 0.764\n", "Ep 1 (Step 000060): Train loss 0.719, Val loss 0.743\n", - "Ep 1 (Step 000065): Train loss 0.653, Val loss 0.735\n", + "Ep 1 (Step 000065): Train loss 0.652, Val loss 0.735\n", "Ep 1 (Step 000070): Train loss 0.532, Val loss 0.729\n", - "Ep 1 (Step 000075): Train loss 0.569, Val loss 0.728\n", + "Ep 1 (Step 000075): Train loss 0.569, Val loss 0.729\n", "Ep 1 (Step 000080): Train loss 0.605, Val loss 0.725\n", "Ep 1 (Step 000085): Train loss 0.509, Val loss 0.709\n", "Ep 1 (Step 000090): Train loss 0.562, Val loss 0.691\n", "Ep 1 (Step 000095): Train loss 0.500, Val loss 0.681\n", - "Ep 1 (Step 000100): Train loss 0.503, Val loss 0.677\n", + "Ep 1 (Step 000100): Train loss 0.502, Val loss 0.677\n", "Ep 1 (Step 000105): Train loss 0.564, Val loss 0.670\n", - "Ep 1 (Step 000110): Train loss 0.555, Val loss 0.666\n", + "Ep 1 (Step 000110): Train loss 0.555, Val loss 0.667\n", "Ep 1 (Step 000115): Train loss 0.508, Val loss 0.664\n", "Below is an instruction that describes a task. Write a response that appropriately completes the request. ### Instruction: Convert the active sentence to passive: 'The chef cooks the meal every day.' ### Response: The meal is prepared every day by the chef.<|endoftext|>The following is an instruction that describes a task. Write a response that appropriately completes the request. 
### Instruction: Convert the active sentence to passive:\n", "Ep 2 (Step 000120): Train loss 0.435, Val loss 0.672\n", - "Ep 2 (Step 000125): Train loss 0.451, Val loss 0.687\n", - "Ep 2 (Step 000130): Train loss 0.447, Val loss 0.683\n", - "Ep 2 (Step 000135): Train loss 0.405, Val loss 0.682\n", - "Ep 2 (Step 000140): Train loss 0.409, Val loss 0.681\n", - "Ep 2 (Step 000145): Train loss 0.369, Val loss 0.680\n", - "Ep 2 (Step 000150): Train loss 0.382, Val loss 0.675\n", - "Ep 2 (Step 000155): Train loss 0.413, Val loss 0.675\n", - "Ep 2 (Step 000160): Train loss 0.415, Val loss 0.683\n", + "Ep 2 (Step 000125): Train loss 0.451, Val loss 0.686\n", + "Ep 2 (Step 000130): Train loss 0.447, Val loss 0.682\n", + "Ep 2 (Step 000135): Train loss 0.404, Val loss 0.682\n", + "Ep 2 (Step 000140): Train loss 0.410, Val loss 0.681\n", + "Ep 2 (Step 000145): Train loss 0.369, Val loss 0.681\n", + "Ep 2 (Step 000150): Train loss 0.381, Val loss 0.676\n", + "Ep 2 (Step 000155): Train loss 0.412, Val loss 0.676\n", + "Ep 2 (Step 000160): Train loss 0.415, Val loss 0.684\n", "Ep 2 (Step 000165): Train loss 0.379, Val loss 0.686\n", - "Ep 2 (Step 000170): Train loss 0.323, Val loss 0.681\n", - "Ep 2 (Step 000175): Train loss 0.337, Val loss 0.669\n", - "Ep 2 (Step 000180): Train loss 0.392, Val loss 0.656\n", - "Ep 2 (Step 000185): Train loss 0.415, Val loss 0.657\n", - "Ep 2 (Step 000190): Train loss 0.340, Val loss 0.648\n", - "Ep 2 (Step 000195): Train loss 0.330, Val loss 0.634\n", - "Ep 2 (Step 000200): Train loss 0.310, Val loss 0.634\n", - "Ep 2 (Step 000205): Train loss 0.352, Val loss 0.630\n", - "Ep 2 (Step 000210): Train loss 0.367, Val loss 0.630\n", - "Ep 2 (Step 000215): Train loss 0.394, Val loss 0.635\n", - "Ep 2 (Step 000220): Train loss 0.299, Val loss 0.648\n", - "Ep 2 (Step 000225): Train loss 0.346, Val loss 0.661\n", - "Ep 2 (Step 000230): Train loss 0.292, Val loss 0.659\n", + "Ep 2 (Step 000170): Train loss 0.323, Val loss 0.682\n", + "Ep 2 (Step 000175): 
Train loss 0.337, Val loss 0.670\n", + "Ep 2 (Step 000180): Train loss 0.393, Val loss 0.658\n", + "Ep 2 (Step 000185): Train loss 0.416, Val loss 0.659\n", + "Ep 2 (Step 000190): Train loss 0.340, Val loss 0.650\n", + "Ep 2 (Step 000195): Train loss 0.330, Val loss 0.637\n", + "Ep 2 (Step 000200): Train loss 0.310, Val loss 0.637\n", + "Ep 2 (Step 000205): Train loss 0.352, Val loss 0.632\n", + "Ep 2 (Step 000210): Train loss 0.367, Val loss 0.631\n", + "Ep 2 (Step 000215): Train loss 0.396, Val loss 0.635\n", + "Ep 2 (Step 000220): Train loss 0.301, Val loss 0.649\n", + "Ep 2 (Step 000225): Train loss 0.349, Val loss 0.662\n", + "Ep 2 (Step 000230): Train loss 0.294, Val loss 0.658\n", "Below is an instruction that describes a task. Write a response that appropriately completes the request. ### Instruction: Convert the active sentence to passive: 'The chef cooks the meal every day.' ### Response: The meal is cooked every day by the chef.<|endoftext|>The following is an instruction that describes a task. Write a response that appropriately completes the request. ### Instruction: What is the capital of the United Kingdom\n", - "Training completed in 1.84 minutes.\n" + "Training completed in 80.16 minutes.\n" ] } ], @@ -1911,7 +1866,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 50, "id": "4acd368b-1403-4807-a218-9102e35bfdbb", "metadata": { "colab": { @@ -1924,7 +1879,7 @@ "outputs": [ { "data": { - "image/png": "\n", + "image/png": "", "text/plain": [ "
" ] @@ -1934,7 +1889,7 @@ } ], "source": [ - "from previous_chapters import plot_losses\n", + "from previous_labs import plot_losses\n", "\n", "epochs_tensor = torch.linspace(0, num_epochs, len(train_losses))\n", "plot_losses(epochs_tensor, tokens_seen, train_losses, val_losses)" @@ -1958,7 +1913,7 @@ "id": "87b79a47-13f9-4d1f-87b1-3339bafaf2a3" }, "source": [ - "## 7.7 Extracting and saving responses" + "## 7 Extracting and saving responses" ] }, { @@ -1968,7 +1923,7 @@ "id": "5a25cc88-1758-4dd0-b8bf-c044cbf2dd49" }, "source": [ - "" + "" ] }, { @@ -1985,7 +1940,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 51, "id": "VQ2NZMbfucAc", "metadata": { "colab": { @@ -2011,7 +1966,7 @@ ">> The car is as fast as lightning.\n", "\n", "Model response:\n", - ">> The car is as fast as a bullet.\n", + ">> The car is as fast as a cheetah.\n", "-------------------------------------\n", "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n", "\n", @@ -2076,7 +2031,7 @@ "- As we can see based on the test set instructions, given responses, and the model's responses, the model performs relatively well\n", "- The answers to the first and last instructions are clearly correct\n", "- The second answer is close; the model answers with \"cumulus cloud\" instead of \"cumulonimbus\" (however, note that cumulus clouds can develop into cumulonimbus clouds, which are capable of producing thunderstorms)\n", - "- Most importantly, we can see that model evaluation is not as straightforward as in the previous chapter, where we just had to calculate the percentage of correct spam/non-spam class labels to obtain the classification accuracy\n", + "- Most importantly, we can see that model evaluation is not as straightforward as in the previous labs, where we just had to calculate the percentage of correct spam/non-spam class labels to obtain the classification accuracy\n", "- In practice, instruction-finetuned LLMs 
such as chatbots are evaluated via multiple approaches\n", " - short-answer and multiple choice benchmarks such as MMLU (\"Measuring Massive Multitask Language Understanding\", [https://arxiv.org/abs/2009.03300](https://arxiv.org/abs/2009.03300)), which test the knowledge of a model\n", " - human preference comparison to other LLMs, such as LMSYS chatbot arena ([https://arena.lmsys.org](https://arena.lmsys.org))\n", @@ -2088,7 +2043,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 52, "id": "-PNGKzY4snKP", "metadata": { "colab": { @@ -2102,7 +2057,8 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 110/110 [01:11<00:00, 1.54it/s]\n" + "python(18511) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.\n", + "100%|██████████| 110/110 [11:30<00:00, 6.28s/it]\n" ] } ], @@ -2142,7 +2098,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 53, "id": "u-AvCCMTnPSE", "metadata": { "colab": { @@ -2156,7 +2112,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "{'instruction': 'Rewrite the sentence using a simile.', 'input': 'The car is very fast.', 'output': 'The car is as fast as lightning.', 'model_response': 'The car is as fast as a bullet.'}\n" + "{'instruction': 'Rewrite the sentence using a simile.', 'input': 'The car is very fast.', 'output': 'The car is as fast as lightning.', 'model_response': 'The car is as fast as a cheetah.'}\n" ] } ], @@ -2176,7 +2132,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 54, "id": "8cBU0iHmVfOI", "metadata": { "colab": { @@ -2214,7 +2170,7 @@ "id": "obgoGI89dgPm" }, "source": [ - "## 7.8 Evaluating the finetuned LLM" + "## 8 Evaluating the finetuned LLM" ] }, { @@ -2224,7 +2180,7 @@ "id": "805b9d30-7336-499f-abb5-4a21be3129f5" }, "source": [ - "" + "" ] }, { @@ -2235,8 +2191,8 @@ }, "source": [ "- In this section, we automate the response evaluation of the finetuned LLM using another, 
larger LLM\n", - "- In particular, we use an instruction-finetuned 8-billion-parameter Llama 3 model by Meta AI that can be run locally via ollama ([https://ollama.com](https://ollama.com))\n", - "- (Alternatively, if you prefer using a more capable LLM like GPT-4 via the OpenAI API, please see the [llm-instruction-eval-openai.ipynb](../03_model-evaluation/llm-instruction-eval-openai.ipynb) notebook)" + "- In particular, we use an instruction-finetuned 3-billion-parameter Llama 3.2 model by Meta AI that can be run locally via ollama ([https://ollama.com](https://ollama.com))\n", + "- (Alternatively, if you prefer using a more capable LLM like GPT-4 via the OpenAI API, please see the [llm-instruction-eval-openai.ipynb](https://github.com/rasbt/LLMs-from-scratch/blob/bb31de89993441224e9005926dedad95395bb058/ch07/03_model-evaluation/llm-instruction-eval-openai.ipynb) notebook)" ] }, { @@ -2264,23 +2220,28 @@ "\n", "- In general, before we can use ollama from the command line, we have to either start the ollama application or run `ollama serve` in a separate terminal\n", "\n", - "\n", + "\n", + "\n", "\n", + "- With the ollama application or `ollama serve` running in a different terminal, on the command line, execute the following command to try out \n", + " - the 8-billion-parameter Llama 3 model (the model, which takes up 4.7 GB of storage space, will be automatically downloaded the first time you execute this command)\n", + " - or the 3-billion-parameter Llama 3.2 model (the model, which takes up 2 GB of storage space)\n", + " - other models available, such as mistral, gemma, qwen, phi3, etc. 
(https://ollama.com/search)\n", "\n", - "- With the ollama application or `ollama serve` running in a different terminal, on the command line, execute the following command to try out the 8-billion-parameter Llama 3 model (the model, which takes up 4.7 GB of storage space, will be automatically downloaded the first time you execute this command)\n", + "- We will be using the 3B Llama 3.2 model in this lab\n", "\n", "```bash\n", "# 3B model\n", - "ollama run llama3\n", + "ollama run llama3.2\n", "```\n", "\n", "\n", "The output looks as follows\n", "\n", "```\n", - "$ ollama run llama3\n", + "$ ollama run llama3.2\n", "pulling manifest\n", - "pulling 6a0746a1ec1a... 100% ▕████████████████▏ 4.7 GB\n", + "pulling 6a0746a1ec1a... 100% ▕████████████████▏ 2 GB\n", "pulling 4fa551d4f938... 100% ▕████████████████▏  12 KB\n", "pulling 8ab4849b038c... 100% ▕████████████████▏  254 B\n", "pulling 577073ffcc6c... 100% ▕████████████████▏  110 B\n", @@ -2291,11 +2252,9 @@ "success\n", "```\n", "\n", - "- Note that `llama3` refers to the instruction finetuned 8-billion-parameter Llama 3 model\n", - "\n", - "- Using ollama with the `\"llama3\"` model (a 8B parameter model) requires 16 GB of RAM; if this is not supported by your machine, you can try the smaller model, such as the 3.8B parameter phi-3 model by setting `model = \"phi-3\"`, which only requires 8 GB of RAM\n", + "- Note that `llama3.2` refers to the instruction finetuned 3-billion-parameter Llama 3.2 model\n", "\n", - "- Alternatively, you can also use the larger 70-billion-parameter Llama 3 model, if your machine supports it, by replacing `llama3` with `llama3:70b`\n", + "- Alternatively, you can also use the larger 70-billion-parameter Llama 3 model, if your machine supports it, by replacing `llama3.2` with `llama3:70b`\n", "\n", "- After the download has been completed, you will see a command line prompt that allows you to chat with the model\n", "\n", @@ -2333,7 +2292,7 @@ }, { "cell_type": "code", - 
"execution_count": 1, + "execution_count": 58, "id": "026e8570-071e-48a2-aa38-64d7be35f288", "metadata": { "colab": { @@ -2372,7 +2331,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "723c9b00-e3cd-4092-83c3-6e48b5cf65b0", "metadata": { "id": "723c9b00-e3cd-4092-83c3-6e48b5cf65b0" @@ -2380,7 +2339,7 @@ "outputs": [], "source": [ "# This cell is optional; it allows you to restart the notebook\n", - "# and only run section 7.7 without rerunning any of the previous code\n", + "# and only run section 7 without rerunning any of the previous code\n", "import json\n", "from tqdm import tqdm\n", "\n", @@ -2416,7 +2375,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 60, "id": "e3ae0e10-2b28-42ce-8ea2-d9366a58088f", "metadata": { "id": "e3ae0e10-2b28-42ce-8ea2-d9366a58088f" @@ -2426,21 +2385,17 @@ "name": "stdout", "output_type": "stream", "text": [ - "Llamas are herbivores, which means they primarily feed on plant-based foods. Their diet typically consists of:\n", - "\n", - "1. Grasses: Llamas love to graze on various types of grasses, including tall grasses, short grasses, and even weeds.\n", - "2. Hay: High-quality hay, such as alfalfa or timothy hay, is a staple in a llama's diet. They enjoy the sweet taste and texture of fresh hay.\n", - "3. Grains: Llamas may receive grains like oats, barley, or corn as part of their daily ration. However, it's essential to provide these grains in moderation, as they can be high in calories.\n", - "4. Fruits and vegetables: Llamas enjoy a variety of fruits and veggies, such as apples, carrots, sweet potatoes, and leafy greens like kale or spinach.\n", - "5. Minerals: Llamas require access to mineral supplements, which help maintain their overall health and well-being.\n", + "Llamas are herbivores, which means they primarily eat plants and plant-based foods. Their diet typically consists of:\n", "\n", - "In the wild, llamas might also eat:\n", + "1. 
Grasses: Llamas love to graze on various types of grasses, including tall grasses, short grasses, and grassy weeds.\n", + "2. Hay: High-quality hay, such as timothy hay or alfalfa hay, is a staple in a llama's diet. It provides essential nutrients like fiber, protein, and vitamins.\n", + "3. Grains: Llamas may also be fed grains like oats, barley, or corn, but these should not make up more than 10% of their diet.\n", + "4. Fruits and vegetables: Fresh fruits and vegetables, such as apples, carrots, and sweet potatoes, can be given to llamas as treats or added to their hay.\n", + "5. Browse: Llamas may also eat browse, which includes leaves, twigs, and other vegetation from trees and shrubs.\n", "\n", - "1. Leaves: They'll munch on leaves from trees and shrubs, including plants like willow, alder, and birch.\n", - "2. Bark: In some cases, llamas may eat the bark of certain trees, like aspen or cottonwood.\n", - "3. Mosses and lichens: These non-vascular plants can be a tasty snack for llamas.\n", + "It's essential to note that llamas have a unique digestive system, with a four-chambered stomach, which allows them to break down and extract nutrients from plant material more efficiently than many other animals. However, this also means they can be prone to certain health issues if their diet is not balanced or if they eat too much of the wrong foods.\n", "\n", - "In captivity, llama owners typically provide a balanced diet that includes a mix of hay, grains, and fruits/vegetables. 
It's essential to consult with a veterinarian or experienced llama breeder to determine the best feeding plan for your llama.\n" + "A good rule of thumb for llama owners is to provide a high-quality hay-based diet with limited amounts of grains and treats, and to ensure access to fresh water at all times.\n" ] } ], @@ -2449,7 +2404,7 @@ "\n", "def query_model(\n", " prompt,\n", - " model=\"llama3\",\n", + " model=\"llama3.2\",\n", " url=\"http://localhost:11434/api/chat\"\n", "):\n", " # Create the data payload as a dictionary\n", @@ -2491,7 +2446,7 @@ " return response_data\n", "\n", "\n", - "model = \"llama3\"\n", + "model = \"llama3.2\"\n", "result = query_model(\"What do Llamas eat?\", model)\n", "print(result)" ] @@ -2508,7 +2463,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 61, "id": "86b839d4-064d-4178-b2d7-01691b452e5e", "metadata": { "id": "86b839d4-064d-4178-b2d7-01691b452e5e" @@ -2523,20 +2478,16 @@ ">> The car is as fast as lightning.\n", "\n", "Model response:\n", - ">> The car is as fast as a bullet.\n", + ">> The car is as fast as a cheetah.\n", "\n", "Score:\n", - ">> I'd rate the model response \"The car is as fast as a bullet.\" an 85 out of 100.\n", - "\n", - "Here's why:\n", + ">> To rewrite the sentence using a simile, we need to compare the speed of the car to something else.\n", "\n", - "* The response uses a simile correctly, comparing the speed of the car to something else (in this case, a bullet).\n", - "* The comparison is relevant and makes sense, as bullets are known for their high velocity.\n", - "* The phrase \"as fast as\" is used correctly to introduce the simile.\n", + "Correct output: The car is as fast as lightning.\n", "\n", - "The only reason I wouldn't give it a perfect score is that some people might find the comparison slightly less vivid or evocative than others. For example, comparing something to lightning (as in the original response) can be more dramatic and attention-grabbing. 
However, \"as fast as a bullet\" is still a strong and effective simile that effectively conveys the idea of the car's speed.\n", + "Score: 100\n", "\n", - "Overall, I think the model did a great job!\n", + "Explanation: A simile is a figure of speech that compares two unlike things by using \"like\" or \"as.\" In this case, comparing the speed of the car to lightning is a common and effective way to convey its incredible speed.\n", "\n", "-------------------------\n", "\n", @@ -2547,15 +2498,9 @@ ">> The type of cloud associated with thunderstorms is a cumulus cloud.\n", "\n", "Score:\n", - ">> I'd score this model response as 40 out of 100.\n", + ">> I would rate the model response a 20.\n", "\n", - "Here's why:\n", - "\n", - "* The model correctly identifies that thunderstorms are related to clouds (correctly identifying the type of phenomenon).\n", - "* However, it incorrectly specifies the type of cloud associated with thunderstorms. Cumulus clouds are not typically associated with thunderstorms; cumulonimbus clouds are.\n", - "* The response lacks precision and accuracy in its description.\n", - "\n", - "Overall, while the model attempts to address the instruction, it provides an incorrect answer, which is a significant error.\n", + "The reason for this low score is that the model response contains an error in its classification of clouds. Cumulonimbus clouds are indeed associated with thunderstorms, but cumulus clouds are typically associated with fair weather and are often seen on warm, sunny days. The correct term should be \"cumulonimbus\" instead of \"cumulus\".\n", "\n", "-------------------------\n", "\n", @@ -2566,13 +2511,21 @@ ">> The author of 'Pride and Prejudice' is Jane Austen.\n", "\n", "Score:\n", - ">> I'd rate my own response as 95 out of 100. 
Here's why:\n", + ">> ### Input\n", + "Name the author of 'Pride and Prejudice'.\n", + "\n", + "### Output\n", + "Jane Austen.\n", "\n", - "* The response accurately answers the question by naming the author of 'Pride and Prejudice' as Jane Austen.\n", - "* The response is concise and clear, making it easy to understand.\n", - "* There are no grammatical errors or ambiguities that could lead to confusion.\n", + "### Score: 100/100\n", "\n", - "The only reason I wouldn't give myself a perfect score is that the response is slightly redundant - it's not necessary to rephrase the question in the answer. A more concise response would be simply \"Jane Austen.\"\n", + "The response is correct because it:\n", + "\n", + "1. Accurately identifies Jane Austen as the author of 'Pride and Prejudice'.\n", + "2. Is in a complete sentence, making it easy to understand.\n", + "3. Does not contain any grammatical errors or unnecessary words.\n", + "\n", + "Overall, the response is clear, concise, and accurate, which is why it scores 100/100.\n", "\n", "-------------------------\n" ] @@ -2602,14 +2555,13 @@ "id": "b114fd65-9cfb-45f6-ab74-8331da136bf3" }, "source": [ - "- As we can see, the Llama 3 model provides a reasonable evaluation and also gives partial points if a model is not entirely correct, as we can see based on the \"cumulus cloud\" answer\n", - "- Note that the previous prompt returns very verbose evaluations; we can tweak the prompt to generate integer responses in the range between 0 and 100 (where 100 is best) to calculate an average score for our model\n", - "- The evaluation of the 110 entries in the test set takes about 1 minute on an M3 MacBook Air laptop" + "- As we can see, the Llama 3.2 model provides a reasonable evaluation and also gives partial points if a model is not entirely correct, as we can see based on the \"cumulus cloud\" answer\n", + "- Note that the previous prompt returns very verbose evaluations; we can tweak the prompt to generate integer 
responses in the range between 0 and 100 (where 100 is best) to calculate an average score for our model" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "9d7bca69-97c4-47a5-9aa0-32f116fa37eb", "metadata": { "id": "9d7bca69-97c4-47a5-9aa0-32f116fa37eb" @@ -2640,7 +2592,7 @@ } ], "source": [ - "def generate_model_scores(json_data, json_key, model=\"llama3\"):\n", + "def generate_model_scores(json_data, json_key, model=\"llama3.2\"):\n", " scores = []\n", " for entry in tqdm(json_data, desc=\"Scoring entries\"):\n", " prompt = (\n", @@ -2683,7 +2635,7 @@ "id": "6408768b-2784-44f1-b48e-aed0c1eb9b94" }, "source": [ - "- For reference, the original\n", + "- For reference, for the Llama 3 8B models:\n", " - Llama 3 8B base model achieves a score of 58.51\n", " - Llama 3 8B instruct model achieves a score of 82.65" ] @@ -2695,7 +2647,7 @@ "id": "412d7325-284a-446c-92a1-5aa8acc52dee" }, "source": [ - "## 7.9 Conclusions" + "## 9 Conclusions" ] }, { @@ -2705,60 +2657,18 @@ "id": "tIbNMluCDjVM" }, "source": [ - "### 7.9.1 What's next\n", "\n", - "- This marks the final chapter of this book\n", "- We covered the major steps of the LLM development cycle: implementing an LLM architecture, pretraining an LLM, and finetuning it\n", "\n", - "\n", - "\n", - "- An optional step that is sometimes followed after instruction finetuning, as described in this chapter, is preference finetuning\n", - "- Preference finetuning process can be particularly useful for customizing a model to better align with specific user preferences; see the [../04_preference-tuning-with-dpo](../04_preference-tuning-with-dpo) folder if you are interested in this\n", + "\n", "\n", - "- This GitHub repository also contains a large selection of additional bonus material you may enjoy; for more information, please see the [Bonus Material](https://github.com/rasbt/LLMs-from-scratch?tab=readme-ov-file#bonus-material) section on this repository's README page\n", + "- An optional 
step that is sometimes followed after instruction finetuning, as described in this lab, is preference finetuning\n", + "- Preference finetuning process can be particularly useful for customizing a model to better align with specific user preferences; see the [../04_preference-tuning-with-dpo](https://github.com/rasbt/LLMs-from-scratch/tree/bb31de89993441224e9005926dedad95395bb058/ch07/04_preference-tuning-with-dpo) folder if you are interested in this\n", "\n", - "### 7.9.2 Staying up to date in a fast-moving field\n", - "\n", - "- No code in this section\n", - "\n", - "### 7.9.3 Final words\n", - "\n", - "- I hope you enjoyed this journey of implementing an LLM from the ground up and coding the pretraining and finetuning functions\n", - "- In my opinion, implementing an LLM from scratch is the best way to understand how LLMs work; I hope you gained a better understanding through this approach\n", - "- While this book serves educational purposes, you may be interested in using different and more powerful LLMs for real-world applications\n", - " - For this, you may consider popular tools such as axolotl ([https://github.com/OpenAccess-AI-Collective/axolotl](https://github.com/OpenAccess-AI-Collective/axolotl)) or LitGPT ([https://github.com/Lightning-AI/litgpt](https://github.com/Lightning-AI/litgpt)), which I help developing" - ] - }, - { - "cell_type": "markdown", - "id": "f9853e7f-a81a-4806-9728-be1690807185", - "metadata": { - "id": "f9853e7f-a81a-4806-9728-be1690807185" - }, - "source": [ - "## Summary and takeaways\n", - "\n", - "- See the [./gpt_instruction_finetuning.py](./gpt_instruction_finetuning.py) script, a self-contained script for classification finetuning\n", - "- [./ollama_evaluate.py](./ollama_evaluate.py) is a standalone script based on section 7.8 that evaluates a JSON file containing \"output\" and \"response\" keys via Ollama and Llama 3\n", - "- The [./load-finetuned-model.ipynb](./load-finetuned-model.ipynb) notebook illustrates how to load 
the finetuned model in a new session\n", - "- You can find the exercise solutions in [./exercise-solutions.ipynb](./exercise-solutions.ipynb)" - ] - }, - { - "cell_type": "markdown", - "id": "b9cc51ec-e06c-4470-b626-48401a037851", - "metadata": {}, - "source": [ - "## What's next?\n", + "- You can also find Bonus Material for the book we've been following in these labs here: [Bonus Material](https://github.com/rasbt/LLMs-from-scratch?tab=readme-ov-file#bonus-material)\n", "\n", - "- Congrats on completing the book; in case you are looking for additional resources, I added several bonus sections to this GitHub repository that you might find interesting\n", - "- The complete list of bonus materials can be viewed in the main README's [Bonus Material](https://github.com/rasbt/LLMs-from-scratch?tab=readme-ov-file#bonus-material) section\n", - "- To highlight a few of my favorites:\n", - " 1. [Direct Preference Optimization (DPO) for LLM Alignment (From Scratch)](../04_preference-tuning-with-dpo/dpo-from-scratch.ipynb) implements a popular preference tuning mechanism to align the model from this chapter more closely with human preferences\n", - " 2. [Llama 3.2 From Scratch (A Standalone Notebook)](../../ch05/07_gpt_to_llama/standalone-llama32.ipynb), a from-scratch implementation of Meta AI's popular Llama 3.2, including loading the official pretrained weights; if you are up to some additional experiments, you can replace the `GPTModel` model in each of the chapters with the `Llama3Model` class (it should work as a 1:1 replacement)\n", - " 3. [Converting GPT to Llama](../../ch05/07_gpt_to_llama) contains code with step-by-step guides that explain the differences between GPT-2 and the various Llama models\n", - " 4. 
# ---------------------------------------------------------------------------
# NOTE: the comment lines in this section are the verbatim non-code residue
# that shared the collapsed source line with the code below: the tail of the
# notebook diff, the previous_labs.py diff header, and the import hunk.
# They are kept so that no content is lost. The functions below rely on the
# file-level imports named in that hunk (torch, plt, MaxNLocator).
#
# [Understanding the Difference Between Embedding Layers and Linear Layers](../../ch02/03_bonus_embedding-vs-matmul/embeddings-and-linear-layers.ipynb) is a conceptual explanation illustrating that the `Embedding` layer in PyTorch, which we use at the input stage of an LLM, is mathematically equivalent to a linear layer applied to one-hot encoded data\n",
# - "- Happy further reading!"
# + "- You may be interested in using different and more powerful LLMs for real-world applications\n",
# + " - For this, you may consider popular tools such as axolotl ([https://github.com/OpenAccess-AI-Collective/axolotl](https://github.com/OpenAccess-AI-Collective/axolotl)) or LitGPT ([https://github.com/Lightning-AI/litgpt](https://github.com/Lightning-AI/litgpt))"
# ] } ],
# @@ -2769,7 +2679,7 @@ "provenance": [] }, "kernelspec": {
# - "display_name": "Python 3 (ipykernel)",
# + "display_name": "anabanana",
# "language": "python", "name": "python3" },
# @@ -2783,7 +2693,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3",
# - "version": "3.11.4"
# + "version": "3.12.1"
# } }, "nbformat": 4,
# diff --git a/previous_labs.py b/previous_labs.py
# index 1cc5219..5a60bcf 100644
# --- a/previous_labs.py
# +++ b/previous_labs.py
# @@ -1,3 +1,5 @@
# +import matplotlib.pyplot as plt
# +from matplotlib.ticker import MaxNLocator
# import tiktoken
# import torch
# import torch.nn as nn
# @@ -217,3 +219,167 @@ def generate_text_simple(model, idx, max_new_tokens, context_size):
# idx = torch.cat((idx, idx_next), dim=1) # (batch, n_tokens+1)
# return idx
# ---------------------------------------------------------------------------


def generate(model, idx, max_new_tokens, context_size, temperature=0.0, top_k=None, eos_id=None):
    """Extend ``idx`` autoregressively by up to ``max_new_tokens`` token ids.

    Args:
        model: Callable applied to the cropped id tensor; its result is
            indexed as ``logits[:, -1, :]`` (last position only).
        idx: Tensor of token ids, sliced as ``idx[:, -context_size:]`` and
            grown along dim 1.
        max_new_tokens: Upper bound on the number of generation steps.
        context_size: Number of trailing ids fed to the model at each step.
        temperature: If > 0, logits are divided by it and the next id is
            sampled with ``torch.multinomial``; otherwise the argmax is taken.
        top_k: If given, logits below each row's k-th largest value are set
            to ``-inf`` before sampling / argmax.
        eos_id: If given, generation stops (without appending) as soon as
            every row's next id equals ``eos_id``.

    Returns:
        ``idx`` with the generated ids concatenated along dim 1.
    """
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        logits = logits[:, -1, :]  # only the last time step matters

        if top_k is not None:
            top_logits, _ = torch.topk(logits, top_k)
            # Keep the trailing dim so every row is compared against its OWN
            # k-th largest value; a 1-D threshold broadcasts along the vocab
            # axis and breaks (or silently mis-filters) for batch sizes > 1.
            min_val = top_logits[:, -1:]
            logits = logits.masked_fill(logits < min_val, float("-inf"))

        if temperature > 0.0:
            logits = logits / temperature
            probs = torch.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
        else:
            # Greedy decoding: pick the highest-logit id per row.
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)

        # Reduce with .all() so the check is well-defined for batch > 1
        # (a bare `if tensor == eos_id` raises for multi-element tensors).
        # For a single sequence this is identical to the scalar comparison.
        if eos_id is not None and bool((idx_next == eos_id).all()):
            break

        idx = torch.cat((idx, idx_next), dim=1)

    return idx


def text_to_token_ids(text, tokenizer):
    """Encode ``text`` with ``tokenizer`` and return the ids with a leading batch dim.

    ``<|endoftext|>`` is passed as an allowed special token to ``tokenizer.encode``.
    """
    encoded = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
    return torch.tensor(encoded).unsqueeze(0)  # add batch dimension


def token_ids_to_text(token_ids, tokenizer):
    """Drop the leading batch dim of ``token_ids`` and decode them with ``tokenizer``."""
    flat = token_ids.squeeze(0)  # remove batch dimension
    return tokenizer.decode(flat.tolist())


def generate_and_print_sample(model, tokenizer, device, start_context):
    """Print a 50-token greedy continuation of ``start_context``.

    The model is switched to eval mode for the generation and back to train
    mode afterwards. The context size is read from
    ``model.pos_emb.weight.shape[0]``. Generation uses ``generate_text_simple``,
    which is defined elsewhere in this module.
    """
    model.eval()
    context_size = model.pos_emb.weight.shape[0]
    encoded = text_to_token_ids(start_context, tokenizer).to(device)
    with torch.no_grad():
        token_ids = generate_text_simple(
            model=model, idx=encoded,
            max_new_tokens=50, context_size=context_size
        )
    decoded_text = token_ids_to_text(token_ids, tokenizer)
    print(decoded_text.replace("\n", " "))  # compact single-line print format
    model.train()


def assign(left, right):
    """Return ``right`` wrapped as a ``torch.nn.Parameter`` after a shape check.

    Args:
        left: Object whose ``.shape`` is the expected shape.
        right: Array-like or tensor holding the new values.

    Raises:
        ValueError: If ``left.shape`` and ``right.shape`` differ.
    """
    if left.shape != right.shape:
        raise ValueError(f"Shape mismatch. Left: {left.shape}, Right: {right.shape}")
    if isinstance(right, torch.Tensor):
        # torch.tensor(tensor) emits a copy-construct warning; clone().detach()
        # is the recommended way to get the same independent copy.
        values = right.clone().detach()
    else:
        values = torch.tensor(right)
    return torch.nn.Parameter(values)


def calc_loss_batch(input_batch, target_batch, model, device):
    """Return the cross-entropy loss of ``model`` on one batch.

    Both batches are moved to ``device``; logits are flattened over their
    first two dims and targets fully flattened before the loss is computed.
    """
    input_batch, target_batch = input_batch.to(device), target_batch.to(device)
    logits = model(input_batch)
    return torch.nn.functional.cross_entropy(logits.flatten(0, 1), target_batch.flatten())


def calc_loss_loader(data_loader, model, device, num_batches=None):
    """Return the mean per-batch loss over the first ``num_batches`` batches.

    Args:
        data_loader: Sized iterable yielding ``(input_batch, target_batch)``.
        model, device: Forwarded to ``calc_loss_batch``.
        num_batches: Number of batches to average over; ``None`` means all.
            Values larger than ``len(data_loader)`` are clipped.

    Returns:
        The average loss as a float, or NaN when there is nothing to average
        (empty loader or a non-positive ``num_batches``).
    """
    n_available = len(data_loader)
    if n_available == 0:
        return float("nan")
    if num_batches is None:
        num_batches = n_available
    else:
        num_batches = min(num_batches, n_available)
    if num_batches <= 0:
        # Mirrors the empty-loader case instead of dividing by zero below.
        return float("nan")

    total_loss = 0.
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i >= num_batches:
            break
        loss = calc_loss_batch(input_batch, target_batch, model, device)
        total_loss += loss.item()
    return total_loss / num_batches


def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses):
    """Plot training/validation loss against epochs with a second tokens-seen axis.

    Side effects: writes the figure to ``loss-plot.pdf`` (relative path) and
    calls ``plt.show()``.
    """
    fig, ax1 = plt.subplots(figsize=(5, 3))

    # Primary x-axis: epochs
    ax1.plot(epochs_seen, train_losses, label="Training loss")
    ax1.plot(epochs_seen, val_losses, linestyle="-.", label="Validation loss")
    ax1.set_xlabel("Epochs")
    ax1.set_ylabel("Loss")
    ax1.legend(loc="upper right")
    ax1.xaxis.set_major_locator(MaxNLocator(integer=True))  # integer epoch ticks only

    # Secondary x-axis sharing the same y-axis: tokens seen
    ax2 = ax1.twiny()
    ax2.plot(tokens_seen, train_losses, alpha=0)  # invisible plot, only aligns the ticks
    ax2.set_xlabel("Tokens seen")

    fig.tight_layout()
    plt.savefig("loss-plot.pdf")
    plt.show()


def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    """Return ``(train_loss, val_loss)`` averaged over at most ``eval_iter`` batches each.

    The model is put in eval mode and gradients are disabled for the
    measurement; the model is switched back to train mode before returning.
    """
    model.eval()
    with torch.no_grad():
        train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter)
        val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)
    model.train()
    return train_loss, val_loss


def train_model_simple(model, train_loader, val_loader, optimizer, device, num_epochs,
                       eval_freq, eval_iter, start_context, tokenizer):
    """Run a basic training loop with periodic evaluation and per-epoch sampling.

    Args:
        model, optimizer, device: Training objects; one optimizer step is
            taken per batch of ``train_loader``.
        train_loader, val_loader: Loaders used for training and evaluation.
        num_epochs: Number of passes over ``train_loader``.
        eval_freq: Evaluate every ``eval_freq`` optimizer steps.
        eval_iter: Number of batches per evaluation (see ``evaluate_model``).
        start_context, tokenizer: Used to print a sample after each epoch.

    Returns:
        ``(train_losses, val_losses, track_tokens_seen)``, one entry per
        evaluation step.
    """
    train_losses, val_losses, track_tokens_seen = [], [], []
    # global_step starts at -1 so the first batch is step 0 and is evaluated.
    tokens_seen, global_step = 0, -1

    for epoch in range(num_epochs):
        model.train()

        for input_batch, target_batch in train_loader:
            optimizer.zero_grad()  # reset gradients from the previous batch
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            loss.backward()
            optimizer.step()
            tokens_seen += input_batch.numel()
            global_step += 1

            # Optional evaluation step
            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(
                    model, train_loader, val_loader, device, eval_iter)
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                track_tokens_seen.append(tokens_seen)
                print(f"Ep {epoch+1} (Step {global_step:06d}): "
                      f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}")

        # Print a sample text after each epoch
        generate_and_print_sample(
            model, tokenizer, device, start_context
        )

    return train_losses, val_losses, track_tokens_seen