From 52ab2791fb3454043c630aea804ce2279464c8c9 Mon Sep 17 00:00:00 2001
From: Charles Frye
Date: Wed, 2 Oct 2024 21:21:49 -0400
Subject: [PATCH] partial rewrite of hp sweep example (#910)

* adjust text, make it possible to run shorter version of example
* adjust logging
* formatting
* partial rewrite of hyperparameter example
* more text cleanup
---
 .../hyperparameter-sweep/hp_sweep_gpt.py    | 609 +++++++++++-------
 06_gpu_and_ml/hyperparameter-sweep/model.py |   9 +-
 2 files changed, 384 insertions(+), 234 deletions(-)

diff --git a/06_gpu_and_ml/hyperparameter-sweep/hp_sweep_gpt.py b/06_gpu_and_ml/hyperparameter-sweep/hp_sweep_gpt.py
index edb5d7461..ce9dfdb33 100644
--- a/06_gpu_and_ml/hyperparameter-sweep/hp_sweep_gpt.py
+++ b/06_gpu_and_ml/hyperparameter-sweep/hp_sweep_gpt.py
@@ -1,68 +1,75 @@
 # ---
 # deploy: true
+# args: ["--n-steps", "200", "--n-steps-before-checkpoint", "50", "--n-steps-before-eval", "50"]
 # ---
-# # LLM Training with Hyperparameter Optimization
-#
-# shakespeare
-#
-# When you want an LLM tailored to your specific data there are three options.
-# The easiest is [Prompt Engineering](https://en.wikipedia.org/wiki/Prompt_engineering)
-# but the quality of the results aren't very high. The next option is
-# [fine-tuning](https://modal.com/docs/examples/llm-finetuning) which is more
-# involved and improves results significantly. The final option is training an LLM
-# from scratch which is the most involved but may allow for the highest caliber results.
-# In addition, you may be able to shrink the model considerably and save money on inference
-# costs after training.
-#
-# In this example we will explore training from scratch. In fact, we'll train
-# 8 LLMs in parallel with different hyperparameters and then select the best
-# one. Along the way we will utlize many Modal utilities: [distributed volumes](https://modal.com/docs/guide/volumes),
-# multiple [web endpoints](https://modal.com/docs/guide/webhooks),
-# and [parallel container execution](https://modal.com/docs/guide/scale#parallel-execution-of-inputs),
-# in essence showing you how to combine multiple techniques into one powerful project. Sound
-# challenging? Modal makes it easy.
-#
-# ## Training
-# ### Basic Setup
-# First we `import modal`, `fastapi` for serving tensorboard, torch
-# LLM model (`AttentionModel`), and a `Dataset` class. The torch model is a nano GPT style model
-# very similar to [Karpathy's](https://github.com/ShariqM/modal_nano_gpt/blob/master/model.py).
-# The `Dataset` class manages the Shakespeare text data which is available
-# [here](https://github.com/ShariqM/modal_nano_gpt/blob/master/model.py).
-import logging as L
+# # Train an SLM from scratch with early-stopping grid search over hyperparameters
 
-import modal
+# ![shakespeare](./shakespeare.png)
 
-L.basicConfig(
-    level=L.INFO,
-    format="%(asctime)s %(levelname)s [%(filename)s.%(funcName)s:%(lineno)d] %(message)s",
-    datefmt="%b %d %H:%M:%S",
-)
+# When you want a language model that performs well on your task, there are three options,
+# ordered by the degree of customization:
+
+# - [**Prompt Engineering**](https://en.wikipedia.org/wiki/Prompt_engineering):
+# large and capable language models understand tasks in natural language, so you can
+# carefully design a natural language "prompt" to elicit the desired behavior.
+
+# - [**Fine-Tuning**](https://modal.com/docs/examples/llm-finetuning):
+# those same language models were trained by gradient descent on data sets representing tasks,
+# and they can be further trained by gradient descent on data sets representative of your task.
+
+# - **Training from Scratch**:
+# if you have enough data for your task, you can throw the pretrained model away and make your own.
+
+# Each step adds engineering complexity, but also leads to a superior cost-performance Pareto frontier
+# for your tasks. Fine-tuned models at one-tenth the size regularly outperform more generic models,
+# and models trained from scratch outperform them.
+
+# Because these models are so much smaller than the Large Language Models that power generic
+# assistant chatbots like ChatGPT and Claude, they are often called _Small Language Models_ (SLMs).
+# In this example, we will explore training an SLM from scratch on Modal.
+
+# In fact, we'll train 8 SLMs in parallel with different hyperparameters
+# and then select the best one for additional training.
+
+# We'll monitor this training live and serve our training and trained models
+# as web endpoints and simple browser UIs.
+
+# Along the way we'll use many features of the Modal platform:
+# [distributed volumes](https://modal.com/docs/guide/volumes),
+# multiple [web endpoints](https://modal.com/docs/guide/webhooks),
+# and [parallel container execution](https://modal.com/docs/guide/scale#parallel-execution-of-inputs).
+
+# Together, these features give every machine learning and AI team
+# the same infrastructural capabilities that the most sophisticated companies
+# have in their internal platforms.
+
+# ## Basic Setup
+
+import logging as L
 import urllib.request
 from dataclasses import dataclass
 from pathlib import Path
 
+import modal
 from fastapi import FastAPI
 from fastapi.responses import FileResponse
 from modal import Image
+from pydantic import BaseModel
 
-# We'll use A10G GPUs for training which are able to train the model
-# in 10 minutes while keeping costs under ~$1. Since the default modal function
-# [timeout](https://modal.com/docs/guide/timeouts) is only 5 minutes
-# we need to increase the 20 minutes.
+MINUTES = 60  # seconds
 
-gpu = "A10G"
-timeout_s = 20 * 60  # 20 minutes
+# We'll use A10G GPUs for training, which can bring the model to recognizably improved performance
+# in ~15 minutes while keeping costs under ~$1.
 
+gpu = "A10G"
 
 # ### Create a Volume
+
 # Since we'll be coordinating training across multiple machines we'll use a
 # single [Volume](https://modal.com/docs/guide/volumes)
-# to store the `dataset`, checkpointed models, and TensorBoard logs. Modal Volumes do
-# not automatically synchronize writes so we'll have to be careful to use
-# `commit()` and `reload()` calls when appropriate.
+# to store the `dataset`, checkpointed models, and TensorBoard logs.
 
 volume = modal.Volume.from_name(
     "example-hp-sweep-gpt-volume", create_if_missing=True
@@ -73,24 +80,35 @@
 tb_log_path = volume_path / "tb_logs"
 save_path = volume_path / "models"
 
-# ### Define a container image
-# The container image is based on the latest Debian slim image with `torch`
-# for training, `gradio` for serving a web interface, and `tensorboard` for
-# monitoring training.
+# ### Defining container images
+
+# The container image for training is based on Modal's default slim Debian Linux image with `torch`
+# for defining and running our neural network and `tensorboard` for monitoring training.
+ +# We also copy over the model definition files from the local machine. image = ( Image.debian_slim(python_version="3.11") .pip_install( "torch==2.1.2", - "gradio~=4.44.0", - "pydantic>=2", "tensorboard==2.17.1", - "fastapi==0.114.2", "numpy<2", ) .copy_local_file(Path(__file__).parent / "model.py", "/root/model.py") ) +# We'll spin up a separate container to monitor the training logs with TensorBoard. + +monitoring_image = Image.debian_slim(python_version="3.11").pip_install( + "tensorboard==2.17.1" +) + +# And we'll deploy a web UI for interacting with our trained models using Gradio. + +ui_image = Image.debian_slim(python_version="3.11").pip_install( + "gradio~=4.44.0", "pydantic>=2", "fastapi==0.114.2" +) + app = modal.App("example-hp-sweep-gpt") with image.imports(): @@ -103,48 +121,61 @@ from model import AttentionModel, Dataset from torch.utils.tensorboard import SummaryWriter -# ### Training Function +# ## Training Function # Here we define the training function making sure to include the `image`, # `volume`, `gpu`, and `timeout` parameters. -# Training consists of specificying optimization parameters, loading the -# `dataset`, building the `model`, setting up tensorboard logging, -# checkpointing, and finally the training itself. +# Training consists of specifying optimization parameters, loading the +# `dataset`, building the `model`, setting up TensorBoard logging & +# checkpointing, and then finally executing the `training_loop` itself. @app.function( - image=image, volumes={volume_path: volume}, gpu=gpu, timeout=timeout_s + image=image, volumes={volume_path: volume}, gpu=gpu, timeout=20 * MINUTES ) def train_model( - node_rank, n_nodes, hparams, experiment_name, run_to_first_save=False + node_rank, + n_nodes, + hparams, + experiment_name, + run_to_first_save=False, + n_steps=3000, + n_steps_before_eval=None, + n_steps_before_checkpoint=None, ): - # Optimization, Data, and Model prep ### + # optimizer, data, and model prep batch_size = 64 - n_steps = 3000 + learning_rate = 3e-4 + n_eval_steps = 100 - n_steps_before_eval = int(n_steps / 8) # eval eight times per run - n_steps_before_checkpoint = int(n_steps / 4) # save four times per run + n_steps_before_eval = ( + n_steps_before_eval + if n_steps_before_eval is not None + else int(n_steps / 8) # eval eight times per run + ) + n_steps_before_checkpoint = ( + n_steps_before_checkpoint + if n_steps_before_checkpoint is not None + else int(n_steps / 4) # save four times per run + ) + train_percent = 0.9 - learning_rate = 3e-4 - prepend_logs = f"[Node {node_rank+1}/{n_nodes}] " - # Use GPU if available. - device = "cuda" if torch.cuda.is_available() else "cpu" - L.info(f"{prepend_logs} Remote Device: {device} // GPU: {gpu}") + L.basicConfig( + level=L.INFO, + format=f"\033[0;32m%(asctime)s %(levelname)s [%(filename)s.%(funcName)s:%(lineno)d] [Node {node_rank+1}/{n_nodes}] %(message)s\033[0m", + datefmt="%b %d %H:%M:%S", + ) - input_file_path = volume_path / "shakespeare_char.txt" - volume.reload() # Make sure we have the latest data. 
-    if not os.path.exists(input_file_path):
-        L.info(f"{prepend_logs} Downloading Shakespeare dataset...")
-        data_url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
-        urllib.request.urlretrieve(data_url, input_file_path)
+    # run on the GPU we requested in the decorator above
+    device = "cuda"
+    L.info("Remote Device: %s // GPU: %s", device, gpu)
 
-        volume.commit()  # Commit to disk
+    input_file_path = volume_path / "shakespeare_char.txt"
+    text = prepare_data(input_file_path, volume)
 
-    # Construct dataset
-    with open(input_file_path, "r", encoding="utf-8") as f:
-        text = f.read()
+    # construct dataset
     dataset = Dataset(
         text,
         train_percent,
@@ -154,63 +185,55 @@ def train_model(
         device,
     )
 
-    # Build Model
-    model = AttentionModel(dataset.vocab_size, hparams, device)
-    model.to(device)
-    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
+    # build model
+    model = build_model(hparams, dataset.vocab_size, device)
     num_parameters = sum(p.numel() for p in model.parameters())
-    L.info(f"{prepend_logs} Num parameters: {num_parameters}")
+    L.info(f"Num parameters: {num_parameters}")
+
+    # setup optimizer
+    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
 
-    # Tensorboard logging & checkpointing prep
+    # TensorBoard logging & checkpointing prep
     model_name = (
        f"{experiment_name}"
        f"_context_size={hparams.context_size}_n_heads={hparams.n_heads}"
        f"_dropout={hparams.dropout}"
     )
-    L.info(f"{prepend_logs} Model Name: {model_name}")
+    L.info(f"Model Name: {model_name}")
 
-    # Save logs to something like:
+    # save logs to something like:
     # volume/logs/E2024-01-01-000000.000000/
     #   E2024-01-01-000000.000000_context=8_n_heads=1_dropout=0.0/train
     model_log_dir = tb_log_path / f"{experiment_name}/{model_name}"
-    os.makedirs(model_log_dir, exist_ok=True)
+    model_log_dir.mkdir(parents=True, exist_ok=True)
     train_writer = SummaryWriter(log_dir=f"{model_log_dir}/train")
     val_writer = SummaryWriter(log_dir=f"{model_log_dir}/val")
 
-    # Save hyperparameters to tensorboard for easy reference
-    pretty_hparams_str = ""
-    for k, v in hparams.__dict__.items():
-        pretty_hparams_str += f"{k}: {v}\n"
-    pretty_hparams_str += f"Num parameters: {num_parameters}"
+    # save hyperparameters to TensorBoard for easy reference
+    pretty_hparams_str = "\n".join(
+        f"{k}: {v}" for k, v in hparams.__dict__.items()
+    )
+    pretty_hparams_str += f"\nNum parameters: {num_parameters}"
     train_writer.add_text("Hyperparameters", pretty_hparams_str)
 
-    # Load & Save models to something like:
-    # volume/models/E2024-01-01-000000.000000/
-    #   E2024-01-01-000000.000000_context=8_n_heads=1_dropout=0.0/nano_gpt_model.pt
     model_save_dir = save_path / experiment_name / model_name
     if model_save_dir.exists():
-        L.info(f"{prepend_logs} Loading model from checkpiont...")
+        L.info("Loading model from checkpoint...")
         checkpoint = torch.load(str(model_save_dir / model_filename))
 
-        if run_to_first_save:
-            L.info(
-                f"{prepend_logs} Already done. Container Restart? Stopping early..."
-            )
-            return node_rank, checkpoint["val_loss"], hparams
-        else:
-            # Create symlink to the best model so it's easy to find for web serving.
+        if not run_to_first_save:
+            # create symlink to the best model so it's easy to find for web serving
             os.symlink(
                 str(model_save_dir / model_filename),
                 str(save_path / experiment_name / best_model_filename),
             )
-            volume.commit()  # Commit the symlink.
+ volume.commit() # commit the symlink model.load_state_dict(checkpoint["model"]) start_step = checkpoint["steps"] + 1 else: - assert run_to_first_save, "should have loaded ckpt" # can remove later. - os.makedirs(model_save_dir, exist_ok=True) + model_save_dir.mkdir(parents=True, exist_ok=True) start_step = 0 - # Save metadata for training restarts and inference + # save metadata for training restarts and inference checkpoint = { "model": model.state_dict(), "chars": dataset.chars, @@ -221,54 +244,29 @@ def train_model( "finished_training": False, } - # Training - t_last = timer() - for step in range(start_step, n_steps + 1): - # sample a batch of data - xb, yb = dataset.get_batch("train") - - # evaluate the loss, calculate & apply gradients - logits, loss = model.forward(xb, yb) - optimizer.zero_grad(set_to_none=True) - loss.backward() - optimizer.step() - - # log training loss - train_writer.add_scalar("Cross Entropy Loss", loss.item(), step) - - # evaluate model on validation set - if step % n_steps_before_eval == 0: - out = dataset.eval_model(model) - runtime_s = timer() - t_last - L.info( - f"{prepend_logs} {step:5d}) // {runtime_s:>5.2f}s" - f" // Train Loss: {out['train']:.2f} // Val Loss:" - f" {out['val']:.2f}" - ) - val_writer.add_scalar("Cross Entropy Loss", out["val"], step) - t_last = timer() - train_writer.flush() - volume.commit() - - # save model with checkpoint information - if step > 0 and step % n_steps_before_checkpoint == 0: - L.info(f"{prepend_logs} Saving model to {model_save_dir}") - checkpoint["finished_training"] = ( - step >= n_steps - ) # Mark as finished if we hit n steps. - checkpoint["steps"] = step - checkpoint["val_loss"] = out["val"] - torch.save(checkpoint, model_save_dir / model_filename) - volume.commit() - if run_to_first_save: - L.info(f"{prepend_logs} Stopping early...") - break + checkpoint_path = model_save_dir / model_filename + + out = training_loop( + start_step, + n_steps, + n_steps_before_eval, + n_steps_before_checkpoint, + dataset, + model, + optimizer, + train_writer, + val_writer, + checkpoint, + checkpoint_path, + run_to_first_save, + ) return node_rank, float(out["val"]), hparams -# ### Main Entry Point -# The main entry point runs coordinates the hyperparameter optimization training. +# ## Launching a hyperparameter sweep from a `local_entrypoint` + +# The main entry point coordinates the hyperparameter optimization. # First we specify the default hyperparameters for the model, taken from # [Karpathy's biggest model](https://www.youtube.com/watch?v=kCc8FmEb1nY&t=5976s), # which add up to 10 million total neural network parameters. @@ -283,48 +281,58 @@ class ModelHyperparameters: dropout: float = 0.2 -# Next we define the main entry point which runs the hyperparameter -# optimization. It will train 8 models in parallel across 8 containers, each +# Next we define the local entrypoint function. +# It will train 8 models in parallel across 8 containers, each # with different hyperparameters, varying the number of heads (`n_heads`), the -# `context_size` -# (called the block size in Karpathy lingo), and the dropout rate (`dropout`). To run in +# `context_size` (called the block size in Karpathy lingo), and the dropout rate (`dropout`). To run in # parallel we need to use the [starmap function](https://modal.com/docs/guide/scale#parallel-execution-of-inputs). 
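+# As a minimal sketch of how `starmap` fans work out across containers
+# (using a hypothetical toy function, not part of this example), each tuple of
+# arguments becomes one remote call, and results stream back as they complete:
+
+# ```python
+# @app.function()
+# def add(a, b):
+#     return a + b
+#
+#
+# @app.local_entrypoint()
+# def demo():
+#     for result in add.starmap([(1, 2), (3, 4)], order_outputs=False):
+#         print(result)  # 3 and 7, in whichever order they finish
+# ```
+
+# We use exactly this pattern below, with `order_outputs=False` so that we can
+# report each node's result as soon as it comes in.
+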
-# -# Training for each model until the first checkpoint, and then stop early so we + +# We train all of the models until the first checkpoint, and then stop early so we # can compare the validation losses. Then we'll restart training for the best # model and save it to the models directory. @app.local_entrypoint() -def main(): +def main( + n_steps: int = 3000, + n_steps_before_checkpoint: int = None, + n_steps_before_eval: int = None, +): from datetime import datetime + from itertools import product - experiment_name = f"E{datetime.now().strftime('%Y-%m%d-%H%M%S.%f')}" + experiment_name = f"E{datetime.now().strftime('%Y-%m-%d-%H%M%S.%f')}" default_hparams = ModelHyperparameters() - # Build list of hyperparameters to train & validate - hparams_list = [] - h_options = (1, default_hparams.n_heads) - c_options = (8, default_hparams.context_size) - d_options = (0.1, default_hparams.dropout) + # build list of hyperparameters to train & validate + nheads_options = (1, default_hparams.n_heads) + context_size_options = (8, default_hparams.context_size) + dropout_options = (0.1, default_hparams.dropout) hparams_list = [ - ModelHyperparameters( - n_heads=n_heads, context_size=context_size, dropout=dropout + ModelHyperparameters(n_heads=h, context_size=c, dropout=d) + for h, c, d in product( + nheads_options, context_size_options, dropout_options ) - for n_heads in h_options - for context_size in c_options - for dropout in d_options ] - # Run training for each hyperparameter setting + # run training for each hyperparameter setting results = [] stop_early = True # stop early so we can compare val losses - L.info(f"Testing {len(hparams_list)} hyperparameter settings") + print(f"Testing {len(hparams_list)} hyperparameter settings") n_nodes = len(hparams_list) for result in train_model.starmap( [ - (i, n_nodes, h, experiment_name, stop_early) + ( + i, + n_nodes, + h, + experiment_name, + stop_early, + n_steps, + n_steps_before_eval, + n_steps_before_checkpoint, + ) for i, h in enumerate(hparams_list) ], order_outputs=False, @@ -332,23 +340,32 @@ def main(): # result = (node_rank, val_loss, hparams) node_rank = result[0] results.append(result) - L.info( + print( f"[Node {node_rank+1}/{n_nodes}] Finished." f" Early stop val loss result: {result[1:]}" ) - # Find the model and hparams with the lowest validation loss + # find the model and hparams with the lowest validation loss best_result = min(results, key=lambda x: x[1]) - L.info(f"Best early stop val loss result: {best_result}") + print(f"Best early stop val loss result: {best_result}") best_hparams = best_result[-1] - # Finish training with best hparams + # finish training with best hparams node_rank = 0 - n_nodes = 1 # Only one node for final training. 
- train_model.remote(node_rank, n_nodes, best_hparams, experiment_name) + n_nodes = 1 # only one node for final training run + train_model.remote( + node_rank, + n_nodes, + best_hparams, + experiment_name, + not stop_early, + n_steps, + n_steps_before_eval, + n_steps_before_checkpoint, + ) -# After running `modal run hp_sweep_gpt::main` you should see output like this: +# After running `modal run hp_sweep_gpt.py` you should see output like this: # ``` # Sep 16 21:20:39 INFO [hp_sweep_gpt.py.train_model:127] [Node 1/8] Remote Device: cuda // GPU: A10G # Sep 16 21:20:40 INFO [hp_sweep_gpt.py.train_model:149] [Node 1/8] Num parameters: 10693697 @@ -359,26 +376,34 @@ def main(): # ``` -# ### Bonus: Tensorboard Web App -# To monitor our training we will create a Tensorboard WSGI web app, it will +# ### Monitoring experiments with TensorBoard + +# To monitor our training we will create a TensorBoard WSGI web app, it will # display the progress of our training across all 8 models. We'll use the latest -# experiment tensorboard logs available on the `volume`. -@app.function(image=image, volumes={volume_path: volume}) +# logs for the most recent experiment written to the Volume. + + +@app.function(image=monitoring_image, volumes={volume_path: volume}) @modal.wsgi_app() def monitor_training(): import time - L.info("Tensorboard: Waiting 10 seconds for training to start...") - time.sleep(10) # Wait for experiment folder to be created by training. - volume.reload() # Make sure we have the latest data. + print("📈 TensorBoard: Waiting for logs...") + ct = 0 + while not tb_log_path.exists(): + ct += 1 + if ct > 10: + raise Exception("No logs found after 10 seconds.") + volume.reload() # make sure we have the latest data. + time.sleep(1) - # Obtain the latest log path + # obtain the latest log path tb_log_paths = glob.glob(f"{tb_log_path}/*") latest_tb_log_path = max(tb_log_paths, key=os.path.getctime) monitor_path = Path(latest_tb_log_path) - L.info(f"Monitoring: {monitor_path.name}") + print(f"📈 Monitoring: {monitor_path.name}") - # Start tensorboard with the latest log path + # start TensorBoard with the latest log path board = tensorboard.program.TensorBoard() board.configure(logdir=str(monitor_path)) (data_provider, deprecated_multiplexer) = board._make_data_provider() @@ -392,30 +417,35 @@ def monitor_training(): return wsgi_app -# After training your Tensorboard will look something like this: -# [[./tensorboard.png|alt=tensorboard]] +# After training your TensorBoard UI will look something like this: + +# [[./tensorboard.png|alt=TensorBoard]] + # Notice that there are 8 models training, and the one with the lowest # validation loss at step 600 continues training to 3000 steps. -# ## Web Serving (another bonus) +# ## Serving the trained model as a web endpoint + # ### Setup + # Initialize some variables for web serving: web_app = FastAPI() assets_path = Path(__file__).parent / "assets" -# ### Inference class -# Now we will create a class for running inference only on the trained model. -# -# We choose the latest experiment that has a best model checkpoint -# and to load that model for inference. In case training is still ongoing, +# ### Remote inference with Modal `Cls`es + +# Now we will create a class for running model inference. + +# We choose the latest experiment that has a "best" model checkpoint +# and load that model for inference. In case training is still ongoing, # we check for updated models on the fly and load them if available. 
@app.cls(image=image, volumes={volume_path: volume}, gpu=gpu) class ModelInference: def build_encode_decode(self, chars): - # Create funcs for converting text into digits (encode) and + # create funcs for converting text into digits (encode) and # vice versa (decode) stoi = {c: i for i, c in enumerate(chars)} itos = {i: c for i, c in enumerate(chars)} @@ -429,7 +459,7 @@ def decode(l): return encode, decode def load_model_impl(self): - # Loop through all model dirs and load the latest available model + # loop through all model dirs and load the latest available model save_model_dirs = glob.glob(f"{save_path}/*") sorted_model_dirs = sorted( save_model_dirs, key=os.path.getctime, reverse=True @@ -438,12 +468,12 @@ def load_model_impl(self): for latest_model_dir in sorted_model_dirs: if self.use_model_dir == latest_model_dir and self.is_fully_trained: return # Already loaded - L.info(f"Attemping to load from: {latest_model_dir} ...") + print(f"Attemping to load from: {latest_model_dir} ...") try: checkpoint = torch.load( f"{latest_model_dir}/{best_model_filename}" ) - L.info("Successfully loaded model.") + print("Successfully loaded model.") found_model = True break except Exception as e: @@ -451,7 +481,6 @@ def load_model_impl(self): if not found_model: raise Exception("No models ready for serving.") - # Model loaded successfully. Print info about the model self.use_model_dir = latest_model_dir hparams = checkpoint["hparams"] chars = checkpoint["chars"] @@ -459,13 +488,13 @@ def load_model_impl(self): val_loss = checkpoint["val_loss"] self.is_fully_trained = checkpoint["finished_training"] - L.info( + print( f"Loaded model with {steps} train steps " f" and val loss of {val_loss:.2f}" f" (fully_trained={self.is_fully_trained}" ) - # Reconstruct encode/decode + # reconstruct encode/decode vocab_size = len(chars) self.encode, self.decode = self.build_encode_decode(chars) @@ -483,21 +512,23 @@ def load_model(self): @modal.method() def generate(self, prompt): - self.load_model_impl() # Load updated model if aviailable, o/w no op. + self.load_model_impl() # load updated model if available - # Generate 1000 new characters from input prompt + # generate 1000 new characters from input prompt n_new_tokens = 1000 encoded_prompt = self.encode(prompt) - # Create a torch tensor from the encoded prompt + # create a torch tensor from the encoded prompt torch_input = torch.tensor(encoded_prompt, dtype=torch.long) - torch_input = torch_input.view(1, len(torch_input)) # Add batch dim. + torch_input = torch_input.view(1, len(torch_input)) # add batch dim torch_input = torch_input.to(self.device) - # Generate new tokens + # generate new tokens gen_out = self.model.generate(torch_input, n_new_tokens)[0] # 0th batch - # Decode from digits to text - chars_out = self.decode([x for x in gen_out.tolist()]) - # Join the characters into a string and return + # decode from digits to text + chars_out = self.decode([x for x in gen_out.tolist()])[ + len(encoded_prompt) : + ] + # join the characters into a string and return str_out = "".join(chars_out) return str_out @@ -505,45 +536,61 @@ def generate(self, prompt): # First, we create a simple POST web endpoint for generating text. 
+
+class GenerationRequest(BaseModel):
+    prompt: str
+
+
 @app.function()
 @modal.web_endpoint(method="POST", docs=True)
-def web_generate(item: dict):
-    output = ModelInference().generate.remote(item["prompt"])
-    return {"web_generate": output}
+def web_generate(request: GenerationRequest):
+    output = ModelInference().generate.remote(request.prompt)
+    return {"output": output}
 
 
 # That will allow us to generate text via a simple `curl` command like this:
+
 # ```bash
-# curl -X POST -H 'Content-Type: application/json' --data-binary '{"prompt": "\n"}' https://shariqm--modal-nano-gpt-web-generate-dev.modal.run
+# curl -X POST -H 'Content-Type: application/json' --data-binary '{"prompt": "\n"}' https://your-workspace-name--example-hp-sweep-gpt-web-generate-dev.modal.run
 # ```
+
 # which will return something like:
-# ```bash
-# {'web_generate':'\nBRUTUS:\nThe broy trefore anny pleasory to\nwip me state of villoor so:\nFortols listhey for brother beat the else\nBe all, ill of lo-love in igham;\nAh, here all that queen and hould you father offer'}
+
+# ```json
+# {
+# "output":
+# "BRUTUS:
+# The broy trefore anny pleasory to
+# wip me state of villoor so:
+# Fortols listhey for brother beat the else
+# Be all, ill of lo-love in igham;
+# Ah, here all that queen and hould you father offer"
+# }
 # ```
-#
+
 # It's not exactly Shakespeare, but at least it shows our model learned something!
 
-# Second, we create a Gradio web app for generating text via a nice looking
-# website. Notice that we don't include a `gpu` in the `app.function`
-# parameters since it's not needed, saving us GPU costs for this container.
+# ### Serving a Gradio UI with `asgi_app`
+
+# Second, we create a Gradio web UI for generating text via a graphical user interface in the browser.
+# That way our fellow team members and stakeholders can easily interact with the model and give feedback.
@app.function( - image=image, + image=ui_image, concurrency_limit=1, allow_concurrent_inputs=1000, mounts=[modal.Mount.from_local_dir(assets_path, remote_path="/assets")], ) @modal.asgi_app() -def fastapi_app(): +def ui(): import gradio as gr from gradio.routes import mount_gradio_app - # Call out to the inference in a separate Modal environment with a GPU + # call out to the inference in a separate Modal environment with a GPU def go(text=""): if not text: text = "\n" - return ModelInference().generate.remote(text) + return text + ModelInference().generate.remote(text) example_prompts = [ "DUKE OF YORK:\nWhere art thou Lucas?", @@ -568,14 +615,14 @@ async def background(): primary_hue="green", secondary_hue="emerald", neutral_hue="neutral" ) - # add a gradio UI around inference + # add a Gradio UI around inference with gr.Blocks(theme=theme, css=css, title="Tiny LLM") as interface: - # Title + # title gr.Markdown( "# Generate Shakespeare text using the prompt", ) - # Input and Output + # input and output with gr.Row(): with gr.Column(): gr.Markdown("## Input:") @@ -591,7 +638,7 @@ async def background(): lines=10, ) - # Button to trigger inference and a link to Modal + # button to trigger inference and a link to Modal with gr.Row(): btn = gr.Button("Generate", variant="primary", scale=2) btn.click( @@ -604,7 +651,7 @@ async def background(): link="https://modal.com", ) - # Example prompts + # example prompts with gr.Column(variant="compact"): # add in a few examples to inspire users for ii, prompt in enumerate(example_prompts): @@ -619,10 +666,106 @@ async def background(): ) -# The Gradio app will look something like this: +# The Gradio UI will look something like this: # [[./gradio.png|alt=gradio]] -# We hope you enjoyed this example. Message us on Slack if you need help! +# ## Addenda + +# The remainder of this code is boilerplate for the training loop. +# There's a lot! If you'd rather not write this stuff yourself, +# consider a training framework like [PyTorch Lightning](https://lightning.ai/docs/pytorch/stable) +# or [Hugging Face](https://huggingface.co/transformers/main_classes/trainer.html). 
+
+
+def log_evals(result, step, t_last, val_writer, train_writer):
+    runtime_s = timer() - t_last
+    L.info(
+        f"{step:5d}) // {runtime_s:>5.2f}s"
+        f" // Train Loss: {result['train']:.2f} // Val Loss:"
+        f" {result['val']:.2f}"
+    )
+    val_writer.add_scalar("Cross Entropy Loss", result["val"], step)
+    val_writer.add_text("Sample Output", result["sample"], step)
+    train_writer.flush()
+
+    return result
+
+
+def training_loop(
+    start_step,
+    n_steps,
+    n_steps_before_eval,
+    n_steps_before_checkpoint,
+    dataset,
+    model,
+    optimizer,
+    train_writer,
+    val_writer,
+    checkpoint,
+    checkpoint_path,
+    run_to_first_save,
+):
+    t_last = timer()
+    out = None  # results of the most recent evaluation, if any
+    for step in range(start_step, n_steps + 1):
+        # sample a batch of data
+        xb, yb = dataset.get_batch("train")
+
+        # evaluate the loss, calculate & apply gradients
+        logits, loss = model.forward(xb, yb)
+        optimizer.zero_grad(set_to_none=True)
+        loss.backward()
+        optimizer.step()
+
+        # log training loss
+        train_writer.add_scalar("Cross Entropy Loss", loss.item(), step)
+
+        # evaluate model on validation set
+        if step % n_steps_before_eval == 0:
+            out = dataset.eval_model(model)
+            log_evals(out, step, t_last, val_writer, train_writer)
+            volume.commit()
+            t_last = timer()
+
+        # save model with checkpoint information
+        if step > 0 and step % n_steps_before_checkpoint == 0:
+            if out is None:  # checkpointing before the first eval: run one now
+                out = dataset.eval_model(model)
+            checkpoint["steps"] = step
+            checkpoint["val_loss"] = out["val"]
+
+            # mark as finished if we hit n steps.
+            checkpoint["finished_training"] = step >= n_steps
+
+            L.info(f"Saving checkpoint to {checkpoint_path}")
+            save_checkpoint(checkpoint, checkpoint_path)
+
+            if run_to_first_save:
+                L.info("Stopping early...")
+                break
+    return out
+
+
+def save_checkpoint(checkpoint, checkpoint_path):
+    torch.save(checkpoint, checkpoint_path)
+    volume.commit()
+
+
+def prepare_data(input_file_path: Path, volume: modal.Volume) -> str:
+    """Download and read the dataset."""
+    volume.reload()
+    if not input_file_path.exists():
+        L.info("Downloading Shakespeare dataset...")
+        data_url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
+        urllib.request.urlretrieve(data_url, input_file_path)
+        volume.commit()
+    return input_file_path.read_text()
+
+
+def build_model(hparams, vocab_size, device):
+    """Initialize the model and move it to the device."""
+    model = AttentionModel(vocab_size, hparams, device)
+    model.to(device)
+    return model
+
 
-# ## Further Examples
-# [Scale out](https://modal.com/docs/guide/scale#parallel-execution-of-inputs)
+def setup_optimizer(model, learning_rate):
+    """Set up the optimizer for the model."""
+    return torch.optim.AdamW(model.parameters(), lr=learning_rate)
diff --git a/06_gpu_and_ml/hyperparameter-sweep/model.py b/06_gpu_and_ml/hyperparameter-sweep/model.py
index 19dbe72db..a23816bce 100644
--- a/06_gpu_and_ml/hyperparameter-sweep/model.py
+++ b/06_gpu_and_ml/hyperparameter-sweep/model.py
@@ -89,9 +89,16 @@ def eval_model(self, model):
         losses = torch.zeros(self.n_eval_steps)
         for k in range(self.n_eval_steps):
             xb, yb = self.get_batch(split)
-            logits, loss = model.forward(xb, yb)  # Modal: Why need forward?
+            logits, loss = model.forward(xb, yb)
             losses[k] = loss
         out[split] = losses.mean()
+        torch_input = torch.tensor(self.encode("HAMLET:\n"), dtype=torch.long)
+        torch_input = torch_input.view(1, len(torch_input))  # add batch dim
+        torch_input = torch_input.to(self.device)
+        out["sample"] = "".join(
+            self.decode(model.generate(torch_input, 100)[0].tolist())
+        )
+
         model.train()
         return out