From f8a7164c25f6ccf7d9de81c3fd9380eefd0bc187 Mon Sep 17 00:00:00 2001
From: Gracechung-sw
Date: Tue, 25 Jun 2024 16:09:16 +0900
Subject: [PATCH] add code about hyperparameter tuner of sagemaker

---
 .../clip06_hyperparameter_tuning.ipynb        | 240 ++++++++++++++++++
 .../hyperparameter_tuner/code/train.py        | 197 ++++++++++++++
 2 files changed, 437 insertions(+)
 create mode 100644 aws-sagemaker/hyperparameter_tuner/clip06_hyperparameter_tuning.ipynb
 create mode 100644 aws-sagemaker/hyperparameter_tuner/code/train.py

diff --git a/aws-sagemaker/hyperparameter_tuner/clip06_hyperparameter_tuning.ipynb b/aws-sagemaker/hyperparameter_tuner/clip06_hyperparameter_tuning.ipynb
new file mode 100644
index 0000000..93f6360
--- /dev/null
+++ b/aws-sagemaker/hyperparameter_tuner/clip06_hyperparameter_tuning.ipynb
@@ -0,0 +1,240 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0d4fe723-1c13-4679-b415-e1554c5ca1a4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "!{sys.executable} -m pip install --upgrade pip\n",
+    "!{sys.executable} -m pip install --upgrade boto3\n",
+    "!{sys.executable} -m pip install --upgrade sagemaker\n",
+    "!{sys.executable} -m pip install --upgrade tensorflow"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "01b0b892-37ba-4d74-9a17-65b1295c50de",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import json\n",
+    "\n",
+    "import sagemaker\n",
+    "from sagemaker.tensorflow import TensorFlow\n",
+    "from sagemaker import get_execution_role\n",
+    "\n",
+    "sess = sagemaker.Session()\n",
+    "region = sess.boto_region_name\n",
+    "role = get_execution_role()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9b84ddf1-573d-48b7-8e98-4c265cd2e225",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Code that downloads the dataset\n",
+    "# Data download from http://yann.lecun.com/exdb/mnist/\n",
+    "\n",
+    "import logging\n",
+    "import boto3\n",
+    "from botocore.exceptions import ClientError\n",
+    "\n",
+    "public_bucket = f\"sagemaker-example-files-prod-{region}\"\n",
+    "local_data_dir = \"/tmp/data\"\n",
+    "\n",
+    "\n",
+    "# Download training and testing data from a public S3 bucket\n",
+    "def download_from_s3(data_dir=\"/tmp/data\", train=True):\n",
+    "    \"\"\"Download MNIST dataset and convert it to numpy array\n",
+    "\n",
+    "    Args:\n",
+    "        data_dir (str): directory to save the data\n",
+    "        train (bool): download training set\n",
+    "\n",
+    "    Returns:\n",
+    "        None\n",
+    "    \"\"\"\n",
+    "    # project root\n",
+    "    if not os.path.exists(data_dir):\n",
+    "        os.makedirs(data_dir)\n",
+    "\n",
+    "    if train:\n",
+    "        images_file = \"train-images-idx3-ubyte.gz\"\n",
+    "        labels_file = \"train-labels-idx1-ubyte.gz\"\n",
+    "    else:\n",
+    "        images_file = \"t10k-images-idx3-ubyte.gz\"\n",
+    "        labels_file = \"t10k-labels-idx1-ubyte.gz\"\n",
+    "\n",
+    "    # download objects\n",
+    "    s3 = boto3.client(\"s3\")\n",
+    "    bucket = public_bucket\n",
+    "    for obj in [images_file, labels_file]:\n",
+    "        key = os.path.join(\"datasets/image/MNIST\", obj)\n",
+    "        dest = os.path.join(data_dir, obj)\n",
+    "        if not os.path.exists(dest):\n",
+    "            s3.download_file(bucket, key, dest)\n",
+    "    return\n",
+    "\n",
+    "\n",
+    "download_from_s3(local_data_dir, True)\n",
+    "download_from_s3(local_data_dir, False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "370ca19f-6f75-4ab5-aba1-23f8c494898f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create the TensorFlow Estimator\n",
+    "est = TensorFlow(\n",
+    "    entry_point=\"train.py\", # train.py: the training script; it is stored in the code directory\n",
+    "    source_dir=\"code\",\n",
+    "    role=role,\n",
+    "    framework_version=\"2.3.1\",\n",
+    "    # model_dir: directory on the remote SageMaker training instance where model artifacts are saved\n",
+    "    model_dir=\"/opt/ml/model\",\n",
+    "    py_version=\"py37\",\n",
+    "    instance_type=\"ml.m5.xlarge\",\n",
+    "    instance_count=1,\n",
+    "    volume_size=250, # volume size: 250 GB\n",
+    "    hyperparameters={\n",
+    "        \"batch-size\": 512,\n",
+    "        \"epochs\": 4,\n",
+    "    },\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "630e65fa-034e-4efb-bb2c-e1e9fb57c222",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "prefix = \"mnist\"\n",
+    "bucket = sess.default_bucket()\n",
+    "\n",
+    "\"\"\"\n",
+    "Uploads data from the local directory to an S3 bucket.\n",
+    "\n",
+    "Args:\n",
+    "    local_data_dir (str): The local directory path where the data is located.\n",
+    "\n",
+    "Returns:\n",
+    "    dict: A dictionary containing the S3 bucket location for the training and testing data.\n",
+    "\n",
+    "\"\"\"\n",
+    "loc = sess.upload_data(path=local_data_dir, bucket=bucket, key_prefix=prefix)\n",
+    "\n",
+    "channels = {\"training\": loc, \"testing\": loc}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "753a86df-8ddd-422e-955a-7e6899ad3fe9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Hyperparameter Tuning\n",
+    "# Define the parameter ranges to search during hyperparameter tuning\n",
+    "from sagemaker.tuner import ContinuousParameter, HyperparameterTuner\n",
+    "\n",
+    "# Search learning rates between 1e-4 and 1e-3\n",
+    "hyperparameter_range = {\"learning-rate\": ContinuousParameter(1e-4, 1e-3)}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c10a6615-84ba-45d2-8f8c-749d7a9ebeec",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "objective_metric_name = \"average test loss\"\n",
+    "# Decide how the objective metric should be optimized\n",
+    "objective_type = \"Minimize\" # use Maximize instead if a larger metric value is better\n",
+    "\n",
+    "metric_definitions = [\n",
+    "    {\n",
+    "        \"Name\": \"average test loss\",\n",
+    "        \"Regex\": \"Test Loss: ([0-9\\\\.]+)\",\n",
+    "    }\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "55a5a9b8-0ffb-41f4-aa3f-de5f6e32dcf4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tuner = HyperparameterTuner(\n",
+    "    est,\n",
+    "    objective_metric_name,\n",
+    "    hyperparameter_range,\n",
+    "    metric_definitions,\n",
+    "    max_jobs=3,\n",
+    "    max_parallel_jobs=3,\n",
+    "    objective_type=objective_type\n",
+    ")\n",
+    "\n",
+    "tuner.fit(inputs=channels)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e359fb4a-ad25-4a22-8058-d594397b6a1d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Deploy model using SageMaker Endpoint SDK\n",
+    "\n",
+    "from sagemaker import Model\n",
+    "\n",
+    "model = Model(model_data='s3://your-s3-bucket/path/to/model.tar.gz',\n",
+    "              # e.g., '763104351884.dkr.ecr.us-west-2.amazonaws.com/tensorflow-inference:2.3.1-cpu'\n",
+    "              image_uri='your-container-image-url',\n",
+    "              role='arn:aws:iam::account-id:role/role-name')\n",
+    "\n",
+    "predictor = model.deploy(initial_instance_count=1,\n",
+    "                         instance_type='ml.m4.xlarge')\n",
+    "result = predictor.predict(input_data) # input_data: the inference payload to send (not defined in this notebook)\n",
+    "predictor.delete_endpoint() # Always delete the endpoint after the exercise to avoid unnecessary charges."
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/aws-sagemaker/hyperparameter_tuner/code/train.py b/aws-sagemaker/hyperparameter_tuner/code/train.py new file mode 100644 index 0000000..7211e59 --- /dev/null +++ b/aws-sagemaker/hyperparameter_tuner/code/train.py @@ -0,0 +1,197 @@ +from __future__ import print_function + +import argparse +import gzip +import json +import logging +import os +import sys +import traceback + +import numpy as np +import tensorflow as tf +from tensorflow.keras import Model +from tensorflow.keras.layers import Conv2D, Dense, Flatten + +logger = logging.getLogger(__name__) +logger.setLevel(logging.DEBUG) +logger.addHandler(logging.StreamHandler(sys.stdout)) + + +# Define the model object + + +class SmallConv(Model): + def __init__(self): + super(SmallConv, self).__init__() + self.conv1 = Conv2D(32, 3, activation="relu") + self.flatten = Flatten() + self.d1 = Dense(128, activation="relu") + self.d2 = Dense(10) + + def call(self, x): + x = self.conv1(x) + x = self.flatten(x) + x = self.d1(x) + return self.d2(x) + + +# Decode and preprocess data +def convert_to_numpy(data_dir, images_file, labels_file): + """Byte string to numpy arrays""" + with gzip.open(os.path.join(data_dir, images_file), "rb") as f: + images = np.frombuffer(f.read(), np.uint8, offset=16).reshape(-1, 28, 28) + + with gzip.open(os.path.join(data_dir, labels_file), "rb") as f: + labels = np.frombuffer(f.read(), np.uint8, offset=8) + + return (images, labels) + + +def mnist_to_numpy(data_dir, train): + """Load raw MNIST data into numpy array + + Args: + data_dir (str): directory of MNIST raw data. 
+            This argument can be accessed via SM_CHANNEL_TRAINING
+
+        train (bool): use training data
+
+    Returns:
+        tuple of images and labels as numpy array
+    """
+
+    if train:
+        images_file = "train-images-idx3-ubyte.gz"
+        labels_file = "train-labels-idx1-ubyte.gz"
+    else:
+        images_file = "t10k-images-idx3-ubyte.gz"
+        labels_file = "t10k-labels-idx1-ubyte.gz"
+
+    return convert_to_numpy(data_dir, images_file, labels_file)
+
+
+def normalize(x, axis):
+    eps = np.finfo(float).eps
+    mean = np.mean(x, axis=axis, keepdims=True)
+    # avoid division by zero
+    std = np.std(x, axis=axis, keepdims=True) + eps
+    return (x - mean) / std
+
+
+# Training logic
+
+
+def train(args):
+    # load data from the train / test channels into numpy arrays
+    x_train, y_train = mnist_to_numpy(data_dir=args.train, train=True)
+    x_test, y_test = mnist_to_numpy(data_dir=args.test, train=False)
+
+    x_train, x_test = x_train.astype(np.float32), x_test.astype(np.float32)
+
+    # normalize the inputs to mean 0 and std 1
+    x_train, x_test = normalize(x_train, (1, 2)), normalize(x_test, (1, 2))
+
+    # expand channel axis
+    # tf uses channels-last (depth minor) convention
+    x_train, x_test = np.expand_dims(x_train, axis=3), np.expand_dims(x_test, axis=3)
+
+    # build shuffled, batched tf.data pipelines for training and evaluation
+    train_loader = (
+        tf.data.Dataset.from_tensor_slices((x_train, y_train))
+        .shuffle(len(x_train))
+        .batch(args.batch_size)
+    )
+
+    test_loader = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(args.batch_size)
+
+    model = SmallConv()
+    model.compile()
+    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
+    optimizer = tf.keras.optimizers.Adam(
+        learning_rate=args.learning_rate, beta_1=args.beta_1, beta_2=args.beta_2
+    )
+
+    train_loss = tf.keras.metrics.Mean(name="train_loss")
+    train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name="train_accuracy")
+
+    test_loss = tf.keras.metrics.Mean(name="test_loss")
+    test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name="test_accuracy")
+
+    @tf.function
+    def train_step(images, labels):
+        with tf.GradientTape() as tape:
+            predictions = model(images, training=True)
+            loss = loss_fn(labels, predictions)
+        grad = tape.gradient(loss, model.trainable_variables)
+        optimizer.apply_gradients(zip(grad, model.trainable_variables))
+
+        train_loss(loss)
+        train_accuracy(labels, predictions)
+        return
+
+    @tf.function
+    def test_step(images, labels):
+        predictions = model(images, training=False)
+        t_loss = loss_fn(labels, predictions)
+        test_loss(t_loss)
+        test_accuracy(labels, predictions)
+        return
+
+    logger.info("Training starts ...")
+    for epoch in range(args.epochs):
+        train_loss.reset_states()
+        train_accuracy.reset_states()
+        test_loss.reset_states()
+        test_accuracy.reset_states()
+
+        for batch, (images, labels) in enumerate(train_loader):
+            train_step(images, labels)
+
+        logger.info(
+            f"Epoch {epoch + 1}, "
+            f"Loss: {train_loss.result()}, "
+            f"Accuracy: {train_accuracy.result()}, "
+        )
+
+        for images, labels in test_loader:
+            test_step(images, labels)
+
+        # metric for the hyperparameter tuner
+        logger.info(f"Test Loss: {test_loss.result()}")
+        logger.info(f"Test Accuracy: {test_accuracy.result()}")
+
+    # Save the model
+    # A version number is needed for the serving container
+    # to load the model
+    version = "00000000"
+    ckpt_dir = os.path.join(args.model_dir, version)
+    if not os.path.exists(ckpt_dir):
+        os.makedirs(ckpt_dir)
+    model.save(ckpt_dir)
+    return
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--batch-size", type=int,
default=32) + parser.add_argument("--epochs", type=int, default=1) + parser.add_argument("--learning-rate", type=float, default=1e-3) + parser.add_argument("--beta_1", type=float, default=0.9) + parser.add_argument("--beta_2", type=float, default=0.999) + + # Environment variables given by the training image + parser.add_argument("--model_dir", type=str, default=os.environ["SM_MODEL_DIR"]) + parser.add_argument("--train", type=str, default=os.environ["SM_CHANNEL_TRAINING"]) + parser.add_argument("--test", type=str, default=os.environ["SM_CHANNEL_TESTING"]) + + parser.add_argument("--current-host", type=str, default=os.environ["SM_CURRENT_HOST"]) + parser.add_argument("--hosts", type=list, default=json.loads(os.environ["SM_HOSTS"])) + + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + train(args)