From cbc2b1bca9d610731ac924ceef91f9acbdefd701 Mon Sep 17 00:00:00 2001
From: Sindhu Somasundaram <56774226+sindhuvahinis@users.noreply.github.com>
Date: Tue, 12 Dec 2023 17:15:43 -0800
Subject: [PATCH] [CI][fix] Sagemaker integration test cloudwatch metrics fix
 (#1385)

---
 .github/workflows/sagemaker-integration.yml |  24 ++--
 .../llm/sagemaker-endpoint-tests.py         | 107 +++++++++++-------
 2 files changed, 80 insertions(+), 51 deletions(-)

diff --git a/.github/workflows/sagemaker-integration.yml b/.github/workflows/sagemaker-integration.yml
index 1662b8176..f516fddc5 100644
--- a/.github/workflows/sagemaker-integration.yml
+++ b/.github/workflows/sagemaker-integration.yml
@@ -16,9 +16,9 @@ on:
         required: false
         default: ''
       run_benchmark:
-        description: 'Runs benchmark and upload to cloud watch mertcis if set to true.'
+        description: 'Runs benchmark and upload to cloud watch metrics'
         required: false
-        default: true
+        default: 'true'
 
   schedule:
     - cron: '0 4 * * *'
@@ -57,7 +57,8 @@ jobs:
     timeout-minutes: 120
     needs: create-runners
     env:
-      run_benchmark: ${{ github.event.inputs.run_benchmark }}
+      run_benchmark: ${{ github.event.inputs.run_benchmark || 'true' }}
+      image_type: ${{ github.event.inputs.mode || 'nightly' }}
     steps:
       - uses: actions/checkout@v3
       - name: Set up Python3
@@ -77,25 +78,25 @@
       - name: MME Test
         working-directory: tests/integration
         run: |
-          python3 llm/sagemaker-endpoint-tests.py deepspeed-mme djl_mme ${{ github.event.inputs.mode || 'nightly' }}
+          python3 llm/sagemaker-endpoint-tests.py deepspeed-mme djl_mme ${image_type} ${run_benchmark}
       - name: Test gpt2xl
         working-directory: tests/integration
         run: |
-          python3 llm/sagemaker-endpoint-tests.py gpt2-xl djl ${{ github.event.inputs.mode || 'nightly' }}
+          python3 llm/sagemaker-endpoint-tests.py gpt2-xl djl ${image_type} ${run_benchmark}
           echo "sleep 30 seconds to allow endpoint deletion"
           sleep 30
       - name: Test stable diffusion
         if: success() || failure()
         working-directory: tests/integration
         run: |
-          python3 llm/sagemaker-endpoint-tests.py stable-diffusion-2-1-base djl ${{ github.event.inputs.mode || 'nightly' }}
+          python3 llm/sagemaker-endpoint-tests.py stable-diffusion-2-1-base djl ${image_type} ${run_benchmark}
           echo "sleep 30 seconds to allow endpoint deletion"
           sleep 30
       - name: Test opt-1.3b
         if: success() || failure()
         working-directory: tests/integration
         run: |
-          python3 llm/sagemaker-endpoint-tests.py opt-1-3-b djl ${{ github.event.inputs.mode || 'nightly' }}
+          python3 llm/sagemaker-endpoint-tests.py opt-1-3-b djl ${image_type} ${run_benchmark}
          echo "sleep 30 seconds to allow endpoint deletion"
          sleep 30
 
@@ -104,7 +105,8 @@
     timeout-minutes: 120
     needs: create-runners
     env:
-      run_benchmark: ${{ github.event.inputs.run_benchmark }}
+      run_benchmark: ${{ github.event.inputs.run_benchmark || 'true' }}
+      image_type: ${{ github.event.inputs.mode || 'nightly' }}
     steps:
       - uses: actions/checkout@v3
       - name: Set up Python3
@@ -125,21 +127,21 @@
         if: success() || failure()
         working-directory: tests/integration
         run: |
-          python3 llm/sagemaker-endpoint-tests.py gpt-j-6b djl ${{ github.event.inputs.mode || 'nightly' }}
+          python3 llm/sagemaker-endpoint-tests.py gpt-j-6b djl ${image_type} ${run_benchmark}
           echo "sleep 30 seconds to allow endpoint deletion"
           sleep 30
       - name: Test gpt-neo-2.7b no code DeepSpeed
         if: success() || failure()
         working-directory: tests/integration
         run: |
-          python3 llm/sagemaker-endpoint-tests.py gpt-neo-2-7-b no_code ${{ github.event.inputs.mode || 'nightly' }}
+          python3 llm/sagemaker-endpoint-tests.py gpt-neo-2-7-b no_code ${image_type} ${run_benchmark}
           echo "sleep 30 seconds to allow endpoint deletion"
           sleep 30
       - name: Test DeepSpeed pythia-12b
         if: success() || failure()
         working-directory: tests/integration
         run: |
-          python3 llm/sagemaker-endpoint-tests.py pythia-12b djl ${{ github.event.inputs.mode || 'nightly' }}
+          python3 llm/sagemaker-endpoint-tests.py pythia-12b djl ${image_type} ${run_benchmark}
           echo "sleep 30 seconds to allow endpoint deletion"
           sleep 30
 
diff --git a/tests/integration/llm/sagemaker-endpoint-tests.py b/tests/integration/llm/sagemaker-endpoint-tests.py
index 19bf8afd0..fc8f064ef 100644
--- a/tests/integration/llm/sagemaker-endpoint-tests.py
+++ b/tests/integration/llm/sagemaker-endpoint-tests.py
@@ -9,6 +9,11 @@
 from argparse import ArgumentParser
 import numpy as np
 
+
+def boolean_arg(value):
+    return str(value).lower() == "true"
+
+
 parser = ArgumentParser(
     description=
     "This Script deploys a model with predefined configuration to a SageMaker Inference Endpoint"
@@ -24,6 +29,11 @@
     help="Whether to use release or nightly images for testing",
     choices=["nightly", "release", "candidate"])
 
+parser.add_argument(
+    "run_benchmark",
+    help="Whether to run benchmark and upload the metrics to cloudwatch",
+    type=boolean_arg)
+
 ROLE = "arn:aws:iam::185921645874:role/AmazonSageMaker-ExeuctionRole-IntegrationTests"
 DEFAULT_INSTANCE_TYPE = "ml.g5.12xlarge"
 DEFAULT_PAYLOAD = {"inputs": "Deep Learning is"}
@@ -95,12 +105,14 @@
 MME_CONFIGS = {
     "deepspeed-mme": {
         "models": [{
+            'name': 'gpt-neo-2-7b',
             "model_id": "EleutherAI/gpt-neo-2.7B",
             "model_kwargs": {
                 "dtype": "fp16",
                 "number_of_partitions": 1,
             }
         }, {
+            'name': 'opt-1-3b',
             "model_id": "s3://djl-llm-sm-endpoint-tests/opt-1.3b/",
             "model_kwargs": {
                 "dtype": "fp16",
@@ -114,7 +126,7 @@
     }
 }
 
-ENGINE_TO_METRIC_CONFIG_ENGINE = {"Python": "Accelerate"}
+ENGINE_TO_METRIC_CONFIG_ENGINE = {"python": "accelerate"}
 
 NIGHTLY_IMAGES = {
     "python":
@@ -172,56 +184,58 @@ def _upload_metrics(data):
                           'Unit': 'Milliseconds',
                           'Value': data['p90']
                       }, {
-                          'MetricName': f"{data['metric_name']}_p99",
+                          'MetricName': f"{data['metric_name']}-p99",
                           'Unit': 'Milliseconds',
                           'Value': data['p99']
                       }])
+    print(
+        f"Uploaded metrics data with metric prefix {data['metric_name']} to AWS CloudWatch"
+    )
 
 
-def _get_metric_name(name, model):
-
-    engine = model.engine.value[0]
-    metric_config_engine = ENGINE_TO_METRIC_CONFIG_ENGINE.get(engine, engine)
+def _get_metric_name(name, model, engine, instance_type):
+    engine_name = ENGINE_TO_METRIC_CONFIG_ENGINE.get(engine, engine)
     num_partitions = 1
-    if model.number_of_partitions:
+    if hasattr(model, 'number_of_partitions') and model.number_of_partitions:
         num_partitions = model.number_of_partitions
+    return f"{name}-{engine_name}-{num_partitions}p-{instance_type}"
 
-    return f"{name}-{metric_config_engine}-{num_partitions}p"
-
-
-def _run_benchmarks(predictor, config, metric_name):
-    for _ in range(10):
-        predictor.predict(config.get("payload", DEFAULT_PAYLOAD))
 
+def _run_benchmarks(predictor, payload_data, metric_name, target_model=None):
+    for _ in range(3):
+        predictor.predict(data=payload_data, target_model=target_model)
     latencies = []
-    iterations = 100
+    iterations = 25
     begin = time.time()
     for _ in range(iterations):
         start = time.time()
-        predictor.predict(config.get("payload", DEFAULT_PAYLOAD))
+        predictor.predict(data=payload_data, target_model=target_model)
         latencies.append((time.time() - start) * 1000)
     elapsed = (time.time() - begin) * 1000
 
-    benchmark_data = {}
-    benchmark_data['metric_name'] = metric_name
-    benchmark_data['throughput'] = iterations / elapsed * 1000
-    benchmark_data['avg'] = sum(latencies) / iterations
-    benchmark_data['p50'] = np.percentile(latencies, 50)
-    benchmark_data['p90'] = np.percentile(latencies, 90)
-    benchmark_data['p99'] = np.percentile(latencies, 99)
+    benchmark_data = {
+        'metric_name': metric_name,
+        'throughput': iterations / elapsed * 1000,
+        'avg': sum(latencies) / iterations,
+        'p50': np.percentile(latencies, 50),
+        'p90': np.percentile(latencies, 90),
+        'p99': np.percentile(latencies, 99)
+    }
     _upload_metrics(benchmark_data)
 
 
-def mme_test(name, image_type):
+def mme_test(name, image_type, run_benchmark):
     config = MME_CONFIGS.get(name)
     session = get_sagemaker_session(
         default_bucket_prefix=get_name_for_resource("mme-tests"))
     models = config.get("models")
+    framework = config.get("framework")
+    instance_type = config.get("instance_type", DEFAULT_INSTANCE_TYPE)
     created_models = []
     mme = None
     predictor = None
@@ -237,12 +251,12 @@
             created_models.append(model)
 
         if image_type == "nightly":
-            mme_image_uri = NIGHTLY_IMAGES[config.get("framework")]
+            mme_image_uri = NIGHTLY_IMAGES[framework]
         elif image_type == "candidate":
-            mme_image_uri = CANDIDATE_IMAGES[config.get("framework")]
+            mme_image_uri = CANDIDATE_IMAGES[framework]
         else:
             mme_image_uri = sagemaker.image_uris.retrieve(
-                framework="djl-" + config.get("framework"),
+                framework="djl-" + framework,
                 version=RELEASE_VERSION,
                 region=REGION)
         mme = MultiDataModel(get_name_for_resource(name),
@@ -256,13 +270,21 @@
         predictor = mme.deploy(
             1,
-            config.get("instance_type", DEFAULT_INSTANCE_TYPE),
+            instance_type,
             serializer=sagemaker.serializers.JSONSerializer(),
             deserializer=sagemaker.deserializers.JSONDeserializer())
 
-        for model in list(mme.list_models()):
+        for i, model in enumerate(list(mme.list_models())):
             outputs = predictor.predict(DEFAULT_PAYLOAD, target_model=model)
             print(outputs)
+            if run_benchmark:
+                _run_benchmarks(predictor=predictor,
+                                payload_data=DEFAULT_PAYLOAD,
+                                metric_name=_get_metric_name(
+                                    models[i]['name'], created_models[i],
+                                    framework, instance_type),
+                                target_model=model)
+
     except Exception as e:
         print(f"Encountered error for creating model {name}. Exception: {e}")
         raise e
@@ -279,17 +301,17 @@
 def no_code_endpoint_test(name, image_type):
     config = HUGGING_FACE_NO_CODE_CONFIGS.get(name)
     data = config.get("payload", DEFAULT_PAYLOAD)
+    framework = config.get("framework")
     session = get_sagemaker_session(
         default_bucket_prefix=get_name_for_resource("no-code-tests"))
     model = None
     predictor = None
 
     if image_type == "nightly":
-        image_uri = NIGHTLY_IMAGES[config.get("framework")]
+        image_uri = NIGHTLY_IMAGES[framework]
     elif image_type == "candidate":
-        image_uri = CANDIDATE_IMAGES[config.get("framework")]
+        image_uri = CANDIDATE_IMAGES[framework]
     else:
-        image_uri = sagemaker.image_uris.retrieve(framework="djl-" +
-                                                  config.get("framework"),
+        image_uri = sagemaker.image_uris.retrieve(framework="djl-" + framework,
                                                   version=RELEASE_VERSION,
                                                   region=REGION)
     try:
@@ -319,7 +341,7 @@
         model.delete_model()
 
 
-def single_model_endpoint_test(name, image_type):
+def single_model_endpoint_test(name, image_type, run_benchmark):
     config = SINGLE_MODEL_ENDPOINT_CONFIGS.get(name)
     data = config.get("payload", DEFAULT_PAYLOAD)
     session = get_sagemaker_session(
@@ -335,10 +357,11 @@
             name=get_name_for_resource(name),
             **config.get("model_kwargs"),
         )
+        engine_name = model.engine.value[0].lower()
         if image_type == "nightly":
-            model.image_uri = NIGHTLY_IMAGES[model.engine.value[0].lower()]
+            model.image_uri = NIGHTLY_IMAGES[engine_name]
         elif image_type == "candidate":
-            model.image_uri = CANDIDATE_IMAGES[model.engine.value[0].lower()]
+            model.image_uri = CANDIDATE_IMAGES[engine_name]
 
         if config.get("partition", False):
             model.partition(instance_type=DEFAULT_INSTANCE_TYPE,
@@ -351,10 +374,13 @@
         outputs = predictor.predict(data=data)
         print(outputs)
 
-        if os.getenv("run_benchmark"):
+        if run_benchmark:
             _run_benchmarks(predictor=predictor,
-                            config=config,
-                            metric_name=_get_metric_name(name, model))
+                            payload_data=data,
+                            metric_name=_get_metric_name(
+                                name, model, engine_name,
+                                DEFAULT_INSTANCE_TYPE),
+                            target_model=None)
 
     except Exception as e:
         print(f"Encountered error for creating model {name}. Exception: {e}")
@@ -373,11 +399,12 @@
     test_case = args.test_case
     image_type = args.image_type
     if test_case == "djl":
-        single_model_endpoint_test(model_name, image_type)
+        single_model_endpoint_test(model_name, image_type, args.run_benchmark)
     elif test_case == "no_code":
+        # skipping running benchmark for this for now, as we are not testing new models here.
        no_code_endpoint_test(model_name, image_type)
     elif test_case == "djl_mme":
-        mme_test(model_name, image_type)
+        mme_test(model_name, image_type, args.run_benchmark)
     else:
         raise ValueError(
             f"{test_case} is not a valid test case. Valid choices: [djl, no_code, djl_mme])"