diff --git a/cm-mlops/script/get-generic-python-lib/_cm.json b/cm-mlops/script/get-generic-python-lib/_cm.json
index b0e0c5cc4e..5f935b7c9a 100644
--- a/cm-mlops/script/get-generic-python-lib/_cm.json
+++ b/cm-mlops/script/get-generic-python-lib/_cm.json
@@ -481,6 +481,19 @@
         "CM_POLYGRAPHY_VERSION"
       ]
     },
+    "onnx-graphsurgeon": {
+      "deps": [
+        {
+          "tags": "get,generic-python-lib,_package.nvidia-pyindex"
+        }
+      ],
+      "env": {
+        "CM_GENERIC_PYTHON_PACKAGE_NAME": "onnx_graphsurgeon"
+      },
+      "new_env_keys": [
+        "CM_ONNX_GRAPHSURGEON_VERSION"
+      ]
+    },
     "protobuf": {
       "env": {
         "CM_GENERIC_PYTHON_PACKAGE_NAME": "protobuf"
diff --git a/cm-mlops/script/get-ml-model-bert-large-squad/_cm.json b/cm-mlops/script/get-ml-model-bert-large-squad/_cm.json
index 6c69d2bcde..6e11b041b4 100644
--- a/cm-mlops/script/get-ml-model-bert-large-squad/_cm.json
+++ b/cm-mlops/script/get-ml-model-bert-large-squad/_cm.json
@@ -34,6 +34,11 @@
       },
       "update_tags_from_env_with_prefix": {
         "_url.": [ "CM_PACKAGE_URL" ]
+      },
+      "force_cache": true,
+      "extra_cache_tags": "bert-large,ml-model",
+      "skip_if_env": {
+        "CM_ML_MODEL_BERT_PACKED": [ "yes" ]
       }
     }
   ],
@@ -211,6 +216,130 @@
       "env": {
         "CM_PACKAGE_URL": "https://github.com/mlcommons/inference_results_v2.1/raw/master/open/NeuralMagic/code/bert/deepsparse/models/oBERT-Large_95sparse_block4_qat.onnx.tar.xz"
       }
+    },
+    "unpacked": {
+      "group": "packing",
+      "default": true,
+      "env": {
+        "CM_ML_MODEL_BERT_PACKED": "no"
+      }
+    },
+    "packed": {
+      "group": "packing",
+      "env": {
+        "CM_ML_MODEL_BERT_PACKED": "yes"
+      },
+      "deps": [
+        {
+          "tags": "get,python3",
+          "names": [ "python", "python3" ]
+        },
+        {
+          "tags": "get,generic-python-lib,_torch",
+          "names": [ "torch", "pytorch" ]
+        },
+        {
+          "tags": "get,generic-python-lib,_package.tensorflow",
+          "names": [ "tensorflow" ]
+        },
+        {
+          "tags": "get,generic-python-lib,_package.setuptools_rust",
+          "names": [ "setuptools_rust" ]
+        },
+        {
+          "tags": "get,generic-python-lib,_package.transformers",
+          "names": [ "transformers" ],
+          "version": "2.4.0"
+        },
+        {
+          "tags": "get,generic-python-lib,_package.protobuf",
+          "names": [ "protobuf" ],
+          "version1": "3.20.3"
+        },
+        {
+          "tags": "get,generic-python-lib,_package.onnx",
+          "names": [ "onnx" ],
+          "version1": "1.12.0"
+        },
+        {
+          "tags": "get,generic-python-lib,_onnx-graphsurgeon",
+          "names": [ "onnx-graphsurgeon" ],
+          "version1": "0.3.26"
+        },
+        {
+          "tags": "get,generic-python-lib,_package.onnx-simplifier",
+          "names": [ "onnx-simplifier" ],
+          "version1": "0.3.7"
+        },
+        {
+          "tags": "get,generic-python-lib,_numpy",
+          "names": [ "numpy" ],
+          "version1": "1.23.0"
+        },
+        {
+          "tags": "get,mlperf,inference,src",
+          "names": [ "inference-src" ]
+        }
+      ],
+      "prehook_deps": [
+        {
+          "tags": "download,file,_wget,_url.https://zenodo.org/record/3733868/files/model.ckpt-5474.data-00000-of-00001",
+          "env": {
+            "CM_DOWNLOAD_FILENAME": "model.ckpt-5474.data-00000-of-00001",
+            "CM_DOWNLOAD_FINAL_ENV_NAME": "CM_BERT_CHECKPOINT_DATA_PATH",
+            "CM_DOWNLOAD_PATH": "<<<CM_BERT_CHECKPOINT_DOWNLOAD_DIR>>>",
+            "CM_DOWNLOAD_CHECKSUM": "3089b27c559906a868878741d992ade7"
+          },
+          "force_cache": true,
+          "extra_cache_tags": "bert,checkpoint,weights,bert-large"
+        },
+        {
+          "tags": "download,file,_wget,_url.https://zenodo.org/record/3733868/files/model.ckpt-5474.index",
+          "env": {
+            "CM_DOWNLOAD_FILENAME": "model.ckpt-5474.index",
+            "CM_DOWNLOAD_FINAL_ENV_NAME": "CM_BERT_CHECKPOINT_INDEX_PATH",
+            "CM_DOWNLOAD_PATH": "<<<CM_BERT_CHECKPOINT_DOWNLOAD_DIR>>>",
+            "CM_DOWNLOAD_CHECKSUM": "d23d61572d9404da4dac3363b5bc735b"
+          },
+          "force_cache": true,
+          "extra_cache_tags": "bert,checkpoint-index,bert-large"
+        },
+        {
+          "tags": "download,file,_wget,_url.https://zenodo.org/record/3733868/files/model.ckpt-5474.meta",
+          "env": {
+            "CM_DOWNLOAD_FILENAME": "model.ckpt-5474.meta",
+            "CM_DOWNLOAD_FINAL_ENV_NAME": "CM_BERT_CHECKPOINT_META_PATH",
+            "CM_DOWNLOAD_PATH": "<<<CM_BERT_CHECKPOINT_DOWNLOAD_DIR>>>",
+            "CM_DOWNLOAD_CHECKSUM": "83e11e57eea14c9e9a246af74af40d66"
+          },
+          "force_cache": true,
+          "extra_cache_tags": "bert,checkpoint-meta,bert-large"
+        },
+        {
+          "tags": "download,file,_wget,_url.https://zenodo.org/record/3733868/files/vocab.txt",
+          "env": {
+            "CM_DOWNLOAD_FILENAME": "vocab.txt",
+            "CM_DOWNLOAD_FINAL_ENV_NAME": "CM_BERT_VOCAB_PATH",
+            "CM_DOWNLOAD_PATH": "<<<CM_BERT_CHECKPOINT_DOWNLOAD_DIR>>>",
+            "CM_DOWNLOAD_CHECKSUM": "64800d5d8528ce344256daf115d4965e"
+          },
+          "force_cache": true,
+          "extra_cache_tags": "bert,vocab,bert-large"
+        },
+        {
+          "tags": "download,file,_wget,_url.https://raw.githubusercontent.com/krai/axs2kilt/main/model_onnx_bert_large_packed_recipe/convert_model.py",
+          "env": {
+            "CM_DOWNLOAD_FILENAME": "convert_model.py",
+            "CM_DOWNLOAD_FINAL_ENV_NAME": "CM_BERT_CONVERTER_CODE_PATH",
+            "CM_DOWNLOAD_CHECKSUM": "94c91ce422e8f36f9d98b4926e2ad688"
+          },
+          "force_cache": true,
+          "extra_cache_tags": "bert,checkpoint,converter,code,bert-large"
+        }
+      ],
+      "new_env_keys": [
+        "CM_BERT_"
+      ]
     }
   }
 }
diff --git a/cm-mlops/script/get-ml-model-bert-large-squad/customize.py b/cm-mlops/script/get-ml-model-bert-large-squad/customize.py
index 3989da52ce..8ddca46b08 100644
--- a/cm-mlops/script/get-ml-model-bert-large-squad/customize.py
+++ b/cm-mlops/script/get-ml-model-bert-large-squad/customize.py
@@ -6,6 +6,13 @@ def preprocess(i):
     os_info = i['os_info']
     env = i['env']
 
+    if env.get('CM_ML_MODEL_BERT_PACKED', '') == 'yes':
+        i['run_script_input']['script_name'] = "run-packed"
+        env['CM_BERT_CONFIG_PATH'] = os.path.join(env['CM_MLPERF_INFERENCE_BERT_PATH'], "bert_config.json")
+        env['CM_BERT_CHECKPOINT_DOWNLOAD_DIR'] = os.path.join(os.getcwd(), "downloaded")
+        env['CM_ML_MODEL_FILE_WITH_PATH'] = os.path.join(os.getcwd(), "model.onnx")
+        env['CM_ML_MODEL_BERT_PACKED_PATH'] = os.path.join(os.getcwd(), "model.onnx")
+
     return {'return':0}
 
 def postprocess(i):
diff --git a/cm-mlops/script/get-ml-model-bert-large-squad/run-packed.sh b/cm-mlops/script/get-ml-model-bert-large-squad/run-packed.sh
new file mode 100644
index 0000000000..361a2d4904
--- /dev/null
+++ b/cm-mlops/script/get-ml-model-bert-large-squad/run-packed.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+cmd="${CM_PYTHON_BIN_WITH_PATH} ${CM_BERT_CONVERTER_CODE_PATH} --src '${PWD}/downloaded/model.ckpt-5474' --dest '$PWD/' --config_path '${CM_BERT_CONFIG_PATH}'"
+echo $cmd
+eval $cmd
+test $? -eq 0 || exit $?
diff --git a/cm-mlops/script/get-preprocessed-dataset-squad/_cm.yaml b/cm-mlops/script/get-preprocessed-dataset-squad/_cm.yaml
new file mode 100644
index 0000000000..3e7f7be518
--- /dev/null
+++ b/cm-mlops/script/get-preprocessed-dataset-squad/_cm.yaml
@@ -0,0 +1,82 @@
+alias: get-preprocessed-dataset-squad
+automation_alias: script
+automation_uid: 5b4e0237da074764
+cache: true
+deps:
+  - tags: get,python3
+    names:
+    - python
+    - python3
+  - tags: get,mlperf,inference,src
+    names:
+    - inference-src
+  - tags: get,squad,dataset,original
+    names:
+    - squad-dataset
+  - tags: get,squad,vocab
+    names:
+    - squad-vocab
+  - tags: get,generic-python-lib,_package.tokenization
+
+docker_input_mapping: {}
+input_description: {}
+env:
+  CM_DATASET_MAX_QUERY_LENGTH: 64
+
+input_mapping: {}
+new_env_keys:
+- CM_DATASET_SQUAD_TOKENIZED_*
+new_state_keys: []
+post_deps: []
+posthook_deps: []
+prehook_deps: []
+tags:
+- get
+- dataset
+- preprocessed
+- tokenized
+- squad
+uid: 7cd1d9b7e8af4788
+variations:
+  calib1:
+    group: calibration-set
+    env:
+      CM_SQUAD_CALIBRATION_SET: one
+  calib2:
+    group: calibration-set
+    env:
+      CM_SQUAD_CALIBRATION_SET: two
+  no-calib:
+    group: calibration-set
+    default: true
+    env:
+      CM_SQUAD_CALIBRATION_SET: 'no'
+  raw:
+    group: raw
+    default: true
+    env:
+      CM_DATASET_RAW: "yes"
+  pickle:
+    group: raw
+    env:
+      CM_DATASET_RAW: "no"
+  seq-length.#:
+    group: seq-length
+    env:
+      CM_DATASET_MAX_SEQ_LENGTH: "#"
+  seq-length.384:
+    group: seq-length
+    default: true
+    env:
+      CM_DATASET_MAX_SEQ_LENGTH: 384
+  doc-stride.#:
+    group: doc-stride
+    env:
+      CM_DATASET_DOC_STRIDE: "#"
+  doc-stride.128:
+    group: doc-stride
+    default: true
+    env:
+      CM_DATASET_DOC_STRIDE: 128
+
+versions: {}
diff --git a/cm-mlops/script/get-preprocessed-dataset-squad/customize.py b/cm-mlops/script/get-preprocessed-dataset-squad/customize.py
new file mode 100644
index 0000000000..510e1032bc
--- /dev/null
+++ b/cm-mlops/script/get-preprocessed-dataset-squad/customize.py
@@ -0,0 +1,45 @@
+from cmind import utils
+import os
+
+def preprocess(i):
+
+    os_info = i['os_info']
+
+    env = i['env']
+
+    meta = i['meta']
+
+    automation = i['automation']
+
+    quiet = (env.get('CM_QUIET', False) == 'yes')
+
+    if env.get('CM_SQUAD_CALIBRATION_SET') == "one":
+        env['DATASET_CALIBRATION_FILE'] = os.path.join(env['CM_MLPERF_INFERENCE_SOURCE'], 'calibration', 'SQuAD-v1.1', 'bert_calibration_features.txt')
+        env['DATASET_CALIBRATION_ID'] = 1
+    elif env.get('CM_SQUAD_CALIBRATION_SET') == "two":
+        env['DATASET_CALIBRATION_FILE'] = os.path.join(env['CM_MLPERF_INFERENCE_SOURCE'], 'calibration', 'SQuAD-v1.1', 'bert_calibration_qas_ids.txt')
+        env['DATASET_CALIBRATION_ID'] = 2
+    else:
+        env['DATASET_CALIBRATION_FILE'] = ""
+        env['DATASET_CALIBRATION_ID'] = 0
+
+    env['CK_ENV_MLPERF_INFERENCE'] = env['CM_MLPERF_INFERENCE_SOURCE']
+
+    return {'return':0}
+
+def postprocess(i):
+
+    env = i['env']
+    cur = os.getcwd()
+
+    env['CM_DATASET_SQUAD_TOKENIZED_ROOT'] = cur
+    if env.get('CM_DATASET_RAW', '') == "yes":
+        env['CM_DATASET_SQUAD_TOKENIZED_INPUT_IDS'] = os.path.join(cur, 'bert_tokenized_squad_v1_1_input_ids.raw')
+        env['CM_DATASET_SQUAD_TOKENIZED_SEGMENT_IDS'] = os.path.join(cur, 'bert_tokenized_squad_v1_1_segment_ids.raw')
+        env['CM_DATASET_SQUAD_TOKENIZED_INPUT_MASK'] = os.path.join(cur, 'bert_tokenized_squad_v1_1_input_mask.raw')
+
+    env['CM_DATASET_SQUAD_TOKENIZED_MAX_SEQ_LENGTH'] = env['CM_DATASET_MAX_SEQ_LENGTH']
+    env['CM_DATASET_SQUAD_TOKENIZED_DOC_STRIDE'] = env['CM_DATASET_DOC_STRIDE']
+    env['CM_DATASET_SQUAD_TOKENIZED_MAX_QUERY_LENGTH'] = env['CM_DATASET_MAX_QUERY_LENGTH']
+
+    return {'return':0}
diff --git a/cm-mlops/script/get-preprocessed-dataset-squad/run.sh b/cm-mlops/script/get-preprocessed-dataset-squad/run.sh
new file mode 100644
index 0000000000..4509506ebf
--- /dev/null
+++ b/cm-mlops/script/get-preprocessed-dataset-squad/run.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+#CM Script location: ${CM_TMP_CURRENT_SCRIPT_PATH}
+
+#To export any variable
+#echo "VARIABLE_NAME=VARIABLE_VALUE" >>tmp-run-env.out
+
+#${CM_PYTHON_BIN_WITH_PATH} contains the path to python binary if "get,python" is added as a dependency
+
+
+
+function exit_if_error() {
+  test $? -eq 0 || exit $?
+}
+
+function run() {
+  echo "Running: "
+  echo "$1"
+  echo ""
+  if [[ ${CM_FAKE_RUN} != 'yes' ]]; then
+    eval "$1"
+    exit_if_error
+  fi
+}
+
+#Add your run commands here...
+# run "$CM_RUN_CMD"
+CUR=$PWD
+run "wget -nc https://raw.githubusercontent.com/krai/ck-mlperf/master/package/dataset-squad-tokenized_for_bert/tokenize_and_pack.py"
+
+run "${CM_PYTHON_BIN_WITH_PATH} tokenize_and_pack.py \
+  ${CM_DATASET_SQUAD_VAL_PATH} \
+  ${CM_ML_MODEL_BERT_VOCAB_FILE_WITH_PATH} \
+  ${CUR}/bert_tokenized_squad_v1_1 \
+  ${CM_DATASET_MAX_SEQ_LENGTH} \
+  ${CM_DATASET_MAX_QUERY_LENGTH} \
+  ${CM_DATASET_DOC_STRIDE} \
+  ${CM_DATASET_RAW} \
+  ${DATASET_CALIBRATION_FILE} \
+  ${DATASET_CALIBRATION_ID}"