diff --git a/scripts/build_qllama2-70b_env.sh b/scripts/build_qllama2-70b_env.sh
new file mode 100644
index 000000000..c38c9cb3a
--- /dev/null
+++ b/scripts/build_qllama2-70b_env.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+
+# define env. variables
+model_name=qllama2-70b
+model_dir=language/llama2-70b
+git_dir=$(git rev-parse --show-toplevel)
+work_dir=$git_dir/$model_dir
+data_dir=$git_dir/data
+env_name=mlperf-$model_name
+conda_base=$($CONDA_EXE info --base)
+
+# work on model directory
+cd $work_dir
+
+# create and enter conda env.
+printf "\n============= STEP-1: Create conda environment and activate =============\n"
+conda remove -n $env_name --all -y
+rm -rf $conda_base/envs/$env_name
+conda env create -f $git_dir/scripts/envs/${model_name}_env_quant.yml  # used when experimenting with quantized-model multi-GPU inference
+set +u
+source "$conda_base/etc/profile.d/conda.sh"
+conda activate $env_name
+set -u
+
+# build mlperf loadgen
+printf "\n============= STEP-2: Build mlperf loadgen =============\n"
+conda install pybind11==2.10.4 -c conda-forge -y
+cd $git_dir/loadgen; python -m pip install .
+
+# pull model and dataset
+printf "\n============= STEP-3: Pull dvc data =============\n"
+pip install dvc[s3]
+dvc pull $data_dir/models/llama2/Llama-2-70b-chat-hf/model-0000*.dvc --force
+dvc pull $data_dir/models/llama2/Llama-2-70b-chat-hf/model-0001*.dvc --force
+dvc pull $data_dir/models/llama2/Llama-2-70b-chat-hf/pytorch_model-0000*.dvc --force
+dvc pull $data_dir/models/llama2/Llama-2-70b-chat-hf/pytorch_model-0001*.dvc --force
+shopt -s extglob  # required: the !(...) pattern below is an extglob
+dvc pull $data_dir/models/llama2/Llama-2-70b-chat-hf/!(model-000*|pytorch_model-000*).dvc --force
+dvc pull $data_dir/dataset/open-orca/validation --force
+dvc pull $data_dir/dataset/open-orca/calibration --force
+dvc pull $data_dir/quantization/llama2.dvc --force
+# printf "\n============= End of build =============\n"
+
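+# (optional) sanity-check sketch -- assumes the HF tokenizer/config files were
+# part of the pulls above; fails fast if the checkpoint dir is incomplete:
+#   python -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('$data_dir/models/llama2/Llama-2-70b-chat-hf')"
+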
+# exit from conda env.
+conda deactivate
+
+# get back to git root
+cd $git_dir
\ No newline at end of file
diff --git a/scripts/envs/qllama2-70b_env.yml b/scripts/envs/qllama2-70b_env.yml
new file mode 100644
index 000000000..52fb21949
--- /dev/null
+++ b/scripts/envs/qllama2-70b_env.yml
@@ -0,0 +1,110 @@
+name: mlperf-qllama2-70b
+channels:
+  - defaults
+dependencies:
+  - python~=3.10.0
+  - pip:
+    - --extra-index-url https://download.pytorch.org/whl/cu118
+    - torch==2.1.0+cu118
+    - absl-py==2.1.0
+    - accelerate==0.28.0
+    - aiofiles==23.2.1
+    - aiohttp==3.8.6
+    - aiosignal==1.3.1
+    - async-timeout==4.0.3
+    - attrs==23.2.0
+    - boto3==1.34.67
+    - botocore==1.34.67
+    - certifi==2024.2.2
+    - charset-normalizer==3.3.2
+    - click==8.1.7
+    - cmake==3.28.3
+    - coloredlogs==15.0.1
+    - datasets==2.18.0
+    - dill==0.3.8
+    - evaluate==0.4.1
+    - filelock==3.13.1
+    - flatbuffers==24.3.7
+    - frozenlist==1.4.1
+    - fsspec==2024.2.0
+    - furiosa-common==0.10.1
+    - git+https://github.com/furiosa-ai/furiosa-llm-models.git@MLPerf4.1-v3.11
+    - furiosa-optimizer==0.10.0
+    - graphviz==0.20.3
+    - huggingface-hub==0.21.4
+    - humanfriendly==10.0
+    - idna==3.6
+    - Jinja2==3.1.3
+    - jmespath==1.0.1
+    - joblib==1.3.2
+    - lit==18.1.1
+    - markdown-it-py==3.0.0
+    - MarkupSafe==2.1.5
+    - mdurl==0.1.2
+    - git+https://github.com/furiosa-ai/model-compressor-private.git@MLPerf4.1-v3.11
+    - mpmath==1.3.0
+    - multidict==6.0.5
+    - multipledispatch==1.0.0
+    - multiprocess==0.70.16
+    - networkx==3.2.1
+    - nltk==3.8.1
+    - numpy==1.26.4
+    - nvidia-cublas-cu12==12.1.3.1
+    - nvidia-cuda-cupti-cu12==12.1.105
+    - nvidia-cuda-nvrtc-cu12==12.1.105
+    - nvidia-cuda-runtime-cu12==12.1.105
+    - nvidia-cudnn-cu12==8.9.2.26
+    - nvidia-cufft-cu12==11.0.2.54
+    - nvidia-curand-cu12==10.3.2.106
+    - nvidia-cusolver-cu12==11.4.5.107
+    - nvidia-cusparse-cu12==12.1.0.106
+    - nvidia-nccl-cu12==2.20.5
+    - nvidia-nvjitlink-cu12==12.5.40
+    - nvidia-nvtx-cu12==12.1.105
+    - onnx==1.14.1
+    - onnx-simplifier==0.4.36
+    - onnxoptimizer==0.3.13
+    - onnxruntime==1.15.1
+    - packaging==24.0
+    - pandas==2.2.1
+    - pillow==10.2.0
+    - protobuf==5.26.0
+    - psutil==5.9.8
+    - pyarrow==15.0.2
+    - pyarrow-hotfix==0.6
+    - pybind11==2.11.1
+    - pydot==2.0.0
+    - Pygments==2.17.2
+    - pyparsing==3.1.2
+    - python-dateutil==2.9.0.post0
+    - pytz==2024.1
+    - PyYAML==6.0.1
+    - regex==2023.12.25
+    - requests==2.31.0
+    - responses==0.18.0
+    - rich==13.7.1
+    - rouge_score==0.1.2
+    - ruamel.yaml==0.18.6
+    - ruamel.yaml.clib==0.2.8
+    - ruff==0.4.9
+    - s3transfer==0.10.1
+    - safetensors==0.4.2
+    - scipy==1.10.1
+    - sentencepiece==0.1.99
+    - simplejson==3.19.2
+    - six==1.16.0
+    - sympy==1.12
+    - timm==0.6.11
+    - tokenizers==0.13.3
+    - torch==2.1.0+cu118
+    - torchaudio==2.1.0+cu118
+    - torchvision==0.16.0+cu118
+    - tqdm==4.65.0
+    - transformers==4.31.0
+    - triton==2.1.0
+    - typing==3.7.4.3
+    - typing_extensions==4.10.0
+    - tzdata==2024.1
+    - urllib3==2.2.1
+    - xxhash==3.4.1
+    - yarl==1.9.4
\ No newline at end of file
diff --git a/scripts/envs/qllama2-70b_env_quant.yml b/scripts/envs/qllama2-70b_env_quant.yml
new file mode 100644
index 000000000..5c2ef0793
--- /dev/null
+++ b/scripts/envs/qllama2-70b_env_quant.yml
@@ -0,0 +1,112 @@
+name: mlperf-qllama2-70b
+channels:
+  - defaults
+dependencies:
+  - python~=3.10.0
+  - pip:
+    - --extra-index-url https://download.pytorch.org/whl/cu118
+    - torch==2.1.0+cu118
+    - absl-py==2.1.0
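+    # NOTE: relative to qllama2-70b_env.yml, this quant env pins FuriosaAI forks
+    # of accelerate (below) and transformers (further down) to exact commits.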
+    - git+https://github.com/furiosa-ai/accelerate-compression.git@4d7b404041834d35727064e5b1dcfcd060319ad6#egg=accelerate
+    - aiofiles==23.2.1
+    - aiohttp==3.8.6
+    - aiosignal==1.3.1
+    - async-timeout==4.0.3
+    - attrs==23.2.0
+    - boto3==1.34.67
+    - botocore==1.34.67
+    - certifi==2024.2.2
+    - charset-normalizer==3.3.2
+    - click==8.1.7
+    - cmake==3.28.3
+    - coloredlogs==15.0.1
+    - datasets==2.18.0
+    - dill==0.3.8
+    - evaluate==0.4.1
+    - filelock==3.13.1
+    - flatbuffers==24.3.7
+    - frozenlist==1.4.1
+    - fsspec==2024.2.0
+    - furiosa-common==0.10.1
+    - git+https://github.com/furiosa-ai/furiosa-llm-models.git@MLPerf4.1-v3.11
+    - furiosa-optimizer==0.10.0
+    - graphviz==0.20.3
+    - huggingface-hub==0.21.4
+    - humanfriendly==10.0
+    - idna==3.6
+    - Jinja2==3.1.3
+    - jmespath==1.0.1
+    - joblib==1.3.2
+    - lit==18.1.1
+    - markdown-it-py==3.0.0
+    - MarkupSafe==2.1.5
+    - mdurl==0.1.2
+    - git+https://github.com/furiosa-ai/model-compressor-private.git@MLPerf4.1-v3.11
+    - mpmath==1.3.0
+    - multidict==6.0.5
+    - multipledispatch==1.0.0
+    - multiprocess==0.70.16
+    - networkx==3.2.1
+    - nltk==3.8.1
+    - numpy==1.26.4
+    - nvidia-cublas-cu12==12.1.3.1
+    - nvidia-cuda-cupti-cu12==12.1.105
+    - nvidia-cuda-nvrtc-cu12==12.1.105
+    - nvidia-cuda-runtime-cu12==12.1.105
+    - nvidia-cudnn-cu12==8.9.2.26
+    - nvidia-cufft-cu12==11.0.2.54
+    - nvidia-curand-cu12==10.3.2.106
+    - nvidia-cusolver-cu12==11.4.5.107
+    - nvidia-cusparse-cu12==12.1.0.106
+    - nvidia-nccl-cu12==2.20.5
+    - nvidia-nvjitlink-cu12==12.5.40
+    - nvidia-nvtx-cu12==12.1.105
+    - onnx==1.14.1
+    - onnx-simplifier==0.4.36
+    - onnxoptimizer==0.3.13
+    - onnxruntime==1.15.1
+    - packaging==24.0
+    - pandas==2.2.1
+    - pillow==10.2.0
+    - protobuf==5.26.0
+    - psutil==5.9.8
+    - pyarrow==15.0.2
+    - pyarrow-hotfix==0.6
+    - pybind11==2.11.1
+    - pydot==2.0.0
+    - Pygments==2.17.2
+    - pyparsing==3.1.2
+    - python-dateutil==2.9.0.post0
+    - pytz==2024.1
+    - PyYAML==6.0.1
+    - regex==2023.12.25
+    - requests==2.31.0
+    - responses==0.18.0
+    - rich==13.7.1
+    - rouge_score==0.1.2
+    - ruamel.yaml==0.18.6
+    - ruamel.yaml.clib==0.2.8
+    - ruff==0.4.9
+    - s3transfer==0.10.1
+    - safetensors==0.4.2
+    - scipy==1.10.1
+    - sentencepiece==0.1.99
+    - simplejson==3.19.2
+    - six==1.16.0
+    - sympy==1.12
+    - timm==0.6.11
+    - tokenizers==0.13.3
+    - torch==2.1.0+cu118
+    - torchaudio==2.1.0+cu118
+    - torchvision==0.16.0+cu118
+    - tqdm==4.65.0
+    - git+https://github.com/furiosa-ai/transformers-comp.git@2b012fcf15006e2cb2b0d9735ebf5b1d08a744a8#egg=transformers
+    - triton==2.1.0
+    - typing==3.7.4.3
+    - typing_extensions==4.10.0
+    - tzdata==2024.1
+    - urllib3==2.2.1
+    - xxhash==3.4.1
+    - yarl==1.9.4
\ No newline at end of file
diff --git a/scripts/eval_qllama2-70b_golden.sh b/scripts/eval_qllama2-70b_golden.sh
new file mode 100644
index 000000000..e811c7547
--- /dev/null
+++ b/scripts/eval_qllama2-70b_golden.sh
@@ -0,0 +1,101 @@
+#!/bin/bash
+
+# define env. variables
+model_name=qllama2-70b
+model_dir=language/llama2-70b
+git_dir=$(git rev-parse --show-toplevel)
+work_dir=$git_dir/$model_dir
+data_dir=$git_dir/data
+quant_data_dir=$data_dir/quantization/llama2
+log_dir=$git_dir/logs
+env_name=mlperf-$model_name
+conda_base=$($CONDA_EXE info --base)
+
+# work on model directory
+cd $work_dir
+
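+# usage sketch (values illustrative): the defaults below can be overridden from
+# the environment, e.g.
+#   N_COUNT=100 DEVICE=cuda:1 bash scripts/eval_qllama2-70b_golden.sh
+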
+# enter existing conda env.
+source "$conda_base/etc/profile.d/conda.sh"
+conda activate $env_name
+
+# eval model
+printf "\n============= STEP-4: Run eval =============\n"
+SCENARIO=${SCENARIO:="Offline"}
+DATA_TYPE=${DATA_TYPE:="quant"}
+N_COUNT=${N_COUNT:="24576"} # total_len = 24,576
+DEVICE=${DEVICE:="cuda:0"}
+
+if [ "$DEVICE" = "cpu" ];
+    then DATA_TYPE=float32;
+fi
+
+# quantization args
+QUANT_CONFIG_PATH=$quant_data_dir/quant_config.yaml
+QUANT_PARAM_PATH=$quant_data_dir/quant_param_golden.npy
+QUANT_FORMAT_PATH=$quant_data_dir/quant_format_golden.yaml
+MODEL_SOURCE=furiosa_llm_rope
+QUANT_DATATYPE=W8A8KV8
+
+
+printf "<<EVAL_CONFIG>>\n"
+printf "\tSCENARIO: $SCENARIO\n"
+printf "\tDATA_TYPE: $DATA_TYPE\n"
+printf "\tNUM_DATA: $N_COUNT\n"
+printf "\tDEVICE: $DEVICE\n"
+
+if ((${N_COUNT} < 2000));
+    then USER_CONF=$git_dir/internal_test.conf;
+else
+    USER_CONF=user.conf;
+fi
+
+CHECKPOINT_PATH=$data_dir/models/llama2/Llama-2-70b-chat-hf
+DATASET_PATH=$data_dir/dataset/open-orca/validation/open_orca_gpt4_tokenized_llama.sampled_24576.pkl
+LOG_PATH=$log_dir/$model_name/$SCENARIO/$QUANT_DATATYPE/$(date +%Y%m%d_%H%M%S%Z)
+
+SECONDS=0
+python -u main.py --scenario $SCENARIO \
+    --model-path $CHECKPOINT_PATH \
+    --mlperf-conf ../../mlperf.conf \
+    --user-conf $USER_CONF \
+    --total-sample-count $N_COUNT \
+    --device $DEVICE \
+    --dataset-path $DATASET_PATH \
+    --dtype $DATA_TYPE \
+    --accuracy \
+    --output-log-dir $LOG_PATH \
+    --quantize \
+    --quant_config_path $QUANT_CONFIG_PATH \
+    --quant_param_path $QUANT_PARAM_PATH \
+    --quant_format_path $QUANT_FORMAT_PATH \
+    --model_source $MODEL_SOURCE
+
+
+
+duration=$SECONDS
+printf "$((duration / 60)) minutes and $((duration % 60)) seconds elapsed." &> $LOG_PATH/elapsed_time.log
+
+ACCURACY_LOG_FILE=$LOG_PATH/mlperf_log_accuracy.json
+python evaluate-accuracy.py --checkpoint-path $CHECKPOINT_PATH \
+    --mlperf-accuracy-file $ACCURACY_LOG_FILE \
+    --dataset-file $DATASET_PATH --dtype int64 \
+    &> $LOG_PATH/accuracy_result.log
+printf "Saved eval logs to $LOG_PATH\n"
+
+printf "\n============= End of eval =============\n"
+
+# unset exported env. variables
+unset SCENARIO
+unset DATA_TYPE
+unset N_COUNT
+unset DEVICE
+
+# exit from conda env.
+conda deactivate
+
+# get back to git root
+cd $git_dir
diff --git a/scripts/eval_qllama2-70b_golden_ci.sh b/scripts/eval_qllama2-70b_golden_ci.sh
new file mode 100644
index 000000000..0aa702532
--- /dev/null
+++ b/scripts/eval_qllama2-70b_golden_ci.sh
@@ -0,0 +1,103 @@
+#!/bin/bash
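+
+# CI variant of eval_qllama2-70b_golden.sh: same flow, but defaults to
+# float32 and N_COUNT=10 so it finishes quickly as a smoke test.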
+
+# define env. variables
+model_name=qllama2-70b
+model_dir=language/llama2-70b
+git_dir=$(git rev-parse --show-toplevel)
+work_dir=$git_dir/$model_dir
+data_dir=$git_dir/data
+quant_data_dir=$data_dir/quantization/llama2
+log_dir=$git_dir/logs
+env_name=mlperf-$model_name
+conda_base=$($CONDA_EXE info --base)
+
+# work on model directory
+cd $work_dir
+
+# enter existing conda env.
+source "$conda_base/etc/profile.d/conda.sh"
+conda activate $env_name
+
+# eval model
+printf "\n============= STEP-4: Run eval =============\n"
+SCENARIO=${SCENARIO:="Offline"}
+DATA_TYPE=${DATA_TYPE:="float32"}
+N_COUNT=${N_COUNT:="10"} # CI default; the full set is 24,576
+DEVICE=${DEVICE:="cuda:0"}
+
+if [ "$DEVICE" = "cpu" ];
+    then DATA_TYPE=float32;
+fi
+
+# quantization args
+QUANT_CONFIG_PATH=$quant_data_dir/quant_config.yaml
+QUANT_PARAM_PATH=$quant_data_dir/quant_param_golden.npy
+QUANT_FORMAT_PATH=$quant_data_dir/quant_format_golden.yaml
+MODEL_SOURCE=furiosa_llm_rope
+QUANT_DATATYPE=W8A8KV8
+
+
+printf "<<EVAL_CONFIG>>\n"
+printf "\tSCENARIO: $SCENARIO\n"
+printf "\tDATA_TYPE: $DATA_TYPE\n"
+printf "\tNUM_DATA: $N_COUNT\n"
+printf "\tDEVICE: $DEVICE\n"
+
+if ((${N_COUNT} < 2000));
+    then USER_CONF=$git_dir/internal_test.conf;
+else
+    USER_CONF=user.conf;
+fi
+
+CHECKPOINT_PATH=$data_dir/models/llama2/Llama-2-70b-chat-hf
+DATASET_PATH=$data_dir/dataset/open-orca/validation/open_orca_gpt4_tokenized_llama.sampled_24576.pkl
+LOG_PATH=$log_dir/$model_name/$SCENARIO/$QUANT_DATATYPE/$(date +%Y%m%d_%H%M%S%Z)
+
+SECONDS=0
+python -u main.py --scenario $SCENARIO \
+    --model-path $CHECKPOINT_PATH \
+    --mlperf-conf ../../mlperf.conf \
+    --user-conf $USER_CONF \
+    --total-sample-count $N_COUNT \
+    --device $DEVICE \
+    --dataset-path $DATASET_PATH \
+    --dtype $DATA_TYPE \
+    --accuracy \
+    --output-log-dir $LOG_PATH \
+    --quantize \
+    --quant_config_path $QUANT_CONFIG_PATH \
+    --quant_param_path $QUANT_PARAM_PATH \
+    --quant_format_path $QUANT_FORMAT_PATH \
+    --model_source $MODEL_SOURCE
+
+
+
+duration=$SECONDS
+printf "$((duration / 60)) minutes and $((duration % 60)) seconds elapsed." &> $LOG_PATH/elapsed_time.log
+
+ACCURACY_LOG_FILE=$LOG_PATH/mlperf_log_accuracy.json
+python evaluate-accuracy.py --checkpoint-path $CHECKPOINT_PATH \
+    --mlperf-accuracy-file $ACCURACY_LOG_FILE \
+    --dataset-file $DATASET_PATH --dtype int64 \
+    &> $LOG_PATH/accuracy_result.log
+printf "Saved eval logs to $LOG_PATH\n"
+
+printf "\n============= End of eval =============\n"
+
+# unset exported env. variables
+unset SCENARIO
+unset DATA_TYPE
+unset N_COUNT
+unset DEVICE
+
+# exit from conda env.
+conda deactivate
+
+# get back to git root
+cd $git_dir
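+
+# (optional) after a run, per-run results land under the timestamped $LOG_PATH
+# created above: accuracy_result.log (eval metrics) and elapsed_time.log (wall time).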