" + ) + ); + }, + + /** + * helper function to hide the search marks again + */ + hideSearchWords: () => { + document + .querySelectorAll("#searchbox .highlight-link") + .forEach((el) => el.remove()); + document + .querySelectorAll("span.highlighted") + .forEach((el) => el.classList.remove("highlighted")); + localStorage.removeItem("sphinx_highlight_terms") + }, + + initEscapeListener: () => { + // only install a listener if it is really needed + if (!DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS) return; + + document.addEventListener("keydown", (event) => { + // bail for input elements + if (BLACKLISTED_KEY_CONTROL_ELEMENTS.has(document.activeElement.tagName)) return; + // bail with special keys + if (event.shiftKey || event.altKey || event.ctrlKey || event.metaKey) return; + if (DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS && (event.key === "Escape")) { + SphinxHighlight.hideSearchWords(); + event.preventDefault(); + } + }); + }, +}; + +_ready(() => { + /* Do not call highlightSearchWords() when we are on the search page. + * It will highlight words from the *previous* search query. + */ + if (typeof Search === "undefined") SphinxHighlight.highlightSearchWords(); + SphinxHighlight.initEscapeListener(); +}); diff --git a/llm/qwen2/cpu/_static/twemoji.css b/llm/qwen2/cpu/_static/twemoji.css new file mode 100644 index 000000000..878d070d1 --- /dev/null +++ b/llm/qwen2/cpu/_static/twemoji.css @@ -0,0 +1,6 @@ +img.emoji { + height: 1em; + width: 1em; + margin: 0 .05em 0 .1em; + vertical-align: -0.1em; +} diff --git a/llm/qwen2/cpu/_static/twemoji.js b/llm/qwen2/cpu/_static/twemoji.js new file mode 100644 index 000000000..91bc868f4 --- /dev/null +++ b/llm/qwen2/cpu/_static/twemoji.js @@ -0,0 +1,10 @@ +function addEvent(element, eventName, fn) { + if (element.addEventListener) + element.addEventListener(eventName, fn, false); + else if (element.attachEvent) + element.attachEvent('on' + eventName, fn); +} + +addEvent(window, 'load', function() { + twemoji.parse(document.body, {'folder': 'svg', 'ext': '.svg'}); +}); diff --git a/llm/qwen2/cpu/genindex.html b/llm/qwen2/cpu/genindex.html new file mode 100644 index 000000000..9f5785d50 --- /dev/null +++ b/llm/qwen2/cpu/genindex.html @@ -0,0 +1,114 @@ + + + + + +Intel® Extension for PyTorch* provides dedicated optimization for running Qwen2 models faster, including technical points like paged attention, ROPE fusion, etc. And a set of data types are supported for various scenarios, including BF16, Weight Only Quantization, etc.
Several environment setup methods are provided. You can choose any of them according to your usage scenario. The Docker-based ones are recommended.
# Get the Intel® Extension for PyTorch* source code
git clone https://github.com/intel/intel-extension-for-pytorch.git
cd intel-extension-for-pytorch
git checkout 2.3-qwen-2
git submodule sync
git submodule update --init --recursive

# Build an image with the provided Dockerfile by installing from Intel® Extension for PyTorch* prebuilt wheel files
DOCKER_BUILDKIT=1 docker build -f examples/cpu/inference/python/llm/Dockerfile -t ipex-llm:qwen2 .

# Run the container with the command below
docker run --rm -it --privileged ipex-llm:qwen2 bash

# When the command prompt shows inside the docker container, enter the llm examples directory
cd llm

# Activate environment variables
source ./tools/env_activate.sh
# Get the Intel® Extension for PyTorch* source code
git clone https://github.com/intel/intel-extension-for-pytorch.git
cd intel-extension-for-pytorch
git checkout 2.3-qwen-2
git submodule sync
git submodule update --init --recursive

# Create a conda environment (pre-built wheel only available with python=3.10)
conda create -n llm python=3.10 -y
conda activate llm

# Setup the environment with the provided script
# A sample "prompt.json" file for benchmarking is also downloaded
cd examples/cpu/inference/python/llm
bash ./tools/env_setup.sh 7

# Activate environment variables
source ./tools/env_activate.sh
ipex.llm provides a single script to facilitate running generation tasks, as shown below:
# if you are using a docker container built from commands above in Sec. 1.1, the placeholder LLM_DIR below is /home/ubuntu/llm
# if you are using a conda env created with commands above in Sec. 1.2, the placeholder LLM_DIR below is intel-extension-for-pytorch/examples/cpu/inference/python/llm
cd <LLM_DIR>
python run.py --help # for more detailed usages
Key args of run.py | Notes |
---|---|
model id | --model-name-or-path or -m to specify the <QWEN2_MODEL_ID_OR_LOCAL_PATH>; it is a model id from Hugging Face or a downloaded local path |
generation | default: beam search (beam size = 4); --greedy for greedy search |
input tokens | provide fixed sizes for the input prompt; use --input-tokens for <INPUT_LENGTH> in [1024, 2048, 4096, 8192, 16384, 32768]; if --input-tokens is not used, use --prompt to choose other strings as prompt inputs |
output tokens | default: 32; use --max-new-tokens to choose any other size |
batch size | default: 1; use --batch-size to choose any other size |
token latency | enable --token-latency to print out the first or next token latency |
generation iterations | use --num-iter and --num-warmup to control the repeated iterations of generation; default: 100-iter/10-warmup |
streaming mode output | greedy search only (works with --greedy); use --streaming to enable streaming generation output |
Note: You may need to log in to your HuggingFace account to access the model files. Please refer to HuggingFace login.
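As a sketch, one common way to log in from the command line is via the huggingface_hub CLI, assuming it is installed in your environment:

# prompts for an access token generated from your Hugging Face account settings
huggingface-cli login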
The <QWEN2_MODEL_ID_OR_LOCAL_PATH> in the commands below specifies the Qwen2 model you will run, which can be found on HuggingFace Models.
unset KMP_AFFINITY
In the DeepSpeed cases below, we recommend --shard-model to shard the model weights more evenly for better memory usage when running with DeepSpeed. If --shard-model is used, a copy of the sharded model weight files is saved in the --output-dir path (the default path is ./saved_results if not provided).
If you have already used --shard-model and generated such a sharded model path (or your model weight files are already well sharded), then in further repeated benchmarks please remove --shard-model and replace -m <QWEN2_MODEL_ID_OR_LOCAL_PATH> with -m <shard model path> to skip the repeated sharding steps (a sketch of this repeated run is shown after the command below).
Besides, standalone shard-model scripts are also provided in section 2.1.1.4, in case you would like to generate the sharded model weight files in advance before running distributed inference.
Command:
deepspeed --bind_cores_to_rank run.py --benchmark -m <QWEN2_MODEL_ID_OR_LOCAL_PATH> --dtype bfloat16 --ipex --greedy --input-tokens <INPUT_LENGTH> --autotp --shard-model
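As a sketch of the repeated-benchmark case described above, once the sharded weights exist under the default ./saved_results path you could drop --shard-model and point -m at that directory:

# repeated run reusing already-sharded weights (assumes ./saved_results holds the sharded model from a previous run)
deepspeed --bind_cores_to_rank run.py --benchmark -m ./saved_results --dtype bfloat16 --ipex --greedy --input-tokens <INPUT_LENGTH> --autotp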
By default, for weight-only quantization, we use quantization with Automatic Mixed Precision inference (--quant-with-amp) to get peak performance and fair accuracy.
For weight-only quantization with DeepSpeed, we quantize the model and then run the benchmark. The quantized model won't be saved.
Command:
deepspeed --bind_cores_to_rank run.py --benchmark -m <QWEN2_MODEL_ID_OR_LOCAL_PATH> --ipex --ipex-weight-only-quantization --weight-dtype INT8 --quant-with-amp --greedy --input-tokens <INPUT_LENGTH> --autotp --shard-model
To reduce memory usage, we can shard the model weight files under a local path before launching distributed tests with DeepSpeed.

cd ./utils
# general command:
python create_shard_model.py -m <QWEN2_MODEL_ID_OR_LOCAL_PATH> --save-path ./local_qwen2_model_shard
# After sharding the model, use "-m ./local_qwen2_model_shard" in later tests
Command:
OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <physical cores list> python run.py --benchmark -m <QWEN2_MODEL_ID_OR_LOCAL_PATH> --dtype bfloat16 --ipex --greedy --input-tokens <INPUT_LENGTH>
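To make the placeholders concrete, here is a hypothetical filling-in for a machine whose NUMA node 0 has 56 physical cores (cores 0-55); the actual values depend on your hardware, which you can check with lscpu:

# hypothetical example: 56 physical cores on NUMA node 0
OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run.py --benchmark -m <QWEN2_MODEL_ID_OR_LOCAL_PATH> --dtype bfloat16 --ipex --greedy --input-tokens <INPUT_LENGTH>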
By default, for weight-only quantization, we use quantization with Automatic Mixed Precision inference (--quant-with-amp) to get peak performance and fair accuracy.
Command:
OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <physical cores list> python run.py --benchmark -m <QWEN2_MODEL_ID_OR_LOCAL_PATH> --ipex-weight-only-quantization --weight-dtype INT8 --quant-with-amp --output-dir "saved_results" --greedy --input-tokens <INPUT_LENGTH>
Notes:
(1) numactl is used to specify the memory and cores of your hardware for better performance. <node N> specifies the NUMA node id (e.g., 0 to use the memory from the first NUMA node). <physical cores list> specifies the physical cores that you are using from the <node N> NUMA node. You can use the lscpu command in Linux to check the NUMA node information.
(2) For all quantization benchmarks, both the quantization and inference stages are triggered by default. The quantization stage auto-generates a quantized model named best_model.pt in the --output-dir path, and the inference stage launches inference with that quantized model. For inference-only benchmarks (to avoid repeating the quantization stage), you can reuse these quantized models by adding --quantized-model-path <output_dir + "best_model.pt">.
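As a sketch of the inference-only case in note (2), assuming a previous weight-only quantization run wrote best_model.pt into the default "saved_results" directory and that the weight-only quantization flags are kept as in the command above, a repeated benchmark might reuse the quantized model as follows:

# reuse a previously generated quantized model to skip the quantization stage (path is illustrative)
OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <physical cores list> python run.py --benchmark -m <QWEN2_MODEL_ID_OR_LOCAL_PATH> --ipex-weight-only-quantization --weight-dtype INT8 --quant-with-amp --quantized-model-path "./saved_results/best_model.pt" --greedy --input-tokens <INPUT_LENGTH>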
Intel® Extension for PyTorch* also provides dedicated optimizations for many other Large Language Models (LLMs), covering a set of data types for various scenarios. For more details, please check this Intel® Extension for PyTorch* doc.