Merge branch 'master' into baichuan2
ilya-lavrenov authored Apr 8, 2024
2 parents 9ba929d + 053dd88 commit 1811b26
Showing 3 changed files with 58 additions and 11 deletions.
41 changes: 33 additions & 8 deletions .github/workflows/causal_lm_cpp.yml
@@ -10,6 +10,7 @@ on:
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
cpp-beam_search_causal_lm-Mistral-7B:
runs-on: ubuntu-20.04-16-cores
@@ -37,6 +38,21 @@ jobs:
source ./ov/setupvars.sh
convert_tokenizer ./Mistral-7B-v0.1/pytorch/dldt/FP16/ --output ./Mistral-7B-v0.1/pytorch/dldt/FP16/ --with-detokenizer --trust-remote-code
timeout 50s ./build/beam_search_causal_lm ./Mistral-7B-v0.1/pytorch/dldt/FP16/ 69 > ./pred.txt
python -c "
import transformers
with open('pred.txt', 'r') as file:
predictions = file.read()
tokenizer = transformers.LlamaTokenizer.from_pretrained('mistralai/Mistral-7B-v0.1')
tokenized = tokenizer('69', return_tensors='pt')
for beam in transformers.LlamaForCausalLM.from_pretrained('mistralai/Mistral-7B-v0.1').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n'
idx = predictions.find(ref)
if -1 == idx:
raise RuntimeError(f'Missing "{ref=}" from predictions')
predictions = predictions[:idx] + predictions[idx + len(ref):]
"
echo "69" passed
cpp-greedy_causal_lm-ubuntu:
runs-on: ubuntu-20.04-8-cores
steps:
@@ -63,6 +79,7 @@ jobs:
source ./ov/setupvars.sh
convert_tokenizer ./open_llama_3b_v2/pytorch/dldt/FP16/ --output ./open_llama_3b_v2/pytorch/dldt/FP16/ --with-detokenizer
./build/greedy_causal_lm ./open_llama_3b_v2/pytorch/dldt/FP16/ "return 0"
cpp-beam_search_causal_lm-ubuntu:
runs-on: ubuntu-20.04
steps:
@@ -103,7 +120,7 @@ jobs:
raise RuntimeError(f'Missing "{ref=}" from predictions')
predictions = predictions[:idx] + predictions[idx + len(ref):]
"
echo 69 passed
echo "69" passed
timeout 25s ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/ Hi > ./pred.txt
python -c "
@@ -119,7 +136,7 @@ jobs:
raise RuntimeError(f'Missing "{ref=}" from predictions')
predictions = predictions[:idx] + predictions[idx + len(ref):]
"
echo Hi passed
echo "Hi" passed
timeout 25s ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/ "return 0" > ./pred.txt
python -c "
@@ -135,7 +152,7 @@ jobs:
raise RuntimeError(f'Missing "{ref=}" from predictions')
predictions = predictions[:idx] + predictions[idx + len(ref):]
"
echo return 0 passed
echo "return 0" passed
./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/ "你好! 你好嗎?" > ./pred.txt
python -c "
@@ -151,7 +168,8 @@ jobs:
raise RuntimeError(f'Missing "{ref=}" from predictions')
predictions = predictions[:idx] + predictions[idx + len(ref):]
"
echo 你好! 你好嗎? passed
echo "你好! 你好嗎?" passed
cpp-beam_search_causal_lm-windows:
runs-on: windows-latest
steps:
@@ -192,6 +210,7 @@ jobs:
echo raise RuntimeError(f'Missing "{ref=}" from predictions') >> ref.py
echo predictions = predictions[:idx] + predictions[idx + len(ref):] >> ref.py
python ref.py
cpp-beam_search_causal_lm-Qwen-7B-Chat:
runs-on: ubuntu-20.04-16-cores
steps:
@@ -218,6 +237,7 @@ jobs:
source ./ov/setupvars.sh
convert_tokenizer ./Qwen-7B-Chat/pytorch/dldt/FP16/ --output ./Qwen-7B-Chat/pytorch/dldt/FP16/ --with-detokenizer --trust-remote-code
timeout 50s ./build/beam_search_causal_lm ./Qwen-7B-Chat/pytorch/dldt/FP16/ 69 > ./pred.txt
cpp-beam_search_causal_lm-Qwen1_5-7B-Chat:
runs-on: ubuntu-20.04-16-cores
steps:
@@ -244,6 +264,7 @@ jobs:
source ./ov/setupvars.sh
convert_tokenizer ./Qwen1.5-7B-Chat/pytorch/dldt/FP16/ --output ./Qwen1.5-7B-Chat/pytorch/dldt/FP16/ --with-detokenizer --trust-remote-code
timeout 50s ./build/beam_search_causal_lm ./Qwen1.5-7B-Chat/pytorch/dldt/FP16/ "你好!" > ./pred_qwen15.txt
cpp-beam_search_causal_lm-Baichuan2-7B-Chat:
runs-on: ubuntu-20.04-16-cores
steps:
@@ -261,7 +282,7 @@ jobs:
- name: Download, convert and build
run: |
source ./ov/setupvars.sh
python -m pip install --upgrade-strategy eager "optimum>=1.14" -r ./llm_bench/python/requirements.txt ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://download.pytorch.org/whl/cpu && python ./llm_bench/python/convert.py --model_id baichuan-inc/Baichuan2-7B-Chat --output_dir ./Baichuan2-7B-Chat/ --precision FP16 &
python -m pip install --upgrade-strategy eager git+https://github.com/huggingface/optimum-intel.git@972491991710f8a92cdef35e0914de92a88995a4 -r ./llm_bench/python/requirements.txt ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://download.pytorch.org/whl/cpu && python ./llm_bench/python/convert.py --model_id baichuan-inc/Baichuan2-7B-Chat --output_dir ./Baichuan2-7B-Chat/ --precision FP16 &
cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/
cmake --build ./build/ --config Release -j
wait
@@ -285,6 +306,7 @@ jobs:
predictions = predictions[:idx] + predictions[idx + len(ref):]
"
echo 69 passed
cpp-beam_search_causal_lm-Phi-2:
runs-on: ubuntu-20.04-16-cores
steps:
@@ -310,7 +332,8 @@ jobs:
run: |
source ./ov/setupvars.sh
convert_tokenizer ./Phi-2/pytorch/dldt/FP16/ --output ./Phi-2/pytorch/dldt/FP16/ --with-detokenizer --trust-remote-code
timeout 50s ./build/beam_search_causal_lm ./Phi-2/pytorch/dldt/FP16/ 69 > ./pred.txt
cpp-beam_search_causal_lm-notus-7b-v1:
runs-on: ubuntu-20.04-16-cores
steps:
@@ -337,6 +360,7 @@ jobs:
source ./ov/setupvars.sh
convert_tokenizer ./notus-7b-v1/pytorch/dldt/FP16/ --output ./notus-7b-v1/pytorch/dldt/FP16/ --with-detokenizer --trust-remote-code
timeout 50s ./build/beam_search_causal_lm ./notus-7b-v1/pytorch/dldt/FP16/ 69 > ./pred.txt
cpp-speculative_decoding_lm-ubuntu:
runs-on: ubuntu-20.04-16-cores
steps:
@@ -374,7 +398,8 @@ jobs:
predicted_speculative = f.readline()
assert predicted_greedy == predicted_speculative
"
echo speculative_decoding_lm passed
echo "Alan Turing was a" passed
cpp-Phi-1_5:
runs-on: ubuntu-20.04-16-cores
steps:
@@ -417,4 +442,4 @@ jobs:
raise RuntimeError(f'Missing "{ref=}" from predictions')
predictions = predictions[:idx] + predictions[idx + len(ref):]
"
echo Phi-1_5 passed
echo "Alan Turing was a" passed
17 changes: 17 additions & 0 deletions llm_bench/python/README.md
@@ -122,6 +122,23 @@ Add the option `--torch_compile_backend` with the desired backend: `pytorch` or
python ./benchmark.py -m models/llama-2-7b-chat/pytorch -d CPU --torch_compile_backend openvino
```

## Run on a 2-socket platform

`benchmark.py` sets `openvino.properties.streams.num(1)` by default.

| OpenVINO version | Behavior |
|:--------------------|:------------------------------------------------|
| Before 2024.0.0 | `streams.num(1)` <br>executes on 2 sockets. |
| 2024.0.0 | `streams.num(1)` <br>executes on the same socket the application is running on. |

`numactl` on Linux or the `--load_config` option of `benchmark.py` can be used to change this behavior.
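
For instance, here is a minimal sketch of pinning `benchmark.py` to a single socket with `numactl` on Linux; the NUMA node IDs and the model path are illustrative assumptions, not values prescribed by this commit:
```
# Run the benchmark on the cores and memory of NUMA node 0 only (adjust node IDs and model path to your machine).
numactl --cpunodebind=0 --membind=0 python ./benchmark.py -m models/llama-2-7b-chat/pytorch -d CPU
```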

For example, in OpenVINO 2024.0.0, passing `--load_config config.json` with the following content results in `streams.num(1)` executing on 2 sockets.
```
{"INFERENCE_NUM_THREADS":<NUMBER>}
```
`<NUMBER>` is the total number of physical cores across the 2 sockets.
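
As an illustrative sketch only (assuming a hypothetical 2-socket machine with 32 physical cores per socket, i.e. 64 in total), the config file and invocation could look like:
```
# Write a config that allows inference threads to span both sockets, then pass it to benchmark.py.
echo '{"INFERENCE_NUM_THREADS": 64}' > config.json
python ./benchmark.py -m models/llama-2-7b-chat/pytorch -d CPU --load_config config.json
```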

## Additional Resources
### 1. NOTE
> If you encounter any errors, please check **[NOTES.md](./doc/NOTES.md)** which provides solutions to the known errors.
11 changes: 8 additions & 3 deletions text_generation/causal_lm/cpp/README.md
@@ -93,12 +93,18 @@ convert_tokenizer .\TinyLlama-1.1B-Chat-v1.0\pytorch\dldt\FP16\ --output .\TinyL

## Run

Usage:
### Usage:
1. `greedy_causal_lm <MODEL_DIR> "<PROMPT>"`
2. `beam_search_causal_lm <MODEL_DIR> "<PROMPT>"`
3. `speculative_decoding_lm <DRAFT_MODEL_DIR> <MAIN_MODEL_DIR> "<PROMPT>"`

Examples:
### Examples:
#### Windows:
1. `./build/Release/greedy_causal_lm ./TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/ "Why is the Sun yellow?"`
2. `./build/Release/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/ "Why is the Sun yellow?"`
3. `./build/Release/speculative_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/ ./Llama-2-7b-chat-hf/pytorch/dldt/FP16/ "Why is the Sun yellow?"`

#### Linux/macOS:
1. `./build/greedy_causal_lm ./TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/ "Why is the Sun yellow?"`
2. `./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/ "Why is the Sun yellow?"`
3. `./build/speculative_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/ ./Llama-2-7b-chat-hf/pytorch/dldt/FP16/ "Why is the Sun yellow?"`
@@ -145,5 +151,4 @@ To enable Unicode characters for Windows cmd open `Region` settings from `Contro
11. [zephyr-7b-beta](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta)
12. [Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)


This pipeline can work with other similar topologies produced by `optimum-intel` with the same model signature.
