diff --git a/.github/workflows/llm_bench-python.yml b/.github/workflows/llm_bench-python.yml
index 878ec77433..2554330601 100644
--- a/.github/workflows/llm_bench-python.yml
+++ b/.github/workflows/llm_bench-python.yml
@@ -27,7 +27,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.9"]
+        python-version: ["3.10"]
     steps:
       - uses: actions/checkout@v4
@@ -73,6 +73,7 @@ jobs:
           GIT_CLONE_PROTECTION_ACTIVE=false pip install -r ${{ env.WWB_PATH }}/requirements.txt
           pip install git+https://github.com/huggingface/optimum.git
           GIT_CLONE_PROTECTION_ACTIVE=false pip install ${{ env.WWB_PATH }}
+          python -m pip install -U --pre openvino openvino-tokenizers openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly --force-reinstall
           python -m pytest llm_bench/python/who_what_benchmark/tests
   stateful:
     runs-on: ubuntu-20.04
@@ -80,7 +81,7 @@
       - uses: actions/checkout@v4
       - uses: actions/setup-python@v4
         with:
-          python-version: 3.9
+          python-version: "3.10"
       - name: Test stateful
         run: |
           GIT_CLONE_PROTECTION_ACTIVE=false python -m pip install -r llm_bench/python/requirements.txt
@@ -94,4 +95,5 @@ jobs:
           pip install git+https://github.com/huggingface/optimum.git
           GIT_CLONE_PROTECTION_ACTIVE=false pip install llm_bench/python/who_what_benchmark/
           pip install pytest
+          python -m pip install -U --pre openvino openvino-tokenizers openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly --force-reinstall
           python -m pytest llm_bench/python/who_what_benchmark/tests
diff --git a/llm_bench/python/who_what_benchmark/README.md b/llm_bench/python/who_what_benchmark/README.md
index d140b0af75..ab72bc7a89 100644
--- a/llm_bench/python/who_what_benchmark/README.md
+++ b/llm_bench/python/who_what_benchmark/README.md
@@ -1,10 +1,15 @@
-# Simple Accuracy Benchmark for Optimized LLMs
+# Simple Accuracy Benchmark for Generative AI models
 
-Simple and quick accuracy test for compressed, quantized, pruned, distilled LLMs. It works with any model that suppors HuggingFace Transformers text generation API including:
-* HuggingFace Transformers compressed models via [Bitsandbytes](https://huggingface.co/docs/transformers/main_classes/quantization#transformers.BitsAndBytesConfig)
-* [GPTQ](https://huggingface.co/docs/transformers/main_classes/quantization#transformers.GPTQConfig) via HuggingFace API
-* Llama.cpp via [BigDL-LLM](https://github.com/intel-analytics/BigDL/tree/main/python/llm)
-* [OpenVINO](https://github.com/openvinotoolkit/openvino) and [NNCF](https://github.com/openvinotoolkit/nncf) via [Optimum-Intel](https://github.com/huggingface/optimum-intel)
+## Features
+
+* Simple and quick accuracy test for compressed, quantized, pruned, and distilled LLMs. It works with any model that supports the HuggingFace Transformers text generation API, including:
+  * HuggingFace Transformers compressed models via [Bitsandbytes](https://huggingface.co/docs/transformers/main_classes/quantization#transformers.BitsAndBytesConfig)
+  * [GPTQ](https://huggingface.co/docs/transformers/main_classes/quantization#transformers.GPTQConfig) via HuggingFace API
+  * Llama.cpp via [BigDL-LLM](https://github.com/intel-analytics/BigDL/tree/main/python/llm)
+  * [OpenVINO](https://github.com/openvinotoolkit/openvino) and [NNCF](https://github.com/openvinotoolkit/nncf) via [Optimum-Intel](https://github.com/huggingface/optimum-intel)
+  * Support for custom datasets of the user's choice
+* Validation of text-to-image pipelines. Computes the similarity score between generated images:
+  * Supports the Diffusers library and Optimum-Intel via the `Text2ImageEvaluator` class.
 
 The main idea is to compare similarity of text generation between baseline and optimized LLMs.
 
@@ -19,7 +24,7 @@
 base_small = AutoModelForCausalLM.from_pretrained(model_id)
 optimized_model = AutoModelForCausalLM.from_pretrained(model_id, load_in_4bit=True, device_map="auto")
 tokenizer = AutoTokenizer.from_pretrained(model_id)
-evaluator = whowhatbench.Evaluator(base_model=base_small, tokenizer=tokenizer)
+evaluator = whowhatbench.TextEvaluator(base_model=base_small, tokenizer=tokenizer)
 metrics_per_prompt, metrics = evaluator.score(optimized_model)
 
 metric_of_interest = "similarity"
@@ -50,7 +55,7 @@ metrics_per_prompt, metrics = evaluator.score(optimized_model, test_data=prompts
 * source eval_env/bin/activate
 * pip install -r requirements.txt
 
-### CLI example
+### CLI example for text-generation models
 
 ```sh
 wwb --help
@@ -87,6 +92,18 @@ wwb --base-model meta-llama/Llama-2-7b-chat-hf --gt-data llama_2_7b_wwb_gt.csv -
 wwb --base-model meta-llama/Llama-2-7b-chat-hf --gt-data llama_2_7b_wwb_gt.csv --hf
 ```
 
+### Example of Stable Diffusion comparison
+```sh
+# Export FP16 model
+optimum-cli export openvino -m SimianLuo/LCM_Dreamshaper_v7 --weight-format fp16 sd-lcm-fp16
+# Export INT8 WOQ model
+optimum-cli export openvino -m SimianLuo/LCM_Dreamshaper_v7 --weight-format int8 sd-lcm-int8
+# Collect the references
+wwb --base-model sd-lcm-fp16 --gt-data lcm_test/sd_xl.json --model-type text-to-image
+# Compute the metric
+wwb --target-model sd-lcm-int8 --gt-data lcm_test/sd_xl.json --model-type text-to-image
+```
+
 ### Supported metrics
 
 * `similarity` - averaged similarity measured by neural network trained for sentence embeddings. The best is 1.0, the minimum is 0.0, higher-better.
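Reviewer note: to make the `Evaluator` → `TextEvaluator` rename and the new image path easier to follow, below is a minimal Python sketch of how the `Text2ImageEvaluator` mentioned in the README might be used, mirroring the `TextEvaluator` snippet above. The constructor and `score()` keyword arguments and the `similarity` key are assumptions based on this diff, not a verified API.

```python
# Hedged sketch only: assumes Text2ImageEvaluator is exported from the
# whowhatbench package and that score() mirrors TextEvaluator.score(),
# returning (metrics_per_prompt, metrics) with a "similarity" entry.
import whowhatbench
from optimum.intel import OVPipelineForText2Image

# Pipelines exported by the optimum-cli commands in the README example above.
base_pipe = OVPipelineForText2Image.from_pretrained("sd-lcm-fp16")
optimized_pipe = OVPipelineForText2Image.from_pretrained("sd-lcm-int8")

evaluator = whowhatbench.Text2ImageEvaluator(base_model=base_pipe)
metrics_per_prompt, metrics = evaluator.score(optimized_pipe)
print(metrics["similarity"])
```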
diff --git a/llm_bench/python/who_what_benchmark/tests/test_cli_image.py b/llm_bench/python/who_what_benchmark/tests/test_cli_image.py
index f4c10eac86..a1e1b3934b 100644
--- a/llm_bench/python/who_what_benchmark/tests/test_cli_image.py
+++ b/llm_bench/python/who_what_benchmark/tests/test_cli_image.py
@@ -19,9 +19,9 @@ def run_wwb(args):
 @pytest.mark.parametrize(
     ("model_id", "model_type", "backend"),
     [
-        ("hf-internal-testing/tiny-stable-diffusion-torch", "sd", "hf"),
-        ("hf-internal-testing/tiny-stable-diffusion-torch", "sd", "openvino"),
-        ("hf-internal-testing/tiny-stable-diffusion-xl-pipe", "sd-xl", "hf"),
+        ("hf-internal-testing/tiny-stable-diffusion-torch", "text-to-image", "hf"),
+        ("hf-internal-testing/tiny-stable-diffusion-torch", "text-to-image", "openvino"),
+        ("hf-internal-testing/tiny-stable-diffusion-xl-pipe", "text-to-image", "hf"),
     ],
 )
 def test_image_model_types(model_id, model_type, backend):
@@ -61,7 +61,7 @@ def test_image_model_types(model_id, model_type, backend):
 @pytest.mark.parametrize(
     ("model_id", "model_type", "backend"),
     [
-        ("hf-internal-testing/tiny-stable-diffusion-torch", "sd", "hf"),
+        ("hf-internal-testing/tiny-stable-diffusion-torch", "text-to-image", "hf"),
     ],
 )
 def test_image_custom_dataset(model_id, model_type, backend):
diff --git a/llm_bench/python/who_what_benchmark/whowhatbench/registry.py b/llm_bench/python/who_what_benchmark/whowhatbench/registry.py
index 208ba60ff3..867b53e27a 100644
--- a/llm_bench/python/who_what_benchmark/whowhatbench/registry.py
+++ b/llm_bench/python/who_what_benchmark/whowhatbench/registry.py
@@ -1,25 +1,12 @@
-from abc import ABC, abstractmethod
-from optimum.intel import (
-    OVLatentConsistencyModelPipeline,
-    OVStableDiffusionPipeline,
-    OVStableDiffusionXLPipeline,
-)
+from abc import ABC, abstractmethod
 
 # Registry for evaluators
 EVALUATOR_REGISTRY = {}
 MODELTYPE2TASK = {
     "text": "text-generation",
-    "sd": "image-generation",
-    "sd-xl": "image-generation",
-    "sd-lcm": "image-generation",
-}
-
-TEXT2IMAGE_TASK2CLASS = {
-    "sd": OVStableDiffusionPipeline,
-    "sd-xl": OVStableDiffusionXLPipeline,
-    "sd-lcm": OVLatentConsistencyModelPipeline,
+    "text-to-image": "text-to-image",
 }
diff --git a/llm_bench/python/who_what_benchmark/whowhatbench/text2image_evaluator.py b/llm_bench/python/who_what_benchmark/whowhatbench/text2image_evaluator.py
index b8b8234547..79dda2dcc9 100644
--- a/llm_bench/python/who_what_benchmark/whowhatbench/text2image_evaluator.py
+++ b/llm_bench/python/who_what_benchmark/whowhatbench/text2image_evaluator.py
@@ -17,11 +17,16 @@
         "Illustration of an astronaut sitting in outer space, moon behind him",
         "A vintage illustration of a retro computer, vaporwave aesthetic, light pink and light blue",
         "A view from beautiful alien planet, very beautiful, surealism, retro astronaut on the first plane, 8k photo",
+        "red car in snowy forest, epic vista, beautiful landscape, 4k, 8k",
+        "A raccoon trapped inside a glass jar full of colorful candies, the background is steamy with vivid colors",
+        "cute cat 4k, high-res, masterpiece, best quality, soft lighting, dynamic angle",
+        "A cat holding a sign that says hello OpenVINO",
+        "A small cactus with a happy face in the Sahara desert.",
     ],
 }
 
 
-@register_evaluator("image-generation")
+@register_evaluator("text-to-image")
 class Text2ImageEvaluator(BaseEvaluator):
     def __init__(
         self,
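Reviewer note: the registry change and the decorator change in `text2image_evaluator.py` fit together as sketched below. The dictionary contents and the decorator usage come from the diff; the `register_evaluator` body is a simplified sketch, and the real implementation in `registry.py` may differ (e.g. accept several names or add validation).

```python
# Simplified sketch of the registry wiring after this change; not the actual
# registry.py implementation.
EVALUATOR_REGISTRY = {}
MODELTYPE2TASK = {
    "text": "text-generation",
    "text-to-image": "text-to-image",
}

def register_evaluator(task_name):
    def decorator(cls):
        # Map the task name to the evaluator class so wwb.py can look it up.
        EVALUATOR_REGISTRY[task_name] = cls
        return cls
    return decorator

@register_evaluator("text-to-image")
class Text2ImageEvaluator:  # stand-in for the real evaluator class
    pass

# wwb.py resolves the evaluator class through the two dictionaries:
task = MODELTYPE2TASK["text-to-image"]   # -> "text-to-image"
EvaluatorCLS = EVALUATOR_REGISTRY[task]  # -> Text2ImageEvaluator
```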
diff --git a/llm_bench/python/who_what_benchmark/whowhatbench/wwb.py b/llm_bench/python/who_what_benchmark/whowhatbench/wwb.py
index 3798bb044c..19c6aed2cd 100644
--- a/llm_bench/python/who_what_benchmark/whowhatbench/wwb.py
+++ b/llm_bench/python/who_what_benchmark/whowhatbench/wwb.py
@@ -11,11 +11,7 @@
 from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM
 from optimum.exporters.tasks import TasksManager
-from optimum.intel import (
-    OVLatentConsistencyModelPipeline,
-    OVStableDiffusionPipeline,
-    OVStableDiffusionXLPipeline,
-)
+from optimum.intel import OVPipelineForText2Image
 import openvino_genai
 
 from whowhatbench import EVALUATOR_REGISTRY, MODELTYPE2TASK
@@ -95,9 +91,7 @@ def load_text_model(
 
 
 TEXT2IMAGE_TASK2CLASS = {
-    "sd": OVStableDiffusionPipeline,
-    "sd-xl": OVStableDiffusionXLPipeline,
-    "sd-lcm": OVLatentConsistencyModelPipeline,
+    "text-to-image": OVPipelineForText2Image,
 }
 
 
@@ -142,7 +136,7 @@ def load_model(
     if model_type == "text":
         return load_text_model(model_id, device, ov_config, use_hf, use_genai)
-    elif MODELTYPE2TASK[model_type] == "image-generation":
+    elif MODELTYPE2TASK[model_type] == "text-to-image":
         return load_text2image_model(
             model_type, model_id, device, ov_config, use_hf, use_genai
         )
@@ -203,9 +197,9 @@ def parse_args():
     parser.add_argument(
         "--model-type",
         type=str,
-        choices=["text", "sd", "sd-xl", "sd-lcm"],
+        choices=["text", "text-to-image"],
         default="text",
-        help="Indicated the model type, e.g. 'text', 'sd'.",
+        help="Indicates the model type: 'text' for LLMs, 'text-to-image' for text-to-image pipelines.",
     )
     parser.add_argument(
         "--data-encoder",
@@ -367,7 +361,7 @@ def get_evaluator(base_model, args):
             language=args.language,
             gen_answer_fn=genai_gen_answer if args.genai else None,
         )
-    elif task == "image-generation":
+    elif task == "text-to-image":
         return EvaluatorCLS(
             base_model=base_model,
             gt_data=args.gt_data,
@@ -467,7 +461,7 @@ def main():
     if args.verbose and args.target_model is not None:
         if args.model_type == "text":
             print_text_results(evaluator)
-        elif "sd" in args.model_type:
+        elif args.model_type == "text-to-image":
             print_image_results(evaluator)
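Reviewer note: with the three per-architecture pipeline classes collapsed into `OVPipelineForText2Image`, the OpenVINO branch of `load_text2image_model` reduces to a single dictionary lookup. The sketch below illustrates that path under that assumption; the device/`ov_config` plumbing and the HF/GenAI backends handled by the real loader are omitted.

```python
# Sketch only: the real load_text2image_model also covers the --hf and --genai
# backends and forwards device/ov_config; omitted here for brevity.
from optimum.intel import OVPipelineForText2Image

TEXT2IMAGE_TASK2CLASS = {
    "text-to-image": OVPipelineForText2Image,
}

def load_ov_text2image(model_type: str, model_id: str):
    # OVPipelineForText2Image is expected to dispatch to the right pipeline
    # (SD, SDXL, LCM) from the exported model's config, which is what allows
    # the old per-architecture mapping to be dropped.
    pipeline_cls = TEXT2IMAGE_TASK2CLASS[model_type]
    return pipeline_cls.from_pretrained(model_id)

pipe = load_ov_text2image("text-to-image", "sd-lcm-int8")
```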