
Update Stable Diffusion models comparison (#956)
AlexKoff88 authored Oct 14, 2024
1 parent 684251c commit 04879dd
Showing 6 changed files with 48 additions and 43 deletions.
6 changes: 4 additions & 2 deletions .github/workflows/llm_bench-python.yml
@@ -27,7 +27,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.9"]
+        python-version: ["3.10"]
 
     steps:
       - uses: actions/checkout@v4
@@ -73,14 +73,15 @@ jobs:
           GIT_CLONE_PROTECTION_ACTIVE=false pip install -r ${{ env.WWB_PATH }}/requirements.txt
           pip install git+https://github.com/huggingface/optimum.git
           GIT_CLONE_PROTECTION_ACTIVE=false pip install ${{ env.WWB_PATH }}
+          python -m pip install -U --pre openvino openvino-tokenizers openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly --force-reinstall
           python -m pytest llm_bench/python/who_what_benchmark/tests
   stateful:
     runs-on: ubuntu-20.04
     steps:
       - uses: actions/checkout@v4
       - uses: actions/setup-python@v4
         with:
-          python-version: 3.9
+          python-version: "3.10"
       - name: Test stateful
         run: |
           GIT_CLONE_PROTECTION_ACTIVE=false python -m pip install -r llm_bench/python/requirements.txt
@@ -94,4 +95,5 @@ jobs:
           pip install git+https://github.com/huggingface/optimum.git
           GIT_CLONE_PROTECTION_ACTIVE=false pip install llm_bench/python/who_what_benchmark/
           pip install pytest
+          python -m pip install -U --pre openvino openvino-tokenizers openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly --force-reinstall
           python -m pytest llm_bench/python/who_what_benchmark/tests
33 changes: 25 additions & 8 deletions llm_bench/python/who_what_benchmark/README.md
@@ -1,10 +1,15 @@
-# Simple Accuracy Benchmark for Optimized LLMs
+# Simple Accuracy Benchmark for Generative AI models
 
-Simple and quick accuracy test for compressed, quantized, pruned, distilled LLMs. It works with any model that suppors HuggingFace Transformers text generation API including:
-* HuggingFace Transformers compressed models via [Bitsandbytes](https://huggingface.co/docs/transformers/main_classes/quantization#transformers.BitsAndBytesConfig)
-* [GPTQ](https://huggingface.co/docs/transformers/main_classes/quantization#transformers.GPTQConfig) via HuggingFace API
-* Llama.cpp via [BigDL-LLM](https://github.com/intel-analytics/BigDL/tree/main/python/llm)
-* [OpenVINO](https://github.com/openvinotoolkit/openvino) and [NNCF](https://github.com/openvinotoolkit/nncf) via [Optimum-Intel](https://github.com/huggingface/optimum-intel)
+## Features
+
+* Simple and quick accuracy test for compressed, quantized, pruned, distilled LLMs. It works with any model that supports the HuggingFace Transformers text generation API, including:
+    * HuggingFace Transformers compressed models via [Bitsandbytes](https://huggingface.co/docs/transformers/main_classes/quantization#transformers.BitsAndBytesConfig)
+    * [GPTQ](https://huggingface.co/docs/transformers/main_classes/quantization#transformers.GPTQConfig) via HuggingFace API
+    * Llama.cpp via [BigDL-LLM](https://github.com/intel-analytics/BigDL/tree/main/python/llm)
+    * [OpenVINO](https://github.com/openvinotoolkit/openvino) and [NNCF](https://github.com/openvinotoolkit/nncf) via [Optimum-Intel](https://github.com/huggingface/optimum-intel)
+* Support for custom datasets of the user's choice
+* Validation of text-to-image pipelines. Computes a similarity score between generated images:
+    * Supports the Diffusers library and Optimum-Intel via the `Text2ImageEvaluator` class.
 
 The main idea is to compare similarity of text generation between baseline and optimized LLMs.

@@ -19,7 +24,7 @@ base_small = AutoModelForCausalLM.from_pretrained(model_id)
 optimized_model = AutoModelForCausalLM.from_pretrained(model_id, load_in_4bit=True, device_map="auto")
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 
-evaluator = whowhatbench.Evaluator(base_model=base_small, tokenizer=tokenizer)
+evaluator = whowhatbench.TextEvaluator(base_model=base_small, tokenizer=tokenizer)
 metrics_per_prompt, metrics = evaluator.score(optimized_model)
 
 metric_of_interest = "similarity"
@@ -50,7 +55,7 @@ metrics_per_prompt, metrics = evaluator.score(optimized_model, test_data=prompts
 * source eval_env/bin/activate
 * pip install -r requirements.txt
 
-### CLI example
+### CLI example for text-generation models
 
 ```sh
 wwb --help
@@ -87,6 +92,18 @@ wwb --base-model meta-llama/Llama-2-7b-chat-hf --gt-data llama_2_7b_wwb_gt.csv -
 wwb --base-model meta-llama/Llama-2-7b-chat-hf --gt-data llama_2_7b_wwb_gt.csv --hf
 ```
+
+### Example of Stable Diffusion comparison
+```sh
+# Export FP16 model
+optimum-cli export openvino -m SimianLuo/LCM_Dreamshaper_v7 --weight-format fp16 sd-lcm-fp16
+# Export INT8 WOQ model
+optimum-cli export openvino -m SimianLuo/LCM_Dreamshaper_v7 --weight-format int8 sd-lcm-int8
+# Collect the references
+wwb --base-model sd-lcm-fp16 --gt-data lcm_test/sd_xl.json --model-type text-to-image
+# Compute the metric
+wwb --target-model sd-lcm-int8 --gt-data lcm_test/sd_xl.json --model-type text-to-image
+```
 
 ### Supported metrics
 
 * `similarity` - averaged similarity measured by neural network trained for sentence embeddings. The best is 1.0, the minimum is 0.0, higher-better.
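The new `Text2ImageEvaluator` from the Features list can also be driven from Python rather than the CLI. Below is a minimal sketch, assuming the image evaluator mirrors the `TextEvaluator` interface shown earlier in the README; the constructor arguments and `score` call are illustrative and not verified against the source.

```python
# Minimal sketch: Python-API version of the CLI comparison above.
# Assumes Text2ImageEvaluator follows the same pattern as TextEvaluator;
# exact constructor arguments may differ in the repository.
import whowhatbench
from optimum.intel import OVPipelineForText2Image

# Pipelines exported by the optimum-cli commands above
base_model = OVPipelineForText2Image.from_pretrained("sd-lcm-fp16")
optimized_model = OVPipelineForText2Image.from_pretrained("sd-lcm-int8")

# Generate reference images with the base pipeline, then score the
# optimized pipeline against them
evaluator = whowhatbench.Text2ImageEvaluator(base_model=base_model)
metrics_per_prompt, metrics = evaluator.score(optimized_model)
print(metrics["similarity"])
```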
8 changes: 4 additions & 4 deletions llm_bench/python/who_what_benchmark/tests/test_cli_image.py
@@ -19,9 +19,9 @@ def run_wwb(args):
 @pytest.mark.parametrize(
     ("model_id", "model_type", "backend"),
     [
-        ("hf-internal-testing/tiny-stable-diffusion-torch", "sd", "hf"),
-        ("hf-internal-testing/tiny-stable-diffusion-torch", "sd", "openvino"),
-        ("hf-internal-testing/tiny-stable-diffusion-xl-pipe", "sd-xl", "hf"),
+        ("hf-internal-testing/tiny-stable-diffusion-torch", "text-to-image", "hf"),
+        ("hf-internal-testing/tiny-stable-diffusion-torch", "text-to-image", "openvino"),
+        ("hf-internal-testing/tiny-stable-diffusion-xl-pipe", "text-to-image", "hf"),
     ],
 )
 def test_image_model_types(model_id, model_type, backend):
@@ -61,7 +61,7 @@ def test_image_model_types(model_id, model_type, backend):
 @pytest.mark.parametrize(
     ("model_id", "model_type", "backend"),
     [
-        ("hf-internal-testing/tiny-stable-diffusion-torch", "sd", "hf"),
+        ("hf-internal-testing/tiny-stable-diffusion-torch", "text-to-image", "hf"),
     ],
 )
 def test_image_custom_dataset(model_id, model_type, backend):
17 changes: 2 additions & 15 deletions llm_bench/python/who_what_benchmark/whowhatbench/registry.py
@@ -1,25 +1,12 @@
-from abc import ABC, abstractmethod
-
-from optimum.intel import (
-    OVLatentConsistencyModelPipeline,
-    OVStableDiffusionPipeline,
-    OVStableDiffusionXLPipeline,
-)
+from abc import ABC, abstractmethod
 
 
 # Registry for evaluators
 EVALUATOR_REGISTRY = {}
 MODELTYPE2TASK = {
     "text": "text-generation",
-    "sd": "image-generation",
-    "sd-xl": "image-generation",
-    "sd-lcm": "image-generation",
-}
-
-TEXT2IMAGE_TASK2CLASS = {
-    "sd": OVStableDiffusionPipeline,
-    "sd-xl": OVStableDiffusionXLPipeline,
-    "sd-lcm": OVLatentConsistencyModelPipeline,
+    "text-to-image": "text-to-image",
 }


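The mapping above feeds a decorator-based registry: `register_evaluator` records each evaluator class under a task name in `EVALUATOR_REGISTRY`, and `wwb.py` resolves `--model-type` to a class through `MODELTYPE2TASK`. A simplified, self-contained sketch of that mechanism (not the repository's exact code):

```python
# Simplified sketch of the evaluator registry pattern in registry.py.
# Shown to illustrate the mechanism; the real classes subclass BaseEvaluator.
EVALUATOR_REGISTRY = {}
MODELTYPE2TASK = {
    "text": "text-generation",
    "text-to-image": "text-to-image",
}


def register_evaluator(*names):
    """Class decorator: a module registers its evaluator under task names."""
    def decorate(cls):
        for name in names:
            EVALUATOR_REGISTRY[name] = cls
        return cls
    return decorate


@register_evaluator("text-to-image")
class Text2ImageEvaluator:  # stand-in for the real BaseEvaluator subclass
    pass


# wwb.py-style lookup: model type -> task -> evaluator class
task = MODELTYPE2TASK["text-to-image"]
assert EVALUATOR_REGISTRY[task] is Text2ImageEvaluator
```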
@@ -17,11 +17,16 @@
         "Illustration of an astronaut sitting in outer space, moon behind him",
         "A vintage illustration of a retro computer, vaporwave aesthetic, light pink and light blue",
         "A view from beautiful alien planet, very beautiful, surealism, retro astronaut on the first plane, 8k photo",
+        "red car in snowy forest, epic vista, beautiful landscape, 4k, 8k",
+        "A raccoon trapped inside a glass jar full of colorful candies, the background is steamy with vivid colors",
+        "cute cat 4k, high-res, masterpiece, best quality, soft lighting, dynamic angle",
+        "A cat holding a sign that says hello OpenVINO",
+        "A small cactus with a happy face in the Sahara desert.",
     ],
 }
 
 
-@register_evaluator("image-generation")
+@register_evaluator("text-to-image")
 class Text2ImageEvaluator(BaseEvaluator):
     def __init__(
         self,
20 changes: 7 additions & 13 deletions llm_bench/python/who_what_benchmark/whowhatbench/wwb.py
@@ -11,11 +11,7 @@
 from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM
 
 from optimum.exporters.tasks import TasksManager
-from optimum.intel import (
-    OVLatentConsistencyModelPipeline,
-    OVStableDiffusionPipeline,
-    OVStableDiffusionXLPipeline,
-)
+from optimum.intel import OVPipelineForText2Image
 
 import openvino_genai
 from whowhatbench import EVALUATOR_REGISTRY, MODELTYPE2TASK
@@ -95,9 +91,7 @@ def load_text_model(
 
 
 TEXT2IMAGE_TASK2CLASS = {
-    "sd": OVStableDiffusionPipeline,
-    "sd-xl": OVStableDiffusionXLPipeline,
-    "sd-lcm": OVLatentConsistencyModelPipeline,
+    "text-to-image": OVPipelineForText2Image,
 }


@@ -142,7 +136,7 @@ def load_model(
 
     if model_type == "text":
         return load_text_model(model_id, device, ov_config, use_hf, use_genai)
-    elif MODELTYPE2TASK[model_type] == "image-generation":
+    elif MODELTYPE2TASK[model_type] == "text-to-image":
         return load_text2image_model(
             model_type, model_id, device, ov_config, use_hf, use_genai
         )
@@ -203,9 +197,9 @@ def parse_args():
     parser.add_argument(
         "--model-type",
         type=str,
-        choices=["text", "sd", "sd-xl", "sd-lcm"],
+        choices=["text", "text-to-image"],
         default="text",
-        help="Indicated the model type, e.g. 'text', 'sd'.",
+        help="Indicates the model type: 'text' for LLMs, 'text-to-image' for text-to-image pipelines.",
     )
     parser.add_argument(
         "--data-encoder",
@@ -367,7 +361,7 @@ def get_evaluator(base_model, args):
             language=args.language,
             gen_answer_fn=genai_gen_answer if args.genai else None,
         )
-    elif task == "image-generation":
+    elif task == "text-to-image":
         return EvaluatorCLS(
             base_model=base_model,
             gt_data=args.gt_data,
@@ -467,7 +461,7 @@ def main():
     if args.verbose and args.target_model is not None:
         if args.model_type == "text":
             print_text_results(evaluator)
-        elif "sd" in args.model_type:
+        elif "text-to-image" in args.model_type:
             print_image_results(evaluator)


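The net effect in `wwb.py` is that all three Stable Diffusion variants now load through a single `OVPipelineForText2Image` class. A hedged sketch of the resulting text-to-image load path follows; the helper's exact signature and device handling in the repository may differ from what is shown here.

```python
# Hedged sketch of the simplified text-to-image load path after this
# commit: one OVPipelineForText2Image class instead of three
# architecture-specific pipelines. Signature details are illustrative.
from diffusers import DiffusionPipeline
from optimum.intel import OVPipelineForText2Image

TEXT2IMAGE_TASK2CLASS = {
    "text-to-image": OVPipelineForText2Image,
}


def load_text2image_model(model_type, model_id, device="CPU", ov_config=None, use_hf=False):
    if use_hf:
        # Reference PyTorch pipeline via the Diffusers auto class
        return DiffusionPipeline.from_pretrained(model_id)
    # OpenVINO pipeline resolved through the task-to-class map
    pipeline_cls = TEXT2IMAGE_TASK2CLASS[model_type]
    return pipeline_cls.from_pretrained(model_id, device=device, ov_config=ov_config)
```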
