diff --git a/tools/llm_bench/llm_bench_utils/ov_utils.py b/tools/llm_bench/llm_bench_utils/ov_utils.py
index 5f7fd5c7f1..f5d4452e30 100644
--- a/tools/llm_bench/llm_bench_utils/ov_utils.py
+++ b/tools/llm_bench/llm_bench_utils/ov_utils.py
@@ -8,6 +8,7 @@
 import logging as log
 import torch
 import time
+import json
 import types
 from llm_bench_utils.hook_common import get_bench_hook
 from llm_bench_utils.config_class import OV_MODEL_CLASSES_MAPPING, TOKENIZE_CLASSES_MAPPING, DEFAULT_MODEL_CLASSES
@@ -286,7 +287,8 @@ def create_image_gen_model(model_path, device, **kwargs):
         raise RuntimeError(f'==Failure ==: model path:{model_path} does not exist')
     else:
         if kwargs.get("genai", False) and is_genai_available(log_msg=True):
-            return create_genai_image_gen_model(model_path, device, ov_config, **kwargs)
+            log.warning("GenAI pipeline is not supported for this task. Switching to default benchmarking")
+            # return create_genai_image_gen_model(model_path, device, ov_config, **kwargs)
 
         start = time.perf_counter()
         ov_model = model_class.from_pretrained(model_path, device=device, ov_config=ov_config)
@@ -296,6 +298,39 @@ def create_image_gen_model(model_path, device, **kwargs):
     return ov_model, from_pretrained_time, False
 
 
+def get_genai_clip_text_encoder(model_index_data, model_path, device, ov_config):
+    import openvino_genai
+    text_encoder_type = model_index_data.get("text_encoder", [])
+    if ("CLIPTextModel" in text_encoder_type):
+        text_encoder = openvino_genai.CLIPTextModel(model_path / "text_encoder", device.upper(), **ov_config)
+    else:
+        raise RuntimeError(f'==Failure ==: model at path:{model_path} has unsupported text encoder type {text_encoder_type}')
+
+    return text_encoder
+
+
+def get_genai_clip_text_encoder_with_projection(model_index_data, model_path, text_encoder_path, device, ov_config):
+    import openvino_genai
+    text_encoder_type = model_index_data.get(text_encoder_path, [])
+    if ("CLIPTextModelWithProjection" in text_encoder_type):
+        text_encoder = openvino_genai.CLIPTextModelWithProjection(model_path / text_encoder_path, device.upper(), **ov_config)
+    else:
+        raise RuntimeError(f'==Failure ==: model at path:{model_path} has unsupported {text_encoder_path} type {text_encoder_type}')
+
+    return text_encoder
+
+
+def get_genai_unet_model(model_index_data, model_path, device, ov_config):
+    import openvino_genai
+    unet_type = model_index_data.get("unet", [])
+    if ("UNet2DConditionModel" in unet_type):
+        unet = openvino_genai.UNet2DConditionModel(model_path / "unet", device.upper(), **ov_config)
+    else:
+        raise RuntimeError(f'==Failure ==: model at path:{model_path} has unsupported UNet type {unet_type}')
+
+    return unet
+
+
 def create_genai_image_gen_model(model_path, device, ov_config, **kwargs):
     import openvino_genai
 
@@ -303,8 +338,43 @@ def create_genai_image_gen_model(model_path, device, ov_config, **kwargs):
     if adapter_config:
         ov_config['adapters'] = adapter_config
 
+    data = {}
+    with open(str(model_path / "model_index.json"), 'r') as f:
+        data = json.load(f)
+
+    model_class_name = data.get("_class_name", "")
+
     start = time.perf_counter()
-    t2i_pipe = openvino_genai.Text2ImagePipeline(model_path, device.upper(), **ov_config)
+
+    scheduler_type = data.get("scheduler", ["", ""])[1]
+    if (scheduler_type not in ["LCMScheduler", "DDIMScheduler", "LMSDiscreteScheduler", "EulerDiscreteScheduler", "FlowMatchEulerDiscreteScheduler"]):
+        scheduler = openvino_genai.Scheduler.from_config(model_path / "scheduler/scheduler_config.json", openvino_genai.Scheduler.Type.DDIM)
+        log.warning(f'Scheduler type {scheduler_type} is unsupported. Please be aware that it will be replaced with DDIMScheduler')
+
+        vae_type = data.get("vae", [])
+        if ("AutoencoderKL" in vae_type):
+            vae = openvino_genai.AutoencoderKL(model_path / "vae_decoder", device.upper(), **ov_config)
+        else:
+            raise RuntimeError(f'==Failure ==: model at path:{model_path} has unsupported vae decoder type {vae_type}')
+
+        if model_class_name == "StableDiffusionPipeline":
+            text_encoder = get_genai_clip_text_encoder(data, model_path, device, ov_config)
+            unet = get_genai_unet_model(data, model_path, device, ov_config)
+            t2i_pipe = openvino_genai.Text2ImagePipeline.stable_diffusion(scheduler, text_encoder, unet, vae)
+        elif model_class_name == "LatentConsistencyModelPipeline":
+            text_encoder = get_genai_clip_text_encoder(data, model_path, device, ov_config)
+            unet = get_genai_unet_model(data, model_path, device, ov_config)
+            t2i_pipe = openvino_genai.Text2ImagePipeline.latent_consistency_model(scheduler, text_encoder, unet, vae)
+        elif model_class_name == "StableDiffusionXLPipeline":
+            clip_text_encoder = get_genai_clip_text_encoder(data, model_path, device, ov_config)
+            clip_text_encoder_2 = get_genai_clip_text_encoder_with_projection(data, model_path, "text_encoder_2", device, ov_config)
+            unet = get_genai_unet_model(data, model_path, device, ov_config)
+            t2i_pipe = openvino_genai.Text2ImagePipeline.stable_diffusion_xl(scheduler, clip_text_encoder, clip_text_encoder_2, unet, vae)
+        else:
+            raise RuntimeError(f'==Failure ==: model at path:{model_path} has unsupported _class_name {model_class_name}')
+    else:
+        t2i_pipe = openvino_genai.Text2ImagePipeline(model_path, device.upper(), **ov_config)
+
     end = time.perf_counter()
     log.info(f'Pipeline initialization time: {end - start:.2f}s')
     return t2i_pipe, end - start, True
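For reference, a minimal sketch of the model_index.json lookup the new loader code relies on; the model directory name below is hypothetical, and the entries follow the Diffusers convention of [library, class_name] pairs, which is why index [1] yields the class name that the scheduler and component checks above compare against:

    # Sketch only: "./stable-diffusion-ov" is a hypothetical exported model directory.
    import json
    from pathlib import Path

    model_path = Path("./stable-diffusion-ov")
    with open(str(model_path / "model_index.json"), 'r') as f:
        data = json.load(f)

    print(data.get("_class_name", ""))         # e.g. "StableDiffusionPipeline"
    print(data.get("scheduler", ["", ""])[1])  # e.g. "LCMScheduler" -> supported, no DDIM fallback
    print(data.get("text_encoder", []))        # e.g. ["transformers", "CLIPTextModel"]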
diff --git a/tools/who_what_benchmark/whowhatbench/text2image_evaluator.py b/tools/who_what_benchmark/whowhatbench/text2image_evaluator.py
index 81da3c7270..45b8c3a861 100644
--- a/tools/who_what_benchmark/whowhatbench/text2image_evaluator.py
+++ b/tools/who_what_benchmark/whowhatbench/text2image_evaluator.py
@@ -120,15 +120,6 @@ def worst_examples(self, top_k: int = 5, metric="similarity"):
         return res
 
     def _generate_data(self, model, gen_image_fn=None, image_dir="reference"):
-        if hasattr(model, "reshape") and self.resolution is not None:
-            if gen_image_fn is None:
-                model.reshape(
-                    batch_size=1,
-                    height=self.resolution[0],
-                    width=self.resolution[1],
-                    num_images_per_prompt=1,
-                )
-
         def default_gen_image_fn(model, prompt, num_inference_steps, generator=None):
             output = model(
                 prompt,
diff --git a/tools/who_what_benchmark/whowhatbench/wwb.py b/tools/who_what_benchmark/whowhatbench/wwb.py
index 61a1c6d505..48214d71ed 100644
--- a/tools/who_what_benchmark/whowhatbench/wwb.py
+++ b/tools/who_what_benchmark/whowhatbench/wwb.py
@@ -308,7 +308,7 @@ def parse_args():
     parser.add_argument(
         "--image-size",
         type=int,
-        default=512,
+        default=None,
         help="Text-to-image specific parameter that defines the image resolution.",
     )
     parser.add_argument(
@@ -388,13 +388,20 @@ def genai_gen_answer(model, tokenizer, question, max_new_tokens, skip_question):
 
 
 def genai_gen_image(model, prompt, num_inference_steps, generator=None):
-    image_tensor = model.generate(
-        prompt,
-        width=model.resolution[0],
-        height=model.resolution[1],
-        num_inference_steps=num_inference_steps,
-        generator=generator,
-    )
+    if model.resolution[0] is not None:
+        image_tensor = model.generate(
+            prompt,
+            width=model.resolution[0],
+            height=model.resolution[1],
+            num_inference_steps=num_inference_steps,
+            generator=generator,
+        )
+    else:
+        image_tensor = model.generate(
+            prompt,
+            num_inference_steps=num_inference_steps,
+            generator=generator,
+        )
     image = Image.fromarray(image_tensor.data[0])
     return image
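A minimal usage sketch of the resolution fallback introduced above: when --image-size is left at its new None default, width and height are simply not passed to generate() and the model's native resolution is used. The model directory, prompt, and step count are hypothetical; it assumes openvino_genai and Pillow are installed:

    # Sketch only: mirrors genai_gen_image's two call paths.
    import openvino_genai
    from PIL import Image

    pipe = openvino_genai.Text2ImagePipeline("./stable-diffusion-ov", "CPU")  # hypothetical model dir
    resolution = (None, None)  # e.g. (512, 512) when --image-size is given

    if resolution[0] is not None:
        image_tensor = pipe.generate("a beach at sunset", width=resolution[0], height=resolution[1],
                                     num_inference_steps=20)
    else:
        image_tensor = pipe.generate("a beach at sunset", num_inference_steps=20)

    Image.fromarray(image_tensor.data[0]).save("result.png")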