From 4c185f0c9efc1e00fbd7b0f482c64339a03b86a0 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 17 Nov 2023 11:50:14 +0530 Subject: [PATCH 01/99] add poc for benchmarking workflow. --- .github/workflows/benchmark.yml | 61 +++++++++++++++++++++++++++++++++ benchmarks/benchmark_sd.py | 60 ++++++++++++++++++++++++++++++++ benchmarks/benchmark_utils.py | 46 +++++++++++++++++++++++++ 3 files changed, 167 insertions(+) create mode 100644 .github/workflows/benchmark.yml create mode 100644 benchmarks/benchmark_sd.py create mode 100644 benchmarks/benchmark_utils.py diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml new file mode 100644 index 000000000000..20dd69a47ce7 --- /dev/null +++ b/.github/workflows/benchmark.yml @@ -0,0 +1,61 @@ +name: Benchmarking tests + +on: + pull_request: + branches: + - main + push: + branches: + - ci-* + +env: + DIFFUSERS_IS_CI: yes + HF_HOME: /mnt/cache + OMP_NUM_THREADS: 8 + MKL_NUM_THREADS: 8 + PYTEST_TIMEOUT: 600 + RUN_SLOW: yes + PIPELINE_USAGE_CUTOFF: 50000 + +jobs: + torch_pipelines_cuda_benchmark_tests: + name: Torch Core Pipelines CUDA Benchmarking Tests + strategy: + fail-fast: false + max-parallel: 1 + runs-on: docker-gpu + container: + image: diffusers/diffusers-pytorch-cuda + options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0 + steps: + - name: Checkout diffusers + uses: actions/checkout@v3 + with: + fetch-depth: 2 + - name: NVIDIA-SMI + run: | + nvidia-smi + - name: Install dependencies + run: | + apt-get update && apt-get install libsndfile1-dev libgl1 -y + python -m pip install -e .[quality,test] + python -m pip install git+https://github.com/huggingface/accelerate.git + mkdir benchmark_outputs + - name: Environment + run: | + python utils/print_env.py + - name: Stable Diffusion Benchmarking Tests + env: + HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + run: | + cd benchmarks && python benchmark_sd.py && \ + python benchmark_sd.py --batch_size 4 && \ + python benchmark_sd.py --run_compile && \ + python benchmark_sd.py --batch_size 4 --run_compile + + - name: Test suite reports artifacts + if: ${{ always() }} + uses: actions/upload-artifact@v2 + with: + name: benchmark_test_reports + path: benchmark_outputs \ No newline at end of file diff --git a/benchmarks/benchmark_sd.py b/benchmarks/benchmark_sd.py new file mode 100644 index 000000000000..6d6a1f6bfb63 --- /dev/null +++ b/benchmarks/benchmark_sd.py @@ -0,0 +1,60 @@ +import argparse +import os +import torch +from diffusers import DiffusionPipeline +from .benchmark_utils import benchmark_fn, bytes_to_giga_bytes, BenchmarkInfo, generate_markdown_table + +CKPT = "CompVis/stable-diffusion-v1-4" +PROMPT = "ghibli style, a fantasy landscape with castles" +BASE_PATH = "benchmark_outputs" + + +def load_pipeline(run_compile=False, with_tensorrt=False): + pipe = DiffusionPipeline.from_pretrained( + CKPT, torch_dtype=torch.float16, use_safetensors=True + ) + pipe = pipe.to("cuda") + + if run_compile: + pipe.unet.to(memory_format=torch.channels_last) + print("Run torch compile") + pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) + + pipe.set_progress_bar_config(disable=True) + return pipe + + +def run_inference(pipe, args): + _ = pipe( + prompt=PROMPT, + num_inference_steps=args.num_inference_steps, + num_images_per_prompt=args.batch_size, + ) + +def main(args): + pipeline = load_pipeline( + run_compile=args.run_compile, with_tensorrt=args.with_tensorrt + ) + + time = benchmark_fn(run_inference, pipeline, args) # in 
seconds. + memory = bytes_to_giga_bytes(torch.cuda.max_memory_allocated()) # in GBs. + benchmark_info = BenchmarkInfo(time=time, memory=memory) + + markdown_report = "" + markdown_report = generate_markdown_table(pipeline_name=CKPT, args=args, benchmark_info=benchmark_info, markdown_report=markdown_report) + return markdown_report + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--batch_size", type=int, default=1) + parser.add_argument("--num_inference_steps", type=int, default=50) + parser.add_argument("--run_compile", action="store_true") + args = parser.parse_args() + markdown_report = main(args) + + name = CKPT + f"-batch_sze@{args.batch_size}-num_inference_steps@{args.num_inference_steps}--run_compile@{args.run_compile}" + filepath = os.path.join(BASE_PATH, name) + with open(filepath, "w") as f: + f.write(markdown_report) + + \ No newline at end of file diff --git a/benchmarks/benchmark_utils.py b/benchmarks/benchmark_utils.py new file mode 100644 index 000000000000..78f7368d7ceb --- /dev/null +++ b/benchmarks/benchmark_utils.py @@ -0,0 +1,46 @@ +import gc +import torch +import torch.utils.benchmark as benchmark +from dataclasses import dataclass +import argparse + +@dataclass +class BenchmarkInfo: + time: float + memory: float + + +def flush(): + gc.collect() + torch.cuda.empty_cache() + +def bytes_to_giga_bytes(bytes): + return bytes / 1024 / 1024 / 1024 + + +# Adapted from +# https://pytorch.org/tutorials/intermediate/scaled_dot_product_attention_tutorial.html +def benchmark_fn(f, *args, **kwargs): + t0 = benchmark.Timer( + stmt="f(*args, **kwargs)", globals={"args": args, "kwargs": kwargs, "f": f} + ) + return f"{(t0.blocked_autorange().mean):.3f}" + +def generate_markdown_table(pipeline_name: str, args: argparse.Namespace, benchmark_info: BenchmarkInfo) -> str: + headers = ["**Parameter**", "**Value**"] + data = [ + ["Batch Size", args.batch_size], + ["Number of Inference Steps", args.num_inference_steps], + ["Run Compile", args.run_compile], + ["Time (seconds)", benchmark_info.time], + ["Memory (GBs)", benchmark_info.memory] + ] + + # Formatting the table. 
+ markdown_table = f"## {pipeline_name}\n\n" + markdown_table += "| " + " | ".join(headers) + " |\n" + markdown_table += "|-" + "-|-".join(['' for _ in headers]) + "-|\n" + for row in data: + markdown_table += "| " + " | ".join(str(item) for item in row) + " |\n" + + return markdown_table \ No newline at end of file From 945ab176a082c7608c67a789af432dc1d88b34a8 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 17 Nov 2023 12:02:33 +0530 Subject: [PATCH 02/99] import --- benchmarks/benchmark_sd.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/benchmarks/benchmark_sd.py b/benchmarks/benchmark_sd.py index 6d6a1f6bfb63..e75d7536c6e1 100644 --- a/benchmarks/benchmark_sd.py +++ b/benchmarks/benchmark_sd.py @@ -2,7 +2,12 @@ import os import torch from diffusers import DiffusionPipeline -from .benchmark_utils import benchmark_fn, bytes_to_giga_bytes, BenchmarkInfo, generate_markdown_table + +import sys + +sys.path.append(".") + +from benchmark_utils import benchmark_fn, bytes_to_giga_bytes, BenchmarkInfo, generate_markdown_table CKPT = "CompVis/stable-diffusion-v1-4" PROMPT = "ghibli style, a fantasy landscape with castles" From b4debda668342c4d78bb200b1538d7c3a5618140 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 17 Nov 2023 12:09:22 +0530 Subject: [PATCH 03/99] fix argument --- benchmarks/benchmark_sd.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/benchmarks/benchmark_sd.py b/benchmarks/benchmark_sd.py index e75d7536c6e1..b5685c77a6b5 100644 --- a/benchmarks/benchmark_sd.py +++ b/benchmarks/benchmark_sd.py @@ -14,7 +14,7 @@ BASE_PATH = "benchmark_outputs" -def load_pipeline(run_compile=False, with_tensorrt=False): +def load_pipeline(run_compile=False): pipe = DiffusionPipeline.from_pretrained( CKPT, torch_dtype=torch.float16, use_safetensors=True ) @@ -37,9 +37,7 @@ def run_inference(pipe, args): ) def main(args): - pipeline = load_pipeline( - run_compile=args.run_compile, with_tensorrt=args.with_tensorrt - ) + pipeline = load_pipeline(run_compile=args.run_compile) time = benchmark_fn(run_inference, pipeline, args) # in seconds. memory = bytes_to_giga_bytes(torch.cuda.max_memory_allocated()) # in GBs. From 22966a1fdc1f0fad31af055d93bb6795fed46e12 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 17 Nov 2023 12:28:48 +0530 Subject: [PATCH 04/99] fix: argument --- benchmarks/benchmark_sd.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/benchmarks/benchmark_sd.py b/benchmarks/benchmark_sd.py index b5685c77a6b5..27b6ae44b31e 100644 --- a/benchmarks/benchmark_sd.py +++ b/benchmarks/benchmark_sd.py @@ -43,8 +43,7 @@ def main(args): memory = bytes_to_giga_bytes(torch.cuda.max_memory_allocated()) # in GBs. 
benchmark_info = BenchmarkInfo(time=time, memory=memory) - markdown_report = "" - markdown_report = generate_markdown_table(pipeline_name=CKPT, args=args, benchmark_info=benchmark_info, markdown_report=markdown_report) + markdown_report = generate_markdown_table(pipeline_name=CKPT, args=args, benchmark_info=benchmark_info) return markdown_report if __name__ == "__main__": From 12424a3a992e065eaa52dcd112022c502f7954c0 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 17 Nov 2023 12:43:55 +0530 Subject: [PATCH 05/99] fix: path --- benchmarks/benchmark_sd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmark_sd.py b/benchmarks/benchmark_sd.py index 27b6ae44b31e..02fe156d84ff 100644 --- a/benchmarks/benchmark_sd.py +++ b/benchmarks/benchmark_sd.py @@ -54,7 +54,7 @@ def main(args): args = parser.parse_args() markdown_report = main(args) - name = CKPT + f"-batch_sze@{args.batch_size}-num_inference_steps@{args.num_inference_steps}--run_compile@{args.run_compile}" + name = CKPT + f"-batch_sze@{args.batch_size}-num_inference_steps@{args.num_inference_steps}--run_compile@{args.run_compile}.md" filepath = os.path.join(BASE_PATH, name) with open(filepath, "w") as f: f.write(markdown_report) From 122d5d90869218bf2a31dd4e2ace693483cc90a7 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 17 Nov 2023 13:00:52 +0530 Subject: [PATCH 06/99] fix --- benchmarks/benchmark_sd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmark_sd.py b/benchmarks/benchmark_sd.py index 02fe156d84ff..d11999bf8696 100644 --- a/benchmarks/benchmark_sd.py +++ b/benchmarks/benchmark_sd.py @@ -54,7 +54,7 @@ def main(args): args = parser.parse_args() markdown_report = main(args) - name = CKPT + f"-batch_sze@{args.batch_size}-num_inference_steps@{args.num_inference_steps}--run_compile@{args.run_compile}.md" + name = CKPT.replace("/", "_") + f"-batch_sze@{args.batch_size}-num_inference_steps@{args.num_inference_steps}--run_compile@{args.run_compile}.md" filepath = os.path.join(BASE_PATH, name) with open(filepath, "w") as f: f.write(markdown_report) From c20d254ebaf98904d0f12b45d352734853877eda Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 17 Nov 2023 13:19:36 +0530 Subject: [PATCH 07/99] fix --- .github/workflows/benchmark.yml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 20dd69a47ce7..dc867df77430 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -40,15 +40,13 @@ jobs: apt-get update && apt-get install libsndfile1-dev libgl1 -y python -m pip install -e .[quality,test] python -m pip install git+https://github.com/huggingface/accelerate.git - mkdir benchmark_outputs - name: Environment run: | python utils/print_env.py - name: Stable Diffusion Benchmarking Tests - env: - HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} run: | - cd benchmarks && python benchmark_sd.py && \ + cd benchmarks && mkdir benchmark_outputs && \ + python benchmark_sd.py && \ python benchmark_sd.py --batch_size 4 && \ python benchmark_sd.py --run_compile && \ python benchmark_sd.py --batch_size 4 --run_compile From 43544edbf853d402eeda4e783e5888cedfc11483 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 17 Nov 2023 13:34:04 +0530 Subject: [PATCH 08/99] path --- .github/workflows/benchmark.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 
dc867df77430..e004aaa09978 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -56,4 +56,4 @@ jobs: uses: actions/upload-artifact@v2 with: name: benchmark_test_reports - path: benchmark_outputs \ No newline at end of file + path: benchmarks/benchmark_outputs \ No newline at end of file From 3c05e4179b91a3c88e6b6a3458862c46e8abde31 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 28 Nov 2023 09:28:16 +0530 Subject: [PATCH 09/99] output csv files. --- Makefile | 2 +- benchmarks/benchmark_sd.py | 49 +++++++++++------- benchmarks/benchmark_utils.py | 98 +++++++++++++++++++++++++---------- 3 files changed, 100 insertions(+), 49 deletions(-) diff --git a/Makefile b/Makefile index 1b81f551d36d..d0ed1cf2a982 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ # make sure to test the local checkout in scripts and not the pre-installed one (don't use quotes!) export PYTHONPATH = src -check_dirs := examples scripts src tests utils +check_dirs := examples scripts src tests utils benchmarks modified_only_fixup: $(eval modified_py_files := $(shell python utils/get_modified_files.py $(check_dirs))) diff --git a/benchmarks/benchmark_sd.py b/benchmarks/benchmark_sd.py index d11999bf8696..a5e0d4b9b6a4 100644 --- a/benchmarks/benchmark_sd.py +++ b/benchmarks/benchmark_sd.py @@ -1,23 +1,29 @@ import argparse import os +import sys + import torch + from diffusers import DiffusionPipeline -import sys sys.path.append(".") +from benchmark_utils import ( # noqa: E402 + BASE_PATH, + PROMPT, + BenchmarkInfo, + benchmark_fn, + bytes_to_giga_bytes, + generate_csv_dict, + write_to_csv, +) -from benchmark_utils import benchmark_fn, bytes_to_giga_bytes, BenchmarkInfo, generate_markdown_table CKPT = "CompVis/stable-diffusion-v1-4" -PROMPT = "ghibli style, a fantasy landscape with castles" -BASE_PATH = "benchmark_outputs" def load_pipeline(run_compile=False): - pipe = DiffusionPipeline.from_pretrained( - CKPT, torch_dtype=torch.float16, use_safetensors=True - ) + pipe = DiffusionPipeline.from_pretrained(CKPT, torch_dtype=torch.float16, use_safetensors=True) pipe = pipe.to("cuda") if run_compile: @@ -36,27 +42,30 @@ def run_inference(pipe, args): num_images_per_prompt=args.batch_size, ) -def main(args): + +def main(args) -> dict: pipeline = load_pipeline(run_compile=args.run_compile) - - time = benchmark_fn(run_inference, pipeline, args) # in seconds. - memory = bytes_to_giga_bytes(torch.cuda.max_memory_allocated()) # in GBs. + + time = benchmark_fn(run_inference, pipeline, args) # in seconds. + memory = bytes_to_giga_bytes(torch.cuda.max_memory_allocated()) # in GBs. 
benchmark_info = BenchmarkInfo(time=time, memory=memory) - - markdown_report = generate_markdown_table(pipeline_name=CKPT, args=args, benchmark_info=benchmark_info) - return markdown_report + + csv_dict = generate_csv_dict(pipeline=CKPT, args=args, benchmark_info=benchmark_info) + return csv_dict + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--batch_size", type=int, default=1) parser.add_argument("--num_inference_steps", type=int, default=50) + parser.add_argument("--model_cpu_offload", action="store_true") parser.add_argument("--run_compile", action="store_true") args = parser.parse_args() - markdown_report = main(args) + csv_dict = main(args) - name = CKPT.replace("/", "_") + f"-batch_sze@{args.batch_size}-num_inference_steps@{args.num_inference_steps}--run_compile@{args.run_compile}.md" + name = ( + CKPT.replace("/", "_") + + f"-bs@{args.batch_size}-steps@{args.num_inference_steps}-mco@{args.model_cpu_offload}-compile@{args.run_compile}.csv" + ) filepath = os.path.join(BASE_PATH, name) - with open(filepath, "w") as f: - f.write(markdown_report) - - \ No newline at end of file + write_to_csv(filepath, csv_dict) diff --git a/benchmarks/benchmark_utils.py b/benchmarks/benchmark_utils.py index 78f7368d7ceb..1fecd43964d8 100644 --- a/benchmarks/benchmark_utils.py +++ b/benchmarks/benchmark_utils.py @@ -1,46 +1,88 @@ -import gc -import torch -import torch.utils.benchmark as benchmark -from dataclasses import dataclass import argparse +import csv +import gc +import os +from dataclasses import dataclass +from typing import Any, Dict, List + +import torch +import torch.utils.benchmark as benchmark + + +GITHUB_SHA = os.getenv("GITHUB_SHA", None) +BENCHMARK_FIELDS = [ + "pipeline", + "batch_size", + "num_inference_steps", + "model_cpu_offload", + "run_compile", + "time (secs)", + "memory (gbs)", + "github_sha", +] + +PROMPT = "ghibli style, a fantasy landscape with castles" +BASE_PATH = "benchmark_outputs" + @dataclass class BenchmarkInfo: - time: float + time: float memory: float def flush(): + """Wipes off memory.""" gc.collect() torch.cuda.empty_cache() + torch.cuda.reset_max_memory_allocated() + torch.cuda.reset_peak_memory_stats() + def bytes_to_giga_bytes(bytes): return bytes / 1024 / 1024 / 1024 -# Adapted from -# https://pytorch.org/tutorials/intermediate/scaled_dot_product_attention_tutorial.html def benchmark_fn(f, *args, **kwargs): t0 = benchmark.Timer( - stmt="f(*args, **kwargs)", globals={"args": args, "kwargs": kwargs, "f": f} + stmt="f(*args, **kwargs)", + globals={"args": args, "kwargs": kwargs, "f": f}, + num_threads=torch.get_num_threads(), ) - return f"{(t0.blocked_autorange().mean):.3f}" - -def generate_markdown_table(pipeline_name: str, args: argparse.Namespace, benchmark_info: BenchmarkInfo) -> str: - headers = ["**Parameter**", "**Value**"] - data = [ - ["Batch Size", args.batch_size], - ["Number of Inference Steps", args.num_inference_steps], - ["Run Compile", args.run_compile], - ["Time (seconds)", benchmark_info.time], - ["Memory (GBs)", benchmark_info.memory] - ] - - # Formatting the table. 
- markdown_table = f"## {pipeline_name}\n\n" - markdown_table += "| " + " | ".join(headers) + " |\n" - markdown_table += "|-" + "-|-".join(['' for _ in headers]) + "-|\n" - for row in data: - markdown_table += "| " + " | ".join(str(item) for item in row) + " |\n" - - return markdown_table \ No newline at end of file + return f"{(t0.blocked_autorange().mean):.3f}" + + +def generate_csv_dict(pipeline: str, args: argparse.Namespace, benchmark_info: BenchmarkInfo) -> Dict[str, Any]: + """Packs benchmarking data into a dictionary for latter serialization.""" + data_dict = { + "pipeline": pipeline, + "batch_size": args.batch_size, + "num_inference_steps": args.num_inference_steps, + "model_cpu_offload": args.model_cpu_offload, + "run_compile": args.run_compile, + "time (secs)": benchmark_info.time, + "memory (gbs)": benchmark_info.memory, + "github_sha": GITHUB_SHA, + } + return data_dict + + +def write_to_csv(file_name: str, data_dict: Dict[str, Any]): + """Serializes a dictionary into a CSV file.""" + with open(file_name, mode="w", newline="") as csvfile: + writer = csv.DictWriter(csvfile, fieldnames=BENCHMARK_FIELDS) + writer.writeheader() + writer.writerow(data_dict) + + +def collate_csv(input_files: List[str], output_file: str): + """Collates multiple identically structured CSVs into a single CSV file.""" + with open(output_file, mode="w", newline="") as outfile: + writer = csv.DictWriter(outfile, fieldnames=BENCHMARK_FIELDS) + writer.writeheader() + + for file in input_files: + with open(file, mode="r") as infile: + reader = csv.DictReader(infile) + for row in reader: + writer.writerow(row) From 24b68fd911981da7373e4c76d4e8e0bab7da415d Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 28 Nov 2023 09:40:44 +0530 Subject: [PATCH 10/99] workflow cleanup --- .github/workflows/benchmark.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index e004aaa09978..d96f9fe955ce 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -13,9 +13,6 @@ env: HF_HOME: /mnt/cache OMP_NUM_THREADS: 8 MKL_NUM_THREADS: 8 - PYTEST_TIMEOUT: 600 - RUN_SLOW: yes - PIPELINE_USAGE_CUTOFF: 50000 jobs: torch_pipelines_cuda_benchmark_tests: From 8e2088ea35c1c75e57cd7eda9a601ff7a5b94bc5 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 28 Nov 2023 09:50:12 +0530 Subject: [PATCH 11/99] append token --- .github/workflows/benchmark.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index d96f9fe955ce..aa8a10ae7fb6 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -41,6 +41,8 @@ jobs: run: | python utils/print_env.py - name: Stable Diffusion Benchmarking Tests + env: + HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} run: | cd benchmarks && mkdir benchmark_outputs && \ python benchmark_sd.py && \ From 01584c786e75852d66b1d7880c23d5336a556c43 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 28 Nov 2023 10:03:17 +0530 Subject: [PATCH 12/99] add utility to push to hf dataset --- benchmarks/push_results.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 benchmarks/push_results.py diff --git a/benchmarks/push_results.py b/benchmarks/push_results.py new file mode 100644 index 000000000000..98af01fb1fe8 --- /dev/null +++ b/benchmarks/push_results.py @@ -0,0 +1,26 @@ +import glob +import os +import sys + +from huggingface_hub import upload_file + + +sys.path.append(".") +from 
benchmark_utils import BASE_PATH, collate_csv # noqa: E402 + + +FINAL_CSV_FILE = "collated_results.csv" +REPO_ID = "diffusers/benchmarks" +GITHUB_SHA = os.getenv("GITHUB_SHA", None) + + +def push_to_hf_dataset(): + all_csvs = sorted(glob.glob(f"{BASE_PATH}/*.csv")) + collate_csv(all_csvs, FINAL_CSV_FILE) + + commit_message = f"upload from sha: {GITHUB_SHA}" if GITHUB_SHA is not None else "upload benchmark results" + upload_file(repo_id=REPO_ID, path_or_fileobj=FINAL_CSV_FILE, repo_type="dataset", commit_message=commit_message) + + +if __name__ == "__main__": + push_to_hf_dataset() From 853035b9c19cbbd42de639266a42c9c62b868127 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 28 Nov 2023 10:04:44 +0530 Subject: [PATCH 13/99] fix: kw arg --- benchmarks/push_results.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/benchmarks/push_results.py b/benchmarks/push_results.py index 98af01fb1fe8..e3bb48df1d26 100644 --- a/benchmarks/push_results.py +++ b/benchmarks/push_results.py @@ -19,7 +19,13 @@ def push_to_hf_dataset(): collate_csv(all_csvs, FINAL_CSV_FILE) commit_message = f"upload from sha: {GITHUB_SHA}" if GITHUB_SHA is not None else "upload benchmark results" - upload_file(repo_id=REPO_ID, path_or_fileobj=FINAL_CSV_FILE, repo_type="dataset", commit_message=commit_message) + upload_file( + repo_id=REPO_ID, + path_in_repo=FINAL_CSV_FILE, + path_or_fileobj=FINAL_CSV_FILE, + repo_type="dataset", + commit_message=commit_message, + ) if __name__ == "__main__": From 46aaf96f5b45b2d68497d81386363dac61a5adf1 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 28 Nov 2023 10:22:58 +0530 Subject: [PATCH 14/99] better reporting --- benchmarks/benchmark_sd.py | 4 +++- benchmarks/benchmark_utils.py | 13 +++++++++---- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/benchmarks/benchmark_sd.py b/benchmarks/benchmark_sd.py index a5e0d4b9b6a4..754fb58c9e37 100644 --- a/benchmarks/benchmark_sd.py +++ b/benchmarks/benchmark_sd.py @@ -50,7 +50,9 @@ def main(args) -> dict: memory = bytes_to_giga_bytes(torch.cuda.max_memory_allocated()) # in GBs. 
benchmark_info = BenchmarkInfo(time=time, memory=memory) - csv_dict = generate_csv_dict(pipeline=CKPT, args=args, benchmark_info=benchmark_info) + csv_dict = generate_csv_dict( + pipeline_cls=str(pipeline.__class__.__name__), ckpt=CKPT, args=args, benchmark_info=benchmark_info + ) return csv_dict diff --git a/benchmarks/benchmark_utils.py b/benchmarks/benchmark_utils.py index 1fecd43964d8..307c3160387d 100644 --- a/benchmarks/benchmark_utils.py +++ b/benchmarks/benchmark_utils.py @@ -11,7 +11,8 @@ GITHUB_SHA = os.getenv("GITHUB_SHA", None) BENCHMARK_FIELDS = [ - "pipeline", + "pipeline_cls", + "ckpt_id", "batch_size", "num_inference_steps", "model_cpu_offload", @@ -20,9 +21,9 @@ "memory (gbs)", "github_sha", ] - PROMPT = "ghibli style, a fantasy landscape with castles" BASE_PATH = "benchmark_outputs" +TOTAL_GPU_MEMORY = torch.cuda.get_device_properties(0).total_memory / (1024**3) @dataclass @@ -52,16 +53,20 @@ def benchmark_fn(f, *args, **kwargs): return f"{(t0.blocked_autorange().mean):.3f}" -def generate_csv_dict(pipeline: str, args: argparse.Namespace, benchmark_info: BenchmarkInfo) -> Dict[str, Any]: +def generate_csv_dict( + pipeline_cls: str, ckpt: str, args: argparse.Namespace, benchmark_info: BenchmarkInfo +) -> Dict[str, Any]: """Packs benchmarking data into a dictionary for latter serialization.""" data_dict = { - "pipeline": pipeline, + "pipeline_cls": pipeline_cls, + "ckpt_id": ckpt, "batch_size": args.batch_size, "num_inference_steps": args.num_inference_steps, "model_cpu_offload": args.model_cpu_offload, "run_compile": args.run_compile, "time (secs)": benchmark_info.time, "memory (gbs)": benchmark_info.memory, + "actual_gpu_memory (gbs)": TOTAL_GPU_MEMORY, "github_sha": GITHUB_SHA, } return data_dict From d626eef65f24072bfeb455dce3e71d79a72c0981 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 28 Nov 2023 10:26:24 +0530 Subject: [PATCH 15/99] fix: headers --- benchmarks/benchmark_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/benchmark_utils.py b/benchmarks/benchmark_utils.py index 307c3160387d..1d51e89e275f 100644 --- a/benchmarks/benchmark_utils.py +++ b/benchmarks/benchmark_utils.py @@ -19,6 +19,7 @@ "run_compile", "time (secs)", "memory (gbs)", + "actual_gpu_memory (gbs)", "github_sha", ] PROMPT = "ghibli style, a fantasy landscape with castles" From ab12fe6a21d8e74e4845faa4e37e38a694ba2fbe Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 28 Nov 2023 10:38:32 +0530 Subject: [PATCH 16/99] better formatting of the numbers. 
--- benchmarks/benchmark_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/benchmark_utils.py b/benchmarks/benchmark_utils.py index 1d51e89e275f..9eb834444fac 100644 --- a/benchmarks/benchmark_utils.py +++ b/benchmarks/benchmark_utils.py @@ -42,7 +42,7 @@ def flush(): def bytes_to_giga_bytes(bytes): - return bytes / 1024 / 1024 / 1024 + return f"{(bytes / 1024 / 1024 / 1024)}:.3f" def benchmark_fn(f, *args, **kwargs): @@ -67,7 +67,7 @@ def generate_csv_dict( "run_compile": args.run_compile, "time (secs)": benchmark_info.time, "memory (gbs)": benchmark_info.memory, - "actual_gpu_memory (gbs)": TOTAL_GPU_MEMORY, + "actual_gpu_memory (gbs)": f"{(TOTAL_GPU_MEMORY):.3f}", "github_sha": GITHUB_SHA, } return data_dict From 1bb531eac7e3bb7e659d94c7cb2132ad69300236 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 28 Nov 2023 10:39:45 +0530 Subject: [PATCH 17/99] better type annotation --- benchmarks/benchmark_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/benchmark_utils.py b/benchmarks/benchmark_utils.py index 9eb834444fac..db40187a7d2d 100644 --- a/benchmarks/benchmark_utils.py +++ b/benchmarks/benchmark_utils.py @@ -3,7 +3,7 @@ import gc import os from dataclasses import dataclass -from typing import Any, Dict, List +from typing import Dict, List, Union import torch import torch.utils.benchmark as benchmark @@ -56,7 +56,7 @@ def benchmark_fn(f, *args, **kwargs): def generate_csv_dict( pipeline_cls: str, ckpt: str, args: argparse.Namespace, benchmark_info: BenchmarkInfo -) -> Dict[str, Any]: +) -> Dict[str, Union[str, bool, float]]: """Packs benchmarking data into a dictionary for latter serialization.""" data_dict = { "pipeline_cls": pipeline_cls, @@ -73,7 +73,7 @@ def generate_csv_dict( return data_dict -def write_to_csv(file_name: str, data_dict: Dict[str, Any]): +def write_to_csv(file_name: str, data_dict: Dict[str, Union[str, bool, float]]): """Serializes a dictionary into a CSV file.""" with open(file_name, mode="w", newline="") as csvfile: writer = csv.DictWriter(csvfile, fieldnames=BENCHMARK_FIELDS) From 2df4abae8b278db9477faf93711968b223fb1745 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 28 Nov 2023 10:47:10 +0530 Subject: [PATCH 18/99] fix: formatting --- benchmarks/benchmark_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmark_utils.py b/benchmarks/benchmark_utils.py index db40187a7d2d..a1e4f169c634 100644 --- a/benchmarks/benchmark_utils.py +++ b/benchmarks/benchmark_utils.py @@ -42,7 +42,7 @@ def flush(): def bytes_to_giga_bytes(bytes): - return f"{(bytes / 1024 / 1024 / 1024)}:.3f" + return f"{(bytes / 1024 / 1024 / 1024):.3f}" def benchmark_fn(f, *args, **kwargs): From 939fe5ccb57845148479974b3e70eb4bdeafa81d Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 28 Nov 2023 12:04:24 +0530 Subject: [PATCH 19/99] moentarily disable check --- .github/workflows/benchmark.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index aa8a10ae7fb6..c993dcd31ad9 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -23,7 +23,7 @@ jobs: runs-on: docker-gpu container: image: diffusers/diffusers-pytorch-cuda - options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0 + options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0 --env NVIDIA_DISABLE_REQUIRE=1 steps: - name: Checkout diffusers uses: 
actions/checkout@v3 From 3a18e2908f64dfa233a12267d33452229bdf8e64 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 28 Nov 2023 12:28:35 +0530 Subject: [PATCH 20/99] push results. --- .github/workflows/benchmark.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index c993dcd31ad9..8af55c011272 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -48,7 +48,8 @@ jobs: python benchmark_sd.py && \ python benchmark_sd.py --batch_size 4 && \ python benchmark_sd.py --run_compile && \ - python benchmark_sd.py --batch_size 4 --run_compile + python benchmark_sd.py --batch_size 4 --run_compile && \ + python push_results.py - name: Test suite reports artifacts if: ${{ always() }} From 71279b6f7d2704b44c60e2eba34acec38c18e8bd Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 28 Nov 2023 14:18:20 +0530 Subject: [PATCH 21/99] remove disable check --- .github/workflows/benchmark.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 8af55c011272..a20142bc36c1 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -23,7 +23,7 @@ jobs: runs-on: docker-gpu container: image: diffusers/diffusers-pytorch-cuda - options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0 --env NVIDIA_DISABLE_REQUIRE=1 + options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0 steps: - name: Checkout diffusers uses: actions/checkout@v3 From 3c8cc38a00ab72ce5017e609d673ecbef1940936 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Wed, 29 Nov 2023 16:56:27 +0530 Subject: [PATCH 22/99] introduce base classes. --- benchmarks/base_classes.py | 56 ++++++++++++++++ benchmarks/benchmark_sd.py | 73 +++++---------------- benchmarks/push_results.py | 2 +- benchmarks/{benchmark_utils.py => utils.py} | 0 4 files changed, 74 insertions(+), 57 deletions(-) create mode 100644 benchmarks/base_classes.py rename benchmarks/{benchmark_utils.py => utils.py} (100%) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py new file mode 100644 index 000000000000..23c8e881e235 --- /dev/null +++ b/benchmarks/base_classes.py @@ -0,0 +1,56 @@ +import os +import sys + +import torch + +from diffusers import DiffusionPipeline + + +sys.path.append(".") + +from benchmarks.utils import ( # noqa: E402 + BASE_PATH, + PROMPT, + BenchmarkInfo, + benchmark_fn, + bytes_to_giga_bytes, + generate_csv_dict, + write_to_csv, +) + + +class TextToImagePipeline: + def __init__(self, args): + pipe = DiffusionPipeline.from_pretrained(args.ckpt, torch_dtype=torch.float16, use_safetensors=True) + pipe = pipe.to("cuda") + + if args.run_compile: + pipe.unet.to(memory_format=torch.channels_last) + print("Run torch compile") + pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) + + pipe.set_progress_bar_config(disable=True) + self.pipe = pipe + + def run_inference(self, pipe, args): + _ = pipe( + prompt=PROMPT, + num_inference_steps=args.num_inference_steps, + num_images_per_prompt=args.batch_size, + ) + + def __call__(self, args): + time = benchmark_fn(self.run_inference, self.pipe, args) # in seconds. + memory = bytes_to_giga_bytes(torch.cuda.max_memory_allocated()) # in GBs. 
+ benchmark_info = BenchmarkInfo(time=time, memory=memory) + + csv_dict = generate_csv_dict( + pipeline_cls=str(self.pipe.__class__.__name__), ckpt=args.ckpt, args=args, benchmark_info=benchmark_info + ) + name = ( + args.ckpt.replace("/", "_") + + f"-bs@{args.batch_size}-steps@{args.num_inference_steps}-mco@{args.model_cpu_offload}-compile@{args.run_compile}.csv" + ) + filepath = os.path.join(BASE_PATH, name) + write_to_csv(filepath, csv_dict) + print(f"Logs written to: {filepath}") diff --git a/benchmarks/benchmark_sd.py b/benchmarks/benchmark_sd.py index 754fb58c9e37..c12677d9e420 100644 --- a/benchmarks/benchmark_sd.py +++ b/benchmarks/benchmark_sd.py @@ -1,73 +1,34 @@ import argparse -import os import sys -import torch - -from diffusers import DiffusionPipeline - sys.path.append(".") -from benchmark_utils import ( # noqa: E402 - BASE_PATH, - PROMPT, - BenchmarkInfo, - benchmark_fn, - bytes_to_giga_bytes, - generate_csv_dict, - write_to_csv, -) - - -CKPT = "CompVis/stable-diffusion-v1-4" - +from benchmarks.base_classes import TextToImagePipeline # noqa: E402 -def load_pipeline(run_compile=False): - pipe = DiffusionPipeline.from_pretrained(CKPT, torch_dtype=torch.float16, use_safetensors=True) - pipe = pipe.to("cuda") - if run_compile: - pipe.unet.to(memory_format=torch.channels_last) - print("Run torch compile") - pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) - - pipe.set_progress_bar_config(disable=True) - return pipe - - -def run_inference(pipe, args): - _ = pipe( - prompt=PROMPT, - num_inference_steps=args.num_inference_steps, - num_images_per_prompt=args.batch_size, - ) - - -def main(args) -> dict: - pipeline = load_pipeline(run_compile=args.run_compile) - - time = benchmark_fn(run_inference, pipeline, args) # in seconds. - memory = bytes_to_giga_bytes(torch.cuda.max_memory_allocated()) # in GBs. 
- benchmark_info = BenchmarkInfo(time=time, memory=memory) - - csv_dict = generate_csv_dict( - pipeline_cls=str(pipeline.__class__.__name__), ckpt=CKPT, args=args, benchmark_info=benchmark_info - ) - return csv_dict +CKPT = "runwayml/stable-diffusion-v1-5" if __name__ == "__main__": parser = argparse.ArgumentParser() + parser.add_argument( + "--ckpt", + type=str, + default="runwayml/stable-diffusion-v1-5", + choices=[ + "runwayml/stable-diffusion-v1-5", + "segmind/SSD-1B", + "stabilityai/stable-diffusion-2-1", + "stabilityai/stable-diffusion-xl-base-1.0", + ], + ) parser.add_argument("--batch_size", type=int, default=1) parser.add_argument("--num_inference_steps", type=int, default=50) parser.add_argument("--model_cpu_offload", action="store_true") parser.add_argument("--run_compile", action="store_true") args = parser.parse_args() - csv_dict = main(args) + args.ckpt = CKPT - name = ( - CKPT.replace("/", "_") - + f"-bs@{args.batch_size}-steps@{args.num_inference_steps}-mco@{args.model_cpu_offload}-compile@{args.run_compile}.csv" - ) - filepath = os.path.join(BASE_PATH, name) - write_to_csv(filepath, csv_dict) + benchmark_pipe = TextToImagePipeline(args) + + benchmark_pipe() diff --git a/benchmarks/push_results.py b/benchmarks/push_results.py index e3bb48df1d26..9665e6b19b77 100644 --- a/benchmarks/push_results.py +++ b/benchmarks/push_results.py @@ -6,7 +6,7 @@ sys.path.append(".") -from benchmark_utils import BASE_PATH, collate_csv # noqa: E402 +from benchmarks.utils import BASE_PATH, collate_csv # noqa: E402 FINAL_CSV_FILE = "collated_results.csv" diff --git a/benchmarks/benchmark_utils.py b/benchmarks/utils.py similarity index 100% rename from benchmarks/benchmark_utils.py rename to benchmarks/utils.py From 9683cd773fed1c4d75063ba50c3bc0fc687df8a3 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Wed, 29 Nov 2023 17:15:07 +0530 Subject: [PATCH 23/99] img2img class --- benchmarks/base_classes.py | 32 +++++++++++++++++++++++++++++--- benchmarks/benchmark_sd.py | 7 +------ benchmarks/benchmark_sd_img.py | 28 ++++++++++++++++++++++++++++ 3 files changed, 58 insertions(+), 9 deletions(-) create mode 100644 benchmarks/benchmark_sd_img.py diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 23c8e881e235..05d2e8102c03 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -3,7 +3,8 @@ import torch -from diffusers import DiffusionPipeline +from diffusers import AutoPipelineForImage2Image, AutoPipelineForText2Image +from diffusers.utils import load_image sys.path.append(".") @@ -19,9 +20,18 @@ ) +RESOLUTION_MAPPING = { + "runwayml/stable-diffusion-v1-5": (512, 512), + "stabilityai/stable-diffusion-2-1": (768, 768), + "stabilityai/stable-diffusion-xl-refiner-1.0": (1024, 1024), +} + + class TextToImagePipeline: + pipeline_class = AutoPipelineForText2Image + def __init__(self, args): - pipe = DiffusionPipeline.from_pretrained(args.ckpt, torch_dtype=torch.float16, use_safetensors=True) + pipe = self.pipeline_class.from_pretrained(args.ckpt, torch_dtype=torch.float16, use_safetensors=True) pipe = pipe.to("cuda") if args.run_compile: @@ -39,7 +49,7 @@ def run_inference(self, pipe, args): num_images_per_prompt=args.batch_size, ) - def __call__(self, args): + def benchmark(self, args): time = benchmark_fn(self.run_inference, self.pipe, args) # in seconds. memory = bytes_to_giga_bytes(torch.cuda.max_memory_allocated()) # in GBs. 
benchmark_info = BenchmarkInfo(time=time, memory=memory) @@ -54,3 +64,19 @@ def __call__(self, args): filepath = os.path.join(BASE_PATH, name) write_to_csv(filepath, csv_dict) print(f"Logs written to: {filepath}") + + +class ImageToImagePipeline(TextToImagePipeline): + pipeline_class = AutoPipelineForImage2Image + url = "https://upload.wikimedia.org/wikipedia/commons/thumb/0/0f/1665_Girl_with_a_Pearl_Earring.jpg/800px-1665_Girl_with_a_Pearl_Earring.jpg" + image = load_image(url).convert("RGB") + + def run_inference(self, pipe, args): + self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt]) + + _ = pipe( + prompt=PROMPT, + image=self.image, + num_inference_steps=args.num_inference_steps, + num_images_per_prompt=args.batch_size, + ) diff --git a/benchmarks/benchmark_sd.py b/benchmarks/benchmark_sd.py index c12677d9e420..3a2bdb90ee93 100644 --- a/benchmarks/benchmark_sd.py +++ b/benchmarks/benchmark_sd.py @@ -6,9 +6,6 @@ from benchmarks.base_classes import TextToImagePipeline # noqa: E402 -CKPT = "runwayml/stable-diffusion-v1-5" - - if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( @@ -27,8 +24,6 @@ parser.add_argument("--model_cpu_offload", action="store_true") parser.add_argument("--run_compile", action="store_true") args = parser.parse_args() - args.ckpt = CKPT benchmark_pipe = TextToImagePipeline(args) - - benchmark_pipe() + benchmark_pipe.benchmark() diff --git a/benchmarks/benchmark_sd_img.py b/benchmarks/benchmark_sd_img.py new file mode 100644 index 000000000000..8ba0d2be42b8 --- /dev/null +++ b/benchmarks/benchmark_sd_img.py @@ -0,0 +1,28 @@ +import argparse +import sys + + +sys.path.append(".") +from benchmarks.base_classes import ImageToImagePipeline # noqa: E402 + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--ckpt", + type=str, + default="runwayml/stable-diffusion-v1-5", + choices=[ + "runwayml/stable-diffusion-v1-5", + "stabilityai/stable-diffusion-2-1", + "stabilityai/stable-diffusion-xl-refiner-1.0", + ], + ) + parser.add_argument("--batch_size", type=int, default=1) + parser.add_argument("--num_inference_steps", type=int, default=50) + parser.add_argument("--model_cpu_offload", action="store_true") + parser.add_argument("--run_compile", action="store_true") + args = parser.parse_args() + + benchmark_pipe = ImageToImagePipeline(args) + benchmark_pipe.benchmark() From 274b9e17665b6a4dfde3bb4680fab45fdd99b7b0 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Wed, 29 Nov 2023 17:20:35 +0530 Subject: [PATCH 24/99] add inpainting pipeline --- benchmarks/base_classes.py | 20 +++++++++++++++++++- benchmarks/benchmark_sd_inpating.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 1 deletion(-) create mode 100644 benchmarks/benchmark_sd_inpating.py diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 05d2e8102c03..cdb0bd525044 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -3,7 +3,7 @@ import torch -from diffusers import AutoPipelineForImage2Image, AutoPipelineForText2Image +from diffusers import AutoPipelineForImage2Image, AutoPipelineForText2Image, AutoPipelineForInpainting from diffusers.utils import load_image @@ -80,3 +80,21 @@ def run_inference(self, pipe, args): num_inference_steps=args.num_inference_steps, num_images_per_prompt=args.batch_size, ) + + +class InpatingPipeline(ImageToImagePipeline): + pipeline_class = AutoPipelineForInpainting + mask_url = 
"https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png" + mask = load_image(mask_url).convert("RGB") + + def run_inference(self, pipe, args): + self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt]) + self.mask = self.mask.resize(RESOLUTION_MAPPING[args.ckpt]) + + _ = pipe( + prompt=PROMPT, + image=self.image, + mask_image=self.mask, + num_inference_steps=args.num_inference_steps, + num_images_per_prompt=args.batch_size, + ) \ No newline at end of file diff --git a/benchmarks/benchmark_sd_inpating.py b/benchmarks/benchmark_sd_inpating.py new file mode 100644 index 000000000000..dab38d204906 --- /dev/null +++ b/benchmarks/benchmark_sd_inpating.py @@ -0,0 +1,28 @@ +import argparse +import sys + + +sys.path.append(".") +from benchmarks.base_classes import InpatingPipeline # noqa: E402 + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--ckpt", + type=str, + default="runwayml/stable-diffusion-v1-5", + choices=[ + "runwayml/stable-diffusion-v1-5", + "stabilityai/stable-diffusion-2-1", + "stabilityai/stable-diffusion-xl-base-1.0", + ], + ) + parser.add_argument("--batch_size", type=int, default=1) + parser.add_argument("--num_inference_steps", type=int, default=50) + parser.add_argument("--model_cpu_offload", action="store_true") + parser.add_argument("--run_compile", action="store_true") + args = parser.parse_args() + + benchmark_pipe = InpatingPipeline(args) + benchmark_pipe.benchmark() From 2b5b8aee2af3761b226acba22954f7a0bd897e85 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Wed, 29 Nov 2023 19:01:57 +0530 Subject: [PATCH 25/99] intoduce base benchmark class. --- benchmarks/base_classes.py | 25 +++++++++++++++++++------ benchmarks/benchmark_sd.py | 4 ++-- benchmarks/benchmark_sd_img.py | 4 ++-- benchmarks/benchmark_sd_inpating.py | 4 ++-- 4 files changed, 25 insertions(+), 12 deletions(-) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index cdb0bd525044..add4fd0edd13 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -3,7 +3,7 @@ import torch -from diffusers import AutoPipelineForImage2Image, AutoPipelineForText2Image, AutoPipelineForInpainting +from diffusers import AutoPipelineForImage2Image, AutoPipelineForInpainting, AutoPipelineForText2Image from diffusers.utils import load_image @@ -27,7 +27,20 @@ } -class TextToImagePipeline: +class BaseBenchmak: + pipeline_class = None + + def __init__(self, args): + super().__init__() + + def run_inference(self, args): + raise NotImplementedError + + def benchmark(self, args): + raise NotImplementedError + + +class TextToImageBenchmark(BaseBenchmak): pipeline_class = AutoPipelineForText2Image def __init__(self, args): @@ -66,7 +79,7 @@ def benchmark(self, args): print(f"Logs written to: {filepath}") -class ImageToImagePipeline(TextToImagePipeline): +class ImageToImageBenchmark(TextToImageBenchmark): pipeline_class = AutoPipelineForImage2Image url = "https://upload.wikimedia.org/wikipedia/commons/thumb/0/0f/1665_Girl_with_a_Pearl_Earring.jpg/800px-1665_Girl_with_a_Pearl_Earring.jpg" image = load_image(url).convert("RGB") @@ -82,11 +95,11 @@ def run_inference(self, pipe, args): ) -class InpatingPipeline(ImageToImagePipeline): +class InpatingBenchmark(ImageToImageBenchmark): pipeline_class = AutoPipelineForInpainting mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png" mask = 
load_image(mask_url).convert("RGB") - + def run_inference(self, pipe, args): self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt]) self.mask = self.mask.resize(RESOLUTION_MAPPING[args.ckpt]) @@ -97,4 +110,4 @@ def run_inference(self, pipe, args): mask_image=self.mask, num_inference_steps=args.num_inference_steps, num_images_per_prompt=args.batch_size, - ) \ No newline at end of file + ) diff --git a/benchmarks/benchmark_sd.py b/benchmarks/benchmark_sd.py index 3a2bdb90ee93..4c6495e61eb2 100644 --- a/benchmarks/benchmark_sd.py +++ b/benchmarks/benchmark_sd.py @@ -3,7 +3,7 @@ sys.path.append(".") -from benchmarks.base_classes import TextToImagePipeline # noqa: E402 +from benchmarks.base_classes import TextToImageBenchmark # noqa: E402 if __name__ == "__main__": @@ -25,5 +25,5 @@ parser.add_argument("--run_compile", action="store_true") args = parser.parse_args() - benchmark_pipe = TextToImagePipeline(args) + benchmark_pipe = TextToImageBenchmark(args) benchmark_pipe.benchmark() diff --git a/benchmarks/benchmark_sd_img.py b/benchmarks/benchmark_sd_img.py index 8ba0d2be42b8..74440b6ada05 100644 --- a/benchmarks/benchmark_sd_img.py +++ b/benchmarks/benchmark_sd_img.py @@ -3,7 +3,7 @@ sys.path.append(".") -from benchmarks.base_classes import ImageToImagePipeline # noqa: E402 +from benchmarks.base_classes import ImageToImageBenchmark # noqa: E402 if __name__ == "__main__": @@ -24,5 +24,5 @@ parser.add_argument("--run_compile", action="store_true") args = parser.parse_args() - benchmark_pipe = ImageToImagePipeline(args) + benchmark_pipe = ImageToImageBenchmark(args) benchmark_pipe.benchmark() diff --git a/benchmarks/benchmark_sd_inpating.py b/benchmarks/benchmark_sd_inpating.py index dab38d204906..6167775311a6 100644 --- a/benchmarks/benchmark_sd_inpating.py +++ b/benchmarks/benchmark_sd_inpating.py @@ -3,7 +3,7 @@ sys.path.append(".") -from benchmarks.base_classes import InpatingPipeline # noqa: E402 +from benchmarks.base_classes import InpatingBenchmark # noqa: E402 if __name__ == "__main__": @@ -24,5 +24,5 @@ parser.add_argument("--run_compile", action="store_true") args = parser.parse_args() - benchmark_pipe = InpatingPipeline(args) + benchmark_pipe = InpatingBenchmark(args) benchmark_pipe.benchmark() From 66b159aa52afe9f0d974e113d9847eb22d9424e0 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 30 Nov 2023 07:31:39 +0530 Subject: [PATCH 26/99] add img2img and inpainting --- .github/workflows/benchmark.yml | 7 +++---- benchmarks/base_classes.py | 2 +- ...benchmark_sd_inpating.py => benchmark_sd_inpainting.py} | 4 ++-- 3 files changed, 6 insertions(+), 7 deletions(-) rename benchmarks/{benchmark_sd_inpating.py => benchmark_sd_inpainting.py} (86%) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index a20142bc36c1..a474a0528a06 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -45,10 +45,9 @@ jobs: HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} run: | cd benchmarks && mkdir benchmark_outputs && \ - python benchmark_sd.py && \ - python benchmark_sd.py --batch_size 4 && \ - python benchmark_sd.py --run_compile && \ - python benchmark_sd.py --batch_size 4 --run_compile && \ + python benchmark_sd.py && python benchmark_sd.py --run_compile && \ + python benchmark_sd_img.py && python benchmark_sd_img.py --run_compile && \ + python benchmark_sd_inpainting.py && python benchmark_sd_inpainting.py --run_compile && \ python push_results.py - name: Test suite reports artifacts diff --git a/benchmarks/base_classes.py 
b/benchmarks/base_classes.py index add4fd0edd13..2132b2fa8a1a 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -95,7 +95,7 @@ def run_inference(self, pipe, args): ) -class InpatingBenchmark(ImageToImageBenchmark): +class InpaintingBenchmark(ImageToImageBenchmark): pipeline_class = AutoPipelineForInpainting mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png" mask = load_image(mask_url).convert("RGB") diff --git a/benchmarks/benchmark_sd_inpating.py b/benchmarks/benchmark_sd_inpainting.py similarity index 86% rename from benchmarks/benchmark_sd_inpating.py rename to benchmarks/benchmark_sd_inpainting.py index 6167775311a6..7c8afef6eb58 100644 --- a/benchmarks/benchmark_sd_inpating.py +++ b/benchmarks/benchmark_sd_inpainting.py @@ -3,7 +3,7 @@ sys.path.append(".") -from benchmarks.base_classes import InpatingBenchmark # noqa: E402 +from benchmarks.base_classes import InpaintingBenchmark # noqa: E402 if __name__ == "__main__": @@ -24,5 +24,5 @@ parser.add_argument("--run_compile", action="store_true") args = parser.parse_args() - benchmark_pipe = InpatingBenchmark(args) + benchmark_pipe = InpaintingBenchmark(args) benchmark_pipe.benchmark() From 01addbd7ff43f422cb38020f7dc52205d5586a32 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 1 Dec 2023 08:29:57 +0530 Subject: [PATCH 27/99] feat: utility to compare changes --- .github/workflows/benchmark.yml | 1 + benchmarks/push_results.py | 38 +++++++++++++++++++++++++++------ benchmarks/utils.py | 4 ++++ 3 files changed, 37 insertions(+), 6 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index a474a0528a06..b0a05a31813d 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -37,6 +37,7 @@ jobs: apt-get update && apt-get install libsndfile1-dev libgl1 -y python -m pip install -e .[quality,test] python -m pip install git+https://github.com/huggingface/accelerate.git + python -m pip install pandas - name: Environment run: | python utils/print_env.py diff --git a/benchmarks/push_results.py b/benchmarks/push_results.py index 9665e6b19b77..c9fec2b7b4a4 100644 --- a/benchmarks/push_results.py +++ b/benchmarks/push_results.py @@ -1,23 +1,49 @@ import glob -import os import sys -from huggingface_hub import upload_file +import pandas as pd +from huggingface_hub import hf_hub_download, upload_file sys.path.append(".") -from benchmarks.utils import BASE_PATH, collate_csv # noqa: E402 +from benchmarks.utils import BASE_PATH, FINAL_CSV_FILE, GITHUB_SHA, REPO_ID, collate_csv # noqa: E402 -FINAL_CSV_FILE = "collated_results.csv" -REPO_ID = "diffusers/benchmarks" -GITHUB_SHA = os.getenv("GITHUB_SHA", None) +def has_previous_benchmark() -> str: + csv_path = None + try: + csv_path = hf_hub_download(repo_id=REPO_ID, repo_type="dataset", filename=FINAL_CSV_FILE) + except FileNotFoundError: + csv_path = None + return csv_path def push_to_hf_dataset(): all_csvs = sorted(glob.glob(f"{BASE_PATH}/*.csv")) collate_csv(all_csvs, FINAL_CSV_FILE) + # If there's an existing benchmark file, we should report the changes. 
+ csv_path = has_previous_benchmark() + if csv_path is not None: + current_results = pd.read_csv(FINAL_CSV_FILE) + previous_results = pd.read_csv(csv_path) + numeric_columns = current_results.select_dtypes(include=["float64", "int64"]).columns + numeric_columns = [c for c in numeric_columns if c not in ["batch_size", "num_inference_steps"]] + + for column in numeric_columns: + # Calculate the percentage change + current_results[column] = current_results[column].astype(float) + previous_results[column] = previous_results[column].astype(float) + percent_change = ((current_results[column] - previous_results[column]) / previous_results[column]) * 100 + + # Format the values with '+' or '-' sign and append to original values + current_results[column] = current_results[column].map(str) + percent_change.map( + lambda x: f" ({'+' if x > 0 else ''}{x:.2f}%)" + ) + + # Overwrite the current result file. + current_results.to_csv(FINAL_CSV_FILE, index=False) + commit_message = f"upload from sha: {GITHUB_SHA}" if GITHUB_SHA is not None else "upload benchmark results" upload_file( repo_id=REPO_ID, diff --git a/benchmarks/utils.py b/benchmarks/utils.py index a1e4f169c634..8592a2f4f0db 100644 --- a/benchmarks/utils.py +++ b/benchmarks/utils.py @@ -22,10 +22,14 @@ "actual_gpu_memory (gbs)", "github_sha", ] + PROMPT = "ghibli style, a fantasy landscape with castles" BASE_PATH = "benchmark_outputs" TOTAL_GPU_MEMORY = torch.cuda.get_device_properties(0).total_memory / (1024**3) +REPO_ID = "diffusers/benchmarks" +FINAL_CSV_FILE = "collated_results.csv" + @dataclass class BenchmarkInfo: From c30cab61a56e2bc80f3bbd9a94714fbba6bd5ccd Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 1 Dec 2023 08:57:35 +0530 Subject: [PATCH 28/99] fix --- benchmarks/push_results.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/benchmarks/push_results.py b/benchmarks/push_results.py index c9fec2b7b4a4..53cfcb712f43 100644 --- a/benchmarks/push_results.py +++ b/benchmarks/push_results.py @@ -28,7 +28,11 @@ def push_to_hf_dataset(): current_results = pd.read_csv(FINAL_CSV_FILE) previous_results = pd.read_csv(csv_path) numeric_columns = current_results.select_dtypes(include=["float64", "int64"]).columns - numeric_columns = [c for c in numeric_columns if c not in ["batch_size", "num_inference_steps"]] + numeric_columns = [ + c + for c in numeric_columns + if c not in ["batch_size", "num_inference_steps", "actual_gpu_memory (gbs)", "github_sha"] + ] for column in numeric_columns: # Calculate the percentage change From 689b9f7f5d787c8e40da726757f27ce206f4a7da Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 1 Dec 2023 09:30:42 +0530 Subject: [PATCH 29/99] fix import --- benchmarks/base_classes.py | 2 +- benchmarks/benchmark_sd.py | 2 +- benchmarks/benchmark_sd_img.py | 2 +- benchmarks/benchmark_sd_inpainting.py | 2 +- benchmarks/push_results.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 2132b2fa8a1a..61215880c146 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -9,7 +9,7 @@ sys.path.append(".") -from benchmarks.utils import ( # noqa: E402 +from utils import ( # noqa: E402 BASE_PATH, PROMPT, BenchmarkInfo, diff --git a/benchmarks/benchmark_sd.py b/benchmarks/benchmark_sd.py index 4c6495e61eb2..d313d26ac99b 100644 --- a/benchmarks/benchmark_sd.py +++ b/benchmarks/benchmark_sd.py @@ -3,7 +3,7 @@ sys.path.append(".") -from benchmarks.base_classes import TextToImageBenchmark # noqa: E402 +from 
base_classes import TextToImageBenchmark # noqa: E402 if __name__ == "__main__": diff --git a/benchmarks/benchmark_sd_img.py b/benchmarks/benchmark_sd_img.py index 74440b6ada05..f34b521a1606 100644 --- a/benchmarks/benchmark_sd_img.py +++ b/benchmarks/benchmark_sd_img.py @@ -3,7 +3,7 @@ sys.path.append(".") -from benchmarks.base_classes import ImageToImageBenchmark # noqa: E402 +from base_classes import ImageToImageBenchmark # noqa: E402 if __name__ == "__main__": diff --git a/benchmarks/benchmark_sd_inpainting.py b/benchmarks/benchmark_sd_inpainting.py index 7c8afef6eb58..aadccac32e02 100644 --- a/benchmarks/benchmark_sd_inpainting.py +++ b/benchmarks/benchmark_sd_inpainting.py @@ -3,7 +3,7 @@ sys.path.append(".") -from benchmarks.base_classes import InpaintingBenchmark # noqa: E402 +from base_classes import InpaintingBenchmark # noqa: E402 if __name__ == "__main__": diff --git a/benchmarks/push_results.py b/benchmarks/push_results.py index 53cfcb712f43..7b3e61a04977 100644 --- a/benchmarks/push_results.py +++ b/benchmarks/push_results.py @@ -6,7 +6,7 @@ sys.path.append(".") -from benchmarks.utils import BASE_PATH, FINAL_CSV_FILE, GITHUB_SHA, REPO_ID, collate_csv # noqa: E402 +from utils import BASE_PATH, FINAL_CSV_FILE, GITHUB_SHA, REPO_ID, collate_csv # noqa: E402 def has_previous_benchmark() -> str: From d046a2559e0e15995386d8cdd7468460e2dc7bcd Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 1 Dec 2023 09:35:08 +0530 Subject: [PATCH 30/99] add args --- benchmarks/benchmark_sd.py | 2 +- benchmarks/benchmark_sd_img.py | 2 +- benchmarks/benchmark_sd_inpainting.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/benchmark_sd.py b/benchmarks/benchmark_sd.py index d313d26ac99b..0fa24a08d639 100644 --- a/benchmarks/benchmark_sd.py +++ b/benchmarks/benchmark_sd.py @@ -26,4 +26,4 @@ args = parser.parse_args() benchmark_pipe = TextToImageBenchmark(args) - benchmark_pipe.benchmark() + benchmark_pipe.benchmark(args) diff --git a/benchmarks/benchmark_sd_img.py b/benchmarks/benchmark_sd_img.py index f34b521a1606..5525b4dae60b 100644 --- a/benchmarks/benchmark_sd_img.py +++ b/benchmarks/benchmark_sd_img.py @@ -25,4 +25,4 @@ args = parser.parse_args() benchmark_pipe = ImageToImageBenchmark(args) - benchmark_pipe.benchmark() + benchmark_pipe.benchmark(args) diff --git a/benchmarks/benchmark_sd_inpainting.py b/benchmarks/benchmark_sd_inpainting.py index aadccac32e02..8f36883e16f3 100644 --- a/benchmarks/benchmark_sd_inpainting.py +++ b/benchmarks/benchmark_sd_inpainting.py @@ -25,4 +25,4 @@ args = parser.parse_args() benchmark_pipe = InpaintingBenchmark(args) - benchmark_pipe.benchmark() + benchmark_pipe.benchmark(args) From 71f6bd9e83092241b2c850913ef736141113a26a Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 1 Dec 2023 09:38:43 +0530 Subject: [PATCH 31/99] basepath --- .github/workflows/benchmark.yml | 3 ++- benchmarks/utils.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index b0a05a31813d..b99bc8b9405a 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -44,8 +44,9 @@ jobs: - name: Stable Diffusion Benchmarking Tests env: HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + BASE_PATH: benchmark_outputs run: | - cd benchmarks && mkdir benchmark_outputs && \ + cd benchmarks && mkdir ${BASE_PATH} && \ python benchmark_sd.py && python benchmark_sd.py --run_compile && \ python benchmark_sd_img.py && python benchmark_sd_img.py 
--run_compile && \ python benchmark_sd_inpainting.py && python benchmark_sd_inpainting.py --run_compile && \ diff --git a/benchmarks/utils.py b/benchmarks/utils.py index 8592a2f4f0db..98b3ab4afe56 100644 --- a/benchmarks/utils.py +++ b/benchmarks/utils.py @@ -24,7 +24,7 @@ ] PROMPT = "ghibli style, a fantasy landscape with castles" -BASE_PATH = "benchmark_outputs" +BASE_PATH = os.getenv("BASE_PATH", ".") TOTAL_GPU_MEMORY = torch.cuda.get_device_properties(0).total_memory / (1024**3) REPO_ID = "diffusers/benchmarks" From 295cf305a3a9ac76a6e9aec214e36528c6419d6a Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 1 Dec 2023 09:51:46 +0530 Subject: [PATCH 32/99] better exception handling --- benchmarks/push_results.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/push_results.py b/benchmarks/push_results.py index 7b3e61a04977..6ab3dc3b10c4 100644 --- a/benchmarks/push_results.py +++ b/benchmarks/push_results.py @@ -3,6 +3,7 @@ import pandas as pd from huggingface_hub import hf_hub_download, upload_file +from huggingface_hub.utils._errors import EntryNotFoundError sys.path.append(".") @@ -13,7 +14,7 @@ def has_previous_benchmark() -> str: csv_path = None try: csv_path = hf_hub_download(repo_id=REPO_ID, repo_type="dataset", filename=FINAL_CSV_FILE) - except FileNotFoundError: + except EntryNotFoundError: csv_path = None return csv_path From b5e237115afd58e3b2e24f28b3a3ffdd1797d94f Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 1 Dec 2023 09:53:26 +0530 Subject: [PATCH 33/99] better path handling --- benchmarks/base_classes.py | 4 +++- benchmarks/utils.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 61215880c146..99f363b5fca8 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -72,7 +72,9 @@ def benchmark(self, args): ) name = ( args.ckpt.replace("/", "_") - + f"-bs@{args.batch_size}-steps@{args.num_inference_steps}-mco@{args.model_cpu_offload}-compile@{args.run_compile}.csv" + + "_" + + self.pipe.__class__.__name + + +f"-bs@{args.batch_size}-steps@{args.num_inference_steps}-mco@{args.model_cpu_offload}-compile@{args.run_compile}.csv" ) filepath = os.path.join(BASE_PATH, name) write_to_csv(filepath, csv_dict) diff --git a/benchmarks/utils.py b/benchmarks/utils.py index 98b3ab4afe56..88c09be6d54d 100644 --- a/benchmarks/utils.py +++ b/benchmarks/utils.py @@ -24,7 +24,7 @@ ] PROMPT = "ghibli style, a fantasy landscape with castles" -BASE_PATH = os.getenv("BASE_PATH", ".") +BASE_PATH = os.getenv("BASE_PATH", ".") TOTAL_GPU_MEMORY = torch.cuda.get_device_properties(0).total_memory / (1024**3) REPO_ID = "diffusers/benchmarks" From e7aed9ec8f13d066165d1ad73e75d17f28be5f61 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 1 Dec 2023 09:57:06 +0530 Subject: [PATCH 34/99] fix --- benchmarks/base_classes.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 99f363b5fca8..83b526b0370d 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -67,13 +67,14 @@ def benchmark(self, args): memory = bytes_to_giga_bytes(torch.cuda.max_memory_allocated()) # in GBs. 
benchmark_info = BenchmarkInfo(time=time, memory=memory) + pipeline_class_name = str(self.pipe.__class__.__name__) csv_dict = generate_csv_dict( - pipeline_cls=str(self.pipe.__class__.__name__), ckpt=args.ckpt, args=args, benchmark_info=benchmark_info + pipeline_cls=pipeline_class_name, ckpt=args.ckpt, args=args, benchmark_info=benchmark_info ) name = ( args.ckpt.replace("/", "_") + "_" - + self.pipe.__class__.__name + + pipeline_class_name + +f"-bs@{args.batch_size}-steps@{args.num_inference_steps}-mco@{args.model_cpu_offload}-compile@{args.run_compile}.csv" ) filepath = os.path.join(BASE_PATH, name) From 8eb8baffbac75488ad1a4a72b0e306415a2a3e72 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 1 Dec 2023 09:59:56 +0530 Subject: [PATCH 35/99] fix --- benchmarks/base_classes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 83b526b0370d..d3844ad52c43 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -75,7 +75,7 @@ def benchmark(self, args): args.ckpt.replace("/", "_") + "_" + pipeline_class_name - + +f"-bs@{args.batch_size}-steps@{args.num_inference_steps}-mco@{args.model_cpu_offload}-compile@{args.run_compile}.csv" + + f"-bs@{args.batch_size}-steps@{args.num_inference_steps}-mco@{args.model_cpu_offload}-compile@{args.run_compile}.csv" ) filepath = os.path.join(BASE_PATH, name) write_to_csv(filepath, csv_dict) From 3cb02f8bd1071077499ece67717991a8bbec1111 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 1 Dec 2023 10:50:59 +0530 Subject: [PATCH 36/99] remove --- benchmarks/base_classes.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index d3844ad52c43..1d3c6d1b77b2 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -3,7 +3,7 @@ import torch -from diffusers import AutoPipelineForImage2Image, AutoPipelineForInpainting, AutoPipelineForText2Image +from diffusers import ControlNetModel, StableDiffusionXLControlNetPipeline, AutoPipelineForImage2Image, AutoPipelineForInpainting, AutoPipelineForText2Image, StableDiffusionControlNetPipeline from diffusers.utils import load_image @@ -114,3 +114,22 @@ def run_inference(self, pipe, args): num_inference_steps=args.num_inference_steps, num_images_per_prompt=args.batch_size, ) + + +class ControlNetBenchmark(BaseBenchmak): # Pick up + pipeline_class = StableDiffusionControlNetPipeline + aux_network_class = ControlNetModel + image_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png" + mask = load_image(image_url).convert("RGB") + + def run_inference(self, pipe, args): + self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt]) + self.mask = self.mask.resize(RESOLUTION_MAPPING[args.ckpt]) + + _ = pipe( + prompt=PROMPT, + image=self.image, + mask_image=self.mask, + num_inference_steps=args.num_inference_steps, + num_images_per_prompt=args.batch_size, + ) \ No newline at end of file From 60c980c8ac2b1e9076d68dd76a31c2c6279ba2d4 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 1 Dec 2023 11:07:46 +0530 Subject: [PATCH 37/99] ifx --- benchmarks/base_classes.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 1d3c6d1b77b2..6526074f31d0 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -87,9 +87,11 @@ class 
ImageToImageBenchmark(TextToImageBenchmark): url = "https://upload.wikimedia.org/wikipedia/commons/thumb/0/0f/1665_Girl_with_a_Pearl_Earring.jpg/800px-1665_Girl_with_a_Pearl_Earring.jpg" image = load_image(url).convert("RGB") - def run_inference(self, pipe, args): + def __init__(self, args): + super.__init__(args) self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt]) + def run_inference(self, pipe, args): _ = pipe( prompt=PROMPT, image=self.image, @@ -103,10 +105,12 @@ class InpaintingBenchmark(ImageToImageBenchmark): mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png" mask = load_image(mask_url).convert("RGB") - def run_inference(self, pipe, args): + def __init__(self, args): + super.__init__(args) self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt]) self.mask = self.mask.resize(RESOLUTION_MAPPING[args.ckpt]) + def run_inference(self, pipe, args): _ = pipe( prompt=PROMPT, image=self.image, @@ -116,15 +120,19 @@ def run_inference(self, pipe, args): ) -class ControlNetBenchmark(BaseBenchmak): # Pick up +class ControlNetBenchmark(BaseBenchmak): pipeline_class = StableDiffusionControlNetPipeline aux_network_class = ControlNetModel + + # TODO: change the URL. image_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png" - mask = load_image(image_url).convert("RGB") + image = load_image(image_url).convert("RGB") - def run_inference(self, pipe, args): + def __init__(self, args): + self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt]) - self.mask = self.mask.resize(RESOLUTION_MAPPING[args.ckpt]) + + def run_inference(self, pipe, args): _ = pipe( prompt=PROMPT, From cd91b622c8b9de5a42ca44b5b3b2b01993c0ee85 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 1 Dec 2023 17:15:27 +0530 Subject: [PATCH 38/99] fix --- benchmarks/base_classes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 6526074f31d0..a4aba9439743 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -88,7 +88,7 @@ class ImageToImageBenchmark(TextToImageBenchmark): image = load_image(url).convert("RGB") def __init__(self, args): - super.__init__(args) + super().__init__(args) self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt]) def run_inference(self, pipe, args): @@ -106,7 +106,7 @@ class InpaintingBenchmark(ImageToImageBenchmark): mask = load_image(mask_url).convert("RGB") def __init__(self, args): - super.__init__(args) + super().__init__(args) self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt]) self.mask = self.mask.resize(RESOLUTION_MAPPING[args.ckpt]) From 1782d5a1815fb6e05d84836efaa562735338e534 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 09:07:11 +0530 Subject: [PATCH 39/99] add: support for controlnet. 
--- .github/workflows/benchmark.yml | 1 + benchmarks/base_classes.py | 72 +++++++++++++++++++++++------- benchmarks/benchmark_controlnet.py | 26 +++++++++++ 3 files changed, 83 insertions(+), 16 deletions(-) create mode 100644 benchmarks/benchmark_controlnet.py diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index b99bc8b9405a..08e670c29d7d 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -50,6 +50,7 @@ jobs: python benchmark_sd.py && python benchmark_sd.py --run_compile && \ python benchmark_sd_img.py && python benchmark_sd_img.py --run_compile && \ python benchmark_sd_inpainting.py && python benchmark_sd_inpainting.py --run_compile && \ + python benchmark_controlnet.py && python benchmark_sd_inpainting.py --run_compile && \ python push_results.py - name: Test suite reports artifacts diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index a4aba9439743..cb6338cf58ff 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -3,7 +3,14 @@ import torch -from diffusers import ControlNetModel, StableDiffusionXLControlNetPipeline, AutoPipelineForImage2Image, AutoPipelineForInpainting, AutoPipelineForText2Image, StableDiffusionControlNetPipeline +from diffusers import ( + AutoPipelineForImage2Image, + AutoPipelineForInpainting, + AutoPipelineForText2Image, + ControlNetModel, + StableDiffusionControlNetPipeline, + StableDiffusionXLControlNetPipeline, +) from diffusers.utils import load_image @@ -15,6 +22,7 @@ BenchmarkInfo, benchmark_fn, bytes_to_giga_bytes, + flush, generate_csv_dict, write_to_csv, ) @@ -22,10 +30,14 @@ RESOLUTION_MAPPING = { "runwayml/stable-diffusion-v1-5": (512, 512), + "lllyasviel/sd-controlnet-canny": (512, 512), + "diffusers/controlnet-canny-sdxl-1.0": (1024, 1024), "stabilityai/stable-diffusion-2-1": (768, 768), "stabilityai/stable-diffusion-xl-refiner-1.0": (1024, 1024), } +CONTROLNET_MAPPING = {} + class BaseBenchmak: pipeline_class = None @@ -39,6 +51,17 @@ def run_inference(self, args): def benchmark(self, args): raise NotImplementedError + def get_result_filepath(self, args): + pipeline_class_name = str(self.pipe.__class__.__name__) + name = ( + args.ckpt.replace("/", "_") + + "_" + + pipeline_class_name + + f"-bs@{args.batch_size}-steps@{args.num_inference_steps}-mco@{args.model_cpu_offload}-compile@{args.run_compile}.csv" + ) + filepath = os.path.join(BASE_PATH, name) + return filepath + class TextToImageBenchmark(BaseBenchmak): pipeline_class = AutoPipelineForText2Image @@ -71,15 +94,10 @@ def benchmark(self, args): csv_dict = generate_csv_dict( pipeline_cls=pipeline_class_name, ckpt=args.ckpt, args=args, benchmark_info=benchmark_info ) - name = ( - args.ckpt.replace("/", "_") - + "_" - + pipeline_class_name - + f"-bs@{args.batch_size}-steps@{args.num_inference_steps}-mco@{args.model_cpu_offload}-compile@{args.run_compile}.csv" - ) - filepath = os.path.join(BASE_PATH, name) + filepath = self.get_result_filepath(args) write_to_csv(filepath, csv_dict) print(f"Logs written to: {filepath}") + flush() class ImageToImageBenchmark(TextToImageBenchmark): @@ -120,24 +138,46 @@ def run_inference(self, pipe, args): ) -class ControlNetBenchmark(BaseBenchmak): - pipeline_class = StableDiffusionControlNetPipeline +class ControlNetBenchmark(TextToImageBenchmark): + pipeline_class = StableDiffusionControlNetPipeline aux_network_class = ControlNetModel - # TODO: change the URL. 
- image_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png" + image_url = ( + "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/canny_image_condition.png" + ) image = load_image(image_url).convert("RGB") def __init__(self, args): - + if isinstance(self.pipeline_class, StableDiffusionControlNetPipeline): + root_ckpt = "runwayml/stable-diffusion-v1-5" + elif isinstance(self.pipeline_class, StableDiffusionXLControlNetPipeline): + root_ckpt = "stabilityai/stable-diffusion-xl-base-1.0" + + aux_network = self.aux_network_class.from_pretrained( + args.ckpt, torch_dtype=torch.float16, use_safetensors=True + ) + pipe = self.pipeline_class.from_pretrained( + root_ckpt, controlnet=aux_network, torch_dtype=torch.float16, use_safetensors=True + ) + pipe = pipe.to("cuda") + + if args.run_compile: + pipe.unet.to(memory_format=torch.channels_last) + pipe.controlnet.to(memory_format=torch.channels_last) + print("Run torch compile") + pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) + pipe.controlnet = torch.compile(pipe.controlnet, mode="reduce-overhead", fullgraph=True) + self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt]) def run_inference(self, pipe, args): - _ = pipe( prompt=PROMPT, image=self.image, - mask_image=self.mask, num_inference_steps=args.num_inference_steps, num_images_per_prompt=args.batch_size, - ) \ No newline at end of file + ) + + +class ControlNetSDXLBenchmark(ControlNetBenchmark): + pipeline_class = StableDiffusionXLControlNetPipeline diff --git a/benchmarks/benchmark_controlnet.py b/benchmarks/benchmark_controlnet.py new file mode 100644 index 000000000000..9217004461dc --- /dev/null +++ b/benchmarks/benchmark_controlnet.py @@ -0,0 +1,26 @@ +import argparse +import sys + + +sys.path.append(".") +from base_classes import ControlNetBenchmark, ControlNetSDXLBenchmark # noqa: E402 + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--ckpt", + type=str, + default="lllyasviel/sd-controlnet-canny", + choices=["lllyasviel/sd-controlnet-canny", "diffusers/controlnet-canny-sdxl-1.0"], + ) + parser.add_argument("--batch_size", type=int, default=1) + parser.add_argument("--num_inference_steps", type=int, default=50) + parser.add_argument("--model_cpu_offload", action="store_true") + parser.add_argument("--run_compile", action="store_true") + args = parser.parse_args() + + benchmark_pipe = ( + ControlNetBenchmark(args) if args.ckpt == "lllyasviel/sd-controlnet-canny" else ControlNetSDXLBenchmark(args) + ) + benchmark_pipe.benchmark(args) From df5dead87930baa617de2644d1ca5c5196b4952a Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 09:21:59 +0530 Subject: [PATCH 40/99] image_url -> url --- benchmarks/base_classes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index cb6338cf58ff..8b4dc8c4ef11 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -142,10 +142,10 @@ class ControlNetBenchmark(TextToImageBenchmark): pipeline_class = StableDiffusionControlNetPipeline aux_network_class = ControlNetModel - image_url = ( + url = ( "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/canny_image_condition.png" ) - image = load_image(image_url).convert("RGB") + image = load_image(url).convert("RGB") def __init__(self, args): if isinstance(self.pipeline_class, 
StableDiffusionControlNetPipeline): From c6c545c6b340bba46eea52b2d64a918a4a7b0dc1 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 09:25:39 +0530 Subject: [PATCH 41/99] move images to huggingface hub --- benchmarks/base_classes.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 8b4dc8c4ef11..209e84e678a0 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -102,7 +102,7 @@ def benchmark(self, args): class ImageToImageBenchmark(TextToImageBenchmark): pipeline_class = AutoPipelineForImage2Image - url = "https://upload.wikimedia.org/wikipedia/commons/thumb/0/0f/1665_Girl_with_a_Pearl_Earring.jpg/800px-1665_Girl_with_a_Pearl_Earring.jpg" + url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/1665_Girl_with_a_Pearl_Earring.jpg/800px-1665_Girl_with_a_Pearl_Earring.jpg" image = load_image(url).convert("RGB") def __init__(self, args): @@ -120,7 +120,7 @@ def run_inference(self, pipe, args): class InpaintingBenchmark(ImageToImageBenchmark): pipeline_class = AutoPipelineForInpainting - mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png" + mask_url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/overture-creations-5sI6fQgYIuo_mask.png" mask = load_image(mask_url).convert("RGB") def __init__(self, args): @@ -142,9 +142,7 @@ class ControlNetBenchmark(TextToImageBenchmark): pipeline_class = StableDiffusionControlNetPipeline aux_network_class = ControlNetModel - url = ( - "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/canny_image_condition.png" - ) + url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/canny_image_condition.png" image = load_image(url).convert("RGB") def __init__(self, args): From b358c87cb9f37435ed5e32c9b1235c93ad32a801 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 09:35:33 +0530 Subject: [PATCH 42/99] correct urls. 
--- benchmarks/base_classes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 209e84e678a0..31789e31457b 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -102,7 +102,7 @@ def benchmark(self, args): class ImageToImageBenchmark(TextToImageBenchmark): pipeline_class = AutoPipelineForImage2Image - url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/1665_Girl_with_a_Pearl_Earring.jpg/800px-1665_Girl_with_a_Pearl_Earring.jpg" + url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/1665_Girl_with_a_Pearl_Earring.jpg" image = load_image(url).convert("RGB") def __init__(self, args): From 93b491b4014fa5d3a97037c3fc011c2bc270a30e Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 09:59:04 +0530 Subject: [PATCH 43/99] root_ckpt --- .github/workflows/benchmark.yml | 2 +- benchmarks/base_classes.py | 9 +++------ 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 08e670c29d7d..1c807c436665 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -41,7 +41,7 @@ jobs: - name: Environment run: | python utils/print_env.py - - name: Stable Diffusion Benchmarking Tests + - name: Diffusers Benchmarking env: HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} BASE_PATH: benchmark_outputs diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 31789e31457b..6d3adb23ed43 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -141,21 +141,17 @@ def run_inference(self, pipe, args): class ControlNetBenchmark(TextToImageBenchmark): pipeline_class = StableDiffusionControlNetPipeline aux_network_class = ControlNetModel + root_ckpt = "runwayml/stable-diffusion-v1-5" url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/canny_image_condition.png" image = load_image(url).convert("RGB") def __init__(self, args): - if isinstance(self.pipeline_class, StableDiffusionControlNetPipeline): - root_ckpt = "runwayml/stable-diffusion-v1-5" - elif isinstance(self.pipeline_class, StableDiffusionXLControlNetPipeline): - root_ckpt = "stabilityai/stable-diffusion-xl-base-1.0" - aux_network = self.aux_network_class.from_pretrained( args.ckpt, torch_dtype=torch.float16, use_safetensors=True ) pipe = self.pipeline_class.from_pretrained( - root_ckpt, controlnet=aux_network, torch_dtype=torch.float16, use_safetensors=True + self.root_ckpt, controlnet=aux_network, torch_dtype=torch.float16, use_safetensors=True ) pipe = pipe.to("cuda") @@ -179,3 +175,4 @@ def run_inference(self, pipe, args): class ControlNetSDXLBenchmark(ControlNetBenchmark): pipeline_class = StableDiffusionXLControlNetPipeline + root_ckpt = "stabilityai/stable-diffusion-xl-base-1.0" From 748f6dcc3cc5e9b5f56ea82f8de7f9228e43a6c9 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 13:05:20 +0530 Subject: [PATCH 44/99] flush before benchmarking --- benchmarks/base_classes.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 6d3adb23ed43..5c9468643406 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -86,6 +86,8 @@ def run_inference(self, pipe, args): ) def benchmark(self, args): + flush() + time = benchmark_fn(self.run_inference, self.pipe, args) # in seconds. 
memory = bytes_to_giga_bytes(torch.cuda.max_memory_allocated()) # in GBs. benchmark_info = BenchmarkInfo(time=time, memory=memory) @@ -97,7 +99,6 @@ def benchmark(self, args): filepath = self.get_result_filepath(args) write_to_csv(filepath, csv_dict) print(f"Logs written to: {filepath}") - flush() class ImageToImageBenchmark(TextToImageBenchmark): From 5d5d5fdfbbe883728b7ae9f57eca05143725bbf4 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 13:07:34 +0530 Subject: [PATCH 45/99] don't install accelerate from source --- .github/workflows/benchmark.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 1c807c436665..7aa9c761032b 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -36,7 +36,6 @@ jobs: run: | apt-get update && apt-get install libsndfile1-dev libgl1 -y python -m pip install -e .[quality,test] - python -m pip install git+https://github.com/huggingface/accelerate.git python -m pip install pandas - name: Environment run: | From 46510825ba25de2a156d7e915ef65600a7914ff7 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 13:15:17 +0530 Subject: [PATCH 46/99] add runner --- benchmarks/run_all.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 benchmarks/run_all.py diff --git a/benchmarks/run_all.py b/benchmarks/run_all.py new file mode 100644 index 000000000000..ac91770fff09 --- /dev/null +++ b/benchmarks/run_all.py @@ -0,0 +1,17 @@ +import glob +import subprocess + + +PATTERN = "benchmark_*.py" + + +def main(): + python_files = glob.glob(PATTERN) + + for file in python_files: + subprocess.run(["python", file]) + subprocess.run(["python", f"{file} --run_compile"]) + + +if __name__ == "__main__": + main() From 8e805796d2b8cbb1240bb8719411ca7d9cf69601 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 13:16:46 +0530 Subject: [PATCH 47/99] simplify Diffusers Benchmarking step --- .github/workflows/benchmark.yml | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 7aa9c761032b..b671abd77faa 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -45,12 +45,7 @@ jobs: HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} BASE_PATH: benchmark_outputs run: | - cd benchmarks && mkdir ${BASE_PATH} && \ - python benchmark_sd.py && python benchmark_sd.py --run_compile && \ - python benchmark_sd_img.py && python benchmark_sd_img.py --run_compile && \ - python benchmark_sd_inpainting.py && python benchmark_sd_inpainting.py --run_compile && \ - python benchmark_controlnet.py && python benchmark_sd_inpainting.py --run_compile && \ - python push_results.py + cd benchmarks && mkdir ${BASE_PATH} && python run_all.py && python push_results.py - name: Test suite reports artifacts if: ${{ always() }} From d49ad655965ad2b48dc873c19f622428fd7bae57 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 14:27:47 +0530 Subject: [PATCH 48/99] change runner --- .github/workflows/benchmark.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index b671abd77faa..7b1877d5334c 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -20,7 +20,7 @@ jobs: strategy: fail-fast: false max-parallel: 1 - runs-on: docker-gpu + runs-on: [single-gpu, nvidia-gpu, a10, ci] container: image: 
diffusers/diffusers-pytorch-cuda options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0 From 7c7846b80262b02a96fc555508ba2ab485dc4da8 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 14:39:55 +0530 Subject: [PATCH 49/99] fix: subprocess call. --- benchmarks/run_all.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/run_all.py b/benchmarks/run_all.py index ac91770fff09..4e769d229ff3 100644 --- a/benchmarks/run_all.py +++ b/benchmarks/run_all.py @@ -10,7 +10,7 @@ def main(): for file in python_files: subprocess.run(["python", file]) - subprocess.run(["python", f"{file} --run_compile"]) + subprocess.run(["python", file, "--run_compile"]) if __name__ == "__main__": From 5dbcbf58b18b69a1da6d4eca755667c4182a8b57 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 14:45:38 +0530 Subject: [PATCH 50/99] filter percentage values --- benchmarks/push_results.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/benchmarks/push_results.py b/benchmarks/push_results.py index 6ab3dc3b10c4..0bfb0a8d0cf9 100644 --- a/benchmarks/push_results.py +++ b/benchmarks/push_results.py @@ -28,6 +28,7 @@ def push_to_hf_dataset(): if csv_path is not None: current_results = pd.read_csv(FINAL_CSV_FILE) previous_results = pd.read_csv(csv_path) + numeric_columns = current_results.select_dtypes(include=["float64", "int64"]).columns numeric_columns = [ c @@ -36,6 +37,8 @@ def push_to_hf_dataset(): ] for column in numeric_columns: + previous_results[column] = previous_results[column].apply(lambda x: x.split()[0]) + # Calculate the percentage change current_results[column] = current_results[column].astype(float) previous_results[column] = previous_results[column].astype(float) From cb8572a7da58ee50a4cf8943f86c0808a5f52ce3 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 14:51:37 +0530 Subject: [PATCH 51/99] fix controlnet benchmark --- benchmarks/base_classes.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 5c9468643406..6ed529efdf7a 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -156,6 +156,9 @@ def __init__(self, args): ) pipe = pipe.to("cuda") + pipe.set_progress_bar_config(disable=True) + self.pipe = pipe + if args.run_compile: pipe.unet.to(memory_format=torch.channels_last) pipe.controlnet.to(memory_format=torch.channels_last) From 6dec96cac28ab5ad1edd9b9a0085221143f199ff Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 15:02:23 +0530 Subject: [PATCH 52/99] add t2i adapters. 
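For context on the classes this patch adds: ControlNets and T2I-Adapters are wired into their pipelines through different keyword arguments (`controlnet=` vs. `adapter=`), which is why later patches in this series end up giving `T2IAdapterBenchmark` its own `__init__`. A minimal sketch of the two loading paths, using the same checkpoints the benchmarks use (illustrative only, not part of this patch):

import torch
from diffusers import (
    ControlNetModel,
    StableDiffusionAdapterPipeline,
    StableDiffusionControlNetPipeline,
    T2IAdapter,
)

# ControlNet: the auxiliary network is passed as `controlnet=`.
controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
controlnet_pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16
)

# T2I-Adapter: the auxiliary network is passed as `adapter=`.
adapter = T2IAdapter.from_pretrained("TencentARC/t2iadapter_canny_sd14v1", torch_dtype=torch.float16)
adapter_pipe = StableDiffusionAdapterPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4", adapter=adapter, torch_dtype=torch.float16
)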
--- benchmarks/base_classes.py | 12 ++++++++++++ benchmarks/benchmark_t2i_adapter.py | 26 ++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) create mode 100644 benchmarks/benchmark_t2i_adapter.py diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 6ed529efdf7a..68e3b40c110c 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -8,7 +8,9 @@ AutoPipelineForInpainting, AutoPipelineForText2Image, ControlNetModel, + StableDiffusionAdapterPipeline, StableDiffusionControlNetPipeline, + StableDiffusionXLAdapterPipeline, StableDiffusionXLControlNetPipeline, ) from diffusers.utils import load_image @@ -180,3 +182,13 @@ def run_inference(self, pipe, args): class ControlNetSDXLBenchmark(ControlNetBenchmark): pipeline_class = StableDiffusionXLControlNetPipeline root_ckpt = "stabilityai/stable-diffusion-xl-base-1.0" + + +class T2IAdapterBenchmark(ControlNetBenchmark): + pipeline_class = StableDiffusionAdapterPipeline + root_ckpt = "CompVis/stable-diffusion-v1-4" + + +class T2IAdapterSDXLBenchmark(T2IAdapterBenchmark): + pipeline_class = StableDiffusionXLAdapterPipeline + root_ckpt = "stabilityai/stable-diffusion-xl-base-1.0" diff --git a/benchmarks/benchmark_t2i_adapter.py b/benchmarks/benchmark_t2i_adapter.py new file mode 100644 index 000000000000..7016e5c66129 --- /dev/null +++ b/benchmarks/benchmark_t2i_adapter.py @@ -0,0 +1,26 @@ +import argparse +import sys + + +sys.path.append(".") +from base_classes import ControlNetBenchmark, ControlNetSDXLBenchmark # noqa: E402 + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--ckpt", + type=str, + default="TencentARC/t2iadapter_canny_sd14v1", + choices=["TencentARC/t2iadapter_canny_sd14v1", "TencentARC/t2i-adapter-canny-sdxl-1.0"], + ) + parser.add_argument("--batch_size", type=int, default=1) + parser.add_argument("--num_inference_steps", type=int, default=50) + parser.add_argument("--model_cpu_offload", action="store_true") + parser.add_argument("--run_compile", action="store_true") + args = parser.parse_args() + + benchmark_pipe = ( + ControlNetBenchmark(args) if args.ckpt == "lllyasviel/sd-controlnet-canny" else ControlNetSDXLBenchmark(args) + ) + benchmark_pipe.benchmark(args) From 86d597f6e8201dedc9b58b7bf91ded5ba3a7329f Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 15:04:16 +0530 Subject: [PATCH 53/99] fix filter columns --- benchmarks/push_results.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/benchmarks/push_results.py b/benchmarks/push_results.py index 0bfb0a8d0cf9..abc4f197f34e 100644 --- a/benchmarks/push_results.py +++ b/benchmarks/push_results.py @@ -18,6 +18,11 @@ def has_previous_benchmark() -> str: csv_path = None return csv_path +def filter_float(value): + if isinstance(value, str): + return value.split()[0] + return value + def push_to_hf_dataset(): all_csvs = sorted(glob.glob(f"{BASE_PATH}/*.csv")) @@ -37,7 +42,7 @@ def push_to_hf_dataset(): ] for column in numeric_columns: - previous_results[column] = previous_results[column].apply(lambda x: x.split()[0]) + previous_results[column] = previous_results[column].apply(lambda x: filter_float(x)) # Calculate the percentage change current_results[column] = current_results[column].astype(float) From fa7bfe13459fb7e015410b35bb95f9cbfa39b587 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 15:18:58 +0530 Subject: [PATCH 54/99] fix t2i adapter benchmark --- benchmarks/benchmark_t2i_adapter.py | 6 ++++-- benchmarks/push_results.py | 1 + 2 
files changed, 5 insertions(+), 2 deletions(-) diff --git a/benchmarks/benchmark_t2i_adapter.py b/benchmarks/benchmark_t2i_adapter.py index 7016e5c66129..44b04b470ea6 100644 --- a/benchmarks/benchmark_t2i_adapter.py +++ b/benchmarks/benchmark_t2i_adapter.py @@ -3,7 +3,7 @@ sys.path.append(".") -from base_classes import ControlNetBenchmark, ControlNetSDXLBenchmark # noqa: E402 +from base_classes import T2IAdapterBenchmark, T2IAdapterSDXLBenchmark # noqa: E402 if __name__ == "__main__": @@ -21,6 +21,8 @@ args = parser.parse_args() benchmark_pipe = ( - ControlNetBenchmark(args) if args.ckpt == "lllyasviel/sd-controlnet-canny" else ControlNetSDXLBenchmark(args) + T2IAdapterBenchmark(args) + if args.ckpt == "TencentARC/t2iadapter_canny_sd14v1" + else T2IAdapterSDXLBenchmark(args) ) benchmark_pipe.benchmark(args) diff --git a/benchmarks/push_results.py b/benchmarks/push_results.py index abc4f197f34e..fb2559802ebd 100644 --- a/benchmarks/push_results.py +++ b/benchmarks/push_results.py @@ -18,6 +18,7 @@ def has_previous_benchmark() -> str: csv_path = None return csv_path + def filter_float(value): if isinstance(value, str): return value.split()[0] From 59df524f3cb7114fdb98b21c43ec5254929f8f08 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 15:39:05 +0530 Subject: [PATCH 55/99] fix init. --- benchmarks/base_classes.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 68e3b40c110c..da9095d8f1cd 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -183,12 +183,21 @@ class ControlNetSDXLBenchmark(ControlNetBenchmark): pipeline_class = StableDiffusionXLControlNetPipeline root_ckpt = "stabilityai/stable-diffusion-xl-base-1.0" + def __init__(self, args): + super().__init__(args) + class T2IAdapterBenchmark(ControlNetBenchmark): pipeline_class = StableDiffusionAdapterPipeline root_ckpt = "CompVis/stable-diffusion-v1-4" + def __init__(self, args): + super().__init__(args) + class T2IAdapterSDXLBenchmark(T2IAdapterBenchmark): pipeline_class = StableDiffusionXLAdapterPipeline root_ckpt = "stabilityai/stable-diffusion-xl-base-1.0" + + def __init__(self, args): + super().__init__(args) From 3cd0f592247a062f1ff1dc085c743563551e9714 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 16:02:38 +0530 Subject: [PATCH 56/99] fix --- benchmarks/base_classes.py | 4 ++++ benchmarks/run_all.py | 1 + 2 files changed, 5 insertions(+) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index da9095d8f1cd..432b871023eb 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -12,6 +12,7 @@ StableDiffusionControlNetPipeline, StableDiffusionXLAdapterPipeline, StableDiffusionXLControlNetPipeline, + T2IAdapter ) from diffusers.utils import load_image @@ -90,6 +91,8 @@ def run_inference(self, pipe, args): def benchmark(self, args): flush() + print(f"Running benchmark with: {args}\n") + time = benchmark_fn(self.run_inference, self.pipe, args) # in seconds. memory = bytes_to_giga_bytes(torch.cuda.max_memory_allocated()) # in GBs. 
benchmark_info = BenchmarkInfo(time=time, memory=memory) @@ -189,6 +192,7 @@ def __init__(self, args): class T2IAdapterBenchmark(ControlNetBenchmark): pipeline_class = StableDiffusionAdapterPipeline + aux_network_class = T2IAdapter root_ckpt = "CompVis/stable-diffusion-v1-4" def __init__(self, args): diff --git a/benchmarks/run_all.py b/benchmarks/run_all.py index 4e769d229ff3..8f81ae11bfe3 100644 --- a/benchmarks/run_all.py +++ b/benchmarks/run_all.py @@ -9,6 +9,7 @@ def main(): python_files = glob.glob(PATTERN) for file in python_files: + print(f"Running {file}.") subprocess.run(["python", file]) subprocess.run(["python", file, "--run_compile"]) From 8583db84d68135b035800839f9af5cb3d7e34975 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 16:38:04 +0530 Subject: [PATCH 57/99] remove safetensors flag --- benchmarks/base_classes.py | 14 +++++--------- benchmarks/run_all.py | 2 +- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 432b871023eb..05ad5cc5d68f 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -12,7 +12,7 @@ StableDiffusionControlNetPipeline, StableDiffusionXLAdapterPipeline, StableDiffusionXLControlNetPipeline, - T2IAdapter + T2IAdapter, ) from diffusers.utils import load_image @@ -70,7 +70,7 @@ class TextToImageBenchmark(BaseBenchmak): pipeline_class = AutoPipelineForText2Image def __init__(self, args): - pipe = self.pipeline_class.from_pretrained(args.ckpt, torch_dtype=torch.float16, use_safetensors=True) + pipe = self.pipeline_class.from_pretrained(args.ckpt, torch_dtype=torch.float16) pipe = pipe.to("cuda") if args.run_compile: @@ -91,7 +91,7 @@ def run_inference(self, pipe, args): def benchmark(self, args): flush() - print(f"Running benchmark with: {args}\n") + print(f"Running benchmark with: {dict(args)}\n") time = benchmark_fn(self.run_inference, self.pipe, args) # in seconds. memory = bytes_to_giga_bytes(torch.cuda.max_memory_allocated()) # in GBs. 
@@ -153,12 +153,8 @@ class ControlNetBenchmark(TextToImageBenchmark): image = load_image(url).convert("RGB") def __init__(self, args): - aux_network = self.aux_network_class.from_pretrained( - args.ckpt, torch_dtype=torch.float16, use_safetensors=True - ) - pipe = self.pipeline_class.from_pretrained( - self.root_ckpt, controlnet=aux_network, torch_dtype=torch.float16, use_safetensors=True - ) + aux_network = self.aux_network_class.from_pretrained(args.ckpt, torch_dtype=torch.float16) + pipe = self.pipeline_class.from_pretrained(self.root_ckpt, controlnet=aux_network, torch_dtype=torch.float16) pipe = pipe.to("cuda") pipe.set_progress_bar_config(disable=True) diff --git a/benchmarks/run_all.py b/benchmarks/run_all.py index 8f81ae11bfe3..9c058f9f3d39 100644 --- a/benchmarks/run_all.py +++ b/benchmarks/run_all.py @@ -9,7 +9,7 @@ def main(): python_files = glob.glob(PATTERN) for file in python_files: - print(f"Running {file}.") + print(f"******Running file: {file} ******") subprocess.run(["python", file]) subprocess.run(["python", file, "--run_compile"]) From 6b9bf4af528b501734762091fce41d9b5700e554 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 16:46:28 +0530 Subject: [PATCH 58/99] fix args print --- benchmarks/base_classes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 05ad5cc5d68f..70afa958f7c4 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -91,7 +91,7 @@ def run_inference(self, pipe, args): def benchmark(self, args): flush() - print(f"Running benchmark with: {dict(args)}\n") + print(f"Running benchmark with: {vars(args)}\n") time = benchmark_fn(self.run_inference, self.pipe, args) # in seconds. memory = bytes_to_giga_bytes(torch.cuda.max_memory_allocated()) # in GBs. 
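The helpers the hunks above keep calling — `flush()`, `benchmark_fn()` and `bytes_to_giga_bytes()` — live in `benchmarks/utils.py` and their bodies are not reproduced in this part of the series. A minimal sketch of behaviour consistent with how they are used here (times returned in seconds, peak memory converted to GB, CUDA state cleared between runs); the exact implementation in the repository may differ:

import gc

import torch


def flush():
    # Clear cached allocations and reset the peak-memory counter so that
    # torch.cuda.max_memory_allocated() reflects only the run that follows.
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()


def bytes_to_giga_bytes(bytes):
    return bytes / 1024 / 1024 / 1024


def benchmark_fn(f, *args, **kwargs):
    # Time a single call with CUDA events so queued GPU work is included.
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)

    torch.cuda.synchronize()
    start.record()
    f(*args, **kwargs)
    end.record()
    torch.cuda.synchronize()
    return start.elapsed_time(end) / 1000  # elapsed_time() is in milliseconds.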
From 38160f1ae3413ab88d6fb66d216e35ce602fa9d3 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 17:05:11 +0530 Subject: [PATCH 59/99] fix --- benchmarks/base_classes.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 70afa958f7c4..fd862d992f3a 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -154,7 +154,11 @@ class ControlNetBenchmark(TextToImageBenchmark): def __init__(self, args): aux_network = self.aux_network_class.from_pretrained(args.ckpt, torch_dtype=torch.float16) - pipe = self.pipeline_class.from_pretrained(self.root_ckpt, controlnet=aux_network, torch_dtype=torch.float16) + + if self.aux_network_class == ControlNetModel: + pipe = self.pipeline_class.from_pretrained(self.root_ckpt, controlnet=aux_network, torch_dtype=torch.float16) + else: + pipe = self.pipeline_class.from_pretrained(self.root_ckpt, adapter=aux_network, torch_dtype=torch.float16) pipe = pipe.to("cuda") pipe.set_progress_bar_config(disable=True) @@ -162,10 +166,17 @@ def __init__(self, args): if args.run_compile: pipe.unet.to(memory_format=torch.channels_last) - pipe.controlnet.to(memory_format=torch.channels_last) + if self.aux_network_class == ControlNetModel: + pipe.controlnet.to(memory_format=torch.channels_last) + else: + pipe.adapter.to(memory_format=torch.channels_last) + print("Run torch compile") pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) - pipe.controlnet = torch.compile(pipe.controlnet, mode="reduce-overhead", fullgraph=True) + if self.aux_network_class == ControlNetModel: + pipe.controlnet = torch.compile(pipe.controlnet, mode="reduce-overhead", fullgraph=True) + else: + pipe.adapter = torch.compile(pipe.adapter, mode="reduce-overhead", fullgraph=True) self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt]) From e6116b07403fed6d08eabba24656b256d019114a Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 17:08:09 +0530 Subject: [PATCH 60/99] feat: run_command --- benchmarks/run_all.py | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/benchmarks/run_all.py b/benchmarks/run_all.py index 9c058f9f3d39..0a92a5f0fedf 100644 --- a/benchmarks/run_all.py +++ b/benchmarks/run_all.py @@ -1,17 +1,38 @@ import glob import subprocess +from typing import List PATTERN = "benchmark_*.py" +class SubprocessCallException(Exception): + pass + +# Taken from `test_examples_utils.py` +def run_command(command: List[str], return_stdout=False): + """ + Runs `command` with `subprocess.check_output` and will potentially return the `stdout`. 
Will also properly capture + if an error occurred while running `command` + """ + try: + output = subprocess.check_output(command, stderr=subprocess.STDOUT) + if return_stdout: + if hasattr(output, "decode"): + output = output.decode("utf-8") + return output + except subprocess.CalledProcessError as e: + raise SubprocessCallException( + f"Command `{' '.join(command)}` failed with the following error:\n\n{e.output.decode()}" + ) from e + def main(): python_files = glob.glob(PATTERN) for file in python_files: print(f"******Running file: {file} ******") - subprocess.run(["python", file]) - subprocess.run(["python", file, "--run_compile"]) + run_command(f"python {file}".split()) + run_command(f"python {file} --run_compile".split()) if __name__ == "__main__": From d98fbe12559e01da1d343a9f7ae34d880e9d1fe8 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 17:23:54 +0530 Subject: [PATCH 61/99] add adapter resolution mapping --- benchmarks/base_classes.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index fd862d992f3a..64ca46c7eda3 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -35,6 +35,8 @@ "runwayml/stable-diffusion-v1-5": (512, 512), "lllyasviel/sd-controlnet-canny": (512, 512), "diffusers/controlnet-canny-sdxl-1.0": (1024, 1024), + "TencentARC/t2iadapter_canny_sd14v1": (512, 512), + "TencentARC/t2i-adapter-canny-sdxl-1.0": (1024, 1024), "stabilityai/stable-diffusion-2-1": (768, 768), "stabilityai/stable-diffusion-xl-refiner-1.0": (1024, 1024), } From c93278de352dac8dc235878d48d07fdfc3b07109 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 18:01:22 +0530 Subject: [PATCH 62/99] benchmark t2i adapter fix. --- benchmarks/base_classes.py | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 64ca46c7eda3..d8ca428a6cde 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -41,8 +41,6 @@ "stabilityai/stable-diffusion-xl-refiner-1.0": (1024, 1024), } -CONTROLNET_MAPPING = {} - class BaseBenchmak: pipeline_class = None @@ -156,11 +154,7 @@ class ControlNetBenchmark(TextToImageBenchmark): def __init__(self, args): aux_network = self.aux_network_class.from_pretrained(args.ckpt, torch_dtype=torch.float16) - - if self.aux_network_class == ControlNetModel: - pipe = self.pipeline_class.from_pretrained(self.root_ckpt, controlnet=aux_network, torch_dtype=torch.float16) - else: - pipe = self.pipeline_class.from_pretrained(self.root_ckpt, adapter=aux_network, torch_dtype=torch.float16) + pipe = self.pipeline_class.from_pretrained(self.root_ckpt, controlnet=aux_network, torch_dtype=torch.float16) pipe = pipe.to("cuda") pipe.set_progress_bar_config(disable=True) @@ -168,17 +162,11 @@ def __init__(self, args): if args.run_compile: pipe.unet.to(memory_format=torch.channels_last) - if self.aux_network_class == ControlNetModel: - pipe.controlnet.to(memory_format=torch.channels_last) - else: - pipe.adapter.to(memory_format=torch.channels_last) + pipe.controlnet.to(memory_format=torch.channels_last) print("Run torch compile") pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) - if self.aux_network_class == ControlNetModel: - pipe.controlnet = torch.compile(pipe.controlnet, mode="reduce-overhead", fullgraph=True) - else: - pipe.adapter = torch.compile(pipe.adapter, mode="reduce-overhead", fullgraph=True) + pipe.controlnet = 
torch.compile(pipe.controlnet, mode="reduce-overhead", fullgraph=True) self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt]) @@ -205,7 +193,22 @@ class T2IAdapterBenchmark(ControlNetBenchmark): root_ckpt = "CompVis/stable-diffusion-v1-4" def __init__(self, args): - super().__init__(args) + aux_network = self.aux_network_class.from_pretrained(args.ckpt, torch_dtype=torch.float16) + pipe = self.pipeline_class.from_pretrained(self.root_ckpt, adapter=aux_network, torch_dtype=torch.float16) + pipe = pipe.to("cuda") + + pipe.set_progress_bar_config(disable=True) + self.pipe = pipe + + if args.run_compile: + pipe.unet.to(memory_format=torch.channels_last) + pipe.adapter.to(memory_format=torch.channels_last) + + print("Run torch compile") + pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) + pipe.adapter = torch.compile(pipe.adapter, mode="reduce-overhead", fullgraph=True) + + self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt]) class T2IAdapterSDXLBenchmark(T2IAdapterBenchmark): From 924096fbb28f421826d3abf00d92d8666758ad77 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 18:15:48 +0530 Subject: [PATCH 63/99] fix adapter input --- benchmarks/base_classes.py | 7 +++++-- benchmarks/run_all.py | 2 ++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index d8ca428a6cde..877ad23db4a7 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -163,7 +163,7 @@ def __init__(self, args): if args.run_compile: pipe.unet.to(memory_format=torch.channels_last) pipe.controlnet.to(memory_format=torch.channels_last) - + print("Run torch compile") pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) pipe.controlnet = torch.compile(pipe.controlnet, mode="reduce-overhead", fullgraph=True) @@ -192,6 +192,9 @@ class T2IAdapterBenchmark(ControlNetBenchmark): aux_network_class = T2IAdapter root_ckpt = "CompVis/stable-diffusion-v1-4" + url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/canny_for_adapter.png" + image = load_image(url).convert("RGB") + def __init__(self, args): aux_network = self.aux_network_class.from_pretrained(args.ckpt, torch_dtype=torch.float16) pipe = self.pipeline_class.from_pretrained(self.root_ckpt, adapter=aux_network, torch_dtype=torch.float16) @@ -203,7 +206,7 @@ def __init__(self, args): if args.run_compile: pipe.unet.to(memory_format=torch.channels_last) pipe.adapter.to(memory_format=torch.channels_last) - + print("Run torch compile") pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) pipe.adapter = torch.compile(pipe.adapter, mode="reduce-overhead", fullgraph=True) diff --git a/benchmarks/run_all.py b/benchmarks/run_all.py index 0a92a5f0fedf..1cfa8d6c72a2 100644 --- a/benchmarks/run_all.py +++ b/benchmarks/run_all.py @@ -5,9 +5,11 @@ PATTERN = "benchmark_*.py" + class SubprocessCallException(Exception): pass + # Taken from `test_examples_utils.py` def run_command(command: List[str], return_stdout=False): """ From 628591d9d86d9f09b562d1e5f3187801a9d62933 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 18:18:53 +0530 Subject: [PATCH 64/99] fix --- benchmarks/base_classes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 877ad23db4a7..064c95212c95 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -193,7 +193,7 @@ class 
T2IAdapterBenchmark(ControlNetBenchmark): root_ckpt = "CompVis/stable-diffusion-v1-4" url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/canny_for_adapter.png" - image = load_image(url).convert("RGB") + image = load_image(url) def __init__(self, args): aux_network = self.aux_network_class.from_pretrained(args.ckpt, torch_dtype=torch.float16) From 0f4ae4eff7132bc2a7cebc68ac62717ce634885d Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 18:20:50 +0530 Subject: [PATCH 65/99] convert to L. --- benchmarks/base_classes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 064c95212c95..26a9d42c7011 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -193,7 +193,7 @@ class T2IAdapterBenchmark(ControlNetBenchmark): root_ckpt = "CompVis/stable-diffusion-v1-4" url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/canny_for_adapter.png" - image = load_image(url) + image = load_image(url).convert("L") def __init__(self, args): aux_network = self.aux_network_class.from_pretrained(args.ckpt, torch_dtype=torch.float16) From de739fa784eac6910a811dbb7a2f34d7cc433384 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 19:38:35 +0530 Subject: [PATCH 66/99] add flush() add appropriate places --- benchmarks/base_classes.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 26a9d42c7011..ca38b144de5b 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -98,12 +98,14 @@ def benchmark(self, args): benchmark_info = BenchmarkInfo(time=time, memory=memory) pipeline_class_name = str(self.pipe.__class__.__name__) + flush() csv_dict = generate_csv_dict( pipeline_cls=pipeline_class_name, ckpt=args.ckpt, args=args, benchmark_info=benchmark_info ) filepath = self.get_result_filepath(args) write_to_csv(filepath, csv_dict) print(f"Logs written to: {filepath}") + flush() class ImageToImageBenchmark(TextToImageBenchmark): From cb9f9c6d8f9c9cdc72bd47e4b03acbd27a6af1b5 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 19:41:12 +0530 Subject: [PATCH 67/99] better filtering --- benchmarks/push_results.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/benchmarks/push_results.py b/benchmarks/push_results.py index fb2559802ebd..27549639fa6e 100644 --- a/benchmarks/push_results.py +++ b/benchmarks/push_results.py @@ -21,7 +21,7 @@ def has_previous_benchmark() -> str: def filter_float(value): if isinstance(value, str): - return value.split()[0] + return float(value.split()[0]) return value @@ -37,9 +37,7 @@ def push_to_hf_dataset(): numeric_columns = current_results.select_dtypes(include=["float64", "int64"]).columns numeric_columns = [ - c - for c in numeric_columns - if c not in ["batch_size", "num_inference_steps", "actual_gpu_memory (gbs)", "github_sha"] + c for c in numeric_columns if c not in ["batch_size", "num_inference_steps", "actual_gpu_memory (gbs)"] ] for column in numeric_columns: From d7aee28421cfd36eb6b2f3e165293ed0b43b7548 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 20:11:30 +0530 Subject: [PATCH 68/99] okay --- .github/workflows/benchmark.yml | 1 + benchmarks/run_all.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 7b1877d5334c..301b952e0f5f 100644 --- a/.github/workflows/benchmark.yml +++ 
b/.github/workflows/benchmark.yml @@ -45,6 +45,7 @@ jobs: HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} BASE_PATH: benchmark_outputs run: | + python -c "import torch; print(torch.cuda.get_device_properties(0).total_memory / (1024**3))" cd benchmarks && mkdir ${BASE_PATH} && python run_all.py && python push_results.py - name: Test suite reports artifacts diff --git a/benchmarks/run_all.py b/benchmarks/run_all.py index 1cfa8d6c72a2..1ca533bfdfcc 100644 --- a/benchmarks/run_all.py +++ b/benchmarks/run_all.py @@ -32,7 +32,7 @@ def main(): python_files = glob.glob(PATTERN) for file in python_files: - print(f"******Running file: {file} ******") + print(f"****** Running file: {file} ******") run_command(f"python {file}".split()) run_command(f"python {file} --run_compile".split()) From 385ffbb57c6f2dddd38d79303e8ae1efbe2b5066 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 20:13:29 +0530 Subject: [PATCH 69/99] get env for torch --- .github/workflows/benchmark.yml | 2 +- benchmarks/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 301b952e0f5f..3190091ea3c8 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -45,7 +45,7 @@ jobs: HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} BASE_PATH: benchmark_outputs run: | - python -c "import torch; print(torch.cuda.get_device_properties(0).total_memory / (1024**3))" + export TOTAL_GPU_MEMORY=$(python -c "import torch; print(torch.cuda.get_device_properties(0).total_memory / (1024**3))") cd benchmarks && mkdir ${BASE_PATH} && python run_all.py && python push_results.py - name: Test suite reports artifacts diff --git a/benchmarks/utils.py b/benchmarks/utils.py index 88c09be6d54d..1c98c48c989f 100644 --- a/benchmarks/utils.py +++ b/benchmarks/utils.py @@ -25,7 +25,7 @@ PROMPT = "ghibli style, a fantasy landscape with castles" BASE_PATH = os.getenv("BASE_PATH", ".") -TOTAL_GPU_MEMORY = torch.cuda.get_device_properties(0).total_memory / (1024**3) +TOTAL_GPU_MEMORY = os.getenv("TOTAL_GPU_MEMORY", torch.cuda.get_device_properties(0).total_memory / (1024**3)) REPO_ID = "diffusers/benchmarks" FINAL_CSV_FILE = "collated_results.csv" From 611ae1338296814b88a7c12d6239e8e2de4551ca Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 20:20:22 +0530 Subject: [PATCH 70/99] convert to float --- benchmarks/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/utils.py b/benchmarks/utils.py index 1c98c48c989f..5fce920ac6c3 100644 --- a/benchmarks/utils.py +++ b/benchmarks/utils.py @@ -25,7 +25,7 @@ PROMPT = "ghibli style, a fantasy landscape with castles" BASE_PATH = os.getenv("BASE_PATH", ".") -TOTAL_GPU_MEMORY = os.getenv("TOTAL_GPU_MEMORY", torch.cuda.get_device_properties(0).total_memory / (1024**3)) +TOTAL_GPU_MEMORY = float(os.getenv("TOTAL_GPU_MEMORY", torch.cuda.get_device_properties(0).total_memory / (1024**3))) REPO_ID = "diffusers/benchmarks" FINAL_CSV_FILE = "collated_results.csv" From b3a91d8e0926f0f67137649d09ff9ad310b6994e Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 20:48:05 +0530 Subject: [PATCH 71/99] fix --- benchmarks/push_results.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmarks/push_results.py b/benchmarks/push_results.py index 27549639fa6e..a37fc828a2c8 100644 --- a/benchmarks/push_results.py +++ b/benchmarks/push_results.py @@ -48,10 +48,10 @@ def push_to_hf_dataset(): 
previous_results[column] = previous_results[column].astype(float) percent_change = ((current_results[column] - previous_results[column]) / previous_results[column]) * 100 - # Format the values with '+' or '-' sign and append to original values - current_results[column] = current_results[column].map(str) + percent_change.map( - lambda x: f" ({'+' if x > 0 else ''}{x:.2f}%)" - ) + # Format the values with '+' or '-' sign and append to original values + current_results[column] = current_results[column].map(str) + percent_change.map( + lambda x: f" ({'+' if x > 0 else ''}{x:.2f}%)" + ) # Overwrite the current result file. current_results.to_csv(FINAL_CSV_FILE, index=False) From e55913e1665e5765739b6ae641512257f9dc6aaa Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 5 Dec 2023 08:42:38 +0530 Subject: [PATCH 72/99] filter out nans. --- benchmarks/push_results.py | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/push_results.py b/benchmarks/push_results.py index a37fc828a2c8..062b5d959797 100644 --- a/benchmarks/push_results.py +++ b/benchmarks/push_results.py @@ -52,6 +52,7 @@ def push_to_hf_dataset(): current_results[column] = current_results[column].map(str) + percent_change.map( lambda x: f" ({'+' if x > 0 else ''}{x:.2f}%)" ) + current_results[column] = current_results[column].map(lambda x: x.replace(" (nan%)", "")) # Overwrite the current result file. current_results.to_csv(FINAL_CSV_FILE, index=False) From dc3063a7eaefe05801aee3982cee6d19d4f835aa Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 5 Dec 2023 08:43:14 +0530 Subject: [PATCH 73/99] better coment --- benchmarks/push_results.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/push_results.py b/benchmarks/push_results.py index 062b5d959797..962e07c6d74c 100644 --- a/benchmarks/push_results.py +++ b/benchmarks/push_results.py @@ -41,7 +41,7 @@ def push_to_hf_dataset(): ] for column in numeric_columns: - previous_results[column] = previous_results[column].apply(lambda x: filter_float(x)) + previous_results[column] = previous_results[column].map(lambda x: filter_float(x)) # Calculate the percentage change current_results[column] = current_results[column].astype(float) @@ -52,6 +52,7 @@ def push_to_hf_dataset(): current_results[column] = current_results[column].map(str) + percent_change.map( lambda x: f" ({'+' if x > 0 else ''}{x:.2f}%)" ) + # There might be newly added rows. So, filter out the NaNs. current_results[column] = current_results[column].map(lambda x: x.replace(" (nan%)", "")) # Overwrite the current result file. 
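Patches 50, 53, 67 and 71–73 all iterate on the same detail of `push_to_hf_dataset()`: once a value has been uploaded as a string like "3.21 (+1.20%)", it has to be parsed back to a float before the next run can compute a fresh delta, and rows that did not exist in the previous upload produce a NaN suffix that is stripped out. A small self-contained illustration of that round trip (the column name and numbers are made up):

import pandas as pd


def filter_float(value):
    # "3.21 (+1.20%)" -> 3.21; plain floats pass through untouched.
    if isinstance(value, str):
        return float(value.split()[0])
    return value


# Previous upload has one row (already annotated); the current run has two.
previous = pd.DataFrame({"time (secs)": ["3.21 (+1.20%)"]})
current = pd.DataFrame({"time (secs)": [3.10, 5.25]})

previous["time (secs)"] = previous["time (secs)"].map(filter_float).astype(float)
percent_change = (current["time (secs)"] - previous["time (secs)"]) / previous["time (secs)"] * 100

current["time (secs)"] = current["time (secs)"].map(str) + percent_change.map(
    lambda x: f" ({'+' if x > 0 else ''}{x:.2f}%)"
)
# The newly added row has no previous value, so its suffix is " (nan%)" and gets dropped.
current["time (secs)"] = current["time (secs)"].map(lambda x: x.replace(" (nan%)", ""))
print(current["time (secs)"].tolist())  # ['3.1 (-3.43%)', '5.25']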
From 63aee7954bfc9f3ce4c8d19615f428dc7d9cb67d Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 5 Dec 2023 08:47:00 +0530 Subject: [PATCH 74/99] sdxl --- benchmarks/run_all.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/benchmarks/run_all.py b/benchmarks/run_all.py index 1ca533bfdfcc..b1e4da46c401 100644 --- a/benchmarks/run_all.py +++ b/benchmarks/run_all.py @@ -33,8 +33,19 @@ def main(): for file in python_files: print(f"****** Running file: {file} ******") - run_command(f"python {file}".split()) - run_command(f"python {file} --run_compile".split()) + command = f"python {file}" + run_command(command.split()) + + command += " --run_compile" + run_command(command.split()) + + if file == "benchmark_sd.py": + for ckpt in ["segmind/SSD-1B", "stabilityai/stable-diffusion-xl-base-1.0"]: + command = f"python {file} --ckpt {ckpt}" + run_command(command.split()) + + command += " --run_compile" + run_command(command.split()) if __name__ == "__main__": From 9a9d5ea6464ae1ef4a7770af03086f773114f795 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 5 Dec 2023 15:18:54 +0530 Subject: [PATCH 75/99] sdxl for other benchmarks. --- benchmarks/run_all.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/benchmarks/run_all.py b/benchmarks/run_all.py index b1e4da46c401..5753db2d72a8 100644 --- a/benchmarks/run_all.py +++ b/benchmarks/run_all.py @@ -47,6 +47,26 @@ def main(): command += " --run_compile" run_command(command.split()) + elif file in ["benchmark_sd_img.py", "benchmark_sd_inpainting.py"]: + sdxl_ckpt = "stabilityai/stable-diffusion-xl-base-1.0" + command = f"python {file} --ckpt {sdxl_ckpt}" + run_command(command.split()) + + command += " --run_compile" + run_command(command.split()) + + elif file in ["benchmark_controlnet.py", "benchmark_t2i_adapter.py"]: + sdxl_ckpt = ( + "diffusers/controlnet-canny-sdxl-1.0" + if "controlnet" == file + else "TencentARC/t2i-adapter-canny-sdxl-1.0" + ) + command = f"python {file} --ckpt {sdxl_ckpt}" + run_command(command.split()) + + command += " --run_compile" + run_command(command.split()) + if __name__ == "__main__": main() From c8f6eefd54ceae8abb69abfecaa362ef843513ff Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 5 Dec 2023 15:34:09 +0530 Subject: [PATCH 76/99] fix: condition --- benchmarks/run_all.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/run_all.py b/benchmarks/run_all.py index 5753db2d72a8..bef93b4ab31a 100644 --- a/benchmarks/run_all.py +++ b/benchmarks/run_all.py @@ -58,7 +58,7 @@ def main(): elif file in ["benchmark_controlnet.py", "benchmark_t2i_adapter.py"]: sdxl_ckpt = ( "diffusers/controlnet-canny-sdxl-1.0" - if "controlnet" == file + if "controlnet" in file else "TencentARC/t2i-adapter-canny-sdxl-1.0" ) command = f"python {file} --ckpt {sdxl_ckpt}" From 4a67437d1e26fe7d49930f06e04b7538db805694 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 5 Dec 2023 17:39:52 +0530 Subject: [PATCH 77/99] fix: condition for inpainting --- benchmarks/run_all.py | 6 +++++- src/diffusers/models/__init__.py | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/benchmarks/run_all.py b/benchmarks/run_all.py index bef93b4ab31a..779685e452c5 100644 --- a/benchmarks/run_all.py +++ b/benchmarks/run_all.py @@ -48,7 +48,11 @@ def main(): run_command(command.split()) elif file in ["benchmark_sd_img.py", "benchmark_sd_inpainting.py"]: - sdxl_ckpt = "stabilityai/stable-diffusion-xl-base-1.0" + sdxl_ckpt = ( + "stabilityai/stable-diffusion-xl-refiner-1.0" 
+ if "inpainting" not in file + else "stabilityai/stable-diffusion-xl-base-1.0" + ) command = f"python {file} --ckpt {sdxl_ckpt}" run_command(command.split()) diff --git a/src/diffusers/models/__init__.py b/src/diffusers/models/__init__.py index 49ee3ee6af6b..e3794939e25e 100644 --- a/src/diffusers/models/__init__.py +++ b/src/diffusers/models/__init__.py @@ -33,8 +33,8 @@ _import_structure["consistency_decoder_vae"] = ["ConsistencyDecoderVAE"] _import_structure["controlnet"] = ["ControlNetModel"] _import_structure["dual_transformer_2d"] = ["DualTransformer2DModel"] - _import_structure["modeling_utils"] = ["ModelMixin"] _import_structure["embeddings"] = ["ImageProjection"] + _import_structure["modeling_utils"] = ["ModelMixin"] _import_structure["prior_transformer"] = ["PriorTransformer"] _import_structure["t5_film_transformer"] = ["T5FilmDecoder"] _import_structure["transformer_2d"] = ["Transformer2DModel"] From eedf218edeb06789a8021b83fd6e05b9a25ee9ed Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 5 Dec 2023 18:15:42 +0530 Subject: [PATCH 78/99] fix: mapping for resolution --- benchmarks/base_classes.py | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index ca38b144de5b..194605f75fa5 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -38,6 +38,7 @@ "TencentARC/t2iadapter_canny_sd14v1": (512, 512), "TencentARC/t2i-adapter-canny-sdxl-1.0": (1024, 1024), "stabilityai/stable-diffusion-2-1": (768, 768), + "stabilityai/stable-diffusion-xl-base-1.0": (1024, 1024), "stabilityai/stable-diffusion-xl-refiner-1.0": (1024, 1024), } From e300038267a298a8d63b540e730c13ef7163d8c9 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 5 Dec 2023 19:05:12 +0530 Subject: [PATCH 79/99] fix --- benchmarks/base_classes.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 194605f75fa5..35939317f1b8 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -221,5 +221,8 @@ class T2IAdapterSDXLBenchmark(T2IAdapterBenchmark): pipeline_class = StableDiffusionXLAdapterPipeline root_ckpt = "stabilityai/stable-diffusion-xl-base-1.0" + url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/canny_for_adapter_sdxl.png" + image = load_image(url) + def __init__(self, args): super().__init__(args) From 60614f5a750d3d47cc3c54580ffa36bd9c9196be Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 5 Dec 2023 20:07:31 +0530 Subject: [PATCH 80/99] include kandinsky and wuerstchen --- benchmarks/base_classes.py | 4 ++++ ...hmark_sd.py => benchmark_text_to_image.py} | 3 ++- benchmarks/run_all.py | 20 +++++++++++++------ 3 files changed, 20 insertions(+), 7 deletions(-) rename benchmarks/{benchmark_sd.py => benchmark_text_to_image.py} (90%) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 35939317f1b8..94f8e597f1c2 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -79,6 +79,10 @@ def __init__(self, args): print("Run torch compile") pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) + if hasattr(pipe, "movq") and getattr(pipe, "movq", None) is not None: + pipe.movq.to(memory_format=torch.channels_last) + pipe.movq = torch.compile(pipe.movq, mode="reduce-overhead", fullgraph=True) + pipe.set_progress_bar_config(disable=True) self.pipe = pipe diff --git a/benchmarks/benchmark_sd.py b/benchmarks/benchmark_text_to_image.py similarity index 90% rename from 
benchmarks/benchmark_sd.py rename to benchmarks/benchmark_text_to_image.py index 0fa24a08d639..50c04dd550c9 100644 --- a/benchmarks/benchmark_sd.py +++ b/benchmarks/benchmark_text_to_image.py @@ -15,8 +15,9 @@ choices=[ "runwayml/stable-diffusion-v1-5", "segmind/SSD-1B", - "stabilityai/stable-diffusion-2-1", "stabilityai/stable-diffusion-xl-base-1.0", + "kandinsky-community/kandinsky-2-2-decoder", + "warp-ai/wuerstchen", ], ) parser.add_argument("--batch_size", type=int, default=1) diff --git a/benchmarks/run_all.py b/benchmarks/run_all.py index 779685e452c5..913a24d98d12 100644 --- a/benchmarks/run_all.py +++ b/benchmarks/run_all.py @@ -33,14 +33,22 @@ def main(): for file in python_files: print(f"****** Running file: {file} ******") - command = f"python {file}" - run_command(command.split()) - command += " --run_compile" - run_command(command.split()) + if file != "benchmark_text_to_image.py": + command = f"python {file}" + run_command(command.split()) + + command += " --run_compile" + run_command(command.split()) - if file == "benchmark_sd.py": - for ckpt in ["segmind/SSD-1B", "stabilityai/stable-diffusion-xl-base-1.0"]: + if file == "benchmark_text_to_image.py": + for ckpt in [ + "runwayml/stable-diffusion-v1-5", + "segmind/SSD-1B", + "stabilityai/stable-diffusion-xl-base-1.0", + "kandinsky-community/kandinsky-2-2-decoder", + "warp-ai/wuerstchen", + ]: command = f"python {file} --ckpt {ckpt}" run_command(command.split()) From b394168516978be36651a3717cea330b5497af3b Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 5 Dec 2023 21:04:31 +0530 Subject: [PATCH 81/99] fix: Wuerstchen --- benchmarks/base_classes.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 94f8e597f1c2..f16107fa7224 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -13,6 +13,7 @@ StableDiffusionXLAdapterPipeline, StableDiffusionXLControlNetPipeline, T2IAdapter, + WuerstchenCombinedPipeline, ) from diffusers.utils import load_image @@ -75,13 +76,18 @@ def __init__(self, args): pipe = pipe.to("cuda") if args.run_compile: - pipe.unet.to(memory_format=torch.channels_last) - print("Run torch compile") - pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) - - if hasattr(pipe, "movq") and getattr(pipe, "movq", None) is not None: - pipe.movq.to(memory_format=torch.channels_last) - pipe.movq = torch.compile(pipe.movq, mode="reduce-overhead", fullgraph=True) + if not isinstance(pipe, WuerstchenCombinedPipeline): + pipe.unet.to(memory_format=torch.channels_last) + print("Run torch compile") + pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) + + if hasattr(pipe, "movq") and getattr(pipe, "movq", None) is not None: + pipe.movq.to(memory_format=torch.channels_last) + pipe.movq = torch.compile(pipe.movq, mode="reduce-overhead", fullgraph=True) + else: + print("Run torch compile") + pipe.decoder = torch.compile(pipe.decoder, mode="reduce-overhead", fullgraph=True) + pipe.vqgan = torch.compile(pipe.vqgan, mode="reduce-overhead", fullgraph=True) pipe.set_progress_bar_config(disable=True) self.pipe = pipe From b7eb3fbf14dd2c4ac8b33590170a527c63efd04c Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 5 Dec 2023 21:27:25 +0530 Subject: [PATCH 82/99] Empty-Commit From 821726d7c0fba25f06ed8bba26984d9ccc014871 Mon Sep 17 00:00:00 2001 From: Aryan V S Date: Thu, 7 Dec 2023 12:31:41 +0530 Subject: [PATCH 83/99] [Community] AnimateDiff + Controlnet Pipeline 
(#5928) * begin work on animatediff + controlnet pipeline * complete todos, uncomment multicontrolnet, input checks Co-Authored-By: EdoardoBotta * update Co-Authored-By: EdoardoBotta * add example * update community README * Update examples/community/README.md --------- Co-authored-by: EdoardoBotta Co-authored-by: Patrick von Platen --- examples/community/README.md | 65 + .../pipeline_animatediff_controlnet.py | 1137 +++++++++++++++++ 2 files changed, 1202 insertions(+) create mode 100644 examples/community/pipeline_animatediff_controlnet.py diff --git a/examples/community/README.md b/examples/community/README.md index 1d13e2822b77..78a89acf7a57 100755 --- a/examples/community/README.md +++ b/examples/community/README.md @@ -50,6 +50,7 @@ prompt-to-prompt | change parts of a prompt and retain image structure (see [pap | Latent Consistency Interpolation Pipeline | Interpolate the latent space of Latent Consistency Models with multiple prompts | [Latent Consistency Interpolation Pipeline](#latent-consistency-interpolation-pipeline) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1pK3NrLWJSiJsBynLns1K1-IDTW9zbPvl?usp=sharing) | [Aryan V S](https://github.com/a-r-r-o-w) | | Regional Prompting Pipeline | Assign multiple prompts for different regions | [Regional Prompting Pipeline](#regional-prompting-pipeline) | - | [hako-mikan](https://github.com/hako-mikan) | | LDM3D-sr (LDM3D upscaler) | Upscale low resolution RGB and depth inputs to high resolution | [StableDiffusionUpscaleLDM3D Pipeline](https://github.com/estelleafl/diffusers/tree/ldm3d_upscaler_community/examples/community#stablediffusionupscaleldm3d-pipeline) | - | [Estelle Aflalo](https://github.com/estelleafl) | +| AnimateDiff ControlNet Pipeline | Combines AnimateDiff with precise motion control using ControlNets | [AnimateDiff ControlNet Pipeline](#animatediff-controlnet-pipeline) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1SKboYeGjEQmQPWoFC0aLYpBlYdHXkvAu?usp=sharing) | [Aryan V S](https://github.com/a-r-r-o-w) and [Edoardo Botta](https://github.com/EdoardoBotta) | | DemoFusion Pipeline | Implementation of [DemoFusion: Democratising High-Resolution Image Generation With No $$$](https://arxiv.org/abs/2311.16973) | [DemoFusion Pipeline](#DemoFusion) | - | [Ruoyi Du](https://github.com/RuoyiDu) | To load a custom pipeline you just need to pass the `custom_pipeline` argument to `DiffusionPipeline`, as one of the files in `diffusers/examples/community`. Feel free to send a PR with your own pipelines, we will merge them quickly. @@ -2839,6 +2840,70 @@ The Pipeline supports `compel` syntax. Input prompts using the `compel` structur * Reconstructed image: * ![dps_generated_image](https://github.com/tongdaxu/Images/assets/22267548/b74f084d-93f4-4845-83d8-44c0fa758a5f) +### AnimateDiff ControlNet Pipeline + +This pipeline combines AnimateDiff and ControlNet. Enjoy precise motion control for your videos! Refer to [this](https://github.com/huggingface/diffusers/issues/5866) issue for more details. 
+ +```py +import torch +from diffusers import AutoencoderKL, ControlNetModel, MotionAdapter +from diffusers.pipelines import DiffusionPipeline +from diffusers.schedulers import DPMSolverMultistepScheduler +from PIL import Image + +motion_id = "guoyww/animatediff-motion-adapter-v1-5-2" +adapter = MotionAdapter.from_pretrained(motion_id) +controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_openpose", torch_dtype=torch.float16) +vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=torch.float16) + +model_id = "SG161222/Realistic_Vision_V5.1_noVAE" +pipe = DiffusionPipeline.from_pretrained( + model_id, + motion_adapter=adapter, + controlnet=controlnet, + vae=vae, + custom_pipeline="pipeline_animatediff_controlnet", +).to(device="cuda", dtype=torch.float16) +pipe.scheduler = DPMSolverMultistepScheduler.from_pretrained( + model_id, subfolder="scheduler", clip_sample=False, timestep_spacing="linspace", steps_offset=1 +) +pipe.enable_vae_slicing() + +conditioning_frames = [] +for i in range(1, 16 + 1): + conditioning_frames.append(Image.open(f"frame_{i}.png")) + +prompt = "astronaut in space, dancing" +negative_prompt = "bad quality, worst quality, jpeg artifacts, ugly" +result = pipe( + prompt=prompt, + negative_prompt=negative_prompt, + width=512, + height=768, + conditioning_frames=conditioning_frames, + num_inference_steps=12, +).frames[0] + +from diffusers.utils import export_to_gif +export_to_gif(result.frames[0], "result.gif") +``` + + + + + + + + + + + + + + + + +
+(Results table: Conditioning Frames (input-frames); AnimateDiff model SG161222/Realistic_Vision_V5.1_noVAE (gif-1, gif-2); AnimateDiff model CardosAnime (gif-1, gif-2))
### DemoFusion This pipeline is the official implementation of [DemoFusion: Democratising High-Resolution Image Generation With No $$$](https://arxiv.org/abs/2311.16973). The original repo can be found at [repo](https://github.com/PRIS-CV/DemoFusion). diff --git a/examples/community/pipeline_animatediff_controlnet.py b/examples/community/pipeline_animatediff_controlnet.py new file mode 100644 index 000000000000..785f1ee55ec2 --- /dev/null +++ b/examples/community/pipeline_animatediff_controlnet.py @@ -0,0 +1,1137 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from dataclasses import dataclass +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +import torch +import torch.nn.functional as F +from PIL import Image +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection + +from diffusers.image_processor import PipelineImageInput, VaeImageProcessor +from diffusers.loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from diffusers.models import AutoencoderKL, ControlNetModel, UNet2DConditionModel, UNetMotionModel +from diffusers.models.lora import adjust_lora_scale_text_encoder +from diffusers.models.unet_motion_model import MotionAdapter +from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel +from diffusers.pipelines.pipeline_utils import DiffusionPipeline +from diffusers.schedulers import ( + DDIMScheduler, + DPMSolverMultistepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, +) +from diffusers.utils import USE_PEFT_BACKEND, BaseOutput, deprecate, logging, scale_lora_layers, unscale_lora_layers +from diffusers.utils.torch_utils import is_compiled_module, randn_tensor + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers import AutoencoderKL, ControlNetModel, MotionAdapter + >>> from diffusers.pipelines import DiffusionPipeline + >>> from diffusers.schedulers import DPMSolverMultistepScheduler + >>> from PIL import Image + + >>> motion_id = "guoyww/animatediff-motion-adapter-v1-5-2" + >>> adapter = MotionAdapter.from_pretrained(motion_id) + >>> controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_openpose", torch_dtype=torch.float16) + >>> vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=torch.float16) + + >>> model_id = "SG161222/Realistic_Vision_V5.1_noVAE" + >>> pipe = DiffusionPipeline.from_pretrained( + ... model_id, + ... motion_adapter=adapter, + ... controlnet=controlnet, + ... vae=vae, + ... custom_pipeline="pipeline_animatediff_controlnet", + ... ).to(device="cuda", dtype=torch.float16) + >>> pipe.scheduler = DPMSolverMultistepScheduler.from_pretrained( + ... 
model_id, subfolder="scheduler", clip_sample=False, timestep_spacing="linspace", steps_offset=1 + ... ) + >>> pipe.enable_vae_slicing() + + >>> conditioning_frames = [] + >>> for i in range(1, 16 + 1): + ... conditioning_frames.append(Image.open(f"frame_{i}.png")) + + >>> prompt = "astronaut in space, dancing" + >>> negative_prompt = "bad quality, worst quality, jpeg artifacts, ugly" + >>> result = pipe( + ... prompt=prompt, + ... negative_prompt=negative_prompt, + ... width=512, + ... height=768, + ... conditioning_frames=conditioning_frames, + ... num_inference_steps=12, + ... ).frames[0] + + >>> from diffusers.utils import export_to_gif + >>> export_to_gif(result.frames[0], "result.gif") + ``` +""" + + +def tensor2vid(video: torch.Tensor, processor, output_type="np"): + # Based on: + # https://github.com/modelscope/modelscope/blob/1509fdb973e5871f37148a4b5e5964cafd43e64d/modelscope/pipelines/multi_modal/text_to_video_synthesis_pipeline.py#L78 + + batch_size, channels, num_frames, height, width = video.shape + outputs = [] + for batch_idx in range(batch_size): + batch_vid = video[batch_idx].permute(1, 0, 2, 3) + batch_output = processor.postprocess(batch_vid, output_type) + + outputs.append(batch_output) + + return outputs + + +@dataclass +class AnimateDiffControlNetPipelineOutput(BaseOutput): + frames: Union[torch.Tensor, np.ndarray] + + +class AnimateDiffControlNetPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin): + r""" + Pipeline for text-to-video generation. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`CLIPTextModel`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + tokenizer (`CLIPTokenizer`): + A [`~transformers.CLIPTokenizer`] to tokenize text. + unet ([`UNet2DConditionModel`]): + A [`UNet2DConditionModel`] used to create a UNetMotionModel to denoise the encoded video latents. + motion_adapter ([`MotionAdapter`]): + A [`MotionAdapter`] to be used in combination with `unet` to denoise the encoded video latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. 
+ """ + + model_cpu_offload_seq = "text_encoder->unet->vae" + _optional_components = ["feature_extractor", "image_encoder"] + _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + motion_adapter: MotionAdapter, + controlnet: Union[ControlNetModel, MultiControlNetModel], + scheduler: Union[ + DDIMScheduler, + PNDMScheduler, + LMSDiscreteScheduler, + EulerDiscreteScheduler, + EulerAncestralDiscreteScheduler, + DPMSolverMultistepScheduler, + ], + feature_extractor: Optional[CLIPImageProcessor] = None, + image_encoder: Optional[CLIPVisionModelWithProjection] = None, + ): + super().__init__() + unet = UNetMotionModel.from_unet2d(unet, motion_adapter) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + motion_adapter=motion_adapter, + controlnet=controlnet, + scheduler=scheduler, + feature_extractor=feature_extractor, + image_encoder=image_encoder, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.control_image_processor = VaeImageProcessor( + vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False + ) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt with num_images_per_prompt -> num_videos_per_prompt + def encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + lora_scale (`float`, *optional*): + A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. 
+ """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # textual inversion: procecss multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, self.tokenizer) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + if clip_skip is None: + prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask) + prompt_embeds = prompt_embeds[0] + else: + prompt_embeds = self.text_encoder( + text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True + ) + # Access the `hidden_states` first, that contains a tuple of + # all the hidden states from the encoder layers. Then index into + # the tuple to access the hidden states from the desired layer. + prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)] + # We also need to apply the final LayerNorm here to not mess with the + # representations. The `last_hidden_states` that we typically use for + # obtaining the final prompt representations passes through the LayerNorm + # layer. 
+ prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + + if self.text_encoder is not None: + prompt_embeds_dtype = self.text_encoder.dtype + elif self.unet is not None: + prompt_embeds_dtype = self.unet.dtype + else: + prompt_embeds_dtype = prompt_embeds.dtype + + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + # textual inversion: procecss multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + return prompt_embeds, negative_prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image + def encode_image(self, image, device, num_images_per_prompt): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + + uncond_image_embeds = torch.zeros_like(image_embeds) + 
return image_embeds, uncond_image_embeds + + # Copied from diffusers.pipelines.text_to_video_synthesis/pipeline_text_to_video_synth.TextToVideoSDPipeline.decode_latents + def decode_latents(self, latents): + latents = 1 / self.vae.config.scaling_factor * latents + + batch_size, channels, num_frames, height, width = latents.shape + latents = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width) + + image = self.vae.decode(latents).sample + video = ( + image[None, :] + .reshape( + ( + batch_size, + num_frames, + -1, + ) + + image.shape[2:] + ) + .permute(0, 2, 1, 3, 4) + ) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + video = video.float() + return video + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing + def enable_vae_slicing(self): + r""" + Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to + compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. + """ + self.vae.enable_slicing() + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing + def disable_vae_slicing(self): + r""" + Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to + computing decoding in one step. + """ + self.vae.disable_slicing() + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling + def enable_vae_tiling(self): + r""" + Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to + compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow + processing larger images. + """ + self.vae.enable_tiling() + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling + def disable_vae_tiling(self): + r""" + Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to + computing decoding in one step. + """ + self.vae.disable_tiling() + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu + def enable_freeu(self, s1: float, s2: float, b1: float, b2: float): + r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497. + + The suffixes after the scaling factors represent the stages where they are being applied. + + Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values + that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. + + Args: + s1 (`float`): + Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to + mitigate "oversmoothing effect" in the enhanced denoising process. + s2 (`float`): + Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to + mitigate "oversmoothing effect" in the enhanced denoising process. + b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. + b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. 
+ """ + if not hasattr(self, "unet"): + raise ValueError("The pipeline must have `unet` for using FreeU.") + self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu + def disable_freeu(self): + """Disables the FreeU mechanism if enabled.""" + self.unet.disable_freeu() + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs + def check_inputs( + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + callback_on_step_end_tensor_inputs=None, + image=None, + controlnet_conditioning_scale=1.0, + control_guidance_start=0.0, + control_guidance_end=1.0, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." 
+ ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + # `prompt` needs more sophisticated handling when there are multiple + # conditionings. + if isinstance(self.controlnet, MultiControlNetModel): + if isinstance(prompt, list): + logger.warning( + f"You have {len(self.controlnet.nets)} ControlNets and you have passed {len(prompt)}" + " prompts. The conditionings will be fixed across the prompts." + ) + + # Check `image` + is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance( + self.controlnet, torch._dynamo.eval_frame.OptimizedModule + ) + if ( + isinstance(self.controlnet, ControlNetModel) + or is_compiled + and isinstance(self.controlnet._orig_mod, ControlNetModel) + ): + if isinstance(image, list): + for image_ in image: + self.check_image(image_, prompt, prompt_embeds) + else: + self.check_image(image, prompt, prompt_embeds) + elif ( + isinstance(self.controlnet, MultiControlNetModel) + or is_compiled + and isinstance(self.controlnet._orig_mod, MultiControlNetModel) + ): + if not isinstance(image, list): + raise TypeError("For multiple controlnets: `image` must be type `list`") + + # When `image` is a nested list: + # (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]]) + elif any(isinstance(i, list) for i in image): + raise ValueError("A single batch of multiple conditionings are supported at the moment.") + elif len(image) != len(self.controlnet.nets): + raise ValueError( + f"For multiple controlnets: `image` must have the same length as the number of controlnets, but got {len(image)} images and {len(self.controlnet.nets)} ControlNets." + ) + + for control_ in image: + for image_ in control_: + self.check_image(image_, prompt, prompt_embeds) + else: + assert False + + # Check `controlnet_conditioning_scale` + if ( + isinstance(self.controlnet, ControlNetModel) + or is_compiled + and isinstance(self.controlnet._orig_mod, ControlNetModel) + ): + if not isinstance(controlnet_conditioning_scale, float): + raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.") + elif ( + isinstance(self.controlnet, MultiControlNetModel) + or is_compiled + and isinstance(self.controlnet._orig_mod, MultiControlNetModel) + ): + if isinstance(controlnet_conditioning_scale, list): + if any(isinstance(i, list) for i in controlnet_conditioning_scale): + raise ValueError("A single batch of multiple conditionings are supported at the moment.") + elif isinstance(controlnet_conditioning_scale, list) and len(controlnet_conditioning_scale) != len( + self.controlnet.nets + ): + raise ValueError( + "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have" + " the same length as the number of controlnets" + ) + else: + assert False + + if not isinstance(control_guidance_start, (tuple, list)): + control_guidance_start = [control_guidance_start] + + if not isinstance(control_guidance_end, (tuple, list)): + control_guidance_end = [control_guidance_end] + + if len(control_guidance_start) != len(control_guidance_end): + raise ValueError( + f"`control_guidance_start` has {len(control_guidance_start)} elements, but `control_guidance_end` has {len(control_guidance_end)} elements. 
Make sure to provide the same number of elements to each list." + ) + + if isinstance(self.controlnet, MultiControlNetModel): + if len(control_guidance_start) != len(self.controlnet.nets): + raise ValueError( + f"`control_guidance_start`: {control_guidance_start} has {len(control_guidance_start)} elements but there are {len(self.controlnet.nets)} controlnets available. Make sure to provide {len(self.controlnet.nets)}." + ) + + for start, end in zip(control_guidance_start, control_guidance_end): + if start >= end: + raise ValueError( + f"control guidance start: {start} cannot be larger or equal to control guidance end: {end}." + ) + if start < 0.0: + raise ValueError(f"control guidance start: {start} can't be smaller than 0.") + if end > 1.0: + raise ValueError(f"control guidance end: {end} can't be larger than 1.0.") + + # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.check_image + def check_image(self, image, prompt, prompt_embeds): + image_is_pil = isinstance(image, Image.Image) + image_is_tensor = isinstance(image, torch.Tensor) + image_is_np = isinstance(image, np.ndarray) + image_is_pil_list = isinstance(image, list) and isinstance(image[0], Image.Image) + image_is_tensor_list = isinstance(image, list) and isinstance(image[0], torch.Tensor) + image_is_np_list = isinstance(image, list) and isinstance(image[0], np.ndarray) + + if ( + not image_is_pil + and not image_is_tensor + and not image_is_np + and not image_is_pil_list + and not image_is_tensor_list + and not image_is_np_list + ): + raise TypeError( + f"image must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images, list of numpy arrays or list of torch tensors, but is {type(image)}" + ) + + if image_is_pil: + image_batch_size = 1 + else: + image_batch_size = len(image) + + if prompt is not None and isinstance(prompt, str): + prompt_batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + prompt_batch_size = len(prompt) + elif prompt_embeds is not None: + prompt_batch_size = prompt_embeds.shape[0] + + if image_batch_size != 1 and image_batch_size != prompt_batch_size: + raise ValueError( + f"If image batch size is not 1, image batch size must be same as prompt batch size. image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}" + ) + + # Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_synth.TextToVideoSDPipeline.prepare_latents + def prepare_latents( + self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None + ): + shape = ( + batch_size, + num_channels_latents, + num_frames, + height // self.vae_scale_factor, + width // self.vae_scale_factor, + ) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.prepare_image + def prepare_image( + self, + image, + width, + height, + batch_size, + num_images_per_prompt, + device, + dtype, + do_classifier_free_guidance=False, + guess_mode=False, + ): + image = self.control_image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32) + image_batch_size = image.shape[0] + + if image_batch_size == 1: + repeat_by = batch_size + else: + # image batch size is the same as prompt batch size + repeat_by = num_images_per_prompt + + image = image.repeat_interleave(repeat_by, dim=0) + + image = image.to(device=device, dtype=dtype) + + if do_classifier_free_guidance and not guess_mode: + image = torch.cat([image] * 2) + + return image + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def clip_skip(self): + return self._clip_skip + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 + + @property + def cross_attention_kwargs(self): + return self._cross_attention_kwargs + + @property + def num_timesteps(self): + return self._num_timesteps + + @torch.no_grad() + def __call__( + self, + prompt: Union[str, List[str]] = None, + num_frames: Optional[int] = 16, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_videos_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + conditioning_frames: Optional[List[PipelineImageInput]] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + controlnet_conditioning_scale: Union[float, List[float]] = 1.0, + guess_mode: bool = False, + control_guidance_start: Union[float, List[float]] = 0.0, + control_guidance_end: Union[float, List[float]] = 1.0, + clip_skip: Optional[int] = None, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + **kwargs, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated video. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated video. 
+ num_frames (`int`, *optional*, defaults to 16): + The number of video frames that are generated. Defaults to 16 frames which at 8 frames per seconds + amounts to 2 seconds of video. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality videos at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. Latents should be of shape + `(batch_size, num_channel, num_frames, height, width)`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + ip_adapter_image (`PipelineImageInput`, *optional*): + Optional image input to work with IP Adapters. + conditioning_frames (`List[PipelineImageInput]`, *optional*): + The ControlNet input condition to provide guidance to the `unet` for generation. If multiple ControlNets + are specified, images must be passed as a list such that each element of the list can be correctly + batched for input to a single ControlNet. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or + `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] instead + of a plain tuple. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0): + The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added + to the residual in the original `unet`. 
If multiple ControlNets are specified in `init`, you can set + the corresponding scale as a list. + guess_mode (`bool`, *optional*, defaults to `False`): + The ControlNet encoder tries to recognize the content of the input image even if you remove all + prompts. A `guidance_scale` value between 3.0 and 5.0 is recommended. + control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0): + The percentage of total steps at which the ControlNet starts applying. + control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0): + The percentage of total steps at which the ControlNet stops applying. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + allback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeine class. + + Examples: + + Returns: + [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] is + returned, otherwise a `tuple` is returned where the first element is a list with the generated frames. + """ + + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + if callback is not None: + deprecate( + "callback", + "1.0.0", + "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", + ) + if callback_steps is not None: + deprecate( + "callback_steps", + "1.0.0", + "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", + ) + + controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet + + # align format for control guidance + if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list): + control_guidance_start = len(control_guidance_end) * [control_guidance_start] + elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list): + control_guidance_end = len(control_guidance_start) * [control_guidance_end] + elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list): + mult = len(controlnet.nets) if isinstance(controlnet, MultiControlNetModel) else 1 + control_guidance_start, control_guidance_end = ( + mult * [control_guidance_start], + mult * [control_guidance_end], + ) + + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + + num_videos_per_prompt = 1 + + # 1. Check inputs. 
Raise error if not correct + self.check_inputs( + prompt=prompt, + height=height, + width=width, + callback_steps=callback_steps, + negative_prompt=negative_prompt, + callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + image=conditioning_frames, + controlnet_conditioning_scale=controlnet_conditioning_scale, + control_guidance_start=control_guidance_start, + control_guidance_end=control_guidance_end, + ) + + self._guidance_scale = guidance_scale + self._clip_skip = clip_skip + self._cross_attention_kwargs = cross_attention_kwargs + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float): + controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets) + + global_pool_conditions = ( + controlnet.config.global_pool_conditions + if isinstance(controlnet, ControlNetModel) + else controlnet.nets[0].config.global_pool_conditions + ) + guess_mode = guess_mode or global_pool_conditions + + # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + device, + num_videos_per_prompt, + self.do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, + clip_skip=self.clip_skip, + ) + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + if self.do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + if ip_adapter_image is not None: + image_embeds, negative_image_embeds = self.encode_image(ip_adapter_image, device, num_videos_per_prompt) + if self.do_classifier_free_guidance: + image_embeds = torch.cat([negative_image_embeds, image_embeds]) + + if isinstance(controlnet, ControlNetModel): + conditioning_frames = self.prepare_image( + image=conditioning_frames, + width=width, + height=height, + batch_size=batch_size * num_videos_per_prompt * num_frames, + num_images_per_prompt=num_videos_per_prompt, + device=device, + dtype=controlnet.dtype, + do_classifier_free_guidance=self.do_classifier_free_guidance, + guess_mode=guess_mode, + ) + elif isinstance(controlnet, MultiControlNetModel): + cond_prepared_frames = [] + for frame_ in conditioning_frames: + prepared_frame = self.prepare_image( + image=frame_, + width=width, + height=height, + batch_size=batch_size * num_videos_per_prompt * num_frames, + num_images_per_prompt=num_videos_per_prompt, + device=device, + dtype=controlnet.dtype, + do_classifier_free_guidance=self.do_classifier_free_guidance, + guess_mode=guess_mode, + ) + + cond_prepared_frames.append(prepared_frame) + + conditioning_frames = cond_prepared_frames + else: + assert False + + # 4. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + self._num_timesteps = len(timesteps) + + # 5. 
Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + batch_size * num_videos_per_prompt, + num_channels_latents, + num_frames, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7. Add image embeds for IP-Adapter + added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None + + # 7.1 Create tensor stating which controlnets to keep + controlnet_keep = [] + for i in range(len(timesteps)): + keeps = [ + 1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e) + for s, e in zip(control_guidance_start, control_guidance_end) + ] + controlnet_keep.append(keeps[0] if isinstance(controlnet, ControlNetModel) else keeps) + + # Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + if guess_mode and self.do_classifier_free_guidance: + # Infer ControlNet only for the conditional batch. + control_model_input = latents + control_model_input = self.scheduler.scale_model_input(control_model_input, t) + controlnet_prompt_embeds = prompt_embeds.chunk(2)[1] + else: + control_model_input = latent_model_input + controlnet_prompt_embeds = prompt_embeds + controlnet_prompt_embeds = controlnet_prompt_embeds.repeat_interleave(num_frames, dim=0) + + if isinstance(controlnet_keep[i], list): + cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])] + else: + controlnet_cond_scale = controlnet_conditioning_scale + if isinstance(controlnet_cond_scale, list): + controlnet_cond_scale = controlnet_cond_scale[0] + cond_scale = controlnet_cond_scale * controlnet_keep[i] + + control_model_input = torch.transpose(control_model_input, 1, 2) + control_model_input = control_model_input.reshape( + (-1, control_model_input.shape[2], control_model_input.shape[3], control_model_input.shape[4]) + ) + + down_block_res_samples, mid_block_res_sample = self.controlnet( + control_model_input, + t, + encoder_hidden_states=controlnet_prompt_embeds, + controlnet_cond=conditioning_frames, + conditioning_scale=cond_scale, + guess_mode=guess_mode, + return_dict=False, + ) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=self.cross_attention_kwargs, + added_cond_kwargs=added_cond_kwargs, + down_block_additional_residuals=down_block_res_samples, + mid_block_additional_residual=mid_block_res_sample, + ).sample + + # perform guidance + if self.do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = 
callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + callback(i, t, latents) + + if output_type == "latent": + return AnimateDiffControlNetPipelineOutput(frames=latents) + + # Post-processing + video_tensor = self.decode_latents(latents) + + if output_type == "pt": + video = video_tensor + else: + video = tensor2vid(video_tensor, self.image_processor, output_type=output_type) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (video,) + + return AnimateDiffControlNetPipelineOutput(frames=video) From 3dc2362b5a89380f66ac006b1a787411fa1a9574 Mon Sep 17 00:00:00 2001 From: Beinsezii <39478211+Beinsezii@users.noreply.github.com> Date: Wed, 6 Dec 2023 23:51:04 -0800 Subject: [PATCH 84/99] EulerDiscreteScheduler add `rescale_betas_zero_snr` (#6024) * EulerDiscreteScheduler add `rescale_betas_zero_snr` --- .../schedulers/scheduling_euler_discrete.py | 56 +++++++++++++++++++ tests/schedulers/test_scheduler_euler.py | 4 ++ 2 files changed, 60 insertions(+) diff --git a/src/diffusers/schedulers/scheduling_euler_discrete.py b/src/diffusers/schedulers/scheduling_euler_discrete.py index 0e2dd5c983e3..802ba0f099f9 100644 --- a/src/diffusers/schedulers/scheduling_euler_discrete.py +++ b/src/diffusers/schedulers/scheduling_euler_discrete.py @@ -92,6 +92,43 @@ def alpha_bar_fn(t): return torch.tensor(betas, dtype=torch.float32) +# Copied from diffusers.schedulers.scheduling_ddim.rescale_zero_terminal_snr +def rescale_zero_terminal_snr(betas): + """ + Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1) + + + Args: + betas (`torch.FloatTensor`): + the betas that the scheduler is being initialized with. + + Returns: + `torch.FloatTensor`: rescaled betas with zero terminal SNR + """ + # Convert betas to alphas_bar_sqrt + alphas = 1.0 - betas + alphas_cumprod = torch.cumprod(alphas, dim=0) + alphas_bar_sqrt = alphas_cumprod.sqrt() + + # Store old values. + alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone() + alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone() + + # Shift so the last timestep is zero. + alphas_bar_sqrt -= alphas_bar_sqrt_T + + # Scale so the first timestep is back to the old value. + alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T) + + # Convert alphas_bar_sqrt to betas + alphas_bar = alphas_bar_sqrt**2 # Revert sqrt + alphas = alphas_bar[1:] / alphas_bar[:-1] # Revert cumprod + alphas = torch.cat([alphas_bar[0:1], alphas]) + betas = 1 - alphas + + return betas + + class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin): """ Euler scheduler. @@ -128,6 +165,10 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin): An offset added to the inference steps. You can use a combination of `offset=1` and `set_alpha_to_one=False` to make the last step use step 0 for the previous alpha product like in Stable Diffusion. + rescale_betas_zero_snr (`bool`, defaults to `False`): + Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and + dark samples instead of limiting it to samples with medium brightness. 
Loosely related to + [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506). """ _compatibles = [e.name for e in KarrasDiffusionSchedulers] @@ -149,6 +190,7 @@ def __init__( timestep_spacing: str = "linspace", timestep_type: str = "discrete", # can be "discrete" or "continuous" steps_offset: int = 0, + rescale_betas_zero_snr: bool = False, ): if trained_betas is not None: self.betas = torch.tensor(trained_betas, dtype=torch.float32) @@ -163,9 +205,17 @@ def __init__( else: raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") + if rescale_betas_zero_snr: + self.betas = rescale_zero_terminal_snr(self.betas) + self.alphas = 1.0 - self.betas self.alphas_cumprod = torch.cumprod(self.alphas, dim=0) + if rescale_betas_zero_snr: + # Close to 0 without being 0 so first sigma is not inf + # FP16 smallest positive subnormal works well here + self.alphas_cumprod[-1] = 2**-24 + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=float)[::-1].copy() @@ -420,6 +470,9 @@ def step( if self.step_index is None: self._init_step_index(timestep) + # Upcast to avoid precision issues when computing prev_sample + sample = sample.to(torch.float32) + sigma = self.sigmas[self.step_index] gamma = min(s_churn / (len(self.sigmas) - 1), 2**0.5 - 1) if s_tmin <= sigma <= s_tmax else 0.0 @@ -456,6 +509,9 @@ def step( prev_sample = sample + derivative * dt + # Cast sample back to model compatible dtype + prev_sample = prev_sample.to(model_output.dtype) + # upon completion increase step index by one self._step_index += 1 diff --git a/tests/schedulers/test_scheduler_euler.py b/tests/schedulers/test_scheduler_euler.py index 3249d7032bad..41c418c5064c 100644 --- a/tests/schedulers/test_scheduler_euler.py +++ b/tests/schedulers/test_scheduler_euler.py @@ -45,6 +45,10 @@ def test_timestep_type(self): def test_karras_sigmas(self): self.check_over_configs(use_karras_sigmas=True, sigma_min=0.02, sigma_max=700.0) + def test_rescale_betas_zero_snr(self): + for rescale_betas_zero_snr in [True, False]: + self.check_over_configs(rescale_betas_zero_snr=rescale_betas_zero_snr) + def test_full_loop_no_noise(self): scheduler_class = self.scheduler_classes[0] scheduler_config = self.get_scheduler_config() From 26a8c00840be3f4ca1f7339f1fc44dd9f3faa8d2 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 7 Dec 2023 13:57:27 +0530 Subject: [PATCH 85/99] Revert "[Community] AnimateDiff + Controlnet Pipeline (#5928)" This reverts commit 821726d7c0fba25f06ed8bba26984d9ccc014871. 
--- examples/community/README.md | 65 - .../pipeline_animatediff_controlnet.py | 1137 ----------------- 2 files changed, 1202 deletions(-) delete mode 100644 examples/community/pipeline_animatediff_controlnet.py diff --git a/examples/community/README.md b/examples/community/README.md index 78a89acf7a57..1d13e2822b77 100755 --- a/examples/community/README.md +++ b/examples/community/README.md @@ -50,7 +50,6 @@ prompt-to-prompt | change parts of a prompt and retain image structure (see [pap | Latent Consistency Interpolation Pipeline | Interpolate the latent space of Latent Consistency Models with multiple prompts | [Latent Consistency Interpolation Pipeline](#latent-consistency-interpolation-pipeline) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1pK3NrLWJSiJsBynLns1K1-IDTW9zbPvl?usp=sharing) | [Aryan V S](https://github.com/a-r-r-o-w) | | Regional Prompting Pipeline | Assign multiple prompts for different regions | [Regional Prompting Pipeline](#regional-prompting-pipeline) | - | [hako-mikan](https://github.com/hako-mikan) | | LDM3D-sr (LDM3D upscaler) | Upscale low resolution RGB and depth inputs to high resolution | [StableDiffusionUpscaleLDM3D Pipeline](https://github.com/estelleafl/diffusers/tree/ldm3d_upscaler_community/examples/community#stablediffusionupscaleldm3d-pipeline) | - | [Estelle Aflalo](https://github.com/estelleafl) | -| AnimateDiff ControlNet Pipeline | Combines AnimateDiff with precise motion control using ControlNets | [AnimateDiff ControlNet Pipeline](#animatediff-controlnet-pipeline) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1SKboYeGjEQmQPWoFC0aLYpBlYdHXkvAu?usp=sharing) | [Aryan V S](https://github.com/a-r-r-o-w) and [Edoardo Botta](https://github.com/EdoardoBotta) | | DemoFusion Pipeline | Implementation of [DemoFusion: Democratising High-Resolution Image Generation With No $$$](https://arxiv.org/abs/2311.16973) | [DemoFusion Pipeline](#DemoFusion) | - | [Ruoyi Du](https://github.com/RuoyiDu) | To load a custom pipeline you just need to pass the `custom_pipeline` argument to `DiffusionPipeline`, as one of the files in `diffusers/examples/community`. Feel free to send a PR with your own pipelines, we will merge them quickly. @@ -2840,70 +2839,6 @@ The Pipeline supports `compel` syntax. Input prompts using the `compel` structur * Reconstructed image: * ![dps_generated_image](https://github.com/tongdaxu/Images/assets/22267548/b74f084d-93f4-4845-83d8-44c0fa758a5f) -### AnimateDiff ControlNet Pipeline - -This pipeline combines AnimateDiff and ControlNet. Enjoy precise motion control for your videos! Refer to [this](https://github.com/huggingface/diffusers/issues/5866) issue for more details. 
- -```py -import torch -from diffusers import AutoencoderKL, ControlNetModel, MotionAdapter -from diffusers.pipelines import DiffusionPipeline -from diffusers.schedulers import DPMSolverMultistepScheduler -from PIL import Image - -motion_id = "guoyww/animatediff-motion-adapter-v1-5-2" -adapter = MotionAdapter.from_pretrained(motion_id) -controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_openpose", torch_dtype=torch.float16) -vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=torch.float16) - -model_id = "SG161222/Realistic_Vision_V5.1_noVAE" -pipe = DiffusionPipeline.from_pretrained( - model_id, - motion_adapter=adapter, - controlnet=controlnet, - vae=vae, - custom_pipeline="pipeline_animatediff_controlnet", -).to(device="cuda", dtype=torch.float16) -pipe.scheduler = DPMSolverMultistepScheduler.from_pretrained( - model_id, subfolder="scheduler", clip_sample=False, timestep_spacing="linspace", steps_offset=1 -) -pipe.enable_vae_slicing() - -conditioning_frames = [] -for i in range(1, 16 + 1): - conditioning_frames.append(Image.open(f"frame_{i}.png")) - -prompt = "astronaut in space, dancing" -negative_prompt = "bad quality, worst quality, jpeg artifacts, ugly" -result = pipe( - prompt=prompt, - negative_prompt=negative_prompt, - width=512, - height=768, - conditioning_frames=conditioning_frames, - num_inference_steps=12, -).frames[0] - -from diffusers.utils import export_to_gif -export_to_gif(result.frames[0], "result.gif") -``` - - - - - - - - - - - - - - - - -
- [HTML results table: a "Conditioning Frames" row showing the input-frames image, followed by output GIF pairs for the AnimateDiff models SG161222/Realistic_Vision_V5.1_noVAE and CardosAnime]
### DemoFusion This pipeline is the official implementation of [DemoFusion: Democratising High-Resolution Image Generation With No $$$](https://arxiv.org/abs/2311.16973). The original repo can be found at [repo](https://github.com/PRIS-CV/DemoFusion). diff --git a/examples/community/pipeline_animatediff_controlnet.py b/examples/community/pipeline_animatediff_controlnet.py deleted file mode 100644 index 785f1ee55ec2..000000000000 --- a/examples/community/pipeline_animatediff_controlnet.py +++ /dev/null @@ -1,1137 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect -from dataclasses import dataclass -from typing import Any, Callable, Dict, List, Optional, Union - -import numpy as np -import torch -import torch.nn.functional as F -from PIL import Image -from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection - -from diffusers.image_processor import PipelineImageInput, VaeImageProcessor -from diffusers.loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin -from diffusers.models import AutoencoderKL, ControlNetModel, UNet2DConditionModel, UNetMotionModel -from diffusers.models.lora import adjust_lora_scale_text_encoder -from diffusers.models.unet_motion_model import MotionAdapter -from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel -from diffusers.pipelines.pipeline_utils import DiffusionPipeline -from diffusers.schedulers import ( - DDIMScheduler, - DPMSolverMultistepScheduler, - EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, - LMSDiscreteScheduler, - PNDMScheduler, -) -from diffusers.utils import USE_PEFT_BACKEND, BaseOutput, deprecate, logging, scale_lora_layers, unscale_lora_layers -from diffusers.utils.torch_utils import is_compiled_module, randn_tensor - - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - -EXAMPLE_DOC_STRING = """ - Examples: - ```py - >>> import torch - >>> from diffusers import AutoencoderKL, ControlNetModel, MotionAdapter - >>> from diffusers.pipelines import DiffusionPipeline - >>> from diffusers.schedulers import DPMSolverMultistepScheduler - >>> from PIL import Image - - >>> motion_id = "guoyww/animatediff-motion-adapter-v1-5-2" - >>> adapter = MotionAdapter.from_pretrained(motion_id) - >>> controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_openpose", torch_dtype=torch.float16) - >>> vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=torch.float16) - - >>> model_id = "SG161222/Realistic_Vision_V5.1_noVAE" - >>> pipe = DiffusionPipeline.from_pretrained( - ... model_id, - ... motion_adapter=adapter, - ... controlnet=controlnet, - ... vae=vae, - ... custom_pipeline="pipeline_animatediff_controlnet", - ... ).to(device="cuda", dtype=torch.float16) - >>> pipe.scheduler = DPMSolverMultistepScheduler.from_pretrained( - ... 
model_id, subfolder="scheduler", clip_sample=False, timestep_spacing="linspace", steps_offset=1 - ... ) - >>> pipe.enable_vae_slicing() - - >>> conditioning_frames = [] - >>> for i in range(1, 16 + 1): - ... conditioning_frames.append(Image.open(f"frame_{i}.png")) - - >>> prompt = "astronaut in space, dancing" - >>> negative_prompt = "bad quality, worst quality, jpeg artifacts, ugly" - >>> result = pipe( - ... prompt=prompt, - ... negative_prompt=negative_prompt, - ... width=512, - ... height=768, - ... conditioning_frames=conditioning_frames, - ... num_inference_steps=12, - ... ).frames[0] - - >>> from diffusers.utils import export_to_gif - >>> export_to_gif(result.frames[0], "result.gif") - ``` -""" - - -def tensor2vid(video: torch.Tensor, processor, output_type="np"): - # Based on: - # https://github.com/modelscope/modelscope/blob/1509fdb973e5871f37148a4b5e5964cafd43e64d/modelscope/pipelines/multi_modal/text_to_video_synthesis_pipeline.py#L78 - - batch_size, channels, num_frames, height, width = video.shape - outputs = [] - for batch_idx in range(batch_size): - batch_vid = video[batch_idx].permute(1, 0, 2, 3) - batch_output = processor.postprocess(batch_vid, output_type) - - outputs.append(batch_output) - - return outputs - - -@dataclass -class AnimateDiffControlNetPipelineOutput(BaseOutput): - frames: Union[torch.Tensor, np.ndarray] - - -class AnimateDiffControlNetPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin): - r""" - Pipeline for text-to-video generation. - - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods - implemented for all pipelines (downloading, saving, running on a particular device, etc.). - - The pipeline also inherits the following loading methods: - - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings - - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights - - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters - - Args: - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. - text_encoder ([`CLIPTextModel`]): - Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). - tokenizer (`CLIPTokenizer`): - A [`~transformers.CLIPTokenizer`] to tokenize text. - unet ([`UNet2DConditionModel`]): - A [`UNet2DConditionModel`] used to create a UNetMotionModel to denoise the encoded video latents. - motion_adapter ([`MotionAdapter`]): - A [`MotionAdapter`] to be used in combination with `unet` to denoise the encoded video latents. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of - [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. 
- """ - - model_cpu_offload_seq = "text_encoder->unet->vae" - _optional_components = ["feature_extractor", "image_encoder"] - _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"] - - def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - motion_adapter: MotionAdapter, - controlnet: Union[ControlNetModel, MultiControlNetModel], - scheduler: Union[ - DDIMScheduler, - PNDMScheduler, - LMSDiscreteScheduler, - EulerDiscreteScheduler, - EulerAncestralDiscreteScheduler, - DPMSolverMultistepScheduler, - ], - feature_extractor: Optional[CLIPImageProcessor] = None, - image_encoder: Optional[CLIPVisionModelWithProjection] = None, - ): - super().__init__() - unet = UNetMotionModel.from_unet2d(unet, motion_adapter) - - self.register_modules( - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - unet=unet, - motion_adapter=motion_adapter, - controlnet=controlnet, - scheduler=scheduler, - feature_extractor=feature_extractor, - image_encoder=image_encoder, - ) - self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) - self.control_image_processor = VaeImageProcessor( - vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False - ) - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt with num_images_per_prompt -> num_videos_per_prompt - def encode_prompt( - self, - prompt, - device, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - lora_scale: Optional[float] = None, - clip_skip: Optional[int] = None, - ): - r""" - Encodes the prompt into text encoder hidden states. - - Args: - prompt (`str` or `List[str]`, *optional*): - prompt to be encoded - device: (`torch.device`): - torch device - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - lora_scale (`float`, *optional*): - A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. - clip_skip (`int`, *optional*): - Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that - the output of the pre-final layer will be used for computing the prompt embeddings. 
- """ - # set lora scale so that monkey patched LoRA - # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): - self._lora_scale = lora_scale - - # dynamically adjust the LoRA scale - if not USE_PEFT_BACKEND: - adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) - else: - scale_lora_layers(self.text_encoder, lora_scale) - - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if prompt_embeds is None: - # textual inversion: procecss multi-vector tokens if necessary - if isinstance(self, TextualInversionLoaderMixin): - prompt = self.maybe_convert_prompt(prompt, self.tokenizer) - - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="pt", - ) - text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( - text_input_ids, untruncated_ids - ): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] - ) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}" - ) - - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = text_inputs.attention_mask.to(device) - else: - attention_mask = None - - if clip_skip is None: - prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask) - prompt_embeds = prompt_embeds[0] - else: - prompt_embeds = self.text_encoder( - text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True - ) - # Access the `hidden_states` first, that contains a tuple of - # all the hidden states from the encoder layers. Then index into - # the tuple to access the hidden states from the desired layer. - prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)] - # We also need to apply the final LayerNorm here to not mess with the - # representations. The `last_hidden_states` that we typically use for - # obtaining the final prompt representations passes through the LayerNorm - # layer. 
- prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) - - if self.text_encoder is not None: - prompt_embeds_dtype = self.text_encoder.dtype - elif self.unet is not None: - prompt_embeds_dtype = self.unet.dtype - else: - prompt_embeds_dtype = prompt_embeds.dtype - - prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) - - bs_embed, seq_len, _ = prompt_embeds.shape - # duplicate text embeddings for each generation per prompt, using mps friendly method - prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) - prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) - - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance and negative_prompt_embeds is None: - uncond_tokens: List[str] - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif prompt is not None and type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." - ) - else: - uncond_tokens = negative_prompt - - # textual inversion: procecss multi-vector tokens if necessary - if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) - - max_length = prompt_embeds.shape[1] - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="pt", - ) - - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = uncond_input.attention_mask.to(device) - else: - attention_mask = None - - negative_prompt_embeds = self.text_encoder( - uncond_input.input_ids.to(device), - attention_mask=attention_mask, - ) - negative_prompt_embeds = negative_prompt_embeds[0] - - if do_classifier_free_guidance: - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = negative_prompt_embeds.shape[1] - - negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) - - negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) - negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) - - if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: - # Retrieve the original scale by scaling back the LoRA layers - unscale_lora_layers(self.text_encoder, lora_scale) - - return prompt_embeds, negative_prompt_embeds - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image - def encode_image(self, image, device, num_images_per_prompt): - dtype = next(self.image_encoder.parameters()).dtype - - if not isinstance(image, torch.Tensor): - image = self.feature_extractor(image, return_tensors="pt").pixel_values - - image = image.to(device=device, dtype=dtype) - image_embeds = self.image_encoder(image).image_embeds - image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) - - uncond_image_embeds = torch.zeros_like(image_embeds) - 
return image_embeds, uncond_image_embeds - - # Copied from diffusers.pipelines.text_to_video_synthesis/pipeline_text_to_video_synth.TextToVideoSDPipeline.decode_latents - def decode_latents(self, latents): - latents = 1 / self.vae.config.scaling_factor * latents - - batch_size, channels, num_frames, height, width = latents.shape - latents = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width) - - image = self.vae.decode(latents).sample - video = ( - image[None, :] - .reshape( - ( - batch_size, - num_frames, - -1, - ) - + image.shape[2:] - ) - .permute(0, 2, 1, 3, 4) - ) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - video = video.float() - return video - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling - def enable_vae_tiling(self): - r""" - Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to - compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow - processing larger images. - """ - self.vae.enable_tiling() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling - def disable_vae_tiling(self): - r""" - Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_tiling() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu - def enable_freeu(self, s1: float, s2: float, b1: float, b2: float): - r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497. - - The suffixes after the scaling factors represent the stages where they are being applied. - - Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values - that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. - - Args: - s1 (`float`): - Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - s2 (`float`): - Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. - b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. 
- """ - if not hasattr(self, "unet"): - raise ValueError("The pipeline must have `unet` for using FreeU.") - self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2) - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu - def disable_freeu(self): - """Disables the FreeU mechanism if enabled.""" - self.unet.disable_freeu() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs - def prepare_extra_step_kwargs(self, generator, eta): - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. - # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # check if the scheduler accepts generator - accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_generator: - extra_step_kwargs["generator"] = generator - return extra_step_kwargs - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs - def check_inputs( - self, - prompt, - height, - width, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, - callback_on_step_end_tensor_inputs=None, - image=None, - controlnet_conditioning_scale=1.0, - control_guidance_start=0.0, - control_guidance_end=1.0, - ): - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - - if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - if callback_on_step_end_tensor_inputs is not None and not all( - k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs - ): - raise ValueError( - f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" - ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." 
- ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." - ) - - # `prompt` needs more sophisticated handling when there are multiple - # conditionings. - if isinstance(self.controlnet, MultiControlNetModel): - if isinstance(prompt, list): - logger.warning( - f"You have {len(self.controlnet.nets)} ControlNets and you have passed {len(prompt)}" - " prompts. The conditionings will be fixed across the prompts." - ) - - # Check `image` - is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance( - self.controlnet, torch._dynamo.eval_frame.OptimizedModule - ) - if ( - isinstance(self.controlnet, ControlNetModel) - or is_compiled - and isinstance(self.controlnet._orig_mod, ControlNetModel) - ): - if isinstance(image, list): - for image_ in image: - self.check_image(image_, prompt, prompt_embeds) - else: - self.check_image(image, prompt, prompt_embeds) - elif ( - isinstance(self.controlnet, MultiControlNetModel) - or is_compiled - and isinstance(self.controlnet._orig_mod, MultiControlNetModel) - ): - if not isinstance(image, list): - raise TypeError("For multiple controlnets: `image` must be type `list`") - - # When `image` is a nested list: - # (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]]) - elif any(isinstance(i, list) for i in image): - raise ValueError("A single batch of multiple conditionings are supported at the moment.") - elif len(image) != len(self.controlnet.nets): - raise ValueError( - f"For multiple controlnets: `image` must have the same length as the number of controlnets, but got {len(image)} images and {len(self.controlnet.nets)} ControlNets." - ) - - for control_ in image: - for image_ in control_: - self.check_image(image_, prompt, prompt_embeds) - else: - assert False - - # Check `controlnet_conditioning_scale` - if ( - isinstance(self.controlnet, ControlNetModel) - or is_compiled - and isinstance(self.controlnet._orig_mod, ControlNetModel) - ): - if not isinstance(controlnet_conditioning_scale, float): - raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.") - elif ( - isinstance(self.controlnet, MultiControlNetModel) - or is_compiled - and isinstance(self.controlnet._orig_mod, MultiControlNetModel) - ): - if isinstance(controlnet_conditioning_scale, list): - if any(isinstance(i, list) for i in controlnet_conditioning_scale): - raise ValueError("A single batch of multiple conditionings are supported at the moment.") - elif isinstance(controlnet_conditioning_scale, list) and len(controlnet_conditioning_scale) != len( - self.controlnet.nets - ): - raise ValueError( - "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have" - " the same length as the number of controlnets" - ) - else: - assert False - - if not isinstance(control_guidance_start, (tuple, list)): - control_guidance_start = [control_guidance_start] - - if not isinstance(control_guidance_end, (tuple, list)): - control_guidance_end = [control_guidance_end] - - if len(control_guidance_start) != len(control_guidance_end): - raise ValueError( - f"`control_guidance_start` has {len(control_guidance_start)} elements, but `control_guidance_end` has {len(control_guidance_end)} elements. 
Make sure to provide the same number of elements to each list." - ) - - if isinstance(self.controlnet, MultiControlNetModel): - if len(control_guidance_start) != len(self.controlnet.nets): - raise ValueError( - f"`control_guidance_start`: {control_guidance_start} has {len(control_guidance_start)} elements but there are {len(self.controlnet.nets)} controlnets available. Make sure to provide {len(self.controlnet.nets)}." - ) - - for start, end in zip(control_guidance_start, control_guidance_end): - if start >= end: - raise ValueError( - f"control guidance start: {start} cannot be larger or equal to control guidance end: {end}." - ) - if start < 0.0: - raise ValueError(f"control guidance start: {start} can't be smaller than 0.") - if end > 1.0: - raise ValueError(f"control guidance end: {end} can't be larger than 1.0.") - - # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.check_image - def check_image(self, image, prompt, prompt_embeds): - image_is_pil = isinstance(image, Image.Image) - image_is_tensor = isinstance(image, torch.Tensor) - image_is_np = isinstance(image, np.ndarray) - image_is_pil_list = isinstance(image, list) and isinstance(image[0], Image.Image) - image_is_tensor_list = isinstance(image, list) and isinstance(image[0], torch.Tensor) - image_is_np_list = isinstance(image, list) and isinstance(image[0], np.ndarray) - - if ( - not image_is_pil - and not image_is_tensor - and not image_is_np - and not image_is_pil_list - and not image_is_tensor_list - and not image_is_np_list - ): - raise TypeError( - f"image must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images, list of numpy arrays or list of torch tensors, but is {type(image)}" - ) - - if image_is_pil: - image_batch_size = 1 - else: - image_batch_size = len(image) - - if prompt is not None and isinstance(prompt, str): - prompt_batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - prompt_batch_size = len(prompt) - elif prompt_embeds is not None: - prompt_batch_size = prompt_embeds.shape[0] - - if image_batch_size != 1 and image_batch_size != prompt_batch_size: - raise ValueError( - f"If image batch size is not 1, image batch size must be same as prompt batch size. image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}" - ) - - # Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_synth.TextToVideoSDPipeline.prepare_latents - def prepare_latents( - self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None - ): - shape = ( - batch_size, - num_channels_latents, - num_frames, - height // self.vae_scale_factor, - width // self.vae_scale_factor, - ) - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
- ) - - if latents is None: - latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) - else: - latents = latents.to(device) - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - return latents - - # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.prepare_image - def prepare_image( - self, - image, - width, - height, - batch_size, - num_images_per_prompt, - device, - dtype, - do_classifier_free_guidance=False, - guess_mode=False, - ): - image = self.control_image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32) - image_batch_size = image.shape[0] - - if image_batch_size == 1: - repeat_by = batch_size - else: - # image batch size is the same as prompt batch size - repeat_by = num_images_per_prompt - - image = image.repeat_interleave(repeat_by, dim=0) - - image = image.to(device=device, dtype=dtype) - - if do_classifier_free_guidance and not guess_mode: - image = torch.cat([image] * 2) - - return image - - @property - def guidance_scale(self): - return self._guidance_scale - - @property - def clip_skip(self): - return self._clip_skip - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - @property - def do_classifier_free_guidance(self): - return self._guidance_scale > 1 - - @property - def cross_attention_kwargs(self): - return self._cross_attention_kwargs - - @property - def num_timesteps(self): - return self._num_timesteps - - @torch.no_grad() - def __call__( - self, - prompt: Union[str, List[str]] = None, - num_frames: Optional[int] = 16, - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_videos_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - ip_adapter_image: Optional[PipelineImageInput] = None, - conditioning_frames: Optional[List[PipelineImageInput]] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - controlnet_conditioning_scale: Union[float, List[float]] = 1.0, - guess_mode: bool = False, - control_guidance_start: Union[float, List[float]] = 0.0, - control_guidance_end: Union[float, List[float]] = 1.0, - clip_skip: Optional[int] = None, - callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, - callback_on_step_end_tensor_inputs: List[str] = ["latents"], - **kwargs, - ): - r""" - The call function to the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): - The height in pixels of the generated video. - width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): - The width in pixels of the generated video. 
- num_frames (`int`, *optional*, defaults to 16): - The number of video frames that are generated. Defaults to 16 frames which at 8 frames per seconds - amounts to 2 seconds of video. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality videos at the - expense of slower inference. - guidance_scale (`float`, *optional*, defaults to 7.5): - A higher guidance scale value encourages the model to generate images closely linked to the text - `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide what to not include in image generation. If not defined, you need to - pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies - to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. - generator (`torch.Generator` or `List[torch.Generator]`, *optional*): - A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make - generation deterministic. - latents (`torch.FloatTensor`, *optional*): - Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor is generated by sampling using the supplied random `generator`. Latents should be of shape - `(batch_size, num_channel, num_frames, height, width)`. - prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not - provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If - not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. - ip_adapter_image (`PipelineImageInput`, *optional*): - Optional image input to work with IP Adapters. - conditioning_frames (`List[PipelineImageInput]`, *optional*): - The ControlNet input condition to provide guidance to the `unet` for generation. If multiple ControlNets - are specified, images must be passed as a list such that each element of the list can be correctly - batched for input to a single ControlNet. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or - `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] instead - of a plain tuple. - cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in - [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). - controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0): - The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added - to the residual in the original `unet`. 
If multiple ControlNets are specified in `init`, you can set - the corresponding scale as a list. - guess_mode (`bool`, *optional*, defaults to `False`): - The ControlNet encoder tries to recognize the content of the input image even if you remove all - prompts. A `guidance_scale` value between 3.0 and 5.0 is recommended. - control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0): - The percentage of total steps at which the ControlNet starts applying. - control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0): - The percentage of total steps at which the ControlNet stops applying. - clip_skip (`int`, *optional*): - Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that - the output of the pre-final layer will be used for computing the prompt embeddings. - allback_on_step_end (`Callable`, *optional*): - A function that calls at the end of each denoising steps during the inference. The function is called - with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, - callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by - `callback_on_step_end_tensor_inputs`. - callback_on_step_end_tensor_inputs (`List`, *optional*): - The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list - will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the - `._callback_tensor_inputs` attribute of your pipeine class. - - Examples: - - Returns: - [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] or `tuple`: - If `return_dict` is `True`, [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] is - returned, otherwise a `tuple` is returned where the first element is a list with the generated frames. - """ - - callback = kwargs.pop("callback", None) - callback_steps = kwargs.pop("callback_steps", None) - - if callback is not None: - deprecate( - "callback", - "1.0.0", - "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", - ) - if callback_steps is not None: - deprecate( - "callback_steps", - "1.0.0", - "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", - ) - - controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet - - # align format for control guidance - if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list): - control_guidance_start = len(control_guidance_end) * [control_guidance_start] - elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list): - control_guidance_end = len(control_guidance_start) * [control_guidance_end] - elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list): - mult = len(controlnet.nets) if isinstance(controlnet, MultiControlNetModel) else 1 - control_guidance_start, control_guidance_end = ( - mult * [control_guidance_start], - mult * [control_guidance_end], - ) - - # 0. Default height and width to unet - height = height or self.unet.config.sample_size * self.vae_scale_factor - width = width or self.unet.config.sample_size * self.vae_scale_factor - - num_videos_per_prompt = 1 - - # 1. Check inputs. 
Raise error if not correct - self.check_inputs( - prompt=prompt, - height=height, - width=width, - callback_steps=callback_steps, - negative_prompt=negative_prompt, - callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - image=conditioning_frames, - controlnet_conditioning_scale=controlnet_conditioning_scale, - control_guidance_start=control_guidance_start, - control_guidance_end=control_guidance_end, - ) - - self._guidance_scale = guidance_scale - self._clip_skip = clip_skip - self._cross_attention_kwargs = cross_attention_kwargs - - # 2. Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - device = self._execution_device - - if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float): - controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets) - - global_pool_conditions = ( - controlnet.config.global_pool_conditions - if isinstance(controlnet, ControlNetModel) - else controlnet.nets[0].config.global_pool_conditions - ) - guess_mode = guess_mode or global_pool_conditions - - # 3. Encode input prompt - text_encoder_lora_scale = ( - cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None - ) - prompt_embeds, negative_prompt_embeds = self.encode_prompt( - prompt, - device, - num_videos_per_prompt, - self.do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - lora_scale=text_encoder_lora_scale, - clip_skip=self.clip_skip, - ) - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - if self.do_classifier_free_guidance: - prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) - - if ip_adapter_image is not None: - image_embeds, negative_image_embeds = self.encode_image(ip_adapter_image, device, num_videos_per_prompt) - if self.do_classifier_free_guidance: - image_embeds = torch.cat([negative_image_embeds, image_embeds]) - - if isinstance(controlnet, ControlNetModel): - conditioning_frames = self.prepare_image( - image=conditioning_frames, - width=width, - height=height, - batch_size=batch_size * num_videos_per_prompt * num_frames, - num_images_per_prompt=num_videos_per_prompt, - device=device, - dtype=controlnet.dtype, - do_classifier_free_guidance=self.do_classifier_free_guidance, - guess_mode=guess_mode, - ) - elif isinstance(controlnet, MultiControlNetModel): - cond_prepared_frames = [] - for frame_ in conditioning_frames: - prepared_frame = self.prepare_image( - image=frame_, - width=width, - height=height, - batch_size=batch_size * num_videos_per_prompt * num_frames, - num_images_per_prompt=num_videos_per_prompt, - device=device, - dtype=controlnet.dtype, - do_classifier_free_guidance=self.do_classifier_free_guidance, - guess_mode=guess_mode, - ) - - cond_prepared_frames.append(prepared_frame) - - conditioning_frames = cond_prepared_frames - else: - assert False - - # 4. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps, device=device) - timesteps = self.scheduler.timesteps - self._num_timesteps = len(timesteps) - - # 5. 
Prepare latent variables - num_channels_latents = self.unet.config.in_channels - latents = self.prepare_latents( - batch_size * num_videos_per_prompt, - num_channels_latents, - num_frames, - height, - width, - prompt_embeds.dtype, - device, - generator, - latents, - ) - - # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 7. Add image embeds for IP-Adapter - added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None - - # 7.1 Create tensor stating which controlnets to keep - controlnet_keep = [] - for i in range(len(timesteps)): - keeps = [ - 1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e) - for s, e in zip(control_guidance_start, control_guidance_end) - ] - controlnet_keep.append(keeps[0] if isinstance(controlnet, ControlNetModel) else keeps) - - # Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - if guess_mode and self.do_classifier_free_guidance: - # Infer ControlNet only for the conditional batch. - control_model_input = latents - control_model_input = self.scheduler.scale_model_input(control_model_input, t) - controlnet_prompt_embeds = prompt_embeds.chunk(2)[1] - else: - control_model_input = latent_model_input - controlnet_prompt_embeds = prompt_embeds - controlnet_prompt_embeds = controlnet_prompt_embeds.repeat_interleave(num_frames, dim=0) - - if isinstance(controlnet_keep[i], list): - cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])] - else: - controlnet_cond_scale = controlnet_conditioning_scale - if isinstance(controlnet_cond_scale, list): - controlnet_cond_scale = controlnet_cond_scale[0] - cond_scale = controlnet_cond_scale * controlnet_keep[i] - - control_model_input = torch.transpose(control_model_input, 1, 2) - control_model_input = control_model_input.reshape( - (-1, control_model_input.shape[2], control_model_input.shape[3], control_model_input.shape[4]) - ) - - down_block_res_samples, mid_block_res_sample = self.controlnet( - control_model_input, - t, - encoder_hidden_states=controlnet_prompt_embeds, - controlnet_cond=conditioning_frames, - conditioning_scale=cond_scale, - guess_mode=guess_mode, - return_dict=False, - ) - - # predict the noise residual - noise_pred = self.unet( - latent_model_input, - t, - encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=self.cross_attention_kwargs, - added_cond_kwargs=added_cond_kwargs, - down_block_additional_residuals=down_block_res_samples, - mid_block_additional_residual=mid_block_res_sample, - ).sample - - # perform guidance - if self.do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - - if callback_on_step_end is not None: - callback_kwargs = {} - for k in callback_on_step_end_tensor_inputs: - callback_kwargs[k] = locals()[k] - callback_outputs = 
callback_on_step_end(self, i, t, callback_kwargs) - - latents = callback_outputs.pop("latents", latents) - prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) - negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - if output_type == "latent": - return AnimateDiffControlNetPipelineOutput(frames=latents) - - # Post-processing - video_tensor = self.decode_latents(latents) - - if output_type == "pt": - video = video_tensor - else: - video = tensor2vid(video_tensor, self.image_processor, output_type=output_type) - - # Offload all models - self.maybe_free_model_hooks() - - if not return_dict: - return (video,) - - return AnimateDiffControlNetPipelineOutput(frames=video) From 8db59d7ca4ccb174093b68c714a0ee2fc04ed833 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 7 Dec 2023 13:57:39 +0530 Subject: [PATCH 86/99] Revert "EulerDiscreteScheduler add `rescale_betas_zero_snr` (#6024)" This reverts commit 3dc2362b5a89380f66ac006b1a787411fa1a9574. --- .../schedulers/scheduling_euler_discrete.py | 56 ------------------- tests/schedulers/test_scheduler_euler.py | 4 -- 2 files changed, 60 deletions(-) diff --git a/src/diffusers/schedulers/scheduling_euler_discrete.py b/src/diffusers/schedulers/scheduling_euler_discrete.py index 802ba0f099f9..0e2dd5c983e3 100644 --- a/src/diffusers/schedulers/scheduling_euler_discrete.py +++ b/src/diffusers/schedulers/scheduling_euler_discrete.py @@ -92,43 +92,6 @@ def alpha_bar_fn(t): return torch.tensor(betas, dtype=torch.float32) -# Copied from diffusers.schedulers.scheduling_ddim.rescale_zero_terminal_snr -def rescale_zero_terminal_snr(betas): - """ - Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1) - - - Args: - betas (`torch.FloatTensor`): - the betas that the scheduler is being initialized with. - - Returns: - `torch.FloatTensor`: rescaled betas with zero terminal SNR - """ - # Convert betas to alphas_bar_sqrt - alphas = 1.0 - betas - alphas_cumprod = torch.cumprod(alphas, dim=0) - alphas_bar_sqrt = alphas_cumprod.sqrt() - - # Store old values. - alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone() - alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone() - - # Shift so the last timestep is zero. - alphas_bar_sqrt -= alphas_bar_sqrt_T - - # Scale so the first timestep is back to the old value. - alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T) - - # Convert alphas_bar_sqrt to betas - alphas_bar = alphas_bar_sqrt**2 # Revert sqrt - alphas = alphas_bar[1:] / alphas_bar[:-1] # Revert cumprod - alphas = torch.cat([alphas_bar[0:1], alphas]) - betas = 1 - alphas - - return betas - - class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin): """ Euler scheduler. @@ -165,10 +128,6 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin): An offset added to the inference steps. You can use a combination of `offset=1` and `set_alpha_to_one=False` to make the last step use step 0 for the previous alpha product like in Stable Diffusion. - rescale_betas_zero_snr (`bool`, defaults to `False`): - Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and - dark samples instead of limiting it to samples with medium brightness. 
Loosely related to - [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506). """ _compatibles = [e.name for e in KarrasDiffusionSchedulers] @@ -190,7 +149,6 @@ def __init__( timestep_spacing: str = "linspace", timestep_type: str = "discrete", # can be "discrete" or "continuous" steps_offset: int = 0, - rescale_betas_zero_snr: bool = False, ): if trained_betas is not None: self.betas = torch.tensor(trained_betas, dtype=torch.float32) @@ -205,17 +163,9 @@ def __init__( else: raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") - if rescale_betas_zero_snr: - self.betas = rescale_zero_terminal_snr(self.betas) - self.alphas = 1.0 - self.betas self.alphas_cumprod = torch.cumprod(self.alphas, dim=0) - if rescale_betas_zero_snr: - # Close to 0 without being 0 so first sigma is not inf - # FP16 smallest positive subnormal works well here - self.alphas_cumprod[-1] = 2**-24 - sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=float)[::-1].copy() @@ -470,9 +420,6 @@ def step( if self.step_index is None: self._init_step_index(timestep) - # Upcast to avoid precision issues when computing prev_sample - sample = sample.to(torch.float32) - sigma = self.sigmas[self.step_index] gamma = min(s_churn / (len(self.sigmas) - 1), 2**0.5 - 1) if s_tmin <= sigma <= s_tmax else 0.0 @@ -509,9 +456,6 @@ def step( prev_sample = sample + derivative * dt - # Cast sample back to model compatible dtype - prev_sample = prev_sample.to(model_output.dtype) - # upon completion increase step index by one self._step_index += 1 diff --git a/tests/schedulers/test_scheduler_euler.py b/tests/schedulers/test_scheduler_euler.py index 41c418c5064c..3249d7032bad 100644 --- a/tests/schedulers/test_scheduler_euler.py +++ b/tests/schedulers/test_scheduler_euler.py @@ -45,10 +45,6 @@ def test_timestep_type(self): def test_karras_sigmas(self): self.check_over_configs(use_karras_sigmas=True, sigma_min=0.02, sigma_max=700.0) - def test_rescale_betas_zero_snr(self): - for rescale_betas_zero_snr in [True, False]: - self.check_over_configs(rescale_betas_zero_snr=rescale_betas_zero_snr) - def test_full_loop_no_noise(self): scheduler_class = self.scheduler_classes[0] scheduler_config = self.get_scheduler_config() From 4e7fb4d05aa324ac046e4ed63157064d21ed0f8d Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 7 Dec 2023 14:08:33 +0530 Subject: [PATCH 87/99] add SDXL turbo --- benchmarks/base_classes.py | 28 +++++++++++++++++++++++++++ benchmarks/benchmark_sd_img.py | 5 +++-- benchmarks/benchmark_text_to_image.py | 5 +++-- benchmarks/run_all.py | 18 ++++++++++------- 4 files changed, 45 insertions(+), 11 deletions(-) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index f16107fa7224..450befeaeebb 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -41,6 +41,7 @@ "stabilityai/stable-diffusion-2-1": (768, 768), "stabilityai/stable-diffusion-xl-base-1.0": (1024, 1024), "stabilityai/stable-diffusion-xl-refiner-1.0": (1024, 1024), + "stabilityai/sdxl-turbo": (512, 512), } @@ -119,6 +120,19 @@ def benchmark(self, args): flush() +class TurboTextToImageBenchmark(TextToImageBenchmark): + def __init__(self, args): + super().__init__(args) + + def run_inference(self, pipe, args): + _ = pipe( + prompt=PROMPT, + num_inference_steps=args.num_inference_steps, + 
num_images_per_prompt=args.batch_size, + guidance_scale=0.0, + ) + + class ImageToImageBenchmark(TextToImageBenchmark): pipeline_class = AutoPipelineForImage2Image url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/1665_Girl_with_a_Pearl_Earring.jpg" @@ -137,6 +151,20 @@ def run_inference(self, pipe, args): ) +class TurboImageToImageBenchmark(ImageToImageBenchmark): + def __init__(self, args): + super().__init__(args) + + def run_inference(self, pipe, args): + _ = pipe( + prompt=PROMPT, + image=self.image, + num_inference_steps=args.num_inference_steps, + num_images_per_prompt=args.batch_size, + guidance_scale=0.0, + ) + + class InpaintingBenchmark(ImageToImageBenchmark): pipeline_class = AutoPipelineForInpainting mask_url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/overture-creations-5sI6fQgYIuo_mask.png" diff --git a/benchmarks/benchmark_sd_img.py b/benchmarks/benchmark_sd_img.py index 5525b4dae60b..491e7c9a65a9 100644 --- a/benchmarks/benchmark_sd_img.py +++ b/benchmarks/benchmark_sd_img.py @@ -3,7 +3,7 @@ sys.path.append(".") -from base_classes import ImageToImageBenchmark # noqa: E402 +from base_classes import ImageToImageBenchmark, TurboImageToImageBenchmark # noqa: E402 if __name__ == "__main__": @@ -16,6 +16,7 @@ "runwayml/stable-diffusion-v1-5", "stabilityai/stable-diffusion-2-1", "stabilityai/stable-diffusion-xl-refiner-1.0", + "stabilityai/sdxl-turbo", ], ) parser.add_argument("--batch_size", type=int, default=1) @@ -24,5 +25,5 @@ parser.add_argument("--run_compile", action="store_true") args = parser.parse_args() - benchmark_pipe = ImageToImageBenchmark(args) + benchmark_pipe = ImageToImageBenchmark(args) if "turbo" not in args.ckpt else TurboImageToImageBenchmark(args) benchmark_pipe.benchmark(args) diff --git a/benchmarks/benchmark_text_to_image.py b/benchmarks/benchmark_text_to_image.py index 50c04dd550c9..6d01ac558535 100644 --- a/benchmarks/benchmark_text_to_image.py +++ b/benchmarks/benchmark_text_to_image.py @@ -3,7 +3,7 @@ sys.path.append(".") -from base_classes import TextToImageBenchmark # noqa: E402 +from base_classes import TextToImageBenchmark, TurboTextToImageBenchmark # noqa: E402 if __name__ == "__main__": @@ -18,6 +18,7 @@ "stabilityai/stable-diffusion-xl-base-1.0", "kandinsky-community/kandinsky-2-2-decoder", "warp-ai/wuerstchen", + "stabilityai/sdxl-turbo", ], ) parser.add_argument("--batch_size", type=int, default=1) @@ -26,5 +27,5 @@ parser.add_argument("--run_compile", action="store_true") args = parser.parse_args() - benchmark_pipe = TextToImageBenchmark(args) + benchmark_pipe = TextToImageBenchmark(args) if "turbo" not in args.ckpt else TurboTextToImageBenchmark(args) benchmark_pipe.benchmark(args) diff --git a/benchmarks/run_all.py b/benchmarks/run_all.py index 913a24d98d12..2da592224766 100644 --- a/benchmarks/run_all.py +++ b/benchmarks/run_all.py @@ -55,13 +55,17 @@ def main(): command += " --run_compile" run_command(command.split()) - elif file in ["benchmark_sd_img.py", "benchmark_sd_inpainting.py"]: - sdxl_ckpt = ( - "stabilityai/stable-diffusion-xl-refiner-1.0" - if "inpainting" not in file - else "stabilityai/stable-diffusion-xl-base-1.0" - ) - command = f"python {file} --ckpt {sdxl_ckpt}" + elif file == "benchmark_sd_img.py": + for ckpt in ["stabilityai/stable-diffusion-xl-refiner-1.0", "stabilityai/sdxl-turbo"]: + command = f"python {file} --ckpt {ckpt}" + run_command(command.split()) + + command += " --run_compile" + run_command(command.split()) + + elif file == 
"benchmark_sd_inpainting.py": + sdxl_ckpt = "stabilityai/stable-diffusion-xl-base-1.0" + command = f"python {file} --ckpt {ckpt}" run_command(command.split()) command += " --run_compile" From e2df761178d6c4547ca0696ddfe61754f6ab7149 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 7 Dec 2023 16:36:18 +0530 Subject: [PATCH 88/99] add lcm lora to the mix as well. --- .github/workflows/benchmark.yml | 2 +- benchmarks/base_classes.py | 19 +++++++++++++++++++ benchmarks/benchmark_t2i_lcm_lora.py | 23 +++++++++++++++++++++++ benchmarks/benchmark_text_to_image.py | 27 ++++++++++++++++++--------- benchmarks/run_all.py | 25 ++++++++++++++++--------- 5 files changed, 77 insertions(+), 19 deletions(-) create mode 100644 benchmarks/benchmark_t2i_lcm_lora.py diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 3190091ea3c8..7713e0aef111 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -36,7 +36,7 @@ jobs: run: | apt-get update && apt-get install libsndfile1-dev libgl1 -y python -m pip install -e .[quality,test] - python -m pip install pandas + python -m pip install peft pandas - name: Environment run: | python utils/print_env.py diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 450befeaeebb..6d0da4a98450 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -8,6 +8,7 @@ AutoPipelineForInpainting, AutoPipelineForText2Image, ControlNetModel, + LCMScheduler, StableDiffusionAdapterPipeline, StableDiffusionControlNetPipeline, StableDiffusionXLAdapterPipeline, @@ -165,6 +166,24 @@ def run_inference(self, pipe, args): ) +class LCMLoRATextToImageBenchmark(TextToImageBenchmark): + lora_id = "latent-consistency/lcm-lora-sdxl" + + def __init__(self, args): + super().__init__(args) + self.pipe.load_lora_weights(self.lora_id) + self.pipe.scheduler = LCMScheduler.from_config(self.pipe.scheduler.config) + + def run_inference(self, pipe, args): + _ = pipe( + prompt=PROMPT, + image=self.image, + num_inference_steps=args.num_inference_steps, + num_images_per_prompt=args.batch_size, + guidance_scale=1.0, + ) + + class InpaintingBenchmark(ImageToImageBenchmark): pipeline_class = AutoPipelineForInpainting mask_url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/overture-creations-5sI6fQgYIuo_mask.png" diff --git a/benchmarks/benchmark_t2i_lcm_lora.py b/benchmarks/benchmark_t2i_lcm_lora.py new file mode 100644 index 000000000000..e68a6213fe5c --- /dev/null +++ b/benchmarks/benchmark_t2i_lcm_lora.py @@ -0,0 +1,23 @@ +import argparse +import sys + + +sys.path.append(".") +from base_classes import LCMLoRATextToImageBenchmark # noqa: E402 + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--ckpt", + type=str, + default="stabilityai/stable-diffusion-xl-base-1.0", + ) + parser.add_argument("--batch_size", type=int, default=1) + parser.add_argument("--num_inference_steps", type=int, default=3) + parser.add_argument("--model_cpu_offload", action="store_true") + parser.add_argument("--run_compile", action="store_true") + args = parser.parse_args() + + benchmark_pipe = LCMLoRATextToImageBenchmark(args) + benchmark_pipe.benchmark(args) diff --git a/benchmarks/benchmark_text_to_image.py b/benchmarks/benchmark_text_to_image.py index 6d01ac558535..caa97b0c5e3b 100644 --- a/benchmarks/benchmark_text_to_image.py +++ b/benchmarks/benchmark_text_to_image.py @@ -6,20 +6,23 @@ from base_classes import TextToImageBenchmark, TurboTextToImageBenchmark 
# noqa: E402 +ALL_T2I_CKPTS = [ + "runwayml/stable-diffusion-v1-5", + "segmind/SSD-1B", + "stabilityai/stable-diffusion-xl-base-1.0", + "kandinsky-community/kandinsky-2-2-decoder", + "warp-ai/wuerstchen", + "stabilityai/sdxl-turbo", +] + + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( "--ckpt", type=str, default="runwayml/stable-diffusion-v1-5", - choices=[ - "runwayml/stable-diffusion-v1-5", - "segmind/SSD-1B", - "stabilityai/stable-diffusion-xl-base-1.0", - "kandinsky-community/kandinsky-2-2-decoder", - "warp-ai/wuerstchen", - "stabilityai/sdxl-turbo", - ], + choices=ALL_T2I_CKPTS, ) parser.add_argument("--batch_size", type=int, default=1) parser.add_argument("--num_inference_steps", type=int, default=50) @@ -27,5 +30,11 @@ parser.add_argument("--run_compile", action="store_true") args = parser.parse_args() - benchmark_pipe = TextToImageBenchmark(args) if "turbo" not in args.ckpt else TurboTextToImageBenchmark(args) + benchmark_cls = None + if "turbo" in args.ckpt: + benchmark_cls = TurboTextToImageBenchmark + else: + benchmark_cls = TextToImageBenchmark + + benchmark_pipe = benchmark_cls(args) benchmark_pipe.benchmark(args) diff --git a/benchmarks/run_all.py b/benchmarks/run_all.py index 2da592224766..685f49e2d1c6 100644 --- a/benchmarks/run_all.py +++ b/benchmarks/run_all.py @@ -1,8 +1,13 @@ import glob import subprocess +import sys from typing import List +sys.path.append(".") +from benchmark_text_to_image import ALL_T2I_CKPTS # noqa: E402 + + PATTERN = "benchmark_*.py" @@ -34,6 +39,7 @@ def main(): for file in python_files: print(f"****** Running file: {file} ******") + # Run with canonical settings. if file != "benchmark_text_to_image.py": command = f"python {file}" run_command(command.split()) @@ -42,14 +48,12 @@ def main(): run_command(command.split()) if file == "benchmark_text_to_image.py": - for ckpt in [ - "runwayml/stable-diffusion-v1-5", - "segmind/SSD-1B", - "stabilityai/stable-diffusion-xl-base-1.0", - "kandinsky-community/kandinsky-2-2-decoder", - "warp-ai/wuerstchen", - ]: + for ckpt in ALL_T2I_CKPTS: command = f"python {file} --ckpt {ckpt}" + + if "turbo" in ckpt: + command += "--num_inference_steps 1" + run_command(command.split()) command += " --run_compile" @@ -58,14 +62,17 @@ def main(): elif file == "benchmark_sd_img.py": for ckpt in ["stabilityai/stable-diffusion-xl-refiner-1.0", "stabilityai/sdxl-turbo"]: command = f"python {file} --ckpt {ckpt}" - run_command(command.split()) + if ckpt == "stabilityai/sdxl-turbo": + command += "--num_inference_steps 1" + + run_command(command.split()) command += " --run_compile" run_command(command.split()) elif file == "benchmark_sd_inpainting.py": sdxl_ckpt = "stabilityai/stable-diffusion-xl-base-1.0" - command = f"python {file} --ckpt {ckpt}" + command = f"python {file} --ckpt {sdxl_ckpt}" run_command(command.split()) command += " --run_compile" From 2588853787854870920086c75106f555c8d29954 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 8 Dec 2023 00:07:55 +0530 Subject: [PATCH 89/99] fix --- benchmarks/benchmark_t2i_lcm_lora.py | 2 +- benchmarks/run_all.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/benchmark_t2i_lcm_lora.py b/benchmarks/benchmark_t2i_lcm_lora.py index e68a6213fe5c..957e0a463e28 100644 --- a/benchmarks/benchmark_t2i_lcm_lora.py +++ b/benchmarks/benchmark_t2i_lcm_lora.py @@ -14,7 +14,7 @@ default="stabilityai/stable-diffusion-xl-base-1.0", ) parser.add_argument("--batch_size", type=int, default=1) - 
parser.add_argument("--num_inference_steps", type=int, default=3) + parser.add_argument("--num_inference_steps", type=int, default=4) parser.add_argument("--model_cpu_offload", action="store_true") parser.add_argument("--run_compile", action="store_true") args = parser.parse_args() diff --git a/benchmarks/run_all.py b/benchmarks/run_all.py index 685f49e2d1c6..003f3c4a8a01 100644 --- a/benchmarks/run_all.py +++ b/benchmarks/run_all.py @@ -52,7 +52,7 @@ def main(): command = f"python {file} --ckpt {ckpt}" if "turbo" in ckpt: - command += "--num_inference_steps 1" + command += " --num_inference_steps 1" run_command(command.split()) @@ -64,7 +64,7 @@ def main(): command = f"python {file} --ckpt {ckpt}" if ckpt == "stabilityai/sdxl-turbo": - command += "--num_inference_steps 1" + command += " --num_inference_steps 1" run_command(command.split()) command += " --run_compile" From a7fd2c345fc73c95e518d7598d2c05f54d037a9e Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 8 Dec 2023 00:44:29 +0530 Subject: [PATCH 90/99] increase steps to 2 when running turbo i2i --- benchmarks/run_all.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/run_all.py b/benchmarks/run_all.py index 003f3c4a8a01..af78bd738376 100644 --- a/benchmarks/run_all.py +++ b/benchmarks/run_all.py @@ -64,7 +64,7 @@ def main(): command = f"python {file} --ckpt {ckpt}" if ckpt == "stabilityai/sdxl-turbo": - command += " --num_inference_steps 1" + command += " --num_inference_steps 2" run_command(command.split()) command += " --run_compile" From b878a29fad0d1898183d527d728d0826c829ebf3 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 8 Dec 2023 08:25:44 +0530 Subject: [PATCH 91/99] debug --- benchmarks/base_classes.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 6d0da4a98450..979fd211c5c0 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -153,10 +153,13 @@ def run_inference(self, pipe, args): class TurboImageToImageBenchmark(ImageToImageBenchmark): + image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cat.png") + def __init__(self, args): super().__init__(args) def run_inference(self, pipe, args): + print(f"Image size: {self.image.size}") _ = pipe( prompt=PROMPT, image=self.image, From 1389d0e922ba0362c1430ff124037e8126e6cc34 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 8 Dec 2023 08:27:52 +0530 Subject: [PATCH 92/99] debug --- benchmarks/base_classes.py | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 979fd211c5c0..f954f03d809a 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -157,6 +157,7 @@ class TurboImageToImageBenchmark(ImageToImageBenchmark): def __init__(self, args): super().__init__(args) + self.pipe = AutoPipelineForImage2Image.from_pretrained(args.ckpt, torch_dtype=torch.float16).to("cuda") def run_inference(self, pipe, args): print(f"Image size: {self.image.size}") From b2d35be32b18291a66baf95a81010ee0135d9384 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 8 Dec 2023 08:29:49 +0530 Subject: [PATCH 93/99] debug --- benchmarks/base_classes.py | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index f954f03d809a..32a02d7cd5d1 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -167,6 +167,7 @@ def run_inference(self, pipe, args): 
num_inference_steps=args.num_inference_steps, num_images_per_prompt=args.batch_size, guidance_scale=0.0, + strength=0.5 ) From d78609d0f7cdc1c2ebd1c0440ccd469ac5c1abef Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 8 Dec 2023 08:32:15 +0530 Subject: [PATCH 94/99] fix for good --- benchmarks/base_classes.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 32a02d7cd5d1..7203e67e285e 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -104,7 +104,7 @@ def run_inference(self, pipe, args): def benchmark(self, args): flush() - print(f"Running benchmark with: {vars(args)}\n") + print(f"[INFO] {self.pipe.__class__.__name__}: Running benchmark with: {vars(args)}\n") time = benchmark_fn(self.run_inference, self.pipe, args) # in seconds. memory = bytes_to_giga_bytes(torch.cuda.max_memory_allocated()) # in GBs. @@ -153,21 +153,17 @@ def run_inference(self, pipe, args): class TurboImageToImageBenchmark(ImageToImageBenchmark): - image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cat.png") - def __init__(self, args): super().__init__(args) - self.pipe = AutoPipelineForImage2Image.from_pretrained(args.ckpt, torch_dtype=torch.float16).to("cuda") def run_inference(self, pipe, args): - print(f"Image size: {self.image.size}") _ = pipe( prompt=PROMPT, image=self.image, num_inference_steps=args.num_inference_steps, num_images_per_prompt=args.batch_size, guidance_scale=0.0, - strength=0.5 + strength=0.5, ) From b3897f8572d0e3c85f114406b977ac15654b373b Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 8 Dec 2023 10:06:53 +0530 Subject: [PATCH 95/99] fix and isolate better --- benchmarks/base_classes.py | 1 - benchmarks/run_all.py | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 7203e67e285e..86b7d73e0f98 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -178,7 +178,6 @@ def __init__(self, args): def run_inference(self, pipe, args): _ = pipe( prompt=PROMPT, - image=self.image, num_inference_steps=args.num_inference_steps, num_images_per_prompt=args.batch_size, guidance_scale=1.0, diff --git a/benchmarks/run_all.py b/benchmarks/run_all.py index af78bd738376..c70fb2227383 100644 --- a/benchmarks/run_all.py +++ b/benchmarks/run_all.py @@ -47,6 +47,8 @@ def main(): command += " --run_compile" run_command(command.split()) + # Run variants. 
+ for file in python_files: if file == "benchmark_text_to_image.py": for ckpt in ALL_T2I_CKPTS: command = f"python {file} --ckpt {ckpt}" From 8289baa8690811d614d9800be00e537941ead355 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 8 Dec 2023 16:02:52 +0530 Subject: [PATCH 96/99] fuse lora so that torch compile works with peft --- benchmarks/base_classes.py | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 86b7d73e0f98..3593c2d071b2 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -173,6 +173,7 @@ class LCMLoRATextToImageBenchmark(TextToImageBenchmark): def __init__(self, args): super().__init__(args) self.pipe.load_lora_weights(self.lora_id) + self.pipe.fuse_lora() self.pipe.scheduler = LCMScheduler.from_config(self.pipe.scheduler.config) def run_inference(self, pipe, args): From dd54366c8a6e304a591e03bb9d0212fb1d2479a8 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 8 Dec 2023 16:31:02 +0530 Subject: [PATCH 97/99] fix: LCMLoRA --- .github/workflows/benchmark.yml | 2 +- benchmarks/base_classes.py | 32 ++++++++++++++++---------------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 7713e0aef111..3190091ea3c8 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -36,7 +36,7 @@ jobs: run: | apt-get update && apt-get install libsndfile1-dev libgl1 -y python -m pip install -e .[quality,test] - python -m pip install peft pandas + python -m pip install pandas - name: Environment run: | python utils/print_env.py diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 3593c2d071b2..1e9529db4e96 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -134,27 +134,32 @@ def run_inference(self, pipe, args): ) -class ImageToImageBenchmark(TextToImageBenchmark): - pipeline_class = AutoPipelineForImage2Image - url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/1665_Girl_with_a_Pearl_Earring.jpg" - image = load_image(url).convert("RGB") +class LCMLoRATextToImageBenchmark(TextToImageBenchmark): + lora_id = "latent-consistency/lcm-lora-sdxl" def __init__(self, args): super().__init__(args) - self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt]) + self.pipe.load_lora_weights(self.lora_id) + self.pipe.fuse_lora() + self.pipe.scheduler = LCMScheduler.from_config(self.pipe.scheduler.config) def run_inference(self, pipe, args): _ = pipe( prompt=PROMPT, - image=self.image, num_inference_steps=args.num_inference_steps, num_images_per_prompt=args.batch_size, + guidance_scale=1.0, ) -class TurboImageToImageBenchmark(ImageToImageBenchmark): +class ImageToImageBenchmark(TextToImageBenchmark): + pipeline_class = AutoPipelineForImage2Image + url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/1665_Girl_with_a_Pearl_Earring.jpg" + image = load_image(url).convert("RGB") + def __init__(self, args): super().__init__(args) + self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt]) def run_inference(self, pipe, args): _ = pipe( @@ -162,26 +167,21 @@ def run_inference(self, pipe, args): image=self.image, num_inference_steps=args.num_inference_steps, num_images_per_prompt=args.batch_size, - guidance_scale=0.0, - strength=0.5, ) -class LCMLoRATextToImageBenchmark(TextToImageBenchmark): - lora_id = "latent-consistency/lcm-lora-sdxl" - +class TurboImageToImageBenchmark(ImageToImageBenchmark): def 
__init__(self, args): super().__init__(args) - self.pipe.load_lora_weights(self.lora_id) - self.pipe.fuse_lora() - self.pipe.scheduler = LCMScheduler.from_config(self.pipe.scheduler.config) def run_inference(self, pipe, args): _ = pipe( prompt=PROMPT, + image=self.image, num_inference_steps=args.num_inference_steps, num_images_per_prompt=args.batch_size, - guidance_scale=1.0, + guidance_scale=0.0, + strength=0.5, ) From 51acace44f912bbb1325ab3c8f48e96c91087442 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 8 Dec 2023 17:36:28 +0530 Subject: [PATCH 98/99] better identification for LCM --- benchmarks/base_classes.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 1e9529db4e96..5d328f62b904 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -143,6 +143,17 @@ def __init__(self, args): self.pipe.fuse_lora() self.pipe.scheduler = LCMScheduler.from_config(self.pipe.scheduler.config) + def get_result_filepath(self, args): + pipeline_class_name = str(self.pipe.__class__.__name__) + name = ( + self.lora_id.replace("/", "_") + + "_" + + pipeline_class_name + + f"-bs@{args.batch_size}-steps@{args.num_inference_steps}-mco@{args.model_cpu_offload}-compile@{args.run_compile}.csv" + ) + filepath = os.path.join(BASE_PATH, name) + return filepath + def run_inference(self, pipe, args): _ = pipe( prompt=PROMPT, From 80e83110bed0636760e45ad2277095c1d5039a3c Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Sat, 9 Dec 2023 09:40:23 +0530 Subject: [PATCH 99/99] change to cron job --- .github/workflows/benchmark.yml | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 3190091ea3c8..c4c3c101dbfd 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -1,12 +1,8 @@ name: Benchmarking tests on: - pull_request: - branches: - - main - push: - branches: - - ci-* + schedule: + - cron: "30 1 1,15 * *" # every 2 weeks on the 1st and the 15th of every month at 1:30 AM env: DIFFUSERS_IS_CI: yes @@ -42,7 +38,7 @@ jobs: python utils/print_env.py - name: Diffusers Benchmarking env: - HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + HUGGING_FACE_HUB_TOKEN: ${{ secrets.DIFFUSERS_BOT_TOKEN }} BASE_PATH: benchmark_outputs run: | export TOTAL_GPU_MEMORY=$(python -c "import torch; print(torch.cuda.get_device_properties(0).total_memory / (1024**3))")