From 4c185f0c9efc1e00fbd7b0f482c64339a03b86a0 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 17 Nov 2023 11:50:14 +0530 Subject: [PATCH 01/99] add poc for benchmarking workflow. --- .github/workflows/benchmark.yml | 61 +++++++++++++++++++++++++++++++++ benchmarks/benchmark_sd.py | 60 ++++++++++++++++++++++++++++++++ benchmarks/benchmark_utils.py | 46 +++++++++++++++++++++++++ 3 files changed, 167 insertions(+) create mode 100644 .github/workflows/benchmark.yml create mode 100644 benchmarks/benchmark_sd.py create mode 100644 benchmarks/benchmark_utils.py diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml new file mode 100644 index 000000000000..20dd69a47ce7 --- /dev/null +++ b/.github/workflows/benchmark.yml @@ -0,0 +1,61 @@ +name: Benchmarking tests + +on: + pull_request: + branches: + - main + push: + branches: + - ci-* + +env: + DIFFUSERS_IS_CI: yes + HF_HOME: /mnt/cache + OMP_NUM_THREADS: 8 + MKL_NUM_THREADS: 8 + PYTEST_TIMEOUT: 600 + RUN_SLOW: yes + PIPELINE_USAGE_CUTOFF: 50000 + +jobs: + torch_pipelines_cuda_benchmark_tests: + name: Torch Core Pipelines CUDA Benchmarking Tests + strategy: + fail-fast: false + max-parallel: 1 + runs-on: docker-gpu + container: + image: diffusers/diffusers-pytorch-cuda + options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0 + steps: + - name: Checkout diffusers + uses: actions/checkout@v3 + with: + fetch-depth: 2 + - name: NVIDIA-SMI + run: | + nvidia-smi + - name: Install dependencies + run: | + apt-get update && apt-get install libsndfile1-dev libgl1 -y + python -m pip install -e .[quality,test] + python -m pip install git+https://github.com/huggingface/accelerate.git + mkdir benchmark_outputs + - name: Environment + run: | + python utils/print_env.py + - name: Stable Diffusion Benchmarking Tests + env: + HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + run: | + cd benchmarks && python benchmark_sd.py && \ + python benchmark_sd.py --batch_size 4 && \ + python benchmark_sd.py --run_compile && \ + python benchmark_sd.py --batch_size 4 --run_compile + + - name: Test suite reports artifacts + if: ${{ always() }} + uses: actions/upload-artifact@v2 + with: + name: benchmark_test_reports + path: benchmark_outputs \ No newline at end of file diff --git a/benchmarks/benchmark_sd.py b/benchmarks/benchmark_sd.py new file mode 100644 index 000000000000..6d6a1f6bfb63 --- /dev/null +++ b/benchmarks/benchmark_sd.py @@ -0,0 +1,60 @@ +import argparse +import os +import torch +from diffusers import DiffusionPipeline +from .benchmark_utils import benchmark_fn, bytes_to_giga_bytes, BenchmarkInfo, generate_markdown_table + +CKPT = "CompVis/stable-diffusion-v1-4" +PROMPT = "ghibli style, a fantasy landscape with castles" +BASE_PATH = "benchmark_outputs" + + +def load_pipeline(run_compile=False, with_tensorrt=False): + pipe = DiffusionPipeline.from_pretrained( + CKPT, torch_dtype=torch.float16, use_safetensors=True + ) + pipe = pipe.to("cuda") + + if run_compile: + pipe.unet.to(memory_format=torch.channels_last) + print("Run torch compile") + pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) + + pipe.set_progress_bar_config(disable=True) + return pipe + + +def run_inference(pipe, args): + _ = pipe( + prompt=PROMPT, + num_inference_steps=args.num_inference_steps, + num_images_per_prompt=args.batch_size, + ) + +def main(args): + pipeline = load_pipeline( + run_compile=args.run_compile, with_tensorrt=args.with_tensorrt + ) + + time = benchmark_fn(run_inference, pipeline, args) # in 
seconds. + memory = bytes_to_giga_bytes(torch.cuda.max_memory_allocated()) # in GBs. + benchmark_info = BenchmarkInfo(time=time, memory=memory) + + markdown_report = "" + markdown_report = generate_markdown_table(pipeline_name=CKPT, args=args, benchmark_info=benchmark_info, markdown_report=markdown_report) + return markdown_report + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--batch_size", type=int, default=1) + parser.add_argument("--num_inference_steps", type=int, default=50) + parser.add_argument("--run_compile", action="store_true") + args = parser.parse_args() + markdown_report = main(args) + + name = CKPT + f"-batch_sze@{args.batch_size}-num_inference_steps@{args.num_inference_steps}--run_compile@{args.run_compile}" + filepath = os.path.join(BASE_PATH, name) + with open(filepath, "w") as f: + f.write(markdown_report) + + \ No newline at end of file diff --git a/benchmarks/benchmark_utils.py b/benchmarks/benchmark_utils.py new file mode 100644 index 000000000000..78f7368d7ceb --- /dev/null +++ b/benchmarks/benchmark_utils.py @@ -0,0 +1,46 @@ +import gc +import torch +import torch.utils.benchmark as benchmark +from dataclasses import dataclass +import argparse + +@dataclass +class BenchmarkInfo: + time: float + memory: float + + +def flush(): + gc.collect() + torch.cuda.empty_cache() + +def bytes_to_giga_bytes(bytes): + return bytes / 1024 / 1024 / 1024 + + +# Adapted from +# https://pytorch.org/tutorials/intermediate/scaled_dot_product_attention_tutorial.html +def benchmark_fn(f, *args, **kwargs): + t0 = benchmark.Timer( + stmt="f(*args, **kwargs)", globals={"args": args, "kwargs": kwargs, "f": f} + ) + return f"{(t0.blocked_autorange().mean):.3f}" + +def generate_markdown_table(pipeline_name: str, args: argparse.Namespace, benchmark_info: BenchmarkInfo) -> str: + headers = ["**Parameter**", "**Value**"] + data = [ + ["Batch Size", args.batch_size], + ["Number of Inference Steps", args.num_inference_steps], + ["Run Compile", args.run_compile], + ["Time (seconds)", benchmark_info.time], + ["Memory (GBs)", benchmark_info.memory] + ] + + # Formatting the table. 
+ markdown_table = f"## {pipeline_name}\n\n" + markdown_table += "| " + " | ".join(headers) + " |\n" + markdown_table += "|-" + "-|-".join(['' for _ in headers]) + "-|\n" + for row in data: + markdown_table += "| " + " | ".join(str(item) for item in row) + " |\n" + + return markdown_table \ No newline at end of file From 945ab176a082c7608c67a789af432dc1d88b34a8 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 17 Nov 2023 12:02:33 +0530 Subject: [PATCH 02/99] import --- benchmarks/benchmark_sd.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/benchmarks/benchmark_sd.py b/benchmarks/benchmark_sd.py index 6d6a1f6bfb63..e75d7536c6e1 100644 --- a/benchmarks/benchmark_sd.py +++ b/benchmarks/benchmark_sd.py @@ -2,7 +2,12 @@ import os import torch from diffusers import DiffusionPipeline -from .benchmark_utils import benchmark_fn, bytes_to_giga_bytes, BenchmarkInfo, generate_markdown_table + +import sys + +sys.path.append(".") + +from benchmark_utils import benchmark_fn, bytes_to_giga_bytes, BenchmarkInfo, generate_markdown_table CKPT = "CompVis/stable-diffusion-v1-4" PROMPT = "ghibli style, a fantasy landscape with castles" From b4debda668342c4d78bb200b1538d7c3a5618140 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 17 Nov 2023 12:09:22 +0530 Subject: [PATCH 03/99] fix argument --- benchmarks/benchmark_sd.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/benchmarks/benchmark_sd.py b/benchmarks/benchmark_sd.py index e75d7536c6e1..b5685c77a6b5 100644 --- a/benchmarks/benchmark_sd.py +++ b/benchmarks/benchmark_sd.py @@ -14,7 +14,7 @@ BASE_PATH = "benchmark_outputs" -def load_pipeline(run_compile=False, with_tensorrt=False): +def load_pipeline(run_compile=False): pipe = DiffusionPipeline.from_pretrained( CKPT, torch_dtype=torch.float16, use_safetensors=True ) @@ -37,9 +37,7 @@ def run_inference(pipe, args): ) def main(args): - pipeline = load_pipeline( - run_compile=args.run_compile, with_tensorrt=args.with_tensorrt - ) + pipeline = load_pipeline(run_compile=args.run_compile) time = benchmark_fn(run_inference, pipeline, args) # in seconds. memory = bytes_to_giga_bytes(torch.cuda.max_memory_allocated()) # in GBs. From 22966a1fdc1f0fad31af055d93bb6795fed46e12 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 17 Nov 2023 12:28:48 +0530 Subject: [PATCH 04/99] fix: argument --- benchmarks/benchmark_sd.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/benchmarks/benchmark_sd.py b/benchmarks/benchmark_sd.py index b5685c77a6b5..27b6ae44b31e 100644 --- a/benchmarks/benchmark_sd.py +++ b/benchmarks/benchmark_sd.py @@ -43,8 +43,7 @@ def main(args): memory = bytes_to_giga_bytes(torch.cuda.max_memory_allocated()) # in GBs. 
benchmark_info = BenchmarkInfo(time=time, memory=memory) - markdown_report = "" - markdown_report = generate_markdown_table(pipeline_name=CKPT, args=args, benchmark_info=benchmark_info, markdown_report=markdown_report) + markdown_report = generate_markdown_table(pipeline_name=CKPT, args=args, benchmark_info=benchmark_info) return markdown_report if __name__ == "__main__": From 12424a3a992e065eaa52dcd112022c502f7954c0 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 17 Nov 2023 12:43:55 +0530 Subject: [PATCH 05/99] fix: path --- benchmarks/benchmark_sd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmark_sd.py b/benchmarks/benchmark_sd.py index 27b6ae44b31e..02fe156d84ff 100644 --- a/benchmarks/benchmark_sd.py +++ b/benchmarks/benchmark_sd.py @@ -54,7 +54,7 @@ def main(args): args = parser.parse_args() markdown_report = main(args) - name = CKPT + f"-batch_sze@{args.batch_size}-num_inference_steps@{args.num_inference_steps}--run_compile@{args.run_compile}" + name = CKPT + f"-batch_sze@{args.batch_size}-num_inference_steps@{args.num_inference_steps}--run_compile@{args.run_compile}.md" filepath = os.path.join(BASE_PATH, name) with open(filepath, "w") as f: f.write(markdown_report) From 122d5d90869218bf2a31dd4e2ace693483cc90a7 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 17 Nov 2023 13:00:52 +0530 Subject: [PATCH 06/99] fix --- benchmarks/benchmark_sd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmark_sd.py b/benchmarks/benchmark_sd.py index 02fe156d84ff..d11999bf8696 100644 --- a/benchmarks/benchmark_sd.py +++ b/benchmarks/benchmark_sd.py @@ -54,7 +54,7 @@ def main(args): args = parser.parse_args() markdown_report = main(args) - name = CKPT + f"-batch_sze@{args.batch_size}-num_inference_steps@{args.num_inference_steps}--run_compile@{args.run_compile}.md" + name = CKPT.replace("/", "_") + f"-batch_sze@{args.batch_size}-num_inference_steps@{args.num_inference_steps}--run_compile@{args.run_compile}.md" filepath = os.path.join(BASE_PATH, name) with open(filepath, "w") as f: f.write(markdown_report) From c20d254ebaf98904d0f12b45d352734853877eda Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 17 Nov 2023 13:19:36 +0530 Subject: [PATCH 07/99] fix --- .github/workflows/benchmark.yml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 20dd69a47ce7..dc867df77430 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -40,15 +40,13 @@ jobs: apt-get update && apt-get install libsndfile1-dev libgl1 -y python -m pip install -e .[quality,test] python -m pip install git+https://github.com/huggingface/accelerate.git - mkdir benchmark_outputs - name: Environment run: | python utils/print_env.py - name: Stable Diffusion Benchmarking Tests - env: - HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} run: | - cd benchmarks && python benchmark_sd.py && \ + cd benchmarks && mkdir benchmark_outputs && \ + python benchmark_sd.py && \ python benchmark_sd.py --batch_size 4 && \ python benchmark_sd.py --run_compile && \ python benchmark_sd.py --batch_size 4 --run_compile From 43544edbf853d402eeda4e783e5888cedfc11483 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 17 Nov 2023 13:34:04 +0530 Subject: [PATCH 08/99] path --- .github/workflows/benchmark.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 
dc867df77430..e004aaa09978 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -56,4 +56,4 @@ jobs: uses: actions/upload-artifact@v2 with: name: benchmark_test_reports - path: benchmark_outputs \ No newline at end of file + path: benchmarks/benchmark_outputs \ No newline at end of file From 3c05e4179b91a3c88e6b6a3458862c46e8abde31 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 28 Nov 2023 09:28:16 +0530 Subject: [PATCH 09/99] output csv files. --- Makefile | 2 +- benchmarks/benchmark_sd.py | 49 +++++++++++------- benchmarks/benchmark_utils.py | 98 +++++++++++++++++++++++++---------- 3 files changed, 100 insertions(+), 49 deletions(-) diff --git a/Makefile b/Makefile index 1b81f551d36d..d0ed1cf2a982 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ # make sure to test the local checkout in scripts and not the pre-installed one (don't use quotes!) export PYTHONPATH = src -check_dirs := examples scripts src tests utils +check_dirs := examples scripts src tests utils benchmarks modified_only_fixup: $(eval modified_py_files := $(shell python utils/get_modified_files.py $(check_dirs))) diff --git a/benchmarks/benchmark_sd.py b/benchmarks/benchmark_sd.py index d11999bf8696..a5e0d4b9b6a4 100644 --- a/benchmarks/benchmark_sd.py +++ b/benchmarks/benchmark_sd.py @@ -1,23 +1,29 @@ import argparse import os +import sys + import torch + from diffusers import DiffusionPipeline -import sys sys.path.append(".") +from benchmark_utils import ( # noqa: E402 + BASE_PATH, + PROMPT, + BenchmarkInfo, + benchmark_fn, + bytes_to_giga_bytes, + generate_csv_dict, + write_to_csv, +) -from benchmark_utils import benchmark_fn, bytes_to_giga_bytes, BenchmarkInfo, generate_markdown_table CKPT = "CompVis/stable-diffusion-v1-4" -PROMPT = "ghibli style, a fantasy landscape with castles" -BASE_PATH = "benchmark_outputs" def load_pipeline(run_compile=False): - pipe = DiffusionPipeline.from_pretrained( - CKPT, torch_dtype=torch.float16, use_safetensors=True - ) + pipe = DiffusionPipeline.from_pretrained(CKPT, torch_dtype=torch.float16, use_safetensors=True) pipe = pipe.to("cuda") if run_compile: @@ -36,27 +42,30 @@ def run_inference(pipe, args): num_images_per_prompt=args.batch_size, ) -def main(args): + +def main(args) -> dict: pipeline = load_pipeline(run_compile=args.run_compile) - - time = benchmark_fn(run_inference, pipeline, args) # in seconds. - memory = bytes_to_giga_bytes(torch.cuda.max_memory_allocated()) # in GBs. + + time = benchmark_fn(run_inference, pipeline, args) # in seconds. + memory = bytes_to_giga_bytes(torch.cuda.max_memory_allocated()) # in GBs. 
benchmark_info = BenchmarkInfo(time=time, memory=memory) - - markdown_report = generate_markdown_table(pipeline_name=CKPT, args=args, benchmark_info=benchmark_info) - return markdown_report + + csv_dict = generate_csv_dict(pipeline=CKPT, args=args, benchmark_info=benchmark_info) + return csv_dict + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--batch_size", type=int, default=1) parser.add_argument("--num_inference_steps", type=int, default=50) + parser.add_argument("--model_cpu_offload", action="store_true") parser.add_argument("--run_compile", action="store_true") args = parser.parse_args() - markdown_report = main(args) + csv_dict = main(args) - name = CKPT.replace("/", "_") + f"-batch_sze@{args.batch_size}-num_inference_steps@{args.num_inference_steps}--run_compile@{args.run_compile}.md" + name = ( + CKPT.replace("/", "_") + + f"-bs@{args.batch_size}-steps@{args.num_inference_steps}-mco@{args.model_cpu_offload}-compile@{args.run_compile}.csv" + ) filepath = os.path.join(BASE_PATH, name) - with open(filepath, "w") as f: - f.write(markdown_report) - - \ No newline at end of file + write_to_csv(filepath, csv_dict) diff --git a/benchmarks/benchmark_utils.py b/benchmarks/benchmark_utils.py index 78f7368d7ceb..1fecd43964d8 100644 --- a/benchmarks/benchmark_utils.py +++ b/benchmarks/benchmark_utils.py @@ -1,46 +1,88 @@ -import gc -import torch -import torch.utils.benchmark as benchmark -from dataclasses import dataclass import argparse +import csv +import gc +import os +from dataclasses import dataclass +from typing import Any, Dict, List + +import torch +import torch.utils.benchmark as benchmark + + +GITHUB_SHA = os.getenv("GITHUB_SHA", None) +BENCHMARK_FIELDS = [ + "pipeline", + "batch_size", + "num_inference_steps", + "model_cpu_offload", + "run_compile", + "time (secs)", + "memory (gbs)", + "github_sha", +] + +PROMPT = "ghibli style, a fantasy landscape with castles" +BASE_PATH = "benchmark_outputs" + @dataclass class BenchmarkInfo: - time: float + time: float memory: float def flush(): + """Wipes off memory.""" gc.collect() torch.cuda.empty_cache() + torch.cuda.reset_max_memory_allocated() + torch.cuda.reset_peak_memory_stats() + def bytes_to_giga_bytes(bytes): return bytes / 1024 / 1024 / 1024 -# Adapted from -# https://pytorch.org/tutorials/intermediate/scaled_dot_product_attention_tutorial.html def benchmark_fn(f, *args, **kwargs): t0 = benchmark.Timer( - stmt="f(*args, **kwargs)", globals={"args": args, "kwargs": kwargs, "f": f} + stmt="f(*args, **kwargs)", + globals={"args": args, "kwargs": kwargs, "f": f}, + num_threads=torch.get_num_threads(), ) - return f"{(t0.blocked_autorange().mean):.3f}" - -def generate_markdown_table(pipeline_name: str, args: argparse.Namespace, benchmark_info: BenchmarkInfo) -> str: - headers = ["**Parameter**", "**Value**"] - data = [ - ["Batch Size", args.batch_size], - ["Number of Inference Steps", args.num_inference_steps], - ["Run Compile", args.run_compile], - ["Time (seconds)", benchmark_info.time], - ["Memory (GBs)", benchmark_info.memory] - ] - - # Formatting the table. 
- markdown_table = f"## {pipeline_name}\n\n" - markdown_table += "| " + " | ".join(headers) + " |\n" - markdown_table += "|-" + "-|-".join(['' for _ in headers]) + "-|\n" - for row in data: - markdown_table += "| " + " | ".join(str(item) for item in row) + " |\n" - - return markdown_table \ No newline at end of file + return f"{(t0.blocked_autorange().mean):.3f}" + + +def generate_csv_dict(pipeline: str, args: argparse.Namespace, benchmark_info: BenchmarkInfo) -> Dict[str, Any]: + """Packs benchmarking data into a dictionary for latter serialization.""" + data_dict = { + "pipeline": pipeline, + "batch_size": args.batch_size, + "num_inference_steps": args.num_inference_steps, + "model_cpu_offload": args.model_cpu_offload, + "run_compile": args.run_compile, + "time (secs)": benchmark_info.time, + "memory (gbs)": benchmark_info.memory, + "github_sha": GITHUB_SHA, + } + return data_dict + + +def write_to_csv(file_name: str, data_dict: Dict[str, Any]): + """Serializes a dictionary into a CSV file.""" + with open(file_name, mode="w", newline="") as csvfile: + writer = csv.DictWriter(csvfile, fieldnames=BENCHMARK_FIELDS) + writer.writeheader() + writer.writerow(data_dict) + + +def collate_csv(input_files: List[str], output_file: str): + """Collates multiple identically structured CSVs into a single CSV file.""" + with open(output_file, mode="w", newline="") as outfile: + writer = csv.DictWriter(outfile, fieldnames=BENCHMARK_FIELDS) + writer.writeheader() + + for file in input_files: + with open(file, mode="r") as infile: + reader = csv.DictReader(infile) + for row in reader: + writer.writerow(row) From 24b68fd911981da7373e4c76d4e8e0bab7da415d Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 28 Nov 2023 09:40:44 +0530 Subject: [PATCH 10/99] workflow cleanup --- .github/workflows/benchmark.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index e004aaa09978..d96f9fe955ce 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -13,9 +13,6 @@ env: HF_HOME: /mnt/cache OMP_NUM_THREADS: 8 MKL_NUM_THREADS: 8 - PYTEST_TIMEOUT: 600 - RUN_SLOW: yes - PIPELINE_USAGE_CUTOFF: 50000 jobs: torch_pipelines_cuda_benchmark_tests: From 8e2088ea35c1c75e57cd7eda9a601ff7a5b94bc5 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 28 Nov 2023 09:50:12 +0530 Subject: [PATCH 11/99] append token --- .github/workflows/benchmark.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index d96f9fe955ce..aa8a10ae7fb6 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -41,6 +41,8 @@ jobs: run: | python utils/print_env.py - name: Stable Diffusion Benchmarking Tests + env: + HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} run: | cd benchmarks && mkdir benchmark_outputs && \ python benchmark_sd.py && \ From 01584c786e75852d66b1d7880c23d5336a556c43 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 28 Nov 2023 10:03:17 +0530 Subject: [PATCH 12/99] add utility to push to hf dataset --- benchmarks/push_results.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 benchmarks/push_results.py diff --git a/benchmarks/push_results.py b/benchmarks/push_results.py new file mode 100644 index 000000000000..98af01fb1fe8 --- /dev/null +++ b/benchmarks/push_results.py @@ -0,0 +1,26 @@ +import glob +import os +import sys + +from huggingface_hub import upload_file + + +sys.path.append(".") +from 
benchmark_utils import BASE_PATH, collate_csv # noqa: E402 + + +FINAL_CSV_FILE = "collated_results.csv" +REPO_ID = "diffusers/benchmarks" +GITHUB_SHA = os.getenv("GITHUB_SHA", None) + + +def push_to_hf_dataset(): + all_csvs = sorted(glob.glob(f"{BASE_PATH}/*.csv")) + collate_csv(all_csvs, FINAL_CSV_FILE) + + commit_message = f"upload from sha: {GITHUB_SHA}" if GITHUB_SHA is not None else "upload benchmark results" + upload_file(repo_id=REPO_ID, path_or_fileobj=FINAL_CSV_FILE, repo_type="dataset", commit_message=commit_message) + + +if __name__ == "__main__": + push_to_hf_dataset() From 853035b9c19cbbd42de639266a42c9c62b868127 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 28 Nov 2023 10:04:44 +0530 Subject: [PATCH 13/99] fix: kw arg --- benchmarks/push_results.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/benchmarks/push_results.py b/benchmarks/push_results.py index 98af01fb1fe8..e3bb48df1d26 100644 --- a/benchmarks/push_results.py +++ b/benchmarks/push_results.py @@ -19,7 +19,13 @@ def push_to_hf_dataset(): collate_csv(all_csvs, FINAL_CSV_FILE) commit_message = f"upload from sha: {GITHUB_SHA}" if GITHUB_SHA is not None else "upload benchmark results" - upload_file(repo_id=REPO_ID, path_or_fileobj=FINAL_CSV_FILE, repo_type="dataset", commit_message=commit_message) + upload_file( + repo_id=REPO_ID, + path_in_repo=FINAL_CSV_FILE, + path_or_fileobj=FINAL_CSV_FILE, + repo_type="dataset", + commit_message=commit_message, + ) if __name__ == "__main__": From 46aaf96f5b45b2d68497d81386363dac61a5adf1 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 28 Nov 2023 10:22:58 +0530 Subject: [PATCH 14/99] better reporting --- benchmarks/benchmark_sd.py | 4 +++- benchmarks/benchmark_utils.py | 13 +++++++++---- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/benchmarks/benchmark_sd.py b/benchmarks/benchmark_sd.py index a5e0d4b9b6a4..754fb58c9e37 100644 --- a/benchmarks/benchmark_sd.py +++ b/benchmarks/benchmark_sd.py @@ -50,7 +50,9 @@ def main(args) -> dict: memory = bytes_to_giga_bytes(torch.cuda.max_memory_allocated()) # in GBs. 
benchmark_info = BenchmarkInfo(time=time, memory=memory) - csv_dict = generate_csv_dict(pipeline=CKPT, args=args, benchmark_info=benchmark_info) + csv_dict = generate_csv_dict( + pipeline_cls=str(pipeline.__class__.__name__), ckpt=CKPT, args=args, benchmark_info=benchmark_info + ) return csv_dict diff --git a/benchmarks/benchmark_utils.py b/benchmarks/benchmark_utils.py index 1fecd43964d8..307c3160387d 100644 --- a/benchmarks/benchmark_utils.py +++ b/benchmarks/benchmark_utils.py @@ -11,7 +11,8 @@ GITHUB_SHA = os.getenv("GITHUB_SHA", None) BENCHMARK_FIELDS = [ - "pipeline", + "pipeline_cls", + "ckpt_id", "batch_size", "num_inference_steps", "model_cpu_offload", @@ -20,9 +21,9 @@ "memory (gbs)", "github_sha", ] - PROMPT = "ghibli style, a fantasy landscape with castles" BASE_PATH = "benchmark_outputs" +TOTAL_GPU_MEMORY = torch.cuda.get_device_properties(0).total_memory / (1024**3) @dataclass @@ -52,16 +53,20 @@ def benchmark_fn(f, *args, **kwargs): return f"{(t0.blocked_autorange().mean):.3f}" -def generate_csv_dict(pipeline: str, args: argparse.Namespace, benchmark_info: BenchmarkInfo) -> Dict[str, Any]: +def generate_csv_dict( + pipeline_cls: str, ckpt: str, args: argparse.Namespace, benchmark_info: BenchmarkInfo +) -> Dict[str, Any]: """Packs benchmarking data into a dictionary for latter serialization.""" data_dict = { - "pipeline": pipeline, + "pipeline_cls": pipeline_cls, + "ckpt_id": ckpt, "batch_size": args.batch_size, "num_inference_steps": args.num_inference_steps, "model_cpu_offload": args.model_cpu_offload, "run_compile": args.run_compile, "time (secs)": benchmark_info.time, "memory (gbs)": benchmark_info.memory, + "actual_gpu_memory (gbs)": TOTAL_GPU_MEMORY, "github_sha": GITHUB_SHA, } return data_dict From d626eef65f24072bfeb455dce3e71d79a72c0981 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 28 Nov 2023 10:26:24 +0530 Subject: [PATCH 15/99] fix: headers --- benchmarks/benchmark_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/benchmark_utils.py b/benchmarks/benchmark_utils.py index 307c3160387d..1d51e89e275f 100644 --- a/benchmarks/benchmark_utils.py +++ b/benchmarks/benchmark_utils.py @@ -19,6 +19,7 @@ "run_compile", "time (secs)", "memory (gbs)", + "actual_gpu_memory (gbs)", "github_sha", ] PROMPT = "ghibli style, a fantasy landscape with castles" From ab12fe6a21d8e74e4845faa4e37e38a694ba2fbe Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 28 Nov 2023 10:38:32 +0530 Subject: [PATCH 16/99] better formatting of the numbers. 
--- benchmarks/benchmark_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/benchmark_utils.py b/benchmarks/benchmark_utils.py index 1d51e89e275f..9eb834444fac 100644 --- a/benchmarks/benchmark_utils.py +++ b/benchmarks/benchmark_utils.py @@ -42,7 +42,7 @@ def flush(): def bytes_to_giga_bytes(bytes): - return bytes / 1024 / 1024 / 1024 + return f"{(bytes / 1024 / 1024 / 1024)}:.3f" def benchmark_fn(f, *args, **kwargs): @@ -67,7 +67,7 @@ def generate_csv_dict( "run_compile": args.run_compile, "time (secs)": benchmark_info.time, "memory (gbs)": benchmark_info.memory, - "actual_gpu_memory (gbs)": TOTAL_GPU_MEMORY, + "actual_gpu_memory (gbs)": f"{(TOTAL_GPU_MEMORY):.3f}", "github_sha": GITHUB_SHA, } return data_dict From 1bb531eac7e3bb7e659d94c7cb2132ad69300236 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 28 Nov 2023 10:39:45 +0530 Subject: [PATCH 17/99] better type annotation --- benchmarks/benchmark_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/benchmark_utils.py b/benchmarks/benchmark_utils.py index 9eb834444fac..db40187a7d2d 100644 --- a/benchmarks/benchmark_utils.py +++ b/benchmarks/benchmark_utils.py @@ -3,7 +3,7 @@ import gc import os from dataclasses import dataclass -from typing import Any, Dict, List +from typing import Dict, List, Union import torch import torch.utils.benchmark as benchmark @@ -56,7 +56,7 @@ def benchmark_fn(f, *args, **kwargs): def generate_csv_dict( pipeline_cls: str, ckpt: str, args: argparse.Namespace, benchmark_info: BenchmarkInfo -) -> Dict[str, Any]: +) -> Dict[str, Union[str, bool, float]]: """Packs benchmarking data into a dictionary for latter serialization.""" data_dict = { "pipeline_cls": pipeline_cls, @@ -73,7 +73,7 @@ def generate_csv_dict( return data_dict -def write_to_csv(file_name: str, data_dict: Dict[str, Any]): +def write_to_csv(file_name: str, data_dict: Dict[str, Union[str, bool, float]]): """Serializes a dictionary into a CSV file.""" with open(file_name, mode="w", newline="") as csvfile: writer = csv.DictWriter(csvfile, fieldnames=BENCHMARK_FIELDS) From 2df4abae8b278db9477faf93711968b223fb1745 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 28 Nov 2023 10:47:10 +0530 Subject: [PATCH 18/99] fix: formatting --- benchmarks/benchmark_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmark_utils.py b/benchmarks/benchmark_utils.py index db40187a7d2d..a1e4f169c634 100644 --- a/benchmarks/benchmark_utils.py +++ b/benchmarks/benchmark_utils.py @@ -42,7 +42,7 @@ def flush(): def bytes_to_giga_bytes(bytes): - return f"{(bytes / 1024 / 1024 / 1024)}:.3f" + return f"{(bytes / 1024 / 1024 / 1024):.3f}" def benchmark_fn(f, *args, **kwargs): From 939fe5ccb57845148479974b3e70eb4bdeafa81d Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 28 Nov 2023 12:04:24 +0530 Subject: [PATCH 19/99] moentarily disable check --- .github/workflows/benchmark.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index aa8a10ae7fb6..c993dcd31ad9 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -23,7 +23,7 @@ jobs: runs-on: docker-gpu container: image: diffusers/diffusers-pytorch-cuda - options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0 + options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0 --env NVIDIA_DISABLE_REQUIRE=1 steps: - name: Checkout diffusers uses: 
actions/checkout@v3 From 3a18e2908f64dfa233a12267d33452229bdf8e64 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 28 Nov 2023 12:28:35 +0530 Subject: [PATCH 20/99] push results. --- .github/workflows/benchmark.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index c993dcd31ad9..8af55c011272 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -48,7 +48,8 @@ jobs: python benchmark_sd.py && \ python benchmark_sd.py --batch_size 4 && \ python benchmark_sd.py --run_compile && \ - python benchmark_sd.py --batch_size 4 --run_compile + python benchmark_sd.py --batch_size 4 --run_compile && \ + python push_results.py - name: Test suite reports artifacts if: ${{ always() }} From 71279b6f7d2704b44c60e2eba34acec38c18e8bd Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 28 Nov 2023 14:18:20 +0530 Subject: [PATCH 21/99] remove disable check --- .github/workflows/benchmark.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 8af55c011272..a20142bc36c1 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -23,7 +23,7 @@ jobs: runs-on: docker-gpu container: image: diffusers/diffusers-pytorch-cuda - options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0 --env NVIDIA_DISABLE_REQUIRE=1 + options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0 steps: - name: Checkout diffusers uses: actions/checkout@v3 From 3c8cc38a00ab72ce5017e609d673ecbef1940936 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Wed, 29 Nov 2023 16:56:27 +0530 Subject: [PATCH 22/99] introduce base classes. --- benchmarks/base_classes.py | 56 ++++++++++++++++ benchmarks/benchmark_sd.py | 73 +++++---------------- benchmarks/push_results.py | 2 +- benchmarks/{benchmark_utils.py => utils.py} | 0 4 files changed, 74 insertions(+), 57 deletions(-) create mode 100644 benchmarks/base_classes.py rename benchmarks/{benchmark_utils.py => utils.py} (100%) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py new file mode 100644 index 000000000000..23c8e881e235 --- /dev/null +++ b/benchmarks/base_classes.py @@ -0,0 +1,56 @@ +import os +import sys + +import torch + +from diffusers import DiffusionPipeline + + +sys.path.append(".") + +from benchmarks.utils import ( # noqa: E402 + BASE_PATH, + PROMPT, + BenchmarkInfo, + benchmark_fn, + bytes_to_giga_bytes, + generate_csv_dict, + write_to_csv, +) + + +class TextToImagePipeline: + def __init__(self, args): + pipe = DiffusionPipeline.from_pretrained(args.ckpt, torch_dtype=torch.float16, use_safetensors=True) + pipe = pipe.to("cuda") + + if args.run_compile: + pipe.unet.to(memory_format=torch.channels_last) + print("Run torch compile") + pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) + + pipe.set_progress_bar_config(disable=True) + self.pipe = pipe + + def run_inference(self, pipe, args): + _ = pipe( + prompt=PROMPT, + num_inference_steps=args.num_inference_steps, + num_images_per_prompt=args.batch_size, + ) + + def __call__(self, args): + time = benchmark_fn(self.run_inference, self.pipe, args) # in seconds. + memory = bytes_to_giga_bytes(torch.cuda.max_memory_allocated()) # in GBs. 
+ benchmark_info = BenchmarkInfo(time=time, memory=memory) + + csv_dict = generate_csv_dict( + pipeline_cls=str(self.pipe.__class__.__name__), ckpt=args.ckpt, args=args, benchmark_info=benchmark_info + ) + name = ( + args.ckpt.replace("/", "_") + + f"-bs@{args.batch_size}-steps@{args.num_inference_steps}-mco@{args.model_cpu_offload}-compile@{args.run_compile}.csv" + ) + filepath = os.path.join(BASE_PATH, name) + write_to_csv(filepath, csv_dict) + print(f"Logs written to: {filepath}") diff --git a/benchmarks/benchmark_sd.py b/benchmarks/benchmark_sd.py index 754fb58c9e37..c12677d9e420 100644 --- a/benchmarks/benchmark_sd.py +++ b/benchmarks/benchmark_sd.py @@ -1,73 +1,34 @@ import argparse -import os import sys -import torch - -from diffusers import DiffusionPipeline - sys.path.append(".") -from benchmark_utils import ( # noqa: E402 - BASE_PATH, - PROMPT, - BenchmarkInfo, - benchmark_fn, - bytes_to_giga_bytes, - generate_csv_dict, - write_to_csv, -) - - -CKPT = "CompVis/stable-diffusion-v1-4" - +from benchmarks.base_classes import TextToImagePipeline # noqa: E402 -def load_pipeline(run_compile=False): - pipe = DiffusionPipeline.from_pretrained(CKPT, torch_dtype=torch.float16, use_safetensors=True) - pipe = pipe.to("cuda") - if run_compile: - pipe.unet.to(memory_format=torch.channels_last) - print("Run torch compile") - pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) - - pipe.set_progress_bar_config(disable=True) - return pipe - - -def run_inference(pipe, args): - _ = pipe( - prompt=PROMPT, - num_inference_steps=args.num_inference_steps, - num_images_per_prompt=args.batch_size, - ) - - -def main(args) -> dict: - pipeline = load_pipeline(run_compile=args.run_compile) - - time = benchmark_fn(run_inference, pipeline, args) # in seconds. - memory = bytes_to_giga_bytes(torch.cuda.max_memory_allocated()) # in GBs. 
- benchmark_info = BenchmarkInfo(time=time, memory=memory) - - csv_dict = generate_csv_dict( - pipeline_cls=str(pipeline.__class__.__name__), ckpt=CKPT, args=args, benchmark_info=benchmark_info - ) - return csv_dict +CKPT = "runwayml/stable-diffusion-v1-5" if __name__ == "__main__": parser = argparse.ArgumentParser() + parser.add_argument( + "--ckpt", + type=str, + default="runwayml/stable-diffusion-v1-5", + choices=[ + "runwayml/stable-diffusion-v1-5", + "segmind/SSD-1B", + "stabilityai/stable-diffusion-2-1", + "stabilityai/stable-diffusion-xl-base-1.0", + ], + ) parser.add_argument("--batch_size", type=int, default=1) parser.add_argument("--num_inference_steps", type=int, default=50) parser.add_argument("--model_cpu_offload", action="store_true") parser.add_argument("--run_compile", action="store_true") args = parser.parse_args() - csv_dict = main(args) + args.ckpt = CKPT - name = ( - CKPT.replace("/", "_") - + f"-bs@{args.batch_size}-steps@{args.num_inference_steps}-mco@{args.model_cpu_offload}-compile@{args.run_compile}.csv" - ) - filepath = os.path.join(BASE_PATH, name) - write_to_csv(filepath, csv_dict) + benchmark_pipe = TextToImagePipeline(args) + + benchmark_pipe() diff --git a/benchmarks/push_results.py b/benchmarks/push_results.py index e3bb48df1d26..9665e6b19b77 100644 --- a/benchmarks/push_results.py +++ b/benchmarks/push_results.py @@ -6,7 +6,7 @@ sys.path.append(".") -from benchmark_utils import BASE_PATH, collate_csv # noqa: E402 +from benchmarks.utils import BASE_PATH, collate_csv # noqa: E402 FINAL_CSV_FILE = "collated_results.csv" diff --git a/benchmarks/benchmark_utils.py b/benchmarks/utils.py similarity index 100% rename from benchmarks/benchmark_utils.py rename to benchmarks/utils.py From 9683cd773fed1c4d75063ba50c3bc0fc687df8a3 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Wed, 29 Nov 2023 17:15:07 +0530 Subject: [PATCH 23/99] img2img class --- benchmarks/base_classes.py | 32 +++++++++++++++++++++++++++++--- benchmarks/benchmark_sd.py | 7 +------ benchmarks/benchmark_sd_img.py | 28 ++++++++++++++++++++++++++++ 3 files changed, 58 insertions(+), 9 deletions(-) create mode 100644 benchmarks/benchmark_sd_img.py diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 23c8e881e235..05d2e8102c03 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -3,7 +3,8 @@ import torch -from diffusers import DiffusionPipeline +from diffusers import AutoPipelineForImage2Image, AutoPipelineForText2Image +from diffusers.utils import load_image sys.path.append(".") @@ -19,9 +20,18 @@ ) +RESOLUTION_MAPPING = { + "runwayml/stable-diffusion-v1-5": (512, 512), + "stabilityai/stable-diffusion-2-1": (768, 768), + "stabilityai/stable-diffusion-xl-refiner-1.0": (1024, 1024), +} + + class TextToImagePipeline: + pipeline_class = AutoPipelineForText2Image + def __init__(self, args): - pipe = DiffusionPipeline.from_pretrained(args.ckpt, torch_dtype=torch.float16, use_safetensors=True) + pipe = self.pipeline_class.from_pretrained(args.ckpt, torch_dtype=torch.float16, use_safetensors=True) pipe = pipe.to("cuda") if args.run_compile: @@ -39,7 +49,7 @@ def run_inference(self, pipe, args): num_images_per_prompt=args.batch_size, ) - def __call__(self, args): + def benchmark(self, args): time = benchmark_fn(self.run_inference, self.pipe, args) # in seconds. memory = bytes_to_giga_bytes(torch.cuda.max_memory_allocated()) # in GBs. 
benchmark_info = BenchmarkInfo(time=time, memory=memory) @@ -54,3 +64,19 @@ def __call__(self, args): filepath = os.path.join(BASE_PATH, name) write_to_csv(filepath, csv_dict) print(f"Logs written to: {filepath}") + + +class ImageToImagePipeline(TextToImagePipeline): + pipeline_class = AutoPipelineForImage2Image + url = "https://upload.wikimedia.org/wikipedia/commons/thumb/0/0f/1665_Girl_with_a_Pearl_Earring.jpg/800px-1665_Girl_with_a_Pearl_Earring.jpg" + image = load_image(url).convert("RGB") + + def run_inference(self, pipe, args): + self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt]) + + _ = pipe( + prompt=PROMPT, + image=self.image, + num_inference_steps=args.num_inference_steps, + num_images_per_prompt=args.batch_size, + ) diff --git a/benchmarks/benchmark_sd.py b/benchmarks/benchmark_sd.py index c12677d9e420..3a2bdb90ee93 100644 --- a/benchmarks/benchmark_sd.py +++ b/benchmarks/benchmark_sd.py @@ -6,9 +6,6 @@ from benchmarks.base_classes import TextToImagePipeline # noqa: E402 -CKPT = "runwayml/stable-diffusion-v1-5" - - if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( @@ -27,8 +24,6 @@ parser.add_argument("--model_cpu_offload", action="store_true") parser.add_argument("--run_compile", action="store_true") args = parser.parse_args() - args.ckpt = CKPT benchmark_pipe = TextToImagePipeline(args) - - benchmark_pipe() + benchmark_pipe.benchmark() diff --git a/benchmarks/benchmark_sd_img.py b/benchmarks/benchmark_sd_img.py new file mode 100644 index 000000000000..8ba0d2be42b8 --- /dev/null +++ b/benchmarks/benchmark_sd_img.py @@ -0,0 +1,28 @@ +import argparse +import sys + + +sys.path.append(".") +from benchmarks.base_classes import ImageToImagePipeline # noqa: E402 + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--ckpt", + type=str, + default="runwayml/stable-diffusion-v1-5", + choices=[ + "runwayml/stable-diffusion-v1-5", + "stabilityai/stable-diffusion-2-1", + "stabilityai/stable-diffusion-xl-refiner-1.0", + ], + ) + parser.add_argument("--batch_size", type=int, default=1) + parser.add_argument("--num_inference_steps", type=int, default=50) + parser.add_argument("--model_cpu_offload", action="store_true") + parser.add_argument("--run_compile", action="store_true") + args = parser.parse_args() + + benchmark_pipe = ImageToImagePipeline(args) + benchmark_pipe.benchmark() From 274b9e17665b6a4dfde3bb4680fab45fdd99b7b0 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Wed, 29 Nov 2023 17:20:35 +0530 Subject: [PATCH 24/99] add inpainting pipeline --- benchmarks/base_classes.py | 20 +++++++++++++++++++- benchmarks/benchmark_sd_inpating.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 1 deletion(-) create mode 100644 benchmarks/benchmark_sd_inpating.py diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 05d2e8102c03..cdb0bd525044 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -3,7 +3,7 @@ import torch -from diffusers import AutoPipelineForImage2Image, AutoPipelineForText2Image +from diffusers import AutoPipelineForImage2Image, AutoPipelineForText2Image, AutoPipelineForInpainting from diffusers.utils import load_image @@ -80,3 +80,21 @@ def run_inference(self, pipe, args): num_inference_steps=args.num_inference_steps, num_images_per_prompt=args.batch_size, ) + + +class InpatingPipeline(ImageToImagePipeline): + pipeline_class = AutoPipelineForInpainting + mask_url = 
"https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png" + mask = load_image(mask_url).convert("RGB") + + def run_inference(self, pipe, args): + self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt]) + self.mask = self.mask.resize(RESOLUTION_MAPPING[args.ckpt]) + + _ = pipe( + prompt=PROMPT, + image=self.image, + mask_image=self.mask, + num_inference_steps=args.num_inference_steps, + num_images_per_prompt=args.batch_size, + ) \ No newline at end of file diff --git a/benchmarks/benchmark_sd_inpating.py b/benchmarks/benchmark_sd_inpating.py new file mode 100644 index 000000000000..dab38d204906 --- /dev/null +++ b/benchmarks/benchmark_sd_inpating.py @@ -0,0 +1,28 @@ +import argparse +import sys + + +sys.path.append(".") +from benchmarks.base_classes import InpatingPipeline # noqa: E402 + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--ckpt", + type=str, + default="runwayml/stable-diffusion-v1-5", + choices=[ + "runwayml/stable-diffusion-v1-5", + "stabilityai/stable-diffusion-2-1", + "stabilityai/stable-diffusion-xl-base-1.0", + ], + ) + parser.add_argument("--batch_size", type=int, default=1) + parser.add_argument("--num_inference_steps", type=int, default=50) + parser.add_argument("--model_cpu_offload", action="store_true") + parser.add_argument("--run_compile", action="store_true") + args = parser.parse_args() + + benchmark_pipe = InpatingPipeline(args) + benchmark_pipe.benchmark() From 2b5b8aee2af3761b226acba22954f7a0bd897e85 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Wed, 29 Nov 2023 19:01:57 +0530 Subject: [PATCH 25/99] intoduce base benchmark class. --- benchmarks/base_classes.py | 25 +++++++++++++++++++------ benchmarks/benchmark_sd.py | 4 ++-- benchmarks/benchmark_sd_img.py | 4 ++-- benchmarks/benchmark_sd_inpating.py | 4 ++-- 4 files changed, 25 insertions(+), 12 deletions(-) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index cdb0bd525044..add4fd0edd13 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -3,7 +3,7 @@ import torch -from diffusers import AutoPipelineForImage2Image, AutoPipelineForText2Image, AutoPipelineForInpainting +from diffusers import AutoPipelineForImage2Image, AutoPipelineForInpainting, AutoPipelineForText2Image from diffusers.utils import load_image @@ -27,7 +27,20 @@ } -class TextToImagePipeline: +class BaseBenchmak: + pipeline_class = None + + def __init__(self, args): + super().__init__() + + def run_inference(self, args): + raise NotImplementedError + + def benchmark(self, args): + raise NotImplementedError + + +class TextToImageBenchmark(BaseBenchmak): pipeline_class = AutoPipelineForText2Image def __init__(self, args): @@ -66,7 +79,7 @@ def benchmark(self, args): print(f"Logs written to: {filepath}") -class ImageToImagePipeline(TextToImagePipeline): +class ImageToImageBenchmark(TextToImageBenchmark): pipeline_class = AutoPipelineForImage2Image url = "https://upload.wikimedia.org/wikipedia/commons/thumb/0/0f/1665_Girl_with_a_Pearl_Earring.jpg/800px-1665_Girl_with_a_Pearl_Earring.jpg" image = load_image(url).convert("RGB") @@ -82,11 +95,11 @@ def run_inference(self, pipe, args): ) -class InpatingPipeline(ImageToImagePipeline): +class InpatingBenchmark(ImageToImageBenchmark): pipeline_class = AutoPipelineForInpainting mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png" mask = 
load_image(mask_url).convert("RGB") - + def run_inference(self, pipe, args): self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt]) self.mask = self.mask.resize(RESOLUTION_MAPPING[args.ckpt]) @@ -97,4 +110,4 @@ def run_inference(self, pipe, args): mask_image=self.mask, num_inference_steps=args.num_inference_steps, num_images_per_prompt=args.batch_size, - ) \ No newline at end of file + ) diff --git a/benchmarks/benchmark_sd.py b/benchmarks/benchmark_sd.py index 3a2bdb90ee93..4c6495e61eb2 100644 --- a/benchmarks/benchmark_sd.py +++ b/benchmarks/benchmark_sd.py @@ -3,7 +3,7 @@ sys.path.append(".") -from benchmarks.base_classes import TextToImagePipeline # noqa: E402 +from benchmarks.base_classes import TextToImageBenchmark # noqa: E402 if __name__ == "__main__": @@ -25,5 +25,5 @@ parser.add_argument("--run_compile", action="store_true") args = parser.parse_args() - benchmark_pipe = TextToImagePipeline(args) + benchmark_pipe = TextToImageBenchmark(args) benchmark_pipe.benchmark() diff --git a/benchmarks/benchmark_sd_img.py b/benchmarks/benchmark_sd_img.py index 8ba0d2be42b8..74440b6ada05 100644 --- a/benchmarks/benchmark_sd_img.py +++ b/benchmarks/benchmark_sd_img.py @@ -3,7 +3,7 @@ sys.path.append(".") -from benchmarks.base_classes import ImageToImagePipeline # noqa: E402 +from benchmarks.base_classes import ImageToImageBenchmark # noqa: E402 if __name__ == "__main__": @@ -24,5 +24,5 @@ parser.add_argument("--run_compile", action="store_true") args = parser.parse_args() - benchmark_pipe = ImageToImagePipeline(args) + benchmark_pipe = ImageToImageBenchmark(args) benchmark_pipe.benchmark() diff --git a/benchmarks/benchmark_sd_inpating.py b/benchmarks/benchmark_sd_inpating.py index dab38d204906..6167775311a6 100644 --- a/benchmarks/benchmark_sd_inpating.py +++ b/benchmarks/benchmark_sd_inpating.py @@ -3,7 +3,7 @@ sys.path.append(".") -from benchmarks.base_classes import InpatingPipeline # noqa: E402 +from benchmarks.base_classes import InpatingBenchmark # noqa: E402 if __name__ == "__main__": @@ -24,5 +24,5 @@ parser.add_argument("--run_compile", action="store_true") args = parser.parse_args() - benchmark_pipe = InpatingPipeline(args) + benchmark_pipe = InpatingBenchmark(args) benchmark_pipe.benchmark() From 66b159aa52afe9f0d974e113d9847eb22d9424e0 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 30 Nov 2023 07:31:39 +0530 Subject: [PATCH 26/99] add img2img and inpainting --- .github/workflows/benchmark.yml | 7 +++---- benchmarks/base_classes.py | 2 +- ...benchmark_sd_inpating.py => benchmark_sd_inpainting.py} | 4 ++-- 3 files changed, 6 insertions(+), 7 deletions(-) rename benchmarks/{benchmark_sd_inpating.py => benchmark_sd_inpainting.py} (86%) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index a20142bc36c1..a474a0528a06 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -45,10 +45,9 @@ jobs: HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} run: | cd benchmarks && mkdir benchmark_outputs && \ - python benchmark_sd.py && \ - python benchmark_sd.py --batch_size 4 && \ - python benchmark_sd.py --run_compile && \ - python benchmark_sd.py --batch_size 4 --run_compile && \ + python benchmark_sd.py && python benchmark_sd.py --run_compile && \ + python benchmark_sd_img.py && python benchmark_sd_img.py --run_compile && \ + python benchmark_sd_inpainting.py && python benchmark_sd_inpainting.py --run_compile && \ python push_results.py - name: Test suite reports artifacts diff --git a/benchmarks/base_classes.py 
b/benchmarks/base_classes.py index add4fd0edd13..2132b2fa8a1a 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -95,7 +95,7 @@ def run_inference(self, pipe, args): ) -class InpatingBenchmark(ImageToImageBenchmark): +class InpaintingBenchmark(ImageToImageBenchmark): pipeline_class = AutoPipelineForInpainting mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png" mask = load_image(mask_url).convert("RGB") diff --git a/benchmarks/benchmark_sd_inpating.py b/benchmarks/benchmark_sd_inpainting.py similarity index 86% rename from benchmarks/benchmark_sd_inpating.py rename to benchmarks/benchmark_sd_inpainting.py index 6167775311a6..7c8afef6eb58 100644 --- a/benchmarks/benchmark_sd_inpating.py +++ b/benchmarks/benchmark_sd_inpainting.py @@ -3,7 +3,7 @@ sys.path.append(".") -from benchmarks.base_classes import InpatingBenchmark # noqa: E402 +from benchmarks.base_classes import InpaintingBenchmark # noqa: E402 if __name__ == "__main__": @@ -24,5 +24,5 @@ parser.add_argument("--run_compile", action="store_true") args = parser.parse_args() - benchmark_pipe = InpatingBenchmark(args) + benchmark_pipe = InpaintingBenchmark(args) benchmark_pipe.benchmark() From 01addbd7ff43f422cb38020f7dc52205d5586a32 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 1 Dec 2023 08:29:57 +0530 Subject: [PATCH 27/99] feat: utility to compare changes --- .github/workflows/benchmark.yml | 1 + benchmarks/push_results.py | 38 +++++++++++++++++++++++++++------ benchmarks/utils.py | 4 ++++ 3 files changed, 37 insertions(+), 6 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index a474a0528a06..b0a05a31813d 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -37,6 +37,7 @@ jobs: apt-get update && apt-get install libsndfile1-dev libgl1 -y python -m pip install -e .[quality,test] python -m pip install git+https://github.com/huggingface/accelerate.git + python -m pip install pandas - name: Environment run: | python utils/print_env.py diff --git a/benchmarks/push_results.py b/benchmarks/push_results.py index 9665e6b19b77..c9fec2b7b4a4 100644 --- a/benchmarks/push_results.py +++ b/benchmarks/push_results.py @@ -1,23 +1,49 @@ import glob -import os import sys -from huggingface_hub import upload_file +import pandas as pd +from huggingface_hub import hf_hub_download, upload_file sys.path.append(".") -from benchmarks.utils import BASE_PATH, collate_csv # noqa: E402 +from benchmarks.utils import BASE_PATH, FINAL_CSV_FILE, GITHUB_SHA, REPO_ID, collate_csv # noqa: E402 -FINAL_CSV_FILE = "collated_results.csv" -REPO_ID = "diffusers/benchmarks" -GITHUB_SHA = os.getenv("GITHUB_SHA", None) +def has_previous_benchmark() -> str: + csv_path = None + try: + csv_path = hf_hub_download(repo_id=REPO_ID, repo_type="dataset", filename=FINAL_CSV_FILE) + except FileNotFoundError: + csv_path = None + return csv_path def push_to_hf_dataset(): all_csvs = sorted(glob.glob(f"{BASE_PATH}/*.csv")) collate_csv(all_csvs, FINAL_CSV_FILE) + # If there's an existing benchmark file, we should report the changes. 
+ csv_path = has_previous_benchmark() + if csv_path is not None: + current_results = pd.read_csv(FINAL_CSV_FILE) + previous_results = pd.read_csv(csv_path) + numeric_columns = current_results.select_dtypes(include=["float64", "int64"]).columns + numeric_columns = [c for c in numeric_columns if c not in ["batch_size", "num_inference_steps"]] + + for column in numeric_columns: + # Calculate the percentage change + current_results[column] = current_results[column].astype(float) + previous_results[column] = previous_results[column].astype(float) + percent_change = ((current_results[column] - previous_results[column]) / previous_results[column]) * 100 + + # Format the values with '+' or '-' sign and append to original values + current_results[column] = current_results[column].map(str) + percent_change.map( + lambda x: f" ({'+' if x > 0 else ''}{x:.2f}%)" + ) + + # Overwrite the current result file. + current_results.to_csv(FINAL_CSV_FILE, index=False) + commit_message = f"upload from sha: {GITHUB_SHA}" if GITHUB_SHA is not None else "upload benchmark results" upload_file( repo_id=REPO_ID, diff --git a/benchmarks/utils.py b/benchmarks/utils.py index a1e4f169c634..8592a2f4f0db 100644 --- a/benchmarks/utils.py +++ b/benchmarks/utils.py @@ -22,10 +22,14 @@ "actual_gpu_memory (gbs)", "github_sha", ] + PROMPT = "ghibli style, a fantasy landscape with castles" BASE_PATH = "benchmark_outputs" TOTAL_GPU_MEMORY = torch.cuda.get_device_properties(0).total_memory / (1024**3) +REPO_ID = "diffusers/benchmarks" +FINAL_CSV_FILE = "collated_results.csv" + @dataclass class BenchmarkInfo: From c30cab61a56e2bc80f3bbd9a94714fbba6bd5ccd Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 1 Dec 2023 08:57:35 +0530 Subject: [PATCH 28/99] fix --- benchmarks/push_results.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/benchmarks/push_results.py b/benchmarks/push_results.py index c9fec2b7b4a4..53cfcb712f43 100644 --- a/benchmarks/push_results.py +++ b/benchmarks/push_results.py @@ -28,7 +28,11 @@ def push_to_hf_dataset(): current_results = pd.read_csv(FINAL_CSV_FILE) previous_results = pd.read_csv(csv_path) numeric_columns = current_results.select_dtypes(include=["float64", "int64"]).columns - numeric_columns = [c for c in numeric_columns if c not in ["batch_size", "num_inference_steps"]] + numeric_columns = [ + c + for c in numeric_columns + if c not in ["batch_size", "num_inference_steps", "actual_gpu_memory (gbs)", "github_sha"] + ] for column in numeric_columns: # Calculate the percentage change From 689b9f7f5d787c8e40da726757f27ce206f4a7da Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 1 Dec 2023 09:30:42 +0530 Subject: [PATCH 29/99] fix import --- benchmarks/base_classes.py | 2 +- benchmarks/benchmark_sd.py | 2 +- benchmarks/benchmark_sd_img.py | 2 +- benchmarks/benchmark_sd_inpainting.py | 2 +- benchmarks/push_results.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 2132b2fa8a1a..61215880c146 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -9,7 +9,7 @@ sys.path.append(".") -from benchmarks.utils import ( # noqa: E402 +from utils import ( # noqa: E402 BASE_PATH, PROMPT, BenchmarkInfo, diff --git a/benchmarks/benchmark_sd.py b/benchmarks/benchmark_sd.py index 4c6495e61eb2..d313d26ac99b 100644 --- a/benchmarks/benchmark_sd.py +++ b/benchmarks/benchmark_sd.py @@ -3,7 +3,7 @@ sys.path.append(".") -from benchmarks.base_classes import TextToImageBenchmark # noqa: E402 +from 
base_classes import TextToImageBenchmark # noqa: E402 if __name__ == "__main__": diff --git a/benchmarks/benchmark_sd_img.py b/benchmarks/benchmark_sd_img.py index 74440b6ada05..f34b521a1606 100644 --- a/benchmarks/benchmark_sd_img.py +++ b/benchmarks/benchmark_sd_img.py @@ -3,7 +3,7 @@ sys.path.append(".") -from benchmarks.base_classes import ImageToImageBenchmark # noqa: E402 +from base_classes import ImageToImageBenchmark # noqa: E402 if __name__ == "__main__": diff --git a/benchmarks/benchmark_sd_inpainting.py b/benchmarks/benchmark_sd_inpainting.py index 7c8afef6eb58..aadccac32e02 100644 --- a/benchmarks/benchmark_sd_inpainting.py +++ b/benchmarks/benchmark_sd_inpainting.py @@ -3,7 +3,7 @@ sys.path.append(".") -from benchmarks.base_classes import InpaintingBenchmark # noqa: E402 +from base_classes import InpaintingBenchmark # noqa: E402 if __name__ == "__main__": diff --git a/benchmarks/push_results.py b/benchmarks/push_results.py index 53cfcb712f43..7b3e61a04977 100644 --- a/benchmarks/push_results.py +++ b/benchmarks/push_results.py @@ -6,7 +6,7 @@ sys.path.append(".") -from benchmarks.utils import BASE_PATH, FINAL_CSV_FILE, GITHUB_SHA, REPO_ID, collate_csv # noqa: E402 +from utils import BASE_PATH, FINAL_CSV_FILE, GITHUB_SHA, REPO_ID, collate_csv # noqa: E402 def has_previous_benchmark() -> str: From d046a2559e0e15995386d8cdd7468460e2dc7bcd Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 1 Dec 2023 09:35:08 +0530 Subject: [PATCH 30/99] add args --- benchmarks/benchmark_sd.py | 2 +- benchmarks/benchmark_sd_img.py | 2 +- benchmarks/benchmark_sd_inpainting.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/benchmark_sd.py b/benchmarks/benchmark_sd.py index d313d26ac99b..0fa24a08d639 100644 --- a/benchmarks/benchmark_sd.py +++ b/benchmarks/benchmark_sd.py @@ -26,4 +26,4 @@ args = parser.parse_args() benchmark_pipe = TextToImageBenchmark(args) - benchmark_pipe.benchmark() + benchmark_pipe.benchmark(args) diff --git a/benchmarks/benchmark_sd_img.py b/benchmarks/benchmark_sd_img.py index f34b521a1606..5525b4dae60b 100644 --- a/benchmarks/benchmark_sd_img.py +++ b/benchmarks/benchmark_sd_img.py @@ -25,4 +25,4 @@ args = parser.parse_args() benchmark_pipe = ImageToImageBenchmark(args) - benchmark_pipe.benchmark() + benchmark_pipe.benchmark(args) diff --git a/benchmarks/benchmark_sd_inpainting.py b/benchmarks/benchmark_sd_inpainting.py index aadccac32e02..8f36883e16f3 100644 --- a/benchmarks/benchmark_sd_inpainting.py +++ b/benchmarks/benchmark_sd_inpainting.py @@ -25,4 +25,4 @@ args = parser.parse_args() benchmark_pipe = InpaintingBenchmark(args) - benchmark_pipe.benchmark() + benchmark_pipe.benchmark(args) From 71f6bd9e83092241b2c850913ef736141113a26a Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 1 Dec 2023 09:38:43 +0530 Subject: [PATCH 31/99] basepath --- .github/workflows/benchmark.yml | 3 ++- benchmarks/utils.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index b0a05a31813d..b99bc8b9405a 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -44,8 +44,9 @@ jobs: - name: Stable Diffusion Benchmarking Tests env: HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + BASE_PATH: benchmark_outputs run: | - cd benchmarks && mkdir benchmark_outputs && \ + cd benchmarks && mkdir ${BASE_PATH} && \ python benchmark_sd.py && python benchmark_sd.py --run_compile && \ python benchmark_sd_img.py && python benchmark_sd_img.py 
--run_compile && \ python benchmark_sd_inpainting.py && python benchmark_sd_inpainting.py --run_compile && \ diff --git a/benchmarks/utils.py b/benchmarks/utils.py index 8592a2f4f0db..98b3ab4afe56 100644 --- a/benchmarks/utils.py +++ b/benchmarks/utils.py @@ -24,7 +24,7 @@ ] PROMPT = "ghibli style, a fantasy landscape with castles" -BASE_PATH = "benchmark_outputs" +BASE_PATH = os.getenv("BASE_PATH", ".") TOTAL_GPU_MEMORY = torch.cuda.get_device_properties(0).total_memory / (1024**3) REPO_ID = "diffusers/benchmarks" From 295cf305a3a9ac76a6e9aec214e36528c6419d6a Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 1 Dec 2023 09:51:46 +0530 Subject: [PATCH 32/99] better exception handling --- benchmarks/push_results.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/push_results.py b/benchmarks/push_results.py index 7b3e61a04977..6ab3dc3b10c4 100644 --- a/benchmarks/push_results.py +++ b/benchmarks/push_results.py @@ -3,6 +3,7 @@ import pandas as pd from huggingface_hub import hf_hub_download, upload_file +from huggingface_hub.utils._errors import EntryNotFoundError sys.path.append(".") @@ -13,7 +14,7 @@ def has_previous_benchmark() -> str: csv_path = None try: csv_path = hf_hub_download(repo_id=REPO_ID, repo_type="dataset", filename=FINAL_CSV_FILE) - except FileNotFoundError: + except EntryNotFoundError: csv_path = None return csv_path From b5e237115afd58e3b2e24f28b3a3ffdd1797d94f Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 1 Dec 2023 09:53:26 +0530 Subject: [PATCH 33/99] better path handling --- benchmarks/base_classes.py | 4 +++- benchmarks/utils.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 61215880c146..99f363b5fca8 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -72,7 +72,9 @@ def benchmark(self, args): ) name = ( args.ckpt.replace("/", "_") - + f"-bs@{args.batch_size}-steps@{args.num_inference_steps}-mco@{args.model_cpu_offload}-compile@{args.run_compile}.csv" + + "_" + + self.pipe.__class__.__name + + +f"-bs@{args.batch_size}-steps@{args.num_inference_steps}-mco@{args.model_cpu_offload}-compile@{args.run_compile}.csv" ) filepath = os.path.join(BASE_PATH, name) write_to_csv(filepath, csv_dict) diff --git a/benchmarks/utils.py b/benchmarks/utils.py index 98b3ab4afe56..88c09be6d54d 100644 --- a/benchmarks/utils.py +++ b/benchmarks/utils.py @@ -24,7 +24,7 @@ ] PROMPT = "ghibli style, a fantasy landscape with castles" -BASE_PATH = os.getenv("BASE_PATH", ".") +BASE_PATH = os.getenv("BASE_PATH", ".") TOTAL_GPU_MEMORY = torch.cuda.get_device_properties(0).total_memory / (1024**3) REPO_ID = "diffusers/benchmarks" From e7aed9ec8f13d066165d1ad73e75d17f28be5f61 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 1 Dec 2023 09:57:06 +0530 Subject: [PATCH 34/99] fix --- benchmarks/base_classes.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 99f363b5fca8..83b526b0370d 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -67,13 +67,14 @@ def benchmark(self, args): memory = bytes_to_giga_bytes(torch.cuda.max_memory_allocated()) # in GBs. 
benchmark_info = BenchmarkInfo(time=time, memory=memory) + pipeline_class_name = str(self.pipe.__class__.__name__) csv_dict = generate_csv_dict( - pipeline_cls=str(self.pipe.__class__.__name__), ckpt=args.ckpt, args=args, benchmark_info=benchmark_info + pipeline_cls=pipeline_class_name, ckpt=args.ckpt, args=args, benchmark_info=benchmark_info ) name = ( args.ckpt.replace("/", "_") + "_" - + self.pipe.__class__.__name + + pipeline_class_name + +f"-bs@{args.batch_size}-steps@{args.num_inference_steps}-mco@{args.model_cpu_offload}-compile@{args.run_compile}.csv" ) filepath = os.path.join(BASE_PATH, name) From 8eb8baffbac75488ad1a4a72b0e306415a2a3e72 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 1 Dec 2023 09:59:56 +0530 Subject: [PATCH 35/99] fix --- benchmarks/base_classes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 83b526b0370d..d3844ad52c43 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -75,7 +75,7 @@ def benchmark(self, args): args.ckpt.replace("/", "_") + "_" + pipeline_class_name - + +f"-bs@{args.batch_size}-steps@{args.num_inference_steps}-mco@{args.model_cpu_offload}-compile@{args.run_compile}.csv" + + f"-bs@{args.batch_size}-steps@{args.num_inference_steps}-mco@{args.model_cpu_offload}-compile@{args.run_compile}.csv" ) filepath = os.path.join(BASE_PATH, name) write_to_csv(filepath, csv_dict) From 3cb02f8bd1071077499ece67717991a8bbec1111 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 1 Dec 2023 10:50:59 +0530 Subject: [PATCH 36/99] remove --- benchmarks/base_classes.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index d3844ad52c43..1d3c6d1b77b2 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -3,7 +3,7 @@ import torch -from diffusers import AutoPipelineForImage2Image, AutoPipelineForInpainting, AutoPipelineForText2Image +from diffusers import ControlNetModel, StableDiffusionXLControlNetPipeline, AutoPipelineForImage2Image, AutoPipelineForInpainting, AutoPipelineForText2Image, StableDiffusionControlNetPipeline from diffusers.utils import load_image @@ -114,3 +114,22 @@ def run_inference(self, pipe, args): num_inference_steps=args.num_inference_steps, num_images_per_prompt=args.batch_size, ) + + +class ControlNetBenchmark(BaseBenchmak): # Pick up + pipeline_class = StableDiffusionControlNetPipeline + aux_network_class = ControlNetModel + image_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png" + mask = load_image(image_url).convert("RGB") + + def run_inference(self, pipe, args): + self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt]) + self.mask = self.mask.resize(RESOLUTION_MAPPING[args.ckpt]) + + _ = pipe( + prompt=PROMPT, + image=self.image, + mask_image=self.mask, + num_inference_steps=args.num_inference_steps, + num_images_per_prompt=args.batch_size, + ) \ No newline at end of file From 60c980c8ac2b1e9076d68dd76a31c2c6279ba2d4 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 1 Dec 2023 11:07:46 +0530 Subject: [PATCH 37/99] ifx --- benchmarks/base_classes.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 1d3c6d1b77b2..6526074f31d0 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -87,9 +87,11 @@ class 
ImageToImageBenchmark(TextToImageBenchmark): url = "https://upload.wikimedia.org/wikipedia/commons/thumb/0/0f/1665_Girl_with_a_Pearl_Earring.jpg/800px-1665_Girl_with_a_Pearl_Earring.jpg" image = load_image(url).convert("RGB") - def run_inference(self, pipe, args): + def __init__(self, args): + super.__init__(args) self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt]) + def run_inference(self, pipe, args): _ = pipe( prompt=PROMPT, image=self.image, @@ -103,10 +105,12 @@ class InpaintingBenchmark(ImageToImageBenchmark): mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png" mask = load_image(mask_url).convert("RGB") - def run_inference(self, pipe, args): + def __init__(self, args): + super.__init__(args) self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt]) self.mask = self.mask.resize(RESOLUTION_MAPPING[args.ckpt]) + def run_inference(self, pipe, args): _ = pipe( prompt=PROMPT, image=self.image, @@ -116,15 +120,19 @@ def run_inference(self, pipe, args): ) -class ControlNetBenchmark(BaseBenchmak): # Pick up +class ControlNetBenchmark(BaseBenchmak): pipeline_class = StableDiffusionControlNetPipeline aux_network_class = ControlNetModel + + # TODO: change the URL. image_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png" - mask = load_image(image_url).convert("RGB") + image = load_image(image_url).convert("RGB") - def run_inference(self, pipe, args): + def __init__(self, args): + self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt]) - self.mask = self.mask.resize(RESOLUTION_MAPPING[args.ckpt]) + + def run_inference(self, pipe, args): _ = pipe( prompt=PROMPT, From cd91b622c8b9de5a42ca44b5b3b2b01993c0ee85 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 1 Dec 2023 17:15:27 +0530 Subject: [PATCH 38/99] fix --- benchmarks/base_classes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 6526074f31d0..a4aba9439743 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -88,7 +88,7 @@ class ImageToImageBenchmark(TextToImageBenchmark): image = load_image(url).convert("RGB") def __init__(self, args): - super.__init__(args) + super().__init__(args) self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt]) def run_inference(self, pipe, args): @@ -106,7 +106,7 @@ class InpaintingBenchmark(ImageToImageBenchmark): mask = load_image(mask_url).convert("RGB") def __init__(self, args): - super.__init__(args) + super().__init__(args) self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt]) self.mask = self.mask.resize(RESOLUTION_MAPPING[args.ckpt]) From 1782d5a1815fb6e05d84836efaa562735338e534 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 09:07:11 +0530 Subject: [PATCH 39/99] add: support for controlnet. 
--- .github/workflows/benchmark.yml | 1 + benchmarks/base_classes.py | 72 +++++++++++++++++++++++------- benchmarks/benchmark_controlnet.py | 26 +++++++++++ 3 files changed, 83 insertions(+), 16 deletions(-) create mode 100644 benchmarks/benchmark_controlnet.py diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index b99bc8b9405a..08e670c29d7d 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -50,6 +50,7 @@ jobs: python benchmark_sd.py && python benchmark_sd.py --run_compile && \ python benchmark_sd_img.py && python benchmark_sd_img.py --run_compile && \ python benchmark_sd_inpainting.py && python benchmark_sd_inpainting.py --run_compile && \ + python benchmark_controlnet.py && python benchmark_sd_inpainting.py --run_compile && \ python push_results.py - name: Test suite reports artifacts diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index a4aba9439743..cb6338cf58ff 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -3,7 +3,14 @@ import torch -from diffusers import ControlNetModel, StableDiffusionXLControlNetPipeline, AutoPipelineForImage2Image, AutoPipelineForInpainting, AutoPipelineForText2Image, StableDiffusionControlNetPipeline +from diffusers import ( + AutoPipelineForImage2Image, + AutoPipelineForInpainting, + AutoPipelineForText2Image, + ControlNetModel, + StableDiffusionControlNetPipeline, + StableDiffusionXLControlNetPipeline, +) from diffusers.utils import load_image @@ -15,6 +22,7 @@ BenchmarkInfo, benchmark_fn, bytes_to_giga_bytes, + flush, generate_csv_dict, write_to_csv, ) @@ -22,10 +30,14 @@ RESOLUTION_MAPPING = { "runwayml/stable-diffusion-v1-5": (512, 512), + "lllyasviel/sd-controlnet-canny": (512, 512), + "diffusers/controlnet-canny-sdxl-1.0": (1024, 1024), "stabilityai/stable-diffusion-2-1": (768, 768), "stabilityai/stable-diffusion-xl-refiner-1.0": (1024, 1024), } +CONTROLNET_MAPPING = {} + class BaseBenchmak: pipeline_class = None @@ -39,6 +51,17 @@ def run_inference(self, args): def benchmark(self, args): raise NotImplementedError + def get_result_filepath(self, args): + pipeline_class_name = str(self.pipe.__class__.__name__) + name = ( + args.ckpt.replace("/", "_") + + "_" + + pipeline_class_name + + f"-bs@{args.batch_size}-steps@{args.num_inference_steps}-mco@{args.model_cpu_offload}-compile@{args.run_compile}.csv" + ) + filepath = os.path.join(BASE_PATH, name) + return filepath + class TextToImageBenchmark(BaseBenchmak): pipeline_class = AutoPipelineForText2Image @@ -71,15 +94,10 @@ def benchmark(self, args): csv_dict = generate_csv_dict( pipeline_cls=pipeline_class_name, ckpt=args.ckpt, args=args, benchmark_info=benchmark_info ) - name = ( - args.ckpt.replace("/", "_") - + "_" - + pipeline_class_name - + f"-bs@{args.batch_size}-steps@{args.num_inference_steps}-mco@{args.model_cpu_offload}-compile@{args.run_compile}.csv" - ) - filepath = os.path.join(BASE_PATH, name) + filepath = self.get_result_filepath(args) write_to_csv(filepath, csv_dict) print(f"Logs written to: {filepath}") + flush() class ImageToImageBenchmark(TextToImageBenchmark): @@ -120,24 +138,46 @@ def run_inference(self, pipe, args): ) -class ControlNetBenchmark(BaseBenchmak): - pipeline_class = StableDiffusionControlNetPipeline +class ControlNetBenchmark(TextToImageBenchmark): + pipeline_class = StableDiffusionControlNetPipeline aux_network_class = ControlNetModel - # TODO: change the URL. 
- image_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png" + image_url = ( + "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/canny_image_condition.png" + ) image = load_image(image_url).convert("RGB") def __init__(self, args): - + if isinstance(self.pipeline_class, StableDiffusionControlNetPipeline): + root_ckpt = "runwayml/stable-diffusion-v1-5" + elif isinstance(self.pipeline_class, StableDiffusionXLControlNetPipeline): + root_ckpt = "stabilityai/stable-diffusion-xl-base-1.0" + + aux_network = self.aux_network_class.from_pretrained( + args.ckpt, torch_dtype=torch.float16, use_safetensors=True + ) + pipe = self.pipeline_class.from_pretrained( + root_ckpt, controlnet=aux_network, torch_dtype=torch.float16, use_safetensors=True + ) + pipe = pipe.to("cuda") + + if args.run_compile: + pipe.unet.to(memory_format=torch.channels_last) + pipe.controlnet.to(memory_format=torch.channels_last) + print("Run torch compile") + pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) + pipe.controlnet = torch.compile(pipe.controlnet, mode="reduce-overhead", fullgraph=True) + self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt]) def run_inference(self, pipe, args): - _ = pipe( prompt=PROMPT, image=self.image, - mask_image=self.mask, num_inference_steps=args.num_inference_steps, num_images_per_prompt=args.batch_size, - ) \ No newline at end of file + ) + + +class ControlNetSDXLBenchmark(ControlNetBenchmark): + pipeline_class = StableDiffusionXLControlNetPipeline diff --git a/benchmarks/benchmark_controlnet.py b/benchmarks/benchmark_controlnet.py new file mode 100644 index 000000000000..9217004461dc --- /dev/null +++ b/benchmarks/benchmark_controlnet.py @@ -0,0 +1,26 @@ +import argparse +import sys + + +sys.path.append(".") +from base_classes import ControlNetBenchmark, ControlNetSDXLBenchmark # noqa: E402 + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--ckpt", + type=str, + default="lllyasviel/sd-controlnet-canny", + choices=["lllyasviel/sd-controlnet-canny", "diffusers/controlnet-canny-sdxl-1.0"], + ) + parser.add_argument("--batch_size", type=int, default=1) + parser.add_argument("--num_inference_steps", type=int, default=50) + parser.add_argument("--model_cpu_offload", action="store_true") + parser.add_argument("--run_compile", action="store_true") + args = parser.parse_args() + + benchmark_pipe = ( + ControlNetBenchmark(args) if args.ckpt == "lllyasviel/sd-controlnet-canny" else ControlNetSDXLBenchmark(args) + ) + benchmark_pipe.benchmark(args) From df5dead87930baa617de2644d1ca5c5196b4952a Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 09:21:59 +0530 Subject: [PATCH 40/99] image_url -> url --- benchmarks/base_classes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index cb6338cf58ff..8b4dc8c4ef11 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -142,10 +142,10 @@ class ControlNetBenchmark(TextToImageBenchmark): pipeline_class = StableDiffusionControlNetPipeline aux_network_class = ControlNetModel - image_url = ( + url = ( "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/canny_image_condition.png" ) - image = load_image(image_url).convert("RGB") + image = load_image(url).convert("RGB") def __init__(self, args): if isinstance(self.pipeline_class, 
StableDiffusionControlNetPipeline): From c6c545c6b340bba46eea52b2d64a918a4a7b0dc1 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 09:25:39 +0530 Subject: [PATCH 41/99] move images to huggingface hub --- benchmarks/base_classes.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 8b4dc8c4ef11..209e84e678a0 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -102,7 +102,7 @@ def benchmark(self, args): class ImageToImageBenchmark(TextToImageBenchmark): pipeline_class = AutoPipelineForImage2Image - url = "https://upload.wikimedia.org/wikipedia/commons/thumb/0/0f/1665_Girl_with_a_Pearl_Earring.jpg/800px-1665_Girl_with_a_Pearl_Earring.jpg" + url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/1665_Girl_with_a_Pearl_Earring.jpg/800px-1665_Girl_with_a_Pearl_Earring.jpg" image = load_image(url).convert("RGB") def __init__(self, args): @@ -120,7 +120,7 @@ def run_inference(self, pipe, args): class InpaintingBenchmark(ImageToImageBenchmark): pipeline_class = AutoPipelineForInpainting - mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png" + mask_url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/overture-creations-5sI6fQgYIuo_mask.png" mask = load_image(mask_url).convert("RGB") def __init__(self, args): @@ -142,9 +142,7 @@ class ControlNetBenchmark(TextToImageBenchmark): pipeline_class = StableDiffusionControlNetPipeline aux_network_class = ControlNetModel - url = ( - "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/canny_image_condition.png" - ) + url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/canny_image_condition.png" image = load_image(url).convert("RGB") def __init__(self, args): From b358c87cb9f37435ed5e32c9b1235c93ad32a801 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 09:35:33 +0530 Subject: [PATCH 42/99] correct urls. 
--- benchmarks/base_classes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 209e84e678a0..31789e31457b 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -102,7 +102,7 @@ def benchmark(self, args): class ImageToImageBenchmark(TextToImageBenchmark): pipeline_class = AutoPipelineForImage2Image - url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/1665_Girl_with_a_Pearl_Earring.jpg/800px-1665_Girl_with_a_Pearl_Earring.jpg" + url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/1665_Girl_with_a_Pearl_Earring.jpg" image = load_image(url).convert("RGB") def __init__(self, args): From 93b491b4014fa5d3a97037c3fc011c2bc270a30e Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 09:59:04 +0530 Subject: [PATCH 43/99] root_ckpt --- .github/workflows/benchmark.yml | 2 +- benchmarks/base_classes.py | 9 +++------ 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 08e670c29d7d..1c807c436665 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -41,7 +41,7 @@ jobs: - name: Environment run: | python utils/print_env.py - - name: Stable Diffusion Benchmarking Tests + - name: Diffusers Benchmarking env: HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} BASE_PATH: benchmark_outputs diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 31789e31457b..6d3adb23ed43 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -141,21 +141,17 @@ def run_inference(self, pipe, args): class ControlNetBenchmark(TextToImageBenchmark): pipeline_class = StableDiffusionControlNetPipeline aux_network_class = ControlNetModel + root_ckpt = "runwayml/stable-diffusion-v1-5" url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/canny_image_condition.png" image = load_image(url).convert("RGB") def __init__(self, args): - if isinstance(self.pipeline_class, StableDiffusionControlNetPipeline): - root_ckpt = "runwayml/stable-diffusion-v1-5" - elif isinstance(self.pipeline_class, StableDiffusionXLControlNetPipeline): - root_ckpt = "stabilityai/stable-diffusion-xl-base-1.0" - aux_network = self.aux_network_class.from_pretrained( args.ckpt, torch_dtype=torch.float16, use_safetensors=True ) pipe = self.pipeline_class.from_pretrained( - root_ckpt, controlnet=aux_network, torch_dtype=torch.float16, use_safetensors=True + self.root_ckpt, controlnet=aux_network, torch_dtype=torch.float16, use_safetensors=True ) pipe = pipe.to("cuda") @@ -179,3 +175,4 @@ def run_inference(self, pipe, args): class ControlNetSDXLBenchmark(ControlNetBenchmark): pipeline_class = StableDiffusionXLControlNetPipeline + root_ckpt = "stabilityai/stable-diffusion-xl-base-1.0" From 748f6dcc3cc5e9b5f56ea82f8de7f9228e43a6c9 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 13:05:20 +0530 Subject: [PATCH 44/99] flush before benchmarking --- benchmarks/base_classes.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 6d3adb23ed43..5c9468643406 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -86,6 +86,8 @@ def run_inference(self, pipe, args): ) def benchmark(self, args): + flush() + time = benchmark_fn(self.run_inference, self.pipe, args) # in seconds. 
memory = bytes_to_giga_bytes(torch.cuda.max_memory_allocated()) # in GBs. benchmark_info = BenchmarkInfo(time=time, memory=memory) @@ -97,7 +99,6 @@ def benchmark(self, args): filepath = self.get_result_filepath(args) write_to_csv(filepath, csv_dict) print(f"Logs written to: {filepath}") - flush() class ImageToImageBenchmark(TextToImageBenchmark): From 5d5d5fdfbbe883728b7ae9f57eca05143725bbf4 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 13:07:34 +0530 Subject: [PATCH 45/99] don't install accelerate from source --- .github/workflows/benchmark.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 1c807c436665..7aa9c761032b 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -36,7 +36,6 @@ jobs: run: | apt-get update && apt-get install libsndfile1-dev libgl1 -y python -m pip install -e .[quality,test] - python -m pip install git+https://github.com/huggingface/accelerate.git python -m pip install pandas - name: Environment run: | From 46510825ba25de2a156d7e915ef65600a7914ff7 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 13:15:17 +0530 Subject: [PATCH 46/99] add runner --- benchmarks/run_all.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 benchmarks/run_all.py diff --git a/benchmarks/run_all.py b/benchmarks/run_all.py new file mode 100644 index 000000000000..ac91770fff09 --- /dev/null +++ b/benchmarks/run_all.py @@ -0,0 +1,17 @@ +import glob +import subprocess + + +PATTERN = "benchmark_*.py" + + +def main(): + python_files = glob.glob(PATTERN) + + for file in python_files: + subprocess.run(["python", file]) + subprocess.run(["python", f"{file} --run_compile"]) + + +if __name__ == "__main__": + main() From 8e805796d2b8cbb1240bb8719411ca7d9cf69601 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 13:16:46 +0530 Subject: [PATCH 47/99] simplify Diffusers Benchmarking step --- .github/workflows/benchmark.yml | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 7aa9c761032b..b671abd77faa 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -45,12 +45,7 @@ jobs: HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} BASE_PATH: benchmark_outputs run: | - cd benchmarks && mkdir ${BASE_PATH} && \ - python benchmark_sd.py && python benchmark_sd.py --run_compile && \ - python benchmark_sd_img.py && python benchmark_sd_img.py --run_compile && \ - python benchmark_sd_inpainting.py && python benchmark_sd_inpainting.py --run_compile && \ - python benchmark_controlnet.py && python benchmark_sd_inpainting.py --run_compile && \ - python push_results.py + cd benchmarks && mkdir ${BASE_PATH} && python run_all.py && python push_results.py - name: Test suite reports artifacts if: ${{ always() }} From d49ad655965ad2b48dc873c19f622428fd7bae57 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 14:27:47 +0530 Subject: [PATCH 48/99] change runner --- .github/workflows/benchmark.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index b671abd77faa..7b1877d5334c 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -20,7 +20,7 @@ jobs: strategy: fail-fast: false max-parallel: 1 - runs-on: docker-gpu + runs-on: [single-gpu, nvidia-gpu, a10, ci] container: image: 
diffusers/diffusers-pytorch-cuda options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0 From 7c7846b80262b02a96fc555508ba2ab485dc4da8 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 14:39:55 +0530 Subject: [PATCH 49/99] fix: subprocess call. --- benchmarks/run_all.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/run_all.py b/benchmarks/run_all.py index ac91770fff09..4e769d229ff3 100644 --- a/benchmarks/run_all.py +++ b/benchmarks/run_all.py @@ -10,7 +10,7 @@ def main(): for file in python_files: subprocess.run(["python", file]) - subprocess.run(["python", f"{file} --run_compile"]) + subprocess.run(["python", file, "--run_compile"]) if __name__ == "__main__": From 5dbcbf58b18b69a1da6d4eca755667c4182a8b57 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 14:45:38 +0530 Subject: [PATCH 50/99] filter percentage values --- benchmarks/push_results.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/benchmarks/push_results.py b/benchmarks/push_results.py index 6ab3dc3b10c4..0bfb0a8d0cf9 100644 --- a/benchmarks/push_results.py +++ b/benchmarks/push_results.py @@ -28,6 +28,7 @@ def push_to_hf_dataset(): if csv_path is not None: current_results = pd.read_csv(FINAL_CSV_FILE) previous_results = pd.read_csv(csv_path) + numeric_columns = current_results.select_dtypes(include=["float64", "int64"]).columns numeric_columns = [ c @@ -36,6 +37,8 @@ def push_to_hf_dataset(): ] for column in numeric_columns: + previous_results[column] = previous_results[column].apply(lambda x: x.split()[0]) + # Calculate the percentage change current_results[column] = current_results[column].astype(float) previous_results[column] = previous_results[column].astype(float) From cb8572a7da58ee50a4cf8943f86c0808a5f52ce3 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 14:51:37 +0530 Subject: [PATCH 51/99] fix controlnet benchmark --- benchmarks/base_classes.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 5c9468643406..6ed529efdf7a 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -156,6 +156,9 @@ def __init__(self, args): ) pipe = pipe.to("cuda") + pipe.set_progress_bar_config(disable=True) + self.pipe = pipe + if args.run_compile: pipe.unet.to(memory_format=torch.channels_last) pipe.controlnet.to(memory_format=torch.channels_last) From 6dec96cac28ab5ad1edd9b9a0085221143f199ff Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 15:02:23 +0530 Subject: [PATCH 52/99] add t2i adapters. 
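For context on the classes this patch adds: ControlNets and T2I-Adapters are wired into their pipelines through different keyword arguments (`controlnet=` vs. `adapter=`), which is why later patches in this series end up giving `T2IAdapterBenchmark` its own `__init__`. A minimal sketch of the two loading paths, using the same checkpoints the benchmarks use (illustrative only, not part of this patch):

import torch
from diffusers import (
    ControlNetModel,
    StableDiffusionAdapterPipeline,
    StableDiffusionControlNetPipeline,
    T2IAdapter,
)

# ControlNet: the auxiliary network is passed as `controlnet=`.
controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
controlnet_pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16
)

# T2I-Adapter: the auxiliary network is passed as `adapter=`.
adapter = T2IAdapter.from_pretrained("TencentARC/t2iadapter_canny_sd14v1", torch_dtype=torch.float16)
adapter_pipe = StableDiffusionAdapterPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4", adapter=adapter, torch_dtype=torch.float16
)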
--- benchmarks/base_classes.py | 12 ++++++++++++ benchmarks/benchmark_t2i_adapter.py | 26 ++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) create mode 100644 benchmarks/benchmark_t2i_adapter.py diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 6ed529efdf7a..68e3b40c110c 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -8,7 +8,9 @@ AutoPipelineForInpainting, AutoPipelineForText2Image, ControlNetModel, + StableDiffusionAdapterPipeline, StableDiffusionControlNetPipeline, + StableDiffusionXLAdapterPipeline, StableDiffusionXLControlNetPipeline, ) from diffusers.utils import load_image @@ -180,3 +182,13 @@ def run_inference(self, pipe, args): class ControlNetSDXLBenchmark(ControlNetBenchmark): pipeline_class = StableDiffusionXLControlNetPipeline root_ckpt = "stabilityai/stable-diffusion-xl-base-1.0" + + +class T2IAdapterBenchmark(ControlNetBenchmark): + pipeline_class = StableDiffusionAdapterPipeline + root_ckpt = "CompVis/stable-diffusion-v1-4" + + +class T2IAdapterSDXLBenchmark(T2IAdapterBenchmark): + pipeline_class = StableDiffusionXLAdapterPipeline + root_ckpt = "stabilityai/stable-diffusion-xl-base-1.0" diff --git a/benchmarks/benchmark_t2i_adapter.py b/benchmarks/benchmark_t2i_adapter.py new file mode 100644 index 000000000000..7016e5c66129 --- /dev/null +++ b/benchmarks/benchmark_t2i_adapter.py @@ -0,0 +1,26 @@ +import argparse +import sys + + +sys.path.append(".") +from base_classes import ControlNetBenchmark, ControlNetSDXLBenchmark # noqa: E402 + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--ckpt", + type=str, + default="TencentARC/t2iadapter_canny_sd14v1", + choices=["TencentARC/t2iadapter_canny_sd14v1", "TencentARC/t2i-adapter-canny-sdxl-1.0"], + ) + parser.add_argument("--batch_size", type=int, default=1) + parser.add_argument("--num_inference_steps", type=int, default=50) + parser.add_argument("--model_cpu_offload", action="store_true") + parser.add_argument("--run_compile", action="store_true") + args = parser.parse_args() + + benchmark_pipe = ( + ControlNetBenchmark(args) if args.ckpt == "lllyasviel/sd-controlnet-canny" else ControlNetSDXLBenchmark(args) + ) + benchmark_pipe.benchmark(args) From 86d597f6e8201dedc9b58b7bf91ded5ba3a7329f Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 15:04:16 +0530 Subject: [PATCH 53/99] fix filter columns --- benchmarks/push_results.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/benchmarks/push_results.py b/benchmarks/push_results.py index 0bfb0a8d0cf9..abc4f197f34e 100644 --- a/benchmarks/push_results.py +++ b/benchmarks/push_results.py @@ -18,6 +18,11 @@ def has_previous_benchmark() -> str: csv_path = None return csv_path +def filter_float(value): + if isinstance(value, str): + return value.split()[0] + return value + def push_to_hf_dataset(): all_csvs = sorted(glob.glob(f"{BASE_PATH}/*.csv")) @@ -37,7 +42,7 @@ def push_to_hf_dataset(): ] for column in numeric_columns: - previous_results[column] = previous_results[column].apply(lambda x: x.split()[0]) + previous_results[column] = previous_results[column].apply(lambda x: filter_float(x)) # Calculate the percentage change current_results[column] = current_results[column].astype(float) From fa7bfe13459fb7e015410b35bb95f9cbfa39b587 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 15:18:58 +0530 Subject: [PATCH 54/99] fix t2i adapter benchmark --- benchmarks/benchmark_t2i_adapter.py | 6 ++++-- benchmarks/push_results.py | 1 + 2 
files changed, 5 insertions(+), 2 deletions(-) diff --git a/benchmarks/benchmark_t2i_adapter.py b/benchmarks/benchmark_t2i_adapter.py index 7016e5c66129..44b04b470ea6 100644 --- a/benchmarks/benchmark_t2i_adapter.py +++ b/benchmarks/benchmark_t2i_adapter.py @@ -3,7 +3,7 @@ sys.path.append(".") -from base_classes import ControlNetBenchmark, ControlNetSDXLBenchmark # noqa: E402 +from base_classes import T2IAdapterBenchmark, T2IAdapterSDXLBenchmark # noqa: E402 if __name__ == "__main__": @@ -21,6 +21,8 @@ args = parser.parse_args() benchmark_pipe = ( - ControlNetBenchmark(args) if args.ckpt == "lllyasviel/sd-controlnet-canny" else ControlNetSDXLBenchmark(args) + T2IAdapterBenchmark(args) + if args.ckpt == "TencentARC/t2iadapter_canny_sd14v1" + else T2IAdapterSDXLBenchmark(args) ) benchmark_pipe.benchmark(args) diff --git a/benchmarks/push_results.py b/benchmarks/push_results.py index abc4f197f34e..fb2559802ebd 100644 --- a/benchmarks/push_results.py +++ b/benchmarks/push_results.py @@ -18,6 +18,7 @@ def has_previous_benchmark() -> str: csv_path = None return csv_path + def filter_float(value): if isinstance(value, str): return value.split()[0] From 59df524f3cb7114fdb98b21c43ec5254929f8f08 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 15:39:05 +0530 Subject: [PATCH 55/99] fix init. --- benchmarks/base_classes.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 68e3b40c110c..da9095d8f1cd 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -183,12 +183,21 @@ class ControlNetSDXLBenchmark(ControlNetBenchmark): pipeline_class = StableDiffusionXLControlNetPipeline root_ckpt = "stabilityai/stable-diffusion-xl-base-1.0" + def __init__(self, args): + super().__init__(args) + class T2IAdapterBenchmark(ControlNetBenchmark): pipeline_class = StableDiffusionAdapterPipeline root_ckpt = "CompVis/stable-diffusion-v1-4" + def __init__(self, args): + super().__init__(args) + class T2IAdapterSDXLBenchmark(T2IAdapterBenchmark): pipeline_class = StableDiffusionXLAdapterPipeline root_ckpt = "stabilityai/stable-diffusion-xl-base-1.0" + + def __init__(self, args): + super().__init__(args) From 3cd0f592247a062f1ff1dc085c743563551e9714 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 16:02:38 +0530 Subject: [PATCH 56/99] fix --- benchmarks/base_classes.py | 4 ++++ benchmarks/run_all.py | 1 + 2 files changed, 5 insertions(+) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index da9095d8f1cd..432b871023eb 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -12,6 +12,7 @@ StableDiffusionControlNetPipeline, StableDiffusionXLAdapterPipeline, StableDiffusionXLControlNetPipeline, + T2IAdapter ) from diffusers.utils import load_image @@ -90,6 +91,8 @@ def run_inference(self, pipe, args): def benchmark(self, args): flush() + print(f"Running benchmark with: {args}\n") + time = benchmark_fn(self.run_inference, self.pipe, args) # in seconds. memory = bytes_to_giga_bytes(torch.cuda.max_memory_allocated()) # in GBs. 
benchmark_info = BenchmarkInfo(time=time, memory=memory) @@ -189,6 +192,7 @@ def __init__(self, args): class T2IAdapterBenchmark(ControlNetBenchmark): pipeline_class = StableDiffusionAdapterPipeline + aux_network_class = T2IAdapter root_ckpt = "CompVis/stable-diffusion-v1-4" def __init__(self, args): diff --git a/benchmarks/run_all.py b/benchmarks/run_all.py index 4e769d229ff3..8f81ae11bfe3 100644 --- a/benchmarks/run_all.py +++ b/benchmarks/run_all.py @@ -9,6 +9,7 @@ def main(): python_files = glob.glob(PATTERN) for file in python_files: + print(f"Running {file}.") subprocess.run(["python", file]) subprocess.run(["python", file, "--run_compile"]) From 8583db84d68135b035800839f9af5cb3d7e34975 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 16:38:04 +0530 Subject: [PATCH 57/99] remove safetensors flag --- benchmarks/base_classes.py | 14 +++++--------- benchmarks/run_all.py | 2 +- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 432b871023eb..05ad5cc5d68f 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -12,7 +12,7 @@ StableDiffusionControlNetPipeline, StableDiffusionXLAdapterPipeline, StableDiffusionXLControlNetPipeline, - T2IAdapter + T2IAdapter, ) from diffusers.utils import load_image @@ -70,7 +70,7 @@ class TextToImageBenchmark(BaseBenchmak): pipeline_class = AutoPipelineForText2Image def __init__(self, args): - pipe = self.pipeline_class.from_pretrained(args.ckpt, torch_dtype=torch.float16, use_safetensors=True) + pipe = self.pipeline_class.from_pretrained(args.ckpt, torch_dtype=torch.float16) pipe = pipe.to("cuda") if args.run_compile: @@ -91,7 +91,7 @@ def run_inference(self, pipe, args): def benchmark(self, args): flush() - print(f"Running benchmark with: {args}\n") + print(f"Running benchmark with: {dict(args)}\n") time = benchmark_fn(self.run_inference, self.pipe, args) # in seconds. memory = bytes_to_giga_bytes(torch.cuda.max_memory_allocated()) # in GBs. 
@@ -153,12 +153,8 @@ class ControlNetBenchmark(TextToImageBenchmark): image = load_image(url).convert("RGB") def __init__(self, args): - aux_network = self.aux_network_class.from_pretrained( - args.ckpt, torch_dtype=torch.float16, use_safetensors=True - ) - pipe = self.pipeline_class.from_pretrained( - self.root_ckpt, controlnet=aux_network, torch_dtype=torch.float16, use_safetensors=True - ) + aux_network = self.aux_network_class.from_pretrained(args.ckpt, torch_dtype=torch.float16) + pipe = self.pipeline_class.from_pretrained(self.root_ckpt, controlnet=aux_network, torch_dtype=torch.float16) pipe = pipe.to("cuda") pipe.set_progress_bar_config(disable=True) diff --git a/benchmarks/run_all.py b/benchmarks/run_all.py index 8f81ae11bfe3..9c058f9f3d39 100644 --- a/benchmarks/run_all.py +++ b/benchmarks/run_all.py @@ -9,7 +9,7 @@ def main(): python_files = glob.glob(PATTERN) for file in python_files: - print(f"Running {file}.") + print(f"******Running file: {file} ******") subprocess.run(["python", file]) subprocess.run(["python", file, "--run_compile"]) From 6b9bf4af528b501734762091fce41d9b5700e554 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 16:46:28 +0530 Subject: [PATCH 58/99] fix args print --- benchmarks/base_classes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 05ad5cc5d68f..70afa958f7c4 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -91,7 +91,7 @@ def run_inference(self, pipe, args): def benchmark(self, args): flush() - print(f"Running benchmark with: {dict(args)}\n") + print(f"Running benchmark with: {vars(args)}\n") time = benchmark_fn(self.run_inference, self.pipe, args) # in seconds. memory = bytes_to_giga_bytes(torch.cuda.max_memory_allocated()) # in GBs. 
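The helpers the hunks above keep calling — `flush()`, `benchmark_fn()` and `bytes_to_giga_bytes()` — live in `benchmarks/utils.py` and their bodies are not reproduced in this part of the series. A minimal sketch of behaviour consistent with how they are used here (times returned in seconds, peak memory converted to GB, CUDA state cleared between runs); the exact implementation in the repository may differ:

import gc

import torch


def flush():
    # Clear cached allocations and reset the peak-memory counter so that
    # torch.cuda.max_memory_allocated() reflects only the run that follows.
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()


def bytes_to_giga_bytes(bytes):
    return bytes / 1024 / 1024 / 1024


def benchmark_fn(f, *args, **kwargs):
    # Time a single call with CUDA events so queued GPU work is included.
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)

    torch.cuda.synchronize()
    start.record()
    f(*args, **kwargs)
    end.record()
    torch.cuda.synchronize()
    return start.elapsed_time(end) / 1000  # elapsed_time() is in milliseconds.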
From 38160f1ae3413ab88d6fb66d216e35ce602fa9d3 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 17:05:11 +0530 Subject: [PATCH 59/99] fix --- benchmarks/base_classes.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 70afa958f7c4..fd862d992f3a 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -154,7 +154,11 @@ class ControlNetBenchmark(TextToImageBenchmark): def __init__(self, args): aux_network = self.aux_network_class.from_pretrained(args.ckpt, torch_dtype=torch.float16) - pipe = self.pipeline_class.from_pretrained(self.root_ckpt, controlnet=aux_network, torch_dtype=torch.float16) + + if self.aux_network_class == ControlNetModel: + pipe = self.pipeline_class.from_pretrained(self.root_ckpt, controlnet=aux_network, torch_dtype=torch.float16) + else: + pipe = self.pipeline_class.from_pretrained(self.root_ckpt, adapter=aux_network, torch_dtype=torch.float16) pipe = pipe.to("cuda") pipe.set_progress_bar_config(disable=True) @@ -162,10 +166,17 @@ def __init__(self, args): if args.run_compile: pipe.unet.to(memory_format=torch.channels_last) - pipe.controlnet.to(memory_format=torch.channels_last) + if self.aux_network_class == ControlNetModel: + pipe.controlnet.to(memory_format=torch.channels_last) + else: + pipe.adapter.to(memory_format=torch.channels_last) + print("Run torch compile") pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) - pipe.controlnet = torch.compile(pipe.controlnet, mode="reduce-overhead", fullgraph=True) + if self.aux_network_class == ControlNetModel: + pipe.controlnet = torch.compile(pipe.controlnet, mode="reduce-overhead", fullgraph=True) + else: + pipe.adapter = torch.compile(pipe.adapter, mode="reduce-overhead", fullgraph=True) self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt]) From e6116b07403fed6d08eabba24656b256d019114a Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 17:08:09 +0530 Subject: [PATCH 60/99] feat: run_command --- benchmarks/run_all.py | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/benchmarks/run_all.py b/benchmarks/run_all.py index 9c058f9f3d39..0a92a5f0fedf 100644 --- a/benchmarks/run_all.py +++ b/benchmarks/run_all.py @@ -1,17 +1,38 @@ import glob import subprocess +from typing import List PATTERN = "benchmark_*.py" +class SubprocessCallException(Exception): + pass + +# Taken from `test_examples_utils.py` +def run_command(command: List[str], return_stdout=False): + """ + Runs `command` with `subprocess.check_output` and will potentially return the `stdout`. 
Will also properly capture + if an error occurred while running `command` + """ + try: + output = subprocess.check_output(command, stderr=subprocess.STDOUT) + if return_stdout: + if hasattr(output, "decode"): + output = output.decode("utf-8") + return output + except subprocess.CalledProcessError as e: + raise SubprocessCallException( + f"Command `{' '.join(command)}` failed with the following error:\n\n{e.output.decode()}" + ) from e + def main(): python_files = glob.glob(PATTERN) for file in python_files: print(f"******Running file: {file} ******") - subprocess.run(["python", file]) - subprocess.run(["python", file, "--run_compile"]) + run_command(f"python {file}".split()) + run_command(f"python {file} --run_compile".split()) if __name__ == "__main__": From d98fbe12559e01da1d343a9f7ae34d880e9d1fe8 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 17:23:54 +0530 Subject: [PATCH 61/99] add adapter resolution mapping --- benchmarks/base_classes.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index fd862d992f3a..64ca46c7eda3 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -35,6 +35,8 @@ "runwayml/stable-diffusion-v1-5": (512, 512), "lllyasviel/sd-controlnet-canny": (512, 512), "diffusers/controlnet-canny-sdxl-1.0": (1024, 1024), + "TencentARC/t2iadapter_canny_sd14v1": (512, 512), + "TencentARC/t2i-adapter-canny-sdxl-1.0": (1024, 1024), "stabilityai/stable-diffusion-2-1": (768, 768), "stabilityai/stable-diffusion-xl-refiner-1.0": (1024, 1024), } From c93278de352dac8dc235878d48d07fdfc3b07109 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 18:01:22 +0530 Subject: [PATCH 62/99] benchmark t2i adapter fix. --- benchmarks/base_classes.py | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 64ca46c7eda3..d8ca428a6cde 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -41,8 +41,6 @@ "stabilityai/stable-diffusion-xl-refiner-1.0": (1024, 1024), } -CONTROLNET_MAPPING = {} - class BaseBenchmak: pipeline_class = None @@ -156,11 +154,7 @@ class ControlNetBenchmark(TextToImageBenchmark): def __init__(self, args): aux_network = self.aux_network_class.from_pretrained(args.ckpt, torch_dtype=torch.float16) - - if self.aux_network_class == ControlNetModel: - pipe = self.pipeline_class.from_pretrained(self.root_ckpt, controlnet=aux_network, torch_dtype=torch.float16) - else: - pipe = self.pipeline_class.from_pretrained(self.root_ckpt, adapter=aux_network, torch_dtype=torch.float16) + pipe = self.pipeline_class.from_pretrained(self.root_ckpt, controlnet=aux_network, torch_dtype=torch.float16) pipe = pipe.to("cuda") pipe.set_progress_bar_config(disable=True) @@ -168,17 +162,11 @@ def __init__(self, args): if args.run_compile: pipe.unet.to(memory_format=torch.channels_last) - if self.aux_network_class == ControlNetModel: - pipe.controlnet.to(memory_format=torch.channels_last) - else: - pipe.adapter.to(memory_format=torch.channels_last) + pipe.controlnet.to(memory_format=torch.channels_last) print("Run torch compile") pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) - if self.aux_network_class == ControlNetModel: - pipe.controlnet = torch.compile(pipe.controlnet, mode="reduce-overhead", fullgraph=True) - else: - pipe.adapter = torch.compile(pipe.adapter, mode="reduce-overhead", fullgraph=True) + pipe.controlnet = 
torch.compile(pipe.controlnet, mode="reduce-overhead", fullgraph=True) self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt]) @@ -205,7 +193,22 @@ class T2IAdapterBenchmark(ControlNetBenchmark): root_ckpt = "CompVis/stable-diffusion-v1-4" def __init__(self, args): - super().__init__(args) + aux_network = self.aux_network_class.from_pretrained(args.ckpt, torch_dtype=torch.float16) + pipe = self.pipeline_class.from_pretrained(self.root_ckpt, adapter=aux_network, torch_dtype=torch.float16) + pipe = pipe.to("cuda") + + pipe.set_progress_bar_config(disable=True) + self.pipe = pipe + + if args.run_compile: + pipe.unet.to(memory_format=torch.channels_last) + pipe.adapter.to(memory_format=torch.channels_last) + + print("Run torch compile") + pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) + pipe.adapter = torch.compile(pipe.adapter, mode="reduce-overhead", fullgraph=True) + + self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt]) class T2IAdapterSDXLBenchmark(T2IAdapterBenchmark): From 924096fbb28f421826d3abf00d92d8666758ad77 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 18:15:48 +0530 Subject: [PATCH 63/99] fix adapter input --- benchmarks/base_classes.py | 7 +++++-- benchmarks/run_all.py | 2 ++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index d8ca428a6cde..877ad23db4a7 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -163,7 +163,7 @@ def __init__(self, args): if args.run_compile: pipe.unet.to(memory_format=torch.channels_last) pipe.controlnet.to(memory_format=torch.channels_last) - + print("Run torch compile") pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) pipe.controlnet = torch.compile(pipe.controlnet, mode="reduce-overhead", fullgraph=True) @@ -192,6 +192,9 @@ class T2IAdapterBenchmark(ControlNetBenchmark): aux_network_class = T2IAdapter root_ckpt = "CompVis/stable-diffusion-v1-4" + url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/canny_for_adapter.png" + image = load_image(url).convert("RGB") + def __init__(self, args): aux_network = self.aux_network_class.from_pretrained(args.ckpt, torch_dtype=torch.float16) pipe = self.pipeline_class.from_pretrained(self.root_ckpt, adapter=aux_network, torch_dtype=torch.float16) @@ -203,7 +206,7 @@ def __init__(self, args): if args.run_compile: pipe.unet.to(memory_format=torch.channels_last) pipe.adapter.to(memory_format=torch.channels_last) - + print("Run torch compile") pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) pipe.adapter = torch.compile(pipe.adapter, mode="reduce-overhead", fullgraph=True) diff --git a/benchmarks/run_all.py b/benchmarks/run_all.py index 0a92a5f0fedf..1cfa8d6c72a2 100644 --- a/benchmarks/run_all.py +++ b/benchmarks/run_all.py @@ -5,9 +5,11 @@ PATTERN = "benchmark_*.py" + class SubprocessCallException(Exception): pass + # Taken from `test_examples_utils.py` def run_command(command: List[str], return_stdout=False): """ From 628591d9d86d9f09b562d1e5f3187801a9d62933 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 18:18:53 +0530 Subject: [PATCH 64/99] fix --- benchmarks/base_classes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 877ad23db4a7..064c95212c95 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -193,7 +193,7 @@ class 
T2IAdapterBenchmark(ControlNetBenchmark): root_ckpt = "CompVis/stable-diffusion-v1-4" url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/canny_for_adapter.png" - image = load_image(url).convert("RGB") + image = load_image(url) def __init__(self, args): aux_network = self.aux_network_class.from_pretrained(args.ckpt, torch_dtype=torch.float16) From 0f4ae4eff7132bc2a7cebc68ac62717ce634885d Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 18:20:50 +0530 Subject: [PATCH 65/99] convert to L. --- benchmarks/base_classes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 064c95212c95..26a9d42c7011 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -193,7 +193,7 @@ class T2IAdapterBenchmark(ControlNetBenchmark): root_ckpt = "CompVis/stable-diffusion-v1-4" url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/canny_for_adapter.png" - image = load_image(url) + image = load_image(url).convert("L") def __init__(self, args): aux_network = self.aux_network_class.from_pretrained(args.ckpt, torch_dtype=torch.float16) From de739fa784eac6910a811dbb7a2f34d7cc433384 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 19:38:35 +0530 Subject: [PATCH 66/99] add flush() add appropriate places --- benchmarks/base_classes.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 26a9d42c7011..ca38b144de5b 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -98,12 +98,14 @@ def benchmark(self, args): benchmark_info = BenchmarkInfo(time=time, memory=memory) pipeline_class_name = str(self.pipe.__class__.__name__) + flush() csv_dict = generate_csv_dict( pipeline_cls=pipeline_class_name, ckpt=args.ckpt, args=args, benchmark_info=benchmark_info ) filepath = self.get_result_filepath(args) write_to_csv(filepath, csv_dict) print(f"Logs written to: {filepath}") + flush() class ImageToImageBenchmark(TextToImageBenchmark): From cb9f9c6d8f9c9cdc72bd47e4b03acbd27a6af1b5 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 19:41:12 +0530 Subject: [PATCH 67/99] better filtering --- benchmarks/push_results.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/benchmarks/push_results.py b/benchmarks/push_results.py index fb2559802ebd..27549639fa6e 100644 --- a/benchmarks/push_results.py +++ b/benchmarks/push_results.py @@ -21,7 +21,7 @@ def has_previous_benchmark() -> str: def filter_float(value): if isinstance(value, str): - return value.split()[0] + return float(value.split()[0]) return value @@ -37,9 +37,7 @@ def push_to_hf_dataset(): numeric_columns = current_results.select_dtypes(include=["float64", "int64"]).columns numeric_columns = [ - c - for c in numeric_columns - if c not in ["batch_size", "num_inference_steps", "actual_gpu_memory (gbs)", "github_sha"] + c for c in numeric_columns if c not in ["batch_size", "num_inference_steps", "actual_gpu_memory (gbs)"] ] for column in numeric_columns: From d7aee28421cfd36eb6b2f3e165293ed0b43b7548 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 20:11:30 +0530 Subject: [PATCH 68/99] okay --- .github/workflows/benchmark.yml | 1 + benchmarks/run_all.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 7b1877d5334c..301b952e0f5f 100644 --- a/.github/workflows/benchmark.yml +++ 
b/.github/workflows/benchmark.yml @@ -45,6 +45,7 @@ jobs: HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} BASE_PATH: benchmark_outputs run: | + python -c "import torch; print(torch.cuda.get_device_properties(0).total_memory / (1024**3))" cd benchmarks && mkdir ${BASE_PATH} && python run_all.py && python push_results.py - name: Test suite reports artifacts diff --git a/benchmarks/run_all.py b/benchmarks/run_all.py index 1cfa8d6c72a2..1ca533bfdfcc 100644 --- a/benchmarks/run_all.py +++ b/benchmarks/run_all.py @@ -32,7 +32,7 @@ def main(): python_files = glob.glob(PATTERN) for file in python_files: - print(f"******Running file: {file} ******") + print(f"****** Running file: {file} ******") run_command(f"python {file}".split()) run_command(f"python {file} --run_compile".split()) From 385ffbb57c6f2dddd38d79303e8ae1efbe2b5066 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 20:13:29 +0530 Subject: [PATCH 69/99] get env for torch --- .github/workflows/benchmark.yml | 2 +- benchmarks/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 301b952e0f5f..3190091ea3c8 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -45,7 +45,7 @@ jobs: HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} BASE_PATH: benchmark_outputs run: | - python -c "import torch; print(torch.cuda.get_device_properties(0).total_memory / (1024**3))" + export TOTAL_GPU_MEMORY=$(python -c "import torch; print(torch.cuda.get_device_properties(0).total_memory / (1024**3))") cd benchmarks && mkdir ${BASE_PATH} && python run_all.py && python push_results.py - name: Test suite reports artifacts diff --git a/benchmarks/utils.py b/benchmarks/utils.py index 88c09be6d54d..1c98c48c989f 100644 --- a/benchmarks/utils.py +++ b/benchmarks/utils.py @@ -25,7 +25,7 @@ PROMPT = "ghibli style, a fantasy landscape with castles" BASE_PATH = os.getenv("BASE_PATH", ".") -TOTAL_GPU_MEMORY = torch.cuda.get_device_properties(0).total_memory / (1024**3) +TOTAL_GPU_MEMORY = os.getenv("TOTAL_GPU_MEMORY", torch.cuda.get_device_properties(0).total_memory / (1024**3)) REPO_ID = "diffusers/benchmarks" FINAL_CSV_FILE = "collated_results.csv" From 611ae1338296814b88a7c12d6239e8e2de4551ca Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 20:20:22 +0530 Subject: [PATCH 70/99] convert to float --- benchmarks/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/utils.py b/benchmarks/utils.py index 1c98c48c989f..5fce920ac6c3 100644 --- a/benchmarks/utils.py +++ b/benchmarks/utils.py @@ -25,7 +25,7 @@ PROMPT = "ghibli style, a fantasy landscape with castles" BASE_PATH = os.getenv("BASE_PATH", ".") -TOTAL_GPU_MEMORY = os.getenv("TOTAL_GPU_MEMORY", torch.cuda.get_device_properties(0).total_memory / (1024**3)) +TOTAL_GPU_MEMORY = float(os.getenv("TOTAL_GPU_MEMORY", torch.cuda.get_device_properties(0).total_memory / (1024**3))) REPO_ID = "diffusers/benchmarks" FINAL_CSV_FILE = "collated_results.csv" From b3a91d8e0926f0f67137649d09ff9ad310b6994e Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 4 Dec 2023 20:48:05 +0530 Subject: [PATCH 71/99] fix --- benchmarks/push_results.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmarks/push_results.py b/benchmarks/push_results.py index 27549639fa6e..a37fc828a2c8 100644 --- a/benchmarks/push_results.py +++ b/benchmarks/push_results.py @@ -48,10 +48,10 @@ def push_to_hf_dataset(): 
previous_results[column] = previous_results[column].astype(float) percent_change = ((current_results[column] - previous_results[column]) / previous_results[column]) * 100 - # Format the values with '+' or '-' sign and append to original values - current_results[column] = current_results[column].map(str) + percent_change.map( - lambda x: f" ({'+' if x > 0 else ''}{x:.2f}%)" - ) + # Format the values with '+' or '-' sign and append to original values + current_results[column] = current_results[column].map(str) + percent_change.map( + lambda x: f" ({'+' if x > 0 else ''}{x:.2f}%)" + ) # Overwrite the current result file. current_results.to_csv(FINAL_CSV_FILE, index=False) From e55913e1665e5765739b6ae641512257f9dc6aaa Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 5 Dec 2023 08:42:38 +0530 Subject: [PATCH 72/99] filter out nans. --- benchmarks/push_results.py | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/push_results.py b/benchmarks/push_results.py index a37fc828a2c8..062b5d959797 100644 --- a/benchmarks/push_results.py +++ b/benchmarks/push_results.py @@ -52,6 +52,7 @@ def push_to_hf_dataset(): current_results[column] = current_results[column].map(str) + percent_change.map( lambda x: f" ({'+' if x > 0 else ''}{x:.2f}%)" ) + current_results[column] = current_results[column].map(lambda x: x.replace(" (nan%)", "")) # Overwrite the current result file. current_results.to_csv(FINAL_CSV_FILE, index=False) From dc3063a7eaefe05801aee3982cee6d19d4f835aa Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 5 Dec 2023 08:43:14 +0530 Subject: [PATCH 73/99] better coment --- benchmarks/push_results.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/push_results.py b/benchmarks/push_results.py index 062b5d959797..962e07c6d74c 100644 --- a/benchmarks/push_results.py +++ b/benchmarks/push_results.py @@ -41,7 +41,7 @@ def push_to_hf_dataset(): ] for column in numeric_columns: - previous_results[column] = previous_results[column].apply(lambda x: filter_float(x)) + previous_results[column] = previous_results[column].map(lambda x: filter_float(x)) # Calculate the percentage change current_results[column] = current_results[column].astype(float) @@ -52,6 +52,7 @@ def push_to_hf_dataset(): current_results[column] = current_results[column].map(str) + percent_change.map( lambda x: f" ({'+' if x > 0 else ''}{x:.2f}%)" ) + # There might be newly added rows. So, filter out the NaNs. current_results[column] = current_results[column].map(lambda x: x.replace(" (nan%)", "")) # Overwrite the current result file. 
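Patches 50, 53, 67 and 71–73 all iterate on the same detail of `push_to_hf_dataset()`: once a value has been uploaded as a string like "3.21 (+1.20%)", it has to be parsed back to a float before the next run can compute a fresh delta, and rows that did not exist in the previous upload produce a NaN suffix that is stripped out. A small self-contained illustration of that round trip (the column name and numbers are made up):

import pandas as pd


def filter_float(value):
    # "3.21 (+1.20%)" -> 3.21; plain floats pass through untouched.
    if isinstance(value, str):
        return float(value.split()[0])
    return value


# Previous upload has one row (already annotated); the current run has two.
previous = pd.DataFrame({"time (secs)": ["3.21 (+1.20%)"]})
current = pd.DataFrame({"time (secs)": [3.10, 5.25]})

previous["time (secs)"] = previous["time (secs)"].map(filter_float).astype(float)
percent_change = (current["time (secs)"] - previous["time (secs)"]) / previous["time (secs)"] * 100

current["time (secs)"] = current["time (secs)"].map(str) + percent_change.map(
    lambda x: f" ({'+' if x > 0 else ''}{x:.2f}%)"
)
# The newly added row has no previous value, so its suffix is " (nan%)" and gets dropped.
current["time (secs)"] = current["time (secs)"].map(lambda x: x.replace(" (nan%)", ""))
print(current["time (secs)"].tolist())  # ['3.1 (-3.43%)', '5.25']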
From 63aee7954bfc9f3ce4c8d19615f428dc7d9cb67d Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 5 Dec 2023 08:47:00 +0530 Subject: [PATCH 74/99] sdxl --- benchmarks/run_all.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/benchmarks/run_all.py b/benchmarks/run_all.py index 1ca533bfdfcc..b1e4da46c401 100644 --- a/benchmarks/run_all.py +++ b/benchmarks/run_all.py @@ -33,8 +33,19 @@ def main(): for file in python_files: print(f"****** Running file: {file} ******") - run_command(f"python {file}".split()) - run_command(f"python {file} --run_compile".split()) + command = f"python {file}" + run_command(command.split()) + + command += " --run_compile" + run_command(command.split()) + + if file == "benchmark_sd.py": + for ckpt in ["segmind/SSD-1B", "stabilityai/stable-diffusion-xl-base-1.0"]: + command = f"python {file} --ckpt {ckpt}" + run_command(command.split()) + + command += " --run_compile" + run_command(command.split()) if __name__ == "__main__": From 9a9d5ea6464ae1ef4a7770af03086f773114f795 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 5 Dec 2023 15:18:54 +0530 Subject: [PATCH 75/99] sdxl for other benchmarks. --- benchmarks/run_all.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/benchmarks/run_all.py b/benchmarks/run_all.py index b1e4da46c401..5753db2d72a8 100644 --- a/benchmarks/run_all.py +++ b/benchmarks/run_all.py @@ -47,6 +47,26 @@ def main(): command += " --run_compile" run_command(command.split()) + elif file in ["benchmark_sd_img.py", "benchmark_sd_inpainting.py"]: + sdxl_ckpt = "stabilityai/stable-diffusion-xl-base-1.0" + command = f"python {file} --ckpt {sdxl_ckpt}" + run_command(command.split()) + + command += " --run_compile" + run_command(command.split()) + + elif file in ["benchmark_controlnet.py", "benchmark_t2i_adapter.py"]: + sdxl_ckpt = ( + "diffusers/controlnet-canny-sdxl-1.0" + if "controlnet" == file + else "TencentARC/t2i-adapter-canny-sdxl-1.0" + ) + command = f"python {file} --ckpt {sdxl_ckpt}" + run_command(command.split()) + + command += " --run_compile" + run_command(command.split()) + if __name__ == "__main__": main() From c8f6eefd54ceae8abb69abfecaa362ef843513ff Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 5 Dec 2023 15:34:09 +0530 Subject: [PATCH 76/99] fix: condition --- benchmarks/run_all.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/run_all.py b/benchmarks/run_all.py index 5753db2d72a8..bef93b4ab31a 100644 --- a/benchmarks/run_all.py +++ b/benchmarks/run_all.py @@ -58,7 +58,7 @@ def main(): elif file in ["benchmark_controlnet.py", "benchmark_t2i_adapter.py"]: sdxl_ckpt = ( "diffusers/controlnet-canny-sdxl-1.0" - if "controlnet" == file + if "controlnet" in file else "TencentARC/t2i-adapter-canny-sdxl-1.0" ) command = f"python {file} --ckpt {sdxl_ckpt}" From 4a67437d1e26fe7d49930f06e04b7538db805694 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 5 Dec 2023 17:39:52 +0530 Subject: [PATCH 77/99] fix: condition for inpainting --- benchmarks/run_all.py | 6 +++++- src/diffusers/models/__init__.py | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/benchmarks/run_all.py b/benchmarks/run_all.py index bef93b4ab31a..779685e452c5 100644 --- a/benchmarks/run_all.py +++ b/benchmarks/run_all.py @@ -48,7 +48,11 @@ def main(): run_command(command.split()) elif file in ["benchmark_sd_img.py", "benchmark_sd_inpainting.py"]: - sdxl_ckpt = "stabilityai/stable-diffusion-xl-base-1.0" + sdxl_ckpt = ( + "stabilityai/stable-diffusion-xl-refiner-1.0" 
+ if "inpainting" not in file + else "stabilityai/stable-diffusion-xl-base-1.0" + ) command = f"python {file} --ckpt {sdxl_ckpt}" run_command(command.split()) diff --git a/src/diffusers/models/__init__.py b/src/diffusers/models/__init__.py index 49ee3ee6af6b..e3794939e25e 100644 --- a/src/diffusers/models/__init__.py +++ b/src/diffusers/models/__init__.py @@ -33,8 +33,8 @@ _import_structure["consistency_decoder_vae"] = ["ConsistencyDecoderVAE"] _import_structure["controlnet"] = ["ControlNetModel"] _import_structure["dual_transformer_2d"] = ["DualTransformer2DModel"] - _import_structure["modeling_utils"] = ["ModelMixin"] _import_structure["embeddings"] = ["ImageProjection"] + _import_structure["modeling_utils"] = ["ModelMixin"] _import_structure["prior_transformer"] = ["PriorTransformer"] _import_structure["t5_film_transformer"] = ["T5FilmDecoder"] _import_structure["transformer_2d"] = ["Transformer2DModel"] From eedf218edeb06789a8021b83fd6e05b9a25ee9ed Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 5 Dec 2023 18:15:42 +0530 Subject: [PATCH 78/99] fix: mapping for resolution --- benchmarks/base_classes.py | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index ca38b144de5b..194605f75fa5 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -38,6 +38,7 @@ "TencentARC/t2iadapter_canny_sd14v1": (512, 512), "TencentARC/t2i-adapter-canny-sdxl-1.0": (1024, 1024), "stabilityai/stable-diffusion-2-1": (768, 768), + "stabilityai/stable-diffusion-xl-base-1.0": (1024, 1024), "stabilityai/stable-diffusion-xl-refiner-1.0": (1024, 1024), } From e300038267a298a8d63b540e730c13ef7163d8c9 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 5 Dec 2023 19:05:12 +0530 Subject: [PATCH 79/99] fix --- benchmarks/base_classes.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 194605f75fa5..35939317f1b8 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -221,5 +221,8 @@ class T2IAdapterSDXLBenchmark(T2IAdapterBenchmark): pipeline_class = StableDiffusionXLAdapterPipeline root_ckpt = "stabilityai/stable-diffusion-xl-base-1.0" + url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/canny_for_adapter_sdxl.png" + image = load_image(url) + def __init__(self, args): super().__init__(args) From 60614f5a750d3d47cc3c54580ffa36bd9c9196be Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 5 Dec 2023 20:07:31 +0530 Subject: [PATCH 80/99] include kandinsky and wuerstchen --- benchmarks/base_classes.py | 4 ++++ ...hmark_sd.py => benchmark_text_to_image.py} | 3 ++- benchmarks/run_all.py | 20 +++++++++++++------ 3 files changed, 20 insertions(+), 7 deletions(-) rename benchmarks/{benchmark_sd.py => benchmark_text_to_image.py} (90%) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 35939317f1b8..94f8e597f1c2 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -79,6 +79,10 @@ def __init__(self, args): print("Run torch compile") pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) + if hasattr(pipe, "movq") and getattr(pipe, "movq", None) is not None: + pipe.movq.to(memory_format=torch.channels_last) + pipe.movq = torch.compile(pipe.movq, mode="reduce-overhead", fullgraph=True) + pipe.set_progress_bar_config(disable=True) self.pipe = pipe diff --git a/benchmarks/benchmark_sd.py b/benchmarks/benchmark_text_to_image.py similarity index 90% rename from 
benchmarks/benchmark_sd.py rename to benchmarks/benchmark_text_to_image.py index 0fa24a08d639..50c04dd550c9 100644 --- a/benchmarks/benchmark_sd.py +++ b/benchmarks/benchmark_text_to_image.py @@ -15,8 +15,9 @@ choices=[ "runwayml/stable-diffusion-v1-5", "segmind/SSD-1B", - "stabilityai/stable-diffusion-2-1", "stabilityai/stable-diffusion-xl-base-1.0", + "kandinsky-community/kandinsky-2-2-decoder", + "warp-ai/wuerstchen", ], ) parser.add_argument("--batch_size", type=int, default=1) diff --git a/benchmarks/run_all.py b/benchmarks/run_all.py index 779685e452c5..913a24d98d12 100644 --- a/benchmarks/run_all.py +++ b/benchmarks/run_all.py @@ -33,14 +33,22 @@ def main(): for file in python_files: print(f"****** Running file: {file} ******") - command = f"python {file}" - run_command(command.split()) - command += " --run_compile" - run_command(command.split()) + if file != "benchmark_text_to_image.py": + command = f"python {file}" + run_command(command.split()) + + command += " --run_compile" + run_command(command.split()) - if file == "benchmark_sd.py": - for ckpt in ["segmind/SSD-1B", "stabilityai/stable-diffusion-xl-base-1.0"]: + if file == "benchmark_text_to_image.py": + for ckpt in [ + "runwayml/stable-diffusion-v1-5", + "segmind/SSD-1B", + "stabilityai/stable-diffusion-xl-base-1.0", + "kandinsky-community/kandinsky-2-2-decoder", + "warp-ai/wuerstchen", + ]: command = f"python {file} --ckpt {ckpt}" run_command(command.split()) From b394168516978be36651a3717cea330b5497af3b Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 5 Dec 2023 21:04:31 +0530 Subject: [PATCH 81/99] fix: Wuerstchen --- benchmarks/base_classes.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 94f8e597f1c2..f16107fa7224 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -13,6 +13,7 @@ StableDiffusionXLAdapterPipeline, StableDiffusionXLControlNetPipeline, T2IAdapter, + WuerstchenCombinedPipeline, ) from diffusers.utils import load_image @@ -75,13 +76,18 @@ def __init__(self, args): pipe = pipe.to("cuda") if args.run_compile: - pipe.unet.to(memory_format=torch.channels_last) - print("Run torch compile") - pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) - - if hasattr(pipe, "movq") and getattr(pipe, "movq", None) is not None: - pipe.movq.to(memory_format=torch.channels_last) - pipe.movq = torch.compile(pipe.movq, mode="reduce-overhead", fullgraph=True) + if not isinstance(pipe, WuerstchenCombinedPipeline): + pipe.unet.to(memory_format=torch.channels_last) + print("Run torch compile") + pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) + + if hasattr(pipe, "movq") and getattr(pipe, "movq", None) is not None: + pipe.movq.to(memory_format=torch.channels_last) + pipe.movq = torch.compile(pipe.movq, mode="reduce-overhead", fullgraph=True) + else: + print("Run torch compile") + pipe.decoder = torch.compile(pipe.decoder, mode="reduce-overhead", fullgraph=True) + pipe.vqgan = torch.compile(pipe.vqgan, mode="reduce-overhead", fullgraph=True) pipe.set_progress_bar_config(disable=True) self.pipe = pipe From b7eb3fbf14dd2c4ac8b33590170a527c63efd04c Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 5 Dec 2023 21:27:25 +0530 Subject: [PATCH 82/99] Empty-Commit From 821726d7c0fba25f06ed8bba26984d9ccc014871 Mon Sep 17 00:00:00 2001 From: Aryan V S Date: Thu, 7 Dec 2023 12:31:41 +0530 Subject: [PATCH 83/99] [Community] AnimateDiff + Controlnet Pipeline 
(#5928) * begin work on animatediff + controlnet pipeline * complete todos, uncomment multicontrolnet, input checks Co-Authored-By: EdoardoBotta * update Co-Authored-By: EdoardoBotta * add example * update community README * Update examples/community/README.md --------- Co-authored-by: EdoardoBotta Co-authored-by: Patrick von Platen --- examples/community/README.md | 65 + .../pipeline_animatediff_controlnet.py | 1137 +++++++++++++++++ 2 files changed, 1202 insertions(+) create mode 100644 examples/community/pipeline_animatediff_controlnet.py diff --git a/examples/community/README.md b/examples/community/README.md index 1d13e2822b77..78a89acf7a57 100755 --- a/examples/community/README.md +++ b/examples/community/README.md @@ -50,6 +50,7 @@ prompt-to-prompt | change parts of a prompt and retain image structure (see [pap | Latent Consistency Interpolation Pipeline | Interpolate the latent space of Latent Consistency Models with multiple prompts | [Latent Consistency Interpolation Pipeline](#latent-consistency-interpolation-pipeline) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1pK3NrLWJSiJsBynLns1K1-IDTW9zbPvl?usp=sharing) | [Aryan V S](https://github.com/a-r-r-o-w) | | Regional Prompting Pipeline | Assign multiple prompts for different regions | [Regional Prompting Pipeline](#regional-prompting-pipeline) | - | [hako-mikan](https://github.com/hako-mikan) | | LDM3D-sr (LDM3D upscaler) | Upscale low resolution RGB and depth inputs to high resolution | [StableDiffusionUpscaleLDM3D Pipeline](https://github.com/estelleafl/diffusers/tree/ldm3d_upscaler_community/examples/community#stablediffusionupscaleldm3d-pipeline) | - | [Estelle Aflalo](https://github.com/estelleafl) | +| AnimateDiff ControlNet Pipeline | Combines AnimateDiff with precise motion control using ControlNets | [AnimateDiff ControlNet Pipeline](#animatediff-controlnet-pipeline) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1SKboYeGjEQmQPWoFC0aLYpBlYdHXkvAu?usp=sharing) | [Aryan V S](https://github.com/a-r-r-o-w) and [Edoardo Botta](https://github.com/EdoardoBotta) | | DemoFusion Pipeline | Implementation of [DemoFusion: Democratising High-Resolution Image Generation With No $$$](https://arxiv.org/abs/2311.16973) | [DemoFusion Pipeline](#DemoFusion) | - | [Ruoyi Du](https://github.com/RuoyiDu) | To load a custom pipeline you just need to pass the `custom_pipeline` argument to `DiffusionPipeline`, as one of the files in `diffusers/examples/community`. Feel free to send a PR with your own pipelines, we will merge them quickly. @@ -2839,6 +2840,70 @@ The Pipeline supports `compel` syntax. Input prompts using the `compel` structur * Reconstructed image: * ![dps_generated_image](https://github.com/tongdaxu/Images/assets/22267548/b74f084d-93f4-4845-83d8-44c0fa758a5f) +### AnimateDiff ControlNet Pipeline + +This pipeline combines AnimateDiff and ControlNet. Enjoy precise motion control for your videos! Refer to [this](https://github.com/huggingface/diffusers/issues/5866) issue for more details. 
+ +```py +import torch +from diffusers import AutoencoderKL, ControlNetModel, MotionAdapter +from diffusers.pipelines import DiffusionPipeline +from diffusers.schedulers import DPMSolverMultistepScheduler +from PIL import Image + +motion_id = "guoyww/animatediff-motion-adapter-v1-5-2" +adapter = MotionAdapter.from_pretrained(motion_id) +controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_openpose", torch_dtype=torch.float16) +vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=torch.float16) + +model_id = "SG161222/Realistic_Vision_V5.1_noVAE" +pipe = DiffusionPipeline.from_pretrained( + model_id, + motion_adapter=adapter, + controlnet=controlnet, + vae=vae, + custom_pipeline="pipeline_animatediff_controlnet", +).to(device="cuda", dtype=torch.float16) +pipe.scheduler = DPMSolverMultistepScheduler.from_pretrained( + model_id, subfolder="scheduler", clip_sample=False, timestep_spacing="linspace", steps_offset=1 +) +pipe.enable_vae_slicing() + +conditioning_frames = [] +for i in range(1, 16 + 1): + conditioning_frames.append(Image.open(f"frame_{i}.png")) + +prompt = "astronaut in space, dancing" +negative_prompt = "bad quality, worst quality, jpeg artifacts, ugly" +result = pipe( + prompt=prompt, + negative_prompt=negative_prompt, + width=512, + height=768, + conditioning_frames=conditioning_frames, + num_inference_steps=12, +).frames[0] + +from diffusers.utils import export_to_gif +export_to_gif(result.frames[0], "result.gif") +``` + + + + + + + + + + + + + + + + +
+(Results table: Conditioning Frames (input-frames); AnimateDiff model SG161222/Realistic_Vision_V5.1_noVAE (gif-1, gif-2); AnimateDiff model CardosAnime (gif-1, gif-2))
### DemoFusion This pipeline is the official implementation of [DemoFusion: Democratising High-Resolution Image Generation With No $$$](https://arxiv.org/abs/2311.16973). The original repo can be found at [repo](https://github.com/PRIS-CV/DemoFusion). diff --git a/examples/community/pipeline_animatediff_controlnet.py b/examples/community/pipeline_animatediff_controlnet.py new file mode 100644 index 000000000000..785f1ee55ec2 --- /dev/null +++ b/examples/community/pipeline_animatediff_controlnet.py @@ -0,0 +1,1137 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from dataclasses import dataclass +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +import torch +import torch.nn.functional as F +from PIL import Image +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection + +from diffusers.image_processor import PipelineImageInput, VaeImageProcessor +from diffusers.loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from diffusers.models import AutoencoderKL, ControlNetModel, UNet2DConditionModel, UNetMotionModel +from diffusers.models.lora import adjust_lora_scale_text_encoder +from diffusers.models.unet_motion_model import MotionAdapter +from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel +from diffusers.pipelines.pipeline_utils import DiffusionPipeline +from diffusers.schedulers import ( + DDIMScheduler, + DPMSolverMultistepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, +) +from diffusers.utils import USE_PEFT_BACKEND, BaseOutput, deprecate, logging, scale_lora_layers, unscale_lora_layers +from diffusers.utils.torch_utils import is_compiled_module, randn_tensor + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers import AutoencoderKL, ControlNetModel, MotionAdapter + >>> from diffusers.pipelines import DiffusionPipeline + >>> from diffusers.schedulers import DPMSolverMultistepScheduler + >>> from PIL import Image + + >>> motion_id = "guoyww/animatediff-motion-adapter-v1-5-2" + >>> adapter = MotionAdapter.from_pretrained(motion_id) + >>> controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_openpose", torch_dtype=torch.float16) + >>> vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=torch.float16) + + >>> model_id = "SG161222/Realistic_Vision_V5.1_noVAE" + >>> pipe = DiffusionPipeline.from_pretrained( + ... model_id, + ... motion_adapter=adapter, + ... controlnet=controlnet, + ... vae=vae, + ... custom_pipeline="pipeline_animatediff_controlnet", + ... ).to(device="cuda", dtype=torch.float16) + >>> pipe.scheduler = DPMSolverMultistepScheduler.from_pretrained( + ... 
model_id, subfolder="scheduler", clip_sample=False, timestep_spacing="linspace", steps_offset=1 + ... ) + >>> pipe.enable_vae_slicing() + + >>> conditioning_frames = [] + >>> for i in range(1, 16 + 1): + ... conditioning_frames.append(Image.open(f"frame_{i}.png")) + + >>> prompt = "astronaut in space, dancing" + >>> negative_prompt = "bad quality, worst quality, jpeg artifacts, ugly" + >>> result = pipe( + ... prompt=prompt, + ... negative_prompt=negative_prompt, + ... width=512, + ... height=768, + ... conditioning_frames=conditioning_frames, + ... num_inference_steps=12, + ... ).frames[0] + + >>> from diffusers.utils import export_to_gif + >>> export_to_gif(result.frames[0], "result.gif") + ``` +""" + + +def tensor2vid(video: torch.Tensor, processor, output_type="np"): + # Based on: + # https://github.com/modelscope/modelscope/blob/1509fdb973e5871f37148a4b5e5964cafd43e64d/modelscope/pipelines/multi_modal/text_to_video_synthesis_pipeline.py#L78 + + batch_size, channels, num_frames, height, width = video.shape + outputs = [] + for batch_idx in range(batch_size): + batch_vid = video[batch_idx].permute(1, 0, 2, 3) + batch_output = processor.postprocess(batch_vid, output_type) + + outputs.append(batch_output) + + return outputs + + +@dataclass +class AnimateDiffControlNetPipelineOutput(BaseOutput): + frames: Union[torch.Tensor, np.ndarray] + + +class AnimateDiffControlNetPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin): + r""" + Pipeline for text-to-video generation. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`CLIPTextModel`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + tokenizer (`CLIPTokenizer`): + A [`~transformers.CLIPTokenizer`] to tokenize text. + unet ([`UNet2DConditionModel`]): + A [`UNet2DConditionModel`] used to create a UNetMotionModel to denoise the encoded video latents. + motion_adapter ([`MotionAdapter`]): + A [`MotionAdapter`] to be used in combination with `unet` to denoise the encoded video latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. 
+ """ + + model_cpu_offload_seq = "text_encoder->unet->vae" + _optional_components = ["feature_extractor", "image_encoder"] + _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + motion_adapter: MotionAdapter, + controlnet: Union[ControlNetModel, MultiControlNetModel], + scheduler: Union[ + DDIMScheduler, + PNDMScheduler, + LMSDiscreteScheduler, + EulerDiscreteScheduler, + EulerAncestralDiscreteScheduler, + DPMSolverMultistepScheduler, + ], + feature_extractor: Optional[CLIPImageProcessor] = None, + image_encoder: Optional[CLIPVisionModelWithProjection] = None, + ): + super().__init__() + unet = UNetMotionModel.from_unet2d(unet, motion_adapter) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + motion_adapter=motion_adapter, + controlnet=controlnet, + scheduler=scheduler, + feature_extractor=feature_extractor, + image_encoder=image_encoder, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.control_image_processor = VaeImageProcessor( + vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False + ) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt with num_images_per_prompt -> num_videos_per_prompt + def encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + lora_scale (`float`, *optional*): + A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. 
+ """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # textual inversion: procecss multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, self.tokenizer) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + if clip_skip is None: + prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask) + prompt_embeds = prompt_embeds[0] + else: + prompt_embeds = self.text_encoder( + text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True + ) + # Access the `hidden_states` first, that contains a tuple of + # all the hidden states from the encoder layers. Then index into + # the tuple to access the hidden states from the desired layer. + prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)] + # We also need to apply the final LayerNorm here to not mess with the + # representations. The `last_hidden_states` that we typically use for + # obtaining the final prompt representations passes through the LayerNorm + # layer. 
+ prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + + if self.text_encoder is not None: + prompt_embeds_dtype = self.text_encoder.dtype + elif self.unet is not None: + prompt_embeds_dtype = self.unet.dtype + else: + prompt_embeds_dtype = prompt_embeds.dtype + + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + # textual inversion: procecss multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + return prompt_embeds, negative_prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image + def encode_image(self, image, device, num_images_per_prompt): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + + uncond_image_embeds = torch.zeros_like(image_embeds) + 
return image_embeds, uncond_image_embeds + + # Copied from diffusers.pipelines.text_to_video_synthesis/pipeline_text_to_video_synth.TextToVideoSDPipeline.decode_latents + def decode_latents(self, latents): + latents = 1 / self.vae.config.scaling_factor * latents + + batch_size, channels, num_frames, height, width = latents.shape + latents = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width) + + image = self.vae.decode(latents).sample + video = ( + image[None, :] + .reshape( + ( + batch_size, + num_frames, + -1, + ) + + image.shape[2:] + ) + .permute(0, 2, 1, 3, 4) + ) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + video = video.float() + return video + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing + def enable_vae_slicing(self): + r""" + Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to + compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. + """ + self.vae.enable_slicing() + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing + def disable_vae_slicing(self): + r""" + Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to + computing decoding in one step. + """ + self.vae.disable_slicing() + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling + def enable_vae_tiling(self): + r""" + Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to + compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow + processing larger images. + """ + self.vae.enable_tiling() + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling + def disable_vae_tiling(self): + r""" + Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to + computing decoding in one step. + """ + self.vae.disable_tiling() + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu + def enable_freeu(self, s1: float, s2: float, b1: float, b2: float): + r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497. + + The suffixes after the scaling factors represent the stages where they are being applied. + + Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values + that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. + + Args: + s1 (`float`): + Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to + mitigate "oversmoothing effect" in the enhanced denoising process. + s2 (`float`): + Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to + mitigate "oversmoothing effect" in the enhanced denoising process. + b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. + b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. 
+ """ + if not hasattr(self, "unet"): + raise ValueError("The pipeline must have `unet` for using FreeU.") + self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu + def disable_freeu(self): + """Disables the FreeU mechanism if enabled.""" + self.unet.disable_freeu() + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs + def check_inputs( + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + callback_on_step_end_tensor_inputs=None, + image=None, + controlnet_conditioning_scale=1.0, + control_guidance_start=0.0, + control_guidance_end=1.0, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." 
+ ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + # `prompt` needs more sophisticated handling when there are multiple + # conditionings. + if isinstance(self.controlnet, MultiControlNetModel): + if isinstance(prompt, list): + logger.warning( + f"You have {len(self.controlnet.nets)} ControlNets and you have passed {len(prompt)}" + " prompts. The conditionings will be fixed across the prompts." + ) + + # Check `image` + is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance( + self.controlnet, torch._dynamo.eval_frame.OptimizedModule + ) + if ( + isinstance(self.controlnet, ControlNetModel) + or is_compiled + and isinstance(self.controlnet._orig_mod, ControlNetModel) + ): + if isinstance(image, list): + for image_ in image: + self.check_image(image_, prompt, prompt_embeds) + else: + self.check_image(image, prompt, prompt_embeds) + elif ( + isinstance(self.controlnet, MultiControlNetModel) + or is_compiled + and isinstance(self.controlnet._orig_mod, MultiControlNetModel) + ): + if not isinstance(image, list): + raise TypeError("For multiple controlnets: `image` must be type `list`") + + # When `image` is a nested list: + # (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]]) + elif any(isinstance(i, list) for i in image): + raise ValueError("A single batch of multiple conditionings are supported at the moment.") + elif len(image) != len(self.controlnet.nets): + raise ValueError( + f"For multiple controlnets: `image` must have the same length as the number of controlnets, but got {len(image)} images and {len(self.controlnet.nets)} ControlNets." + ) + + for control_ in image: + for image_ in control_: + self.check_image(image_, prompt, prompt_embeds) + else: + assert False + + # Check `controlnet_conditioning_scale` + if ( + isinstance(self.controlnet, ControlNetModel) + or is_compiled + and isinstance(self.controlnet._orig_mod, ControlNetModel) + ): + if not isinstance(controlnet_conditioning_scale, float): + raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.") + elif ( + isinstance(self.controlnet, MultiControlNetModel) + or is_compiled + and isinstance(self.controlnet._orig_mod, MultiControlNetModel) + ): + if isinstance(controlnet_conditioning_scale, list): + if any(isinstance(i, list) for i in controlnet_conditioning_scale): + raise ValueError("A single batch of multiple conditionings are supported at the moment.") + elif isinstance(controlnet_conditioning_scale, list) and len(controlnet_conditioning_scale) != len( + self.controlnet.nets + ): + raise ValueError( + "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have" + " the same length as the number of controlnets" + ) + else: + assert False + + if not isinstance(control_guidance_start, (tuple, list)): + control_guidance_start = [control_guidance_start] + + if not isinstance(control_guidance_end, (tuple, list)): + control_guidance_end = [control_guidance_end] + + if len(control_guidance_start) != len(control_guidance_end): + raise ValueError( + f"`control_guidance_start` has {len(control_guidance_start)} elements, but `control_guidance_end` has {len(control_guidance_end)} elements. 
Make sure to provide the same number of elements to each list." + ) + + if isinstance(self.controlnet, MultiControlNetModel): + if len(control_guidance_start) != len(self.controlnet.nets): + raise ValueError( + f"`control_guidance_start`: {control_guidance_start} has {len(control_guidance_start)} elements but there are {len(self.controlnet.nets)} controlnets available. Make sure to provide {len(self.controlnet.nets)}." + ) + + for start, end in zip(control_guidance_start, control_guidance_end): + if start >= end: + raise ValueError( + f"control guidance start: {start} cannot be larger or equal to control guidance end: {end}." + ) + if start < 0.0: + raise ValueError(f"control guidance start: {start} can't be smaller than 0.") + if end > 1.0: + raise ValueError(f"control guidance end: {end} can't be larger than 1.0.") + + # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.check_image + def check_image(self, image, prompt, prompt_embeds): + image_is_pil = isinstance(image, Image.Image) + image_is_tensor = isinstance(image, torch.Tensor) + image_is_np = isinstance(image, np.ndarray) + image_is_pil_list = isinstance(image, list) and isinstance(image[0], Image.Image) + image_is_tensor_list = isinstance(image, list) and isinstance(image[0], torch.Tensor) + image_is_np_list = isinstance(image, list) and isinstance(image[0], np.ndarray) + + if ( + not image_is_pil + and not image_is_tensor + and not image_is_np + and not image_is_pil_list + and not image_is_tensor_list + and not image_is_np_list + ): + raise TypeError( + f"image must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images, list of numpy arrays or list of torch tensors, but is {type(image)}" + ) + + if image_is_pil: + image_batch_size = 1 + else: + image_batch_size = len(image) + + if prompt is not None and isinstance(prompt, str): + prompt_batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + prompt_batch_size = len(prompt) + elif prompt_embeds is not None: + prompt_batch_size = prompt_embeds.shape[0] + + if image_batch_size != 1 and image_batch_size != prompt_batch_size: + raise ValueError( + f"If image batch size is not 1, image batch size must be same as prompt batch size. image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}" + ) + + # Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_synth.TextToVideoSDPipeline.prepare_latents + def prepare_latents( + self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None + ): + shape = ( + batch_size, + num_channels_latents, + num_frames, + height // self.vae_scale_factor, + width // self.vae_scale_factor, + ) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.prepare_image + def prepare_image( + self, + image, + width, + height, + batch_size, + num_images_per_prompt, + device, + dtype, + do_classifier_free_guidance=False, + guess_mode=False, + ): + image = self.control_image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32) + image_batch_size = image.shape[0] + + if image_batch_size == 1: + repeat_by = batch_size + else: + # image batch size is the same as prompt batch size + repeat_by = num_images_per_prompt + + image = image.repeat_interleave(repeat_by, dim=0) + + image = image.to(device=device, dtype=dtype) + + if do_classifier_free_guidance and not guess_mode: + image = torch.cat([image] * 2) + + return image + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def clip_skip(self): + return self._clip_skip + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 + + @property + def cross_attention_kwargs(self): + return self._cross_attention_kwargs + + @property + def num_timesteps(self): + return self._num_timesteps + + @torch.no_grad() + def __call__( + self, + prompt: Union[str, List[str]] = None, + num_frames: Optional[int] = 16, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_videos_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + conditioning_frames: Optional[List[PipelineImageInput]] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + controlnet_conditioning_scale: Union[float, List[float]] = 1.0, + guess_mode: bool = False, + control_guidance_start: Union[float, List[float]] = 0.0, + control_guidance_end: Union[float, List[float]] = 1.0, + clip_skip: Optional[int] = None, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + **kwargs, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated video. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated video. 
+ num_frames (`int`, *optional*, defaults to 16): + The number of video frames that are generated. Defaults to 16 frames which at 8 frames per seconds + amounts to 2 seconds of video. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality videos at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. Latents should be of shape + `(batch_size, num_channel, num_frames, height, width)`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + ip_adapter_image (`PipelineImageInput`, *optional*): + Optional image input to work with IP Adapters. + conditioning_frames (`List[PipelineImageInput]`, *optional*): + The ControlNet input condition to provide guidance to the `unet` for generation. If multiple ControlNets + are specified, images must be passed as a list such that each element of the list can be correctly + batched for input to a single ControlNet. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or + `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] instead + of a plain tuple. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0): + The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added + to the residual in the original `unet`. 
If multiple ControlNets are specified in `init`, you can set + the corresponding scale as a list. + guess_mode (`bool`, *optional*, defaults to `False`): + The ControlNet encoder tries to recognize the content of the input image even if you remove all + prompts. A `guidance_scale` value between 3.0 and 5.0 is recommended. + control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0): + The percentage of total steps at which the ControlNet starts applying. + control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0): + The percentage of total steps at which the ControlNet stops applying. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + allback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeine class. + + Examples: + + Returns: + [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] is + returned, otherwise a `tuple` is returned where the first element is a list with the generated frames. + """ + + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + if callback is not None: + deprecate( + "callback", + "1.0.0", + "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", + ) + if callback_steps is not None: + deprecate( + "callback_steps", + "1.0.0", + "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", + ) + + controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet + + # align format for control guidance + if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list): + control_guidance_start = len(control_guidance_end) * [control_guidance_start] + elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list): + control_guidance_end = len(control_guidance_start) * [control_guidance_end] + elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list): + mult = len(controlnet.nets) if isinstance(controlnet, MultiControlNetModel) else 1 + control_guidance_start, control_guidance_end = ( + mult * [control_guidance_start], + mult * [control_guidance_end], + ) + + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + + num_videos_per_prompt = 1 + + # 1. Check inputs. 
Raise error if not correct + self.check_inputs( + prompt=prompt, + height=height, + width=width, + callback_steps=callback_steps, + negative_prompt=negative_prompt, + callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + image=conditioning_frames, + controlnet_conditioning_scale=controlnet_conditioning_scale, + control_guidance_start=control_guidance_start, + control_guidance_end=control_guidance_end, + ) + + self._guidance_scale = guidance_scale + self._clip_skip = clip_skip + self._cross_attention_kwargs = cross_attention_kwargs + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float): + controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets) + + global_pool_conditions = ( + controlnet.config.global_pool_conditions + if isinstance(controlnet, ControlNetModel) + else controlnet.nets[0].config.global_pool_conditions + ) + guess_mode = guess_mode or global_pool_conditions + + # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + device, + num_videos_per_prompt, + self.do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, + clip_skip=self.clip_skip, + ) + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + if self.do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + if ip_adapter_image is not None: + image_embeds, negative_image_embeds = self.encode_image(ip_adapter_image, device, num_videos_per_prompt) + if self.do_classifier_free_guidance: + image_embeds = torch.cat([negative_image_embeds, image_embeds]) + + if isinstance(controlnet, ControlNetModel): + conditioning_frames = self.prepare_image( + image=conditioning_frames, + width=width, + height=height, + batch_size=batch_size * num_videos_per_prompt * num_frames, + num_images_per_prompt=num_videos_per_prompt, + device=device, + dtype=controlnet.dtype, + do_classifier_free_guidance=self.do_classifier_free_guidance, + guess_mode=guess_mode, + ) + elif isinstance(controlnet, MultiControlNetModel): + cond_prepared_frames = [] + for frame_ in conditioning_frames: + prepared_frame = self.prepare_image( + image=frame_, + width=width, + height=height, + batch_size=batch_size * num_videos_per_prompt * num_frames, + num_images_per_prompt=num_videos_per_prompt, + device=device, + dtype=controlnet.dtype, + do_classifier_free_guidance=self.do_classifier_free_guidance, + guess_mode=guess_mode, + ) + + cond_prepared_frames.append(prepared_frame) + + conditioning_frames = cond_prepared_frames + else: + assert False + + # 4. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + self._num_timesteps = len(timesteps) + + # 5. 
Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + batch_size * num_videos_per_prompt, + num_channels_latents, + num_frames, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7. Add image embeds for IP-Adapter + added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None + + # 7.1 Create tensor stating which controlnets to keep + controlnet_keep = [] + for i in range(len(timesteps)): + keeps = [ + 1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e) + for s, e in zip(control_guidance_start, control_guidance_end) + ] + controlnet_keep.append(keeps[0] if isinstance(controlnet, ControlNetModel) else keeps) + + # Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + if guess_mode and self.do_classifier_free_guidance: + # Infer ControlNet only for the conditional batch. + control_model_input = latents + control_model_input = self.scheduler.scale_model_input(control_model_input, t) + controlnet_prompt_embeds = prompt_embeds.chunk(2)[1] + else: + control_model_input = latent_model_input + controlnet_prompt_embeds = prompt_embeds + controlnet_prompt_embeds = controlnet_prompt_embeds.repeat_interleave(num_frames, dim=0) + + if isinstance(controlnet_keep[i], list): + cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])] + else: + controlnet_cond_scale = controlnet_conditioning_scale + if isinstance(controlnet_cond_scale, list): + controlnet_cond_scale = controlnet_cond_scale[0] + cond_scale = controlnet_cond_scale * controlnet_keep[i] + + control_model_input = torch.transpose(control_model_input, 1, 2) + control_model_input = control_model_input.reshape( + (-1, control_model_input.shape[2], control_model_input.shape[3], control_model_input.shape[4]) + ) + + down_block_res_samples, mid_block_res_sample = self.controlnet( + control_model_input, + t, + encoder_hidden_states=controlnet_prompt_embeds, + controlnet_cond=conditioning_frames, + conditioning_scale=cond_scale, + guess_mode=guess_mode, + return_dict=False, + ) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=self.cross_attention_kwargs, + added_cond_kwargs=added_cond_kwargs, + down_block_additional_residuals=down_block_res_samples, + mid_block_additional_residual=mid_block_res_sample, + ).sample + + # perform guidance + if self.do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = 
callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + callback(i, t, latents) + + if output_type == "latent": + return AnimateDiffControlNetPipelineOutput(frames=latents) + + # Post-processing + video_tensor = self.decode_latents(latents) + + if output_type == "pt": + video = video_tensor + else: + video = tensor2vid(video_tensor, self.image_processor, output_type=output_type) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (video,) + + return AnimateDiffControlNetPipelineOutput(frames=video) From 3dc2362b5a89380f66ac006b1a787411fa1a9574 Mon Sep 17 00:00:00 2001 From: Beinsezii <39478211+Beinsezii@users.noreply.github.com> Date: Wed, 6 Dec 2023 23:51:04 -0800 Subject: [PATCH 84/99] EulerDiscreteScheduler add `rescale_betas_zero_snr` (#6024) * EulerDiscreteScheduler add `rescale_betas_zero_snr` --- .../schedulers/scheduling_euler_discrete.py | 56 +++++++++++++++++++ tests/schedulers/test_scheduler_euler.py | 4 ++ 2 files changed, 60 insertions(+) diff --git a/src/diffusers/schedulers/scheduling_euler_discrete.py b/src/diffusers/schedulers/scheduling_euler_discrete.py index 0e2dd5c983e3..802ba0f099f9 100644 --- a/src/diffusers/schedulers/scheduling_euler_discrete.py +++ b/src/diffusers/schedulers/scheduling_euler_discrete.py @@ -92,6 +92,43 @@ def alpha_bar_fn(t): return torch.tensor(betas, dtype=torch.float32) +# Copied from diffusers.schedulers.scheduling_ddim.rescale_zero_terminal_snr +def rescale_zero_terminal_snr(betas): + """ + Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1) + + + Args: + betas (`torch.FloatTensor`): + the betas that the scheduler is being initialized with. + + Returns: + `torch.FloatTensor`: rescaled betas with zero terminal SNR + """ + # Convert betas to alphas_bar_sqrt + alphas = 1.0 - betas + alphas_cumprod = torch.cumprod(alphas, dim=0) + alphas_bar_sqrt = alphas_cumprod.sqrt() + + # Store old values. + alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone() + alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone() + + # Shift so the last timestep is zero. + alphas_bar_sqrt -= alphas_bar_sqrt_T + + # Scale so the first timestep is back to the old value. + alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T) + + # Convert alphas_bar_sqrt to betas + alphas_bar = alphas_bar_sqrt**2 # Revert sqrt + alphas = alphas_bar[1:] / alphas_bar[:-1] # Revert cumprod + alphas = torch.cat([alphas_bar[0:1], alphas]) + betas = 1 - alphas + + return betas + + class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin): """ Euler scheduler. @@ -128,6 +165,10 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin): An offset added to the inference steps. You can use a combination of `offset=1` and `set_alpha_to_one=False` to make the last step use step 0 for the previous alpha product like in Stable Diffusion. + rescale_betas_zero_snr (`bool`, defaults to `False`): + Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and + dark samples instead of limiting it to samples with medium brightness. 
Loosely related to + [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506). """ _compatibles = [e.name for e in KarrasDiffusionSchedulers] @@ -149,6 +190,7 @@ def __init__( timestep_spacing: str = "linspace", timestep_type: str = "discrete", # can be "discrete" or "continuous" steps_offset: int = 0, + rescale_betas_zero_snr: bool = False, ): if trained_betas is not None: self.betas = torch.tensor(trained_betas, dtype=torch.float32) @@ -163,9 +205,17 @@ def __init__( else: raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") + if rescale_betas_zero_snr: + self.betas = rescale_zero_terminal_snr(self.betas) + self.alphas = 1.0 - self.betas self.alphas_cumprod = torch.cumprod(self.alphas, dim=0) + if rescale_betas_zero_snr: + # Close to 0 without being 0 so first sigma is not inf + # FP16 smallest positive subnormal works well here + self.alphas_cumprod[-1] = 2**-24 + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=float)[::-1].copy() @@ -420,6 +470,9 @@ def step( if self.step_index is None: self._init_step_index(timestep) + # Upcast to avoid precision issues when computing prev_sample + sample = sample.to(torch.float32) + sigma = self.sigmas[self.step_index] gamma = min(s_churn / (len(self.sigmas) - 1), 2**0.5 - 1) if s_tmin <= sigma <= s_tmax else 0.0 @@ -456,6 +509,9 @@ def step( prev_sample = sample + derivative * dt + # Cast sample back to model compatible dtype + prev_sample = prev_sample.to(model_output.dtype) + # upon completion increase step index by one self._step_index += 1 diff --git a/tests/schedulers/test_scheduler_euler.py b/tests/schedulers/test_scheduler_euler.py index 3249d7032bad..41c418c5064c 100644 --- a/tests/schedulers/test_scheduler_euler.py +++ b/tests/schedulers/test_scheduler_euler.py @@ -45,6 +45,10 @@ def test_timestep_type(self): def test_karras_sigmas(self): self.check_over_configs(use_karras_sigmas=True, sigma_min=0.02, sigma_max=700.0) + def test_rescale_betas_zero_snr(self): + for rescale_betas_zero_snr in [True, False]: + self.check_over_configs(rescale_betas_zero_snr=rescale_betas_zero_snr) + def test_full_loop_no_noise(self): scheduler_class = self.scheduler_classes[0] scheduler_config = self.get_scheduler_config() From 26a8c00840be3f4ca1f7339f1fc44dd9f3faa8d2 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 7 Dec 2023 13:57:27 +0530 Subject: [PATCH 85/99] Revert "[Community] AnimateDiff + Controlnet Pipeline (#5928)" This reverts commit 821726d7c0fba25f06ed8bba26984d9ccc014871. 
--- examples/community/README.md | 65 - .../pipeline_animatediff_controlnet.py | 1137 ----------------- 2 files changed, 1202 deletions(-) delete mode 100644 examples/community/pipeline_animatediff_controlnet.py diff --git a/examples/community/README.md b/examples/community/README.md index 78a89acf7a57..1d13e2822b77 100755 --- a/examples/community/README.md +++ b/examples/community/README.md @@ -50,7 +50,6 @@ prompt-to-prompt | change parts of a prompt and retain image structure (see [pap | Latent Consistency Interpolation Pipeline | Interpolate the latent space of Latent Consistency Models with multiple prompts | [Latent Consistency Interpolation Pipeline](#latent-consistency-interpolation-pipeline) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1pK3NrLWJSiJsBynLns1K1-IDTW9zbPvl?usp=sharing) | [Aryan V S](https://github.com/a-r-r-o-w) | | Regional Prompting Pipeline | Assign multiple prompts for different regions | [Regional Prompting Pipeline](#regional-prompting-pipeline) | - | [hako-mikan](https://github.com/hako-mikan) | | LDM3D-sr (LDM3D upscaler) | Upscale low resolution RGB and depth inputs to high resolution | [StableDiffusionUpscaleLDM3D Pipeline](https://github.com/estelleafl/diffusers/tree/ldm3d_upscaler_community/examples/community#stablediffusionupscaleldm3d-pipeline) | - | [Estelle Aflalo](https://github.com/estelleafl) | -| AnimateDiff ControlNet Pipeline | Combines AnimateDiff with precise motion control using ControlNets | [AnimateDiff ControlNet Pipeline](#animatediff-controlnet-pipeline) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1SKboYeGjEQmQPWoFC0aLYpBlYdHXkvAu?usp=sharing) | [Aryan V S](https://github.com/a-r-r-o-w) and [Edoardo Botta](https://github.com/EdoardoBotta) | | DemoFusion Pipeline | Implementation of [DemoFusion: Democratising High-Resolution Image Generation With No $$$](https://arxiv.org/abs/2311.16973) | [DemoFusion Pipeline](#DemoFusion) | - | [Ruoyi Du](https://github.com/RuoyiDu) | To load a custom pipeline you just need to pass the `custom_pipeline` argument to `DiffusionPipeline`, as one of the files in `diffusers/examples/community`. Feel free to send a PR with your own pipelines, we will merge them quickly. @@ -2840,70 +2839,6 @@ The Pipeline supports `compel` syntax. Input prompts using the `compel` structur * Reconstructed image: * ![dps_generated_image](https://github.com/tongdaxu/Images/assets/22267548/b74f084d-93f4-4845-83d8-44c0fa758a5f) -### AnimateDiff ControlNet Pipeline - -This pipeline combines AnimateDiff and ControlNet. Enjoy precise motion control for your videos! Refer to [this](https://github.com/huggingface/diffusers/issues/5866) issue for more details. 
- -```py -import torch -from diffusers import AutoencoderKL, ControlNetModel, MotionAdapter -from diffusers.pipelines import DiffusionPipeline -from diffusers.schedulers import DPMSolverMultistepScheduler -from PIL import Image - -motion_id = "guoyww/animatediff-motion-adapter-v1-5-2" -adapter = MotionAdapter.from_pretrained(motion_id) -controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_openpose", torch_dtype=torch.float16) -vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=torch.float16) - -model_id = "SG161222/Realistic_Vision_V5.1_noVAE" -pipe = DiffusionPipeline.from_pretrained( - model_id, - motion_adapter=adapter, - controlnet=controlnet, - vae=vae, - custom_pipeline="pipeline_animatediff_controlnet", -).to(device="cuda", dtype=torch.float16) -pipe.scheduler = DPMSolverMultistepScheduler.from_pretrained( - model_id, subfolder="scheduler", clip_sample=False, timestep_spacing="linspace", steps_offset=1 -) -pipe.enable_vae_slicing() - -conditioning_frames = [] -for i in range(1, 16 + 1): - conditioning_frames.append(Image.open(f"frame_{i}.png")) - -prompt = "astronaut in space, dancing" -negative_prompt = "bad quality, worst quality, jpeg artifacts, ugly" -result = pipe( - prompt=prompt, - negative_prompt=negative_prompt, - width=512, - height=768, - conditioning_frames=conditioning_frames, - num_inference_steps=12, -).frames[0] - -from diffusers.utils import export_to_gif -export_to_gif(result.frames[0], "result.gif") -``` - - - - - - - - - - - - - - - - -
- [HTML results table: a "Conditioning Frames" row showing the input-frames image, followed by output GIF pairs for the AnimateDiff models SG161222/Realistic_Vision_V5.1_noVAE and CardosAnime]
### DemoFusion This pipeline is the official implementation of [DemoFusion: Democratising High-Resolution Image Generation With No $$$](https://arxiv.org/abs/2311.16973). The original repo can be found at [repo](https://github.com/PRIS-CV/DemoFusion). diff --git a/examples/community/pipeline_animatediff_controlnet.py b/examples/community/pipeline_animatediff_controlnet.py deleted file mode 100644 index 785f1ee55ec2..000000000000 --- a/examples/community/pipeline_animatediff_controlnet.py +++ /dev/null @@ -1,1137 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect -from dataclasses import dataclass -from typing import Any, Callable, Dict, List, Optional, Union - -import numpy as np -import torch -import torch.nn.functional as F -from PIL import Image -from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection - -from diffusers.image_processor import PipelineImageInput, VaeImageProcessor -from diffusers.loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin -from diffusers.models import AutoencoderKL, ControlNetModel, UNet2DConditionModel, UNetMotionModel -from diffusers.models.lora import adjust_lora_scale_text_encoder -from diffusers.models.unet_motion_model import MotionAdapter -from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel -from diffusers.pipelines.pipeline_utils import DiffusionPipeline -from diffusers.schedulers import ( - DDIMScheduler, - DPMSolverMultistepScheduler, - EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, - LMSDiscreteScheduler, - PNDMScheduler, -) -from diffusers.utils import USE_PEFT_BACKEND, BaseOutput, deprecate, logging, scale_lora_layers, unscale_lora_layers -from diffusers.utils.torch_utils import is_compiled_module, randn_tensor - - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - -EXAMPLE_DOC_STRING = """ - Examples: - ```py - >>> import torch - >>> from diffusers import AutoencoderKL, ControlNetModel, MotionAdapter - >>> from diffusers.pipelines import DiffusionPipeline - >>> from diffusers.schedulers import DPMSolverMultistepScheduler - >>> from PIL import Image - - >>> motion_id = "guoyww/animatediff-motion-adapter-v1-5-2" - >>> adapter = MotionAdapter.from_pretrained(motion_id) - >>> controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_openpose", torch_dtype=torch.float16) - >>> vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=torch.float16) - - >>> model_id = "SG161222/Realistic_Vision_V5.1_noVAE" - >>> pipe = DiffusionPipeline.from_pretrained( - ... model_id, - ... motion_adapter=adapter, - ... controlnet=controlnet, - ... vae=vae, - ... custom_pipeline="pipeline_animatediff_controlnet", - ... ).to(device="cuda", dtype=torch.float16) - >>> pipe.scheduler = DPMSolverMultistepScheduler.from_pretrained( - ... 
model_id, subfolder="scheduler", clip_sample=False, timestep_spacing="linspace", steps_offset=1 - ... ) - >>> pipe.enable_vae_slicing() - - >>> conditioning_frames = [] - >>> for i in range(1, 16 + 1): - ... conditioning_frames.append(Image.open(f"frame_{i}.png")) - - >>> prompt = "astronaut in space, dancing" - >>> negative_prompt = "bad quality, worst quality, jpeg artifacts, ugly" - >>> result = pipe( - ... prompt=prompt, - ... negative_prompt=negative_prompt, - ... width=512, - ... height=768, - ... conditioning_frames=conditioning_frames, - ... num_inference_steps=12, - ... ).frames[0] - - >>> from diffusers.utils import export_to_gif - >>> export_to_gif(result.frames[0], "result.gif") - ``` -""" - - -def tensor2vid(video: torch.Tensor, processor, output_type="np"): - # Based on: - # https://github.com/modelscope/modelscope/blob/1509fdb973e5871f37148a4b5e5964cafd43e64d/modelscope/pipelines/multi_modal/text_to_video_synthesis_pipeline.py#L78 - - batch_size, channels, num_frames, height, width = video.shape - outputs = [] - for batch_idx in range(batch_size): - batch_vid = video[batch_idx].permute(1, 0, 2, 3) - batch_output = processor.postprocess(batch_vid, output_type) - - outputs.append(batch_output) - - return outputs - - -@dataclass -class AnimateDiffControlNetPipelineOutput(BaseOutput): - frames: Union[torch.Tensor, np.ndarray] - - -class AnimateDiffControlNetPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin): - r""" - Pipeline for text-to-video generation. - - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods - implemented for all pipelines (downloading, saving, running on a particular device, etc.). - - The pipeline also inherits the following loading methods: - - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings - - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights - - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters - - Args: - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. - text_encoder ([`CLIPTextModel`]): - Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). - tokenizer (`CLIPTokenizer`): - A [`~transformers.CLIPTokenizer`] to tokenize text. - unet ([`UNet2DConditionModel`]): - A [`UNet2DConditionModel`] used to create a UNetMotionModel to denoise the encoded video latents. - motion_adapter ([`MotionAdapter`]): - A [`MotionAdapter`] to be used in combination with `unet` to denoise the encoded video latents. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of - [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. 
- """ - - model_cpu_offload_seq = "text_encoder->unet->vae" - _optional_components = ["feature_extractor", "image_encoder"] - _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"] - - def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - motion_adapter: MotionAdapter, - controlnet: Union[ControlNetModel, MultiControlNetModel], - scheduler: Union[ - DDIMScheduler, - PNDMScheduler, - LMSDiscreteScheduler, - EulerDiscreteScheduler, - EulerAncestralDiscreteScheduler, - DPMSolverMultistepScheduler, - ], - feature_extractor: Optional[CLIPImageProcessor] = None, - image_encoder: Optional[CLIPVisionModelWithProjection] = None, - ): - super().__init__() - unet = UNetMotionModel.from_unet2d(unet, motion_adapter) - - self.register_modules( - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - unet=unet, - motion_adapter=motion_adapter, - controlnet=controlnet, - scheduler=scheduler, - feature_extractor=feature_extractor, - image_encoder=image_encoder, - ) - self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) - self.control_image_processor = VaeImageProcessor( - vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False - ) - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt with num_images_per_prompt -> num_videos_per_prompt - def encode_prompt( - self, - prompt, - device, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - lora_scale: Optional[float] = None, - clip_skip: Optional[int] = None, - ): - r""" - Encodes the prompt into text encoder hidden states. - - Args: - prompt (`str` or `List[str]`, *optional*): - prompt to be encoded - device: (`torch.device`): - torch device - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - lora_scale (`float`, *optional*): - A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. - clip_skip (`int`, *optional*): - Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that - the output of the pre-final layer will be used for computing the prompt embeddings. 
- """ - # set lora scale so that monkey patched LoRA - # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): - self._lora_scale = lora_scale - - # dynamically adjust the LoRA scale - if not USE_PEFT_BACKEND: - adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) - else: - scale_lora_layers(self.text_encoder, lora_scale) - - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if prompt_embeds is None: - # textual inversion: procecss multi-vector tokens if necessary - if isinstance(self, TextualInversionLoaderMixin): - prompt = self.maybe_convert_prompt(prompt, self.tokenizer) - - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="pt", - ) - text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( - text_input_ids, untruncated_ids - ): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] - ) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}" - ) - - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = text_inputs.attention_mask.to(device) - else: - attention_mask = None - - if clip_skip is None: - prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask) - prompt_embeds = prompt_embeds[0] - else: - prompt_embeds = self.text_encoder( - text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True - ) - # Access the `hidden_states` first, that contains a tuple of - # all the hidden states from the encoder layers. Then index into - # the tuple to access the hidden states from the desired layer. - prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)] - # We also need to apply the final LayerNorm here to not mess with the - # representations. The `last_hidden_states` that we typically use for - # obtaining the final prompt representations passes through the LayerNorm - # layer. 
- prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) - - if self.text_encoder is not None: - prompt_embeds_dtype = self.text_encoder.dtype - elif self.unet is not None: - prompt_embeds_dtype = self.unet.dtype - else: - prompt_embeds_dtype = prompt_embeds.dtype - - prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) - - bs_embed, seq_len, _ = prompt_embeds.shape - # duplicate text embeddings for each generation per prompt, using mps friendly method - prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) - prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) - - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance and negative_prompt_embeds is None: - uncond_tokens: List[str] - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif prompt is not None and type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." - ) - else: - uncond_tokens = negative_prompt - - # textual inversion: procecss multi-vector tokens if necessary - if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) - - max_length = prompt_embeds.shape[1] - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="pt", - ) - - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = uncond_input.attention_mask.to(device) - else: - attention_mask = None - - negative_prompt_embeds = self.text_encoder( - uncond_input.input_ids.to(device), - attention_mask=attention_mask, - ) - negative_prompt_embeds = negative_prompt_embeds[0] - - if do_classifier_free_guidance: - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = negative_prompt_embeds.shape[1] - - negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) - - negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) - negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) - - if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: - # Retrieve the original scale by scaling back the LoRA layers - unscale_lora_layers(self.text_encoder, lora_scale) - - return prompt_embeds, negative_prompt_embeds - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image - def encode_image(self, image, device, num_images_per_prompt): - dtype = next(self.image_encoder.parameters()).dtype - - if not isinstance(image, torch.Tensor): - image = self.feature_extractor(image, return_tensors="pt").pixel_values - - image = image.to(device=device, dtype=dtype) - image_embeds = self.image_encoder(image).image_embeds - image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) - - uncond_image_embeds = torch.zeros_like(image_embeds) - 
return image_embeds, uncond_image_embeds - - # Copied from diffusers.pipelines.text_to_video_synthesis/pipeline_text_to_video_synth.TextToVideoSDPipeline.decode_latents - def decode_latents(self, latents): - latents = 1 / self.vae.config.scaling_factor * latents - - batch_size, channels, num_frames, height, width = latents.shape - latents = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width) - - image = self.vae.decode(latents).sample - video = ( - image[None, :] - .reshape( - ( - batch_size, - num_frames, - -1, - ) - + image.shape[2:] - ) - .permute(0, 2, 1, 3, 4) - ) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - video = video.float() - return video - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling - def enable_vae_tiling(self): - r""" - Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to - compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow - processing larger images. - """ - self.vae.enable_tiling() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling - def disable_vae_tiling(self): - r""" - Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_tiling() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu - def enable_freeu(self, s1: float, s2: float, b1: float, b2: float): - r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497. - - The suffixes after the scaling factors represent the stages where they are being applied. - - Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values - that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. - - Args: - s1 (`float`): - Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - s2 (`float`): - Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. - b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. 
- """ - if not hasattr(self, "unet"): - raise ValueError("The pipeline must have `unet` for using FreeU.") - self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2) - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu - def disable_freeu(self): - """Disables the FreeU mechanism if enabled.""" - self.unet.disable_freeu() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs - def prepare_extra_step_kwargs(self, generator, eta): - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. - # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # check if the scheduler accepts generator - accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_generator: - extra_step_kwargs["generator"] = generator - return extra_step_kwargs - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs - def check_inputs( - self, - prompt, - height, - width, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, - callback_on_step_end_tensor_inputs=None, - image=None, - controlnet_conditioning_scale=1.0, - control_guidance_start=0.0, - control_guidance_end=1.0, - ): - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - - if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - if callback_on_step_end_tensor_inputs is not None and not all( - k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs - ): - raise ValueError( - f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" - ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." 
- ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." - ) - - # `prompt` needs more sophisticated handling when there are multiple - # conditionings. - if isinstance(self.controlnet, MultiControlNetModel): - if isinstance(prompt, list): - logger.warning( - f"You have {len(self.controlnet.nets)} ControlNets and you have passed {len(prompt)}" - " prompts. The conditionings will be fixed across the prompts." - ) - - # Check `image` - is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance( - self.controlnet, torch._dynamo.eval_frame.OptimizedModule - ) - if ( - isinstance(self.controlnet, ControlNetModel) - or is_compiled - and isinstance(self.controlnet._orig_mod, ControlNetModel) - ): - if isinstance(image, list): - for image_ in image: - self.check_image(image_, prompt, prompt_embeds) - else: - self.check_image(image, prompt, prompt_embeds) - elif ( - isinstance(self.controlnet, MultiControlNetModel) - or is_compiled - and isinstance(self.controlnet._orig_mod, MultiControlNetModel) - ): - if not isinstance(image, list): - raise TypeError("For multiple controlnets: `image` must be type `list`") - - # When `image` is a nested list: - # (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]]) - elif any(isinstance(i, list) for i in image): - raise ValueError("A single batch of multiple conditionings are supported at the moment.") - elif len(image) != len(self.controlnet.nets): - raise ValueError( - f"For multiple controlnets: `image` must have the same length as the number of controlnets, but got {len(image)} images and {len(self.controlnet.nets)} ControlNets." - ) - - for control_ in image: - for image_ in control_: - self.check_image(image_, prompt, prompt_embeds) - else: - assert False - - # Check `controlnet_conditioning_scale` - if ( - isinstance(self.controlnet, ControlNetModel) - or is_compiled - and isinstance(self.controlnet._orig_mod, ControlNetModel) - ): - if not isinstance(controlnet_conditioning_scale, float): - raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.") - elif ( - isinstance(self.controlnet, MultiControlNetModel) - or is_compiled - and isinstance(self.controlnet._orig_mod, MultiControlNetModel) - ): - if isinstance(controlnet_conditioning_scale, list): - if any(isinstance(i, list) for i in controlnet_conditioning_scale): - raise ValueError("A single batch of multiple conditionings are supported at the moment.") - elif isinstance(controlnet_conditioning_scale, list) and len(controlnet_conditioning_scale) != len( - self.controlnet.nets - ): - raise ValueError( - "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have" - " the same length as the number of controlnets" - ) - else: - assert False - - if not isinstance(control_guidance_start, (tuple, list)): - control_guidance_start = [control_guidance_start] - - if not isinstance(control_guidance_end, (tuple, list)): - control_guidance_end = [control_guidance_end] - - if len(control_guidance_start) != len(control_guidance_end): - raise ValueError( - f"`control_guidance_start` has {len(control_guidance_start)} elements, but `control_guidance_end` has {len(control_guidance_end)} elements. 
Make sure to provide the same number of elements to each list." - ) - - if isinstance(self.controlnet, MultiControlNetModel): - if len(control_guidance_start) != len(self.controlnet.nets): - raise ValueError( - f"`control_guidance_start`: {control_guidance_start} has {len(control_guidance_start)} elements but there are {len(self.controlnet.nets)} controlnets available. Make sure to provide {len(self.controlnet.nets)}." - ) - - for start, end in zip(control_guidance_start, control_guidance_end): - if start >= end: - raise ValueError( - f"control guidance start: {start} cannot be larger or equal to control guidance end: {end}." - ) - if start < 0.0: - raise ValueError(f"control guidance start: {start} can't be smaller than 0.") - if end > 1.0: - raise ValueError(f"control guidance end: {end} can't be larger than 1.0.") - - # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.check_image - def check_image(self, image, prompt, prompt_embeds): - image_is_pil = isinstance(image, Image.Image) - image_is_tensor = isinstance(image, torch.Tensor) - image_is_np = isinstance(image, np.ndarray) - image_is_pil_list = isinstance(image, list) and isinstance(image[0], Image.Image) - image_is_tensor_list = isinstance(image, list) and isinstance(image[0], torch.Tensor) - image_is_np_list = isinstance(image, list) and isinstance(image[0], np.ndarray) - - if ( - not image_is_pil - and not image_is_tensor - and not image_is_np - and not image_is_pil_list - and not image_is_tensor_list - and not image_is_np_list - ): - raise TypeError( - f"image must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images, list of numpy arrays or list of torch tensors, but is {type(image)}" - ) - - if image_is_pil: - image_batch_size = 1 - else: - image_batch_size = len(image) - - if prompt is not None and isinstance(prompt, str): - prompt_batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - prompt_batch_size = len(prompt) - elif prompt_embeds is not None: - prompt_batch_size = prompt_embeds.shape[0] - - if image_batch_size != 1 and image_batch_size != prompt_batch_size: - raise ValueError( - f"If image batch size is not 1, image batch size must be same as prompt batch size. image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}" - ) - - # Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_synth.TextToVideoSDPipeline.prepare_latents - def prepare_latents( - self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None - ): - shape = ( - batch_size, - num_channels_latents, - num_frames, - height // self.vae_scale_factor, - width // self.vae_scale_factor, - ) - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
- ) - - if latents is None: - latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) - else: - latents = latents.to(device) - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - return latents - - # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.prepare_image - def prepare_image( - self, - image, - width, - height, - batch_size, - num_images_per_prompt, - device, - dtype, - do_classifier_free_guidance=False, - guess_mode=False, - ): - image = self.control_image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32) - image_batch_size = image.shape[0] - - if image_batch_size == 1: - repeat_by = batch_size - else: - # image batch size is the same as prompt batch size - repeat_by = num_images_per_prompt - - image = image.repeat_interleave(repeat_by, dim=0) - - image = image.to(device=device, dtype=dtype) - - if do_classifier_free_guidance and not guess_mode: - image = torch.cat([image] * 2) - - return image - - @property - def guidance_scale(self): - return self._guidance_scale - - @property - def clip_skip(self): - return self._clip_skip - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - @property - def do_classifier_free_guidance(self): - return self._guidance_scale > 1 - - @property - def cross_attention_kwargs(self): - return self._cross_attention_kwargs - - @property - def num_timesteps(self): - return self._num_timesteps - - @torch.no_grad() - def __call__( - self, - prompt: Union[str, List[str]] = None, - num_frames: Optional[int] = 16, - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_videos_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - ip_adapter_image: Optional[PipelineImageInput] = None, - conditioning_frames: Optional[List[PipelineImageInput]] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - controlnet_conditioning_scale: Union[float, List[float]] = 1.0, - guess_mode: bool = False, - control_guidance_start: Union[float, List[float]] = 0.0, - control_guidance_end: Union[float, List[float]] = 1.0, - clip_skip: Optional[int] = None, - callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, - callback_on_step_end_tensor_inputs: List[str] = ["latents"], - **kwargs, - ): - r""" - The call function to the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): - The height in pixels of the generated video. - width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): - The width in pixels of the generated video. 
- num_frames (`int`, *optional*, defaults to 16): - The number of video frames that are generated. Defaults to 16 frames which at 8 frames per seconds - amounts to 2 seconds of video. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality videos at the - expense of slower inference. - guidance_scale (`float`, *optional*, defaults to 7.5): - A higher guidance scale value encourages the model to generate images closely linked to the text - `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide what to not include in image generation. If not defined, you need to - pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies - to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. - generator (`torch.Generator` or `List[torch.Generator]`, *optional*): - A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make - generation deterministic. - latents (`torch.FloatTensor`, *optional*): - Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor is generated by sampling using the supplied random `generator`. Latents should be of shape - `(batch_size, num_channel, num_frames, height, width)`. - prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not - provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If - not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. - ip_adapter_image (`PipelineImageInput`, *optional*): - Optional image input to work with IP Adapters. - conditioning_frames (`List[PipelineImageInput]`, *optional*): - The ControlNet input condition to provide guidance to the `unet` for generation. If multiple ControlNets - are specified, images must be passed as a list such that each element of the list can be correctly - batched for input to a single ControlNet. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or - `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] instead - of a plain tuple. - cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in - [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). - controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0): - The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added - to the residual in the original `unet`. 
If multiple ControlNets are specified in `init`, you can set - the corresponding scale as a list. - guess_mode (`bool`, *optional*, defaults to `False`): - The ControlNet encoder tries to recognize the content of the input image even if you remove all - prompts. A `guidance_scale` value between 3.0 and 5.0 is recommended. - control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0): - The percentage of total steps at which the ControlNet starts applying. - control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0): - The percentage of total steps at which the ControlNet stops applying. - clip_skip (`int`, *optional*): - Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that - the output of the pre-final layer will be used for computing the prompt embeddings. - allback_on_step_end (`Callable`, *optional*): - A function that calls at the end of each denoising steps during the inference. The function is called - with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, - callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by - `callback_on_step_end_tensor_inputs`. - callback_on_step_end_tensor_inputs (`List`, *optional*): - The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list - will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the - `._callback_tensor_inputs` attribute of your pipeine class. - - Examples: - - Returns: - [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] or `tuple`: - If `return_dict` is `True`, [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] is - returned, otherwise a `tuple` is returned where the first element is a list with the generated frames. - """ - - callback = kwargs.pop("callback", None) - callback_steps = kwargs.pop("callback_steps", None) - - if callback is not None: - deprecate( - "callback", - "1.0.0", - "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", - ) - if callback_steps is not None: - deprecate( - "callback_steps", - "1.0.0", - "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", - ) - - controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet - - # align format for control guidance - if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list): - control_guidance_start = len(control_guidance_end) * [control_guidance_start] - elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list): - control_guidance_end = len(control_guidance_start) * [control_guidance_end] - elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list): - mult = len(controlnet.nets) if isinstance(controlnet, MultiControlNetModel) else 1 - control_guidance_start, control_guidance_end = ( - mult * [control_guidance_start], - mult * [control_guidance_end], - ) - - # 0. Default height and width to unet - height = height or self.unet.config.sample_size * self.vae_scale_factor - width = width or self.unet.config.sample_size * self.vae_scale_factor - - num_videos_per_prompt = 1 - - # 1. Check inputs. 
Raise error if not correct - self.check_inputs( - prompt=prompt, - height=height, - width=width, - callback_steps=callback_steps, - negative_prompt=negative_prompt, - callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - image=conditioning_frames, - controlnet_conditioning_scale=controlnet_conditioning_scale, - control_guidance_start=control_guidance_start, - control_guidance_end=control_guidance_end, - ) - - self._guidance_scale = guidance_scale - self._clip_skip = clip_skip - self._cross_attention_kwargs = cross_attention_kwargs - - # 2. Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - device = self._execution_device - - if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float): - controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets) - - global_pool_conditions = ( - controlnet.config.global_pool_conditions - if isinstance(controlnet, ControlNetModel) - else controlnet.nets[0].config.global_pool_conditions - ) - guess_mode = guess_mode or global_pool_conditions - - # 3. Encode input prompt - text_encoder_lora_scale = ( - cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None - ) - prompt_embeds, negative_prompt_embeds = self.encode_prompt( - prompt, - device, - num_videos_per_prompt, - self.do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - lora_scale=text_encoder_lora_scale, - clip_skip=self.clip_skip, - ) - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - if self.do_classifier_free_guidance: - prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) - - if ip_adapter_image is not None: - image_embeds, negative_image_embeds = self.encode_image(ip_adapter_image, device, num_videos_per_prompt) - if self.do_classifier_free_guidance: - image_embeds = torch.cat([negative_image_embeds, image_embeds]) - - if isinstance(controlnet, ControlNetModel): - conditioning_frames = self.prepare_image( - image=conditioning_frames, - width=width, - height=height, - batch_size=batch_size * num_videos_per_prompt * num_frames, - num_images_per_prompt=num_videos_per_prompt, - device=device, - dtype=controlnet.dtype, - do_classifier_free_guidance=self.do_classifier_free_guidance, - guess_mode=guess_mode, - ) - elif isinstance(controlnet, MultiControlNetModel): - cond_prepared_frames = [] - for frame_ in conditioning_frames: - prepared_frame = self.prepare_image( - image=frame_, - width=width, - height=height, - batch_size=batch_size * num_videos_per_prompt * num_frames, - num_images_per_prompt=num_videos_per_prompt, - device=device, - dtype=controlnet.dtype, - do_classifier_free_guidance=self.do_classifier_free_guidance, - guess_mode=guess_mode, - ) - - cond_prepared_frames.append(prepared_frame) - - conditioning_frames = cond_prepared_frames - else: - assert False - - # 4. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps, device=device) - timesteps = self.scheduler.timesteps - self._num_timesteps = len(timesteps) - - # 5. 
Prepare latent variables - num_channels_latents = self.unet.config.in_channels - latents = self.prepare_latents( - batch_size * num_videos_per_prompt, - num_channels_latents, - num_frames, - height, - width, - prompt_embeds.dtype, - device, - generator, - latents, - ) - - # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 7. Add image embeds for IP-Adapter - added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None - - # 7.1 Create tensor stating which controlnets to keep - controlnet_keep = [] - for i in range(len(timesteps)): - keeps = [ - 1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e) - for s, e in zip(control_guidance_start, control_guidance_end) - ] - controlnet_keep.append(keeps[0] if isinstance(controlnet, ControlNetModel) else keeps) - - # Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - if guess_mode and self.do_classifier_free_guidance: - # Infer ControlNet only for the conditional batch. - control_model_input = latents - control_model_input = self.scheduler.scale_model_input(control_model_input, t) - controlnet_prompt_embeds = prompt_embeds.chunk(2)[1] - else: - control_model_input = latent_model_input - controlnet_prompt_embeds = prompt_embeds - controlnet_prompt_embeds = controlnet_prompt_embeds.repeat_interleave(num_frames, dim=0) - - if isinstance(controlnet_keep[i], list): - cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])] - else: - controlnet_cond_scale = controlnet_conditioning_scale - if isinstance(controlnet_cond_scale, list): - controlnet_cond_scale = controlnet_cond_scale[0] - cond_scale = controlnet_cond_scale * controlnet_keep[i] - - control_model_input = torch.transpose(control_model_input, 1, 2) - control_model_input = control_model_input.reshape( - (-1, control_model_input.shape[2], control_model_input.shape[3], control_model_input.shape[4]) - ) - - down_block_res_samples, mid_block_res_sample = self.controlnet( - control_model_input, - t, - encoder_hidden_states=controlnet_prompt_embeds, - controlnet_cond=conditioning_frames, - conditioning_scale=cond_scale, - guess_mode=guess_mode, - return_dict=False, - ) - - # predict the noise residual - noise_pred = self.unet( - latent_model_input, - t, - encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=self.cross_attention_kwargs, - added_cond_kwargs=added_cond_kwargs, - down_block_additional_residuals=down_block_res_samples, - mid_block_additional_residual=mid_block_res_sample, - ).sample - - # perform guidance - if self.do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - - if callback_on_step_end is not None: - callback_kwargs = {} - for k in callback_on_step_end_tensor_inputs: - callback_kwargs[k] = locals()[k] - callback_outputs = 
callback_on_step_end(self, i, t, callback_kwargs) - - latents = callback_outputs.pop("latents", latents) - prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) - negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - if output_type == "latent": - return AnimateDiffControlNetPipelineOutput(frames=latents) - - # Post-processing - video_tensor = self.decode_latents(latents) - - if output_type == "pt": - video = video_tensor - else: - video = tensor2vid(video_tensor, self.image_processor, output_type=output_type) - - # Offload all models - self.maybe_free_model_hooks() - - if not return_dict: - return (video,) - - return AnimateDiffControlNetPipelineOutput(frames=video) From 8db59d7ca4ccb174093b68c714a0ee2fc04ed833 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 7 Dec 2023 13:57:39 +0530 Subject: [PATCH 86/99] Revert "EulerDiscreteScheduler add `rescale_betas_zero_snr` (#6024)" This reverts commit 3dc2362b5a89380f66ac006b1a787411fa1a9574. --- .../schedulers/scheduling_euler_discrete.py | 56 ------------------- tests/schedulers/test_scheduler_euler.py | 4 -- 2 files changed, 60 deletions(-) diff --git a/src/diffusers/schedulers/scheduling_euler_discrete.py b/src/diffusers/schedulers/scheduling_euler_discrete.py index 802ba0f099f9..0e2dd5c983e3 100644 --- a/src/diffusers/schedulers/scheduling_euler_discrete.py +++ b/src/diffusers/schedulers/scheduling_euler_discrete.py @@ -92,43 +92,6 @@ def alpha_bar_fn(t): return torch.tensor(betas, dtype=torch.float32) -# Copied from diffusers.schedulers.scheduling_ddim.rescale_zero_terminal_snr -def rescale_zero_terminal_snr(betas): - """ - Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1) - - - Args: - betas (`torch.FloatTensor`): - the betas that the scheduler is being initialized with. - - Returns: - `torch.FloatTensor`: rescaled betas with zero terminal SNR - """ - # Convert betas to alphas_bar_sqrt - alphas = 1.0 - betas - alphas_cumprod = torch.cumprod(alphas, dim=0) - alphas_bar_sqrt = alphas_cumprod.sqrt() - - # Store old values. - alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone() - alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone() - - # Shift so the last timestep is zero. - alphas_bar_sqrt -= alphas_bar_sqrt_T - - # Scale so the first timestep is back to the old value. - alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T) - - # Convert alphas_bar_sqrt to betas - alphas_bar = alphas_bar_sqrt**2 # Revert sqrt - alphas = alphas_bar[1:] / alphas_bar[:-1] # Revert cumprod - alphas = torch.cat([alphas_bar[0:1], alphas]) - betas = 1 - alphas - - return betas - - class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin): """ Euler scheduler. @@ -165,10 +128,6 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin): An offset added to the inference steps. You can use a combination of `offset=1` and `set_alpha_to_one=False` to make the last step use step 0 for the previous alpha product like in Stable Diffusion. - rescale_betas_zero_snr (`bool`, defaults to `False`): - Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and - dark samples instead of limiting it to samples with medium brightness. 
Loosely related to - [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506). """ _compatibles = [e.name for e in KarrasDiffusionSchedulers] @@ -190,7 +149,6 @@ def __init__( timestep_spacing: str = "linspace", timestep_type: str = "discrete", # can be "discrete" or "continuous" steps_offset: int = 0, - rescale_betas_zero_snr: bool = False, ): if trained_betas is not None: self.betas = torch.tensor(trained_betas, dtype=torch.float32) @@ -205,17 +163,9 @@ def __init__( else: raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") - if rescale_betas_zero_snr: - self.betas = rescale_zero_terminal_snr(self.betas) - self.alphas = 1.0 - self.betas self.alphas_cumprod = torch.cumprod(self.alphas, dim=0) - if rescale_betas_zero_snr: - # Close to 0 without being 0 so first sigma is not inf - # FP16 smallest positive subnormal works well here - self.alphas_cumprod[-1] = 2**-24 - sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=float)[::-1].copy() @@ -470,9 +420,6 @@ def step( if self.step_index is None: self._init_step_index(timestep) - # Upcast to avoid precision issues when computing prev_sample - sample = sample.to(torch.float32) - sigma = self.sigmas[self.step_index] gamma = min(s_churn / (len(self.sigmas) - 1), 2**0.5 - 1) if s_tmin <= sigma <= s_tmax else 0.0 @@ -509,9 +456,6 @@ def step( prev_sample = sample + derivative * dt - # Cast sample back to model compatible dtype - prev_sample = prev_sample.to(model_output.dtype) - # upon completion increase step index by one self._step_index += 1 diff --git a/tests/schedulers/test_scheduler_euler.py b/tests/schedulers/test_scheduler_euler.py index 41c418c5064c..3249d7032bad 100644 --- a/tests/schedulers/test_scheduler_euler.py +++ b/tests/schedulers/test_scheduler_euler.py @@ -45,10 +45,6 @@ def test_timestep_type(self): def test_karras_sigmas(self): self.check_over_configs(use_karras_sigmas=True, sigma_min=0.02, sigma_max=700.0) - def test_rescale_betas_zero_snr(self): - for rescale_betas_zero_snr in [True, False]: - self.check_over_configs(rescale_betas_zero_snr=rescale_betas_zero_snr) - def test_full_loop_no_noise(self): scheduler_class = self.scheduler_classes[0] scheduler_config = self.get_scheduler_config() From 4e7fb4d05aa324ac046e4ed63157064d21ed0f8d Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 7 Dec 2023 14:08:33 +0530 Subject: [PATCH 87/99] add SDXL turbo --- benchmarks/base_classes.py | 28 +++++++++++++++++++++++++++ benchmarks/benchmark_sd_img.py | 5 +++-- benchmarks/benchmark_text_to_image.py | 5 +++-- benchmarks/run_all.py | 18 ++++++++++------- 4 files changed, 45 insertions(+), 11 deletions(-) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index f16107fa7224..450befeaeebb 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -41,6 +41,7 @@ "stabilityai/stable-diffusion-2-1": (768, 768), "stabilityai/stable-diffusion-xl-base-1.0": (1024, 1024), "stabilityai/stable-diffusion-xl-refiner-1.0": (1024, 1024), + "stabilityai/sdxl-turbo": (512, 512), } @@ -119,6 +120,19 @@ def benchmark(self, args): flush() +class TurboTextToImageBenchmark(TextToImageBenchmark): + def __init__(self, args): + super().__init__(args) + + def run_inference(self, pipe, args): + _ = pipe( + prompt=PROMPT, + num_inference_steps=args.num_inference_steps, + 
num_images_per_prompt=args.batch_size, + guidance_scale=0.0, + ) + + class ImageToImageBenchmark(TextToImageBenchmark): pipeline_class = AutoPipelineForImage2Image url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/1665_Girl_with_a_Pearl_Earring.jpg" @@ -137,6 +151,20 @@ def run_inference(self, pipe, args): ) +class TurboImageToImageBenchmark(ImageToImageBenchmark): + def __init__(self, args): + super().__init__(args) + + def run_inference(self, pipe, args): + _ = pipe( + prompt=PROMPT, + image=self.image, + num_inference_steps=args.num_inference_steps, + num_images_per_prompt=args.batch_size, + guidance_scale=0.0, + ) + + class InpaintingBenchmark(ImageToImageBenchmark): pipeline_class = AutoPipelineForInpainting mask_url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/overture-creations-5sI6fQgYIuo_mask.png" diff --git a/benchmarks/benchmark_sd_img.py b/benchmarks/benchmark_sd_img.py index 5525b4dae60b..491e7c9a65a9 100644 --- a/benchmarks/benchmark_sd_img.py +++ b/benchmarks/benchmark_sd_img.py @@ -3,7 +3,7 @@ sys.path.append(".") -from base_classes import ImageToImageBenchmark # noqa: E402 +from base_classes import ImageToImageBenchmark, TurboImageToImageBenchmark # noqa: E402 if __name__ == "__main__": @@ -16,6 +16,7 @@ "runwayml/stable-diffusion-v1-5", "stabilityai/stable-diffusion-2-1", "stabilityai/stable-diffusion-xl-refiner-1.0", + "stabilityai/sdxl-turbo", ], ) parser.add_argument("--batch_size", type=int, default=1) @@ -24,5 +25,5 @@ parser.add_argument("--run_compile", action="store_true") args = parser.parse_args() - benchmark_pipe = ImageToImageBenchmark(args) + benchmark_pipe = ImageToImageBenchmark(args) if "turbo" not in args.ckpt else TurboImageToImageBenchmark(args) benchmark_pipe.benchmark(args) diff --git a/benchmarks/benchmark_text_to_image.py b/benchmarks/benchmark_text_to_image.py index 50c04dd550c9..6d01ac558535 100644 --- a/benchmarks/benchmark_text_to_image.py +++ b/benchmarks/benchmark_text_to_image.py @@ -3,7 +3,7 @@ sys.path.append(".") -from base_classes import TextToImageBenchmark # noqa: E402 +from base_classes import TextToImageBenchmark, TurboTextToImageBenchmark # noqa: E402 if __name__ == "__main__": @@ -18,6 +18,7 @@ "stabilityai/stable-diffusion-xl-base-1.0", "kandinsky-community/kandinsky-2-2-decoder", "warp-ai/wuerstchen", + "stabilityai/sdxl-turbo", ], ) parser.add_argument("--batch_size", type=int, default=1) @@ -26,5 +27,5 @@ parser.add_argument("--run_compile", action="store_true") args = parser.parse_args() - benchmark_pipe = TextToImageBenchmark(args) + benchmark_pipe = TextToImageBenchmark(args) if "turbo" not in args.ckpt else TurboTextToImageBenchmark(args) benchmark_pipe.benchmark(args) diff --git a/benchmarks/run_all.py b/benchmarks/run_all.py index 913a24d98d12..2da592224766 100644 --- a/benchmarks/run_all.py +++ b/benchmarks/run_all.py @@ -55,13 +55,17 @@ def main(): command += " --run_compile" run_command(command.split()) - elif file in ["benchmark_sd_img.py", "benchmark_sd_inpainting.py"]: - sdxl_ckpt = ( - "stabilityai/stable-diffusion-xl-refiner-1.0" - if "inpainting" not in file - else "stabilityai/stable-diffusion-xl-base-1.0" - ) - command = f"python {file} --ckpt {sdxl_ckpt}" + elif file == "benchmark_sd_img.py": + for ckpt in ["stabilityai/stable-diffusion-xl-refiner-1.0", "stabilityai/sdxl-turbo"]: + command = f"python {file} --ckpt {ckpt}" + run_command(command.split()) + + command += " --run_compile" + run_command(command.split()) + + elif file == 
"benchmark_sd_inpainting.py": + sdxl_ckpt = "stabilityai/stable-diffusion-xl-base-1.0" + command = f"python {file} --ckpt {ckpt}" run_command(command.split()) command += " --run_compile" From e2df761178d6c4547ca0696ddfe61754f6ab7149 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 7 Dec 2023 16:36:18 +0530 Subject: [PATCH 88/99] add lcm lora to the mix as well. --- .github/workflows/benchmark.yml | 2 +- benchmarks/base_classes.py | 19 +++++++++++++++++++ benchmarks/benchmark_t2i_lcm_lora.py | 23 +++++++++++++++++++++++ benchmarks/benchmark_text_to_image.py | 27 ++++++++++++++++++--------- benchmarks/run_all.py | 25 ++++++++++++++++--------- 5 files changed, 77 insertions(+), 19 deletions(-) create mode 100644 benchmarks/benchmark_t2i_lcm_lora.py diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 3190091ea3c8..7713e0aef111 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -36,7 +36,7 @@ jobs: run: | apt-get update && apt-get install libsndfile1-dev libgl1 -y python -m pip install -e .[quality,test] - python -m pip install pandas + python -m pip install peft pandas - name: Environment run: | python utils/print_env.py diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 450befeaeebb..6d0da4a98450 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -8,6 +8,7 @@ AutoPipelineForInpainting, AutoPipelineForText2Image, ControlNetModel, + LCMScheduler, StableDiffusionAdapterPipeline, StableDiffusionControlNetPipeline, StableDiffusionXLAdapterPipeline, @@ -165,6 +166,24 @@ def run_inference(self, pipe, args): ) +class LCMLoRATextToImageBenchmark(TextToImageBenchmark): + lora_id = "latent-consistency/lcm-lora-sdxl" + + def __init__(self, args): + super().__init__(args) + self.pipe.load_lora_weights(self.lora_id) + self.pipe.scheduler = LCMScheduler.from_config(self.pipe.scheduler.config) + + def run_inference(self, pipe, args): + _ = pipe( + prompt=PROMPT, + image=self.image, + num_inference_steps=args.num_inference_steps, + num_images_per_prompt=args.batch_size, + guidance_scale=1.0, + ) + + class InpaintingBenchmark(ImageToImageBenchmark): pipeline_class = AutoPipelineForInpainting mask_url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/overture-creations-5sI6fQgYIuo_mask.png" diff --git a/benchmarks/benchmark_t2i_lcm_lora.py b/benchmarks/benchmark_t2i_lcm_lora.py new file mode 100644 index 000000000000..e68a6213fe5c --- /dev/null +++ b/benchmarks/benchmark_t2i_lcm_lora.py @@ -0,0 +1,23 @@ +import argparse +import sys + + +sys.path.append(".") +from base_classes import LCMLoRATextToImageBenchmark # noqa: E402 + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--ckpt", + type=str, + default="stabilityai/stable-diffusion-xl-base-1.0", + ) + parser.add_argument("--batch_size", type=int, default=1) + parser.add_argument("--num_inference_steps", type=int, default=3) + parser.add_argument("--model_cpu_offload", action="store_true") + parser.add_argument("--run_compile", action="store_true") + args = parser.parse_args() + + benchmark_pipe = LCMLoRATextToImageBenchmark(args) + benchmark_pipe.benchmark(args) diff --git a/benchmarks/benchmark_text_to_image.py b/benchmarks/benchmark_text_to_image.py index 6d01ac558535..caa97b0c5e3b 100644 --- a/benchmarks/benchmark_text_to_image.py +++ b/benchmarks/benchmark_text_to_image.py @@ -6,20 +6,23 @@ from base_classes import TextToImageBenchmark, TurboTextToImageBenchmark 
# noqa: E402 +ALL_T2I_CKPTS = [ + "runwayml/stable-diffusion-v1-5", + "segmind/SSD-1B", + "stabilityai/stable-diffusion-xl-base-1.0", + "kandinsky-community/kandinsky-2-2-decoder", + "warp-ai/wuerstchen", + "stabilityai/sdxl-turbo", +] + + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( "--ckpt", type=str, default="runwayml/stable-diffusion-v1-5", - choices=[ - "runwayml/stable-diffusion-v1-5", - "segmind/SSD-1B", - "stabilityai/stable-diffusion-xl-base-1.0", - "kandinsky-community/kandinsky-2-2-decoder", - "warp-ai/wuerstchen", - "stabilityai/sdxl-turbo", - ], + choices=ALL_T2I_CKPTS, ) parser.add_argument("--batch_size", type=int, default=1) parser.add_argument("--num_inference_steps", type=int, default=50) @@ -27,5 +30,11 @@ parser.add_argument("--run_compile", action="store_true") args = parser.parse_args() - benchmark_pipe = TextToImageBenchmark(args) if "turbo" not in args.ckpt else TurboTextToImageBenchmark(args) + benchmark_cls = None + if "turbo" in args.ckpt: + benchmark_cls = TurboTextToImageBenchmark + else: + benchmark_cls = TextToImageBenchmark + + benchmark_pipe = benchmark_cls(args) benchmark_pipe.benchmark(args) diff --git a/benchmarks/run_all.py b/benchmarks/run_all.py index 2da592224766..685f49e2d1c6 100644 --- a/benchmarks/run_all.py +++ b/benchmarks/run_all.py @@ -1,8 +1,13 @@ import glob import subprocess +import sys from typing import List +sys.path.append(".") +from benchmark_text_to_image import ALL_T2I_CKPTS # noqa: E402 + + PATTERN = "benchmark_*.py" @@ -34,6 +39,7 @@ def main(): for file in python_files: print(f"****** Running file: {file} ******") + # Run with canonical settings. if file != "benchmark_text_to_image.py": command = f"python {file}" run_command(command.split()) @@ -42,14 +48,12 @@ def main(): run_command(command.split()) if file == "benchmark_text_to_image.py": - for ckpt in [ - "runwayml/stable-diffusion-v1-5", - "segmind/SSD-1B", - "stabilityai/stable-diffusion-xl-base-1.0", - "kandinsky-community/kandinsky-2-2-decoder", - "warp-ai/wuerstchen", - ]: + for ckpt in ALL_T2I_CKPTS: command = f"python {file} --ckpt {ckpt}" + + if "turbo" in ckpt: + command += "--num_inference_steps 1" + run_command(command.split()) command += " --run_compile" @@ -58,14 +62,17 @@ def main(): elif file == "benchmark_sd_img.py": for ckpt in ["stabilityai/stable-diffusion-xl-refiner-1.0", "stabilityai/sdxl-turbo"]: command = f"python {file} --ckpt {ckpt}" - run_command(command.split()) + if ckpt == "stabilityai/sdxl-turbo": + command += "--num_inference_steps 1" + + run_command(command.split()) command += " --run_compile" run_command(command.split()) elif file == "benchmark_sd_inpainting.py": sdxl_ckpt = "stabilityai/stable-diffusion-xl-base-1.0" - command = f"python {file} --ckpt {ckpt}" + command = f"python {file} --ckpt {sdxl_ckpt}" run_command(command.split()) command += " --run_compile" From 2588853787854870920086c75106f555c8d29954 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 8 Dec 2023 00:07:55 +0530 Subject: [PATCH 89/99] fix --- benchmarks/benchmark_t2i_lcm_lora.py | 2 +- benchmarks/run_all.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/benchmark_t2i_lcm_lora.py b/benchmarks/benchmark_t2i_lcm_lora.py index e68a6213fe5c..957e0a463e28 100644 --- a/benchmarks/benchmark_t2i_lcm_lora.py +++ b/benchmarks/benchmark_t2i_lcm_lora.py @@ -14,7 +14,7 @@ default="stabilityai/stable-diffusion-xl-base-1.0", ) parser.add_argument("--batch_size", type=int, default=1) - 
parser.add_argument("--num_inference_steps", type=int, default=3) + parser.add_argument("--num_inference_steps", type=int, default=4) parser.add_argument("--model_cpu_offload", action="store_true") parser.add_argument("--run_compile", action="store_true") args = parser.parse_args() diff --git a/benchmarks/run_all.py b/benchmarks/run_all.py index 685f49e2d1c6..003f3c4a8a01 100644 --- a/benchmarks/run_all.py +++ b/benchmarks/run_all.py @@ -52,7 +52,7 @@ def main(): command = f"python {file} --ckpt {ckpt}" if "turbo" in ckpt: - command += "--num_inference_steps 1" + command += " --num_inference_steps 1" run_command(command.split()) @@ -64,7 +64,7 @@ def main(): command = f"python {file} --ckpt {ckpt}" if ckpt == "stabilityai/sdxl-turbo": - command += "--num_inference_steps 1" + command += " --num_inference_steps 1" run_command(command.split()) command += " --run_compile" From a7fd2c345fc73c95e518d7598d2c05f54d037a9e Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 8 Dec 2023 00:44:29 +0530 Subject: [PATCH 90/99] increase steps to 2 when running turbo i2i --- benchmarks/run_all.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/run_all.py b/benchmarks/run_all.py index 003f3c4a8a01..af78bd738376 100644 --- a/benchmarks/run_all.py +++ b/benchmarks/run_all.py @@ -64,7 +64,7 @@ def main(): command = f"python {file} --ckpt {ckpt}" if ckpt == "stabilityai/sdxl-turbo": - command += " --num_inference_steps 1" + command += " --num_inference_steps 2" run_command(command.split()) command += " --run_compile" From b878a29fad0d1898183d527d728d0826c829ebf3 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 8 Dec 2023 08:25:44 +0530 Subject: [PATCH 91/99] debug --- benchmarks/base_classes.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 6d0da4a98450..979fd211c5c0 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -153,10 +153,13 @@ def run_inference(self, pipe, args): class TurboImageToImageBenchmark(ImageToImageBenchmark): + image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cat.png") + def __init__(self, args): super().__init__(args) def run_inference(self, pipe, args): + print(f"Image size: {self.image.size}") _ = pipe( prompt=PROMPT, image=self.image, From 1389d0e922ba0362c1430ff124037e8126e6cc34 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 8 Dec 2023 08:27:52 +0530 Subject: [PATCH 92/99] debug --- benchmarks/base_classes.py | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 979fd211c5c0..f954f03d809a 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -157,6 +157,7 @@ class TurboImageToImageBenchmark(ImageToImageBenchmark): def __init__(self, args): super().__init__(args) + self.pipe = AutoPipelineForImage2Image.from_pretrained(args.ckpt, torch_dtype=torch.float16).to("cuda") def run_inference(self, pipe, args): print(f"Image size: {self.image.size}") From b2d35be32b18291a66baf95a81010ee0135d9384 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 8 Dec 2023 08:29:49 +0530 Subject: [PATCH 93/99] debug --- benchmarks/base_classes.py | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index f954f03d809a..32a02d7cd5d1 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -167,6 +167,7 @@ def run_inference(self, pipe, args): 
num_inference_steps=args.num_inference_steps, num_images_per_prompt=args.batch_size, guidance_scale=0.0, + strength=0.5 ) From d78609d0f7cdc1c2ebd1c0440ccd469ac5c1abef Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 8 Dec 2023 08:32:15 +0530 Subject: [PATCH 94/99] fix for good --- benchmarks/base_classes.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 32a02d7cd5d1..7203e67e285e 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -104,7 +104,7 @@ def run_inference(self, pipe, args): def benchmark(self, args): flush() - print(f"Running benchmark with: {vars(args)}\n") + print(f"[INFO] {self.pipe.__class__.__name__}: Running benchmark with: {vars(args)}\n") time = benchmark_fn(self.run_inference, self.pipe, args) # in seconds. memory = bytes_to_giga_bytes(torch.cuda.max_memory_allocated()) # in GBs. @@ -153,21 +153,17 @@ def run_inference(self, pipe, args): class TurboImageToImageBenchmark(ImageToImageBenchmark): - image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cat.png") - def __init__(self, args): super().__init__(args) - self.pipe = AutoPipelineForImage2Image.from_pretrained(args.ckpt, torch_dtype=torch.float16).to("cuda") def run_inference(self, pipe, args): - print(f"Image size: {self.image.size}") _ = pipe( prompt=PROMPT, image=self.image, num_inference_steps=args.num_inference_steps, num_images_per_prompt=args.batch_size, guidance_scale=0.0, - strength=0.5 + strength=0.5, ) From b3897f8572d0e3c85f114406b977ac15654b373b Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 8 Dec 2023 10:06:53 +0530 Subject: [PATCH 95/99] fix and isolate better --- benchmarks/base_classes.py | 1 - benchmarks/run_all.py | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 7203e67e285e..86b7d73e0f98 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -178,7 +178,6 @@ def __init__(self, args): def run_inference(self, pipe, args): _ = pipe( prompt=PROMPT, - image=self.image, num_inference_steps=args.num_inference_steps, num_images_per_prompt=args.batch_size, guidance_scale=1.0, diff --git a/benchmarks/run_all.py b/benchmarks/run_all.py index af78bd738376..c70fb2227383 100644 --- a/benchmarks/run_all.py +++ b/benchmarks/run_all.py @@ -47,6 +47,8 @@ def main(): command += " --run_compile" run_command(command.split()) + # Run variants. 
+ for file in python_files: if file == "benchmark_text_to_image.py": for ckpt in ALL_T2I_CKPTS: command = f"python {file} --ckpt {ckpt}" From 8289baa8690811d614d9800be00e537941ead355 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 8 Dec 2023 16:02:52 +0530 Subject: [PATCH 96/99] fuse lora so that torch compile works with peft --- benchmarks/base_classes.py | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 86b7d73e0f98..3593c2d071b2 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -173,6 +173,7 @@ class LCMLoRATextToImageBenchmark(TextToImageBenchmark): def __init__(self, args): super().__init__(args) self.pipe.load_lora_weights(self.lora_id) + self.pipe.fuse_lora() self.pipe.scheduler = LCMScheduler.from_config(self.pipe.scheduler.config) def run_inference(self, pipe, args): From dd54366c8a6e304a591e03bb9d0212fb1d2479a8 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 8 Dec 2023 16:31:02 +0530 Subject: [PATCH 97/99] fix: LCMLoRA --- .github/workflows/benchmark.yml | 2 +- benchmarks/base_classes.py | 32 ++++++++++++++++---------------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 7713e0aef111..3190091ea3c8 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -36,7 +36,7 @@ jobs: run: | apt-get update && apt-get install libsndfile1-dev libgl1 -y python -m pip install -e .[quality,test] - python -m pip install peft pandas + python -m pip install pandas - name: Environment run: | python utils/print_env.py diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 3593c2d071b2..1e9529db4e96 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -134,27 +134,32 @@ def run_inference(self, pipe, args): ) -class ImageToImageBenchmark(TextToImageBenchmark): - pipeline_class = AutoPipelineForImage2Image - url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/1665_Girl_with_a_Pearl_Earring.jpg" - image = load_image(url).convert("RGB") +class LCMLoRATextToImageBenchmark(TextToImageBenchmark): + lora_id = "latent-consistency/lcm-lora-sdxl" def __init__(self, args): super().__init__(args) - self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt]) + self.pipe.load_lora_weights(self.lora_id) + self.pipe.fuse_lora() + self.pipe.scheduler = LCMScheduler.from_config(self.pipe.scheduler.config) def run_inference(self, pipe, args): _ = pipe( prompt=PROMPT, - image=self.image, num_inference_steps=args.num_inference_steps, num_images_per_prompt=args.batch_size, + guidance_scale=1.0, ) -class TurboImageToImageBenchmark(ImageToImageBenchmark): +class ImageToImageBenchmark(TextToImageBenchmark): + pipeline_class = AutoPipelineForImage2Image + url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/1665_Girl_with_a_Pearl_Earring.jpg" + image = load_image(url).convert("RGB") + def __init__(self, args): super().__init__(args) + self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt]) def run_inference(self, pipe, args): _ = pipe( @@ -162,26 +167,21 @@ def run_inference(self, pipe, args): image=self.image, num_inference_steps=args.num_inference_steps, num_images_per_prompt=args.batch_size, - guidance_scale=0.0, - strength=0.5, ) -class LCMLoRATextToImageBenchmark(TextToImageBenchmark): - lora_id = "latent-consistency/lcm-lora-sdxl" - +class TurboImageToImageBenchmark(ImageToImageBenchmark): def 
__init__(self, args): super().__init__(args) - self.pipe.load_lora_weights(self.lora_id) - self.pipe.fuse_lora() - self.pipe.scheduler = LCMScheduler.from_config(self.pipe.scheduler.config) def run_inference(self, pipe, args): _ = pipe( prompt=PROMPT, + image=self.image, num_inference_steps=args.num_inference_steps, num_images_per_prompt=args.batch_size, - guidance_scale=1.0, + guidance_scale=0.0, + strength=0.5, ) From 51acace44f912bbb1325ab3c8f48e96c91087442 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 8 Dec 2023 17:36:28 +0530 Subject: [PATCH 98/99] better identification for LCM --- benchmarks/base_classes.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py index 1e9529db4e96..5d328f62b904 100644 --- a/benchmarks/base_classes.py +++ b/benchmarks/base_classes.py @@ -143,6 +143,17 @@ def __init__(self, args): self.pipe.fuse_lora() self.pipe.scheduler = LCMScheduler.from_config(self.pipe.scheduler.config) + def get_result_filepath(self, args): + pipeline_class_name = str(self.pipe.__class__.__name__) + name = ( + self.lora_id.replace("/", "_") + + "_" + + pipeline_class_name + + f"-bs@{args.batch_size}-steps@{args.num_inference_steps}-mco@{args.model_cpu_offload}-compile@{args.run_compile}.csv" + ) + filepath = os.path.join(BASE_PATH, name) + return filepath + def run_inference(self, pipe, args): _ = pipe( prompt=PROMPT, From 80e83110bed0636760e45ad2277095c1d5039a3c Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Sat, 9 Dec 2023 09:40:23 +0530 Subject: [PATCH 99/99] change to cron job --- .github/workflows/benchmark.yml | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 3190091ea3c8..c4c3c101dbfd 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -1,12 +1,8 @@ name: Benchmarking tests on: - pull_request: - branches: - - main - push: - branches: - - ci-* + schedule: + - cron: "30 1 1,15 * *" # every 2 weeks on the 1st and the 15th of every month at 1:30 AM env: DIFFUSERS_IS_CI: yes @@ -42,7 +38,7 @@ jobs: python utils/print_env.py - name: Diffusers Benchmarking env: - HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + HUGGING_FACE_HUB_TOKEN: ${{ secrets.DIFFUSERS_BOT_TOKEN }} BASE_PATH: benchmark_outputs run: | export TOTAL_GPU_MEMORY=$(python -c "import torch; print(torch.cuda.get_device_properties(0).total_memory / (1024**3))")