
[feat: Benchmarking Workflow] add stuff for a benchmarking workflow #5839

Merged
118 commits from feat/ci-benchmarking merged into main on Dec 12, 2023
Commits
118 commits
4c185f0
add poc for benchmarking workflow.
sayakpaul Nov 17, 2023
945ab17
import
sayakpaul Nov 17, 2023
b4debda
fix argument
sayakpaul Nov 17, 2023
22966a1
fix: argument
sayakpaul Nov 17, 2023
12424a3
fix: path
sayakpaul Nov 17, 2023
122d5d9
fix
sayakpaul Nov 17, 2023
c20d254
fix
sayakpaul Nov 17, 2023
43544ed
path
sayakpaul Nov 17, 2023
ada65b3
Merge branch 'main' into feat/ci-benchmarking
sayakpaul Nov 27, 2023
f59e315
Merge branch 'main' into feat/ci-benchmarking
sayakpaul Nov 28, 2023
3c05e41
output csv files.
sayakpaul Nov 28, 2023
24b68fd
workflow cleanup
sayakpaul Nov 28, 2023
8e2088e
append token
sayakpaul Nov 28, 2023
01584c7
add utility to push to hf dataset
sayakpaul Nov 28, 2023
853035b
fix: kw arg
sayakpaul Nov 28, 2023
46aaf96
better reporting
sayakpaul Nov 28, 2023
d626eef
fix: headers
sayakpaul Nov 28, 2023
ab12fe6
better formatting of the numbers.
sayakpaul Nov 28, 2023
1bb531e
better type annotation
sayakpaul Nov 28, 2023
2df4aba
fix: formatting
sayakpaul Nov 28, 2023
939fe5c
moentarily disable check
sayakpaul Nov 28, 2023
3a18e29
push results.
sayakpaul Nov 28, 2023
71279b6
remove disable check
sayakpaul Nov 28, 2023
ea1e57e
Merge branch 'main' into feat/ci-benchmarking
sayakpaul Nov 29, 2023
3c8cc38
introduce base classes.
sayakpaul Nov 29, 2023
9683cd7
img2img class
sayakpaul Nov 29, 2023
274b9e1
add inpainting pipeline
sayakpaul Nov 29, 2023
2b5b8ae
intoduce base benchmark class.
sayakpaul Nov 29, 2023
0b54a6a
Merge branch 'main' into feat/ci-benchmarking
sayakpaul Nov 30, 2023
66b159a
add img2img and inpainting
sayakpaul Nov 30, 2023
01addbd
feat: utility to compare changes
sayakpaul Dec 1, 2023
c30cab6
fix
sayakpaul Dec 1, 2023
689b9f7
fix import
sayakpaul Dec 1, 2023
d046a25
add args
sayakpaul Dec 1, 2023
71f6bd9
basepath
sayakpaul Dec 1, 2023
295cf30
better exception handling
sayakpaul Dec 1, 2023
b5e2371
better path handling
sayakpaul Dec 1, 2023
e7aed9e
fix
sayakpaul Dec 1, 2023
8eb8baf
fix
sayakpaul Dec 1, 2023
aac35c3
Merge branch 'main' into feat/ci-benchmarking
sayakpaul Dec 1, 2023
3cb02f8
remove
sayakpaul Dec 1, 2023
60c980c
ifx
sayakpaul Dec 1, 2023
1e7db92
Merge branch 'main' into feat/ci-benchmarking
patrickvonplaten Dec 1, 2023
cd91b62
fix
sayakpaul Dec 1, 2023
38b8708
Merge branch 'main' into feat/ci-benchmarking
sayakpaul Dec 2, 2023
aeefb55
Merge branch 'main' into feat/ci-benchmarking
sayakpaul Dec 4, 2023
1782d5a
add: support for controlnet.
sayakpaul Dec 4, 2023
df5dead
image_url -> url
sayakpaul Dec 4, 2023
c6c545c
move images to huggingface hub
sayakpaul Dec 4, 2023
b358c87
correct urls.
sayakpaul Dec 4, 2023
93b491b
root_ckpt
sayakpaul Dec 4, 2023
131bfce
Merge branch 'main' into feat/ci-benchmarking
sayakpaul Dec 4, 2023
13a86dc
Merge branch 'main' into feat/ci-benchmarking
sayakpaul Dec 4, 2023
748f6dc
flush before benchmarking
sayakpaul Dec 4, 2023
5d5d5fd
don't install accelerate from source
sayakpaul Dec 4, 2023
4651082
add runner
sayakpaul Dec 4, 2023
8e80579
simplify Diffusers Benchmarking step
sayakpaul Dec 4, 2023
d49ad65
change runner
sayakpaul Dec 4, 2023
7c7846b
fix: subprocess call.
sayakpaul Dec 4, 2023
5dbcbf5
filter percentage values
sayakpaul Dec 4, 2023
cb8572a
fix controlnet benchmark
sayakpaul Dec 4, 2023
6dec96c
add t2i adapters.
sayakpaul Dec 4, 2023
86d597f
fix filter columns
sayakpaul Dec 4, 2023
fa7bfe1
fix t2i adapter benchmark
sayakpaul Dec 4, 2023
59df524
fix init.
sayakpaul Dec 4, 2023
3cd0f59
fix
sayakpaul Dec 4, 2023
8583db8
remove safetensors flag
sayakpaul Dec 4, 2023
6b9bf4a
fix args print
sayakpaul Dec 4, 2023
38160f1
fix
sayakpaul Dec 4, 2023
e6116b0
feat: run_command
sayakpaul Dec 4, 2023
d98fbe1
add adapter resolution mapping
sayakpaul Dec 4, 2023
c93278d
benchmark t2i adapter fix.
sayakpaul Dec 4, 2023
924096f
fix adapter input
sayakpaul Dec 4, 2023
628591d
fix
sayakpaul Dec 4, 2023
0f4ae4e
convert to L.
sayakpaul Dec 4, 2023
de739fa
add flush() add appropriate places
sayakpaul Dec 4, 2023
cb9f9c6
better filtering
sayakpaul Dec 4, 2023
d7aee28
okay
sayakpaul Dec 4, 2023
385ffbb
get env for torch
sayakpaul Dec 4, 2023
611ae13
convert to float
sayakpaul Dec 4, 2023
b3a91d8
fix
sayakpaul Dec 4, 2023
e55913e
filter out nans.
sayakpaul Dec 5, 2023
dc3063a
better coment
sayakpaul Dec 5, 2023
63aee79
sdxl
sayakpaul Dec 5, 2023
9a9d5ea
sdxl for other benchmarks.
sayakpaul Dec 5, 2023
3d66747
Merge branch 'main' into feat/ci-benchmarking
sayakpaul Dec 5, 2023
c8f6eef
fix: condition
sayakpaul Dec 5, 2023
4a67437
fix: condition for inpainting
sayakpaul Dec 5, 2023
e94b895
Merge branch 'main' into feat/ci-benchmarking
sayakpaul Dec 5, 2023
eedf218
fix: mapping for resolution
sayakpaul Dec 5, 2023
e300038
fix
sayakpaul Dec 5, 2023
60614f5
include kandinsky and wuerstchen
sayakpaul Dec 5, 2023
b394168
fix: Wuerstchen
sayakpaul Dec 5, 2023
70f3556
Merge branch 'main' into feat/ci-benchmarking
sayakpaul Dec 5, 2023
b7eb3fb
Empty-Commit
sayakpaul Dec 5, 2023
63a61bd
Merge branch 'main' into feat/ci-benchmarking
DN6 Dec 7, 2023
821726d
[Community] AnimateDiff + Controlnet Pipeline (#5928)
a-r-r-o-w Dec 7, 2023
3dc2362
EulerDiscreteScheduler add `rescale_betas_zero_snr` (#6024)
Beinsezii Dec 7, 2023
26a8c00
Revert "[Community] AnimateDiff + Controlnet Pipeline (#5928)"
sayakpaul Dec 7, 2023
8db59d7
Revert "EulerDiscreteScheduler add `rescale_betas_zero_snr` (#6024)"
sayakpaul Dec 7, 2023
f76ba5b
Merge branch 'main' into feat/ci-benchmarking
sayakpaul Dec 7, 2023
4e7fb4d
add SDXL turbo
sayakpaul Dec 7, 2023
e2df761
add lcm lora to the mix as well.
sayakpaul Dec 7, 2023
2588853
fix
sayakpaul Dec 7, 2023
81d56de
Merge branch 'main' into feat/ci-benchmarking
sayakpaul Dec 7, 2023
a7fd2c3
increase steps to 2 when running turbo i2i
sayakpaul Dec 7, 2023
191ebf6
Merge branch 'main' into feat/ci-benchmarking
sayakpaul Dec 8, 2023
b878a29
debug
sayakpaul Dec 8, 2023
1389d0e
debug
sayakpaul Dec 8, 2023
b2d35be
debug
sayakpaul Dec 8, 2023
d78609d
fix for good
sayakpaul Dec 8, 2023
b3897f8
fix and isolate better
sayakpaul Dec 8, 2023
8289baa
fuse lora so that torch compile works with peft
sayakpaul Dec 8, 2023
dd54366
fix: LCMLoRA
sayakpaul Dec 8, 2023
d6966b4
Merge branch 'main' into feat/ci-benchmarking
sayakpaul Dec 8, 2023
51acace
better identification for LCM
sayakpaul Dec 8, 2023
65b97e8
Merge branch 'main' into feat/ci-benchmarking
sayakpaul Dec 9, 2023
80e8311
change to cron job
sayakpaul Dec 9, 2023
61 changes: 61 additions & 0 deletions .github/workflows/benchmark.yml
@@ -0,0 +1,61 @@
name: Benchmarking tests

on:
  pull_request:
    branches:
      - main
  push:
    branches:
      - ci-*
sayakpaul (Member Author):
Should ideally be done weekly or bi-weekly and only on main. For testing, I am running it on PRs.

Collaborator:
I think bi-weekly (twice a month) works here. Or even monthly is a good cadence.

sayakpaul (Member Author), Nov 28, 2023:
Yup. Will keep it to bi-weekly. Will incorporate that change once we're done with how we want to report the benchmarks and the pipelines we want to benchmark.

Collaborator:
Should change this to a cron job.

sayakpaul (Member Author):
Yes, this will be changed.

Collaborator:
Change to cron job.
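The thread above settles on moving the trigger to a schedule. A minimal sketch of what the cron trigger could look like — the bi-weekly cadence and the exact times below are assumptions based on the discussion, not the PR's final configuration:

```yaml
on:
  schedule:
    # Hypothetical cadence: 05:00 UTC on the 1st and 15th of each month
    # (roughly bi-weekly, as discussed in the thread above).
    - cron: "0 5 1,15 * *"
  workflow_dispatch: {}
```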


env:
  DIFFUSERS_IS_CI: yes
  HF_HOME: /mnt/cache
  OMP_NUM_THREADS: 8
  MKL_NUM_THREADS: 8
  PYTEST_TIMEOUT: 600
  RUN_SLOW: yes
  PIPELINE_USAGE_CUTOFF: 50000

jobs:
  torch_pipelines_cuda_benchmark_tests:
    name: Torch Core Pipelines CUDA Benchmarking Tests
    strategy:
      fail-fast: false
      max-parallel: 1
    runs-on: docker-gpu
    container:
      image: diffusers/diffusers-pytorch-cuda
      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
    steps:
      - name: Checkout diffusers
        uses: actions/checkout@v3
        with:
          fetch-depth: 2
      - name: NVIDIA-SMI
        run: |
          nvidia-smi
      - name: Install dependencies
        run: |
          apt-get update && apt-get install libsndfile1-dev libgl1 -y
          python -m pip install -e .[quality,test]
          python -m pip install git+https://github.com/huggingface/accelerate.git
          mkdir benchmark_outputs
      - name: Environment
        run: |
          python utils/print_env.py
      - name: Stable Diffusion Benchmarking Tests
        env:
          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
        run: |
          cd benchmarks && python benchmark_sd.py && \
          python benchmark_sd.py --batch_size 4 && \
          python benchmark_sd.py --run_compile && \
          python benchmark_sd.py --batch_size 4 --run_compile

      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
          name: benchmark_test_reports
          path: benchmark_outputs
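The Stable Diffusion step above runs the script over a small matrix: batch size 1 and 4, each with and without torch.compile. A hedged sketch of generating that command matrix programmatically — the `benchmark_commands` helper is hypothetical, not part of this PR:

```python
from itertools import product


def benchmark_commands(script="benchmark_sd.py"):
    """Build the four benchmark invocations used in the workflow step."""
    cmds = []
    for batch_size, run_compile in product([1, 4], [False, True]):
        cmd = f"python {script}"
        if batch_size != 1:
            cmd += f" --batch_size {batch_size}"
        if run_compile:
            cmd += " --run_compile"
        cmds.append(cmd)
    return cmds


for cmd in benchmark_commands():
    print(cmd)
```

This mirrors how later commits in the PR ("feat: run_command", "fix: subprocess call.") move toward driving the scripts from a single runner.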
60 changes: 60 additions & 0 deletions benchmarks/benchmark_sd.py
@@ -0,0 +1,60 @@
import argparse
import os

import torch
from diffusers import DiffusionPipeline

# Absolute import: the script is run directly (`python benchmark_sd.py`),
# so a relative import would fail here.
from benchmark_utils import BenchmarkInfo, benchmark_fn, bytes_to_giga_bytes, generate_markdown_table

CKPT = "CompVis/stable-diffusion-v1-4"
PROMPT = "ghibli style, a fantasy landscape with castles"
BASE_PATH = "benchmark_outputs"


def load_pipeline(run_compile=False):
    pipe = DiffusionPipeline.from_pretrained(CKPT, torch_dtype=torch.float16, use_safetensors=True)
    pipe = pipe.to("cuda")

    if run_compile:
        pipe.unet.to(memory_format=torch.channels_last)
        print("Run torch compile")
        pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)

    pipe.set_progress_bar_config(disable=True)
    return pipe


def run_inference(pipe, args):
    _ = pipe(
        prompt=PROMPT,
        num_inference_steps=args.num_inference_steps,
        num_images_per_prompt=args.batch_size,
    )


def main(args):
    pipeline = load_pipeline(run_compile=args.run_compile)

    time = benchmark_fn(run_inference, pipeline, args)  # in seconds.
    memory = bytes_to_giga_bytes(torch.cuda.max_memory_allocated())  # in GBs.
    benchmark_info = BenchmarkInfo(time=time, memory=memory)

    return generate_markdown_table(pipeline_name=CKPT, args=args, benchmark_info=benchmark_info)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--batch_size", type=int, default=1)
    parser.add_argument("--num_inference_steps", type=int, default=50)
    parser.add_argument("--run_compile", action="store_true")
    args = parser.parse_args()
    markdown_report = main(args)

    # CKPT contains a "/", which would otherwise create a nested path.
    name = (
        CKPT.replace("/", "_")
        + f"-batch_size@{args.batch_size}-num_inference_steps@{args.num_inference_steps}-run_compile@{args.run_compile}"
    )
    filepath = os.path.join(BASE_PATH, name)
    with open(filepath, "w") as f:
        f.write(markdown_report)


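Note that CKPT contains a slash, so joining it into the report filename as-is would point into a non-existent subdirectory. A small self-contained sketch of a sanitized naming scheme — the `replace`-based sanitization and the `report_filepath` helper are my assumptions for illustration, not part of the original diff:

```python
import os

CKPT = "CompVis/stable-diffusion-v1-4"
BASE_PATH = "benchmark_outputs"


def report_filepath(batch_size: int, num_inference_steps: int, run_compile: bool) -> str:
    # Replace "/" so the checkpoint id stays a single path component.
    name = (
        CKPT.replace("/", "_")
        + f"-batch_size@{batch_size}-num_inference_steps@{num_inference_steps}-run_compile@{run_compile}"
    )
    return os.path.join(BASE_PATH, name)


print(report_filepath(4, 50, True))
```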
46 changes: 46 additions & 0 deletions benchmarks/benchmark_utils.py
@@ -0,0 +1,46 @@
import argparse
import gc
from dataclasses import dataclass

import torch
import torch.utils.benchmark as benchmark


@dataclass
class BenchmarkInfo:
    time: float
    memory: float


def flush():
    gc.collect()
    torch.cuda.empty_cache()


def bytes_to_giga_bytes(bytes):
    return bytes / 1024 / 1024 / 1024


# Adapted from
# https://pytorch.org/tutorials/intermediate/scaled_dot_product_attention_tutorial.html
def benchmark_fn(f, *args, **kwargs):
    t0 = benchmark.Timer(
        stmt="f(*args, **kwargs)", globals={"args": args, "kwargs": kwargs, "f": f}
    )
    # Return a float rounded to 3 decimals so it matches BenchmarkInfo.time.
    return float(f"{t0.blocked_autorange().mean:.3f}")


def generate_markdown_table(pipeline_name: str, args: argparse.Namespace, benchmark_info: BenchmarkInfo) -> str:
    headers = ["**Parameter**", "**Value**"]
    data = [
        ["Batch Size", args.batch_size],
        ["Number of Inference Steps", args.num_inference_steps],
        ["Run Compile", args.run_compile],
        ["Time (seconds)", benchmark_info.time],
        ["Memory (GBs)", benchmark_info.memory],
    ]

    # Format the table.
    markdown_table = f"## {pipeline_name}\n\n"
    markdown_table += "| " + " | ".join(headers) + " |\n"
    markdown_table += "|" + "|".join("---" for _ in headers) + "|\n"
    for row in data:
        markdown_table += "| " + " | ".join(str(item) for item in row) + " |\n"

    return markdown_table
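For a quick sanity check of the report format, here is a self-contained rerun of the table generation with made-up numbers — the function body mirrors generate_markdown_table in benchmark_utils.py, and the sample time/memory values are fabricated for illustration:

```python
import argparse
from dataclasses import dataclass


@dataclass
class BenchmarkInfo:
    time: float
    memory: float


def generate_markdown_table(pipeline_name, args, benchmark_info):
    headers = ["**Parameter**", "**Value**"]
    data = [
        ["Batch Size", args.batch_size],
        ["Number of Inference Steps", args.num_inference_steps],
        ["Run Compile", args.run_compile],
        ["Time (seconds)", benchmark_info.time],
        ["Memory (GBs)", benchmark_info.memory],
    ]
    table = f"## {pipeline_name}\n\n"
    table += "| " + " | ".join(headers) + " |\n"
    table += "|" + "|".join("---" for _ in headers) + "|\n"
    for row in data:
        table += "| " + " | ".join(str(item) for item in row) + " |\n"
    return table


args = argparse.Namespace(batch_size=4, num_inference_steps=50, run_compile=True)
info = BenchmarkInfo(time=2.345, memory=6.789)  # fabricated sample numbers
report = generate_markdown_table("CompVis/stable-diffusion-v1-4", args, info)
print(report)
```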