Merge branch 'main' into test-cache

Signed-off-by: Adrien <[email protected]>
huggingface · Jul 31, 2024 · 1972b80 · 1972b80
2 parents 9e638d8 + ea1b4ea
commit 1972b80
Show file tree

Hide file tree

Showing 51 changed files with 5,588 additions and 48 deletions.
diff --git a/.github/workflows/nightly_tests.yml b/.github/workflows/nightly_tests.yml
@@ -56,7 +56,8 @@ jobs:
       max-parallel: 8
       matrix:
         module: ${{ fromJson(needs.setup_torch_cuda_pipeline_matrix.outputs.pipeline_test_matrix) }}
-    runs-on: [single-gpu, nvidia-gpu, t4, ci]
+    runs-on:
+      group: aws-g4dn-2xlarge
     container:
       image: diffusers/diffusers-pytorch-cuda
       options: --shm-size "16gb" --ipc host --gpus 0
@@ -106,7 +107,8 @@ jobs:
 
   run_nightly_tests_for_other_torch_modules:
     name: Nightly Torch CUDA Tests
-    runs-on: [single-gpu, nvidia-gpu, t4, ci]
+    runs-on:
+      group: aws-g4dn-2xlarge
     container:
       image: diffusers/diffusers-pytorch-cuda
       options: --shm-size "16gb" --ipc host --gpus 0
@@ -235,7 +237,8 @@ jobs:
 
   run_nightly_onnx_tests:
     name: Nightly ONNXRuntime CUDA tests on Ubuntu
-    runs-on: [single-gpu, nvidia-gpu, t4, ci]
+    runs-on:
+      group: aws-g4dn-2xlarge
     container:
       image: diffusers/diffusers-onnxruntime-cuda
       options: --gpus 0 --shm-size "16gb" --ipc host

diff --git a/.github/workflows/pr_test_fetcher.yml b/.github/workflows/pr_test_fetcher.yml
@@ -130,7 +130,8 @@ jobs:
             report: torch_hub
 
     name: ${{ matrix.config.name }}
-    runs-on: ${{ matrix.config.runner }}
+    runs-on:
+      group: ${{ matrix.config.runner }}
     container:
       image: ${{ matrix.config.image }}
       options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/

diff --git a/.github/workflows/pr_tests.yml b/.github/workflows/pr_tests.yml
@@ -77,7 +77,7 @@ jobs:
         config:
           - name: Fast PyTorch Pipeline CPU tests
             framework: pytorch_pipelines
-            runner: [ self-hosted, intel-cpu, 32-cpu, 256-ram, ci ]
+            runner: aws-highmemory-32-plus
             image: diffusers/diffusers-pytorch-cpu
             report: torch_cpu_pipelines
           - name: Fast PyTorch Models & Schedulers CPU tests
@@ -98,7 +98,8 @@ jobs:
 
     name: ${{ matrix.config.name }}
 
-    runs-on: ${{ matrix.config.runner }}
+    runs-on:
+      group: ${{ matrix.config.runner }}
 
     container:
       image: ${{ matrix.config.image }}

diff --git a/.github/workflows/push_tests.yml b/.github/workflows/push_tests.yml
@@ -58,7 +58,8 @@ jobs:
       max-parallel: 8
       matrix:
         module: ${{ fromJson(needs.setup_torch_cuda_pipeline_matrix.outputs.pipeline_test_matrix) }}
-    runs-on: [single-gpu, nvidia-gpu, t4, ci]
+    runs-on:
+      group: aws-g4dn-2xlarge
     container:
       image: diffusers/diffusers-pytorch-cuda
       options: --shm-size "16gb" --ipc host --gpus 0
@@ -102,7 +103,8 @@ jobs:
 
   torch_cuda_tests:
     name: Torch CUDA Tests
-    runs-on: [single-gpu, nvidia-gpu, t4, ci]
+    runs-on:
+      group: aws-g4dn-2xlarge
     container:
       image: diffusers/diffusers-pytorch-cuda
       options: --shm-size "16gb" --ipc host --gpus 0
@@ -202,7 +204,8 @@ jobs:
 
   onnx_cuda_tests:
     name: ONNX CUDA Tests
-    runs-on: [single-gpu, nvidia-gpu, t4, ci]
+    runs-on:
+      group: aws-g4dn-2xlarge
     container:
       image: diffusers/diffusers-onnxruntime-cuda
       options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ --gpus 0
@@ -250,7 +253,8 @@ jobs:
   run_torch_compile_tests:
     name: PyTorch Compile CUDA tests
 
-    runs-on: [single-gpu, nvidia-gpu, t4, ci]
+    runs-on:
+      group: aws-g4dn-2xlarge
 
     container:
       image: diffusers/diffusers-pytorch-compile-cuda
@@ -292,7 +296,8 @@ jobs:
   run_xformers_tests:
     name: PyTorch xformers CUDA tests
 
-    runs-on: [single-gpu, nvidia-gpu, t4, ci]
+    runs-on:
+      group: aws-g4dn-2xlarge
 
     container:
       image: diffusers/diffusers-pytorch-xformers-cuda
@@ -333,7 +338,8 @@ jobs:
   run_examples_tests:
     name: Examples PyTorch CUDA tests on Ubuntu
 
-    runs-on: [single-gpu, nvidia-gpu, t4, ci]
+    runs-on:
+      group: aws-g4dn-2xlarge
 
     container:
       image: diffusers/diffusers-pytorch-cuda

diff --git a/.github/workflows/push_tests_fast.yml b/.github/workflows/push_tests_fast.yml
@@ -50,7 +50,8 @@ jobs:
 
     name: ${{ matrix.config.name }}
 
-    runs-on: ${{ matrix.config.runner }}
+    runs-on:
+      group: ${{ matrix.config.runner }}
 
     container:
       image: ${{ matrix.config.image }}

diff --git a/.github/workflows/run_tests_from_a_pr.yml b/.github/workflows/run_tests_from_a_pr.yml
@@ -26,7 +26,8 @@ env:
 jobs:
   run_tests:
     name: "Run a test on our runner from a PR"
-    runs-on: [single-gpu, nvidia-gpu, t4, ci]
+    runs-on:
+      group: aws-g4dn-2xlarge
     container:
       image: ${{ github.event.inputs.docker_image }}
       options: --gpus 0 --privileged --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -70,4 +71,4 @@ jobs:
         env:
             PY_TEST: ${{ github.event.inputs.test }}
         run: |
-          pytest "$PY_TEST"
+          pytest "$PY_TEST"
diff --git a/.github/workflows/ssh-pr-runner.yml b/.github/workflows/ssh-pr-runner.yml
@@ -19,7 +19,8 @@ env:
 jobs:
   ssh_runner:
     name: "SSH"
-    runs-on: [self-hosted, intel-cpu, 32-cpu, 256-ram, ci]
+    runs-on:
+      group: aws-highmemory-32-plus
     container:
       image: ${{ github.event.inputs.docker_image }}
       options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --privileged

diff --git a/.github/workflows/ssh-runner.yml b/.github/workflows/ssh-runner.yml
@@ -22,7 +22,8 @@ env:
 jobs:
   ssh_runner:
     name: "SSH"
-    runs-on: [single-gpu, nvidia-gpu, "${{ github.event.inputs.runner_type }}", ci]
+    runs-on:
+      group: "${{ github.event.inputs.runner_type }}"
     container:
       image: ${{ github.event.inputs.docker_image }}
       options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --gpus 0 --privileged

diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
@@ -239,6 +239,8 @@
       title: AsymmetricAutoencoderKL
     - local: api/models/autoencoder_tiny
       title: Tiny AutoEncoder
+    - local: api/models/autoencoder_oobleck
+      title: Oobleck AutoEncoder
     - local: api/models/consistency_decoder_vae
       title: ConsistencyDecoderVAE
     - local: api/models/transformer2d
@@ -259,6 +261,8 @@
       title: TransformerTemporalModel
     - local: api/models/sd3_transformer2d
       title: SD3Transformer2DModel
+    - local: api/models/stable_audio_transformer
+      title: StableAudioDiTModel
     - local: api/models/prior_transformer
       title: PriorTransformer
     - local: api/models/controlnet
@@ -362,6 +366,8 @@
       title: Semantic Guidance
     - local: api/pipelines/shap_e
       title: Shap-E
+    - local: api/pipelines/stable_audio
+      title: Stable Audio
     - local: api/pipelines/stable_cascade
       title: Stable Cascade
     - sections:
@@ -425,6 +431,8 @@
       title: CMStochasticIterativeScheduler
     - local: api/schedulers/consistency_decoder
       title: ConsistencyDecoderScheduler
+    - local: api/schedulers/cosine_dpm
+      title: CosineDPMSolverMultistepScheduler
     - local: api/schedulers/ddim_inverse
       title: DDIMInverseScheduler
     - local: api/schedulers/ddim

diff --git a/docs/source/en/api/models/autoencoder_oobleck.md b/docs/source/en/api/models/autoencoder_oobleck.md
@@ -0,0 +1,38 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# AutoencoderOobleck
+
+The Oobleck variational autoencoder (VAE) model with KL loss was introduced in [Stability-AI/stable-audio-tools](https://github.com/Stability-AI/stable-audio-tools) and [Stable Audio Open](https://huggingface.co/papers/2407.14358) by Stability AI. The model is used in 🤗 Diffusers to encode audio waveforms into latents and to decode latent representations into audio waveforms.
+
+The abstract from the paper is:
+
+*Open generative models are vitally important for the community, allowing for fine-tunes and serving as baselines when presenting new models. However, most current text-to-audio models are private and not accessible for artists and researchers to build upon. Here we describe the architecture and training process of a new open-weights text-to-audio model trained with Creative Commons data. Our evaluation shows that the model's performance is competitive with the state-of-the-art across various metrics. Notably, the reported FDopenl3 results (measuring the realism of the generations) showcase its potential for high-quality stereo sound synthesis at 44.1kHz.*
+
+## AutoencoderOobleck
+
+[[autodoc]] AutoencoderOobleck
+    - decode
+    - encode
+    - all
+
+## OobleckDecoderOutput
+
+[[autodoc]] models.autoencoders.autoencoder_oobleck.OobleckDecoderOutput
+
+## OobleckDecoderOutput
+
+[[autodoc]] models.autoencoders.autoencoder_oobleck.OobleckDecoderOutput
+
+## AutoencoderOobleckOutput
+
+[[autodoc]] models.autoencoders.autoencoder_oobleck.AutoencoderOobleckOutput
diff --git a/docs/source/en/api/models/stable_audio_transformer.md b/docs/source/en/api/models/stable_audio_transformer.md
@@ -0,0 +1,19 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# StableAudioDiTModel
+
+A Transformer model for audio waveforms from [Stable Audio Open](https://huggingface.co/papers/2407.14358).
+
+## StableAudioDiTModel
+
+[[autodoc]] StableAudioDiTModel
diff --git a/docs/source/en/api/pipelines/animatediff.md b/docs/source/en/api/pipelines/animatediff.md
@@ -25,6 +25,9 @@ The abstract of the paper is the following:
 | Pipeline | Tasks | Demo
 |---|---|:---:|
 | [AnimateDiffPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/animatediff/pipeline_animatediff.py) | *Text-to-Video Generation with AnimateDiff* |
+| [AnimateDiffControlNetPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py) | *Controlled Video-to-Video Generation with AnimateDiff using ControlNet* |
+| [AnimateDiffSparseControlNetPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py) | *Controlled Video-to-Video Generation with AnimateDiff using SparseCtrl* |
+| [AnimateDiffSDXLPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py) | *Video-to-Video Generation with AnimateDiff* |
 | [AnimateDiffVideoToVideoPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py) | *Video-to-Video Generation with AnimateDiff* |
 
 ## Available checkpoints
@@ -100,6 +103,83 @@ AnimateDiff tends to work better with finetuned Stable Diffusion models. If you
 
 </Tip>
 
+### AnimateDiffControlNetPipeline
+
+AnimateDiff can also be used with ControlNets. ControlNet was introduced in [Adding Conditional Control to Text-to-Image Diffusion Models](https://huggingface.co/papers/2302.05543) by Lvmin Zhang, Anyi Rao, and Maneesh Agrawala. With a ControlNet model, you can provide an additional control image to condition and control Stable Diffusion generation. For example, if you provide depth maps, the ControlNet model generates a video that'll preserve the spatial information from the depth maps. It is a more flexible and accurate way to control the video generation process.
+
+```python
+import torch
+from diffusers import AnimateDiffControlNetPipeline, AutoencoderKL, ControlNetModel, MotionAdapter, LCMScheduler
+from diffusers.utils import export_to_gif, load_video
+
+# Additionally, you will need to preprocess videos before they can be used with the ControlNet
+# HF maintains just the right package for it: `pip install controlnet_aux`
+from controlnet_aux.processor import ZoeDetector
+
+# Download controlnets from https://huggingface.co/lllyasviel/ControlNet-v1-1 to use .from_single_file
+# Download Diffusers-format controlnets, such as https://huggingface.co/lllyasviel/sd-controlnet-depth, to use .from_pretrained()
+controlnet = ControlNetModel.from_single_file("control_v11f1p_sd15_depth.pth", torch_dtype=torch.float16)
+
+# We use AnimateLCM for this example but one can use the original motion adapters as well (for example, https://huggingface.co/guoyww/animatediff-motion-adapter-v1-5-3)
+motion_adapter = MotionAdapter.from_pretrained("wangfuyun/AnimateLCM")
+
+vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=torch.float16)
+pipe: AnimateDiffControlNetPipeline = AnimateDiffControlNetPipeline.from_pretrained(
+    "SG161222/Realistic_Vision_V5.1_noVAE",
+    motion_adapter=motion_adapter,
+    controlnet=controlnet,
+    vae=vae,
+).to(device="cuda", dtype=torch.float16)
+pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config, beta_schedule="linear")
+pipe.load_lora_weights("wangfuyun/AnimateLCM", weight_name="AnimateLCM_sd15_t2v_lora.safetensors", adapter_name="lcm-lora")
+pipe.set_adapters(["lcm-lora"], [0.8])
+
+depth_detector = ZoeDetector.from_pretrained("lllyasviel/Annotators").to("cuda")
+video = load_video("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-vid2vid-input-1.gif")
+conditioning_frames = []
+
+with pipe.progress_bar(total=len(video)) as progress_bar:
+    for frame in video:
+        conditioning_frames.append(depth_detector(frame))
+        progress_bar.update()
+
+prompt = "a panda, playing a guitar, sitting in a pink boat, in the ocean, mountains in background, realistic, high quality"
+negative_prompt = "bad quality, worst quality"
+
+video = pipe(
+    prompt=prompt,
+    negative_prompt=negative_prompt,
+    num_frames=len(video),
+    num_inference_steps=10,
+    guidance_scale=2.0,
+    conditioning_frames=conditioning_frames,
+    generator=torch.Generator().manual_seed(42),
+).frames[0]
+
+export_to_gif(video, "animatediff_controlnet.gif", fps=8)
+```
+
+Here are some sample outputs:
+
+<table align="center">
+    <tr>
+      <th align="center">Source Video</th>
+      <th align="center">Output Video</th>
+    </tr>
+    <tr>
+        <td align="center">
+          raccoon playing a guitar
+          <br />
+          <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-vid2vid-input-1.gif" alt="raccoon playing a guitar" />
+        </td>
+        <td align="center">
+          a panda, playing a guitar, sitting in a pink boat, in the ocean, mountains in background, realistic, high quality
+          <br/>
+          <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-controlnet-output.gif" alt="a panda, playing a guitar, sitting in a pink boat, in the ocean, mountains in background, realistic, high quality" />
+        </td>
+    </tr>
+</table>
+
 ### AnimateDiffSparseControlNetPipeline
 
 [SparseCtrl: Adding Sparse Controls to Text-to-Video Diffusion Models](https://arxiv.org/abs/2311.16933) for achieving controlled generation in text-to-video diffusion models by Yuwei Guo, Ceyuan Yang, Anyi Rao, Maneesh Agrawala, Dahua Lin, and Bo Dai.
@@ -762,6 +842,12 @@ pipe = AnimateDiffPipeline.from_pretrained("emilianJR/epiCRealism", motion_adapt
   - all
   - __call__
 
+## AnimateDiffControlNetPipeline
+
+[[autodoc]] AnimateDiffControlNetPipeline
+  - all
+  - __call__
+
 ## AnimateDiffSparseControlNetPipeline
 
 [[autodoc]] AnimateDiffSparseControlNetPipeline

diff --git a/docs/source/en/api/pipelines/latte.md b/docs/source/en/api/pipelines/latte.md
@@ -24,6 +24,8 @@ The abstract from the paper is:
 
 **Highlights**: Latte is a latent diffusion transformer proposed as a backbone for modeling different modalities (trained for text-to-video generation here). It achieves state-of-the-art performance across four standard video benchmarks - [FaceForensics](https://arxiv.org/abs/1803.09179), [SkyTimelapse](https://arxiv.org/abs/1709.07592), [UCF101](https://arxiv.org/abs/1212.0402) and [Taichi-HD](https://arxiv.org/abs/2003.00196). To prepare and download the datasets for evaluation, please refer to [this https URL](https://github.com/Vchitect/Latte/blob/main/docs/datasets_evaluation.md).
 
+This pipeline was contributed by [maxin-cn](https://github.com/maxin-cn). The original codebase can be found [here](https://github.com/Vchitect/Latte). The original weights can be found under [hf.co/maxin-cn](https://huggingface.co/maxin-cn).
+
 <Tip>
 
 Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers.md) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading.md#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines.

diff --git a/docs/source/en/api/pipelines/lumina.md b/docs/source/en/api/pipelines/lumina.md
@@ -43,6 +43,8 @@ Lumina-T2X has the following components:
 * It uses a Flow-based Large Diffusion Transformer as the backbone
 * It supports any modality with one backbone and the corresponding encoder and decoder.
 
+This pipeline was contributed by [PommesPeter](https://github.com/PommesPeter). The original codebase can be found [here](https://github.com/Alpha-VLLM/Lumina-T2X). The original weights can be found under [hf.co/Alpha-VLLM](https://huggingface.co/Alpha-VLLM).
+
 <Tip>
 
 Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers.md) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading.md#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines.

diff --git a/docs/source/en/api/pipelines/overview.md b/docs/source/en/api/pipelines/overview.md
@@ -71,6 +71,7 @@ The table below lists all the pipelines currently available in 🤗 Diffusers an
 | [Semantic Guidance](semantic_stable_diffusion) | text2image |
 | [Shap-E](shap_e) | text-to-3D, image-to-3D |
 | [Spectrogram Diffusion](spectrogram_diffusion) |  |
+| [Stable Audio](stable_audio) | text2audio |
 | [Stable Diffusion](stable_diffusion/overview) | text2image, image2image, depth2image, inpainting, image variation, latent upscaler, super-resolution |
 | [Stable Diffusion Model Editing](model_editing) | model editing |
 | [Stable Diffusion XL](stable_diffusion/stable_diffusion_xl) | text2image, image2image, inpainting |
-Original file line number
+Diff line change
@@ Expand Up / @@ -24,6 +24,8 @@ The abstract from the paper is: @@
     **Highlights**: Latte is a latent diffusion transformer proposed as a backbone for modeling different modalities (trained for text-to-video generation here). It achieves state-of-the-art performance across four standard video benchmarks - [FaceForensics](https://arxiv.org/abs/1803.09179), [SkyTimelapse](https://arxiv.org/abs/1709.07592), [UCF101](https://arxiv.org/abs/1212.0402) and [Taichi-HD](https://arxiv.org/abs/2003.00196). To prepare and download the datasets for evaluation, please refer to [this https URL](https://github.com/Vchitect/Latte/blob/main/docs/datasets_evaluation.md).
+    This pipeline was contributed by [maxin-cn](https://github.com/maxin-cn). The original codebase can be found [here](https://github.com/Vchitect/Latte). The original weights can be found under [hf.co/maxin-cn](https://huggingface.co/maxin-cn).
     <Tip>
     Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers.md) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading.md#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines.
@@ Expand Down @@