Skip to content

Commit

Permalink
Merge branch 'main' into test-cache
Browse files Browse the repository at this point in the history
Signed-off-by: Adrien <[email protected]>
  • Loading branch information
XciD committed Jul 31, 2024
2 parents 9e638d8 + ea1b4ea commit 1972b80
Show file tree
Hide file tree
Showing 51 changed files with 5,588 additions and 48 deletions.
9 changes: 6 additions & 3 deletions .github/workflows/nightly_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,8 @@ jobs:
max-parallel: 8
matrix:
module: ${{ fromJson(needs.setup_torch_cuda_pipeline_matrix.outputs.pipeline_test_matrix) }}
runs-on: [single-gpu, nvidia-gpu, t4, ci]
runs-on:
group: aws-g4dn-2xlarge
container:
image: diffusers/diffusers-pytorch-cuda
options: --shm-size "16gb" --ipc host --gpus 0
Expand Down Expand Up @@ -106,7 +107,8 @@ jobs:
run_nightly_tests_for_other_torch_modules:
name: Nightly Torch CUDA Tests
runs-on: [single-gpu, nvidia-gpu, t4, ci]
runs-on:
group: aws-g4dn-2xlarge
container:
image: diffusers/diffusers-pytorch-cuda
options: --shm-size "16gb" --ipc host --gpus 0
Expand Down Expand Up @@ -235,7 +237,8 @@ jobs:
run_nightly_onnx_tests:
name: Nightly ONNXRuntime CUDA tests on Ubuntu
runs-on: [single-gpu, nvidia-gpu, t4, ci]
runs-on:
group: aws-g4dn-2xlarge
container:
image: diffusers/diffusers-onnxruntime-cuda
options: --gpus 0 --shm-size "16gb" --ipc host
Expand Down
3 changes: 2 additions & 1 deletion .github/workflows/pr_test_fetcher.yml
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,8 @@ jobs:
report: torch_hub

name: ${{ matrix.config.name }}
runs-on: ${{ matrix.config.runner }}
runs-on:
group: ${{ matrix.config.runner }}
container:
image: ${{ matrix.config.image }}
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
Expand Down
5 changes: 3 additions & 2 deletions .github/workflows/pr_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ jobs:
config:
- name: Fast PyTorch Pipeline CPU tests
framework: pytorch_pipelines
runner: [ self-hosted, intel-cpu, 32-cpu, 256-ram, ci ]
runner: aws-highmemory-32-plus
image: diffusers/diffusers-pytorch-cpu
report: torch_cpu_pipelines
- name: Fast PyTorch Models & Schedulers CPU tests
Expand All @@ -98,7 +98,8 @@ jobs:

name: ${{ matrix.config.name }}

runs-on: ${{ matrix.config.runner }}
runs-on:
group: ${{ matrix.config.runner }}

container:
image: ${{ matrix.config.image }}
Expand Down
18 changes: 12 additions & 6 deletions .github/workflows/push_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,8 @@ jobs:
max-parallel: 8
matrix:
module: ${{ fromJson(needs.setup_torch_cuda_pipeline_matrix.outputs.pipeline_test_matrix) }}
runs-on: [single-gpu, nvidia-gpu, t4, ci]
runs-on:
group: aws-g4dn-2xlarge
container:
image: diffusers/diffusers-pytorch-cuda
options: --shm-size "16gb" --ipc host --gpus 0
Expand Down Expand Up @@ -102,7 +103,8 @@ jobs:

torch_cuda_tests:
name: Torch CUDA Tests
runs-on: [single-gpu, nvidia-gpu, t4, ci]
runs-on:
group: aws-g4dn-2xlarge
container:
image: diffusers/diffusers-pytorch-cuda
options: --shm-size "16gb" --ipc host --gpus 0
Expand Down Expand Up @@ -202,7 +204,8 @@ jobs:

onnx_cuda_tests:
name: ONNX CUDA Tests
runs-on: [single-gpu, nvidia-gpu, t4, ci]
runs-on:
group: aws-g4dn-2xlarge
container:
image: diffusers/diffusers-onnxruntime-cuda
options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ --gpus 0
Expand Down Expand Up @@ -250,7 +253,8 @@ jobs:
run_torch_compile_tests:
name: PyTorch Compile CUDA tests

runs-on: [single-gpu, nvidia-gpu, t4, ci]
runs-on:
group: aws-g4dn-2xlarge

container:
image: diffusers/diffusers-pytorch-compile-cuda
Expand Down Expand Up @@ -292,7 +296,8 @@ jobs:
run_xformers_tests:
name: PyTorch xformers CUDA tests

runs-on: [single-gpu, nvidia-gpu, t4, ci]
runs-on:
group: aws-g4dn-2xlarge

container:
image: diffusers/diffusers-pytorch-xformers-cuda
Expand Down Expand Up @@ -333,7 +338,8 @@ jobs:
run_examples_tests:
name: Examples PyTorch CUDA tests on Ubuntu

runs-on: [single-gpu, nvidia-gpu, t4, ci]
runs-on:
group: aws-g4dn-2xlarge

container:
image: diffusers/diffusers-pytorch-cuda
Expand Down
3 changes: 2 additions & 1 deletion .github/workflows/push_tests_fast.yml
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,8 @@ jobs:

name: ${{ matrix.config.name }}

runs-on: ${{ matrix.config.runner }}
runs-on:
group: ${{ matrix.config.runner }}

container:
image: ${{ matrix.config.image }}
Expand Down
5 changes: 3 additions & 2 deletions .github/workflows/run_tests_from_a_pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ env:
jobs:
run_tests:
name: "Run a test on our runner from a PR"
runs-on: [single-gpu, nvidia-gpu, t4, ci]
runs-on:
group: aws-g4dn-2xlarge
container:
image: ${{ github.event.inputs.docker_image }}
options: --gpus 0 --privileged --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
Expand Down Expand Up @@ -70,4 +71,4 @@ jobs:
env:
PY_TEST: ${{ github.event.inputs.test }}
run: |
pytest "$PY_TEST"
pytest "$PY_TEST"
3 changes: 2 additions & 1 deletion .github/workflows/ssh-pr-runner.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ env:
jobs:
ssh_runner:
name: "SSH"
runs-on: [self-hosted, intel-cpu, 32-cpu, 256-ram, ci]
runs-on:
group: aws-highmemory-32-plus
container:
image: ${{ github.event.inputs.docker_image }}
options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --privileged
Expand Down
3 changes: 2 additions & 1 deletion .github/workflows/ssh-runner.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ env:
jobs:
ssh_runner:
name: "SSH"
runs-on: [single-gpu, nvidia-gpu, "${{ github.event.inputs.runner_type }}", ci]
runs-on:
group: "${{ github.event.inputs.runner_type }}"
container:
image: ${{ github.event.inputs.docker_image }}
options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --gpus 0 --privileged
Expand Down
8 changes: 8 additions & 0 deletions docs/source/en/_toctree.yml
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,8 @@
title: AsymmetricAutoencoderKL
- local: api/models/autoencoder_tiny
title: Tiny AutoEncoder
- local: api/models/autoencoder_oobleck
title: Oobleck AutoEncoder
- local: api/models/consistency_decoder_vae
title: ConsistencyDecoderVAE
- local: api/models/transformer2d
Expand All @@ -259,6 +261,8 @@
title: TransformerTemporalModel
- local: api/models/sd3_transformer2d
title: SD3Transformer2DModel
- local: api/models/stable_audio_transformer
title: StableAudioDiTModel
- local: api/models/prior_transformer
title: PriorTransformer
- local: api/models/controlnet
Expand Down Expand Up @@ -362,6 +366,8 @@
title: Semantic Guidance
- local: api/pipelines/shap_e
title: Shap-E
- local: api/pipelines/stable_audio
title: Stable Audio
- local: api/pipelines/stable_cascade
title: Stable Cascade
- sections:
Expand Down Expand Up @@ -425,6 +431,8 @@
title: CMStochasticIterativeScheduler
- local: api/schedulers/consistency_decoder
title: ConsistencyDecoderScheduler
- local: api/schedulers/cosine_dpm
title: CosineDPMSolverMultistepScheduler
- local: api/schedulers/ddim_inverse
title: DDIMInverseScheduler
- local: api/schedulers/ddim
Expand Down
38 changes: 38 additions & 0 deletions docs/source/en/api/models/autoencoder_oobleck.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->

# AutoencoderOobleck

The Oobleck variational autoencoder (VAE) model with KL loss was introduced in [Stability-AI/stable-audio-tools](https://github.com/Stability-AI/stable-audio-tools) and [Stable Audio Open](https://huggingface.co/papers/2407.14358) by Stability AI. The model is used in 🤗 Diffusers to encode audio waveforms into latents and to decode latent representations into audio waveforms.

The abstract from the paper is:

*Open generative models are vitally important for the community, allowing for fine-tunes and serving as baselines when presenting new models. However, most current text-to-audio models are private and not accessible for artists and researchers to build upon. Here we describe the architecture and training process of a new open-weights text-to-audio model trained with Creative Commons data. Our evaluation shows that the model's performance is competitive with the state-of-the-art across various metrics. Notably, the reported FDopenl3 results (measuring the realism of the generations) showcase its potential for high-quality stereo sound synthesis at 44.1kHz.*

## AutoencoderOobleck

[[autodoc]] AutoencoderOobleck
- decode
- encode
- all

## OobleckDecoderOutput

[[autodoc]] models.autoencoders.autoencoder_oobleck.OobleckDecoderOutput

## OobleckDecoderOutput

[[autodoc]] models.autoencoders.autoencoder_oobleck.OobleckDecoderOutput

## AutoencoderOobleckOutput

[[autodoc]] models.autoencoders.autoencoder_oobleck.AutoencoderOobleckOutput
19 changes: 19 additions & 0 deletions docs/source/en/api/models/stable_audio_transformer.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->

# StableAudioDiTModel

A Transformer model for audio waveforms from [Stable Audio Open](https://huggingface.co/papers/2407.14358).

## StableAudioDiTModel

[[autodoc]] StableAudioDiTModel
86 changes: 86 additions & 0 deletions docs/source/en/api/pipelines/animatediff.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ The abstract of the paper is the following:
| Pipeline | Tasks | Demo
|---|---|:---:|
| [AnimateDiffPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/animatediff/pipeline_animatediff.py) | *Text-to-Video Generation with AnimateDiff* |
| [AnimateDiffControlNetPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py) | *Controlled Video-to-Video Generation with AnimateDiff using ControlNet* |
| [AnimateDiffSparseControlNetPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py) | *Controlled Video-to-Video Generation with AnimateDiff using SparseCtrl* |
| [AnimateDiffSDXLPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py) | *Video-to-Video Generation with AnimateDiff* |
| [AnimateDiffVideoToVideoPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py) | *Video-to-Video Generation with AnimateDiff* |

## Available checkpoints
Expand Down Expand Up @@ -100,6 +103,83 @@ AnimateDiff tends to work better with finetuned Stable Diffusion models. If you

</Tip>

### AnimateDiffControlNetPipeline

AnimateDiff can also be used with ControlNets ControlNet was introduced in [Adding Conditional Control to Text-to-Image Diffusion Models](https://huggingface.co/papers/2302.05543) by Lvmin Zhang, Anyi Rao, and Maneesh Agrawala. With a ControlNet model, you can provide an additional control image to condition and control Stable Diffusion generation. For example, if you provide depth maps, the ControlNet model generates a video that'll preserve the spatial information from the depth maps. It is a more flexible and accurate way to control the video generation process.

```python
import torch
from diffusers import AnimateDiffControlNetPipeline, AutoencoderKL, ControlNetModel, MotionAdapter, LCMScheduler
from diffusers.utils import export_to_gif, load_video

# Additionally, you will need a preprocess videos before they can be used with the ControlNet
# HF maintains just the right package for it: `pip install controlnet_aux`
from controlnet_aux.processor import ZoeDetector

# Download controlnets from https://huggingface.co/lllyasviel/ControlNet-v1-1 to use .from_single_file
# Download Diffusers-format controlnets, such as https://huggingface.co/lllyasviel/sd-controlnet-depth, to use .from_pretrained()
controlnet = ControlNetModel.from_single_file("control_v11f1p_sd15_depth.pth", torch_dtype=torch.float16)

# We use AnimateLCM for this example but one can use the original motion adapters as well (for example, https://huggingface.co/guoyww/animatediff-motion-adapter-v1-5-3)
motion_adapter = MotionAdapter.from_pretrained("wangfuyun/AnimateLCM")

vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=torch.float16)
pipe: AnimateDiffControlNetPipeline = AnimateDiffControlNetPipeline.from_pretrained(
"SG161222/Realistic_Vision_V5.1_noVAE",
motion_adapter=motion_adapter,
controlnet=controlnet,
vae=vae,
).to(device="cuda", dtype=torch.float16)
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config, beta_schedule="linear")
pipe.load_lora_weights("wangfuyun/AnimateLCM", weight_name="AnimateLCM_sd15_t2v_lora.safetensors", adapter_name="lcm-lora")
pipe.set_adapters(["lcm-lora"], [0.8])

depth_detector = ZoeDetector.from_pretrained("lllyasviel/Annotators").to("cuda")
video = load_video("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-vid2vid-input-1.gif")
conditioning_frames = []

with pipe.progress_bar(total=len(video)) as progress_bar:
for frame in video:
conditioning_frames.append(depth_detector(frame))
progress_bar.update()

prompt = "a panda, playing a guitar, sitting in a pink boat, in the ocean, mountains in background, realistic, high quality"
negative_prompt = "bad quality, worst quality"

video = pipe(
prompt=prompt,
negative_prompt=negative_prompt,
num_frames=len(video),
num_inference_steps=10,
guidance_scale=2.0,
conditioning_frames=conditioning_frames,
generator=torch.Generator().manual_seed(42),
).frames[0]

export_to_gif(video, "animatediff_controlnet.gif", fps=8)
```

Here are some sample outputs:

<table align="center">
<tr>
<th align="center">Source Video</th>
<th align="center">Output Video</th>
</tr>
<tr>
<td align="center">
raccoon playing a guitar
<br />
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-vid2vid-input-1.gif" alt="racoon playing a guitar" />
</td>
<td align="center">
a panda, playing a guitar, sitting in a pink boat, in the ocean, mountains in background, realistic, high quality
<br/>
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-controlnet-output.gif" alt="a panda, playing a guitar, sitting in a pink boat, in the ocean, mountains in background, realistic, high quality" />
</td>
</tr>
</table>

### AnimateDiffSparseControlNetPipeline

[SparseCtrl: Adding Sparse Controls to Text-to-Video Diffusion Models](https://arxiv.org/abs/2311.16933) for achieving controlled generation in text-to-video diffusion models by Yuwei Guo, Ceyuan Yang, Anyi Rao, Maneesh Agrawala, Dahua Lin, and Bo Dai.
Expand Down Expand Up @@ -762,6 +842,12 @@ pipe = AnimateDiffPipeline.from_pretrained("emilianJR/epiCRealism", motion_adapt
- all
- __call__

## AnimateDiffControlNetPipeline

[[autodoc]] AnimateDiffControlNetPipeline
- all
- __call__

## AnimateDiffSparseControlNetPipeline

[[autodoc]] AnimateDiffSparseControlNetPipeline
Expand Down
2 changes: 2 additions & 0 deletions docs/source/en/api/pipelines/latte.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ The abstract from the paper is:

**Highlights**: Latte is a latent diffusion transformer proposed as a backbone for modeling different modalities (trained for text-to-video generation here). It achieves state-of-the-art performance across four standard video benchmarks - [FaceForensics](https://arxiv.org/abs/1803.09179), [SkyTimelapse](https://arxiv.org/abs/1709.07592), [UCF101](https://arxiv.org/abs/1212.0402) and [Taichi-HD](https://arxiv.org/abs/2003.00196). To prepare and download the datasets for evaluation, please refer to [this https URL](https://github.com/Vchitect/Latte/blob/main/docs/datasets_evaluation.md).

This pipeline was contributed by [maxin-cn](https://github.com/maxin-cn). The original codebase can be found [here](https://github.com/Vchitect/Latte). The original weights can be found under [hf.co/maxin-cn](https://huggingface.co/maxin-cn).

<Tip>

Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers.md) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading.md#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines.
Expand Down
2 changes: 2 additions & 0 deletions docs/source/en/api/pipelines/lumina.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ Lumina-T2X has the following components:
* It uses a Flow-based Large Diffusion Transformer as the backbone
* It supports different any modalities with one backbone and corresponding encoder, decoder.

This pipeline was contributed by [PommesPeter](https://github.com/PommesPeter). The original codebase can be found [here](https://github.com/Alpha-VLLM/Lumina-T2X). The original weights can be found under [hf.co/Alpha-VLLM](https://huggingface.co/Alpha-VLLM).

<Tip>

Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers.md) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading.md#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines.
Expand Down
1 change: 1 addition & 0 deletions docs/source/en/api/pipelines/overview.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ The table below lists all the pipelines currently available in 🤗 Diffusers an
| [Semantic Guidance](semantic_stable_diffusion) | text2image |
| [Shap-E](shap_e) | text-to-3D, image-to-3D |
| [Spectrogram Diffusion](spectrogram_diffusion) | |
| [Stable Audio](stable_audio) | text2audio |
| [Stable Diffusion](stable_diffusion/overview) | text2image, image2image, depth2image, inpainting, image variation, latent upscaler, super-resolution |
| [Stable Diffusion Model Editing](model_editing) | model editing |
| [Stable Diffusion XL](stable_diffusion/stable_diffusion_xl) | text2image, image2image, inpainting |
Expand Down
Loading

0 comments on commit 1972b80

Please sign in to comment.