Add trt support for BF16 #195

Open
wants to merge 279 commits into base: main

Commits
a4b0f13
Add link to fine-tunes collection on Replicate (#130)
zeke Aug 29, 2024
ed51d5e
Add Torch CUDA sync to fix timing code in cli.py (#147)
neilmovva Sep 13, 2024
bc22ee3
Update API interface for FLUX.1.1 [pro]
jenuk Oct 3, 2024
c5ebf2b
CLI: `/n` is for steps, not seeds (#169)
thibautRe Oct 3, 2024
933e54a
Update README.md
timudk Oct 7, 2024
d171e39
Update README.md
timudk Oct 8, 2024
a94a546
Remove unused import and extraneous `f` prefix (#171)
esadek Oct 8, 2024
16fc5e2
update readme for 1.1
timudk Oct 8, 2024
f8747c2
add question logger
andompesta Oct 1, 2024
bf71e81
add cli input for TRT support
andompesta Oct 1, 2024
223017d
initial support for TRT engine builder
andompesta Oct 1, 2024
fd7057e
add onnx export functions taken from
andompesta Oct 1, 2024
0713049
base class for convert to onnx
andompesta Oct 1, 2024
d0ba09a
add missing dependencies
andompesta Oct 1, 2024
959eaa7
fix imports
andompesta Oct 1, 2024
50149bd
moved to wrappers package
andompesta Oct 1, 2024
71c1b0d
moved to wrapper package and renamed into base wrapper
andompesta Oct 1, 2024
407a3ac
implement load engines function
andompesta Oct 1, 2024
ca691f1
remove old wrapper
andompesta Oct 1, 2024
78fbb25
add additional parameters to base class constructor
andompesta Oct 1, 2024
3548085
implemented CLIP wrapper
andompesta Oct 1, 2024
6b01977
remove model as a property
andompesta Oct 1, 2024
2477223
enable float16 optimization
andompesta Oct 1, 2024
969e06d
reorder arguments
andompesta Oct 1, 2024
f8a183b
first wrapper for onnx build
andompesta Oct 1, 2024
5572ae7
add load_engines with minimal parameters
andompesta Oct 1, 2024
713a66e
fix `get_sample_input` interface format and add `get_model_to_trace`
andompesta Oct 1, 2024
b4bebd2
fix stage name
andompesta Oct 1, 2024
e56078c
add imports for wrappers
andompesta Oct 1, 2024
f1fa53b
set assert error message for missing stage
andompesta Oct 2, 2024
0afcc8e
add assert to validate only 1 dtype is active
andompesta Oct 2, 2024
9b600b3
add `set_model_to_dtype` function to set the correct dtype
andompesta Oct 2, 2024
da78bef
call `set_model_to_dtype` instead of doing it manually
andompesta Oct 2, 2024
2c7201f
T5wrapper as a copy of CLIPwrapper
andompesta Oct 2, 2024
d2b2567
rename `get_model_to_trace` to `get_model`
andompesta Oct 2, 2024
d6c18c8
`do_constant_folding` should not be configurable
andompesta Oct 2, 2024
89f616e
removed custom configurations
andompesta Oct 2, 2024
44a3280
fix import
andompesta Oct 2, 2024
27c4268
add t5wrapper to possible imports
andompesta Oct 2, 2024
c35d4d7
use T5Wrapper
andompesta Oct 2, 2024
4e273e1
fluxwrapper as a copy of t5wrapper
andompesta Oct 2, 2024
e111c72
implement flux exporter
andompesta Oct 2, 2024
ec75372
use flux exporter
andompesta Oct 2, 2024
6acb1af
vae as copy of clip wrapper
andompesta Oct 2, 2024
6c941e6
rename and fix imports
andompesta Oct 2, 2024
1e11bc1
removed `embedding_dim` as not used
andompesta Oct 2, 2024
37ddc7e
remove embedding_dim and `forward` function point to `decode`
andompesta Oct 2, 2024
fa22eac
get_output_names = latent
andompesta Oct 2, 2024
eb4e7df
fix interface of `get_sample_input`
andompesta Oct 2, 2024
bf1cca6
save configuration parameters
andompesta Oct 2, 2024
358c8a5
ae wrapper implemented
andompesta Oct 2, 2024
381267d
fix import
andompesta Oct 2, 2024
a8af1d8
add AEWrapper step
andompesta Oct 2, 2024
a47608c
from set_model_to_dtype to prepare_model
andompesta Oct 3, 2024
ea420c5
fix eval mode during inference
andompesta Oct 3, 2024
af2f48b
fix clip onnx export. Now it traces only the needed outputs
andompesta Oct 3, 2024
e6b66bb
fix t5 wrapper
andompesta Oct 3, 2024
cb188d8
reorder input name flux
andompesta Oct 3, 2024
54002de
fix flux input format for text_ids and guidance
andompesta Oct 4, 2024
1cdc0a8
fix Flux imports and scale of inputs to prevent nan
andompesta Oct 4, 2024
c1c3a8d
add torch inference while tracing
andompesta Oct 4, 2024
bb0cc66
fix casting problem in onnx trace
andompesta Oct 4, 2024
21ec7d9
solve optimization problem by removing cleanup steps
andompesta Oct 4, 2024
d6f5e2f
rename to notes
andompesta Oct 4, 2024
577ba49
prevent nan due to large inputs
andompesta Oct 4, 2024
dfd06fc
provide base implementation of `get_model`
andompesta Oct 4, 2024
54b2ceb
format
andompesta Oct 4, 2024
d7ccef4
add trt export step
andompesta Oct 6, 2024
505411b
add engine class for trt build
andompesta Oct 6, 2024
0154232
add `get_input_profile` and `get_minmax_dims` abstract methods
andompesta Oct 6, 2024
cc2d921
add `build_strongly_typed` attribute
andompesta Oct 6, 2024
a2fb731
implement `get_minmax_dims` and `get_input_profile`
andompesta Oct 6, 2024
0096f7a
remove `static_shape` from `get_sample_input`
andompesta Oct 6, 2024
dfb6ded
remove static shape and batch flags
andompesta Oct 7, 2024
50100b5
add typing
andompesta Oct 7, 2024
ea240be
remove static shape and batch flags
andompesta Oct 7, 2024
0c3720c
offload to cpu
andompesta Oct 7, 2024
30f0140
enable device offloading while tracing
andompesta Oct 7, 2024
f2b357a
check cuda is available while building engines
andompesta Oct 7, 2024
a30ec20
clip trt engine build
andompesta Oct 7, 2024
dbeeed9
add pinned transformer dependency
andompesta Oct 9, 2024
0682915
fix nan with onnx and trt when executed on CUDA
andompesta Oct 9, 2024
bef25e0
AE need to be traced in TF32 not FP16
andompesta Oct 9, 2024
c028d8d
add `get_shape_dict` abstract method and device as a property
andompesta Oct 9, 2024
8208e4c
AE should be traced in TF32
andompesta Oct 9, 2024
816ff12
AE explicitly on TF32 and reactivate full pipeline
andompesta Oct 9, 2024
3a341f8
add input profile to flux to enable trt engine build
andompesta Oct 9, 2024
7aa6956
format and add input_profile to t5 for TRT build
andompesta Oct 9, 2024
e68a993
add `TransformersModelWrapper`
andompesta Oct 9, 2024
ea581b7
add TransformersModelWrapper support
andompesta Oct 9, 2024
7e883d5
add `get_shape_dict` interface
andompesta Oct 9, 2024
5080d86
add TransformersModelWrapper support
andompesta Oct 9, 2024
e2b65c4
add shape_dict interface
andompesta Oct 9, 2024
87413e2
t5 in TF32 for numerical reasons
andompesta Oct 11, 2024
8629e50
remove unused options
andompesta Oct 11, 2024
5e711c7
remove unused code
andompesta Oct 11, 2024
02235dc
add `get_shape_dict`
andompesta Oct 11, 2024
6c3c4db
remove custom optimization
andompesta Oct 11, 2024
4b8a973
add garbage collector
andompesta Oct 14, 2024
8e4b103
return error
andompesta Oct 14, 2024
8f45f81
create wrapper specific to Onnx export operation
andompesta Oct 14, 2024
3af1a33
use OnnxWrapper
andompesta Oct 14, 2024
fe024b8
create base wrapper for trt engines
andompesta Oct 14, 2024
68060bd
moved to engine package
andompesta Oct 14, 2024
0f8d8b3
moved to engine package
andompesta Oct 14, 2024
49dc6d1
forbid relative import of trt-builder
andompesta Oct 14, 2024
098391b
remove wrapper and create BaseExporter or BaseEngine
andompesta Oct 14, 2024
bf9c4cb
models not stored in builder class
andompesta Oct 14, 2024
0ee9104
_prepare_model_configs as pure function
andompesta Oct 14, 2024
c7136f8
_get_onnx_exporters as a private method to get onnx exporters
andompesta Oct 14, 2024
ee72695
remove unused dependencies
andompesta Oct 14, 2024
ecf6c4f
from onnxwrapper to onnxengine
andompesta Oct 14, 2024
2a14000
trt engine class
andompesta Oct 14, 2024
c791c53
add `calculate_max_device_memory` to TRTBuilder
andompesta Oct 14, 2024
ce343dc
`get_shape_dict` moved to trt-engine interface
andompesta Oct 14, 2024
66ca1ce
add common inference code
andompesta Oct 14, 2024
7400072
autoencoder inference wrapper
andompesta Oct 14, 2024
aa0d474
add requirements.txt
Oct 16, 2024
d676a18
support guidance for ev model
andompesta Oct 16, 2024
550f660
add support for trt based on env variables
andompesta Oct 16, 2024
fa5993b
format flux
andompesta Oct 16, 2024
bdbbb19
remove stream from constructor
andompesta Oct 16, 2024
f1d86f6
fix iterate over onnx-exporters
andompesta Oct 16, 2024
f065b09
flux is not strongly typed
andompesta Oct 16, 2024
c57410a
move back for numerical stability
andompesta Oct 16, 2024
69f4dca
add logging
andompesta Oct 16, 2024
cc12a14
fix dtype casting for bfloat16
andompesta Oct 17, 2024
961259e
fix default value
andompesta Oct 17, 2024
6e1ca02
add version before merge
andompesta Oct 18, 2024
7217a7b
hacky get it building the engines
ducktrA Oct 15, 2024
c5481a1
requirements.txt
ducktrA Oct 17, 2024
54674c3
adding a separate _engine.py file for all the flux, t5 and clip engines
ducktrA Oct 18, 2024
37003c7
boilerroom and plating. getting parameters handle into setting up the…
ducktrA Oct 18, 2024
fd33eb5
remove _version.py from git
andompesta Oct 18, 2024
99e72e9
create base mixin class to share parameters
andompesta Oct 18, 2024
6678a3b
clipmixin parameters
andompesta Oct 18, 2024
395541d
remove parameters as are part of mixin class
andompesta Oct 18, 2024
315dd9d
clip engine and exporter use common mixin for managing parameters
andompesta Oct 18, 2024
7cdbb03
use mixin class to build engine from exporter
andompesta Oct 18, 2024
55497eb
ae-mixin for shared parameters
andompesta Oct 18, 2024
5917f38
flux exporter and engine unified by mixin class
andompesta Oct 21, 2024
7c156cd
formatting
andompesta Oct 21, 2024
92f13f8
add common `get_latent_dims` method
andompesta Oct 21, 2024
f5acd54
add `get_latent_dims` common method
andompesta Oct 21, 2024
8b182cc
T5 based on mixin class
andompesta Oct 21, 2024
11570dc
build strongly typed flux
andompesta Oct 21, 2024
a9acfa0
enable load with shared device memory
andompesta Oct 21, 2024
c6e94a6
remove boilerplate code to create engines
andompesta Oct 21, 2024
7b07602
add tokenizer to trt engine
andompesta Oct 22, 2024
2dc2460
use static shape to reduce memory consumption
andompesta Oct 22, 2024
40de55c
implement tokenizer into t5 engine
andompesta Oct 22, 2024
c8273c7
fix max_batch size to 8
andompesta Oct 22, 2024
b96fd96
add licence
andompesta Oct 22, 2024
6743bb7
add licence
andompesta Oct 22, 2024
852b444
enable trt runtime tracking
andompesta Oct 22, 2024
95f7822
add static-batch and static-shape options
andompesta Oct 22, 2024
8ac3f84
add cuda stream to load method
andompesta Oct 22, 2024
f93fc87
add inference code
andompesta Oct 22, 2024
528621a
add inference code
andompesta Oct 22, 2024
23e1236
enable static shape
andompesta Oct 22, 2024
dc326df
add `static_shape` option to reduce memory and `_build_engine` as sta…
andompesta Oct 22, 2024
7e3fe14
add `should_be_dtype` field to handle output type conversion
andompesta Oct 22, 2024
41f18e7
from trtbuilder to trt_manager
andompesta Oct 22, 2024
12dee48
from TRTBuilder to TRTManager
andompesta Oct 23, 2024
45997a9
AE engine interface
andompesta Oct 23, 2024
bb9f468
`trt_to_torch_dtype_dict` as property
andompesta Oct 23, 2024
2bde369
clip engine inference
andompesta Oct 23, 2024
359572e
implement flux trt engine inference process
andompesta Oct 23, 2024
e3f0fd9
add scale_factor and shift_factor
andompesta Oct 23, 2024
d91bbde
removed `should_be_dtype`
andompesta Oct 23, 2024
df245db
removed `should_be_dtype`
andompesta Oct 23, 2024
33bc095
remove `should_be_dtype` from t5
andompesta Oct 23, 2024
c330491
add scale and shift factor
andompesta Oct 23, 2024
90b4f11
`max_batch` to 8
andompesta Oct 23, 2024
17c1f7d
implement `TRTManager`
andompesta Oct 23, 2024
811f2ff
from ae to vae to match DD
andompesta Oct 25, 2024
f4ae3ca
remove autocast
andompesta Oct 25, 2024
0fe7c84
`pooled_embeddings` to match DD naming for clip
andompesta Oct 25, 2024
f71091a
rename `flux` to `transformer` engine
andompesta Oct 25, 2024
4055a3e
from flux to transformer mixin
andompesta Oct 25, 2024
2b2bb5b
from flux to transformer exporter
andompesta Oct 25, 2024
b088430
fix trtmanager naming
andompesta Oct 25, 2024
82d658d
fix input names and dimensions. Note that `img_ids` and `txt_ids` ar…
andompesta Oct 25, 2024
3708773
fix shape of inputs according to `text_maxlen` and batch_size
andompesta Oct 25, 2024
7737426
reduce max_batch
andompesta Oct 27, 2024
917d8ff
fix stage naming
andompesta Oct 27, 2024
6473ca1
add support for DD model
andompesta Oct 27, 2024
6d39ad5
add support for DD models
andompesta Oct 27, 2024
753129b
fix dtype configuration
andompesta Oct 28, 2024
149c27c
fix engine dtype
andompesta Oct 28, 2024
55568bf
transformers inference interface to match DD
andompesta Oct 28, 2024
4872169
vae inference script dtype mapping
andompesta Oct 28, 2024
41ee44c
remove dtype checks as multiple can be active
andompesta Oct 28, 2024
a31161d
by default tf32 always active
andompesta Oct 28, 2024
3b91c51
fix trt engine names
andompesta Nov 11, 2024
4ebca7d
add wrapper for fluxmodel to match DD onnx configuration
andompesta Nov 11, 2024
3e9f64f
add autocast back in to match DD setup
andompesta Nov 11, 2024
bb82e4b
fix dependencies for trt support
andompesta Nov 14, 2024
830358e
support trt
andompesta Nov 14, 2024
cdce3a3
add explicit kwargs
andompesta Nov 14, 2024
b789e05
vscode setup
andompesta Nov 14, 2024
8b07e6e
add setup instructions for trt
andompesta Nov 14, 2024
5ffd6d6
`trt` dependencies not part of `all`
andompesta Nov 14, 2024
766d878
from onnx_exporter to exporter
andompesta Nov 14, 2024
6d83690
hide onnx parameters
andompesta Nov 14, 2024
2458486
from onnx-exporter to exporter
andompesta Nov 14, 2024
80a52d7
exporter responsible for building trt engine and onnx export
andompesta Nov 14, 2024
adf2d46
hide onnx parameter
andompesta Nov 14, 2024
e82311f
remove build function from engine class
andompesta Nov 14, 2024
17f6562
remove unused import
andompesta Nov 14, 2024
2512bb2
remove space
andompesta Nov 14, 2024
86614a3
manage t5 and vae separately
andompesta Nov 14, 2024
f14de69
disable autocast
andompesta Nov 14, 2024
3410d34
stronglytyped t5
andompesta Nov 14, 2024
2422538
fix input type and max image size
andompesta Nov 14, 2024
9bef65b
max image size
andompesta Nov 14, 2024
a3bd8fc
T5 not strongly typed
andompesta Nov 14, 2024
e615fa0
testing
andompesta Nov 14, 2024
611efed
fix torch synchronize problem
andompesta Nov 14, 2024
13b1016
don't build already present engines
andompesta Nov 14, 2024
01b508c
remove torch save
andompesta Nov 14, 2024
f57b5a5
removed onnx dependencies
andompesta Nov 14, 2024
9cffa24
add trt dependencies
andompesta Nov 14, 2024
63e29cc
remove trt dependencies from toml
andompesta Nov 14, 2024
c978cc3
rename requirements and fix readme
andompesta Nov 14, 2024
3087c60
remove unused files
andompesta Nov 14, 2024
5c2cba1
fix import format
andompesta Nov 14, 2024
08fbb60
remove comments
andompesta Nov 14, 2024
1b4a41a
add gitignore
andompesta Nov 15, 2024
a404144
reset dependencies
andompesta Nov 15, 2024
a8b8478
add hidden setup files
andompesta Nov 15, 2024
8fa1d22
solve ruff check
andompesta Nov 15, 2024
3f20508
fix imports with ruff
andompesta Nov 15, 2024
7662313
run ruff formatter
andompesta Nov 15, 2024
4691502
update gitignore
andompesta Nov 15, 2024
deb5633
simplify dependencies
andompesta Nov 18, 2024
1de2799
remove gitignore
andompesta Nov 18, 2024
64cbb8f
add cli formatting
andompesta Nov 18, 2024
fd1455e
fix import orders
andompesta Nov 18, 2024
095ee89
Merge pull request #1 from andompesta/add-trt-support-push
andompesta Nov 18, 2024
3d3741e
simplify dependencies
andompesta Nov 18, 2024
f31ffd4
solve vae quality issue
andompesta Nov 26, 2024
728c018
Merge branch 'main' of https://github.com/black-forest-labs/flux
andompesta Nov 26, 2024
1cd9476
Merge branch 'main' into add-trt-support
andompesta Nov 26, 2024
bee6c45
Merge branch 'main' into add-trt-support-cli-conflict
andompesta Nov 26, 2024
f80058f
fix ruff format
andompesta Nov 26, 2024
079778f
fix merge changes
andompesta Nov 26, 2024
a5986b5
format and sort src/flux/cli
andompesta Nov 26, 2024
c7fdb64
fix merge conflicts
andompesta Nov 26, 2024
74c4c7a
Merge pull request #2 from andompesta/add-trt-support-cli-conflict
andompesta Nov 26, 2024
64 changes: 64 additions & 0 deletions README.md
@@ -15,6 +15,19 @@
source .venv/bin/activate
pip install -e ".[all]"
```

## Local installation with TRT support

```bash
docker pull nvcr.io/nvidia/pytorch:24.10-py3
cd $HOME && git clone https://github.com/black-forest-labs/flux
cd $HOME/flux
docker run --rm -it --gpus all -v $PWD:/workspace/flux nvcr.io/nvidia/pytorch:24.10-py3 /bin/bash
# inside container
cd /workspace/flux
pip install -e ".[all]"
pip install -r trt_requirements.txt
```
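With the container running, TRT inference is switched on through the new `trt` flag this PR adds to `src/flux/cli.py`. Below is a minimal sketch of driving it from Python; the prompt and sizes are illustrative, and engines are read from or built into the directories given by `TRT_ENGINE_DIR`/`ONNX_DIR` (defaulting to `./engines` and `./onnx`):

```python
# Sketch: call the TRT-enabled CLI entry point directly.
# Assumes the main(...) signature added in this PR; the first run builds the engines.
from flux.cli import main

main(
    name="flux-dev",
    width=1024,
    height=1024,
    prompt="a photo of a forest with mist",  # illustrative prompt
    trt=True,  # swap clip/t5/transformer/vae for TensorRT engines
)
```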

### Models

We are offering an extensive suite of models. For more information about the individual models, please refer to the link under **Usage**.
@@ -40,6 +53,57 @@

The weights of the autoencoder are also released under [apache-2.0](https://huggingface.co/datasets/choosealicense/licenses/blob/main/markdown/apache-2.0.md) and can be found in the HuggingFace repos above.

We also offer a Gradio-based demo for an interactive experience. To run the Gradio demo:

```bash
python demo_gr.py --name flux-schnell --device cuda
```

Options:

- `--name`: Choose the model to use (options: "flux-schnell", "flux-dev")
- `--device`: Specify the device to use (default: "cuda" if available, otherwise "cpu")
- `--offload`: Offload model to CPU when not in use
- `--share`: Create a public link to your demo

To run the demo with the dev model and create a public link:

```bash
python demo_gr.py --name flux-dev --share
```

## Diffusers integration

`FLUX.1 [schnell]` and `FLUX.1 [dev]` are integrated with the [🧨 diffusers](https://github.com/huggingface/diffusers) library. To use them with diffusers, install the library:

```shell
pip install git+https://github.com/huggingface/diffusers.git
```

Then you can use `FluxPipeline` to run the model:

```python
import torch
from diffusers import FluxPipeline

model_id = "black-forest-labs/FLUX.1-schnell"  # you can also use `black-forest-labs/FLUX.1-dev`

pipe = FluxPipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16)
pipe.enable_model_cpu_offload()  # save some VRAM by offloading the model to CPU; remove this if you have enough GPU power

prompt = "A cat holding a sign that says hello world"
seed = 42
image = pipe(
    prompt,
    output_type="pil",
    num_inference_steps=4,  # use a larger number if you are using [dev]
    generator=torch.Generator("cpu").manual_seed(seed),
).images[0]
image.save("flux-schnell.png")
```

To learn more, check out the [diffusers](https://huggingface.co/docs/diffusers/main/en/api/pipelines/flux) documentation.

## API usage

Our API offers access to our models. It is documented here:
3 changes: 0 additions & 3 deletions demo_gr.py
@@ -15,7 +15,6 @@

NSFW_THRESHOLD = 0.85


def get_models(name: str, device: torch.device, offload: bool, is_schnell: bool):
t5 = load_t5(device, max_length=256 if is_schnell else 512)
clip = load_clip(device)
@@ -24,7 +23,6 @@ def get_models(name: str, device: torch.device, offload: bool, is_schnell: bool)
nsfw_classifier = pipeline("image-classification", model="Falconsai/nsfw_image_detection", device=device)
return model, ae, t5, clip, nsfw_classifier


class FluxGenerator:
def __init__(self, model_name: str, device: str, offload: bool):
self.device = torch.device(device)
@@ -153,7 +151,6 @@ def generate_image(
exif_data[ExifTags.Base.Model] = self.model_name
if add_sampling_metadata:
exif_data[ExifTags.Base.ImageDescription] = prompt

img.save(filename, format="jpeg", exif=exif_data, quality=95, subsampling=0)

return img, str(opts.seed), filename, None
12 changes: 9 additions & 3 deletions src/flux/api.py
@@ -146,7 +146,9 @@ def request(self):
)
result = response.json()
if response.status_code != 200:
- raise ApiException(status_code=response.status_code, detail=result.get("detail"))
+ raise ApiException(
+     status_code=response.status_code, detail=result.get("detail")
+ )
self.request_id = response.json()["id"]

def retrieve(self) -> dict:
@@ -168,13 +170,17 @@ def retrieve(self) -> dict:
)
result = response.json()
if "status" not in result:
- raise ApiException(status_code=response.status_code, detail=result.get("detail"))
+ raise ApiException(
+     status_code=response.status_code, detail=result.get("detail")
+ )
elif result["status"] == "Ready":
self.result = result["result"]
elif result["status"] == "Pending":
time.sleep(0.5)
else:
- raise ApiException(status_code=200, detail=f"API returned status '{result['status']}'")
+ raise ApiException(
+     status_code=200, detail=f"API returned status '{result['status']}'"
+ )
return self.result

@property
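For context, the polling logic patched above belongs to the `ImageRequest` class defined in this file. A minimal usage sketch, assuming a valid `BFL_API_KEY` in the environment:

```python
# Sketch: request an image from the API and block until it is ready.
from flux.api import ImageRequest

request = ImageRequest("A beautiful beach", name="flux.1.1-pro")
request.save("beach.jpg")  # retrieve() polls through the Ready/Pending states shown above
```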
58 changes: 58 additions & 0 deletions src/flux/cli.py
@@ -5,10 +5,12 @@
from glob import iglob

import torch
from cuda import cudart
from fire import Fire
from transformers import pipeline

from flux.sampling import denoise, get_noise, get_schedule, prepare, unpack
from flux.trt.trt_manager import TRTManager
from flux.util import configs, load_ae, load_clip, load_flow_model, load_t5, save_image

NSFW_THRESHOLD = 0.85
@@ -108,6 +110,8 @@ def main(
offload: bool = False,
output_dir: str = "output",
add_sampling_metadata: bool = True,
trt: bool = False,
**kwargs: dict | None,
):
"""
Sample the flux model. Either interactively (set `--loop`) or run for a
@@ -126,6 +130,8 @@
loop: start an interactive session and sample multiple times
guidance: guidance value used for guidance distillation
add_sampling_metadata: Add the prompt to the image Exif metadata
trt: use TensorRT backend for optimized inference
kwargs: additional arguments for TensorRT support
"""
nsfw_classifier = pipeline("image-classification", model="Falconsai/nsfw_image_detection", device=device)

@@ -158,6 +164,58 @@
model = load_flow_model(name, device="cpu" if offload else torch_device)
ae = load_ae(name, device="cpu" if offload else torch_device)

if trt:
# offload to CPU to save memory
ae = ae.cpu()
model = model.cpu()
clip = clip.cpu()
t5 = t5.cpu()

torch.cuda.empty_cache()

trt_ctx_manager = TRTManager(
bf16=True,
device=torch_device,
)

engines = trt_ctx_manager.load_engines(
models={
"clip": clip,
"transformer": model,
"t5": t5,
"vae": ae,
},
engine_dir=os.environ.get("TRT_ENGINE_DIR", "./engines"),
onnx_dir=os.environ.get("ONNX_DIR", "./onnx"),
opt_image_height=height,
opt_image_width=width,
)

torch.cuda.synchronize()

trt_ctx_manager.init_runtime()
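# cudaStreamCreate returns a (status, stream) tuple; keep only the stream handle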
stream = cudart.cudaStreamCreate()[1]

for engine in engines.values():
engine.load(stream)

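# reserve one device buffer sized for the most demanding engine; all contexts share it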
calculate_max_device_memory = trt_ctx_manager.calculate_max_device_memory(engines)
_, shared_device_memory = cudart.cudaMalloc(calculate_max_device_memory)

for engine_name, engine in engines.items():
engine.activate(shared_device_memory)
shape_dict = engine.get_shape_dict(
batch_size=1,
image_height=height,
image_width=width,
)
engine.allocate_buffers(shape_dict, device=torch_device)

ae = engines["vae"]
model = engines["transformer"]
clip = engines["clip"]
t5 = engines["t5"]

rng = torch.Generator(device="cpu")
opts = SamplingOptions(
prompt=prompt,
2 changes: 1 addition & 1 deletion src/flux/math.py
@@ -14,7 +14,7 @@ def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor) -> Tensor:

def rope(pos: Tensor, dim: int, theta: int) -> Tensor:
assert dim % 2 == 0
- scale = torch.arange(0, dim, 2, dtype=torch.float64, device=pos.device) / dim
+ scale = torch.arange(0, dim, 2, dtype=pos.dtype, device=pos.device) / dim
omega = 1.0 / (theta**scale)
out = torch.einsum("...n,d->...nd", pos, omega)
out = torch.stack([torch.cos(out), -torch.sin(out), torch.sin(out), torch.cos(out)], dim=-1)
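The one-line change above is likely export-motivated: TensorRT engines do not support float64, so deriving the frequency scale from `pos.dtype` keeps the traced graph strongly typed, at some precision cost when `pos` is bf16/fp16. A toy illustration of the pattern (standalone, not this repo's code):

```python
import torch

def rope_scale(pos: torch.Tensor, dim: int = 8) -> torch.Tensor:
    # frequencies follow the dtype of `pos`, matching the patched rope()
    return torch.arange(0, dim, 2, dtype=pos.dtype, device=pos.device) / dim

pos = torch.zeros(4, dtype=torch.bfloat16)
print(rope_scale(pos).dtype)  # torch.bfloat16 -> no float64 node in the exported graph
```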
6 changes: 6 additions & 0 deletions src/flux/modules/autoencoder.py
@@ -235,6 +235,9 @@
self.conv_out = nn.Conv2d(block_in, out_ch, kernel_size=3, stride=1, padding=1)

def forward(self, z: Tensor) -> Tensor:
# get dtype for proper tracing
upscale_dtype = next(self.up.parameters()).dtype

# z to block_in
h = self.conv_in(z)

@@ -243,6 +246,8 @@ def forward(self, z: Tensor) -> Tensor:
h = self.mid.attn_1(h)
h = self.mid.block_2(h)

# cast to proper dtype
h = h.to(upscale_dtype)
# upsampling
for i_level in reversed(range(self.num_resolutions)):
for i_block in range(self.num_res_blocks + 1):
@@ -277,6 +282,7 @@ def forward(self, z: Tensor) -> Tensor:
class AutoEncoder(nn.Module):
def __init__(self, params: AutoEncoderParams):
super().__init__()
self.params = params
self.encoder = Encoder(
resolution=params.resolution,
in_channels=params.in_channels,
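The decoder change above pins the upsampling path to the dtype of its own weights, which is what lets the autoencoder run its early blocks in one precision while the rest of the pipeline uses another (see the "AE should be traced in TF32" commits). A minimal standalone sketch of the pattern (toy module, not this repo's code):

```python
import torch
import torch.nn as nn

class TinyDecoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv_in = nn.Conv2d(4, 8, kernel_size=3, padding=1)  # stays fp32
        self.up = nn.Conv2d(8, 8, kernel_size=3, padding=1).to(torch.bfloat16)

    def forward(self, z: torch.Tensor) -> torch.Tensor:
        # read the target dtype once so tracing records a single explicit cast
        upscale_dtype = next(self.up.parameters()).dtype
        h = self.conv_in(z)
        h = h.to(upscale_dtype)  # cast before entering the upsampling path
        return self.up(h)

print(TinyDecoder()(torch.randn(1, 4, 32, 32)).dtype)  # torch.bfloat16
```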
Empty file added src/flux/trt/__init__.py
29 changes: 29 additions & 0 deletions src/flux/trt/engine/__init__.py
@@ -0,0 +1,29 @@
#
# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from flux.trt.engine.base_engine import BaseEngine
from flux.trt.engine.clip_engine import CLIPEngine
from flux.trt.engine.t5_engine import T5Engine
from flux.trt.engine.transformer_engine import TransformerEngine
from flux.trt.engine.vae_engine import VAEEngine

__all__ = [
"BaseEngine",
"CLIPEngine",
"TransformerEngine",
"T5Engine",
"VAEEngine",
]