#5337: Merge branch 'main' into mistral_model_weights
mtairum committed Jun 5, 2024
2 parents 0aa1a20 + a409944 commit 5094e68
Showing 143 changed files with 2,544 additions and 2,818 deletions.
18 changes: 9 additions & 9 deletions README.md
@@ -24,10 +24,10 @@

| Model | Batch | End-to-end throughput [1] | Device throughput [2] | Target |
|---------------------------------------------------------- |---------------------|------------------------------|-----------------------------|-------------------------------------|
| [ResNet-50](./models/demos/resnet) (fps) | 20 | 2,850 | 7,200 | 10,000 |
| [ResNet-50](./models/demos/resnet) (fps) | 20 | 4,400 | 7,700 | 10,000 |
| [BERT-Large](./models/demos/bert) (sen/s) | 12 | 362 | 406 | 410 |
| [Falcon7B-decode](./models/demos/ttnn_falcon7b) (t/s) | 32 | 135 | 135 | 140 |
| [ViT](./models/demos/grayskull/vit) (fps) | 8 | 480 | 1570 | 2000 |
| [ViT](./models/demos/grayskull/vit) (fps) | 8 | 860 | 1570 | 2000 |
| [T5 small](./models/demos/grayskull/t5) (sen/s) | | 140 | | |
| [Bloom](./models/demos/grayskull/functional_bloom) (sen/s) | | 70 | | |
| U-Net | coming soon | | | |
@@ -42,13 +42,13 @@
>
> All model demos in this table function on both N150 and N300 Wormhole cards, unless otherwise stated.
| Model | Gen. Token [3] | Batch | End-to-end throughput [1] | Device throughput [2] | Target |
|-------------------------------------------------------------|--------------------|----------------------|------------------------------|-----------------------------|----------------|
| [Falcon7B-decode](./models/demos/wormhole/falcon7b) | 129th | 32 | 11.6 t/s/u - 371 t/s | 15.4 t/s/u - 493 t/s | 21 t/s/u |
| [Mistral-7B-decode](./models/demos/wormhole/mistral7b) | 33rd | 32 | 10.9 t/s/u - 349 t/s | 13.3 t/s/u - 426 t/s | 21 t/s/u |
| [Mamba-2.8B-decode](./models/demos/mamba) | any | 32 | 9.2 t/s/u - 295 t/s | 13.1 t/s/u - 419 t/s | 22 t/s/u |
| [BERT-Large](./models/demos/metal_BERT_large_11/) (sen/s) [4] | any | 8 | 270 | 340 | 400 |
| [Stable Diffusion 1.4](./models/demos/wormhole/stable_diffusion) 512x512 (sec/img) | | 1 | 8s | 5s | |
| Model | Gen. Token [3] | Batch | End-to-end throughput [1] | Device throughput [2] | Target |
|--------------------------------------------------------------------------------------|--------------------|----------------------|------------------------------|-----------------------------|----------------|
| [Falcon7B-decode](./models/demos/wormhole/falcon7b) | 129th | 32 | 11.6 t/s/u - 371 t/s | 15.4 t/s/u - 493 t/s | 21 |
| [Mistral-7B-decode](./models/demos/wormhole/mistral7b) | 33rd | 32 | 10.9 t/s/u - 349 t/s | 13.3 t/s/u - 426 t/s | 21 |
| [Mamba-2.8B-decode](./models/demos/mamba) | any | 32 | 9.2 t/s/u - 295 t/s | 13.1 t/s/u - 419 t/s | 22 |
| [BERT-Large](./models/demos/metal_BERT_large_11/) (sen/s) [4] | | 8 | 270 | 340 | 400 |
| [Stable Diffusion 1.4](./models/demos/wormhole/stable_diffusion) 512x512 (sec/img) | | 1 | 8 | 5 | |

[1] - Observed from the host. Includes dispatch overhead and kernel execution time.
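
As a quick reading aid (not part of the README itself), the aggregate decode throughput is simply the per-user figure multiplied by the batch size; the sketch below checks this against the Falcon7B-decode row.

```python
# Illustrative only: aggregate throughput = per-user throughput x batch size.
batch_size = 32
tokens_per_sec_per_user = 11.6          # Falcon7B-decode end-to-end, from the table
aggregate = tokens_per_sec_per_user * batch_size
print(f"{aggregate:.0f} t/s")           # ~371 t/s, matching the table entry
```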

9 changes: 0 additions & 9 deletions conftest.py
@@ -326,9 +326,6 @@ def device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0):
except (ValueError, AttributeError):
num_devices_requested = len(device_ids)

if num_devices_requested <= 1:
pytest.skip("Requires multiple devices to run")

device_mesh = ttnn.open_device_mesh(ttnn.DeviceGrid(1, num_devices_requested), device_ids[:num_devices_requested])

logger.debug(f"multidevice with {device_mesh.get_num_devices()} devices is created")
@@ -354,9 +354,6 @@ def pcie_device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0):
except (ValueError, AttributeError):
num_pcie_devices_requested = len(device_ids)

if num_pcie_devices_requested <= 1:
pytest.skip("Requires multiple devices to run")

device_mesh = ttnn.open_device_mesh(
ttnn.DeviceGrid(1, num_pcie_devices_requested), device_ids[:num_pcie_devices_requested]
)
@@ -386,9 +380,6 @@ def t3k_device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0):
except (ValueError, AttributeError):
num_devices_requested = len(device_ids)

if num_devices_requested <= 1:
pytest.skip("Requires multiple devices to run")

device_mesh = ttnn.open_device_mesh(ttnn.DeviceGrid(1, num_devices_requested), device_ids[:num_devices_requested])

logger.debug(f"multidevice with {device_mesh.get_num_devices()} devices is created")
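
With the single-device skip removed above, these mesh fixtures now run even when only one device is requested. A minimal, hypothetical consumer of the fixture (illustrative only; it uses just the `get_num_devices()` call visible in the diff) might look like:

```python
# Hypothetical test using the device_mesh fixture defined in conftest.py above.
def test_mesh_is_created(device_mesh):
    # After this change the fixture no longer skips on single-device systems,
    # so a 1-device mesh is a valid outcome.
    assert device_mesh.get_num_devices() >= 1
```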
@@ -1,4 +1,4 @@
# Second stage: the actual image
# TT-METAL UBUNTU 20.04 AMD64 DOCKERFILE
FROM ubuntu:20.04

ARG DEBIAN_FRONTEND=noninteractive
@@ -25,16 +25,19 @@ RUN /bin/bash /opt/tt_metal_infra/scripts/docker/install_test_deps.sh ${GTEST_VE
COPY /scripts /opt/tt_metal_infra/scripts
COPY build_metal.sh /scripts/build_metal.sh

# ENV TT_METAL_INFRA_DIR=/opt/tt_metal_infra
# ENV PYTHON_ENV_DIR=${TT_METAL_INFRA_DIR}/tt-metal/python_env
# RUN python3 -m venv $PYTHON_ENV_DIR
# Setup Env variables to setup Python Virtualenv - Install TT-Metal Python deps
ENV TT_METAL_INFRA_DIR=/opt/tt_metal_infra
ENV PYTHON_ENV_DIR=${TT_METAL_INFRA_DIR}/tt-metal/python_env
RUN python3 -m venv $PYTHON_ENV_DIR
ENV PATH="$PYTHON_ENV_DIR/bin:$PATH"

# COPY /docs/requirements-docs.txt ${TT_METAL_INFRA_DIR}/tt-metal/docs/.
# COPY /tt_metal/python_env/* ${TT_METAL_INFRA_DIR}/tt-metal/tt_metal/python_env/.
# ENV PATH="$PYTHON_ENV_DIR/bin:$PATH"
# RUN python3 -m pip config set global.extra-index-url https://download.pytorch.org/whl/cpu \
# && python3 -m pip install setuptools wheel
# Copy requirements from tt-metal folders with requirements.txt docs
COPY /docs/requirements-docs.txt ${TT_METAL_INFRA_DIR}/tt-metal/docs/.
COPY /tt_metal/python_env/* ${TT_METAL_INFRA_DIR}/tt-metal/tt_metal/python_env/.
RUN python3 -m pip config set global.extra-index-url https://download.pytorch.org/whl/cpu \
&& python3 -m pip install setuptools wheel

# RUN python3 -m pip install -r ${TT_METAL_INFRA_DIR}/tt-metal/tt_metal/python_env/requirements-dev.txt
RUN python3 -m pip install -r ${TT_METAL_INFRA_DIR}/tt-metal/tt_metal/python_env/requirements-dev.txt
RUN python3 -m pip install -r ${TT_METAL_INFRA_DIR}/tt-metal/docs/requirements-docs.txt

CMD ["tail", "-f", "/dev/null"]
26 changes: 16 additions & 10 deletions models/demos/mamba/demo/demo.py
@@ -28,13 +28,8 @@ def get_tt_metal_model(
from models.demos.mamba.tt import model_config

reference_model = get_cpu_reference_model(version, batch_size=batch_size)
if cache_dir:
cache_path = model_config.get_weights_cache_path(version, cache_dir)
else:
cache_path = None

config = model_config.create_model_config(batch_size, reference_model.args.d_model)
model = MambaTT(reference_model, device, config, tt_cache_path=cache_path)
model = MambaTT(reference_model, device, config, tt_cache_path=cache_dir)

return model

@@ -89,6 +84,7 @@ def run_mamba_demo(
assert batch_size == len(prompts), "32 prompts are required"

logger.info(f"Running Mamba demo (weights='{model_version}') with batch={batch_size}")
logger.info(f"Using tensor cache at '{cache_dir}'")

model = get_tt_metal_model(model_version, device, cache_dir, batch_size)

@@ -129,8 +125,18 @@ def run_mamba_demo(


@pytest.mark.parametrize(
"max_gen_len",
([100]),
"model_version, max_gen_len",
(
(
"state-spaces/mamba-2.8b-slimpj",
100,
),
),
)
def test_demo(user_input, device, use_program_cache, max_gen_len):
return run_mamba_demo(prompts=user_input, device=device, generated_sequence_length=max_gen_len)
def test_demo(user_input, device, use_program_cache, get_tt_cache_path, model_version, max_gen_len):
return run_mamba_demo(
prompts=user_input,
device=device,
cache_dir=get_tt_cache_path(model_version),
generated_sequence_length=max_gen_len,
)
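
The updated tests take their weight-cache location from a `get_tt_cache_path` fixture rather than a `cache_dir` parameter. The fixture itself is not defined in this diff; a minimal sketch of what such a fixture could look like is shown below (the `TT_CACHE_ROOT` environment variable and directory layout are assumptions, not the repository's actual implementation).

```python
# Hypothetical sketch only -- the real get_tt_cache_path fixture lives
# elsewhere in the repository and may differ.
import os
import pathlib

import pytest


@pytest.fixture
def get_tt_cache_path():
    def _get(model_version: str) -> str:
        # Assumed layout: one cache directory per model version under a root
        # taken from an (assumed) TT_CACHE_ROOT environment variable.
        root = pathlib.Path(os.environ.get("TT_CACHE_ROOT", "/tmp/tt_cache"))
        path = root / model_version.replace("/", "--")
        path.mkdir(parents=True, exist_ok=True)
        return str(path)

    return _get
```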
40 changes: 26 additions & 14 deletions models/demos/mamba/tests/test_full_model.py
@@ -46,9 +46,9 @@ def run_inference(
model_version: MambaPretrainedModelName,
batch: int,
pcc: float,
cache_dir: Optional[str],
num_layers: int,
iterations: int,
cache_dir: Optional[str],
):
torch.manual_seed(10)

@@ -64,13 +64,8 @@
with torch.no_grad():
reference_output = mamba_model_pytorch(input_ids)

if cache_dir:
cache_path = model_config.get_weights_cache_path(model_version, cache_dir)
else:
cache_path = None

config = model_config.create_model_config(batch, reference_model.args.d_model)
mamba_model_tt = MambaTT(reference_model, device, config, tt_cache_path=cache_path, num_layers=num_layers)
mamba_model_tt = MambaTT(reference_model, device, config, tt_cache_path=cache_dir, num_layers=num_layers)

for _ in range(iterations):
tt_output = mamba_model_tt(input_ids)
@@ -87,13 +82,12 @@

@skip_for_grayskull("Not supported on Grayskull")
@pytest.mark.parametrize(
"model_version, batch, pcc, cache_dir, num_layers, iterations",
"model_version, batch, pcc, num_layers, iterations",
(
(
"state-spaces/mamba-2.8b",
32,
0.985,
None,
0.98,
64,
1,
),
@@ -102,14 +96,23 @@
def test_inference(
device: ttnn.Device,
use_program_cache,
get_tt_cache_path,
model_version: MambaPretrainedModelName,
batch: int,
pcc: float,
cache_dir: Optional[str],
num_layers: int,
iterations: int,
):
run_inference(device, use_program_cache, model_version, batch, pcc, cache_dir, num_layers, iterations)
run_inference(
device,
use_program_cache,
model_version,
batch,
pcc,
num_layers,
iterations,
cache_dir=get_tt_cache_path(model_version),
)


@skip_for_grayskull("Not supported on Grayskull")
@@ -120,11 +123,20 @@ def test_inference(
def test_device_perf(
device: ttnn.Device,
use_program_cache,
get_tt_cache_path,
iterations,
model_version="state-spaces/mamba-2.8b",
batch=32,
pcc=0.97,
cache_dir=None,
num_layers=1,
):
run_inference(device, use_program_cache, model_version, batch, pcc, cache_dir, num_layers, iterations)
run_inference(
device,
use_program_cache,
model_version,
batch,
pcc,
num_layers,
iterations,
cache_dir=get_tt_cache_path(model_version),
)
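
These tests gate on a PCC (Pearson correlation coefficient) threshold between the reference and device outputs; the parametrization above uses 0.98. A minimal sketch of that kind of check is shown below; it is illustrative only and not the `comparison_funcs` helpers the tests actually import.

```python
# Illustrative PCC check; the tests above use the repository's own helpers.
import torch


def pcc(expected: torch.Tensor, actual: torch.Tensor) -> float:
    # Pearson correlation coefficient over the flattened tensors.
    x = expected.flatten().to(torch.float32)
    y = actual.flatten().to(torch.float32)
    return torch.corrcoef(torch.stack([x, y]))[0, 1].item()


def assert_pcc(expected: torch.Tensor, actual: torch.Tensor, threshold: float = 0.98) -> None:
    value = pcc(expected, actual)
    assert value >= threshold, f"PCC {value:.4f} is below threshold {threshold}"
```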
22 changes: 0 additions & 22 deletions models/demos/mamba/tests/test_full_model_loop.py

This file was deleted.

17 changes: 3 additions & 14 deletions models/demos/mamba/tests/test_mamba_block.py
@@ -10,7 +10,6 @@
from models.demos.mamba.tt.full_model import TtTensorLoader
from models.demos.mamba.reference.decode_model import MambaDecode, MambaPretrainedModelName
from models.demos.mamba.tt.mamba_block import TtMambaBlock
from models.demos.mamba.tt.transforms import MambaSsmBlockTransformer
from models.demos.mamba.tt import model_config
from tests.tt_eager.python_api_testing.sweep_tests.comparison_funcs import (
comp_allclose,
@@ -30,13 +29,12 @@ def forward(self, x):


@pytest.mark.parametrize(
"model_version, batch, pcc, cache_dir",
"model_version, batch, pcc",
(
(
"state-spaces/mamba-2.8b",
32,
0.99,
None,
),
),
)
@@ -46,7 +44,6 @@ def test_mamba_block_inference(
model_version: MambaPretrainedModelName,
batch: int,
pcc: float,
cache_dir: Optional[str],
):
torch.manual_seed(0)

@@ -63,19 +60,11 @@ def test_mamba_block_inference(
residual_block = reference_model.layers[LAYER_NUM]
assert not isinstance(residual_block, torch.Tensor), "Expected torch.Module"

if cache_dir:
cache_path = model_config.get_weights_cache_path(model_version, cache_dir)
else:
cache_path = None

config = model_config.create_model_config(batch, d_model)

loader = TtTensorLoader(reference_model.state_dict(), device, tt_cache_path=cache_path)
transformer = MambaSsmBlockTransformer(
device, batch, reference_model.args.d_inner, reference_model.args.d_state * 2
)
loader = TtTensorLoader(reference_model.state_dict(), device)

model = TtMambaBlock(reference_model.args, device, config, loader.get_tensor_loader(LAYER_NUM), transformer)
model = TtMambaBlock(reference_model.args, device, config, loader.get_tensor_loader(LAYER_NUM))
tt_input = input.view(1, 1, batch, d_model)
tt_input = ttnn.to_device(
ttnn.from_torch(tt_input, layout=ttnn.TILE_LAYOUT, dtype=ttnn.bfloat16),
15 changes: 11 additions & 4 deletions models/demos/mamba/tests/test_mamba_demo.py
@@ -7,8 +7,15 @@


@pytest.mark.parametrize(
"user_input, max_gen_len",
((["Hello World"], 2),),
"user_input, model_version, max_gen_len",
((["Hello World"], "state-spaces/mamba-2.8b-slimpj", 2),),
)
def test_demo(user_input, device, use_program_cache, max_gen_len):
return run_mamba_demo(prompts=user_input, device=device, generated_sequence_length=max_gen_len, display=False)
def test_demo(user_input, model_version, device, use_program_cache, get_tt_cache_path, max_gen_len):
return run_mamba_demo(
prompts=user_input,
model_version=model_version,
device=device,
generated_sequence_length=max_gen_len,
display=False,
cache_dir=get_tt_cache_path(model_version),
)
11 changes: 9 additions & 2 deletions models/demos/mamba/tests/test_mamba_perf.py
@@ -27,7 +27,14 @@
((32, 10, 12.5, 0.40),), # Issue 7816 Compile time
)
def test_mamba_e2e_perf(
device, batch, iterations, expected_compile_time, expected_inference_time, use_program_cache, reset_seeds
device,
batch,
iterations,
expected_compile_time,
expected_inference_time,
use_program_cache,
reset_seeds,
get_tt_cache_path,
):
model_version = "state-spaces/mamba-2.8b-slimpj"
display_decoded_seq = False
@@ -46,7 +53,7 @@ def test_mamba_e2e_perf(
profiler.end("pytorch_ref_model_setup")

profiler.start("tt_model_setup")
tt_model = get_tt_metal_model(model_version, device, cache_dir=None, batch_size=batch)
tt_model = get_tt_metal_model(model_version, device, cache_dir=get_tt_cache_path(model_version), batch_size=batch)
profiler.end("tt_model_setup")

sequences: torch.Tensor = tokenizer(prompts, return_tensors="pt", padding=True).input_ids
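
The perf test above distinguishes compile time from steady-state inference time (`expected_compile_time` vs `expected_inference_time`). A rough sketch of that measurement pattern is shown below; it uses plain timers rather than the repository's profiler, and `measure` is a hypothetical helper, not part of the test.

```python
# Rough sketch of the compile-vs-inference split using plain timers.
import time


def measure(model, inputs, iterations: int = 10):
    # The first call typically includes kernel compilation / program-cache warmup.
    start = time.perf_counter()
    model(inputs)
    first_call_time = time.perf_counter() - start

    # Later calls hit the program cache, approximating steady-state inference.
    start = time.perf_counter()
    for _ in range(iterations):
        model(inputs)
    per_iteration_time = (time.perf_counter() - start) / iterations

    return first_call_time, per_iteration_time
```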