#5337: Merge branch 'main' into mistral_model_weights
mtairum committed Jun 5, 2024
2 parents 0aa1a20 + a409944 commit 5094e68
Showing 143 changed files with 2,544 additions and 2,818 deletions.
18 changes: 9 additions & 9 deletions README.md
@@ -24,10 +24,10 @@

| Model | Batch | End-to-end throughput [1] | Device throughput [2] | Target |
|---------------------------------------------------------- |---------------------|------------------------------|-----------------------------|-------------------------------------|
| [ResNet-50](./models/demos/resnet) (fps) | 20 | 2,850 | 7,200 | 10,000 |
| [ResNet-50](./models/demos/resnet) (fps) | 20 | 4,400 | 7,700 | 10,000 |
| [BERT-Large](./models/demos/bert) (sen/s) | 12 | 362 | 406 | 410 |
| [Falcon7B-decode](./models/demos/ttnn_falcon7b) (t/s) | 32 | 135 | 135 | 140 |
| [ViT](./models/demos/grayskull/vit) (fps) | 8 | 480 | 1570 | 2000 |
| [ViT](./models/demos/grayskull/vit) (fps) | 8 | 860 | 1570 | 2000 |
| [T5 small](./models/demos/grayskull/t5) (sen/s) | | 140 | | |
| [Bloom](./models/demos/grayskull/functional_bloom) (sen/s) | | 70 | | |
| U-Net | coming soon | | | |
@@ -42,13 +42,13 @@
>
> All model demos in this table function on both N150 and N300 Wormhole cards, unless otherwise stated.
| Model | Gen. Token [3] | Batch | End-to-end throughput [1] | Device throughput [2] | Target |
|-------------------------------------------------------------|--------------------|----------------------|------------------------------|-----------------------------|----------------|
| [Falcon7B-decode](./models/demos/wormhole/falcon7b) | 129th | 32 | 11.6 t/s/u - 371 t/s | 15.4 t/s/u - 493 t/s | 21 t/s/u |
| [Mistral-7B-decode](./models/demos/wormhole/mistral7b) | 33rd | 32 | 10.9 t/s/u - 349 t/s | 13.3 t/s/u - 426 t/s | 21 t/s/u |
| [Mamba-2.8B-decode](./models/demos/mamba) | any | 32 | 9.2 t/s/u - 295 t/s | 13.1 t/s/u - 419 t/s | 22 t/s/u |
| [BERT-Large](./models/demos/metal_BERT_large_11/) (sen/s) [4] | any | 8 | 270 | 340 | 400 |
| [Stable Diffusion 1.4](./models/demos/wormhole/stable_diffusion) 512x512 (sec/img) | | 1 | 8s | 5s | |
| Model | Gen. Token [3] | Batch | End-to-end throughput [1] | Device throughput [2] | Target |
|--------------------------------------------------------------------------------------|--------------------|----------------------|------------------------------|-----------------------------|----------------|
| [Falcon7B-decode](./models/demos/wormhole/falcon7b) | 129th | 32 | 11.6 t/s/u - 371 t/s | 15.4 t/s/u - 493 t/s | 21 |
| [Mistral-7B-decode](./models/demos/wormhole/mistral7b) | 33rd | 32 | 10.9 t/s/u - 349 t/s | 13.3 t/s/u - 426 t/s | 21 |
| [Mamba-2.8B-decode](./models/demos/mamba) | any | 32 | 9.2 t/s/u - 295 t/s | 13.1 t/s/u - 419 t/s | 22 |
| [BERT-Large](./models/demos/metal_BERT_large_11/) (sen/s) [4] | | 8 | 270 | 340 | 400 |
| [Stable Diffusion 1.4](./models/demos/wormhole/stable_diffusion) 512x512 (sec/img) | | 1 | 8 | 5 | |

[1] - Observed from the host. Includes dispatch overhead and kernel execution time.
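
As a quick reading aid (not part of the README itself), the aggregate decode throughput is simply the per-user figure multiplied by the batch size; the sketch below checks this against the Falcon7B-decode row.

```python
# Illustrative only: aggregate throughput = per-user throughput x batch size.
batch_size = 32
tokens_per_sec_per_user = 11.6          # Falcon7B-decode end-to-end, from the table
aggregate = tokens_per_sec_per_user * batch_size
print(f"{aggregate:.0f} t/s")           # ~371 t/s, matching the table entry
```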

9 changes: 0 additions & 9 deletions conftest.py
@@ -326,9 +326,6 @@ def device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0):
except (ValueError, AttributeError):
num_devices_requested = len(device_ids)

if num_devices_requested <= 1:
pytest.skip("Requires multiple devices to run")

device_mesh = ttnn.open_device_mesh(ttnn.DeviceGrid(1, num_devices_requested), device_ids[:num_devices_requested])

logger.debug(f"multidevice with {device_mesh.get_num_devices()} devices is created")
@@ -354,9 +354,6 @@ def pcie_device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0):
except (ValueError, AttributeError):
num_pcie_devices_requested = len(device_ids)

if num_pcie_devices_requested <= 1:
pytest.skip("Requires multiple devices to run")

device_mesh = ttnn.open_device_mesh(
ttnn.DeviceGrid(1, num_pcie_devices_requested), device_ids[:num_pcie_devices_requested]
)
@@ -386,9 +380,6 @@ def t3k_device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0):
except (ValueError, AttributeError):
num_devices_requested = len(device_ids)

if num_devices_requested <= 1:
pytest.skip("Requires multiple devices to run")

device_mesh = ttnn.open_device_mesh(ttnn.DeviceGrid(1, num_devices_requested), device_ids[:num_devices_requested])

logger.debug(f"multidevice with {device_mesh.get_num_devices()} devices is created")
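
With the single-device skip removed above, these mesh fixtures now run even when only one device is requested. A minimal, hypothetical consumer of the fixture (illustrative only; it uses just the `get_num_devices()` call visible in the diff) might look like:

```python
# Hypothetical test using the device_mesh fixture defined in conftest.py above.
def test_mesh_is_created(device_mesh):
    # After this change the fixture no longer skips on single-device systems,
    # so a 1-device mesh is a valid outcome.
    assert device_mesh.get_num_devices() >= 1
```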
@@ -1,4 +1,4 @@
# Second stage: the actual image
# TT-METAL UBUNTU 20.04 AMD64 DOCKERFILE
FROM ubuntu:20.04

ARG DEBIAN_FRONTEND=noninteractive
@@ -25,16 +25,19 @@ RUN /bin/bash /opt/tt_metal_infra/scripts/docker/install_test_deps.sh ${GTEST_VE
COPY /scripts /opt/tt_metal_infra/scripts
COPY build_metal.sh /scripts/build_metal.sh

# ENV TT_METAL_INFRA_DIR=/opt/tt_metal_infra
# ENV PYTHON_ENV_DIR=${TT_METAL_INFRA_DIR}/tt-metal/python_env
# RUN python3 -m venv $PYTHON_ENV_DIR
# Setup Env variables to setup Python Virtualenv - Install TT-Metal Python deps
ENV TT_METAL_INFRA_DIR=/opt/tt_metal_infra
ENV PYTHON_ENV_DIR=${TT_METAL_INFRA_DIR}/tt-metal/python_env
RUN python3 -m venv $PYTHON_ENV_DIR
ENV PATH="$PYTHON_ENV_DIR/bin:$PATH"

# COPY /docs/requirements-docs.txt ${TT_METAL_INFRA_DIR}/tt-metal/docs/.
# COPY /tt_metal/python_env/* ${TT_METAL_INFRA_DIR}/tt-metal/tt_metal/python_env/.
# ENV PATH="$PYTHON_ENV_DIR/bin:$PATH"
# RUN python3 -m pip config set global.extra-index-url https://download.pytorch.org/whl/cpu \
# && python3 -m pip install setuptools wheel
# Copy requirements from tt-metal folders with requirements.txt docs
COPY /docs/requirements-docs.txt ${TT_METAL_INFRA_DIR}/tt-metal/docs/.
COPY /tt_metal/python_env/* ${TT_METAL_INFRA_DIR}/tt-metal/tt_metal/python_env/.
RUN python3 -m pip config set global.extra-index-url https://download.pytorch.org/whl/cpu \
&& python3 -m pip install setuptools wheel

# RUN python3 -m pip install -r ${TT_METAL_INFRA_DIR}/tt-metal/tt_metal/python_env/requirements-dev.txt
RUN python3 -m pip install -r ${TT_METAL_INFRA_DIR}/tt-metal/tt_metal/python_env/requirements-dev.txt
RUN python3 -m pip install -r ${TT_METAL_INFRA_DIR}/tt-metal/docs/requirements-docs.txt

CMD ["tail", "-f", "/dev/null"]
26 changes: 16 additions & 10 deletions models/demos/mamba/demo/demo.py
@@ -28,13 +28,8 @@ def get_tt_metal_model(
from models.demos.mamba.tt import model_config

reference_model = get_cpu_reference_model(version, batch_size=batch_size)
if cache_dir:
cache_path = model_config.get_weights_cache_path(version, cache_dir)
else:
cache_path = None

config = model_config.create_model_config(batch_size, reference_model.args.d_model)
model = MambaTT(reference_model, device, config, tt_cache_path=cache_path)
model = MambaTT(reference_model, device, config, tt_cache_path=cache_dir)

return model

@@ -89,6 +84,7 @@ def run_mamba_demo(
assert batch_size == len(prompts), "32 prompts are required"

logger.info(f"Running Mamba demo (weights='{model_version}') with batch={batch_size}")
logger.info(f"Using tensor cache at '{cache_dir}'")

model = get_tt_metal_model(model_version, device, cache_dir, batch_size)

@@ -129,8 +125,18 @@ def run_mamba_demo(


@pytest.mark.parametrize(
"max_gen_len",
([100]),
"model_version, max_gen_len",
(
(
"state-spaces/mamba-2.8b-slimpj",
100,
),
),
)
def test_demo(user_input, device, use_program_cache, max_gen_len):
return run_mamba_demo(prompts=user_input, device=device, generated_sequence_length=max_gen_len)
def test_demo(user_input, device, use_program_cache, get_tt_cache_path, model_version, max_gen_len):
return run_mamba_demo(
prompts=user_input,
device=device,
cache_dir=get_tt_cache_path(model_version),
generated_sequence_length=max_gen_len,
)
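
The updated tests take their weight-cache location from a `get_tt_cache_path` fixture rather than a `cache_dir` parameter. The fixture itself is not defined in this diff; a minimal sketch of what such a fixture could look like is shown below (the `TT_CACHE_ROOT` environment variable and directory layout are assumptions, not the repository's actual implementation).

```python
# Hypothetical sketch only -- the real get_tt_cache_path fixture lives
# elsewhere in the repository and may differ.
import os
import pathlib

import pytest


@pytest.fixture
def get_tt_cache_path():
    def _get(model_version: str) -> str:
        # Assumed layout: one cache directory per model version under a root
        # taken from an (assumed) TT_CACHE_ROOT environment variable.
        root = pathlib.Path(os.environ.get("TT_CACHE_ROOT", "/tmp/tt_cache"))
        path = root / model_version.replace("/", "--")
        path.mkdir(parents=True, exist_ok=True)
        return str(path)

    return _get
```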
40 changes: 26 additions & 14 deletions models/demos/mamba/tests/test_full_model.py
@@ -46,9 +46,9 @@ def run_inference(
model_version: MambaPretrainedModelName,
batch: int,
pcc: float,
cache_dir: Optional[str],
num_layers: int,
iterations: int,
cache_dir: Optional[str],
):
torch.manual_seed(10)

@@ -64,13 +64,8 @@
with torch.no_grad():
reference_output = mamba_model_pytorch(input_ids)

if cache_dir:
cache_path = model_config.get_weights_cache_path(model_version, cache_dir)
else:
cache_path = None

config = model_config.create_model_config(batch, reference_model.args.d_model)
mamba_model_tt = MambaTT(reference_model, device, config, tt_cache_path=cache_path, num_layers=num_layers)
mamba_model_tt = MambaTT(reference_model, device, config, tt_cache_path=cache_dir, num_layers=num_layers)

for _ in range(iterations):
tt_output = mamba_model_tt(input_ids)
@@ -87,13 +82,12 @@

@skip_for_grayskull("Not supported on Grayskull")
@pytest.mark.parametrize(
"model_version, batch, pcc, cache_dir, num_layers, iterations",
"model_version, batch, pcc, num_layers, iterations",
(
(
"state-spaces/mamba-2.8b",
32,
0.985,
None,
0.98,
64,
1,
),
@@ -102,14 +96,23 @@
def test_inference(
device: ttnn.Device,
use_program_cache,
get_tt_cache_path,
model_version: MambaPretrainedModelName,
batch: int,
pcc: float,
cache_dir: Optional[str],
num_layers: int,
iterations: int,
):
run_inference(device, use_program_cache, model_version, batch, pcc, cache_dir, num_layers, iterations)
run_inference(
device,
use_program_cache,
model_version,
batch,
pcc,
num_layers,
iterations,
cache_dir=get_tt_cache_path(model_version),
)


@skip_for_grayskull("Not supported on Grayskull")
@@ -120,11 +123,20 @@ def test_inference(
def test_device_perf(
device: ttnn.Device,
use_program_cache,
get_tt_cache_path,
iterations,
model_version="state-spaces/mamba-2.8b",
batch=32,
pcc=0.97,
cache_dir=None,
num_layers=1,
):
run_inference(device, use_program_cache, model_version, batch, pcc, cache_dir, num_layers, iterations)
run_inference(
device,
use_program_cache,
model_version,
batch,
pcc,
num_layers,
iterations,
cache_dir=get_tt_cache_path(model_version),
)
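
These tests gate on a PCC (Pearson correlation coefficient) threshold between the reference and device outputs; the parametrization above uses 0.98. A minimal sketch of that kind of check is shown below; it is illustrative only and not the `comparison_funcs` helpers the tests actually import.

```python
# Illustrative PCC check; the tests above use the repository's own helpers.
import torch


def pcc(expected: torch.Tensor, actual: torch.Tensor) -> float:
    # Pearson correlation coefficient over the flattened tensors.
    x = expected.flatten().to(torch.float32)
    y = actual.flatten().to(torch.float32)
    return torch.corrcoef(torch.stack([x, y]))[0, 1].item()


def assert_pcc(expected: torch.Tensor, actual: torch.Tensor, threshold: float = 0.98) -> None:
    value = pcc(expected, actual)
    assert value >= threshold, f"PCC {value:.4f} is below threshold {threshold}"
```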
22 changes: 0 additions & 22 deletions models/demos/mamba/tests/test_full_model_loop.py

This file was deleted.

17 changes: 3 additions & 14 deletions models/demos/mamba/tests/test_mamba_block.py
@@ -10,7 +10,6 @@
from models.demos.mamba.tt.full_model import TtTensorLoader
from models.demos.mamba.reference.decode_model import MambaDecode, MambaPretrainedModelName
from models.demos.mamba.tt.mamba_block import TtMambaBlock
from models.demos.mamba.tt.transforms import MambaSsmBlockTransformer
from models.demos.mamba.tt import model_config
from tests.tt_eager.python_api_testing.sweep_tests.comparison_funcs import (
comp_allclose,
@@ -30,13 +29,12 @@ def forward(self, x):


@pytest.mark.parametrize(
"model_version, batch, pcc, cache_dir",
"model_version, batch, pcc",
(
(
"state-spaces/mamba-2.8b",
32,
0.99,
None,
),
),
)
@@ -46,7 +44,6 @@ def test_mamba_block_inference(
model_version: MambaPretrainedModelName,
batch: int,
pcc: float,
cache_dir: Optional[str],
):
torch.manual_seed(0)

@@ -63,19 +60,11 @@ def test_mamba_block_inference(
residual_block = reference_model.layers[LAYER_NUM]
assert not isinstance(residual_block, torch.Tensor), "Expected torch.Module"

if cache_dir:
cache_path = model_config.get_weights_cache_path(model_version, cache_dir)
else:
cache_path = None

config = model_config.create_model_config(batch, d_model)

loader = TtTensorLoader(reference_model.state_dict(), device, tt_cache_path=cache_path)
transformer = MambaSsmBlockTransformer(
device, batch, reference_model.args.d_inner, reference_model.args.d_state * 2
)
loader = TtTensorLoader(reference_model.state_dict(), device)

model = TtMambaBlock(reference_model.args, device, config, loader.get_tensor_loader(LAYER_NUM), transformer)
model = TtMambaBlock(reference_model.args, device, config, loader.get_tensor_loader(LAYER_NUM))
tt_input = input.view(1, 1, batch, d_model)
tt_input = ttnn.to_device(
ttnn.from_torch(tt_input, layout=ttnn.TILE_LAYOUT, dtype=ttnn.bfloat16),
15 changes: 11 additions & 4 deletions models/demos/mamba/tests/test_mamba_demo.py
@@ -7,8 +7,15 @@


@pytest.mark.parametrize(
"user_input, max_gen_len",
((["Hello World"], 2),),
"user_input, model_version, max_gen_len",
((["Hello World"], "state-spaces/mamba-2.8b-slimpj", 2),),
)
def test_demo(user_input, device, use_program_cache, max_gen_len):
return run_mamba_demo(prompts=user_input, device=device, generated_sequence_length=max_gen_len, display=False)
def test_demo(user_input, model_version, device, use_program_cache, get_tt_cache_path, max_gen_len):
return run_mamba_demo(
prompts=user_input,
model_version=model_version,
device=device,
generated_sequence_length=max_gen_len,
display=False,
cache_dir=get_tt_cache_path(model_version),
)
11 changes: 9 additions & 2 deletions models/demos/mamba/tests/test_mamba_perf.py
@@ -27,7 +27,14 @@
((32, 10, 12.5, 0.40),), # Issue 7816 Compile time
)
def test_mamba_e2e_perf(
device, batch, iterations, expected_compile_time, expected_inference_time, use_program_cache, reset_seeds
device,
batch,
iterations,
expected_compile_time,
expected_inference_time,
use_program_cache,
reset_seeds,
get_tt_cache_path,
):
model_version = "state-spaces/mamba-2.8b-slimpj"
display_decoded_seq = False
@@ -46,7 +53,7 @@ def test_mamba_e2e_perf(
profiler.end("pytorch_ref_model_setup")

profiler.start("tt_model_setup")
tt_model = get_tt_metal_model(model_version, device, cache_dir=None, batch_size=batch)
tt_model = get_tt_metal_model(model_version, device, cache_dir=get_tt_cache_path(model_version), batch_size=batch)
profiler.end("tt_model_setup")

sequences: torch.Tensor = tokenizer(prompts, return_tensors="pt", padding=True).input_ids
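
The perf test above distinguishes compile time from steady-state inference time (`expected_compile_time` vs `expected_inference_time`). A rough sketch of that measurement pattern is shown below; it uses plain timers rather than the repository's profiler, and `measure` is a hypothetical helper, not part of the test.

```python
# Rough sketch of the compile-vs-inference split using plain timers.
import time


def measure(model, inputs, iterations: int = 10):
    # The first call typically includes kernel compilation / program-cache warmup.
    start = time.perf_counter()
    model(inputs)
    first_call_time = time.perf_counter() - start

    # Later calls hit the program cache, approximating steady-state inference.
    start = time.perf_counter()
    for _ in range(iterations):
        model(inputs)
    per_iteration_time = (time.perf_counter() - start) / iterations

    return first_call_time, per_iteration_time
```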