
Commit

Merge branch 'main' into sminakov/from_torch-p3
sminakov-tt authored Jan 10, 2025
2 parents a801a64 + a94c89e commit 81fa970
Showing 814 changed files with 16,367 additions and 5,587 deletions.
5 changes: 0 additions & 5 deletions .clang-format-ignore
@@ -2,7 +2,6 @@ tt_metal/distributed/mesh_device.cpp
tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_unpack_tilize_api.h
tt_metal/impl/buffers/buffer.cpp
tt_metal/impl/buffers/buffer.hpp
tt_metal/impl/device/device.cpp
tt_metal/impl/device/device.hpp
tt_metal/impl/dispatch/arch.cpp
tt_metal/impl/dispatch/arch.hpp
@@ -40,10 +39,8 @@ ttnn/cpp/ttnn/operations/bernoulli/device/bernoulli_device_operation.hpp
ttnn/cpp/ttnn/operations/ccl/all_gather/all_gather.cpp
ttnn/cpp/ttnn/operations/ccl/all_gather/device/all_gather_op.hpp
ttnn/cpp/ttnn/operations/ccl/all_gather/device/kernels/dataflow/worker_ring_gather_utils.hpp
ttnn/cpp/ttnn/operations/ccl/all_gather_v2/device/all_gather_op.cpp
ttnn/cpp/ttnn/operations/ccl/all_gather_v2/device/all_gather_op.hpp
ttnn/cpp/ttnn/operations/ccl/all_gather_v2/device/multi_core/all_gather_op_multi_core_new.cpp
ttnn/cpp/ttnn/operations/ccl/ccl_common.cpp
ttnn/cpp/ttnn/operations/ccl/ccl_common.hpp
ttnn/cpp/ttnn/operations/ccl/common/host/ccl_worker_builder.cpp
ttnn/cpp/ttnn/operations/ccl/common/host/ccl_worker_builder.hpp
@@ -58,8 +55,6 @@ ttnn/cpp/ttnn/operations/ccl/common/uops/ccl_command.hpp
ttnn/cpp/ttnn/operations/ccl/common/uops/ccl_command_device.hpp
ttnn/cpp/ttnn/operations/ccl/common/uops/ccl_host_commands.cpp
ttnn/cpp/ttnn/operations/ccl/common/uops/ccl_host_commands.hpp
ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.cpp
ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.hpp
ttnn/cpp/ttnn/operations/ccl/kernel_common/worker_edm_utils.hpp
ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/edm_fabric_worker_adapters.hpp
ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header.hpp
3 changes: 2 additions & 1 deletion .github/workflows/cpp-post-commit.yaml
@@ -66,8 +66,9 @@ jobs:
 
           {name: dispatch multicmd queue, cmd: "TT_METAL_GTEST_NUM_HW_CQS=2 ./build/test/tt_metal/unit_tests_dispatch --gtest_filter=MultiCommandQueue*Fixture.*"},
 
-          {name: ttnn cpp tests, cmd: ./build/test/ttnn/unit_tests_ttnn},
+          {name: ttnn cpp unit tests, cmd: ./build/test/ttnn/unit_tests_ttnn},
           {name: ttnn ccl cpp unit tests, cmd: ./build/test/ttnn/unit_tests_ttnn_ccl},
+          {name: ttnn tensor cpp unit tests, cmd: ./build/test/ttnn/unit_tests_ttnn_tensor},
         ]
     name: ${{ matrix.test-group.name }} ${{ inputs.arch }} ${{ inputs.runner-label }}
     env:
1 change: 1 addition & 0 deletions .github/workflows/tg-unit-tests-impl.yaml
@@ -43,6 +43,7 @@ jobs:
         { name: "TG unit tests", arch: wormhole_b0, model: unit, timeout: 30, owner_id: XXXXX}, # Add owner
         { name: "TG Llama3-small unit tests", arch: wormhole_b0, model: llama3-small, timeout: 45, owner_id: U06F3ER8X9A}, # Stuti Raizada
         { name: "TG Llama3-70b unit tests", arch: wormhole_b0, model: llama3-70b, timeout: 45, owner_id: U06F3ER8X9A}, # Stuti Raizada
+        { name: "TG DRAM Prefetcher unit tests", arch: wormhole_b0, model: prefetcher, timeout: 30, owner_id: U071CKL4AFK}, # Ammar Vora, Yu Gao
         ]
     name: ${{ matrix.test-group.name }}
     env:
27 changes: 25 additions & 2 deletions .github/workflows/ttnn-run-sweeps.yaml
@@ -29,6 +29,7 @@ on:
- eltwise.unary.hardsigmoid.hardsigmoid_pytorch2
- eltwise.unary.leaky_relu.leaky_relu_pytorch2
- eltwise.unary.abs.abs
- eltwise.unary.abs.abs_sharded
- eltwise.unary.abs.abs_forge
- eltwise.unary.cos.cos
- eltwise.unary.cos.cos_pytorch2
@@ -61,11 +62,15 @@ on:
- eltwise.unary.elu.elu
- eltwise.unary.elu.elu_pytorch2
- eltwise.unary.erfc.erfc
- eltwise.unary.erfc.erfc_sharded
- eltwise.unary.exp.exp
- eltwise.unary.exp.exp_sharded
- eltwise.unary.exp.exp_forge
- eltwise.unary.exp.exp_pytorch2
- eltwise.unary.exp2.exp2
- eltwise.unary.exp2.exp2_sharded
- eltwise.unary.expm1.expm1
- eltwise.unary.expm1.expm1_sharded
- eltwise.unary.tanh.tanh
- eltwise.unary.tanh.tanh_pytorch2
- eltwise.unary.tanh.tanh_forge
@@ -76,6 +81,7 @@ on:
- eltwise.unary.deg2rad.deg2rad
- eltwise.unary.relu6.relu6
- eltwise.unary.log.log
- eltwise.unary.log.log_sharded
- eltwise.unary.log.log_pytorch2
- eltwise.unary.log.log_forge
- eltwise.unary.log1p.log1p
@@ -99,6 +105,7 @@ on:
- eltwise.unary.neg.neg_forge
- eltwise.unary.erf.erf
- eltwise.unary.erfinv.erfinv
- eltwise.unary.erfinv.erfinv_sharded
- eltwise.unary.i0.i0
- eltwise.unary.silu.silu
- eltwise.unary.silu.silu_pytorch2
@@ -124,21 +131,30 @@ on:
- eltwise.unary.hardsigmoid.hardsigmoid_sharded
- eltwise.unary.hardshrink.hardshrink
- eltwise.unary.hardshrink.hardshrink_sharded
- eltwise.unary.softmax.softmax
- normalization.softmax.softmax
- normalization.softmax.softmax_sharded
- eltwise.unary.identity.identity
- eltwise.unary.identity.identity_sharded
- eltwise.unary.neg.neg
- eltwise.unary.neg.neg_sharded
- eltwise.unary.sinh.sinh
- eltwise.unary.sinh.sinh_sharded
- eltwise.unary.asinh.asinh
- eltwise.unary.cosh.cosh
- eltwise.unary.relu_min.relu_min
- eltwise.unary.relu_min.relu_min_sharded
- eltwise.unary.relu_max.relu_max
- eltwise.unary.relu_max.relu_max_sharded
- eltwise.unary.softplus.softplus
- eltwise.unary.softplus.softplus_sharded
- eltwise.unary.selu.selu
- eltwise.unary.selu.selu_sharded
- eltwise.unary.softshrink.softshrink_sharded
- eltwise.unary_backward.fill_zero_bw
- eltwise.unary_backward.log_sigmoid_bw
- eltwise.unary_backward.logit_bw
- eltwise.unary_backward.neg_bw
- eltwise.unary_backward.neg_bw.neg_bw
- eltwise.unary_backward.neg_bw.neg_bw_sharded
- eltwise.unary_backward.hardshrink_bw
- eltwise.unary_backward.softshrink_bw
- eltwise.unary_backward.acos_bw.acos_bw
@@ -149,7 +165,9 @@ on:
- eltwise.unary_backward.i0_bw.i0_bw
- eltwise.unary_backward.rad2deg_bw.rad2deg_bw
- eltwise.unary_backward.relu_bw.relu_bw
- eltwise.unary_backward.relu_bw.relu_bw_sharded
- eltwise.unary_backward.rsqrt_bw.rsqrt_bw
- eltwise.unary_backward.rsqrt_bw.rsqrt_bw_sharded
- eltwise.unary_backward.sigmoid_bw.sigmoid_bw
- eltwise.unary_backward.tan_bw.tan_bw
- eltwise.unary_backward.trunc_bw.trunc_bw
@@ -160,7 +178,9 @@ on:
- eltwise.unary_backward.threshold_bw.threshold_bw
- eltwise.unary_backward.div_bw.div_bw
- eltwise.unary_backward.log_bw.log_bw
- eltwise.unary_backward.log_bw.log_bw_sharded
- eltwise.unary_backward.relu6_bw.relu6_bw
- eltwise.unary_backward.relu6_bw.relu6_bw_sharded
- eltwise.unary_backward.log10_bw.log10_bw
- eltwise.unary_backward.abs_bw.abs_bw
- eltwise.unary_backward.sinh_bw.sinh_bw
@@ -179,6 +199,7 @@ on:
- eltwise.unary_backward.lgamma_bw.lgamma_bw
- eltwise.unary_backward.multigammaln_bw.multigammaln_bw
- eltwise.unary_backward.leaky_relu_bw.leaky_relu_bw
- eltwise.unary_backward.leaky_relu_bw.leaky_relu_bw_sharded
- eltwise.unary_backward.elu_bw.elu_bw
- eltwise.unary_backward.celu_bw.celu_bw
- eltwise.unary_backward.selu_bw.selu_bw
@@ -226,10 +247,12 @@ on:
- eltwise.unary.lez.lez
- eltwise.unary.nez.nez
- eltwise.unary.prelu.prelu
- eltwise.unary.prelu.prelu_sharded
- eltwise.unary.hardswish.hardswish_pytorch2
- eltwise.unary.hardtanh.hardtanh_pytorch2
- eltwise.unary.leaky_relu.leaky_relu
- eltwise.unary.reglu.reglu
- eltwise.unary.round.round_sharded
- eltwise.unary_complex.polar.polar
- eltwise.unary_complex.angle.angle
- eltwise.unary_complex.polar_bw.polar_bw
2 changes: 0 additions & 2 deletions CMakeLists.txt
@@ -207,8 +207,6 @@ target_link_libraries(
 if(NOT DEFINED ENV{ARCH_NAME})
     message(FATAL_ERROR "Please set ARCH_NAME to grayskull, wormhole_b0, or blackhole")
 endif(NOT DEFINED ENV{ARCH_NAME})
-string(TOUPPER "$ENV{ARCH_NAME}" ARCH_NAME_DEF)
-add_compile_definitions(ARCH_${ARCH_NAME_DEF})
 add_compile_options(
     -Werror
     -Wno-deprecated-declarations
1 change: 0 additions & 1 deletion CONTRIBUTING.md
@@ -6,7 +6,6 @@ Table of Contents
 - [Table of Contents](#table-of-contents)
 - [Contributing to tt-metal](#contributing-to-tt-metal)
 - [Machine setup](#machine-setup)
-  - [Hugepages setup](#hugepages-setup)
 - [Developing tt-metal](#developing-tt-metal)
 - [Setting logger level](#setting-logger-level)
 - [Building and viewing the documentation locally](#building-and-viewing-the-documentation-locally)
2 changes: 1 addition & 1 deletion README.md
@@ -121,7 +121,7 @@ Get started with [simple kernels](https://docs.tenstorrent.com/tt-metalium/lates
 - [Ethernet and Multichip Basics](./tech_reports/EthernetMultichip/BasicEthernetGuide.md) (Updated Sept 20th, 2024)
 - [Collective Communication Library (CCL)](./tech_reports/EthernetMultichip/CclDeveloperGuide.md) (Updated Sept 20th, 2024)
 - [Blackhole Bring-Up Programming Guide](./tech_reports/Blackhole/BlackholeBringUpProgrammingGuide.md) (Updated Dec 18th, 2024)
-- [Sub-Devices](./tech_reports/SubDevices/SubDevices.md) (Updated Jan 2nd, 2025)
+- [Sub-Devices](./tech_reports/SubDevices/SubDevices.md) (Updated Jan 7th, 2025)
 
 ## TT-Metalium Programming Examples
@@ -3,4 +3,4 @@
 DumpDeviceProfileResults
 ========================
 
-.. doxygenfunction:: tt::tt_metal::v0::DumpDeviceProfileResults(Device *device, const Program &program);
+.. doxygenfunction:: tt::tt_metal::v0::DumpDeviceProfileResults(IDevice *device, const Program &program);
@@ -5,9 +5,9 @@ Runtime Arguments
 
 .. doxygenfunction:: tt::tt_metal::v0::SetRuntimeArgs(const Program &program, KernelHandle kernel, const std::vector< CoreCoord > & core_spec, const std::vector< std::vector<uint32_t> > &runtime_args)
 
-.. doxygenfunction:: tt::tt_metal::v0::SetRuntimeArgs(Device* device, const std::shared_ptr<Kernel>& kernel, const std::variant<CoreCoord, CoreRange, CoreRangeSet> &core_spec, std::shared_ptr<RuntimeArgs> runtime_args)
+.. doxygenfunction:: tt::tt_metal::v0::SetRuntimeArgs(IDevice* device, const std::shared_ptr<Kernel>& kernel, const std::variant<CoreCoord, CoreRange, CoreRangeSet> &core_spec, std::shared_ptr<RuntimeArgs> runtime_args)
 
-.. doxygenfunction:: tt::tt_metal::v0::SetRuntimeArgs(Device* device, const std::shared_ptr<Kernel>& kernel, const std::vector< CoreCoord > & core_spec, const std::vector<std::shared_ptr<RuntimeArgs>>& runtime_args)
+.. doxygenfunction:: tt::tt_metal::v0::SetRuntimeArgs(IDevice* device, const std::shared_ptr<Kernel>& kernel, const std::vector< CoreCoord > & core_spec, const std::vector<std::shared_ptr<RuntimeArgs>>& runtime_args)
 
 .. doxygenfunction:: tt::tt_metal::v0::GetRuntimeArgs(const Program &program, KernelHandle kernel_id, const CoreCoord &logical_core)
@@ -29,7 +29,7 @@ Silicon accelerator setup
 .. code-block:: cpp
 
    constexpr int device_id = 0;
-   Device *device = CreateDevice(device_id);
+   auto device = CreateDevice(device_id);
 
 We instantiate a device to control our ``GRAYSKULL`` type
 accelerator.
23 changes: 12 additions & 11 deletions models/common/rmsnorm.py
@@ -81,17 +81,18 @@ def __init__(
             mesh_mapper=ttnn.ReplicateTensorToMesh(device) if is_mesh_device else None,
         )
 
-        self.weight_distributed = ttnn.as_tensor(
-            torch_weight,
-            device=device,
-            dtype=weight_dtype,
-            layout=ttnn.ROW_MAJOR_LAYOUT,
-            memory_config=weight_memory_config,
-            cache_file_name=cache_name,
-            mesh_mapper=ttnn.ShardTensor2dMesh(device, dims=(None, 2), mesh_shape=list(device.shape))
-            if is_mesh_device
-            else None,
-        )
+        if self.is_distributed:
+            self.weight_distributed = ttnn.as_tensor(
+                torch_weight,
+                device=device,
+                dtype=weight_dtype,
+                layout=ttnn.ROW_MAJOR_LAYOUT,
+                memory_config=weight_memory_config,
+                cache_file_name=cache_name,
+                mesh_mapper=ttnn.ShardTensor2dMesh(device, dims=(None, 2), mesh_shape=list(device.shape))
+                if is_mesh_device
+                else None,
+            )
 
         self.sharded_output_config = sharded_output_config
         self.sharded_program_config = sharded_program_config
17 changes: 8 additions & 9 deletions models/demos/llama3/README.md
@@ -15,20 +15,19 @@ All the above llama models (with the exception of 70B due to its large size) are
 - T3000 (8-chips)
 - TG (32-chips)
 
-Below is an updated table with max prefill context-length support for our demo. These were tested on both accuracy and performance mode.
-
-The main reason for a long context length not fitting on device is lack of memory memory. Any exceptions are marked in the table in appendix.
+**Max Context Lengths (text-only)**: All of the compatible model/device combinations support a max prefill context-length of 128k, with the exception of Llama3.1-8B and Llama3.2-11B on N150 which have a max of 64k (due to a lack of memory). To support these large max context-lengths, chunked prefill is performed with different max chunk sizes as shown in the table below.
 
 Max Prefill Chunk Sizes (text-only):
 | | N150 | N300 | T3K | TG |
 |--------------|---------------|---------------|----------------|-------------|
 | Llama3.2-1B | 128k tokens | 128k tokens | 128k tokens | 128k tokens |
-| Llama3.2-3B | 32k tokens | 128k tokens | 128k tokens | 128k tokens |
-| Llama3.1-8B | 16k tokens | 64k tokens | 128k tokens | 128k tokens |
-| Llama3.2-11B | 16k tokens | 64k tokens | 128k tokens | 128k tokens |
-| Llama3.1-70B | Not supported | Not supported | 64k tokens [1] | 128k tokens |
-
-[1] Although longer prefill context-lengths are not supported due to model size and available memory, you can still decode (generate) tokens up to a maximum of 128k tokens.
+| Llama3.2-3B | 8k tokens | 128k tokens | 128k tokens | 128k tokens |
+| Llama3.1-8B | 4k tokens | 64k tokens | 128k tokens | 128k tokens |
+| Llama3.2-11B | 4k tokens | 64k tokens | 128k tokens | 128k tokens |
+| Llama3.1-70B | Not supported | Not supported | 32k tokens | 128k tokens |
+- These max chunk sizes are specific to max context length 128k and are configured via `MAX_PREFILL_CHUNK_SIZES_DIV1024` in [model_config.py](https://github.com/tenstorrent/tt-metal/blob/main/models/demos/llama3/tt/model_config.py). If the max context length is set to a smaller value using the `max_seq_len` flag (see [Run the demo](#run-the-demo)), these chunk sizes can possibly be increased due to using a smaller KV cache.
 
+**Max Context Lengths (Llama3.2-11B multimodal)**: Llama3.2-11B multimodal is currently only supported on N300 and T3000. On N300, a max prefill context length of 8k is supported, while T3000 supports a max context length of 128k.
+
 ## How to Run
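The chunked prefill described in this README section can be pictured with a short sketch. This is an editor's illustration, not code from the repository; `prefill_chunk` is a hypothetical callback standing in for the model's per-chunk prefill pass:

```python
from typing import Callable, List, Optional

def chunked_prefill(
    tokens: List[int],
    prefill_chunk: Callable[[List[int], int], List[float]],
    max_chunk_size: int = 4 * 1024,  # e.g. the N150 limit for Llama3.1-8B
) -> Optional[List[float]]:
    """Prefill a long prompt chunk by chunk so each pass fits in device memory.

    Each chunk attends to the KV cache written by earlier chunks; only the
    chunk containing the last token needs to return logits.
    """
    logits = None
    for chunk_start in range(0, len(tokens), max_chunk_size):
        chunk = tokens[chunk_start : chunk_start + max_chunk_size]
        logits = prefill_chunk(chunk, chunk_start)
    return logits

# Toy usage: a stub model whose "logits" record which chunk ran last.
out = chunked_prefill(list(range(10_000)), lambda chunk, start: [float(start)], max_chunk_size=4096)
print(out)  # [8192.0] — logits come from the final chunk
```

Smaller devices simply use a smaller `max_chunk_size`; the achievable context length is then bounded by the KV cache, not by the per-pass activation memory.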
35 changes: 6 additions & 29 deletions models/demos/llama3/demo/demo.py
@@ -233,38 +233,15 @@ def run_llama3_demo(
     llama_model_name = model_args.model_name  # ["3.2-1B", "3.2-3B", "3.1-8B", "3.2-11B", "3.1-70B"]
     tt_device_name = model_args.device_name  # ["N150", "N300", "T3K", "TG"]
 
-    if llama_model_name == "3.2-1B":
+    if llama_model_name in ["3.1-8B", "3.2-11B"] and tt_device_name == "N150":
         assert (
-            max_seq_len <= 128 * 1024
-        ), "Llama3.2-1B supports the official max context length of 128k tokens across all architectures"
-    if llama_model_name == "3.2-3B":
-        if tt_device_name == "N150":
-            assert max_seq_len <= 32 * 1024, "N150 only supports a max context length of 32k tokens for Llama3.2-3B"
-        else:  # N300, T3K and TG
-            assert (
-                max_seq_len <= 128 * 1024
-            ), "N300, T3K and TG support the official max context length of 128k tokens for Llama3.2-3B"
-    if llama_model_name in ["3.1-8B", "3.2-11B"]:
-        if tt_device_name == "N150":
-            assert (
-                max_seq_len <= 16 * 1024
-            ), "N150 only supports a max context length of 16k tokens for Llama3.1-8B and Llama3.2-11B"
-        elif tt_device_name == "N300":
-            assert (
-                max_seq_len <= 64 * 1024
-            ), "N300 only supports a max context length of 64k tokens for Llama3.1-8B and Llama3.2-11B"
-        else:  # T3K and TG
-            assert (
-                max_seq_len <= 128 * 1024
-            ), "T3K only supports a max context length of 128k tokens for Llama3.1-8B and Llama3.2-11B"
+            max_seq_len <= 64 * 1024
+        ), "N150 only supports a max context length of 64k tokens for Llama3.1-8B and Llama3.2-11B"
+    else:
+        assert max_seq_len <= 128 * 1024, f"Llama{llama_model_name} supports a max context length of 128k tokens"
 
     if llama_model_name == "3.1-70B":
         assert tt_device_name in ["T3K", "TG"], "Llama3.1-70B is only supported on T3K or TG"
-        if tt_device_name == "T3K":
-            assert max_seq_len <= 64 * 1024, "T3K only supports a max context length of 64k tokens for Llama3.1-70B"
-        else:  # TG
-            assert (
-                max_seq_len <= 128 * 1024
-            ), "TG supports the official max context length of 128k tokens for Llama3.1-70B"
 
     logger.info("Loading weights...")
     profiler.start("weight_loading")
Expand Down
2 changes: 1 addition & 1 deletion models/demos/llama3/tests/test_llama_chunked_generation.py
@@ -47,7 +47,7 @@
 )
 @pytest.mark.parametrize(
     "seq_len, prefill_chunk_size",
-    [(4096, 1024)],
+    [(4096, 2048)],
 )
 @pytest.mark.parametrize(
     "optimizations",
4 changes: 3 additions & 1 deletion models/demos/llama3/tt/generator.py
@@ -141,10 +141,12 @@ def prefill_forward_single_user_text(self, tokens, page_table, user_id, last_tok
                 get_last_token=(last_token_idx_in_chunk // 32) * 32,
                 kv_cache=kv_cache,
             )
-            logits = self.model.process_output_prefill(tt_logits, last_token_idx=(last_token_idx_in_chunk % 32))
 
             if chunk_start == last_chunk_start:
+                logits = self.model.process_output_prefill(tt_logits, last_token_idx=(last_token_idx_in_chunk % 32))
                 return logits
+            else:
+                del tt_logits
         else:
             prefill_input, rot_mats_prefill, page_table_tt, _ = self.model.prepare_inputs_prefill(
                 tokens,
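The change above defers `process_output_prefill` to the final chunk and explicitly drops intermediate device logits. The underlying pattern, as a minimal sketch (editor's illustration; `run_chunk` and `to_host` are hypothetical stand-ins for the model's prefill call and output readback):

```python
def prefill_all_chunks(chunks, run_chunk, to_host):
    """Run every chunk, but read back only the last chunk's output.

    Intermediate device outputs are deleted immediately so device memory is
    reclaimed instead of accumulating one logits buffer per chunk.
    """
    last = len(chunks) - 1
    for i, chunk in enumerate(chunks):
        device_logits = run_chunk(chunk)
        if i == last:
            return to_host(device_logits)  # single device-to-host transfer
        del device_logits  # free early; these logits are never used

# Toy usage with lists standing in for device buffers.
result = prefill_all_chunks([[1, 2], [3, 4]], run_chunk=lambda c: c, to_host=sum)
print(result)  # 7 — only the last chunk's output is materialized
```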