
Commit

Merge branch 'main' into sminakov/from_torch-p3
sminakov-tt authored Jan 10, 2025
2 parents a801a64 + a94c89e commit 81fa970
Showing 814 changed files with 16,367 additions and 5,587 deletions.
5 changes: 0 additions & 5 deletions .clang-format-ignore
@@ -2,7 +2,6 @@ tt_metal/distributed/mesh_device.cpp
tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_unpack_tilize_api.h
tt_metal/impl/buffers/buffer.cpp
tt_metal/impl/buffers/buffer.hpp
tt_metal/impl/device/device.cpp
tt_metal/impl/device/device.hpp
tt_metal/impl/dispatch/arch.cpp
tt_metal/impl/dispatch/arch.hpp
@@ -40,10 +39,8 @@ ttnn/cpp/ttnn/operations/bernoulli/device/bernoulli_device_operation.hpp
ttnn/cpp/ttnn/operations/ccl/all_gather/all_gather.cpp
ttnn/cpp/ttnn/operations/ccl/all_gather/device/all_gather_op.hpp
ttnn/cpp/ttnn/operations/ccl/all_gather/device/kernels/dataflow/worker_ring_gather_utils.hpp
ttnn/cpp/ttnn/operations/ccl/all_gather_v2/device/all_gather_op.cpp
ttnn/cpp/ttnn/operations/ccl/all_gather_v2/device/all_gather_op.hpp
ttnn/cpp/ttnn/operations/ccl/all_gather_v2/device/multi_core/all_gather_op_multi_core_new.cpp
ttnn/cpp/ttnn/operations/ccl/ccl_common.cpp
ttnn/cpp/ttnn/operations/ccl/ccl_common.hpp
ttnn/cpp/ttnn/operations/ccl/common/host/ccl_worker_builder.cpp
ttnn/cpp/ttnn/operations/ccl/common/host/ccl_worker_builder.hpp
@@ -58,8 +55,6 @@ ttnn/cpp/ttnn/operations/ccl/common/uops/ccl_command.hpp
ttnn/cpp/ttnn/operations/ccl/common/uops/ccl_command_device.hpp
ttnn/cpp/ttnn/operations/ccl/common/uops/ccl_host_commands.cpp
ttnn/cpp/ttnn/operations/ccl/common/uops/ccl_host_commands.hpp
ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.cpp
ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.hpp
ttnn/cpp/ttnn/operations/ccl/kernel_common/worker_edm_utils.hpp
ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/edm_fabric_worker_adapters.hpp
ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header.hpp
3 changes: 2 additions & 1 deletion .github/workflows/cpp-post-commit.yaml
@@ -66,8 +66,9 @@ jobs:
 
           {name: dispatch multicmd queue, cmd: "TT_METAL_GTEST_NUM_HW_CQS=2 ./build/test/tt_metal/unit_tests_dispatch --gtest_filter=MultiCommandQueue*Fixture.*"},
 
-          {name: ttnn cpp tests, cmd: ./build/test/ttnn/unit_tests_ttnn},
+          {name: ttnn cpp unit tests, cmd: ./build/test/ttnn/unit_tests_ttnn},
           {name: ttnn ccl cpp unit tests, cmd: ./build/test/ttnn/unit_tests_ttnn_ccl},
+          {name: ttnn tensor cpp unit tests, cmd: ./build/test/ttnn/unit_tests_ttnn_tensor},
         ]
     name: ${{ matrix.test-group.name }} ${{ inputs.arch }} ${{ inputs.runner-label }}
     env:
1 change: 1 addition & 0 deletions .github/workflows/tg-unit-tests-impl.yaml
@@ -43,6 +43,7 @@ jobs:
         { name: "TG unit tests", arch: wormhole_b0, model: unit, timeout: 30, owner_id: XXXXX}, # Add owner
         { name: "TG Llama3-small unit tests", arch: wormhole_b0, model: llama3-small, timeout: 45, owner_id: U06F3ER8X9A}, # Stuti Raizada
         { name: "TG Llama3-70b unit tests", arch: wormhole_b0, model: llama3-70b, timeout: 45, owner_id: U06F3ER8X9A}, # Stuti Raizada
+        { name: "TG DRAM Prefetcher unit tests", arch: wormhole_b0, model: prefetcher, timeout: 30, owner_id: U071CKL4AFK}, # Ammar Vora, Yu Gao
         ]
     name: ${{ matrix.test-group.name }}
     env:
27 changes: 25 additions & 2 deletions .github/workflows/ttnn-run-sweeps.yaml
@@ -29,6 +29,7 @@ on:
- eltwise.unary.hardsigmoid.hardsigmoid_pytorch2
- eltwise.unary.leaky_relu.leaky_relu_pytorch2
- eltwise.unary.abs.abs
- eltwise.unary.abs.abs_sharded
- eltwise.unary.abs.abs_forge
- eltwise.unary.cos.cos
- eltwise.unary.cos.cos_pytorch2
@@ -61,11 +62,15 @@ on:
- eltwise.unary.elu.elu
- eltwise.unary.elu.elu_pytorch2
- eltwise.unary.erfc.erfc
- eltwise.unary.erfc.erfc_sharded
- eltwise.unary.exp.exp
- eltwise.unary.exp.exp_sharded
- eltwise.unary.exp.exp_forge
- eltwise.unary.exp.exp_pytorch2
- eltwise.unary.exp2.exp2
- eltwise.unary.exp2.exp2_sharded
- eltwise.unary.expm1.expm1
- eltwise.unary.expm1.expm1_sharded
- eltwise.unary.tanh.tanh
- eltwise.unary.tanh.tanh_pytorch2
- eltwise.unary.tanh.tanh_forge
@@ -76,6 +81,7 @@ on:
- eltwise.unary.deg2rad.deg2rad
- eltwise.unary.relu6.relu6
- eltwise.unary.log.log
- eltwise.unary.log.log_sharded
- eltwise.unary.log.log_pytorch2
- eltwise.unary.log.log_forge
- eltwise.unary.log1p.log1p
@@ -99,6 +105,7 @@ on:
- eltwise.unary.neg.neg_forge
- eltwise.unary.erf.erf
- eltwise.unary.erfinv.erfinv
- eltwise.unary.erfinv.erfinv_sharded
- eltwise.unary.i0.i0
- eltwise.unary.silu.silu
- eltwise.unary.silu.silu_pytorch2
@@ -124,21 +131,30 @@ on:
- eltwise.unary.hardsigmoid.hardsigmoid_sharded
- eltwise.unary.hardshrink.hardshrink
- eltwise.unary.hardshrink.hardshrink_sharded
- eltwise.unary.softmax.softmax
- normalization.softmax.softmax
- normalization.softmax.softmax_sharded
- eltwise.unary.identity.identity
- eltwise.unary.identity.identity_sharded
- eltwise.unary.neg.neg
- eltwise.unary.neg.neg_sharded
- eltwise.unary.sinh.sinh
- eltwise.unary.sinh.sinh_sharded
- eltwise.unary.asinh.asinh
- eltwise.unary.cosh.cosh
- eltwise.unary.relu_min.relu_min
- eltwise.unary.relu_min.relu_min_sharded
- eltwise.unary.relu_max.relu_max
- eltwise.unary.relu_max.relu_max_sharded
- eltwise.unary.softplus.softplus
- eltwise.unary.softplus.softplus_sharded
- eltwise.unary.selu.selu
- eltwise.unary.selu.selu_sharded
- eltwise.unary.softshrink.softshrink_sharded
- eltwise.unary_backward.fill_zero_bw
- eltwise.unary_backward.log_sigmoid_bw
- eltwise.unary_backward.logit_bw
- eltwise.unary_backward.neg_bw
- eltwise.unary_backward.neg_bw.neg_bw
- eltwise.unary_backward.neg_bw.neg_bw_sharded
- eltwise.unary_backward.hardshrink_bw
- eltwise.unary_backward.softshrink_bw
- eltwise.unary_backward.acos_bw.acos_bw
@@ -149,7 +165,9 @@ on:
- eltwise.unary_backward.i0_bw.i0_bw
- eltwise.unary_backward.rad2deg_bw.rad2deg_bw
- eltwise.unary_backward.relu_bw.relu_bw
- eltwise.unary_backward.relu_bw.relu_bw_sharded
- eltwise.unary_backward.rsqrt_bw.rsqrt_bw
- eltwise.unary_backward.rsqrt_bw.rsqrt_bw_sharded
- eltwise.unary_backward.sigmoid_bw.sigmoid_bw
- eltwise.unary_backward.tan_bw.tan_bw
- eltwise.unary_backward.trunc_bw.trunc_bw
@@ -160,7 +178,9 @@ on:
- eltwise.unary_backward.threshold_bw.threshold_bw
- eltwise.unary_backward.div_bw.div_bw
- eltwise.unary_backward.log_bw.log_bw
- eltwise.unary_backward.log_bw.log_bw_sharded
- eltwise.unary_backward.relu6_bw.relu6_bw
- eltwise.unary_backward.relu6_bw.relu6_bw_sharded
- eltwise.unary_backward.log10_bw.log10_bw
- eltwise.unary_backward.abs_bw.abs_bw
- eltwise.unary_backward.sinh_bw.sinh_bw
@@ -179,6 +199,7 @@ on:
- eltwise.unary_backward.lgamma_bw.lgamma_bw
- eltwise.unary_backward.multigammaln_bw.multigammaln_bw
- eltwise.unary_backward.leaky_relu_bw.leaky_relu_bw
- eltwise.unary_backward.leaky_relu_bw.leaky_relu_bw_sharded
- eltwise.unary_backward.elu_bw.elu_bw
- eltwise.unary_backward.celu_bw.celu_bw
- eltwise.unary_backward.selu_bw.selu_bw
@@ -226,10 +247,12 @@ on:
- eltwise.unary.lez.lez
- eltwise.unary.nez.nez
- eltwise.unary.prelu.prelu
- eltwise.unary.prelu.prelu_sharded
- eltwise.unary.hardswish.hardswish_pytorch2
- eltwise.unary.hardtanh.hardtanh_pytorch2
- eltwise.unary.leaky_relu.leaky_relu
- eltwise.unary.reglu.reglu
- eltwise.unary.round.round_sharded
- eltwise.unary_complex.polar.polar
- eltwise.unary_complex.angle.angle
- eltwise.unary_complex.polar_bw.polar_bw
2 changes: 0 additions & 2 deletions CMakeLists.txt
@@ -207,8 +207,6 @@ target_link_libraries(
 if(NOT DEFINED ENV{ARCH_NAME})
     message(FATAL_ERROR "Please set ARCH_NAME to grayskull, wormhole_b0, or blackhole")
 endif(NOT DEFINED ENV{ARCH_NAME})
-string(TOUPPER "$ENV{ARCH_NAME}" ARCH_NAME_DEF)
-add_compile_definitions(ARCH_${ARCH_NAME_DEF})
 add_compile_options(
     -Werror
     -Wno-deprecated-declarations
1 change: 0 additions & 1 deletion CONTRIBUTING.md
@@ -6,7 +6,6 @@ Table of Contents
 - [Table of Contents](#table-of-contents)
 - [Contributing to tt-metal](#contributing-to-tt-metal)
 - [Machine setup](#machine-setup)
-  - [Hugepages setup](#hugepages-setup)
 - [Developing tt-metal](#developing-tt-metal)
 - [Setting logger level](#setting-logger-level)
 - [Building and viewing the documentation locally](#building-and-viewing-the-documentation-locally)
2 changes: 1 addition & 1 deletion README.md
@@ -121,7 +121,7 @@ Get started with [simple kernels](https://docs.tenstorrent.com/tt-metalium/lates
 - [Ethernet and Multichip Basics](./tech_reports/EthernetMultichip/BasicEthernetGuide.md) (Updated Sept 20th, 2024)
 - [Collective Communication Library (CCL)](./tech_reports/EthernetMultichip/CclDeveloperGuide.md) (Updated Sept 20th, 2024)
 - [Blackhole Bring-Up Programming Guide](./tech_reports/Blackhole/BlackholeBringUpProgrammingGuide.md) (Updated Dec 18th, 2024)
-- [Sub-Devices](./tech_reports/SubDevices/SubDevices.md) (Updated Jan 2nd, 2025)
+- [Sub-Devices](./tech_reports/SubDevices/SubDevices.md) (Updated Jan 7th, 2025)
 
 ## TT-Metalium Programming Examples
@@ -3,4 +3,4 @@
 DumpDeviceProfileResults
 ========================
 
-.. doxygenfunction:: tt::tt_metal::v0::DumpDeviceProfileResults(Device *device, const Program &program);
+.. doxygenfunction:: tt::tt_metal::v0::DumpDeviceProfileResults(IDevice *device, const Program &program);
@@ -5,9 +5,9 @@ Runtime Arguments
 
 .. doxygenfunction:: tt::tt_metal::v0::SetRuntimeArgs(const Program &program, KernelHandle kernel, const std::vector< CoreCoord > & core_spec, const std::vector< std::vector<uint32_t> > &runtime_args)
 
-.. doxygenfunction:: tt::tt_metal::v0::SetRuntimeArgs(Device* device, const std::shared_ptr<Kernel>& kernel, const std::variant<CoreCoord, CoreRange, CoreRangeSet> &core_spec, std::shared_ptr<RuntimeArgs> runtime_args)
+.. doxygenfunction:: tt::tt_metal::v0::SetRuntimeArgs(IDevice* device, const std::shared_ptr<Kernel>& kernel, const std::variant<CoreCoord, CoreRange, CoreRangeSet> &core_spec, std::shared_ptr<RuntimeArgs> runtime_args)
 
-.. doxygenfunction:: tt::tt_metal::v0::SetRuntimeArgs(Device* device, const std::shared_ptr<Kernel>& kernel, const std::vector< CoreCoord > & core_spec, const std::vector<std::shared_ptr<RuntimeArgs>>& runtime_args)
+.. doxygenfunction:: tt::tt_metal::v0::SetRuntimeArgs(IDevice* device, const std::shared_ptr<Kernel>& kernel, const std::vector< CoreCoord > & core_spec, const std::vector<std::shared_ptr<RuntimeArgs>>& runtime_args)
 
 .. doxygenfunction:: tt::tt_metal::v0::GetRuntimeArgs(const Program &program, KernelHandle kernel_id, const CoreCoord &logical_core)
@@ -29,7 +29,7 @@ Silicon accelerator setup
 .. code-block:: cpp
 
    constexpr int device_id = 0;
-   Device *device = CreateDevice(device_id);
+   auto device = CreateDevice(device_id);
 
 We instantiate a device to control our ``GRAYSKULL`` type
 accelerator.
23 changes: 12 additions & 11 deletions models/common/rmsnorm.py
@@ -81,17 +81,18 @@ def __init__(
             mesh_mapper=ttnn.ReplicateTensorToMesh(device) if is_mesh_device else None,
         )
 
-        self.weight_distributed = ttnn.as_tensor(
-            torch_weight,
-            device=device,
-            dtype=weight_dtype,
-            layout=ttnn.ROW_MAJOR_LAYOUT,
-            memory_config=weight_memory_config,
-            cache_file_name=cache_name,
-            mesh_mapper=ttnn.ShardTensor2dMesh(device, dims=(None, 2), mesh_shape=list(device.shape))
-            if is_mesh_device
-            else None,
-        )
+        if self.is_distributed:
+            self.weight_distributed = ttnn.as_tensor(
+                torch_weight,
+                device=device,
+                dtype=weight_dtype,
+                layout=ttnn.ROW_MAJOR_LAYOUT,
+                memory_config=weight_memory_config,
+                cache_file_name=cache_name,
+                mesh_mapper=ttnn.ShardTensor2dMesh(device, dims=(None, 2), mesh_shape=list(device.shape))
+                if is_mesh_device
+                else None,
+            )
 
         self.sharded_output_config = sharded_output_config
         self.sharded_program_config = sharded_program_config
17 changes: 8 additions & 9 deletions models/demos/llama3/README.md
@@ -15,20 +15,19 @@ All the above llama models (with the exception of 70B due to its large size) are
 - T3000 (8-chips)
 - TG (32-chips)
 
-Below is an updated table with max prefill context-length support for our demo. These were tested on both accuracy and performance mode.
-
-The main reason for a long context length not fitting on device is lack of memory memory. Any exceptions are marked in the table in appendix.
+**Max Context Lengths (text-only)**: All of the compatible model/device combinations support a max prefill context-length of 128k, with the exception of Llama3.1-8B and Llama3.2-11B on N150 which have a max of 64k (due to a lack of memory). To support these large max context-lengths, chunked prefill is performed with different max chunk sizes as shown in the table below.
 
 Max Prefill Chunk Sizes (text-only):
 | | N150 | N300 | T3K | TG |
 |--------------|---------------|---------------|----------------|-------------|
 | Llama3.2-1B | 128k tokens | 128k tokens | 128k tokens | 128k tokens |
-| Llama3.2-3B | 32k tokens | 128k tokens | 128k tokens | 128k tokens |
-| Llama3.1-8B | 16k tokens | 64k tokens | 128k tokens | 128k tokens |
-| Llama3.2-11B | 16k tokens | 64k tokens | 128k tokens | 128k tokens |
-| Llama3.1-70B | Not supported | Not supported | 64k tokens [1] | 128k tokens |
-
-[1] Although longer prefill context-lengths are not supported due to model size and available memory, you can still decode (generate) tokens up to a maximum of 128k tokens.
+| Llama3.2-3B | 8k tokens | 128k tokens | 128k tokens | 128k tokens |
+| Llama3.1-8B | 4k tokens | 64k tokens | 128k tokens | 128k tokens |
+| Llama3.2-11B | 4k tokens | 64k tokens | 128k tokens | 128k tokens |
+| Llama3.1-70B | Not supported | Not supported | 32k tokens | 128k tokens |
+- These max chunk sizes are specific to max context length 128k and are configured via `MAX_PREFILL_CHUNK_SIZES_DIV1024` in [model_config.py](https://github.com/tenstorrent/tt-metal/blob/main/models/demos/llama3/tt/model_config.py). If the max context length is set to a smaller value using the `max_seq_len` flag (see [Run the demo](#run-the-demo)), these chunk sizes can possibly be increased due to using a smaller KV cache.
 
+**Max Context Lengths (Llama3.2-11B multimodal)**: Llama3.2-11B multimodal is currently only supported on N300 and T3000. On N300, a max prefill context length of 8k is supported, while T3000 supports a max context length of 128k.
+
 ## How to Run
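The chunked prefill described in this README section can be pictured with a short sketch. This is an editor's illustration, not code from the repository; `prefill_chunk` is a hypothetical callback standing in for the model's per-chunk prefill pass:

```python
from typing import Callable, List, Optional

def chunked_prefill(
    tokens: List[int],
    prefill_chunk: Callable[[List[int], int], List[float]],
    max_chunk_size: int = 4 * 1024,  # e.g. the N150 limit for Llama3.1-8B
) -> Optional[List[float]]:
    """Prefill a long prompt chunk by chunk so each pass fits in device memory.

    Each chunk attends to the KV cache written by earlier chunks; only the
    chunk containing the last token needs to return logits.
    """
    logits = None
    for chunk_start in range(0, len(tokens), max_chunk_size):
        chunk = tokens[chunk_start : chunk_start + max_chunk_size]
        logits = prefill_chunk(chunk, chunk_start)
    return logits

# Toy usage: a stub model whose "logits" record which chunk ran last.
out = chunked_prefill(list(range(10_000)), lambda chunk, start: [float(start)], max_chunk_size=4096)
print(out)  # [8192.0] — logits come from the final chunk
```

Smaller devices simply use a smaller `max_chunk_size`; the achievable context length is then bounded by the KV cache, not by the per-pass activation memory.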
35 changes: 6 additions & 29 deletions models/demos/llama3/demo/demo.py
@@ -233,38 +233,15 @@ def run_llama3_demo(
     llama_model_name = model_args.model_name  # ["3.2-1B", "3.2-3B", "3.1-8B", "3.2-11B", "3.1-70B"]
     tt_device_name = model_args.device_name  # ["N150", "N300", "T3K", "TG"]
 
-    if llama_model_name == "3.2-1B":
+    if llama_model_name in ["3.1-8B", "3.2-11B"] and tt_device_name == "N150":
         assert (
-            max_seq_len <= 128 * 1024
-        ), "Llama3.2-1B supports the official max context length of 128k tokens across all architectures"
-    if llama_model_name == "3.2-3B":
-        if tt_device_name == "N150":
-            assert max_seq_len <= 32 * 1024, "N150 only supports a max context length of 32k tokens for Llama3.2-3B"
-        else:  # N300, T3K and TG
-            assert (
-                max_seq_len <= 128 * 1024
-            ), "N300, T3K and TG support the official max context length of 128k tokens for Llama3.2-3B"
-    if llama_model_name in ["3.1-8B", "3.2-11B"]:
-        if tt_device_name == "N150":
-            assert (
-                max_seq_len <= 16 * 1024
-            ), "N150 only supports a max context length of 16k tokens for Llama3.1-8B and Llama3.2-11B"
-        elif tt_device_name == "N300":
-            assert (
-                max_seq_len <= 64 * 1024
-            ), "N300 only supports a max context length of 64k tokens for Llama3.1-8B and Llama3.2-11B"
-        else:  # T3K and TG
-            assert (
-                max_seq_len <= 128 * 1024
-            ), "T3K only supports a max context length of 128k tokens for Llama3.1-8B and Llama3.2-11B"
+            max_seq_len <= 64 * 1024
+        ), "N150 only supports a max context length of 64k tokens for Llama3.1-8B and Llama3.2-11B"
+    else:
+        assert max_seq_len <= 128 * 1024, f"Llama{llama_model_name} supports a max context length of 128k tokens"
 
     if llama_model_name == "3.1-70B":
         assert tt_device_name in ["T3K", "TG"], "Llama3.1-70B is only supported on T3K or TG"
-        if tt_device_name == "T3K":
-            assert max_seq_len <= 64 * 1024, "T3K only supports a max context length of 64k tokens for Llama3.1-70B"
-        else:  # TG
-            assert (
-                max_seq_len <= 128 * 1024
-            ), "TG supports the official max context length of 128k tokens for Llama3.1-70B"
 
     logger.info("Loading weights...")
     profiler.start("weight_loading")
Expand Down
2 changes: 1 addition & 1 deletion models/demos/llama3/tests/test_llama_chunked_generation.py
@@ -47,7 +47,7 @@
 )
 @pytest.mark.parametrize(
     "seq_len, prefill_chunk_size",
-    [(4096, 1024)],
+    [(4096, 2048)],
 )
 @pytest.mark.parametrize(
     "optimizations",
4 changes: 3 additions & 1 deletion models/demos/llama3/tt/generator.py
@@ -141,10 +141,12 @@ def prefill_forward_single_user_text(self, tokens, page_table, user_id, last_tok
                 get_last_token=(last_token_idx_in_chunk // 32) * 32,
                 kv_cache=kv_cache,
             )
-            logits = self.model.process_output_prefill(tt_logits, last_token_idx=(last_token_idx_in_chunk % 32))
 
             if chunk_start == last_chunk_start:
+                logits = self.model.process_output_prefill(tt_logits, last_token_idx=(last_token_idx_in_chunk % 32))
                 return logits
+            else:
+                del tt_logits
         else:
             prefill_input, rot_mats_prefill, page_table_tt, _ = self.model.prepare_inputs_prefill(
                 tokens,
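The change above defers `process_output_prefill` to the final chunk and explicitly drops intermediate device logits. The underlying pattern, as a minimal sketch (editor's illustration; `run_chunk` and `to_host` are hypothetical stand-ins for the model's prefill call and output readback):

```python
def prefill_all_chunks(chunks, run_chunk, to_host):
    """Run every chunk, but read back only the last chunk's output.

    Intermediate device outputs are deleted immediately so device memory is
    reclaimed instead of accumulating one logits buffer per chunk.
    """
    last = len(chunks) - 1
    for i, chunk in enumerate(chunks):
        device_logits = run_chunk(chunk)
        if i == last:
            return to_host(device_logits)  # single device-to-host transfer
        del device_logits  # free early; these logits are never used

# Toy usage with lists standing in for device buffers.
result = prefill_all_chunks([[1, 2], [3, 4]], run_chunk=lambda c: c, to_host=sum)
print(result)  # 7 — only the last chunk's output is materialized
```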