#9747: Implement ttnn.tilize(_with_val_padding) Python bindings
yan-zaretskiy committed Jul 17, 2024
1 parent f86c0c1 commit 0ecbb36
Showing 39 changed files with 367 additions and 182 deletions.
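
Every hunk below applies the same mechanical migration: calls to the legacy tt_lib.tensor / ttnn.experimental.tensor tilize ops are switched to the new ttnn.tilize and ttnn.tilize_with_val_padding bindings, and the keyword arguments output_mem_config / output_dtype become memory_config / dtype. A minimal sketch of the new call pattern, assuming a Tenstorrent device is attached; the shapes and memory config are placeholders, not values taken from the changed files:

import torch
import ttnn

# Open a device and create a small row-major tensor whose last two dims are
# already tile-aligned (32 x 64), so plain tilize is valid without padding.
device = ttnn.open_device(device_id=0)
torch_x = torch.randn(1, 1, 32, 64)
x = ttnn.from_torch(torch_x, dtype=ttnn.bfloat16, layout=ttnn.ROW_MAJOR_LAYOUT, device=device)

# Old spelling (removed by this commit):
#   x = ttnn.experimental.tensor.tilize(x, output_mem_config=..., output_dtype=...)
# New binding: memory_config / dtype keywords, optional use_multicore flag.
x = ttnn.tilize(x, memory_config=ttnn.DRAM_MEMORY_CONFIG, dtype=ttnn.bfloat16, use_multicore=True)

ttnn.close_device(device)

The padding variant takes the padded output shape and a fill value positionally, as the ResNet hunks below show.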
18 changes: 9 additions & 9 deletions models/demos/falcon7b/tt/falcon_model.py
@@ -138,10 +138,10 @@ def model_preprocessing(self, llm_mode, input_ids, kv_cache_len, num_input_token
             # Tilize attn masks
             for tt_attention_mask_slice in attn_masks_unordered:
                 for i in range(self.num_devices):
-                    tt_attention_mask_slice[i] = ttnn.experimental.tensor.tilize(
+                    tt_attention_mask_slice[i] = ttnn.tilize(
                         tt_attention_mask_slice[i],
-                        output_mem_config=self.model_config["ATTN_MASK_MEMCFG"],
-                        output_dtype=self.model_config["ATTN_MASK_OPTIMIZED_PREFILL_DTYPE"],
+                        memory_config=self.model_config["ATTN_MASK_MEMCFG"],
+                        dtype=self.model_config["ATTN_MASK_OPTIMIZED_PREFILL_DTYPE"],
                     )
             # Expected output attention_masks
             # [dev0: [slice0, slice1, ...], dev1: [slice0, slice1, ...], ...]

@@ -166,10 +166,10 @@ def model_preprocessing(self, llm_mode, input_ids, kv_cache_len, num_input_token
             )
             # Tilize attn masks
             for i in range(self.num_devices):
-                tt_attention_mask[i] = ttnn.experimental.tensor.tilize(
+                tt_attention_mask[i] = ttnn.tilize(
                     tt_attention_mask[i],
-                    output_mem_config=self.model_config["ATTN_MASK_MEMCFG"],
-                    output_dtype=self.model_config["ATTN_MASK_DTYPE"],
+                    memory_config=self.model_config["ATTN_MASK_MEMCFG"],
+                    dtype=self.model_config["ATTN_MASK_DTYPE"],
                 )

             tt_input_ids = []

@@ -220,10 +220,10 @@ def model_preprocessing(self, llm_mode, input_ids, kv_cache_len, num_input_token
             if not self.model_config["l1_sharded"]:
                 # Tilize attn masks
                 for i in range(self.num_devices):
-                    tt_attention_mask[i] = ttnn.experimental.tensor.tilize(
+                    tt_attention_mask[i] = ttnn.tilize(
                         tt_attention_mask[i],
-                        output_mem_config=self.model_config["ATTN_MASK_MEMCFG"],
-                        output_dtype=self.model_config["ATTN_MASK_DTYPE"],
+                        memory_config=self.model_config["ATTN_MASK_MEMCFG"],
+                        dtype=self.model_config["ATTN_MASK_DTYPE"],
                     )

             for i, device in enumerate(self.devices):
32 changes: 16 additions & 16 deletions models/demos/resnet/tt/metalResnetBlock50.py
@@ -159,7 +159,7 @@ def format_tensor(x, target_layout, device, output_mem_config, pad_value=0.0):
                 x, device, x_padded_shape, pad_value, target_layout, output_mem_config
             )
         else:
-            return tt_lib.tensor.tilize(x, output_mem_config, use_multicore=True)
+            return ttnn.tilize(x, memory_config=output_mem_config, use_multicore=True)
     elif x.get_layout() == tt_lib.tensor.Layout.TILE and target_layout == tt_lib.tensor.Layout.ROW_MAJOR:
         if x.get_legacy_shape() != x.shape_without_padding():
             return tt_lib.tensor.format_output_tensor(

@@ -2187,10 +2187,10 @@ def forward(self, x: tt_lib.tensor, write_event=None, op_event=None, final_out_m
                 self.maxpool_output_shape[0] * self.maxpool_output_shape[1] * self.maxpool_output_shape[2],
                 self.maxpool_output_shape[3],
             )
-            x = tt_lib.tensor.tilize(
+            x = ttnn.tilize(
                 x,
-                output_mem_config=self.height_sharded_memory_config,
-                output_dtype=self.model_config["ACTIVATIONS_DTYPE"],
+                memory_config=self.height_sharded_memory_config,
+                dtype=self.model_config["ACTIVATIONS_DTYPE"],
                 use_multicore=True,
             )
         if self.batch_size == 20:

@@ -2314,12 +2314,12 @@ def forward(self, x: tt_lib.tensor, write_event=None, op_event=None, final_out_m
                 _nearest_32(unpadded_shape[3]),
             ]
             if self.sharded:
-                x = tt_lib.tensor.tilize_with_val_padding(
+                x = ttnn.tilize_with_val_padding(
                     x,
                     padded_shape,
                     0,
-                    output_mem_config=self.width_sharded_memory_config,
-                    output_dtype=self.model_config["ACTIVATIONS_DTYPE"],
+                    memory_config=self.width_sharded_memory_config,
+                    dtype=self.model_config["ACTIVATIONS_DTYPE"],
                 )
             else:
                 x = ttnn.pad(

@@ -2330,10 +2330,10 @@ def forward(self, x: tt_lib.tensor, write_event=None, op_event=None, final_out_m
                     memory_config=self.memory_config,
                     use_multicore=True,
                 )
-                x = tt_lib.tensor.tilize(
+                x = ttnn.tilize(
                     x,
-                    output_mem_config=self.memory_config,
-                    output_dtype=self.model_config["ACTIVATIONS_DTYPE"],
+                    memory_config=self.memory_config,
+                    dtype=self.model_config["ACTIVATIONS_DTYPE"],
                     use_multicore=True,
                 )

@@ -2365,12 +2365,12 @@ def forward(self, x: tt_lib.tensor, write_event=None, op_event=None, final_out_m
                 _nearest_32(unpadded_shape[3]),
             ]
             if self.sharded:
-                x = tt_lib.tensor.tilize_with_val_padding(
+                x = ttnn.tilize_with_val_padding(
                     x,
                     padded_shape,
                     0,
-                    output_mem_config=self.width_sharded_memory_config,
-                    output_dtype=self.model_config["ACTIVATIONS_DTYPE"],
+                    memory_config=self.width_sharded_memory_config,
+                    dtype=self.model_config["ACTIVATIONS_DTYPE"],
                 )
             else:
                 x = ttnn.pad(

@@ -2381,10 +2381,10 @@ def forward(self, x: tt_lib.tensor, write_event=None, op_event=None, final_out_m
                     memory_config=self.memory_config,
                     use_multicore=True,
                 )
-                x = tt_lib.tensor.tilize(
+                x = ttnn.tilize(
                     x,
-                    output_mem_config=self.memory_config,
-                    output_dtype=self.model_config["ACTIVATIONS_DTYPE"],
+                    memory_config=self.memory_config,
+                    dtype=self.model_config["ACTIVATIONS_DTYPE"],
                     use_multicore=True,
                 )
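
The ResNet hunks above pair the new ttnn.tilize_with_val_padding binding with a helper that rounds the last two dimensions up to the 32-element tile edge. A rough, self-contained sketch of that pattern, with _nearest_32 re-declared here for illustration and placeholder shapes and memory config instead of the model's:

import torch
import ttnn


def _nearest_32(n: int) -> int:
    # Round up to the next multiple of 32, the tile edge length.
    return ((n + 31) // 32) * 32


device = ttnn.open_device(device_id=0)

# A deliberately non-tile-aligned activation: 50 x 70 is not a multiple of 32.
torch_x = torch.randn(1, 1, 50, 70)
x = ttnn.from_torch(torch_x, dtype=ttnn.bfloat16, layout=ttnn.ROW_MAJOR_LAYOUT, device=device)

unpadded_shape = torch_x.shape
padded_shape = [
    unpadded_shape[0],
    unpadded_shape[1],
    _nearest_32(unpadded_shape[2]),
    _nearest_32(unpadded_shape[3]),
]

# Pad with zeros up to the tile-aligned shape and convert to tile layout in one op.
x = ttnn.tilize_with_val_padding(
    x,
    padded_shape,
    0,
    memory_config=ttnn.L1_MEMORY_CONFIG,
    dtype=ttnn.bfloat16,
)

ttnn.close_device(device)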
18 changes: 11 additions & 7 deletions models/demos/t3000/falcon40b/tt/falcon_model.py
@@ -48,7 +48,11 @@ def __init__(
         self.num_layers = num_layers
         self.hidden_size = config.hidden_size
         self.num_devices = device_mesh.get_num_devices()
-        self.ln_output_tensors_dict = {"final_layernorm": dict(), "mlp_layernorm": dict(), "attn_layernorm": dict()}
+        self.ln_output_tensors_dict = {
+            "final_layernorm": dict(),
+            "mlp_layernorm": dict(),
+            "attn_layernorm": dict(),
+        }

         # Word Embeddings
         self.embeddings = TtFalconEmbeddings(

@@ -138,10 +142,10 @@ def create_attn_mask(self, max_seq_len):
             preprocess=lambda x: (x * -1e5),
         )

-        tt_attn_mask = ttnn.experimental.tensor.tilize(
+        tt_attn_mask = ttnn.tilize(
             tt_attn_mask,
-            output_mem_config=attention_mask_memconfig,
-            output_dtype=self.model_config["ATTN_MASK_DTYPE"],
+            memory_config=attention_mask_memconfig,
+            dtype=self.model_config["ATTN_MASK_DTYPE"],
         )
         return tt_attn_mask

@@ -235,10 +239,10 @@ def model_preprocessing(self, llm_mode, input_ids, kv_cache_len, num_input_token
                 preprocess=lambda x: (x.transpose(0, 2) * -1e5).expand(-1, self.config.num_attention_heads, -1, -1),
             )

-            tt_attention_mask = ttnn.experimental.tensor.tilize(
+            tt_attention_mask = ttnn.tilize(
                 tt_attention_mask,
-                output_mem_config=attention_mask_memconfig,
-                output_dtype=self.model_config["ATTN_MASK_DTYPE"],
+                memory_config=attention_mask_memconfig,
+                dtype=self.model_config["ATTN_MASK_DTYPE"],
             )

         else:
12 changes: 6 additions & 6 deletions models/demos/ttnn_resnet/tt/ttnn_functional_resnet50.py
@@ -727,12 +727,12 @@ def __call__(self, input_tensor) -> ttnn.Tensor:
             _nearest_32(unpadded_shape[2]),
             _nearest_32(unpadded_shape[3]),
         ]
-        x = ttnn.experimental.tensor.tilize_with_val_padding(
+        x = ttnn.tilize_with_val_padding(
             x,
             padded_shape,
             0,
-            output_mem_config=ttnn.L1_WIDTH_SHARDED_MEMORY_CONFIG,
-            output_dtype=self.model_config["ACTIVATIONS_DTYPE"],
+            memory_config=ttnn.L1_WIDTH_SHARDED_MEMORY_CONFIG,
+            dtype=self.model_config["ACTIVATIONS_DTYPE"],
         )

         x = self.avgpool(x, memory_config=ttnn.L1_WIDTH_SHARDED_MEMORY_CONFIG)

@@ -759,12 +759,12 @@ def __call__(self, input_tensor) -> ttnn.Tensor:
             _nearest_32(unpadded_shape[3]),
         ]

-        x = ttnn.experimental.tensor.tilize_with_val_padding(
+        x = ttnn.tilize_with_val_padding(
             x,
             padded_shape,
             0,
-            output_mem_config=ttnn.L1_WIDTH_SHARDED_MEMORY_CONFIG,
-            output_dtype=self.model_config["ACTIVATIONS_DTYPE"],
+            memory_config=ttnn.L1_WIDTH_SHARDED_MEMORY_CONFIG,
+            dtype=self.model_config["ACTIVATIONS_DTYPE"],
         )

         x = self.fc(x)
Additional changed file (path not shown):
@@ -755,12 +755,12 @@ def first_run(self, input_tensor, device, batch_size, ops_parallel_config) -> tt
             _nearest_32(unpadded_shape[2]),
             _nearest_32(unpadded_shape[3]),
         ]
-        x = ttnn.experimental.tensor.tilize_with_val_padding(
+        x = ttnn.tilize_with_val_padding(
             x,
             padded_shape,
             0,
-            output_mem_config=ttnn.L1_WIDTH_SHARDED_MEMORY_CONFIG,
-            output_dtype=self.model_config["ACTIVATIONS_DTYPE"],
+            memory_config=ttnn.L1_WIDTH_SHARDED_MEMORY_CONFIG,
+            dtype=self.model_config["ACTIVATIONS_DTYPE"],
         )

         x = self.avgpool(x, memory_config=ttnn.L1_WIDTH_SHARDED_MEMORY_CONFIG)

@@ -787,12 +787,12 @@ def first_run(self, input_tensor, device, batch_size, ops_parallel_config) -> tt
             _nearest_32(unpadded_shape[3]),
         ]

-        x = ttnn.experimental.tensor.tilize_with_val_padding(
+        x = ttnn.tilize_with_val_padding(
             x,
             padded_shape,
             0,
-            output_mem_config=ttnn.L1_WIDTH_SHARDED_MEMORY_CONFIG,
-            output_dtype=self.model_config["ACTIVATIONS_DTYPE"],
+            memory_config=ttnn.L1_WIDTH_SHARDED_MEMORY_CONFIG,
+            dtype=self.model_config["ACTIVATIONS_DTYPE"],
         )

         x = self.fc(x)

@@ -975,12 +975,12 @@ def optimized_run(self, input_tensor, device, batch_size, ops_parallel_config, c
             _nearest_32(unpadded_shape[2]),
             _nearest_32(unpadded_shape[3]),
         ]
-        x = ttnn.experimental.tensor.tilize_with_val_padding(
+        x = ttnn.tilize_with_val_padding(
             x,
             padded_shape,
             0,
-            output_mem_config=ttnn.L1_WIDTH_SHARDED_MEMORY_CONFIG,
-            output_dtype=self.model_config["ACTIVATIONS_DTYPE"],
+            memory_config=ttnn.L1_WIDTH_SHARDED_MEMORY_CONFIG,
+            dtype=self.model_config["ACTIVATIONS_DTYPE"],
         )

         x = self.avgpool(x, memory_config=ttnn.L1_WIDTH_SHARDED_MEMORY_CONFIG)

@@ -1007,12 +1007,12 @@ def optimized_run(self, input_tensor, device, batch_size, ops_parallel_config, c
             _nearest_32(unpadded_shape[3]),
         ]

-        x = ttnn.experimental.tensor.tilize_with_val_padding(
+        x = ttnn.tilize_with_val_padding(
             x,
             padded_shape,
             0,
-            output_mem_config=ttnn.L1_WIDTH_SHARDED_MEMORY_CONFIG,
-            output_dtype=self.model_config["ACTIVATIONS_DTYPE"],
+            memory_config=ttnn.L1_WIDTH_SHARDED_MEMORY_CONFIG,
+            dtype=self.model_config["ACTIVATIONS_DTYPE"],
        )

         x = self.fc(x)
Additional changed file (path not shown):
@@ -1099,12 +1099,12 @@ def run(self, input_tensor, device, ops_parallel_config, conv_op_cache={}) -> tt
             _nearest_32(unpadded_shape[2]),
             _nearest_32(unpadded_shape[3]),
         ]
-        x = ttnn.experimental.tensor.tilize_with_val_padding(
+        x = ttnn.tilize_with_val_padding(
             x,
             padded_shape,
             0,
-            output_mem_config=ttnn.L1_WIDTH_SHARDED_MEMORY_CONFIG,
-            output_dtype=self.model_config["ACTIVATIONS_DTYPE"],
+            memory_config=ttnn.L1_WIDTH_SHARDED_MEMORY_CONFIG,
+            dtype=self.model_config["ACTIVATIONS_DTYPE"],
         )

         x = self.avgpool(x, memory_config=ttnn.L1_WIDTH_SHARDED_MEMORY_CONFIG)

@@ -1131,12 +1131,12 @@ def run(self, input_tensor, device, ops_parallel_config, conv_op_cache={}) -> tt
             _nearest_32(unpadded_shape[3]),
         ]

-        x = ttnn.experimental.tensor.tilize_with_val_padding(
+        x = ttnn.tilize_with_val_padding(
             x,
             padded_shape,
             0,
-            output_mem_config=ttnn.L1_WIDTH_SHARDED_MEMORY_CONFIG,
-            output_dtype=self.model_config["ACTIVATIONS_DTYPE"],
+            memory_config=ttnn.L1_WIDTH_SHARDED_MEMORY_CONFIG,
+            dtype=self.model_config["ACTIVATIONS_DTYPE"],
         )

         x = self.fc(x)
Additional changed file (path not shown):
@@ -749,12 +749,12 @@ def first_run(self, input_tensor, device, batch_size, ops_parallel_config) -> tt
             _nearest_32(unpadded_shape[2]),
             _nearest_32(unpadded_shape[3]),
         ]
-        x = ttnn.experimental.tensor.tilize_with_val_padding(
+        x = ttnn.tilize_with_val_padding(
             x,
             padded_shape,
             0,
-            output_mem_config=ttnn.L1_WIDTH_SHARDED_MEMORY_CONFIG,
-            output_dtype=self.model_config["ACTIVATIONS_DTYPE"],
+            memory_config=ttnn.L1_WIDTH_SHARDED_MEMORY_CONFIG,
+            dtype=self.model_config["ACTIVATIONS_DTYPE"],
         )

         x = self.avgpool(x, memory_config=ttnn.L1_WIDTH_SHARDED_MEMORY_CONFIG)

@@ -781,12 +781,12 @@ def first_run(self, input_tensor, device, batch_size, ops_parallel_config) -> tt
             _nearest_32(unpadded_shape[3]),
         ]

-        x = ttnn.experimental.tensor.tilize_with_val_padding(
+        x = ttnn.tilize_with_val_padding(
             x,
             padded_shape,
             0,
-            output_mem_config=ttnn.L1_WIDTH_SHARDED_MEMORY_CONFIG,
-            output_dtype=self.model_config["ACTIVATIONS_DTYPE"],
+            memory_config=ttnn.L1_WIDTH_SHARDED_MEMORY_CONFIG,
+            dtype=self.model_config["ACTIVATIONS_DTYPE"],
         )

         x = self.fc(x)

@@ -952,12 +952,12 @@ def optimized_run(self, input_tensor, device, batch_size, ops_parallel_config, c
             _nearest_32(unpadded_shape[2]),
             _nearest_32(unpadded_shape[3]),
         ]
-        x = ttnn.experimental.tensor.tilize_with_val_padding(
+        x = ttnn.tilize_with_val_padding(
             x,
             padded_shape,
             0,
-            output_mem_config=ttnn.L1_WIDTH_SHARDED_MEMORY_CONFIG,
-            output_dtype=self.model_config["ACTIVATIONS_DTYPE"],
+            memory_config=ttnn.L1_WIDTH_SHARDED_MEMORY_CONFIG,
+            dtype=self.model_config["ACTIVATIONS_DTYPE"],
         )

         x = self.avgpool(x, memory_config=ttnn.L1_WIDTH_SHARDED_MEMORY_CONFIG)

@@ -984,12 +984,12 @@ def optimized_run(self, input_tensor, device, batch_size, ops_parallel_config, c
             _nearest_32(unpadded_shape[3]),
         ]

-        x = ttnn.experimental.tensor.tilize_with_val_padding(
+        x = ttnn.tilize_with_val_padding(
             x,
             padded_shape,
             0,
-            output_mem_config=ttnn.L1_WIDTH_SHARDED_MEMORY_CONFIG,
-            output_dtype=self.model_config["ACTIVATIONS_DTYPE"],
+            memory_config=ttnn.L1_WIDTH_SHARDED_MEMORY_CONFIG,
+            dtype=self.model_config["ACTIVATIONS_DTYPE"],
         )

         x = self.fc(x)
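
As a quick way to exercise the new binding end to end, a hypothetical smoke test (not part of this commit; it assumes a device is attached and that ttnn.from_torch / ttnn.to_torch round-trip as documented) might look like:

import torch
import ttnn

device = ttnn.open_device(device_id=0)

# Tile-aligned reference tensor so no padding is involved.
ref = torch.randn(1, 1, 64, 96).to(torch.bfloat16)
x = ttnn.from_torch(ref, dtype=ttnn.bfloat16, layout=ttnn.ROW_MAJOR_LAYOUT, device=device)

tiled = ttnn.tilize(x, memory_config=ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat16)

# Tilize only reorders data into 32x32 tiles, so converting back should
# reproduce the original values exactly.
assert torch.equal(ttnn.to_torch(tiled), ref)

ttnn.close_device(device)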
