#10644: remove tensor creation ops
umadevimcw authored and KalaivaniMCW committed Jul 31, 2024
1 parent 96c3029 commit 6ad7dbb
Showing 36 changed files with 190 additions and 542 deletions.
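At each call site the change is mechanical: tensor creation ops move from the tt_lib.tensor namespace to ttnn, and, where a memory config is passed, the keyword changes from output_mem_config to memory_config (that second part is sketched again after the first Mistral hunk below). A minimal before/after sketch with a placeholder shape and fill value, not taken from any one file in this diff:

import ttnn

shape = [1, 1, 32, 32]

# Before this commit:
#   const = tt_lib.tensor.full(shape, 0.5)
# After, the same op is reached through ttnn with the same positional arguments:
const = ttnn.full(shape, 0.5)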
15 changes: 0 additions & 15 deletions docs/source/ttnn/ttnn/dependencies/tt_lib.rst
@@ -330,23 +330,8 @@ but in general retaining the data.
Tensor creation operations
==========================

.. autofunction:: tt_lib.tensor.arange

.. autofunction:: tt_lib.tensor.full

.. autofunction:: tt_lib.tensor.ones

.. autofunction:: tt_lib.tensor.ones_like

.. autofunction:: tt_lib.tensor.zeros

.. autofunction:: tt_lib.tensor.zeros_like

.. autofunction:: tt_lib.tensor.full_like

.. autofunction:: tt_lib.tensor.split_last_dim_two_chunks_tiled

.. autofunction:: tt_lib.tensor.empty

Broadcast and Reduce
====================
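For reference, the ops dropped from the tt_lib docs above match the entries removed from the sweep-test list at the bottom of this diff, and each is assumed to have a like-named ttnn counterpart (only arange, full, ones, and empty are exercised directly in the changed files; the zeros and *_like variants are listed on the assumption of matching names and signatures):

import ttnn

ttnn.arange      # was tt_lib.tensor.arange
ttnn.full        # was tt_lib.tensor.full
ttnn.full_like   # was tt_lib.tensor.full_like
ttnn.ones        # was tt_lib.tensor.ones
ttnn.ones_like   # was tt_lib.tensor.ones_like
ttnn.zeros       # was tt_lib.tensor.zeros
ttnn.zeros_like  # was tt_lib.tensor.zeros_like
ttnn.empty       # was tt_lib.tensor.empty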
2 changes: 1 addition & 1 deletion models/demos/metal_BERT_large_11/tt/mha.py
@@ -138,7 +138,7 @@ def op6_unmake_attention_heads(x):
def mha_(activation, attention_mask):
# TODO: Remove hardcoded shape hack
if reserve_split_heads_shape is not None:
temp = tt_lib.tensor.empty(
temp = ttnn.empty(
reserve_split_heads_shape,
tt_lib.tensor.DataType.BFLOAT16,
tt_lib.tensor.Layout.ROW_MAJOR,
@@ -76,7 +76,7 @@ def forward(

# during inference, return the average of both classifier predictions
logits = ttnn.add(cls_logits, distillation_logits)
half = tt_lib.tensor.full(logits.get_legacy_shape(), 0.5)
half = ttnn.full(logits.get_legacy_shape(), 0.5)
logits = ttnn.mul(logits, half)

# if not return_dict:
2 changes: 1 addition & 1 deletion models/experimental/deit/tt/deit_self_attention.py
@@ -67,7 +67,7 @@ def forward(

attention_scores = ttnn.matmul(query_layer, key_layer_transposed)

attention_head_size_tt = tt_lib.tensor.full(attention_scores.get_legacy_shape(), self.attention_head_size)
attention_head_size_tt = ttnn.full(attention_scores.get_legacy_shape(), self.attention_head_size)
attention_head_size_tt = ttnn.sqrt(attention_head_size_tt)
attention_head_size_tt = ttnn.reciprocal(attention_head_size_tt)

3 changes: 2 additions & 1 deletion models/experimental/distilbert/tt/distilbert_model.py
@@ -11,6 +11,7 @@
)

import tt_lib
import ttnn
from dataclasses import dataclass

from models.experimental.distilbert.tt.distilbert_embedding import TtDistilBert_Embeddings
@@ -105,7 +106,7 @@ def forward(

if attention_mask is not None:
input_shape[0:0] = [1, 1]
attention_mask = tt_lib.tensor.ones(input_shape)
attention_mask = ttnn.ones(input_shape)

head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
"""
@@ -65,7 +65,7 @@ def __init__(self, config, state_dict=None, base_address="", device=None):
self.attention_head_size = self.dim // self.n_heads

def const_tensor(self, shape: List[int], value: int) -> tt_lib.tensor.Tensor:
return tt_lib.tensor.full(shape, value)
return ttnn.full(shape, value)

def get_min(self, tensor: tt_lib.tensor.Tensor):
tensor = tt_to_torch_tensor(tensor)
@@ -95,18 +95,18 @@ def attention(config, x, bcast_freq_xq, bcast_freq_xk, positions, mask, seqlen,
scatter_pos = scatter_pos.to(torch.int64)
scatter_pos = scatter_pos.repeat(bsz, 1, config.n_kv_heads, config.head_dim)

cache_k = tt_lib.tensor.empty(
cache_k = ttnn.empty(
[config.max_batch_size, config.sliding_window, config.n_kv_heads, config.head_dim],
layout=tt_lib.tensor.Layout.ROW_MAJOR,
device=device,
output_mem_config=config.out_mem_config,
memory_config=config.out_mem_config,
)
cache_k = tt_to_torch_tensor(cache_k).to(torch.float32)
cache_v = tt_lib.tensor.empty(
cache_v = ttnn.empty(
[config.max_batch_size, config.sliding_window, config.n_kv_heads, config.head_dim],
layout=tt_lib.tensor.Layout.ROW_MAJOR,
device=device,
output_mem_config=config.out_mem_config,
memory_config=config.out_mem_config,
)
cache_v = tt_to_torch_tensor(cache_v).to(torch.float32)
cache_k[:bsz].scatter_(dim=1, index=scatter_pos, src=xk[:, -config.sliding_window :])
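Besides the namespace move, creation ops that take a memory config also rename the keyword: output_mem_config becomes memory_config, as in the KV-cache allocation above. A condensed sketch of that pattern; allocate_kv_cache is a hypothetical helper, and config/device stand in for the objects the Mistral code already has:

import tt_lib
import ttnn

def allocate_kv_cache(config, device):
    # Previously: tt_lib.tensor.empty(..., output_mem_config=config.out_mem_config)
    return ttnn.empty(
        [config.max_batch_size, config.sliding_window, config.n_kv_heads, config.head_dim],
        layout=tt_lib.tensor.Layout.ROW_MAJOR,
        device=device,
        memory_config=config.out_mem_config,
    )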
4 changes: 2 additions & 2 deletions models/experimental/mistral/mistral_helper_funcs.py
@@ -113,7 +113,7 @@ def get_freqs_cis(freqs_cis: torch.Tensor, query_shape, key_shape, device=None,
BCH = tt_lib.tensor.BcastOpDim.HW
BCMUL = tt_lib.tensor.BcastOpMath.MUL

t_one_xq = tt_lib.tensor.ones(query_shape, output_mem_config=mem_config)
t_one_xq = ttnn.ones(query_shape, memory_config=mem_config)
t_one_xq = ttnn.permute(t_one_xq, (3, 1, 2, 0), memory_config=mem_config)

freqs_real = ttnn.permute(freqs_cis.real, (3, 1, 2, 0), memory_config=mem_config)
@@ -130,7 +130,7 @@ def get_freqs_cis(freqs_cis: torch.Tensor, query_shape, key_shape, device=None,
bcast_freq_re_xq.deallocate()
bcast_freq_im_xq.deallocate()

t_one_xk = tt_lib.tensor.ones(key_shape, output_mem_config=mem_config)
t_one_xk = ttnn.ones(key_shape, memory_config=mem_config)
t_one_xk = ttnn.permute(t_one_xk, (3, 1, 2, 0), memory_config=mem_config)

bcast_freq_re_xk = tt_lib.tensor.bcast(t_one_xk, freqs_real, BCMUL, BCH, output_mem_config=mem_config)
8 changes: 4 additions & 4 deletions models/experimental/mistral/tt/mistral_attention.py
@@ -87,18 +87,18 @@ def __init__(
)
self.cache_v = torch.empty(args.max_batch_size, args.sliding_window, self.n_kv_heads, self.args.head_dim)
else:
cache_k = tt_lib.tensor.empty(
cache_k = ttnn.empty(
[args.max_batch_size, args.sliding_window, self.n_kv_heads, self.args.head_dim],
layout=tt_lib.tensor.Layout.ROW_MAJOR,
device=self.device,
output_mem_config=self.args.out_mem_config,
memory_config=self.args.out_mem_config,
)
self.cache_k = tt_to_torch_tensor(cache_k).to(torch.float32)
cache_v = tt_lib.tensor.empty(
cache_v = ttnn.empty(
[args.max_batch_size, args.sliding_window, self.n_kv_heads, self.args.head_dim],
layout=tt_lib.tensor.Layout.ROW_MAJOR,
device=self.device,
output_mem_config=self.args.out_mem_config,
memory_config=self.args.out_mem_config,
)
self.cache_v = tt_to_torch_tensor(cache_v).to(torch.float32)

2 changes: 1 addition & 1 deletion models/experimental/mistral/tt/mistral_transformer.py
@@ -97,7 +97,7 @@ def forward(
mask: Optional[torch.Tensor] = None
if input_ids.get_legacy_shape()[-1] > 1:
seqlen = input_ids.get_legacy_shape()[-1]
tensor = tt_lib.tensor.full(
tensor = ttnn.full(
(1, 1, seqlen, seqlen),
fill_value=1.0,
)
4 changes: 2 additions & 2 deletions models/experimental/nanogpt/tt/nanogpt_attention.py
@@ -48,7 +48,7 @@ def __init__(self, config, base_address, device, tt_cache_path, dtype):
self.n_head = self.config.n_head
self.n_embd = self.config.n_embd

temp_bias = ttnn.tril(tt_lib.tensor.ones([1, 1, self.block_size, self.block_size]))
temp_bias = ttnn.tril(ttnn.ones([1, 1, self.block_size, self.block_size]))
temp_bias = tt_to_torch_tensor(temp_bias)
self.register_buffer(
"bias",
@@ -69,7 +69,7 @@ def __init__(self, config, base_address, device, tt_cache_path, dtype):
)

def const_tensor(self, shape, value):
return tt_lib.tensor.full(shape, value)
return ttnn.full(shape, value)

def forward(self, x: tt_lib.tensor.Tensor) -> tt_lib.tensor.Tensor:
(
2 changes: 1 addition & 1 deletion models/experimental/nanogpt/tt/nanogpt_model.py
@@ -67,7 +67,7 @@ def forward(self, idx: torch.Tensor) -> tt_lib.tensor.Tensor:
assert (
t <= self.config.block_size
), f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
pos = tt_lib.tensor.arange(0, t, 1)
pos = ttnn.arange(0, t, 1)
pos = tt_to_torch_tensor(pos)
pos = pos.squeeze(0).squeeze(0)
pos = pos.to(dtype=torch.int64)
14 changes: 7 additions & 7 deletions models/experimental/roberta/tt/roberta_model.py
@@ -10,7 +10,7 @@
from typing import Optional, Tuple, Union, List

import tt_lib

import ttnn
from models.experimental.roberta.tt.roberta_encoder import TtRobertaEncoder
from models.experimental.roberta.tt.roberta_pooler import TtRobertaPooler
from models.experimental.roberta.tt.roberta_embeddings import PytorchEmbeddings
@@ -170,8 +170,8 @@ def get_extended_attention_mask(
# positions we want to attend and the dtype's smallest value for masked positions.
# Since we are adding it to the raw scores before the softmax, this is
# effectively the same as removing these entirely.
self.ones_const = tt_lib.tensor.full(extended_attention_mask.get_legacy_shape(), 1.0)
self.mul_const = tt_lib.tensor.full(extended_attention_mask.get_legacy_shape(), self.dtype_min_const)
self.ones_const = ttnn.full(extended_attention_mask.get_legacy_shape(), 1.0)
self.mul_const = ttnn.full(extended_attention_mask.get_legacy_shape(), self.dtype_min_const)
extended_attention_mask = ttnn.sub(self.ones_const, extended_attention_mask, memory_config=self.mem_config)

extended_attention_mask = ttnn.mul(extended_attention_mask, self.mul_const, memory_config=self.mem_config)
@@ -196,8 +196,8 @@ def invert_attention_mask(self, encoder_attention_mask: tt_lib.tensor.Tensor) ->

encoder_extended_attention_mask = torch2tt_tensor(torch_encoder_extended_attention_mask, self.device)

self.ones_const = tt_lib.tensor.full(encoder_extended_attention_mask.get_legacy_shape(), 1.0)
self.mul_const = tt_lib.tensor.full(encoder_extended_attention_mask.get_legacy_shape(), self.dtype_min_const)
self.ones_const = ttnn.full(encoder_extended_attention_mask.get_legacy_shape(), 1.0)
self.mul_const = ttnn.full(encoder_extended_attention_mask.get_legacy_shape(), self.dtype_min_const)

encoder_extended_attention_mask = ttnn.sub(
self.ones_const,
@@ -339,7 +339,7 @@ def forward(
past_key_values_length = past_key_values[0][0].get_legacy_shape()[2] if past_key_values is not None else 0

if attention_mask is None:
attention_mask = tt_lib.tensor.full((1, 1, batch_size, seq_length + past_key_values_length), 0.0)
attention_mask = ttnn.full((1, 1, batch_size, seq_length + past_key_values_length), 0.0)

if token_type_ids is None:
if hasattr(self.embeddings, "token_type_ids"):
@@ -364,7 +364,7 @@
) = encoder_hidden_states.get_legacy_shape()
encoder_hidden_shape = (1, 1, encoder_batch_size, encoder_sequence_length)
if encoder_attention_mask is None:
encoder_attention_mask = tt_lib.tensor.full(encoder_hidden_shape, 1.1)
encoder_attention_mask = ttnn.full(encoder_hidden_shape, 1.1)
encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
else:
encoder_extended_attention_mask = None
2 changes: 1 addition & 1 deletion models/experimental/roberta/tt/roberta_self_attention.py
@@ -184,7 +184,7 @@ def forward(
# back to tt
attention_scores = torch2tt_tensor(attention_scores, self.device)

div_const = tt_lib.tensor.full(
div_const = ttnn.full(
attention_scores.get_legacy_shape(),
1.0 / math.sqrt(self.attention_head_size),
)
3 changes: 2 additions & 1 deletion models/experimental/ssd/tt/ssd.py
@@ -5,6 +5,7 @@
import torch
from torch import nn
import tt_lib
import ttnn
import tt_lib.fallback_ops as fallback_ops
from typing import List, Optional, Tuple, Dict, OrderedDict

@@ -102,7 +103,7 @@ def __init__(

def get_in_channels(self, backbone: TtSSDLiteFeatureExtractorMobileNet):
size = (320, 320)
temporary_image = tt_lib.tensor.ones([1, 3, size[1], size[0]], device=self.device)
temporary_image = ttnn.ones([1, 3, size[1], size[0]], device=self.device)
backbone.eval()
features = backbone(temporary_image)
out_channels = [tensor.get_legacy_shape()[1] for i, tensor in features.items()]
2 changes: 1 addition & 1 deletion models/experimental/stable_diffusion/tt/cross_attention.py
@@ -186,7 +186,7 @@ def get_attention_scores(
# self.scale,
# self.scale)

scale_tensor = ttl.tensor.full(temp.get_legacy_shape(), self.scale)
scale_tensor = ttnn.full(temp.get_legacy_shape(), self.scale)
attention_scores = ttnn.mul(scale_tensor, temp)

if attention_mask is not None:
2 changes: 1 addition & 1 deletion models/experimental/stable_diffusion/tt/residual_block.py
@@ -205,7 +205,7 @@ def forward(self, input_tensor: ttl.tensor.Tensor, temb: ttl.tensor.Tensor) -> t

# create a tensor of size output_scale_factor
output_sc_recip = 1 / self.output_scale_factor
output_sc_recip = ttl.tensor.full(input_tensor.get_legacy_shape(), output_sc_recip)
output_sc_recip = ttnn.full(input_tensor.get_legacy_shape(), output_sc_recip)
output_tensor = ttnn.add(input_tensor, hidden_states)
output_tensor = ttnn.mul(output_tensor, output_sc_recip)

2 changes: 1 addition & 1 deletion models/experimental/swin/tt/swin_embeddings.py
@@ -44,7 +44,7 @@ def __init__(self, config, state_dict, base_address, device, use_mask_token=Fals
self.norm = fallback_ops.LayerNorm(gamma, beta, normalized_shape=config.embed_dim, eps=config.layer_norm_eps)

def const_tensor(self, shape, value):
return tt_lib.tensor.full(shape, value)
return ttnn.full(shape, value)

def forward(
self,
2 changes: 1 addition & 1 deletion models/experimental/swin/tt/swin_self_attention.py
@@ -70,7 +70,7 @@ def __init__(
self.value_bias = torch_to_tt_tensor_rm(state_dict[f"{base_address}.value.bias"], self.device)

def const_tensor(self, shape, value):
return tt_lib.tensor.full(shape, value)
return ttnn.full(shape, value)

def transpose_for_scores(self, x: tt_lib.tensor.Tensor) -> tt_lib.tensor.Tensor:
# x must be 4d originaly
3 changes: 2 additions & 1 deletion models/experimental/whisper/tt/whisper_attention.py
@@ -5,6 +5,7 @@
import torch
import torch.nn as nn
import tt_lib
import ttnn
from typing import Optional, Tuple, Union

from models.utility_functions import torch2tt_tensor, tt2torch_tensor
@@ -114,7 +115,7 @@ def forward(
if q_proj_shape == self.cached_q_proj_shape:
q_proj_mul_const = self.q_proj_mul_const
else:
self.q_proj_mul_const = tt_lib.tensor.full(q_proj_shape, self.scaling)
self.q_proj_mul_const = ttnn.full(q_proj_shape, self.scaling)
self.cached_q_proj_shape = q_proj_shape
q_proj_mul_const = self.q_proj_mul_const

32 changes: 9 additions & 23 deletions models/experimental/whisper/tt/whisper_for_audio_classification.py
@@ -3,6 +3,7 @@
# SPDX-License-Identifier: Apache-2.0

import tt_lib
import ttnn
import torch
import torch.nn as nn
from dataclasses import dataclass
@@ -14,6 +15,7 @@

from models.experimental.whisper.tt.whisper_encoder import TtWhisperEncoder


@dataclass
class TtWhisperForAudioClassificationOutput:
loss: Optional[tt_lib.tensor.Tensor] = None
@@ -37,16 +39,12 @@ def __init__(self, state_dict, device, config):
config=config,
)

num_layers = (
config.num_hidden_layers + 1
) # transformer layers + input embeddings
num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings
if config.use_weighted_layer_sum:
# Not using this parameter for now
N, C, H, W = 1, 1, 1, num_layers
weight_init_const = 1.0 / num_layers
self.layer_weights = tt_lib.tensor.full(
(1, 1, 1, num_layers), weight_init_const
)
self.layer_weights = ttnn.full((1, 1, 1, num_layers), weight_init_const)

self.projector_weight = torch2tt_tensor(
state_dict[f"projector.weight"], self.device, tt_lib.tensor.Layout.ROW_MAJOR
@@ -122,19 +120,11 @@ def forward(
'af_za'
```"""

output_attentions = (
output_attentions
if output_attentions is not None
else self.config.output_attentions
)
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states
if output_hidden_states is not None
else self.config.output_hidden_states
)
return_dict = (
return_dict if return_dict is not None else self.config.use_return_dict
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict

if encoder_outputs is None:
encoder_outputs = self.encoder(
@@ -160,17 +150,13 @@
hidden_states = encoder_outputs.last_hidden_state

# Apply Linear layer
hidden_states = linear(
hidden_states, self.projector_weight, self.projector_bias
)
hidden_states = linear(hidden_states, self.projector_weight, self.projector_bias)

# Torch mean
torch_hidden_states = tt2torch_tensor(hidden_states)
torch_pooled_output = torch_hidden_states.mean(dim=-2)
# If something changes these dimension -2 should always work
pooled_output = torch2tt_tensor(
torch_pooled_output, self.device, tt_lib.tensor.Layout.ROW_MAJOR
)
pooled_output = torch2tt_tensor(torch_pooled_output, self.device, tt_lib.tensor.Layout.ROW_MAJOR)

# Apply classifier layer
logits = linear(pooled_output, self.classifier_weight, self.classifier_bias)
@@ -74,14 +74,6 @@ def custom_compare(*args, **kwargs):
"hypot",
"hardswish",
"hardsigmoid",
"ones_like",
"zeros_like",
"full_like",
"ones",
"empty",
"zeros",
"full",
"arange",
"hardshrink",
"softshrink",
"sinh",