#9999: remove old maxpools (#12243)
* #9999: remove old maxpools

* #9999: remove old unit tests and move nondivis set over to new

* #9999: minor

* #9999: removing more old maxpool usages

* #9999: removing more old maxpool usages

* #9999: Another left-over test set

* #9999: max_pool2d_new -> max_pool2d

* #9999: Update remaining large resnets

* #9999: remove suffix new
mywoodstock authored Sep 9, 2024
1 parent fba4a2d commit 2b5d9e4
Showing 40 changed files with 415 additions and 5,369 deletions.
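
The commit messages above describe replacing the class-based ttnn.MaxPool2d op with the functional ttnn.max_pool2d call (and dropping the temporary max_pool2d_new name). Below is a minimal before/after sketch of that migration, assembled from the hunks in this commit; it assumes a device handle, a batch_size, and an activation tensor x already in scope, and the concrete sizes and channel count are illustrative placeholders rather than values from any single call site.

# Before (removed in this commit): an op object built per model, with per-model caches
reader_patterns_cache = {}
max_pool = ttnn.MaxPool2d(
    kernel_size=(3, 3),
    stride=(2, 2),
    padding=(1, 1),
    dilation=(1, 1),
    dtype=ttnn.bfloat16,
    device=device,
    batch_size=batch_size,
    input_height=112,
    input_width=112,
    reader_patterns_cache=reader_patterns_cache,
    deallocate_activation=True,
    parallel_config_override={},
    channels=64,
)
x = max_pool(x)

# After: one stateless call, so there is no reader-patterns cache to construct or clear
x = ttnn.max_pool2d(
    input_tensor=x,
    batch_size=batch_size,
    input_h=112,
    input_w=112,
    channels=64,
    kernel_size=[3, 3],
    stride=[2, 2],
    padding=[1, 1],
    dilation=[1, 1],
    device=device,
)
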
2 changes: 1 addition & 1 deletion docs/source/ttnn/ttnn/api.rst
@@ -443,7 +443,7 @@ Pooling
:maxdepth: 1

ttnn/global_avg_pool2d
ttnn/MaxPool2d
ttnn/max_pool2d

Vision
========
6 changes: 0 additions & 6 deletions docs/source/ttnn/ttnn/ttnn/MaxPool2d.rst

This file was deleted.

6 changes: 6 additions & 0 deletions docs/source/ttnn/ttnn/ttnn/max_pool2d.rst
@@ -0,0 +1,6 @@
.. _ttnn.max_pool2d:

ttnn.max_pool2d
###############

.. autofunction:: ttnn.max_pool2d
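
For reference, a minimal standalone sketch of calling the newly documented ttnn.max_pool2d, using the keyword arguments seen in the model code below. It assumes the activation is an NHWC tensor flattened to [1, 1, N*H*W, C] in row-major layout and that the op accepts it in this interleaved form (in the models below the input arrives sharded from a preceding conv); the shapes are illustrative.

import torch
import ttnn

device = ttnn.open_device(device_id=0)

# Illustrative NHWC activation flattened to [1, 1, N*H*W, C], in bfloat16
torch_input = torch.randn(1, 1, 56 * 56, 64, dtype=torch.bfloat16)
x = ttnn.from_torch(torch_input, layout=ttnn.ROW_MAJOR_LAYOUT, device=device)

# Same 3x3 / stride-2 / pad-1 configuration the ResNet variants below use after conv1
x = ttnn.max_pool2d(
    input_tensor=x,
    batch_size=1,
    input_h=56,
    input_w=56,
    channels=64,
    kernel_size=[3, 3],
    stride=[2, 2],
    padding=[1, 1],
    dilation=[1, 1],
    device=device,
)

ttnn.close_device(device)
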
@@ -395,25 +395,6 @@ def __init__(
self.conv1_output_channels = self.conv1_weight_tensor.shape[0]
assert self.conv1_weight_tensor.shape[2] == 4

self.max_pool_reader_patterns_cache = {}
max_pool_parallel_config_override = {}

self.max_pool = ttnn.MaxPool2d(
kernel_size=(3, 3),
stride=(2, 2),
padding=(1, 1),
dilation=(1, 1),
dtype=ttnn.bfloat16,
device=self.device,
batch_size=self.batch_size,
input_height=256,
input_width=256,
reader_patterns_cache=self.max_pool_reader_patterns_cache,
deallocate_activation=True,
parallel_config_override=max_pool_parallel_config_override,
channels=self.conv1_output_channels,
)

self.layer1 = self._make_layer(
parameters=parameters.layer1,
planes=64,
@@ -481,7 +462,6 @@ def __init__(
def __del__(self):
# Need to clear global configs for each Resnet run
self.conv_op_cache.clear()
self.max_pool_reader_patterns_cache.clear()

def _make_layer(
self,
@@ -570,15 +550,18 @@ def first_run(self, input_tensor, device, batch_size, ops_parallel_config) -> tt
)
# Relu is fused with conv1

if self.batch_size == 20:
x = ttnn.reallocate(x)

if is_wormhole_b0() and self.batch_size == 20:
# TODO: fix the need to do the reshard here
x = ttnn.to_memory_config(x, ttnn.L1_MEMORY_CONFIG)
x = ttnn.to_layout(x, ttnn.ROW_MAJOR_LAYOUT)
x = ttnn.to_memory_config(x, self.max_pool.max_pool.input_sharded_memory_config)
x = self.max_pool(x)
x = ttnn.max_pool2d(
input_tensor=x,
batch_size=self.batch_size,
input_h=x_height,
input_w=x_width,
channels=self.conv1_output_channels,
kernel_size=[3, 3],
stride=[2, 2],
padding=[1, 1],
dilation=[1, 1],
device=device,
)

x_height = 128
x_width = 128
@@ -868,15 +851,18 @@ def optimized_run(self, input_tensor, device, batch_size, ops_parallel_config, c
)
# Relu is fused with conv1

if self.batch_size == 20:
x = ttnn.reallocate(x)

if is_wormhole_b0() and self.batch_size == 20:
# TODO: fix the need to do the reshard here
x = ttnn.to_memory_config(x, ttnn.L1_MEMORY_CONFIG)
x = ttnn.to_layout(x, ttnn.ROW_MAJOR_LAYOUT)
x = ttnn.to_memory_config(x, self.max_pool.max_pool.input_sharded_memory_config)
x = self.max_pool(x)
x = ttnn.max_pool2d(
input_tensor=x,
batch_size=self.batch_size,
input_h=x_height,
input_w=x_width,
channels=self.conv1_output_channels,
kernel_size=[3, 3],
stride=[2, 2],
padding=[1, 1],
dilation=[1, 1],
device=device,
)

x_height = 128
x_width = 128
@@ -14,9 +14,6 @@
from loguru import logger
from tests.ttnn.utils_for_testing import assert_with_pcc

use_new_maxpool2d = True


hardcoded_matmul_config_linear = {
8: ttnn.MatmulMultiCoreReuseMultiCast1DProgramConfig(
compute_with_storage_grid_size=(8, 4),
@@ -222,26 +219,6 @@ def __call__(
logger.debug(
f"==== Running {batch_size}, {input_height}, {input_width}, {self.conv1_input_channels}, {self.conv1_output_channels}"
)
# if (
# is_wormhole_b0()
# and (batch_size == 20) ## or batch_size == 16)
# and input_height == 56
# and self.conv1_input_channels == 256
# and self.conv1_output_channels == 128
# ):
# # TODO: fix the need to do the reshard here
# ## reshard to 49 cores
# ## TensorMemoryLayout::HEIGHT_SHARDED;(grid={[(x=0;y=0) - (x=7;y=5)]; [(x=0;y=6) - (x=0;y=6)]}; shape={1280; 256}; orientation=ShardOrientation::ROW_MAJOR; halo=false
# mem_config = ttnn.create_sharded_memory_config_(
# ttnn.Shape([batch_size * input_height * input_width, 256]),
# (ttnn.CoreGrid(x=8, y=6), ttnn.CoreGrid(x=1, y=7)),
# ttnn.TensorMemoryLayout.HEIGHT_SHARDED,
# ttnn.ShardOrientation.ROW_MAJOR,
# tile_layout=True,
# )
# x_resharded = ttnn.to_memory_config(x, mem_config)
# ttnn.deallocate(x)
# x = ttnn.reallocate(x_resharded)

# conv1 is 1x1 conv
logger.debug(f"Running conv1")
@@ -504,24 +481,6 @@ def __init__(
self.max_pool_reader_patterns_cache = {}
max_pool_parallel_config_override = {}

if not use_new_maxpool2d:
self.max_pool = ttnn.MaxPool2d(
kernel_size=(3, 3),
stride=(2, 2),
padding=(1, 1),
dilation=(1, 1),
dtype=ttnn.bfloat16,
device=self.device,
batch_size=self.batch_size,
input_height=112,
input_width=112,
reader_patterns_cache=self.max_pool_reader_patterns_cache,
deallocate_activation=True,
parallel_config_override=max_pool_parallel_config_override,
channels=self.conv1_output_channels,
mesh_mapper=self.mesh_mapper,
)

self.layer1 = self._make_layer(
parameters=parameters.layer1,
planes=64,
@@ -771,21 +730,18 @@ def run(self, input_tensor, device, ops_parallel_config, conv_op_cache={}) -> tt
if self.batch_size == 20:
x = ttnn.reallocate(x)

if use_new_maxpool2d:
x = ttnn.max_pool2d_new(
input_tensor=x,
batch_size=self.batch_size,
input_h=x_height,
input_w=x_width,
channels=self.conv1_output_channels,
kernel_size=[3, 3],
stride=[2, 2],
padding=[1, 1],
dilation=[1, 1],
device=device,
)
else:
x = self.max_pool(x)
x = ttnn.max_pool2d(
input_tensor=x,
batch_size=self.batch_size,
input_h=x_height,
input_w=x_width,
channels=self.conv1_output_channels,
kernel_size=[3, 3],
stride=[2, 2],
padding=[1, 1],
dilation=[1, 1],
device=device,
)

x_height = 56
x_width = 56
@@ -390,25 +390,6 @@ def __init__(
self.conv1_output_channels = self.conv1_weight_tensor.shape[0]
assert self.conv1_weight_tensor.shape[2] == 4

self.max_pool_reader_patterns_cache = {}
max_pool_parallel_config_override = {}

self.max_pool = ttnn.MaxPool2d(
kernel_size=(3, 3),
stride=(2, 2),
padding=(1, 1),
dilation=(1, 1),
dtype=ttnn.bfloat16,
device=self.device,
batch_size=self.batch_size,
input_height=448,
input_width=448,
reader_patterns_cache=self.max_pool_reader_patterns_cache,
deallocate_activation=True,
parallel_config_override=max_pool_parallel_config_override,
channels=self.conv1_output_channels,
)

self.layer1 = self._make_layer(
parameters=parameters.layer1,
planes=64,
@@ -476,7 +457,6 @@ def __init__(
def __del__(self):
# Need to clear global configs for each Resnet run
self.conv_op_cache.clear()
self.max_pool_reader_patterns_cache.clear()

def _make_layer(
self,
@@ -567,12 +547,18 @@ def first_run(self, input_tensor, device, batch_size, ops_parallel_config) -> tt
if self.batch_size == 20 or self.batch_size == 1:
x = ttnn.reallocate(x)

if is_wormhole_b0() and self.batch_size == 20:
# TODO: fix the need to do the reshard here
x = ttnn.to_memory_config(x, ttnn.L1_MEMORY_CONFIG)
x = ttnn.to_layout(x, ttnn.ROW_MAJOR_LAYOUT)
x = ttnn.to_memory_config(x, self.max_pool.max_pool.input_sharded_memory_config)
x = self.max_pool(x)
x = ttnn.max_pool2d(
input_tensor=x,
batch_size=self.batch_size,
input_h=x_height,
input_w=x_width,
channels=self.conv1_output_channels,
kernel_size=[3, 3],
stride=[2, 2],
padding=[1, 1],
dilation=[1, 1],
device=device,
)

x_height = 224
x_width = 224
@@ -859,12 +845,18 @@ def optimized_run(self, input_tensor, device, batch_size, ops_parallel_config, c
if self.batch_size == 20 or self.batch_size == 1:
x = ttnn.reallocate(x)

if is_wormhole_b0() and self.batch_size == 20:
# TODO: fix the need to do the reshard here
x = ttnn.to_memory_config(x, ttnn.L1_MEMORY_CONFIG)
x = ttnn.to_layout(x, ttnn.ROW_MAJOR_LAYOUT)
x = ttnn.to_memory_config(x, self.max_pool.max_pool.input_sharded_memory_config)
x = self.max_pool(x)
x = ttnn.max_pool2d(
input_tensor=x,
batch_size=self.batch_size,
input_h=x_height,
input_w=x_width,
channels=self.conv1_output_channels,
kernel_size=[3, 3],
stride=[2, 2],
padding=[1, 1],
dilation=[1, 1],
device=device,
)

x_height = 224
x_width = 224
@@ -392,25 +392,6 @@ def __init__(
self.conv1_output_channels = self.conv1_weight_tensor.shape[0]
assert self.conv1_weight_tensor.shape[2] == 4

self.max_pool_reader_patterns_cache = {}
max_pool_parallel_config_override = {}

self.max_pool = ttnn.MaxPool2d(
kernel_size=(3, 3),
stride=(2, 2),
padding=(1, 1),
dilation=(1, 1),
dtype=ttnn.bfloat16,
device=self.device,
batch_size=self.batch_size,
input_height=448,
input_width=448,
reader_patterns_cache=self.max_pool_reader_patterns_cache,
deallocate_activation=True,
parallel_config_override=max_pool_parallel_config_override,
channels=self.conv1_output_channels,
)

self.layer1 = self._make_layer(
parameters=parameters.layer1,
planes=64,
@@ -478,7 +459,6 @@ def __init__(
def __del__(self):
# Need to clear global configs for each Resnet run
self.conv_op_cache.clear()
self.max_pool_reader_patterns_cache.clear()

def _make_layer(
self,
@@ -569,12 +549,18 @@ def first_run(self, input_tensor, device, batch_size, ops_parallel_config) -> tt
if self.batch_size == 20 or self.batch_size == 1:
x = ttnn.reallocate(x)

if is_wormhole_b0() and self.batch_size == 20:
# TODO: fix the need to do the reshard here
x = ttnn.to_memory_config(x, ttnn.L1_MEMORY_CONFIG)
x = ttnn.to_layout(x, ttnn.ROW_MAJOR_LAYOUT)
x = ttnn.to_memory_config(x, self.max_pool.max_pool.input_sharded_memory_config)
x = self.max_pool(x)
x = ttnn.max_pool2d(
input_tensor=x,
batch_size=self.batch_size,
input_h=x_height,
input_w=x_width,
channels=self.conv1_output_channels,
kernel_size=[3, 3],
stride=[2, 2],
padding=[1, 1],
dilation=[1, 1],
device=device,
)

x_height = 224
x_width = 224
@@ -888,12 +874,18 @@ def optimized_run(self, input_tensor, device, batch_size, ops_parallel_config, c
if self.batch_size == 20 or self.batch_size == 1:
x = ttnn.reallocate(x)

if is_wormhole_b0() and self.batch_size == 20:
# TODO: fix the need to do the reshard here
x = ttnn.to_memory_config(x, ttnn.L1_MEMORY_CONFIG)
x = ttnn.to_layout(x, ttnn.ROW_MAJOR_LAYOUT)
x = ttnn.to_memory_config(x, self.max_pool.max_pool.input_sharded_memory_config)
x = self.max_pool(x)
x = ttnn.max_pool2d(
input_tensor=x,
batch_size=self.batch_size,
input_h=x_height,
input_w=x_width,
channels=self.conv1_output_channels,
kernel_size=[3, 3],
stride=[2, 2],
padding=[1, 1],
dilation=[1, 1],
device=device,
)

x_height = 224
x_width = 224
