From 9ca712cc3d38e010e54354a3013414c029c2aa85 Mon Sep 17 00:00:00 2001
From: Brian Van Essen
Date: Fri, 16 Dec 2022 15:50:31 -0800
Subject: [PATCH] Bugfix input layer activations (#2164)

* Fixed the input layer so that it would only resize the activation
  matrix if it wasn't already set up to be a view of the
  data_coordinator's matrix.

  This addresses a significant performance bug in the data ingestion
  where the activation matrix was a view into the data coordinator's
  internal buffers. Then on each step the matrix would be deallocated,
  reallocated, then set to a view again. This also triggered a
  performance problem in the memory pool allocator in Hydrogen, which
  is not designed to cache very large allocations.

  Added guards in the top level data_type_layer to make sure that
  matrices that are views of other matrices are not resized.

* Added a release note about the performance fix

* Updated the distconv identity test to force it to keep its error
  signals to avoid accessing non-existent previous error signals.
---
 ReleaseNotes.txt                          |  2 ++
 .../test_unit_layer_identity_distconv.py  | 20 +++++++++++--------
 src/layers/data_type_layer.cpp            | 12 +++++++++++
 src/layers/io/input_layer.cpp             | 14 +++++++++++--
 4 files changed, 38 insertions(+), 10 deletions(-)

diff --git a/ReleaseNotes.txt b/ReleaseNotes.txt
index c8439d0754a..b527a38a4c5 100644
--- a/ReleaseNotes.txt
+++ b/ReleaseNotes.txt
@@ -86,6 +86,8 @@ Bug fixes:
  - Fixed build and runtime errors with DistConv
  - Use correct LayerNorm operaton in "Attention Is All You Need"
    Transformer
+ - Fixed a bug where the input layer performed unnecessary memory
+   allocations.
 
 Retired features:
 
diff --git a/ci_test/unit_tests/test_unit_layer_identity_distconv.py b/ci_test/unit_tests/test_unit_layer_identity_distconv.py
index b07d1cd63fd..8ed3ada9135 100644
--- a/ci_test/unit_tests/test_unit_layer_identity_distconv.py
+++ b/ci_test/unit_tests/test_unit_layer_identity_distconv.py
@@ -118,7 +118,7 @@ def construct_model(lbann):
         upper_bound=val+tol,
         error_on_failure=True,
         execution_modes='test'))
-    
+
     # ------------------------------------------
     # Data-parallel layout with DC and activation
     # ------------------------------------------
@@ -131,17 +131,17 @@ def construct_model(lbann):
     _data = lbann.Relu(_data,
                        name="distconv_activation",
                        parallel_strategy=channelwise_parallel_strategy(num_height_groups))
-    
+
     _data = lbann.Reshape(_data, dims=[48])
     z = lbann.L2Norm2(_data)
     obj.append(z)
     metrics.append(lbann.Metric(z, name='data-parallel w activation'))
-    
+
     # Numpy implementation
     vals = []
     for i in range(num_samples()):
         x = get_sample(i).astype(np.float64)
-        y = np.maximum(x,0) 
+        y = np.maximum(x,0)
         z = tools.numpy_l2norm2(y)
         vals.append(z)
     val = np.mean(vals)
@@ -152,11 +152,11 @@ def construct_model(lbann):
         upper_bound=val+tol,
         error_on_failure=True,
         execution_modes='test'))
-    
+
     # ------------------------------------------
     # Model-parallel layout
     # ------------------------------------------
-    
+
     # LBANN implementation
     x = x_lbann
     y = lbann.Identity(x, data_layout='model_parallel')
@@ -179,7 +179,7 @@ def construct_model(lbann):
         upper_bound=val+tol,
         error_on_failure=True,
         execution_modes='test'))
-    
+
     # ------------------------------------------
     # Gradient checking
     # ------------------------------------------
@@ -237,7 +237,11 @@ def construct_data_reader(lbann):
 # Setup PyTest
 # ==============================================
 
+# Runtime parameters/arguments
+environment = tools.get_distconv_environment()
+environment['LBANN_KEEP_ERROR_SIGNALS'] = 1
+
 # Create test functions that can interact with PyTest
 for _test_func in tools.create_tests(setup_experiment, __file__,
-                                     environment=tools.get_distconv_environment()):
+                                     environment=environment):
     globals()[_test_func.__name__] = _test_func
diff --git a/src/layers/data_type_layer.cpp b/src/layers/data_type_layer.cpp
index c709b4ebfe4..fa470c752b8 100644
--- a/src/layers/data_type_layer.cpp
+++ b/src/layers/data_type_layer.cpp
@@ -861,6 +861,12 @@ fp_setup_outputs(El::Int mini_batch_size) {
     if (!keep_original_outputs(i)) continue;
 #endif // LBANN_HAS_DISTCONV
     auto& output = get_activations(i);
+    if (output.Viewing()) {
+      LBANN_ERROR(get_name(),
+                  " fp_setup_outputs should be overridden",
+                  " if it needs to handle outputs that view",
+                  " other matrices");
+    }
     output.Empty(false);
     if (align_outputs) {
       output.AlignWith(alignment_dist);
@@ -1076,6 +1082,12 @@ bp_setup_gradient_wrt_inputs(
     if (!keep_original_gradient_wrt_inputs(i)) continue;
 #endif // LBANN_HAS_DISTCONV
     auto& gradient_wrt_input = get_error_signals(i);
+    if (gradient_wrt_input.Viewing()) {
+      LBANN_ERROR(get_name(),
+                  " bp_setup_gradient_wrt_inputs should be overridden",
+                  " if it needs to handle error signals that view other",
+                  " matrices");
+    }
     gradient_wrt_input.Empty(false);
     gradient_wrt_input.AlignWith(get_prev_activations(i));
     gradient_wrt_input.Resize(get_input_size(i), mini_batch_size);
diff --git a/src/layers/io/input_layer.cpp b/src/layers/io/input_layer.cpp
index dd267e692b2..58342e2a55b 100644
--- a/src/layers/io/input_layer.cpp
+++ b/src/layers/io/input_layer.cpp
@@ -120,8 +120,18 @@ void input_layer::fp_setup_outputs(El::Int mini_b
     c.set_effective_mini_batch_size(effective_mini_batch_size);
   }
 
-  // Initialize matrices
-  data_type_layer::fp_setup_outputs(mini_batch_size);
+  // Activation matrices are initialized in setup_data and further
+  // managed in the distribute_from_local_matrix function of the
+  // data_coordinator.
+  // However, on the first pass through the execution algorithm it
+  // is necessary to set up the size of the matrix.
+  for (int i = 0; i < this->get_num_children(); ++i) {
+    auto& output = this->get_activations(i);
+    if (!output.Viewing()) {
+      output.Empty(false);
+      output.Resize(this->get_output_size(i), mini_batch_size);
+    }
+  }
 }
 
 template
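
Note (illustration only, not part of the patch): the guards above rely on Hydrogen/Elemental's distinction between a matrix that owns its storage and one that merely views another matrix's storage. The sketch below assumes the sequential El::Matrix API (El::View, Viewing(), Empty(bool), Resize()) behaves as it is used in the diff; the matrix shapes and the buffer names are made up for the example.

// view_guard_sketch.cpp -- standalone sketch, not LBANN code.
// Demonstrates the owner-vs-view check that the patched fp_setup_outputs uses.
#include <El.hpp>
#include <iostream>

int main(int argc, char* argv[])
{
  El::Environment env(argc, argv);

  El::Matrix<float> buffer(48, 16); // stand-in for a data_coordinator buffer
  El::Matrix<float> activations;    // stand-in for the input layer's output matrix

  El::View(activations, buffer);    // activations now aliases buffer's memory
  std::cout << "viewing: " << activations.Viewing() << std::endl; // prints 1

  // Resizing a view would detach it and force a fresh allocation every step,
  // which is the behavior the patch removes; only owning matrices are reshaped.
  if (!activations.Viewing()) {
    activations.Empty(false);
    activations.Resize(48, 16);
  }
  return 0;
}

The point of the sketch is only that Resize() is skipped whenever Viewing() reports the matrix aliases the data_coordinator's buffer, which lets the view persist across steps instead of being deallocated and re-established on every iteration.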