Bugfix input layer activations (#2164)
* Fixed the input layer so that it only resizes the activation
matrix if it is not already set up as a view of the
data_coordinator's matrix.  This addresses a significant performance
bug in data ingestion where the activation matrix was a view into
the data coordinator's internal buffers: on each step the matrix
would be deallocated, reallocated, and then set to a view again.  This
also triggered a performance problem in the memory pool allocator in
Hydrogen, which is not designed to cache very large allocations.

Added guards in the top-level data_type_layer to ensure that
matrices that are views of other matrices are not resized (see the
sketch after this commit message).

* Added a release note about the performance fix

* Updated the distconv identity test to force it to keep its error
signals to avoid accessing non-existent previous error signals.
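
A minimal sketch of the guard pattern described above, for illustration
only. It mirrors the input_layer.cpp hunk shown further down and assumes
nothing beyond the Hydrogen matrix interface already used in this
codebase (Viewing(), Empty(), Resize()); the loop would sit inside
input_layer::fp_setup_outputs, so this->get_activations() and
mini_batch_size come from the surrounding class context.

    // Only resize activation matrices that own their storage.  If the
    // matrix is a view into the data coordinator's buffer, resizing it
    // here would detach the view and force a deallocate/reallocate
    // cycle on every step.
    for (int i = 0; i < this->get_num_children(); ++i) {
      auto& output = this->get_activations(i);
      if (!output.Viewing()) {
        output.Empty(false);
        output.Resize(this->get_output_size(i), mini_batch_size);
      }
    }
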
bvanessen authored Dec 16, 2022
1 parent 2e3012a commit 9ca712c
Showing 4 changed files with 38 additions and 10 deletions.
2 changes: 2 additions & 0 deletions ReleaseNotes.txt
@@ -86,6 +86,8 @@ Bug fixes:
- Fixed build and runtime errors with DistConv
- Use correct LayerNorm operation in "Attention Is All You Need"
  Transformer
- Fixed a bug where the input layer performed unnecessary memory
allocations.

Retired features:

20 changes: 12 additions & 8 deletions ci_test/unit_tests/test_unit_layer_identity_distconv.py
@@ -118,7 +118,7 @@ def construct_model(lbann):
upper_bound=val+tol,
error_on_failure=True,
execution_modes='test'))

# ------------------------------------------
# Data-parallel layout with DC and activation
# ------------------------------------------
@@ -131,17 +131,17 @@ def construct_model(lbann):
_data = lbann.Relu(_data,
name="distconv_activation",
parallel_strategy=channelwise_parallel_strategy(num_height_groups))

_data = lbann.Reshape(_data, dims=[48])
z = lbann.L2Norm2(_data)
obj.append(z)
metrics.append(lbann.Metric(z, name='data-parallel w activation'))

# Numpy implementation
vals = []
for i in range(num_samples()):
x = get_sample(i).astype(np.float64)
y = np.maximum(x,0)
y = np.maximum(x,0)
z = tools.numpy_l2norm2(y)
vals.append(z)
val = np.mean(vals)
@@ -152,11 +152,11 @@ def construct_model(lbann):
upper_bound=val+tol,
error_on_failure=True,
execution_modes='test'))

# ------------------------------------------
# Model-parallel layout
# ------------------------------------------

# LBANN implementation
x = x_lbann
y = lbann.Identity(x, data_layout='model_parallel')
@@ -179,7 +179,7 @@ def construct_model(lbann):
upper_bound=val+tol,
error_on_failure=True,
execution_modes='test'))

# ------------------------------------------
# Gradient checking
# ------------------------------------------
@@ -237,7 +237,11 @@ def construct_data_reader(lbann):
# Setup PyTest
# ==============================================

# Runtime parameters/arguments
environment = tools.get_distconv_environment()
environment['LBANN_KEEP_ERROR_SIGNALS'] = 1

# Create test functions that can interact with PyTest
for _test_func in tools.create_tests(setup_experiment, __file__,
environment=tools.get_distconv_environment()):
environment=environment):
globals()[_test_func.__name__] = _test_func
12 changes: 12 additions & 0 deletions src/layers/data_type_layer.cpp
@@ -861,6 +861,12 @@ fp_setup_outputs(El::Int mini_batch_size) {
if (!keep_original_outputs(i)) continue;
#endif // LBANN_HAS_DISTCONV
auto& output = get_activations(i);
if (output.Viewing()) {
LBANN_ERROR(get_name(),
" fp_setup_outputs should be overridden",
" if it needs to handle outputs that view",
" other matrices");
}
output.Empty(false);
if (align_outputs) {
output.AlignWith(alignment_dist);
@@ -1076,6 +1082,12 @@ bp_setup_gradient_wrt_inputs(
if (!keep_original_gradient_wrt_inputs(i)) continue;
#endif // LBANN_HAS_DISTCONV
auto& gradient_wrt_input = get_error_signals(i);
if (gradient_wrt_input.Viewing()) {
LBANN_ERROR(get_name(),
" bp_setup_gradient_wrt_inputs should be overridden",
" if it needs to handle error signals that view other",
" matrices");
}
gradient_wrt_input.Empty(false);
gradient_wrt_input.AlignWith(get_prev_activations(i));
gradient_wrt_input.Resize(get_input_size(i), mini_batch_size);
14 changes: 12 additions & 2 deletions src/layers/io/input_layer.cpp
@@ -120,8 +120,18 @@ void input_layer<TensorDataType, T_layout, Dev>::fp_setup_outputs(El::Int mini_b
c.set_effective_mini_batch_size(effective_mini_batch_size);
}

// Initialize matrices
data_type_layer<TensorDataType>::fp_setup_outputs(mini_batch_size);
  // Activation matrices are initialized in setup_data and further
  // managed in the data_coordinator's distribute_from_local_matrix
  // function.
  // However, on the first pass through the execution algorithm it
  // is necessary to set up the size of the matrix.
for (int i = 0; i < this->get_num_children(); ++i) {
auto& output = this->get_activations(i);
if (!output.Viewing()) {
output.Empty(false);
output.Resize(this->get_output_size(i), mini_batch_size);
}
}
}

template <typename TensorDataType,
