From 9ca712cc3d38e010e54354a3013414c029c2aa85 Mon Sep 17 00:00:00 2001
From: Brian Van Essen
Date: Fri, 16 Dec 2022 15:50:31 -0800
Subject: [PATCH] Bugfix input layer activations (#2164)

* Fixed the input layer so that it would only resize the activation
  matrix if it wasn't already set up to be a view of the
  data_coordinator's matrix.

  This addresses a significant performance bug in the data ingestion
  where the activation matrix was a view into the data coordinator's
  internal buffers. Then on each step the matrix would be deallocated,
  reallocated, then set to a view again. This also triggered a
  performance problem in the memory pool allocator in Hydrogen, which
  is not designed to cache very large allocations.

  Added guards in the top level data_type_layer to make sure that
  matrices that are views of other matrices are not resized.

* Added a release note about the performance fix

* Updated the distconv identity test to force it to keep its error
  signals to avoid accessing non-existent previous error signals.
---
 ReleaseNotes.txt                          |  2 ++
 .../test_unit_layer_identity_distconv.py  | 20 +++++++++++--------
 src/layers/data_type_layer.cpp            | 12 +++++++++++
 src/layers/io/input_layer.cpp             | 14 +++++++++++--
 4 files changed, 38 insertions(+), 10 deletions(-)

diff --git a/ReleaseNotes.txt b/ReleaseNotes.txt
index c8439d0754a..b527a38a4c5 100644
--- a/ReleaseNotes.txt
+++ b/ReleaseNotes.txt
@@ -86,6 +86,8 @@ Bug fixes:
  - Fixed build and runtime errors with DistConv
  - Use correct LayerNorm operaton in "Attention Is All You Need"
    Transformer
+ - Fixed a bug where the input layer performed unnecessary memory
+   allocations.
 
 Retired features:
 
diff --git a/ci_test/unit_tests/test_unit_layer_identity_distconv.py b/ci_test/unit_tests/test_unit_layer_identity_distconv.py
index b07d1cd63fd..8ed3ada9135 100644
--- a/ci_test/unit_tests/test_unit_layer_identity_distconv.py
+++ b/ci_test/unit_tests/test_unit_layer_identity_distconv.py
@@ -118,7 +118,7 @@ def construct_model(lbann):
         upper_bound=val+tol,
         error_on_failure=True,
         execution_modes='test'))
-    
+
     # ------------------------------------------
     # Data-parallel layout with DC and activation
     # ------------------------------------------
@@ -131,17 +131,17 @@ def construct_model(lbann):
     _data = lbann.Relu(_data,
                        name="distconv_activation",
                        parallel_strategy=channelwise_parallel_strategy(num_height_groups))
-    
+
     _data = lbann.Reshape(_data, dims=[48])
     z = lbann.L2Norm2(_data)
     obj.append(z)
     metrics.append(lbann.Metric(z, name='data-parallel w activation'))
-    
+
     # Numpy implementation
     vals = []
     for i in range(num_samples()):
         x = get_sample(i).astype(np.float64)
-        y = np.maximum(x,0) 
+        y = np.maximum(x,0)
         z = tools.numpy_l2norm2(y)
         vals.append(z)
     val = np.mean(vals)
@@ -152,11 +152,11 @@ def construct_model(lbann):
         upper_bound=val+tol,
         error_on_failure=True,
         execution_modes='test'))
-    
+
     # ------------------------------------------
     # Model-parallel layout
     # ------------------------------------------
-    
+
     # LBANN implementation
     x = x_lbann
     y = lbann.Identity(x, data_layout='model_parallel')
@@ -179,7 +179,7 @@ def construct_model(lbann):
         upper_bound=val+tol,
         error_on_failure=True,
         execution_modes='test'))
-    
+
     # ------------------------------------------
     # Gradient checking
     # ------------------------------------------
@@ -237,7 +237,11 @@ def construct_data_reader(lbann):
 # Setup PyTest
 # ==============================================
 
+# Runtime parameters/arguments
+environment = tools.get_distconv_environment()
+environment['LBANN_KEEP_ERROR_SIGNALS'] = 1
+
 # Create test functions that can interact with PyTest
 for _test_func in tools.create_tests(setup_experiment, __file__,
-                                     environment=tools.get_distconv_environment()):
+                                     environment=environment):
     globals()[_test_func.__name__] = _test_func
diff --git a/src/layers/data_type_layer.cpp b/src/layers/data_type_layer.cpp
index c709b4ebfe4..fa470c752b8 100644
--- a/src/layers/data_type_layer.cpp
+++ b/src/layers/data_type_layer.cpp
@@ -861,6 +861,12 @@ fp_setup_outputs(El::Int mini_batch_size) {
     if (!keep_original_outputs(i)) continue;
 #endif // LBANN_HAS_DISTCONV
     auto& output = get_activations(i);
+    if (output.Viewing()) {
+      LBANN_ERROR(get_name(),
+                  " fp_setup_outputs should be overridden",
+                  " if it needs to handle outputs that view",
+                  " other matrices");
+    }
     output.Empty(false);
     if (align_outputs) {
       output.AlignWith(alignment_dist);
@@ -1076,6 +1082,12 @@ bp_setup_gradient_wrt_inputs(
     if (!keep_original_gradient_wrt_inputs(i)) continue;
 #endif // LBANN_HAS_DISTCONV
     auto& gradient_wrt_input = get_error_signals(i);
+    if (gradient_wrt_input.Viewing()) {
+      LBANN_ERROR(get_name(),
+                  " bp_setup_gradient_wrt_inputs should be overridden",
+                  " if it needs to handle error signals that view other",
+                  " matrices");
+    }
     gradient_wrt_input.Empty(false);
     gradient_wrt_input.AlignWith(get_prev_activations(i));
     gradient_wrt_input.Resize(get_input_size(i), mini_batch_size);
diff --git a/src/layers/io/input_layer.cpp b/src/layers/io/input_layer.cpp
index dd267e692b2..58342e2a55b 100644
--- a/src/layers/io/input_layer.cpp
+++ b/src/layers/io/input_layer.cpp
@@ -120,8 +120,18 @@ void input_layer::fp_setup_outputs(El::Int mini_b
     c.set_effective_mini_batch_size(effective_mini_batch_size);
   }
 
-  // Initialize matrices
-  data_type_layer::fp_setup_outputs(mini_batch_size);
+  // Activation matrices are initialized in setup_data and further
+  // managed in the distribute_from_local_matrix function of the
+  // data_coordinator.
+  // However, on the first pass through the execution algorithm it
+  // is necessary to set up the size of the matrix.
+  for (int i = 0; i < this->get_num_children(); ++i) {
+    auto& output = this->get_activations(i);
+    if (!output.Viewing()) {
+      output.Empty(false);
+      output.Resize(this->get_output_size(i), mini_batch_size);
+    }
+  }
 }
 
 template
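
Note (illustration only, not part of the patch): the guards above rely on Hydrogen/Elemental's distinction between a matrix that owns its storage and one that merely views another matrix's storage. The sketch below assumes the sequential El::Matrix API (El::View, Viewing(), Empty(bool), Resize()) behaves as it is used in the diff; the matrix shapes and the buffer names are made up for the example.

// view_guard_sketch.cpp -- standalone sketch, not LBANN code.
// Demonstrates the owner-vs-view check that the patched fp_setup_outputs uses.
#include <El.hpp>
#include <iostream>

int main(int argc, char* argv[])
{
  El::Environment env(argc, argv);

  El::Matrix<float> buffer(48, 16); // stand-in for a data_coordinator buffer
  El::Matrix<float> activations;    // stand-in for the input layer's output matrix

  El::View(activations, buffer);    // activations now aliases buffer's memory
  std::cout << "viewing: " << activations.Viewing() << std::endl; // prints 1

  // Resizing a view would detach it and force a fresh allocation every step,
  // which is the behavior the patch removes; only owning matrices are reshaped.
  if (!activations.Viewing()) {
    activations.Empty(false);
    activations.Resize(48, 16);
  }
  return 0;
}

The point of the sketch is only that Resize() is skipped whenever Viewing() reports the matrix aliases the data_coordinator's buffer, which lets the view persist across steps instead of being deallocated and re-established on every iteration.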