Bugfix input layer activations (#2164)
* Fixed the input layer so that it only resizes the activation
matrix if it is not already set up as a view of the
data_coordinator's matrix.  This addresses a significant performance
bug in data ingestion where the activation matrix was a view into
the data coordinator's internal buffers: on each step the matrix
would be deallocated, reallocated, and then set to a view again.  This
also triggered a performance problem in the memory pool allocator in
Hydrogen, which is not designed to cache very large allocations.

Added guards in the top-level data_type_layer to ensure that
matrices that are views of other matrices are not resized (see the
sketch after this commit message).

* Added a release note about the performance fix

* Updated the distconv identity test to force it to keep its error
signals to avoid accessing non-existent previous error signals.
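
A minimal sketch of the guard pattern described above, for illustration
only. It mirrors the input_layer.cpp hunk shown further down and assumes
nothing beyond the Hydrogen matrix interface already used in this
codebase (Viewing(), Empty(), Resize()); the loop would sit inside
input_layer::fp_setup_outputs, so this->get_activations() and
mini_batch_size come from the surrounding class context.

    // Only resize activation matrices that own their storage.  If the
    // matrix is a view into the data coordinator's buffer, resizing it
    // here would detach the view and force a deallocate/reallocate
    // cycle on every step.
    for (int i = 0; i < this->get_num_children(); ++i) {
      auto& output = this->get_activations(i);
      if (!output.Viewing()) {
        output.Empty(false);
        output.Resize(this->get_output_size(i), mini_batch_size);
      }
    }
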
bvanessen authored Dec 16, 2022
1 parent 2e3012a commit 9ca712c
Showing 4 changed files with 38 additions and 10 deletions.
2 changes: 2 additions & 0 deletions ReleaseNotes.txt
@@ -86,6 +86,8 @@ Bug fixes:
- Fixed build and runtime errors with DistConv
- Use correct LayerNorm operation in "Attention Is All You Need"
  Transformer
- Fixed a bug where the input layer performed unnecessary memory
allocations.

Retired features:

20 changes: 12 additions & 8 deletions ci_test/unit_tests/test_unit_layer_identity_distconv.py
@@ -118,7 +118,7 @@ def construct_model(lbann):
upper_bound=val+tol,
error_on_failure=True,
execution_modes='test'))

# ------------------------------------------
# Data-parallel layout with DC and activation
# ------------------------------------------
@@ -131,17 +131,17 @@ def construct_model(lbann):
_data = lbann.Relu(_data,
name="distconv_activation",
parallel_strategy=channelwise_parallel_strategy(num_height_groups))

_data = lbann.Reshape(_data, dims=[48])
z = lbann.L2Norm2(_data)
obj.append(z)
metrics.append(lbann.Metric(z, name='data-parallel w activation'))

# Numpy implementation
vals = []
for i in range(num_samples()):
x = get_sample(i).astype(np.float64)
y = np.maximum(x,0)
y = np.maximum(x,0)
z = tools.numpy_l2norm2(y)
vals.append(z)
val = np.mean(vals)
@@ -152,11 +152,11 @@ def construct_model(lbann):
upper_bound=val+tol,
error_on_failure=True,
execution_modes='test'))

# ------------------------------------------
# Model-parallel layout
# ------------------------------------------

# LBANN implementation
x = x_lbann
y = lbann.Identity(x, data_layout='model_parallel')
@@ -179,7 +179,7 @@ def construct_model(lbann):
upper_bound=val+tol,
error_on_failure=True,
execution_modes='test'))

# ------------------------------------------
# Gradient checking
# ------------------------------------------
@@ -237,7 +237,11 @@ def construct_data_reader(lbann):
# Setup PyTest
# ==============================================

# Runtime parameters/arguments
environment = tools.get_distconv_environment()
environment['LBANN_KEEP_ERROR_SIGNALS'] = 1

# Create test functions that can interact with PyTest
for _test_func in tools.create_tests(setup_experiment, __file__,
environment=tools.get_distconv_environment()):
environment=environment):
globals()[_test_func.__name__] = _test_func
12 changes: 12 additions & 0 deletions src/layers/data_type_layer.cpp
@@ -861,6 +861,12 @@ fp_setup_outputs(El::Int mini_batch_size) {
if (!keep_original_outputs(i)) continue;
#endif // LBANN_HAS_DISTCONV
auto& output = get_activations(i);
if (output.Viewing()) {
LBANN_ERROR(get_name(),
" fp_setup_outputs should be overridden",
" if it needs to handle outputs that view",
" other matrices");
}
output.Empty(false);
if (align_outputs) {
output.AlignWith(alignment_dist);
@@ -1076,6 +1082,12 @@ bp_setup_gradient_wrt_inputs(
if (!keep_original_gradient_wrt_inputs(i)) continue;
#endif // LBANN_HAS_DISTCONV
auto& gradient_wrt_input = get_error_signals(i);
if (gradient_wrt_input.Viewing()) {
LBANN_ERROR(get_name(),
" bp_setup_gradient_wrt_inputs should be overridden",
" if it needs to handle error signals that view other",
" matrices");
}
gradient_wrt_input.Empty(false);
gradient_wrt_input.AlignWith(get_prev_activations(i));
gradient_wrt_input.Resize(get_input_size(i), mini_batch_size);
14 changes: 12 additions & 2 deletions src/layers/io/input_layer.cpp
@@ -120,8 +120,18 @@ void input_layer<TensorDataType, T_layout, Dev>::fp_setup_outputs(El::Int mini_b
c.set_effective_mini_batch_size(effective_mini_batch_size);
}

// Initialize matrices
data_type_layer<TensorDataType>::fp_setup_outputs(mini_batch_size);
  // Activation matrices are initialized in setup_data and further
  // managed in the data_coordinator's distribute_from_local_matrix
  // function.
  // However, on the first pass through the execution algorithm it
  // is necessary to set up the size of the matrix.
for (int i = 0; i < this->get_num_children(); ++i) {
auto& output = this->get_activations(i);
if (!output.Viewing()) {
output.Empty(false);
output.Resize(this->get_output_size(i), mini_batch_size);
}
}
}

template <typename TensorDataType,
