-
Notifications
You must be signed in to change notification settings - Fork 79
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add an LBANN Python unit test wrapper and utilities (#2264)
* Add an LBANN Python unit test wrapper and utilities * Add capability for extra metrics and callbacks * Add a simple test that uses the new interface * Relax bounds of NASNet further * Improve support for multidimensional tensors * Add single-tensor test data reader * Improve readability of pytest assertions * Fix weighted sum operation and add test * Make weighted sum in-place-capable, ensure backprop runs all the way through in testing
- Loading branch information
Showing
7 changed files
with
490 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
################################################################################ | ||
# Copyright (c) 2014-2023, Lawrence Livermore National Security, LLC. | ||
# Produced at the Lawrence Livermore National Laboratory. | ||
# Written by the LBANN Research Team (B. Van Essen, et al.) listed in | ||
# the CONTRIBUTORS file. <[email protected]> | ||
# | ||
# LLNL-CODE-697807. | ||
# All rights reserved. | ||
# | ||
# This file is part of LBANN: Livermore Big Artificial Neural Network | ||
# Toolkit. For details, see http://software.llnl.gov/LBANN or | ||
# https://github.com/LLNL/LBANN. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "Licensee"); you | ||
# may not use this file except in compliance with the License. You may | ||
# obtain a copy of the License at: | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | ||
# implied. See the License for the specific language governing | ||
# permissions and limitations under the license. | ||
# | ||
################################################################################ | ||
""" | ||
Simple data reader that opens one file with one tensor. Used for unit testing. | ||
""" | ||
import numpy as np | ||
|
||
# Lazy-load tensor | ||
tensor = None | ||
|
||
|
||
def lazy_load(): | ||
# This file operates under the assumption that the working directory is set | ||
# to a specific experiment. | ||
global tensor | ||
if tensor is None: | ||
tensor = np.load('data.npy') | ||
assert len(tensor.shape) == 2 | ||
|
||
|
||
def get_sample(idx): | ||
lazy_load() | ||
return tensor[idx] | ||
|
||
|
||
def num_samples(): | ||
lazy_load() | ||
return tensor.shape[0] | ||
|
||
|
||
def sample_dims(): | ||
lazy_load() | ||
return (tensor.shape[1], ) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,322 @@ | ||
################################################################################ | ||
# Copyright (c) 2014-2023, Lawrence Livermore National Security, LLC. | ||
# Produced at the Lawrence Livermore National Laboratory. | ||
# Written by the LBANN Research Team (B. Van Essen, et al.) listed in | ||
# the CONTRIBUTORS file. <[email protected]> | ||
# | ||
# LLNL-CODE-697807. | ||
# All rights reserved. | ||
# | ||
# This file is part of LBANN: Livermore Big Artificial Neural Network | ||
# Toolkit. For details, see http://software.llnl.gov/LBANN or | ||
# https://github.com/LLNL/LBANN. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "Licensee"); you | ||
# may not use this file except in compliance with the License. You may | ||
# obtain a copy of the License at: | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | ||
# implied. See the License for the specific language governing | ||
# permissions and limitations under the license. | ||
# | ||
################################################################################ | ||
import lbann | ||
from dataclasses import dataclass, field | ||
import functools | ||
import inspect | ||
from typing import Any, Callable, List, Optional, Tuple, Union | ||
import numpy as np | ||
import os | ||
import re | ||
import tools | ||
import single_tensor_data_reader | ||
|
||
|
||
def lbann_test(check_gradients=False, **decorator_kwargs): | ||
""" | ||
A decorator that wraps an LBANN-enabled model unit test. | ||
Use it before a function named ``test_*`` to run it automatically in pytest. | ||
The unit test in the wrapped function must return a ``test_util.ModelTester`` | ||
object, which contains all the necessary information to test the model (e.g., | ||
model, input/reference tensors). | ||
The decorator wraps the test with the appropriate setup phase, data reading, | ||
callbacks, and metrics so that the test functions properly. | ||
""" | ||
|
||
def internal_tester(f): | ||
|
||
@functools.wraps(f) | ||
def wrapped(*args, **kwargs): | ||
# Call model constructor | ||
tester = f(*args, **kwargs) | ||
|
||
# Check return value | ||
if not isinstance(tester, ModelTester): | ||
raise ValueError('LBANN test must return a ModelTester object') | ||
if tester.loss is None: | ||
raise ValueError( | ||
'LBANN test did not define a loss function, ' | ||
'use ``ModelTester.set_loss`` or ``set_loss_function``.') | ||
if tester.input_tensor is None: | ||
raise ValueError('LBANN test did not define an input, call ' | ||
'``ModelTester.inputs`` or ``inputs_like``.') | ||
if (tester.reference_tensor is not None | ||
and tester.reference_tensor.shape[0] != | ||
tester.input_tensor.shape[0]): | ||
raise ValueError( | ||
'Input and reference tensors in LBANN test ' | ||
'must match in the first (minibatch) dimension') | ||
full_graph = lbann.traverse_layer_graph(tester.loss) | ||
callbacks = [] | ||
callbacks.append( | ||
lbann.CallbackCheckMetric(metric='test', | ||
lower_bound=0, | ||
upper_bound=tester.tolerance, | ||
error_on_failure=True, | ||
execution_modes='test')) | ||
if check_gradients: | ||
callbacks.append( | ||
lbann.CallbackCheckGradients(error_on_failure=True)) | ||
callbacks.extend(tester.extra_callbacks) | ||
|
||
metrics = [lbann.Metric(tester.loss, name='test')] | ||
metrics.extend(tester.extra_metrics) | ||
model = lbann.Model(epochs=0, | ||
layers=full_graph, | ||
metrics=metrics, | ||
callbacks=callbacks) | ||
|
||
# Get file | ||
file = inspect.getfile(f) | ||
|
||
def setup_func(lbann, weekly): | ||
# Get minibatch size from tensor | ||
mini_batch_size = tester.input_tensor.shape[0] | ||
|
||
# Save combined input/reference data to file | ||
work_dir = _get_work_dir(file) | ||
os.makedirs(work_dir, exist_ok=True) | ||
if tester.reference_tensor is not None: | ||
flat_inp = tester.input_tensor.reshape(mini_batch_size, -1) | ||
flat_ref = tester.reference_tensor.reshape( | ||
mini_batch_size, -1) | ||
np.save(os.path.join(work_dir, 'data.npy'), | ||
np.concatenate((flat_inp, flat_ref), axis=1)) | ||
else: | ||
np.save(os.path.join(work_dir, 'data.npy'), | ||
tester.input_tensor.reshape(mini_batch_size, -1)) | ||
|
||
# Setup data reader | ||
data_reader = lbann.reader_pb2.DataReader() | ||
data_reader.reader.extend([ | ||
tools.create_python_data_reader( | ||
lbann, single_tensor_data_reader.__file__, | ||
'get_sample', 'num_samples', 'sample_dims', 'train'), | ||
tools.create_python_data_reader( | ||
lbann, single_tensor_data_reader.__file__, | ||
'get_sample', 'num_samples', 'sample_dims', 'test') | ||
]) | ||
|
||
trainer = lbann.Trainer(mini_batch_size) | ||
optimizer = lbann.NoOptimizer() | ||
return trainer, model, data_reader, optimizer, None # Don't request any specific number of nodes | ||
|
||
test = tools.create_tests(setup_func, file, **decorator_kwargs)[0] | ||
cluster = kwargs.get('cluster', 'unset') | ||
weekly = kwargs.get('weekly', False) | ||
test(cluster, weekly, False, **decorator_kwargs) | ||
|
||
return wrapped | ||
|
||
return internal_tester | ||
|
||
|
||
@dataclass | ||
class ModelTester: | ||
""" | ||
An object that is constructed within an ``lbann_test``-wrapped unit test. | ||
""" | ||
|
||
# Input tensor (required for test to construct) | ||
input_tensor: Optional[Any] = None | ||
|
||
reference: Optional[lbann.Layer] = None #: Reference LBANN node (optional) | ||
reference_tensor: Optional[ | ||
Any] = None #: Optional reference tensor to compare with | ||
|
||
loss: Optional[lbann.Layer] = None # Optional loss test | ||
tolerance: float = 0.0 #: Tolerance value for loss test | ||
|
||
# Optional additional metrics to use in test | ||
extra_metrics: List[lbann.Metric] = field(default_factory=list) | ||
|
||
# Optional additional callbacks to use in test | ||
extra_callbacks: List[lbann.Callback] = field(default_factory=list) | ||
|
||
def inputs(self, tensor: Any) -> lbann.Layer: | ||
""" | ||
Marks the given tensor as an input of the tested LBANN model, and | ||
returns a matching LBANN Input node (or a Slice/Reshape thereof). | ||
:param tensor: The input NumPy array to use. | ||
:return: An LBANN layer object that will serve as the input. | ||
""" | ||
self.input_tensor = tensor | ||
inp = lbann.Input(data_field='samples') | ||
return slice_to_tensors(inp, tensor) | ||
|
||
def inputs_like(self, *tensors) -> List[lbann.Layer]: | ||
""" | ||
Marks the given tensors as input of the tested LBANN model, and | ||
returns a list of matching LBANN Slice nodes, potentially reshaped to | ||
be like the input tensors. | ||
:param tensors: The input NumPy arrays to use. | ||
:return: A list of LBANN layer objects that will serve as the inputs. | ||
""" | ||
minibatch_size = tensors[0].shape[0] # Assume the first dimension | ||
|
||
# All tensors concatenated on the non-batch dimension | ||
all_tensors_combined = np.concatenate( | ||
[t.reshape(minibatch_size, -1) for t in tensors], axis=1) | ||
|
||
self.input_tensor = all_tensors_combined | ||
x = lbann.Input(data_field='samples') | ||
return slice_to_tensors(x, *tensors) | ||
|
||
def make_reference(self, ref: Any) -> lbann.Input: | ||
""" | ||
Marks the given tensor as a reference output of the tested LBANN model, | ||
and returns a matching LBANN node. | ||
:param ref: The reference NumPy array to use. | ||
:return: An LBANN layer object that will serve as the reference. | ||
""" | ||
# The reference is the second part of the input "samples" | ||
refnode = lbann.Input(data_field='samples') | ||
if self.input_tensor is None: | ||
raise ValueError('Please call ``inputs`` or ``inputs_like`` prior ' | ||
'to calling ``make_reference`` for correctness.') | ||
mbsize = self.input_tensor.shape[0] | ||
|
||
# Obtain reference | ||
refnode = lbann.Reshape(lbann.Identity( | ||
lbann.Slice( | ||
refnode, | ||
slice_points=[ | ||
numel(self.input_tensor) // mbsize, | ||
(numel(self.input_tensor) + numel(ref)) // mbsize | ||
], | ||
)), | ||
dims=ref.shape[1:]) | ||
|
||
# Store reference | ||
self.reference = refnode | ||
self.reference_tensor = ref | ||
return self.reference | ||
|
||
def set_loss_function(self, | ||
func: Callable[[lbann.Layer, lbann.Layer], | ||
lbann.Layer], | ||
output: lbann.Layer, | ||
tolerance=None): | ||
""" | ||
Sets a loss function and the LBANN test output to be measured for the | ||
test. | ||
This assumes that the first argument has two parameters (e.g., | ||
``MeanSquaredError``), where the first argument will be used for the | ||
LBANN output and the second will be used for the reference. | ||
:param func: The loss function. | ||
:param output: The LBANN model output to use. | ||
:param tolerance: Optional tolerance to set for the test. If ``None``, | ||
the default tolerance of ``8*eps*mean(reference)`` | ||
will be used. | ||
""" | ||
return self.set_loss(func(output, self.reference), tolerance) | ||
|
||
def set_loss(self, | ||
loss: lbann.Layer, | ||
tolerance: Optional[float] = None) -> None: | ||
""" | ||
Sets an LBANN node to be measured for the test. | ||
:param loss: The LBANN graph node to use for the test. | ||
:param tolerance: Optional tolerance to set for the test. If ``None``, | ||
the default tolerance of ``8*eps*mean(reference)`` | ||
will be used. | ||
""" | ||
# Set loss node | ||
self.loss = loss | ||
|
||
# Set tolerance | ||
if tolerance is not None: | ||
self.tolerance = tolerance | ||
else: | ||
if self.reference_tensor is None: | ||
raise ValueError( | ||
'Cannot set tolerance on loss function automatically ' | ||
'without a reference tensor. Either set tolerance ' | ||
'explicitly or call ``ModelTester.make_reference``.') | ||
# Default tolerance | ||
self.tolerance = abs(8 * np.mean(self.reference_tensor) * | ||
np.finfo(self.reference_tensor.dtype).eps) | ||
|
||
|
||
def slice_to_tensors(x: lbann.Layer, *tensors) -> List[lbann.Layer]: | ||
""" | ||
Slices an LBANN layer into multiple tensors that match the dimensions of | ||
the given numpy arrays. | ||
""" | ||
slice_points = [0] | ||
offset = 0 | ||
for tensor in tensors: | ||
offset += numel(tensor) // tensor.shape[0] | ||
|
||
slice_points.append(offset) | ||
lslice = lbann.Slice(x, slice_points=slice_points) | ||
return [ | ||
lbann.Reshape(_ensure_bp(t, lbann.Identity(lslice)), dims=t.shape[1:]) | ||
for t in tensors | ||
] | ||
|
||
|
||
def numel(tensor) -> int: | ||
""" | ||
Returns the number of elements in a NumPy array, PyTorch array, or integer. | ||
""" | ||
if isinstance(tensor, int): # Integer | ||
return tensor | ||
elif hasattr(tensor, 'numel'): # PyTorch array | ||
return tensor.numel() | ||
else: # NumPy array | ||
return tensor.size | ||
|
||
|
||
# Mimics the other tester's determination of working directory | ||
def _get_work_dir(test_file: str) -> str: | ||
test_fname = os.path.realpath(test_file) | ||
# Create test name by removing '.py' from file name | ||
test_fname = os.path.splitext(os.path.basename(test_fname))[0] | ||
if not re.match('^test_.', test_fname): | ||
# Make sure test name is prefixed with 'test_' | ||
test_fname = 'test_' + test_fname | ||
return os.path.join(os.path.dirname(test_file), 'experiments', test_fname) | ||
|
||
|
||
# Ensures that backpropagation would be run through the entire model | ||
def _ensure_bp(tensor: Any, node: lbann.Layer) -> lbann.Sum: | ||
# Note: Sum with a weights layer so that gradient checking will | ||
# verify that error signals are correct. | ||
x_weights = lbann.Weights(initializer=lbann.ConstantInitializer(value=0.0)) | ||
return lbann.Sum( | ||
node, | ||
lbann.WeightsLayer( | ||
weights=x_weights, | ||
dims=[numel(tensor) // tensor.shape[0]], | ||
)) |
Oops, something went wrong.