Skip to content

Commit

Permalink
Add an LBANN Python unit test wrapper and utilities (#2264)
Browse files Browse the repository at this point in the history
* Add an LBANN Python unit test wrapper and utilities

* Add capability for extra metrics and callbacks

* Add a simple test that uses the new interface

* Relax bounds of NASNet further

* Improve support for multidimensional tensors

* Add single-tensor test data reader

* Improve readability of pytest assertions

* Fix weighted sum operation and add test

* Make weighted sum in-place-capable, ensure backprop runs all the way through in testing
  • Loading branch information
tbennun authored May 18, 2023
1 parent cffea66 commit c9d643b
Show file tree
Hide file tree
Showing 7 changed files with 490 additions and 7 deletions.
57 changes: 57 additions & 0 deletions ci_test/common_python/single_tensor_data_reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
################################################################################
# Copyright (c) 2014-2023, Lawrence Livermore National Security, LLC.
# Produced at the Lawrence Livermore National Laboratory.
# Written by the LBANN Research Team (B. Van Essen, et al.) listed in
# the CONTRIBUTORS file. <lbann-dev@llnl.gov>
#
# LLNL-CODE-697807.
# All rights reserved.
#
# This file is part of LBANN: Livermore Big Artificial Neural Network
# Toolkit. For details, see http://software.llnl.gov/LBANN or
# https://github.com/LLNL/LBANN.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You may
# obtain a copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the license.
#
################################################################################
"""
Simple data reader that opens one file with one tensor. Used for unit testing.
"""
import numpy as np

# Lazily-loaded tensor shared by every accessor function in this module
tensor = None


def lazy_load():
    """
    Loads ``data.npy`` into the module-level ``tensor`` on first use.

    Subsequent calls are no-ops once a tensor has been loaded successfully.

    :raises ValueError: If the array stored in ``data.npy`` is not
                        two-dimensional.
    """
    # This file operates under the assumption that the working directory is set
    # to a specific experiment.
    global tensor
    if tensor is not None:
        return

    loaded = np.load('data.npy')
    # Validate before publishing to the global so that a malformed file is
    # never cached (and the check is not stripped when running with ``-O``).
    if loaded.ndim != 2:
        raise ValueError(
            'data.npy must contain a two-dimensional array, '
            f'got shape {loaded.shape}')
    tensor = loaded


def get_sample(idx):
    """
    Returns the entry of the loaded tensor at index ``idx`` along its first
    dimension, loading the tensor from disk first if necessary.
    """
    lazy_load()
    sample = tensor[idx]
    return sample


def num_samples():
    """
    Returns the size of the loaded tensor's first dimension, loading the
    tensor from disk first if necessary.
    """
    lazy_load()
    sample_count = tensor.shape[0]
    return sample_count


def sample_dims():
    """
    Returns a one-element tuple holding the size of the loaded tensor's
    second dimension, loading the tensor from disk first if necessary.
    """
    lazy_load()
    sample_width = tensor.shape[1]
    return (sample_width, )
322 changes: 322 additions & 0 deletions ci_test/common_python/test_util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,322 @@
################################################################################
# Copyright (c) 2014-2023, Lawrence Livermore National Security, LLC.
# Produced at the Lawrence Livermore National Laboratory.
# Written by the LBANN Research Team (B. Van Essen, et al.) listed in
# the CONTRIBUTORS file. <lbann-dev@llnl.gov>
#
# LLNL-CODE-697807.
# All rights reserved.
#
# This file is part of LBANN: Livermore Big Artificial Neural Network
# Toolkit. For details, see http://software.llnl.gov/LBANN or
# https://github.com/LLNL/LBANN.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You may
# obtain a copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the license.
#
################################################################################
import lbann
from dataclasses import dataclass, field
import functools
import inspect
from typing import Any, Callable, List, Optional, Tuple, Union
import numpy as np
import os
import re
import tools
import single_tensor_data_reader


def lbann_test(check_gradients=False, **decorator_kwargs):
    """
    Decorator factory that turns a model-building function into an LBANN unit
    test.

    Apply it to a function named ``test_*`` so that pytest collects it. The
    decorated function must return a ``test_util.ModelTester`` describing the
    loss node to measure and the input (and optionally reference) tensors.
    The wrapper validates that object, attaches a metric-checking callback
    plus any extra metrics and callbacks, writes the tensors to ``data.npy``
    in the test's working directory, and runs the first test produced by
    ``tools.create_tests``.

    :param check_gradients: If True, an ``lbann.CallbackCheckGradients``
                            callback is added to the model.
    :param decorator_kwargs: Forwarded to ``tools.create_tests`` and to the
                             test function it returns.
    :return: A decorator to apply to the test function.
    """

    def internal_tester(f):

        @functools.wraps(f)
        def wrapped(*args, **kwargs):
            # Run the user's test body to obtain the model description
            tester = f(*args, **kwargs)

            # Validate what the test body handed back
            if not isinstance(tester, ModelTester):
                raise ValueError('LBANN test must return a ModelTester object')
            if tester.loss is None:
                raise ValueError(
                    'LBANN test did not define a loss function, '
                    'use ``ModelTester.set_loss`` or ``set_loss_function``.')
            if tester.input_tensor is None:
                raise ValueError('LBANN test did not define an input, call '
                                 '``ModelTester.inputs`` or ``inputs_like``.')
            if tester.reference_tensor is not None:
                ref_batch = tester.reference_tensor.shape[0]
                if ref_batch != tester.input_tensor.shape[0]:
                    raise ValueError(
                        'Input and reference tensors in LBANN test '
                        'must match in the first (minibatch) dimension')

            # Collect every layer reachable from the loss node
            full_graph = lbann.traverse_layer_graph(tester.loss)

            # The 'test' metric must land within [0, tolerance]
            callbacks = [
                lbann.CallbackCheckMetric(metric='test',
                                          lower_bound=0,
                                          upper_bound=tester.tolerance,
                                          error_on_failure=True,
                                          execution_modes='test')
            ]
            if check_gradients:
                callbacks.append(
                    lbann.CallbackCheckGradients(error_on_failure=True))
            callbacks += tester.extra_callbacks

            metrics = [
                lbann.Metric(tester.loss, name='test'),
                *tester.extra_metrics,
            ]
            model = lbann.Model(epochs=0,
                                layers=full_graph,
                                metrics=metrics,
                                callbacks=callbacks)

            # Source file of the wrapped test; determines the work directory
            test_file = inspect.getfile(f)

            def setup_func(lbann, weekly):
                # The minibatch spans the whole first dimension of the input
                batch_size = tester.input_tensor.shape[0]

                # Flatten each sample and store input (followed by the
                # reference, if any) side by side in a single file
                work_dir = _get_work_dir(test_file)
                os.makedirs(work_dir, exist_ok=True)
                data_path = os.path.join(work_dir, 'data.npy')
                if tester.reference_tensor is None:
                    payload = tester.input_tensor.reshape(batch_size, -1)
                else:
                    payload = np.concatenate(
                        (tester.input_tensor.reshape(batch_size, -1),
                         tester.reference_tensor.reshape(batch_size, -1)),
                        axis=1)
                np.save(data_path, payload)

                # One Python data reader per role, both backed by the
                # single-tensor reader module
                data_reader = lbann.reader_pb2.DataReader()
                readers = [
                    tools.create_python_data_reader(
                        lbann, single_tensor_data_reader.__file__,
                        'get_sample', 'num_samples', 'sample_dims', role)
                    for role in ('train', 'test')
                ]
                data_reader.reader.extend(readers)

                trainer = lbann.Trainer(batch_size)
                optimizer = lbann.NoOptimizer()
                # Last entry is None: don't request any specific number of
                # nodes
                return trainer, model, data_reader, optimizer, None

            run_test = tools.create_tests(setup_func, test_file,
                                          **decorator_kwargs)[0]
            cluster = kwargs.get('cluster', 'unset')
            weekly = kwargs.get('weekly', False)
            run_test(cluster, weekly, False, **decorator_kwargs)

        return wrapped

    return internal_tester


@dataclass
class ModelTester:
    """
    An object that is constructed within an ``lbann_test``-wrapped unit test.

    The test body uses it to register input tensors (``inputs`` /
    ``inputs_like``), an optional reference tensor (``make_reference``) and
    the loss node to be measured (``set_loss`` / ``set_loss_function``), and
    then returns it to the ``lbann_test`` wrapper.
    """

    #: Input tensor (required for test to construct)
    input_tensor: Optional[Any] = None

    reference: Optional[lbann.Layer] = None  #: Reference LBANN node (optional)
    reference_tensor: Optional[
        Any] = None  #: Optional reference tensor to compare with

    loss: Optional[lbann.Layer] = None  #: LBANN node measured by the test
    tolerance: float = 0.0  #: Tolerance value for loss test

    #: Optional additional metrics to use in test
    extra_metrics: List[lbann.Metric] = field(default_factory=list)

    #: Optional additional callbacks to use in test
    extra_callbacks: List[lbann.Callback] = field(default_factory=list)

    def inputs(self, tensor: Any) -> List[lbann.Layer]:
        """
        Marks the given tensor as an input of the tested LBANN model, and
        returns the matching LBANN node(s) sliced from an Input layer.

        :param tensor: The input NumPy array to use.
        :return: The result of ``slice_to_tensors`` for this single tensor,
                 i.e., a one-element list holding the LBANN layer that will
                 serve as the input.
        """
        self.input_tensor = tensor
        inp = lbann.Input(data_field='samples')
        return slice_to_tensors(inp, tensor)

    def inputs_like(self, *tensors) -> List[lbann.Layer]:
        """
        Marks the given tensors as input of the tested LBANN model, and
        returns a list of matching LBANN Slice nodes, potentially reshaped to
        be like the input tensors.

        :param tensors: The input NumPy arrays to use. The first dimension of
                        the first tensor is taken as the minibatch size.
        :return: A list of LBANN layer objects that will serve as the inputs.
        :raises ValueError: If no tensors are given.
        """
        if not tensors:
            raise ValueError(
                '``inputs_like`` requires at least one input tensor.')
        minibatch_size = tensors[0].shape[0]  # Assume the first dimension

        # All tensors concatenated on the non-batch dimension
        all_tensors_combined = np.concatenate(
            [t.reshape(minibatch_size, -1) for t in tensors], axis=1)

        self.input_tensor = all_tensors_combined
        x = lbann.Input(data_field='samples')
        return slice_to_tensors(x, *tensors)

    def make_reference(self, ref: Any) -> lbann.Layer:
        """
        Marks the given tensor as a reference output of the tested LBANN model,
        and returns a matching LBANN node.

        Must be called after ``inputs`` or ``inputs_like``, since the slice
        offsets of the reference depend on the size of the input tensor.

        :param ref: The reference NumPy array to use.
        :return: An LBANN layer object that will serve as the reference.
        :raises ValueError: If no input tensor has been registered yet.
        """
        # Validate first so that no LBANN layer is created on the error path
        if self.input_tensor is None:
            raise ValueError('Please call ``inputs`` or ``inputs_like`` prior '
                             'to calling ``make_reference`` for correctness.')
        mbsize = self.input_tensor.shape[0]

        # The reference is the second part of the input "samples"
        refnode = lbann.Input(data_field='samples')

        # Obtain reference: slice the per-sample range that follows the
        # input values and reshape it to the reference's sample shape
        refnode = lbann.Reshape(lbann.Identity(
            lbann.Slice(
                refnode,
                slice_points=[
                    numel(self.input_tensor) // mbsize,
                    (numel(self.input_tensor) + numel(ref)) // mbsize
                ],
            )),
                                dims=ref.shape[1:])

        # Store reference
        self.reference = refnode
        self.reference_tensor = ref
        return self.reference

    def set_loss_function(self,
                          func: Callable[[lbann.Layer, lbann.Layer],
                                         lbann.Layer],
                          output: lbann.Layer,
                          tolerance: Optional[float] = None) -> None:
        """
        Sets a loss function and the LBANN test output to be measured for the
        test.

        This assumes that ``func`` takes two arguments (e.g.,
        ``MeanSquaredError``), where the first argument will be used for the
        LBANN output and the second will be used for the reference. The
        reference passed is ``self.reference``, which is ``None`` unless
        ``make_reference`` was called beforehand.

        :param func: The loss function.
        :param output: The LBANN model output to use.
        :param tolerance: Optional tolerance to set for the test. If ``None``,
                          the default tolerance of ``8*eps*mean(reference)``
                          will be used.
        :raises ValueError: If ``tolerance`` is ``None`` and no reference
                            tensor has been set.
        """
        return self.set_loss(func(output, self.reference), tolerance)

    def set_loss(self,
                 loss: lbann.Layer,
                 tolerance: Optional[float] = None) -> None:
        """
        Sets an LBANN node to be measured for the test.

        :param loss: The LBANN graph node to use for the test.
        :param tolerance: Optional tolerance to set for the test. If ``None``,
                          the default tolerance of ``8*eps*mean(reference)``
                          will be used.
        :raises ValueError: If ``tolerance`` is ``None`` and no reference
                            tensor has been set.
        """
        # Set loss node
        self.loss = loss

        # Set tolerance
        if tolerance is not None:
            self.tolerance = tolerance
        else:
            if self.reference_tensor is None:
                raise ValueError(
                    'Cannot set tolerance on loss function automatically '
                    'without a reference tensor. Either set tolerance '
                    'explicitly or call ``ModelTester.make_reference``.')
            # Default tolerance, scaled by the machine epsilon of the
            # reference tensor's dtype
            self.tolerance = abs(8 * np.mean(self.reference_tensor) *
                                 np.finfo(self.reference_tensor.dtype).eps)


def slice_to_tensors(x: lbann.Layer, *tensors) -> List[lbann.Layer]:
    """
    Splits an LBANN layer into one output per given array, each reshaped to
    that array's per-sample dimensions (every dimension after the first).

    :param x: The LBANN layer to slice.
    :param tensors: Arrays whose sizes and shapes determine the slices.
    :return: A list with one reshaped LBANN layer per given array.
    """
    # Cumulative per-sample element counts mark the slice boundaries
    slice_points = [0]
    for current in tensors:
        per_sample = numel(current) // current.shape[0]
        slice_points.append(slice_points[-1] + per_sample)

    sliced = lbann.Slice(x, slice_points=slice_points)

    # Each Identity child of the Slice layer feeds one reshaped output
    outputs = []
    for current in tensors:
        piece = _ensure_bp(current, lbann.Identity(sliced))
        outputs.append(lbann.Reshape(piece, dims=current.shape[1:]))
    return outputs


def numel(tensor) -> int:
    """
    Returns the element count of a NumPy array or PyTorch array; an integer
    argument is returned unchanged.
    """
    # Integer: the value itself is the count
    if isinstance(tensor, int):
        return tensor
    # NumPy array: no ``numel`` method, use the ``size`` attribute
    if not hasattr(tensor, 'numel'):
        return tensor.size
    # PyTorch array
    return tensor.numel()


# Mimics the other tester's determination of working directory
def _get_work_dir(test_file: str) -> str:
test_fname = os.path.realpath(test_file)
# Create test name by removing '.py' from file name
test_fname = os.path.splitext(os.path.basename(test_fname))[0]
if not re.match('^test_.', test_fname):
# Make sure test name is prefixed with 'test_'
test_fname = 'test_' + test_fname
return os.path.join(os.path.dirname(test_file), 'experiments', test_fname)


# Ensures that backpropagation would be run through the entire model
def _ensure_bp(tensor: Any, node: lbann.Layer) -> lbann.Sum:
    """
    Returns the sum of ``node`` and a zero-initialized weights layer whose
    size is the per-sample element count of ``tensor``.

    :param tensor: Array used to determine the per-sample element count.
    :param node: The LBANN layer to add the weights layer to.
    :return: The LBANN Sum layer combining both.
    """
    # Note: Sum with a weights layer so that gradient checking will
    # verify that error signals are correct.
    zero_init = lbann.ConstantInitializer(value=0.0)
    zero_weights = lbann.Weights(initializer=zero_init)
    per_sample_size = numel(tensor) // tensor.shape[0]
    zero_layer = lbann.WeightsLayer(weights=zero_weights,
                                    dims=[per_sample_size])
    return lbann.Sum(node, zero_layer)
Loading

0 comments on commit c9d643b

Please sign in to comment.