Use model methods for gradient checking
tbennun committed Jan 22, 2024
1 parent 2756920 commit 4615528
Showing 3 changed files with 44 additions and 82 deletions.
5 changes: 3 additions & 2 deletions include/lbann/models/model.hpp
@@ -399,9 +399,10 @@ class model
void reset_epoch_statistics(execution_mode mode);

/** @brief Forward propagation step. */
void forward_prop(execution_mode mode);
void forward_prop(execution_mode mode, bool skip_callbacks = false);
/** @brief Backward propagation step. */
void backward_prop(bool compute_weight_grads_only = true);
void backward_prop(bool compute_weight_grads_only = true,
bool skip_callbacks = false);
/** Evaluate any metrics in the model */
void evaluate_metrics(execution_mode mode, uint64_t current_mini_batch_size);
/** @brief Clear each optimizer's gradient.
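For reference, both new parameters default to `false`, so existing call sites keep their current behavior; a caller that wants a callback-free pass (as the gradient-check callback below now does) would invoke the methods along these lines. This is a minimal sketch assuming `m` is a `model&` and `mode` an `execution_mode`, not code taken from the repository:

```cpp
// Regular training-loop usage: callbacks fire as before.
m.forward_prop(mode);
m.backward_prop();

// Callback-free usage for gradient checking: skip all forward/backward
// callbacks and compute full gradients (not just weight gradients).
m.forward_prop(mode, /*skip_callbacks=*/true);
m.backward_prop(/*compute_weight_grads_only=*/false, /*skip_callbacks=*/true);
```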
81 changes: 15 additions & 66 deletions src/callbacks/check_gradients.cpp
@@ -58,30 +58,12 @@ namespace {
EvalType compute_objective_function(model& m)
{
const auto& c = static_cast<SGDExecutionContext&>(m.get_execution_context());
m.get_activation_reference_counter().clear();

// Forward prop, skipping input layers

if (m.is_subgraph_parallelism_enabled()) {
for (auto&& l : m.get_layers()) {
if (dynamic_cast<input_layer<DataType>*>(l) == nullptr &&
l->get_run_layer_in_subgraph()) {
l->forward_prop();
}
}
}
else // sub-graph parallelism not enabled
{
for (auto&& l : m.get_layers()) {
if (dynamic_cast<input_layer<DataType>*>(l) == nullptr) {
l->forward_prop();
}
}
}
const auto mode = c.get_execution_mode();

// Get objective function value
m.forward_prop(mode, true);
auto&& obj = m.get_objective_function();
const auto mode = c.get_execution_mode();

const auto mini_batch_size = m.get_current_mini_batch_size();
obj->start_evaluation(mode, mini_batch_size);
return obj->finish_evaluation(mode, mini_batch_size);
@@ -134,6 +116,7 @@ struct CheckWeightsFunctor : DefaultErrorReporter
// Get weights matrix and gradient
auto const& weights_matrix = dtw.get_values_sharded();
auto const& gradient = dtw.get_optimizer()->get_gradient_sharded();

// Iterate through weights matrix entries
for (El::Int col = 0; col < weights_matrix.Width(); ++col) {
for (El::Int row = 0; row < weights_matrix.Height(); ++row) {
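Each `(row, col)` entry visited by these loops is checked by perturbing that single weight and re-evaluating the objective. The finite-difference side of that comparison, reduced to a standalone sketch with a hypothetical `objective_at(delta)` helper (evaluate the objective with the entry perturbed by `delta`, then restore it), would look roughly like this:

```cpp
#include <functional>

// Four-point central difference of the objective with respect to one weight
// entry, matching the scheme quoted in the comment further down:
//   f'(x) ~ ( -f(x+2h) + 8 f(x+h) - 8 f(x-h) + f(x-2h) ) / (12h)
// `objective_at` is a hypothetical helper, not an LBANN API.
double numerical_gradient(const std::function<double(double)>& objective_at,
                          double h)
{
  return (-objective_at(2 * h) + 8 * objective_at(h) - 8 * objective_at(-h) +
          objective_at(-2 * h)) /
         (12 * h);
}

// The callback then flags an entry when the analytical and numerical values
// differ by more than the expected finite-difference error:
//   std::fabs(numerical - analytical) > expected_error
```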
@@ -275,40 +258,24 @@ void check_gradients::do_check_gradients(model& m) const
for (auto&& met : m.get_metrics()) {
met->reset_statistics(mode);
}
for (auto&& w : m.get_weights()) {
auto&& opt = w->get_optimizer();
if (opt != nullptr) {
opt->clear_gradient();
}
}
m.get_activation_reference_counter().clear();
m.clear_gradients();

// Load data in input layers
data_coordinator& dc = get_trainer().get_data_coordinator();
dc.fetch_active_batch_synchronous(mode);
El::Int current_mini_batch_size = dc.get_current_mini_batch_size(mode);
m.set_current_mini_batch_size(current_mini_batch_size);

// checking subgraph parallelism
if (m.is_subgraph_parallelism_enabled()) {
for (auto&& l : m.get_layers()) {
if (dynamic_cast<input_layer<DataType>*>(l) != nullptr &&
l->get_run_layer_in_subgraph()) {
l->forward_prop();
}
}
}
else {
for (auto&& l : m.get_layers()) {
if (dynamic_cast<input_layer<DataType>*>(l) != nullptr) {
l->forward_prop();
}
}
}

// Compute objective function
const EvalType objective = compute_objective_function(m);

// Compute gradients
m.get_objective_function()->differentiate();
m.get_objective_function()->compute_weight_regularization();

// Compute analytical gradients through model
m.backward_prop(false, /*skip_callbacks=*/true);

// Choose finite difference step
// Note: Consider a central difference scheme:
// f'(x) ~ ( - f(x+2h) + 8 f(x+h) - 8 f(x-h) + f(x-2h) ) / 12h
@@ -323,31 +290,14 @@ void check_gradients::do_check_gradients(model& m) const
// epsilon based on the minimum step size of the float data type
const EvalType epsilon =
std::pow(std::numeric_limits<DataType>::epsilon(), 0.9);
const EvalType step_size =
const EvalType step_size = std::max(
std::numeric_limits<EvalType>::epsilon(),
(m_step_size > EvalType{0} ? m_step_size
: std::fabs(objective) * El::Sqrt(epsilon));
: std::fabs(objective) * El::Sqrt(epsilon)));
EvalType expected_error =
std::pow((epsilon * objective / step_size + std::pow(step_size, 4) / 18),
0.9);

// Compute gradients
m.get_objective_function()->differentiate();
m.get_objective_function()->compute_weight_regularization();

// checking subgraph parallelism
if (m.is_subgraph_parallelism_enabled()) {
for (El::Int i = layers.size() - 1; i >= 0; --i) {
if (layers[i]->get_run_layer_in_subgraph()) {
layers[i]->back_prop();
}
}
}
else {
for (El::Int i = layers.size() - 1; i >= 0; --i) {
layers[i]->back_prop();
}
}

// Print objective function value
if (comm.am_world_master()) {
std::cout << std::string(64, '-') << "\n"
@@ -383,7 +333,6 @@ void check_gradients::do_check_gradients(model& m) const
}

// Clean up
// TODO: Why
auto&& dataset = dc.get_dataset(mode);
dataset.set_initial_position();
m.get_objective_function()->reset_statistics(mode);
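Putting this file's changes together: the callback now drives the check entirely through `model` methods instead of iterating over layers itself. A condensed sketch of the step-size and error-bound selection is shown below, using plain `double` in place of `EvalType`/`DataType` (assumed here to be `float`); it mirrors the `std::max` floor added in this commit and is illustrative only.

```cpp
#include <algorithm>
#include <cmath>
#include <limits>

// Sketch only: choose the finite-difference step and the error expected from
// the 4-point central difference, given the current objective value and an
// optional user-requested step (<= 0 means "pick automatically").
struct StepChoice {
  double step_size;
  double expected_error;
};

StepChoice choose_step(double objective, double requested_step)
{
  // "epsilon" is based on the minimum step size of the (float) data type.
  const double epsilon = std::pow(std::numeric_limits<float>::epsilon(), 0.9);

  // Never let the step collapse below machine precision -- the std::max
  // floor introduced by this commit.
  const double step_size =
    std::max(std::numeric_limits<double>::epsilon(),
             requested_step > 0.0 ? requested_step
                                  : std::fabs(objective) * std::sqrt(epsilon));

  // Expected error of the central-difference estimate:
  //   ( epsilon * f / h + h^4 / 18 )^0.9
  const double expected_error =
    std::pow(epsilon * objective / step_size + std::pow(step_size, 4) / 18.0,
             0.9);
  return {step_size, expected_error};
}
```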
40 changes: 26 additions & 14 deletions src/models/model.cpp
@@ -1555,10 +1555,11 @@ void model::clear_gradients()
}
}

void model::forward_prop(execution_mode mode)
void model::forward_prop(execution_mode mode, bool skip_callbacks)
{
LBANN_CALIPER_MARK_FUNCTION;
do_model_forward_prop_begin_cbs(mode);
if (!skip_callbacks)
do_model_forward_prop_begin_cbs(mode);

// Clear activations in reference counter
m_activation_refcnt.clear();
@@ -1568,25 +1569,30 @@ void model::forward_prop(execution_mode mode)

if (this->is_subgraph_parallelism_enabled()) {
if (l.get_run_layer_in_subgraph() || l.get_name() == "layer1") {
do_layer_forward_prop_begin_cbs(mode, &l);
if (!skip_callbacks)
do_layer_forward_prop_begin_cbs(mode, &l);
l.forward_prop();
do_layer_forward_prop_end_cbs(mode, &l);
if (!skip_callbacks)
do_layer_forward_prop_end_cbs(mode, &l);
}
else {
// To Do: Fix last batch problem in sub-graph parallelism
// experimental code to fix last batch problem in subgraph parallelism
}
}
else {
do_layer_forward_prop_begin_cbs(mode, &l);
if (!skip_callbacks)
do_layer_forward_prop_begin_cbs(mode, &l);
l.forward_prop();
do_layer_forward_prop_end_cbs(mode, &l);
if (!skip_callbacks)
do_layer_forward_prop_end_cbs(mode, &l);
}
}
do_model_forward_prop_end_cbs(mode);
if (!skip_callbacks)
do_model_forward_prop_end_cbs(mode);
}

void model::backward_prop(bool compute_weight_grads_only)
void model::backward_prop(bool compute_weight_grads_only, bool skip_callbacks)
{
LBANN_CALIPER_MARK_FUNCTION;

@@ -1596,7 +1602,8 @@ void model::backward_prop(bool compute_weight_grads_only)
bool const envvar_disable_layers =
!arg_parser.get<bool>(LBANN_OPTION_NO_BACKPROP_DISABLE);

do_model_backward_prop_begin_cbs();
if (!skip_callbacks)
do_model_backward_prop_begin_cbs();

for (El::Int i = get_num_layers() - 1; i >= 0; --i) {

@@ -1626,21 +1633,25 @@

if (this->is_subgraph_parallelism_enabled()) {
if (l.get_run_layer_in_subgraph()) {
do_layer_backward_prop_begin_cbs(&l);
if (!skip_callbacks)
do_layer_backward_prop_begin_cbs(&l);
if (enable_layer)
l.back_prop();
do_layer_backward_prop_end_cbs(&l);
if (!skip_callbacks)
do_layer_backward_prop_end_cbs(&l);
}
else {
// To Do: Fix last batch problem in sub-graph parallelism
// experimental code to fix last batch problem in subgraph parallelism
}
}
else {
do_layer_backward_prop_begin_cbs(&l);
if (!skip_callbacks)
do_layer_backward_prop_begin_cbs(&l);
if (enable_layer)
l.back_prop();
do_layer_backward_prop_end_cbs(&l);
if (!skip_callbacks)
do_layer_backward_prop_end_cbs(&l);
}

// Terminate early if all gradients have been computed
@@ -1665,7 +1676,8 @@
}
}

do_model_backward_prop_end_cbs();
if (!skip_callbacks)
do_model_backward_prop_end_cbs();
}

void model::update_weights()
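The pattern used throughout both methods is the same: every callback hook is wrapped in an `if (!skip_callbacks)` guard while the actual layer work runs unconditionally. A generic, self-contained illustration of that guard follows; it deliberately does not use LBANN's callback classes, which are more elaborate in practice.

```cpp
#include <functional>
#include <iostream>
#include <vector>

// Generic illustration of the skip_callbacks guard; `Hook` stands in for
// LBANN's callback invocations.
using Hook = std::function<void()>;

void run_step(const std::vector<Hook>& begin_hooks,
              const std::vector<Hook>& end_hooks,
              const Hook& work,
              bool skip_callbacks = false)
{
  if (!skip_callbacks)
    for (const auto& hook : begin_hooks)
      hook();
  work();  // the layer's forward_prop()/back_prop() always runs
  if (!skip_callbacks)
    for (const auto& hook : end_hooks)
      hook();
}

int main()
{
  const Hook log = [] { std::cout << "callback fired\n"; };
  const Hook work = [] { std::cout << "layer work\n"; };
  run_step({log}, {log}, work);                          // hooks fire
  run_step({log}, {log}, work, /*skip_callbacks=*/true); // silent pass
  return 0;
}
```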
