From e26ece957722f001029a59a15ed758c1c15444c1 Mon Sep 17 00:00:00 2001 From: lruizcalico Date: Mon, 20 May 2024 16:12:23 -0700 Subject: [PATCH 1/7] add summary writer --- src/baskerville/trainer.py | 42 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) diff --git a/src/baskerville/trainer.py b/src/baskerville/trainer.py index 5c55f52..44722fe 100644 --- a/src/baskerville/trainer.py +++ b/src/baskerville/trainer.py @@ -91,6 +91,7 @@ def __init__( train_data, eval_data, out_dir: str, + log_dir: str, strategy=None, num_gpu: int = 1, keras_fit: bool = False, @@ -103,6 +104,7 @@ def __init__( if type(self.eval_data) is not list: self.eval_data = [self.eval_data] self.out_dir = out_dir + self.log_dir = log_dir self.strategy = strategy self.num_gpu = num_gpu self.batch_size = self.train_data[0].batch_size @@ -188,7 +190,7 @@ def fit_keras(self, seqnn_model): callbacks = [ early_stop, - tf.keras.callbacks.TensorBoard(self.out_dir), + tf.keras.callbacks.TensorBoard(self.log_dir, histogram_freq=1), tf.keras.callbacks.ModelCheckpoint("%s/model_check.h5" % self.out_dir), save_best, ] @@ -397,6 +399,12 @@ def eval_step1_distr(xd, yd): # training loop first_step = True + # set up summary writer + train_log_dir = self.log_dir + "/train" + valid_log_dir = self.log_dir + "/valid" + train_summary_writer = tf.summary.create_file_writer(train_log_dir) + valid_summary_writer = tf.summary.create_file_writer(valid_log_dir) + for ei in range(epoch_start, self.train_epochs_max): if ei >= self.train_epochs_min and np.min(unimproved) > self.patience: break @@ -429,7 +437,12 @@ def eval_step1_distr(xd, yd): for di in range(self.num_datasets): print(" Data %d" % di, end="") model = seqnn_model.models[di] - + with train_summary_writer.as_default(): + tf.summary.scalar( + "loss", train_loss[di].result().numpy(), step=ei + ) + tf.summary.scalar("r", train_r[di].result().numpy(), step=ei) + tf.summary.scalar("r2", train_r2[di].result().numpy(), step=ei) # print training accuracy print( " - train_loss: %.4f" % train_loss[di].result().numpy(), end="" @@ -450,6 +463,13 @@ def eval_step1_distr(xd, yd): else: eval_step1_distr(x, y) + with valid_summary_writer.as_default(): + tf.summary.scalar( + "loss", valid_loss[di].result().numpy(), step=ei + ) + tf.summary.scalar("r", valid_r[di].result().numpy(), step=ei) + tf.summary.scalar("r2", valid_r2[di].result().numpy(), step=ei) + # print validation accuracy print( " - valid_loss: %.4f" % valid_loss[di].result().numpy(), end="" @@ -587,6 +607,12 @@ def eval_step_distr(xd, yd): valid_best = -np.inf unimproved = 0 + # set up summary writer + train_log_dir = self.log_dir + "/train" + valid_log_dir = self.log_dir + "/valid" + train_summary_writer = tf.summary.create_file_writer(train_log_dir) + valid_summary_writer = tf.summary.create_file_writer(valid_log_dir) + # training loop for ei in range(epoch_start, self.train_epochs_max): if ei >= self.train_epochs_min and unimproved > self.patience: @@ -615,6 +641,12 @@ def eval_step_distr(xd, yd): train_loss_epoch = train_loss.result().numpy() train_r_epoch = train_r.result().numpy() train_r2_epoch = train_r2.result().numpy() + + with train_summary_writer.as_default(): + tf.summary.scalar("loss", train_loss_epoch, step=ei) + tf.summary.scalar("r", train_r_epoch, step=ei) + tf.summary.scalar("r2", train_r2_epoch, step=ei) + print( "Epoch %d - %ds - train_loss: %.4f - train_r: %.4f - train_r2: %.4f" % ( @@ -631,6 +663,12 @@ def eval_step_distr(xd, yd): valid_loss_epoch = valid_loss.result().numpy() valid_r_epoch = valid_r.result().numpy() valid_r2_epoch = valid_r2.result().numpy() + + with valid_summary_writer.as_default(): + tf.summary.scalar("loss", valid_loss_epoch, step=ei) + tf.summary.scalar("r", valid_r_epoch, step=ei) + tf.summary.scalar("r2", valid_r2_epoch, step=ei) + print( " - valid_loss: %.4f - valid_r: %.4f - valid_r2: %.4f" % (valid_loss_epoch, valid_r_epoch, valid_r2_epoch), From 2bf51786e0c77a1d9805505be9a533f488e0861b Mon Sep 17 00:00:00 2001 From: lruizcalico Date: Mon, 20 May 2024 16:29:04 -0700 Subject: [PATCH 2/7] add log dir --- src/baskerville/scripts/hound_train.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/baskerville/scripts/hound_train.py b/src/baskerville/scripts/hound_train.py index e7ec150..92069ac 100755 --- a/src/baskerville/scripts/hound_train.py +++ b/src/baskerville/scripts/hound_train.py @@ -56,6 +56,12 @@ def main(): default="train_out", help="Output directory [Default: %(default)s]", ) + parser.add_argument( + "-l", + "--log_dir", + default=None, + help="Tensorboard log directory [Default: %(default)s]", + ) parser.add_argument( "--restore", default=None, @@ -150,7 +156,7 @@ def main(): # initialize trainer seqnn_trainer = trainer.Trainer( - params_train, train_data, eval_data, args.out_dir + params_train, train_data, eval_data, args.out_dir, args.log_dir ) # compile model @@ -182,6 +188,7 @@ def main(): train_data, eval_data, args.out_dir, + args.log_dir, strategy, params_train["num_gpu"], args.keras_fit, From 85e85c2fa1908865d58cdd14c178c8af46622211 Mon Sep 17 00:00:00 2001 From: lruizcalico Date: Mon, 20 May 2024 16:40:35 -0700 Subject: [PATCH 3/7] change log dir --- tests/test_train.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test_train.py b/tests/test_train.py index 75949b9..7ff1ba4 100755 --- a/tests/test_train.py +++ b/tests/test_train.py @@ -18,6 +18,8 @@ def test_train(clean_data): "src/baskerville/scripts/hound_train.py", "-o", "tests/data/train1", + "-l", + "tests/data/train1/logs", "tests/data/params.json", "tests/data/tiny/hg38", ] @@ -33,6 +35,8 @@ def test_train2(clean_data): "src/baskerville/scripts/hound_train.py", "-o", "tests/data/train2", + "-l", + "tests/data/train2/logs", "tests/data/params.json", "tests/data/tiny/hg38", "tests/data/tiny/mm10", From b36c278b0ceaff42c23b080f229596f4b01fd18e Mon Sep 17 00:00:00 2001 From: lruizcalico Date: Thu, 23 May 2024 19:28:13 -0700 Subject: [PATCH 4/7] add tensorboard writing --- src/baskerville/trainer.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/baskerville/trainer.py b/src/baskerville/trainer.py index 44722fe..a9beeff 100644 --- a/src/baskerville/trainer.py +++ b/src/baskerville/trainer.py @@ -438,6 +438,7 @@ def eval_step1_distr(xd, yd): print(" Data %d" % di, end="") model = seqnn_model.models[di] with train_summary_writer.as_default(): + print("Write into train_summary_writer\n") tf.summary.scalar( "loss", train_loss[di].result().numpy(), step=ei ) @@ -464,6 +465,7 @@ def eval_step1_distr(xd, yd): eval_step1_distr(x, y) with valid_summary_writer.as_default(): + print("Write into valid_summary_writer\n") tf.summary.scalar( "loss", valid_loss[di].result().numpy(), step=ei ) @@ -643,6 +645,7 @@ def eval_step_distr(xd, yd): train_r2_epoch = train_r2.result().numpy() with train_summary_writer.as_default(): + print("Write into train_summary_writer\n") tf.summary.scalar("loss", train_loss_epoch, step=ei) tf.summary.scalar("r", train_r_epoch, step=ei) tf.summary.scalar("r2", train_r2_epoch, step=ei) @@ -665,6 +668,7 @@ def eval_step_distr(xd, yd): valid_r2_epoch = valid_r2.result().numpy() with valid_summary_writer.as_default(): + print("Write into valid_summary_writer\n") tf.summary.scalar("loss", valid_loss_epoch, step=ei) tf.summary.scalar("r", valid_r_epoch, step=ei) tf.summary.scalar("r2", valid_r2_epoch, step=ei) From 79a3973fdc074a4ba77e27a671633663a83a4bb5 Mon Sep 17 00:00:00 2001 From: lruizcalico Date: Tue, 28 May 2024 19:16:59 -0700 Subject: [PATCH 5/7] add default for log dir tensorboard --- src/baskerville/scripts/hound_train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/baskerville/scripts/hound_train.py b/src/baskerville/scripts/hound_train.py index 92069ac..40bf142 100755 --- a/src/baskerville/scripts/hound_train.py +++ b/src/baskerville/scripts/hound_train.py @@ -59,7 +59,7 @@ def main(): parser.add_argument( "-l", "--log_dir", - default=None, + default="log_out", help="Tensorboard log directory [Default: %(default)s]", ) parser.add_argument( From 37da3c62d1be73da95bcd8f336268f77b3e7a011 Mon Sep 17 00:00:00 2001 From: lruizcalico Date: Tue, 28 May 2024 19:31:25 -0700 Subject: [PATCH 6/7] add flush --- src/baskerville/trainer.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/baskerville/trainer.py b/src/baskerville/trainer.py index a9beeff..cc38465 100644 --- a/src/baskerville/trainer.py +++ b/src/baskerville/trainer.py @@ -438,12 +438,14 @@ def eval_step1_distr(xd, yd): print(" Data %d" % di, end="") model = seqnn_model.models[di] with train_summary_writer.as_default(): - print("Write into train_summary_writer\n") + print("\nWrite into train_summary_writer\n") tf.summary.scalar( "loss", train_loss[di].result().numpy(), step=ei ) tf.summary.scalar("r", train_r[di].result().numpy(), step=ei) tf.summary.scalar("r2", train_r2[di].result().numpy(), step=ei) + train_summary_writer.flush() + # print training accuracy print( " - train_loss: %.4f" % train_loss[di].result().numpy(), end="" @@ -465,12 +467,13 @@ def eval_step1_distr(xd, yd): eval_step1_distr(x, y) with valid_summary_writer.as_default(): - print("Write into valid_summary_writer\n") + print("\nWrite into valid_summary_writer\n") tf.summary.scalar( "loss", valid_loss[di].result().numpy(), step=ei ) tf.summary.scalar("r", valid_r[di].result().numpy(), step=ei) tf.summary.scalar("r2", valid_r2[di].result().numpy(), step=ei) + valid_summary_writer.flush() # print validation accuracy print( @@ -645,10 +648,11 @@ def eval_step_distr(xd, yd): train_r2_epoch = train_r2.result().numpy() with train_summary_writer.as_default(): - print("Write into train_summary_writer\n") + print("\nWrite into train_summary_writer\n") tf.summary.scalar("loss", train_loss_epoch, step=ei) tf.summary.scalar("r", train_r_epoch, step=ei) tf.summary.scalar("r2", train_r2_epoch, step=ei) + train_summary_writer.flush() print( "Epoch %d - %ds - train_loss: %.4f - train_r: %.4f - train_r2: %.4f" @@ -668,10 +672,11 @@ def eval_step_distr(xd, yd): valid_r2_epoch = valid_r2.result().numpy() with valid_summary_writer.as_default(): - print("Write into valid_summary_writer\n") + print("\nWrite into valid_summary_writer\n") tf.summary.scalar("loss", valid_loss_epoch, step=ei) tf.summary.scalar("r", valid_r_epoch, step=ei) tf.summary.scalar("r2", valid_r2_epoch, step=ei) + valid_summary_writer.flush() print( " - valid_loss: %.4f - valid_r: %.4f - valid_r2: %.4f" From 17c2522e586268113c64f39f06027ea820f580a2 Mon Sep 17 00:00:00 2001 From: lruizcalico Date: Thu, 6 Jun 2024 19:56:33 -0700 Subject: [PATCH 7/7] take out the print statement --- src/baskerville/trainer.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/baskerville/trainer.py b/src/baskerville/trainer.py index 80e5df5..75e538e 100644 --- a/src/baskerville/trainer.py +++ b/src/baskerville/trainer.py @@ -455,7 +455,6 @@ def eval_step1_distr(xd, yd): print(" Data %d" % di, end="") model = seqnn_model.models[di] with train_summary_writer.as_default(): - print("\nWrite into train_summary_writer\n") tf.summary.scalar( "loss", train_loss[di].result().numpy(), step=ei ) @@ -484,7 +483,6 @@ def eval_step1_distr(xd, yd): eval_step1_distr(x, y) with valid_summary_writer.as_default(): - print("\nWrite into valid_summary_writer\n") tf.summary.scalar( "loss", valid_loss[di].result().numpy(), step=ei ) @@ -665,7 +663,6 @@ def eval_step_distr(xd, yd): train_r2_epoch = train_r2.result().numpy() with train_summary_writer.as_default(): - print("\nWrite into train_summary_writer\n") tf.summary.scalar("loss", train_loss_epoch, step=ei) tf.summary.scalar("r", train_r_epoch, step=ei) tf.summary.scalar("r2", train_r2_epoch, step=ei) @@ -689,7 +686,6 @@ def eval_step_distr(xd, yd): valid_r2_epoch = valid_r2.result().numpy() with valid_summary_writer.as_default(): - print("\nWrite into valid_summary_writer\n") tf.summary.scalar("loss", valid_loss_epoch, step=ei) tf.summary.scalar("r", valid_r_epoch, step=ei) tf.summary.scalar("r2", valid_r2_epoch, step=ei)