diff --git a/benchmarks/pytorch/run_benchmarks.sh b/benchmarks/pytorch/run_benchmarks.sh
index b3224bd7..89e55e5b 100755
--- a/benchmarks/pytorch/run_benchmarks.sh
+++ b/benchmarks/pytorch/run_benchmarks.sh
@@ -16,7 +16,7 @@ SEED=$RANDOM
 # effnet, greyscale and color
 sbatch --job-name=evo_py_gr_eff_224_$SEED --export=ARCHITECTURE=efficientnet_b0,BATCH_SIZE=256,RESIZE_AFTER_CROP=224,DATASET=gz_evo,MIXED_PRECISION_STRING=--mixed-precision,GPUS=2,SEED=$SEED $TRAIN_JOB
 # sbatch --job-name=evo_py_gr_eff_300_$SEED --export=ARCHITECTURE=efficientnet_b0,BATCH_SIZE=256,RESIZE_AFTER_CROP=300,DATASET=gz_evo,MIXED_PRECISION_STRING=--mixed-precision,GPUS=2,SEED=$SEED $TRAIN_JOB
-# sbatch --job-name=evo_py_co_eff_224_$SEED --export=ARCHITECTURE=efficientnet_b0,BATCH_SIZE=256,RESIZE_AFTER_CROP=224,DATASET=gz_evo,COLOR_STRING=--color,MIXED_PRECISION_STRING=--mixed-precision,GPUS=2,SEED=$SEED $TRAIN_JOB
+sbatch --job-name=evo_py_co_eff_224_$SEED --export=ARCHITECTURE=efficientnet_b0,BATCH_SIZE=256,RESIZE_AFTER_CROP=224,DATASET=gz_evo,COLOR_STRING=--color,MIXED_PRECISION_STRING=--mixed-precision,GPUS=2,SEED=$SEED $TRAIN_JOB
 # sbatch --job-name=evo_py_co_eff_300_$SEED --export=ARCHITECTURE=efficientnet_b0,BATCH_SIZE=256,RESIZE_AFTER_CROP=300,DATASET=gz_evo,COLOR_STRING=--color,MIXED_PRECISION_STRING=--mixed-precision,GPUS=2,SEED=$SEED $TRAIN_JOB
 # and resnet18
 # sbatch --job-name=evo_py_gr_res18_224_$SEED --export=ARCHITECTURE=resnet18,BATCH_SIZE=256,RESIZE_AFTER_CROP=224,DATASET=gz_evo,MIXED_PRECISION_STRING=--mixed-precision,GPUS=2,SEED=$SEED $TRAIN_JOB
@@ -26,11 +26,15 @@ sbatch --job-name=evo_py_gr_eff_224_$SEED --export=ARCHITECTURE=efficientnet_b0,
 # sbatch --job-name=evo_py_gr_res50_300_$SEED --export=ARCHITECTURE=resnet50,BATCH_SIZE=256,RESIZE_AFTER_CROP=300,DATASET=gz_evo,MIXED_PRECISION_STRING=--mixed-precision,GPUS=2,SEED=$SEED $TRAIN_JOB
 # and with max-vit tiny because hey transformers are cool
 # smaller batch size due to memory
-sbatch --job-name=evo_py_gr_vittiny_224_$SEED --export=ARCHITECTURE=maxvit_tiny_224,BATCH_SIZE=128,RESIZE_AFTER_CROP=224,DATASET=gz_evo,MIXED_PRECISION_STRING=--mixed-precision,GPUS=2,SEED=$SEED $TRAIN_JOB
+# sbatch --job-name=evo_py_gr_vittiny_224_$SEED --export=ARCHITECTURE=maxvit_tiny_224,BATCH_SIZE=128,RESIZE_AFTER_CROP=224,DATASET=gz_evo,MIXED_PRECISION_STRING=--mixed-precision,GPUS=2,SEED=$SEED $TRAIN_JOB
+sbatch --job-name=evo_py_co_vittiny_224_$SEED --export=ARCHITECTURE=maxvit_tiny_224,BATCH_SIZE=128,RESIZE_AFTER_CROP=224,DATASET=gz_evo,COLOR_STRING=--color,MIXED_PRECISION_STRING=--mixed-precision,GPUS=2,SEED=$SEED $TRAIN_JOB
 # and max-vit small (works badly)
 # sbatch --job-name=evo_py_gr_vitsmall_224_$SEED --export=ARCHITECTURE=maxvit_small_224,BATCH_SIZE=64,RESIZE_AFTER_CROP=224,DATASET=gz_evo,MIXED_PRECISION_STRING=--mixed-precision,GPUS=2,SEED=$SEED $TRAIN_JOB
 # and convnext (works badly)
 # sbatch --job-name=evo_py_gr_$SEED --export=ARCHITECTURE=convnext_nano,BATCH_SIZE=256,RESIZE_AFTER_CROP=224,DATASET=gz_evo,MIXED_PRECISION_STRING=--mixed-precision,GPUS=2,SEED=$SEED $TRAIN_JOB
+# and vit
+# sbatch --job-name=evo_py_gr_vittinyp16_224_$SEED --export=ARCHITECTURE=vit_tiny_patch16_224,BATCH_SIZE=128,RESIZE_AFTER_CROP=224,DATASET=gz_evo,MIXED_PRECISION_STRING=--mixed-precision,GPUS=2,SEED=$SEED $TRAIN_JOB
+
 # and in color with no mixed precision, for specific project
 # sbatch --job-name=evo_py_co_res50_224_fullprec_$SEED --export=ARCHITECTURE=resnet50,BATCH_SIZE=256,RESIZE_AFTER_CROP=224,DATASET=gz_evo,COLOR_STRING=--color,GPUS=2,SEED=$SEED $TRAIN_JOB
 
diff --git a/setup.py b/setup.py
index ff6aa010..5be99374 100755
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@
 
 setuptools.setup(
     name="zoobot",
-    version="1.0.1",
+    version="1.0.2",
     author="Mike Walmsley",
     author_email="walmsleymk1@gmail.com",
     description="Galaxy morphology classifiers",
@@ -105,6 +105,6 @@
         # for saving metrics to weights&biases (cloud service, free within limits)
         'wandb',
         'setuptools==59.5.0',  # wandb logger incompatibility
-        'galaxy-datasets==0.0.11'  # for dataset loading in both TF and Torch (renamed from pytorch-galaxy-datasets)
+        'galaxy-datasets==0.0.12'  # for dataset loading in both TF and Torch (renamed from pytorch-galaxy-datasets)
     ]
 )
diff --git a/zoobot/pytorch/examples/representations/get_representations.py b/zoobot/pytorch/examples/representations/get_representations.py
index 6d50a022..dc154485 100644
--- a/zoobot/pytorch/examples/representations/get_representations.py
+++ b/zoobot/pytorch/examples/representations/get_representations.py
@@ -6,7 +6,7 @@
 from zoobot.pytorch.training import finetune, representations
 from zoobot.pytorch.estimators import define_model
 from zoobot.pytorch.predictions import predict_on_catalog
-from zoobot.shared import load_predictions
+from zoobot.shared import load_predictions, schemas
 
 
 def main(catalog, checkpoint_loc, save_dir):
@@ -16,9 +16,12 @@ def main(catalog, checkpoint_loc, save_dir):
     if not os.path.exists(save_dir):
         os.mkdir(save_dir)
 
-    # can load from either ZoobotTree (if trained from scratch) or FinetuneableZoobotTree (if finetuned)
-    encoder = finetune.FinetuneableZoobotTree.load_from_checkpoint(checkpoint_loc).encoder
-    # encoder = define_model.ZoobotTree.load_from_checkpoint(checkpoint_loc).encoder
+    # can load from either ZoobotTree checkpoint (if trained from scratch)
+    encoder = define_model.ZoobotTree.load_from_checkpoint(checkpoint_loc).encoder
+    # or FinetuneableZoobotTree (if finetuned)
+    # currently, FinetuneableZoobotTree checkpoints should be loaded as ZoobotTree with the args below
+    # this is a bit awkward and I'm working on a clearer method - but it does work.
+    # encoder = define_model.ZoobotTree.load_from_checkpoint(checkpoint_loc, output_dim=TODO, question_index_groups=[]).encoder
 
     # convert to simple pytorch lightning model
     model = representations.ZoobotEncoder(encoder=encoder, pyramid=False)
diff --git a/zoobot/pytorch/training/finetune.py b/zoobot/pytorch/training/finetune.py
index c6cf6c6e..a1fe80b5 100644
--- a/zoobot/pytorch/training/finetune.py
+++ b/zoobot/pytorch/training/finetune.py
@@ -185,7 +185,11 @@ def validation_step(self, batch, batch_idx, dataloader_idx=0):
     def test_step(self, batch, batch_idx, dataloader_idx=0):
         return self.make_step(batch)
 
-    def on_train_batch_end(self, outputs, batch, batch_idx: int):
+    def on_train_batch_end(self, outputs, batch, batch_idx: int, dataloader_idx=0):
+        # v2 docs currently do not show dataloader_idx as a train argument, so unclear if this value will be updated properly
+        # arg is shown for val/test equivalents
+        # currently does nothing in Zoobot so inconsequential
+        # https://lightning.ai/docs/pytorch/stable/common/lightning_module.html#on-train-batch-end
         self.train_loss_metric(outputs['loss'])
         self.log(
             "finetuning/train_loss",
@@ -195,7 +199,7 @@ def on_train_batch_end(self, outputs, batch, batch_idx: int):
             on_epoch=True
         )
 
-    def on_validation_batch_end(self, outputs, batch, batch_idx: int):
+    def on_validation_batch_end(self, outputs, batch, batch_idx: int, dataloader_idx=0):
         self.val_loss_metric(outputs['loss'])
         self.log(
             "finetuning/val_loss",
@@ -208,7 +212,7 @@ def on_validation_batch_end(self, outputs, batch, batch_idx: int):
         if self.visualize_images:
             self.upload_images_to_wandb(outputs, batch, batch_idx)
 
-    def on_test_batch_end(self, outputs, batch, batch_idx: int):
+    def on_test_batch_end(self, outputs, batch, batch_idx: int, dataloader_idx=0):
         self.test_loss_metric(outputs['loss'])
         self.log(
             "finetuning/test_loss",
@@ -468,7 +472,7 @@ def get_trainer(
     # Initialise pytorch lightning trainer
     trainer = pl.Trainer(
         logger=logger,
-        callbacks=[checkpoint_callback],  # early_stopping_callback
+        callbacks=[checkpoint_callback, early_stopping_callback],
         max_epochs=max_epochs,
         accelerator=accelerator,
         devices=devices,
diff --git a/zoobot/pytorch/training/train_with_pytorch_lightning.py b/zoobot/pytorch/training/train_with_pytorch_lightning.py
index 45aeed66..5690e0e1 100644
--- a/zoobot/pytorch/training/train_with_pytorch_lightning.py
+++ b/zoobot/pytorch/training/train_with_pytorch_lightning.py
@@ -2,6 +2,7 @@
 import os
 from typing import Tuple
 
+import torch
 import pytorch_lightning as pl
 from pytorch_lightning.strategies.ddp import DDPStrategy
 from pytorch_lightning.callbacks import ModelCheckpoint
@@ -141,11 +142,15 @@ def train_default_zoobot_from_scratch(
         accelerator = 'cpu'
         devices = 'auto'  # all
 
-    precision = 32
+
     if mixed_precision:
         logging.info(
             'Training with automatic mixed precision. Will reduce memory footprint but may cause training instability for e.g. resnet')
-        precision = 16
+        precision = '16-mixed'
+        torch.set_float32_matmul_precision('medium')
+    else:
+        precision = '32'
+        torch.set_float32_matmul_precision('high')
 
     assert num_workers > 0
 
@@ -161,7 +166,8 @@ def train_default_zoobot_from_scratch(
         You may be spawning more dataloader workers than you have cpus, causing bottlenecks.
         Suggest reducing num_workers."""
         )
-
+
+
     if catalog is not None:
         assert train_catalog is None
         assert val_catalog is None
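Note on the trainer changes above (illustrative, not part of the diff): Lightning 2.x expects precision strings such as '16-mixed' instead of the bare 16 used previously, and torch.set_float32_matmul_precision trades a little float32 matmul accuracy for tensor-core throughput. The sketch below shows the same pattern in isolation; the mixed_precision variable, the ModelCheckpoint/EarlyStopping configuration, and the patience value are assumptions for illustration, not the repository's exact get_trainer settings (only the "finetuning/val_loss" metric name is taken from the diff).

    import torch
    import pytorch_lightning as pl
    from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint

    # stands in for the --mixed-precision flag handled upstream (assumption)
    mixed_precision = True

    if mixed_precision:
        precision = '16-mixed'  # Lightning 2.x string form of automatic mixed precision
        torch.set_float32_matmul_precision('medium')  # allow faster, slightly less precise float32 matmuls
    else:
        precision = '32'
        torch.set_float32_matmul_precision('high')

    # monitor key matches the metric logged in finetune.py; patience is an illustrative assumption
    checkpoint_callback = ModelCheckpoint(monitor='finetuning/val_loss')
    early_stopping_callback = EarlyStopping(monitor='finetuning/val_loss', patience=10)

    trainer = pl.Trainer(
        precision=precision,
        callbacks=[checkpoint_callback, early_stopping_callback],
        accelerator='auto',
        devices='auto',
        max_epochs=1000,
    )

With this shape, enabling or disabling mixed precision only changes the precision string and the matmul setting, while checkpointing and early stopping both watch the validation loss that the finetuning module logs.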