Error reporting and wait for preparation task completion (#31)

* Add error reporting in the batch scripts: report raised errors etc. to Honeybadger.
* Wait a small, configurable time period for the job preparation task to complete. We don't currently wait for the job preparation task to complete successfully, as it leaves the node pool scaled if it doesn't finish properly, so we're adding a configurable wait time at the start of the main task to allow the preparation task to finish and make the training / prediction code available. Without this, the main task can run before the job preparation task has finished and we get errors like "python: can't open file '/mnt/batch/tasks/shared/train_model_finetune_on_catalog.py': [Errno 2] No such file or directory".
* Remove the h5py dependency, as it's included via zoobot.
zooniverse · Mar 29, 2023 · 9e8b4d8 · 9e8b4d8
1 parent a873651
commit 9e8b4d8
Show file tree

Hide file tree

Showing 4 changed files with 18 additions and 2 deletions.
diff --git a/azure/batch/scripts/predict_catalog_with_model.py b/azure/batch/scripts/predict_catalog_with_model.py
@@ -33,6 +33,12 @@ def load_model_from_checkpoint(checkpoint_path):
     parser.add_argument('--devices', default=1, type=int)
     args = parser.parse_args()
 
+    # setup the error reporting tool - https://app.honeybadger.io/projects/
+    honeybadger_api_key = os.getenv('HONEYBADGER_API_KEY')
+    if honeybadger_api_key:
+        from honeybadger import honeybadger
+        honeybadger.configure(api_key=honeybadger_api_key)
+
     logging.info(f'Begin predictions on catalog: {args.catalog_url}')
 
     # load the catalog from a remote JSON url

diff --git a/azure/batch/scripts/train_model_finetune_on_catalog.py b/azure/batch/scripts/train_model_finetune_on_catalog.py
@@ -38,6 +38,12 @@
     parser.add_argument('--debug', dest='debug', default=False, action='store_true')
     args = parser.parse_args()
 
+    # setup the error reporting tool - https://app.honeybadger.io/projects/
+    honeybadger_api_key = os.getenv('HONEYBADGER_API_KEY')
+    if honeybadger_api_key:
+        from honeybadger import honeybadger
+        honeybadger.configure(api_key=honeybadger_api_key)
+
     # load csv file catalog location into a pandas data frame
     kade_catalog = pd.read_csv(args.catalog_loc)
 

diff --git a/azure/batch/setup.py b/azure/batch/setup.py
@@ -20,6 +20,6 @@
     install_requires=[
         'zoobot[pytorch_cu113] >= 1.0', # the big cheese - bring in the zoobot!
         'requests >= 2.28.1', # used to download prediction images from a remote URL
-        'h5py >= 3.7.0' # used for prediction exports
+        'honeybadger' # used for error reporting
     ]
 )
diff --git a/bajor/batch/train_finetuning.py b/bajor/batch/train_finetuning.py
@@ -209,7 +209,11 @@ def create_job_tasks(job_id, task_id=1, run_opts=''):
     promote_checkpoint_cmd = f'$AZ_BATCH_NODE_SHARED_DIR/{promote_model_code_path} $AZ_BATCH_NODE_MOUNTS_DIR/$TRAINING_CONTAINER_MOUNT_DIR/$TRAINING_JOB_RESULTS_DIR 2>&1'
     # ensure pytorch has the correct kernel cache path (this enables CUDA JIT - https://pytorch.org/docs/stable/notes/cuda.html#just-in-time-compilation)
     setup_pytorch_kernel_cache_env_var = 'PYTORCH_KERNEL_CACHE_PATH=$AZ_BATCH_NODE_SHARED_DIR/.cache/torch/kernels'
-    command = f'/bin/bash -c \"set -ex; {setup_pytorch_kernel_cache_env_var} python {train_cmd}; {promote_checkpoint_cmd}\"'
+    # add a buffer to wait for the job preparation task to complete as the training task
+    # code is copied down to an executable location in the job preparation task
+    preparation_task_wait_time = os.getenv('PREPARATION_WAIT_TIME', '30')
+    wait_for_preparation_task_completion = f'sleep {preparation_task_wait_time}'
+    command = f'/bin/bash -c \"set -ex; {wait_for_preparation_task_completion}; {setup_pytorch_kernel_cache_env_var} python {train_cmd}; {promote_checkpoint_cmd}\"'
 
 
     # test the cuda install (there is a built in script for this - https://github.com/mwalmsley/zoobot/blob/048543f21a82e10e7aa36a44bd90c01acd57422a/zoobot/pytorch/estimators/cuda_check.py)