Skip to content

Commit

Permalink
Error reporting and wait for preparation task completion (#31)
Browse files Browse the repository at this point in the history
* add error reporting in batch scripts

Report raised errors etc. to Honeybadger.

* wait a small time period for the preparation task to complete

We don't currently wait for the job preparation task to complete successfully, as doing so leaves the node pool scaled up if the preparation task doesn't finish properly.

So we're adding a configurable wait time at the start of the main task to allow the preparation task to finish and make the training / prediction code available.

Without this, the main task can run before the job preparation task has finished and we get errors like "python: can't open file '/mnt/batch/tasks/shared/train_model_finetune_on_catalog.py': [Errno 2] No such file or directory".

* remove h5py dep as it's included via zoobot
  • Loading branch information
camallen authored Mar 29, 2023
1 parent a873651 commit 9e8b4d8
Show file tree
Hide file tree
Showing 4 changed files with 18 additions and 2 deletions.
6 changes: 6 additions & 0 deletions azure/batch/scripts/predict_catalog_with_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,12 @@ def load_model_from_checkpoint(checkpoint_path):
parser.add_argument('--devices', default=1, type=int)
args = parser.parse_args()

# setup the error reporting tool - https://app.honeybadger.io/projects/
honeybadger_api_key = os.getenv('HONEYBADGER_API_KEY')
if honeybadger_api_key:
from honeybadger import honeybadger
honeybadger.configure(api_key=honeybadger_api_key)

logging.info(f'Begin predictions on catalog: {args.catalog_url}')

# load the catalog from a remote JSON url
Expand Down
6 changes: 6 additions & 0 deletions azure/batch/scripts/train_model_finetune_on_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,12 @@
parser.add_argument('--debug', dest='debug', default=False, action='store_true')
args = parser.parse_args()

# setup the error reporting tool - https://app.honeybadger.io/projects/
honeybadger_api_key = os.getenv('HONEYBADGER_API_KEY')
if honeybadger_api_key:
from honeybadger import honeybadger
honeybadger.configure(api_key=honeybadger_api_key)

# load csv file catalog location into a pandas data frame
kade_catalog = pd.read_csv(args.catalog_loc)

Expand Down
2 changes: 1 addition & 1 deletion azure/batch/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,6 @@
install_requires=[
'zoobot[pytorch_cu113] >= 1.0', # the big cheese - bring in the zoobot!
'requests >= 2.28.1', # used to download prediction images from a remote URL
'h5py >= 3.7.0' # used for prediction exports
'honeybadger' # used for error reporting
]
)
6 changes: 5 additions & 1 deletion bajor/batch/train_finetuning.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,11 @@ def create_job_tasks(job_id, task_id=1, run_opts=''):
promote_checkpoint_cmd = f'$AZ_BATCH_NODE_SHARED_DIR/{promote_model_code_path} $AZ_BATCH_NODE_MOUNTS_DIR/$TRAINING_CONTAINER_MOUNT_DIR/$TRAINING_JOB_RESULTS_DIR 2>&1'
# ensure pytorch has the correct kernel cache path (this enables CUDA JIT - https://pytorch.org/docs/stable/notes/cuda.html#just-in-time-compilation)
setup_pytorch_kernel_cache_env_var = 'PYTORCH_KERNEL_CACHE_PATH=$AZ_BATCH_NODE_SHARED_DIR/.cache/torch/kernels'
command = f'/bin/bash -c \"set -ex; {setup_pytorch_kernel_cache_env_var} python {train_cmd}; {promote_checkpoint_cmd}\"'
# add a buffer to wait for the job preparation task to complete as the training task
# code is copied down to an executable location in the job preparation task
preparation_task_wait_time = os.getenv('PREPARATION_WAIT_TIME', '30')
wait_for_preparation_task_completion = f'sleep {preparation_task_wait_time}'
command = f'/bin/bash -c \"set -ex; {wait_for_preparation_task_completion}; {setup_pytorch_kernel_cache_env_var} python {train_cmd}; {promote_checkpoint_cmd}\"'


# test the cuda install (there is a built in script for this - https://github.com/mwalmsley/zoobot/blob/048543f21a82e10e7aa36a44bd90c01acd57422a/zoobot/pytorch/estimators/cuda_check.py)
Expand Down

0 comments on commit 9e8b4d8

Please sign in to comment.