From c1968611ec0b9c0b7ac09ff088745ce27817c3c8 Mon Sep 17 00:00:00 2001 From: Angus Gibson Date: Thu, 19 Mar 2020 10:57:45 +1100 Subject: [PATCH 1/2] Allow indexing to proceed despite unreadable files The unix find utility returns a non-zero error code very eagerly, such as when unreadable files/directories are encountered. We now assume this command succeeds, and pipe through the warnings to the user. Closes #166 --- cosima_cookbook/database.py | 15 +++++++++------ test/test_indexing.py | 18 ++++++++++++++++++ 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/cosima_cookbook/database.py b/cosima_cookbook/database.py index a7388c4..50b3e95 100644 --- a/cosima_cookbook/database.py +++ b/cosima_cookbook/database.py @@ -5,6 +5,7 @@ import re import subprocess from tqdm import tqdm +import warnings import cftime from dask.distributed import as_completed @@ -22,6 +23,8 @@ from . import netcdf_utils from .database_utils import * +logging.captureWarnings(True) + __DB_VERSION__ = 2 __DEFAULT_DB__ = '/g/data/hh5/tmp/cosima/database/access-om2.db' @@ -295,12 +298,12 @@ def index_experiment(experiment_dir, session=None, client=None, update=False): # find all netCDF files in the hierarchy below this directory files = [] - try: - results = subprocess.check_output(['find', experiment_dir, '-name', '*.nc']) - results = [s for s in results.decode('utf-8').split()] - files.extend(results) - except Exception as e: - logging.error('Error occurred while finding output files: %s', e) + proc = subprocess.run(['find', experiment_dir, '-name', '*.nc'], capture_output=True, encoding='utf-8') + if proc.returncode != 0: + warnings.warn('Some files or directories could not be read while finding output files: %s', UserWarning) + + results = [s for s in proc.stdout.split()] + files.extend(results) expt_path = Path(experiment_dir) expt = NCExperiment(experiment=str(expt_path.name), diff --git a/test/test_indexing.py b/test/test_indexing.py index 3659257..a793497 100644 --- a/test/test_indexing.py +++ b/test/test_indexing.py @@ -14,6 +14,24 @@ def session_db(tmpdir): s.close() +@pytest.fixture +def unreadable_dir(tmpdir): + expt_path = tmpdir / "expt_dir" + expt_path.mkdir() + idx_dir = expt_path / "unreadable" + idx_dir.mkdir() + idx_dir.chmod(0o300) + + yield idx_dir + + expt_path.remove(ignore_errors=True) + +def test_unreadable(session_db, unreadable_dir): + session, db = session_db + + with pytest.warns(UserWarning, match="Some files or directories could not be read"): + indexed = database.build_index(str(unreadable_dir), session) + def test_broken(session_db): session, db = session_db indexed = database.build_index('test/data/indexing/broken_file', session) From 097859af348155d9000740005d4845fcecf205ae Mon Sep 17 00:00:00 2001 From: Angus Gibson Date: Thu, 19 Mar 2020 11:09:42 +1100 Subject: [PATCH 2/2] Remove capture_output from subprocess.run This was just a convenience option, but we can safely replace it with piping stdout and stderr. --- cosima_cookbook/database.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cosima_cookbook/database.py b/cosima_cookbook/database.py index 50b3e95..8a46e22 100644 --- a/cosima_cookbook/database.py +++ b/cosima_cookbook/database.py @@ -298,7 +298,9 @@ def index_experiment(experiment_dir, session=None, client=None, update=False): # find all netCDF files in the hierarchy below this directory files = [] - proc = subprocess.run(['find', experiment_dir, '-name', '*.nc'], capture_output=True, encoding='utf-8') + proc = subprocess.run(['find', experiment_dir, '-name', '*.nc'], + stdout=subprocess.PIPE, stderr=subprocess.PIPE, + encoding='utf-8') if proc.returncode != 0: warnings.warn('Some files or directories could not be read while finding output files: %s', UserWarning)