Skip to content

Commit

Permalink
Use a cascading search to determine the time dimension (#145)
Browse files Browse the repository at this point in the history
* Use a cascading search to determine the time dimension

The assumption that a file's record dimension is always its time dimension
was not particularly robust against real data. We instead check, in order:
- standard_name == 'time'
- axis == 'T'
- record dimension
- name.lower() == 'time'

Fixes #144.

* Check dimension in dataset variables

A dimension without an associated variable causes a KeyError when
we're looking for dimensions that contain a particular attribute, when
this should just count as not finding the attribute.
  • Loading branch information
angus-g authored Jun 26, 2020
1 parent e594bde commit 02c1ce1
Show file tree
Hide file tree
Showing 7 changed files with 50 additions and 1 deletion.
2 changes: 1 addition & 1 deletion cosima_cookbook/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ def update_timeinfo(f, ncfile):

with netCDF4.Dataset(f, 'r') as ds:
# find the time dimension: CF attributes first, then the record dimension, then a dimension named 'time'
time_dim = netcdf_utils.find_record_dimension(ds)
time_dim = netcdf_utils.find_time_dimension(ds)
if time_dim is None:
return None

Expand Down
39 changes: 39 additions & 0 deletions cosima_cookbook/netcdf_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,42 @@ def find_record_dimension(d):
return dim

return None

def find_dimension_with_attribute(d, attribute, value):
    """Return the first dimension whose same-named variable has attribute == value.

    Dimensions are visited in the iteration order of ``d.dimensions``. A
    dimension with no entry of the same name in ``d.variables`` is skipped,
    because there is no variable to read the attribute from. A variable
    that lacks the attribute is treated as having the value None.

    Returns the matching dimension name, or None when nothing matches.
    """

    # only dimensions backed by a variable of the same name can carry attributes
    backed = (name for name in d.dimensions if name in d.variables)

    return next(
        (
            name
            for name in backed
            if getattr(d.variables[name], attribute, None) == value
        ),
        None,
    )

def find_time_dimension(d):
    """Return the name of the time dimension in a netCDF4 Dataset, or None.

    The lookup is heuristic and guided by the CF conventions. Checks are
    tried in order and the first hit wins:

    1. a dimension whose variable has ``standard_name == 'time'``
    2. a dimension whose variable has ``axis == 'T'``
    3. whatever ``find_record_dimension`` reports
    4. a dimension whose name is 'time', ignoring case
    """

    # attribute-based checks, strongest signal first
    for attribute, value in (('standard_name', 'time'), ('axis', 'T')):
        found = find_dimension_with_attribute(d, attribute, value)
        if found is not None:
            return found

    found = find_record_dimension(d)
    if found is not None:
        return found

    # The CF conventions also suggest the units attribute, but time_bounds
    # may have the same units, and a false positive here could be very
    # confusing, so the last resort is the dimension name only.
    return next((name for name in d.dimensions if name.lower() == 'time'), None)
Binary file added test/data/indexing/time/t1.nc
Binary file not shown.
Binary file added test/data/indexing/time/t2.nc
Binary file not shown.
Binary file added test/data/indexing/time/t3.nc
Binary file not shown.
Binary file added test/data/indexing/time/t4.nc
Binary file not shown.
10 changes: 10 additions & 0 deletions test/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,16 @@ def test_broken_metadata(session_db):

assert(indexed == 1)

def test_time_dimension(session_db):
    """Index the time test data and check every file has its time range set.

    Indexes the files under test/data/indexing/time, expects four indexed
    files, and expects none of them to have a NULL time_start or time_end.
    """
    session, db = session_db
    database.build_index('test/data/indexing/time', session)

    q = session.query(database.NCFile.time_start, database.NCFile.time_end)
    assert q.count() == 4  # should pick up 4 files

    # The NULL test must be built as a SQL expression with .is_(None).
    # A Python `column is None` is evaluated immediately to False, which
    # makes the filter match no rows and the assertion below pass vacuously.
    # (Assumes NCFile.time_start/time_end are SQLAlchemy mapped columns.)
    q = q.filter(
        database.NCFile.time_start.is_(None) | database.NCFile.time_end.is_(None)
    )
    assert q.count() == 0  # but all of them should have times populated

def test_distributed(client, session_db):
session, db = session_db
database.build_index('test/data/indexing/broken_file', session, client)
Expand Down

0 comments on commit 02c1ce1

Please sign in to comment.