diff --git a/CHANGELOG.md b/CHANGELOG.md index 4b20d54..a12d8d0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +# 0.8.18 - 2024-01-21 + +- Bug fix: fixed string variable fill values replaced by the string "nan". + # 0.8.17 - 2023-07-05 - New feature: support `copy_from_alt` list of alternative variable names to use if `name` variable diff --git a/docs/conf.py b/docs/conf.py index 2188b6d..671ab5e 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -54,18 +54,18 @@ master_doc = "index" # General information about the project. -project = u"ncagg" -copyright = u"2018, Stefan Codrescu" -author = u"Stefan Codrescu" +project = "ncagg" +copyright = "2018, Stefan Codrescu" +author = "Stefan Codrescu" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. -version = u"" +version = "" # The full version, including alpha/beta/rc tags. -release = u"" +release = "" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. @@ -146,7 +146,7 @@ # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - (master_doc, "ncagg.tex", u"ncagg Documentation", u"Author", "manual"), + (master_doc, "ncagg.tex", "ncagg Documentation", "Author", "manual"), ] @@ -154,7 +154,7 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). -man_pages = [(master_doc, "ncagg", u"ncagg Documentation", [author], 1)] +man_pages = [(master_doc, "ncagg", "ncagg Documentation", [author], 1)] # -- Options for Texinfo output ------------------------------------------- @@ -166,7 +166,7 @@ ( master_doc, "ncagg", - u"ncagg Documentation", + "ncagg Documentation", author, "ncagg", "One line description of project.", diff --git a/ncagg/aggregator.py b/ncagg/aggregator.py index 2484537..78b37be 100644 --- a/ncagg/aggregator.py +++ b/ncagg/aggregator.py @@ -98,7 +98,7 @@ def generate_aggregation_list(config, files_to_aggregate): def cast_bound(bound): # type: (float | datetime) -> float - """ Cast a bound to a numerical type for use. Will not be working directly with datetime objects. """ + """Cast a bound to a numerical type for use. Will not be working directly with datetime objects.""" if isinstance(bound, datetime): units = config.vars[primary_index_by["index_by"]]["attributes"]["units"] return nc.date2num(bound, units) @@ -111,9 +111,8 @@ def cast_bound(bound): ) # Can continue into the correction loop as long as we have at least cadence_hz, or min and max. - if ( - cadence_hz is None - or (first_along_primary is None and last_along_primary is None) + if cadence_hz is None or ( + first_along_primary is None and last_along_primary is None ): return preliminary @@ -271,7 +270,6 @@ def evaluate_aggregation_list(config, aggregation_list, to_fullpath, callback=No vars_unlim.append(v) with nc.Dataset(to_fullpath, "r+") as nc_out: # type: nc.Dataset - # the vars once don't depend on an unlimited dim so only need to be copied once. Find the first # InputFileNode to copy from so we don't get fill values. Otherwise, if none exists, which shouldn't # happen, but oh well, use a fill node. @@ -364,8 +362,8 @@ def evaluate_aggregation_list(config, aggregation_list, to_fullpath, callback=No def initialize_aggregation_file(config, fullpath): # type: (Config, str) -> None """ - Based on the configuration, initialize a file in which to write the aggregated output. - + Based on the configuration, initialize a file in which to write the aggregated output. + In other words, resurrect a netcdf file that would result in config. :param config: Aggregation configuration @@ -386,7 +384,7 @@ def initialize_aggregation_file(config, fullpath): fill_value = var_type.type(fill_value) zlib = True if np.issubdtype(var_type, str): - # NetCDF started raising RuntimeError when passed compression args on + # NetCDF started raising RuntimeError when passed compression args on # vlen datatypes. Detect vlens (str for now) and avoid compression. # Ref: https://github.com/Unidata/netcdf4-python/issues/1205 zlib = False diff --git a/ncagg/aggrelist.py b/ncagg/aggrelist.py index 598fd70..82b1bf0 100644 --- a/ncagg/aggrelist.py +++ b/ncagg/aggrelist.py @@ -19,14 +19,15 @@ def get_fill_for(variable): :return: A fill value for variable. """ datatype = np.dtype(variable["datatype"]) - try: + + if np.issubdtype(datatype, np.floating): return datatype.type(np.nan) - except ValueError: - # for an integer type, there is no concept of nan, this will raise - # ValueError: cannot convert float NaN to integer, so use -9999 instead - # main reason for this complexity is to handle exis integer datatypes - nc_default_fill = datatype.type(nc.default_fillvals[datatype.str[1:]]) - return datatype.type(variable["attributes"].get("_FillValue", nc_default_fill)) + + if np.issubdtype(datatype, str): + return "" + + nc_default_fill = datatype.type(nc.default_fillvals[datatype.str[1:]]) + return datatype.type(variable["attributes"].get("_FillValue", nc_default_fill)) class VariableNotFoundException(Exception): @@ -154,7 +155,6 @@ def data_for(self, var): result_shape = [] for index, dim in enumerate(var["dimensions"]): - dim_size = self.config.dims[dim]["size"] dim_unlim = dim_size is None # save dim_unlim because we'll set the size @@ -228,7 +228,6 @@ def get_coverage(self): if d["index_by"] is not None and not d["flatten"] ] for udim in index_by: - # cadence_hz may be None in which case we'll simply look for fill or invalid values in the index_by # variable. At the moment, this is hard coded to seek 0's since our main use case is index_by time # and we don't expect our spacecraft to teleport back to the epoch value :) @@ -332,14 +331,14 @@ def cache_dim_sizes(self): self.dim_sizes[dim["name"]] = self.get_file_internal_aggregation_size(dim) def get_first_of_index_by(self, udim): - """ Get the first value along udim. """ + """Get the first value along udim.""" first_slice = self.file_internal_aggregation_list[udim["name"]][0] assert isinstance(first_slice, slice), "Must be a slice!" assert isinstance(first_slice.start, int), "Must be an int!" return self.get_index_of_index_by(first_slice.start, udim).item(0) def get_last_of_index_by(self, udim): - """ Get the last value along udim. """ + """Get the last value along udim.""" last_slice = self.file_internal_aggregation_list[udim["name"]][-1] assert isinstance(last_slice, slice), "Must be a slice!" assert isinstance(last_slice.start, int), "Must be an int!" @@ -567,22 +566,20 @@ def data_for_netcdf(self, var, nc_in): ) fill_value = get_fill_for(var) + nc_var = nc_in.variables[name] dims = [ - self.config.dims[d] - for d in var["dimensions"] - if d in nc_in.variables[name].dimensions + self.config.dims[d] for d in var["dimensions"] if d in nc_var.dimensions ] # step 1: get the sorted data dim_slices = tuple( [self.sort_unlim.get(d["name"], slice(None)) for d in dims] ) or slice(None) - nc_in.variables[name].set_auto_mask(False) + nc_var.set_auto_mask(False) prelim_data = nc_in.variables[name][dim_slices] - if hasattr(nc_in.variables[name], "_FillValue"): - where_to_fill = prelim_data == nc_in.variables[name]._FillValue + if hasattr(nc_var, "_FillValue"): + where_to_fill = prelim_data == nc_var._FillValue prelim_data[where_to_fill] = fill_value - # prelim_data = np.ma.filled(nc_in.variables[name][dim_slices], fill_value=fill_value) if len(dims) == 0: # if this is just a scalar value, return diff --git a/ncagg/cli.py b/ncagg/cli.py index 9a17b1f..3af08eb 100644 --- a/ncagg/cli.py +++ b/ncagg/cli.py @@ -50,12 +50,16 @@ def parse_bound_arg(b): if len(b_split) == 2: b_split[0] = parse_time(b_split[0][1:]) # friendly cli: for the second bound, ignore whether it starts with a T or not. - b_split[1] = parse_time(b_split[1][1:] if b_split[1].startswith("T") else b_split[1]) + b_split[1] = parse_time( + b_split[1][1:] if b_split[1].startswith("T") else b_split[1] + ) elif len(b_split) == 1: # if there's only one, infer dayfile, monthfile, or yearfile based on the length if len(b_split[0][1:]) == 4: # infer -bTYYYY:TYYYY+1 b_split[0] = parse_time(b_split[0][1:]) - b_split.append(datetime(b_split[0].year + 1, 1, 1) - timedelta(microseconds=1)) + b_split.append( + datetime(b_split[0].year + 1, 1, 1) - timedelta(microseconds=1) + ) elif len(b_split[0][1:]) == 6: # infer -bTYYYYMM:TYYYYMM+1 b_split[0] = parse_time(b_split[0][1:]) # datetime month must be in 1..12, so if month+1 == 13, increment year @@ -66,16 +70,24 @@ def parse_bound_arg(b): next_month = 1 else: next_year = b_split[0].year - b_split.append(datetime(next_year, next_month, 1) - timedelta(microseconds=1)) + b_split.append( + datetime(next_year, next_month, 1) - timedelta(microseconds=1) + ) elif len(b_split[0][1:]) == 8: # infer -bTYYYYMMDD:TYYYYMMDD+1 b_split[0] = parse_time(b_split[0][1:]) - b_split.append(b_split[0] + timedelta(days=1) - timedelta(microseconds=1)) + b_split.append( + b_split[0] + timedelta(days=1) - timedelta(microseconds=1) + ) elif len(b_split[0][1:]) == 10: # infer -bTYYYYMMDDHH:TYYYYMMDDHH+1 b_split[0] = parse_time(b_split[0][1:]) - b_split.append(b_split[0] + timedelta(hours=1) - timedelta(microseconds=1)) + b_split.append( + b_split[0] + timedelta(hours=1) - timedelta(microseconds=1) + ) elif len(b_split[0][1:]) == 12: # infer -bTYYYYMMDDHHMM:TYYYYMMDDHHMM+1 b_split[0] = parse_time(b_split[0][1:]) - b_split.append(b_split[0] + timedelta(minutes=1) - timedelta(microseconds=1)) + b_split.append( + b_split[0] + timedelta(minutes=1) - timedelta(microseconds=1) + ) else: raise click.BadParameter("") else: @@ -131,7 +143,9 @@ def get_src_from_stdin(ctx, param, value): ) elif not value: # otherwise, nothing found - raise click.BadParameter("No files provided as argument or via stdin.", ctx=ctx, param=param) + raise click.BadParameter( + "No files provided as argument or via stdin.", ctx=ctx, param=param + ) return value @@ -170,7 +184,7 @@ def get_src_from_stdin(ctx, param, value): ) @click.option("-t", help="Specify a configuration template", type=click.File("r")) def cli(dst, src, u=None, c=None, b=None, l="WARNING", t=None): - """ Aggregate NetCDF files. """ + """Aggregate NetCDF files.""" logging.getLogger().setLevel(l) if t is not None: # if given a template... config = Config.from_dict(json.load(t)) diff --git a/ncagg/config.py b/ncagg/config.py index bf38eeb..a80b3e9 100644 --- a/ncagg/config.py +++ b/ncagg/config.py @@ -184,7 +184,7 @@ def __setitem__(self, key, value): super(ConfigDict, self).__setitem__(value["name"], value) def update(self, *args, **kwargs): - """ Update the ConfigDict, using __setitem__ in order to validate every entry. """ + """Update the ConfigDict, using __setitem__ in order to validate every entry.""" for k, v in dict(*args, **kwargs).items(): self[k] = v diff --git a/setup.py b/setup.py index ec7080b..5af61c4 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ setup( name="ncagg", - version="0.8.17", + version="0.8.18", description="Utility for aggregation of NetCDF data.", author="Stefan Codrescu", author_email="stefan.codrescu@noaa.gov", @@ -27,5 +27,7 @@ "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", ], - project_urls={"Documentation": "http://ncagg.readthedocs.io/en/latest/",}, + project_urls={ + "Documentation": "http://ncagg.readthedocs.io/en/latest/", + }, ) diff --git a/test/exis/EXISL1bSFEU_remapping/type1_test.py b/test/exis/EXISL1bSFEU_remapping/type1_test.py index 0b5ed6c..a4bcd28 100644 --- a/test/exis/EXISL1bSFEU_remapping/type1_test.py +++ b/test/exis/EXISL1bSFEU_remapping/type1_test.py @@ -24,9 +24,9 @@ def tearDown(self): os.remove(self.nc_out_filename) def test_basic(self): - """ Ok, so the files in data/type1/ don't have an unlimited dimension, report_number should be + """Ok, so the files in data/type1/ don't have an unlimited dimension, report_number should be unlimited so I've made report_nubmer unlimited in the config template type1_config.json. - Let's see if we can aggregate to it. """ + Let's see if we can aggregate to it.""" aggregation_list = generate_aggregation_list(self.config, self.files) self.assertEqual(len(aggregation_list), 3) evaluate_aggregation_list(self.config, aggregation_list, self.nc_out_filename) diff --git a/test/exis/EXISL1bSFEU_remapping/type3_test.py b/test/exis/EXISL1bSFEU_remapping/type3_test.py index aa15a1c..0748623 100644 --- a/test/exis/EXISL1bSFEU_remapping/type3_test.py +++ b/test/exis/EXISL1bSFEU_remapping/type3_test.py @@ -24,9 +24,9 @@ def tearDown(self): os.remove(self.nc_out_filename) def test_using_product_bounds(self): - """ Ok, so the files in data/type3/ don't have an unlimited report_number dimension. + """Ok, so the files in data/type3/ don't have an unlimited report_number dimension. Also, euvsCQualityFlags is missing a report_number dimension, can we create an explicit - dependence on this? """ + dependence on this?""" start_time = datetime(2017, 8, 25, 0, 3, 30) # 2017-08-25T00:03:29.6Z end_time = datetime(2017, 8, 25, 0, 5, 0) # 2017-08-25T00:04:29.6Z diff --git a/test/exis/EXISL1bSFXR_copyfromalt/test_EXISL1bSFXR_copyfromalt.py b/test/exis/EXISL1bSFXR_copyfromalt/test_EXISL1bSFXR_copyfromalt.py index 5094ac4..8498241 100644 --- a/test/exis/EXISL1bSFXR_copyfromalt/test_EXISL1bSFXR_copyfromalt.py +++ b/test/exis/EXISL1bSFXR_copyfromalt/test_EXISL1bSFXR_copyfromalt.py @@ -48,4 +48,3 @@ def test_exis_with_config(self): data = np.ma.filled(nc_out.variables["SPP_roll_angle"][:], np.nan) self.assertFalse(np.any(np.isnan(data))) self.assertEqual(len(data), 2) # one record in each file - diff --git a/test/exis/xrsfl2flsum_simple/test_xrsfl2flsum.py b/test/exis/xrsfl2flsum_simple/test_xrsfl2flsum.py index 1ee9a4e..e8dd502 100644 --- a/test/exis/xrsfl2flsum_simple/test_xrsfl2flsum.py +++ b/test/exis/xrsfl2flsum_simple/test_xrsfl2flsum.py @@ -5,6 +5,7 @@ import glob import os import netCDF4 as nc +import numpy as np class TestAggregate(unittest.TestCase): @@ -19,17 +20,18 @@ def tearDown(self): os.remove(self.file) def test_main(self): - """ - Nothing too fancy here, but just making sure that aggregating - a variable of strings works properly. - - Previous to version 0.8.5 we had trouble with vlen datatypes. - """ agg_list = generate_aggregation_list(self.config, self.files) evaluate_aggregation_list(self.config, agg_list, self.file) with nc.Dataset(self.file) as nc_in: - status = nc_in.variables["status"] - # there should be no fill values... - # before ncagg v0.8.5 vlen types like string incorrectly aggregated to all fill values. - self.assertFalse(any(status[:] == status._FillValue)) + # there should be no fill values in the status variable + # before ncagg v0.8.5 vlen types like string incorrectly aggregated to fills + status = nc_in.variables["status"][:] + self.assertFalse(np.ma.is_masked(status)) + + # prior to ncagg v0.8.18, there was a bug that converted string fills + # to the string "nan" + flare_class = nc_in.variables["flare_class"][:] + self.assertFalse("nan" in flare_class) + self.assertTrue("" in flare_class) + self.assertTrue("B1.0" in flare_class) diff --git a/test/exis/xrsfl2flx1s_nochop/test_xrsfl2flx1s.py b/test/exis/xrsfl2flx1s_nochop/test_xrsfl2flx1s.py index a3b0d15..794d929 100644 --- a/test/exis/xrsfl2flx1s_nochop/test_xrsfl2flx1s.py +++ b/test/exis/xrsfl2flx1s_nochop/test_xrsfl2flx1s.py @@ -28,7 +28,7 @@ def test_main(self): record because it was less than 0.5*expected_time_step from the start boundary. Instead, that first data point should be taken, even if it is exactly on the - boundary, since the boundary isn't a realy piece of data. + boundary, since the boundary isn't a realy piece of data. So, the essential piece of this test is to make sure that there remain 86400 records before and after aggregation. diff --git a/test/generic/test_attribute_strategies.py b/test/generic/test_attribute_strategies.py index 0a88ce0..77e5b1c 100644 --- a/test/generic/test_attribute_strategies.py +++ b/test/generic/test_attribute_strategies.py @@ -103,7 +103,6 @@ def test_strat_first_filename(self): self.assertIn(".nc", finalize(self.test_nc)) def test_strat_static(self): - # set the config for a static "license" attribute... value = "Hello world" self.handler_kwargs["config"].attrs["license"] = { diff --git a/test/generic/test_config_objects.py b/test/generic/test_config_objects.py index bd62e6b..d9a0e0b 100644 --- a/test/generic/test_config_objects.py +++ b/test/generic/test_config_objects.py @@ -5,8 +5,8 @@ class SampleConfig(ConfigDict): - """ A very basic config that expect fields with a something float value, - in order to test that basic functionality works as expected. """ + """A very basic config that expect fields with a something float value, + in order to test that basic functionality works as expected.""" def get_item_schema(self): default = super(SampleConfig, self).get_item_schema() @@ -16,8 +16,8 @@ def get_item_schema(self): class TestConfigDict(unittest.TestCase): def test_init_valid(self): - """ Make sure that a valid configuration is accepted and ordering - preserved. """ + """Make sure that a valid configuration is accepted and ordering + preserved.""" a = SampleConfig( [ {"name": "a", "something": 1}, @@ -30,8 +30,8 @@ def test_init_valid(self): self.assertEqual(["a", "b", "z"][i], k) def test_init_invalid(self): - """ Ensure that the sample config rejects the bad string value - since something is expected to be a float value. """ + """Ensure that the sample config rejects the bad string value + since something is expected to be a float value.""" with self.assertRaises(ValueError): SampleConfig( [ @@ -42,7 +42,7 @@ def test_init_invalid(self): ) def test_update(self): - """ Test that we can't insert invalid values through update either. """ + """Test that we can't insert invalid values through update either.""" a = SampleConfig([{"name": "a", "something": 1}, {"name": "z", "something": 1}]) a.update({"b": {"something": 2}}) self.assertEqual(len(a), 3) @@ -52,7 +52,7 @@ def test_update(self): class TestDimVarAttrConfigs(unittest.TestCase): def test_dimension_config(self): - """ Test that the DimensionConfig object behaves as expected. """ + """Test that the DimensionConfig object behaves as expected.""" dc = DimensionConfig([{"name": "a", "size": 5}]) self.assertIn("a", dc.keys()) self.assertEqual(dc["a"]["size"], 5) @@ -68,7 +68,7 @@ def test_dimension_config(self): class TestOverallConfig(unittest.TestCase): def test_basic(self): - """ Make sure the configuration accepts a valid configuration. """ + """Make sure the configuration accepts a valid configuration.""" dims = DimensionConfig([{"name": "a", "size": 2}, {"name": "b", "size": None}]) vars = VariableConfig( [ @@ -80,7 +80,7 @@ def test_basic(self): Config(dims, vars, attrs) def test_basic_with_var_attrs(self): - """ Make sure the configuration accepts a valid configuration. """ + """Make sure the configuration accepts a valid configuration.""" dims = DimensionConfig([{"name": "a", "size": 2}, {"name": "b", "size": None}]) vars = VariableConfig( [ @@ -97,7 +97,7 @@ def test_basic_with_var_attrs(self): Config(dims, vars, attrs) def test_missing_dim(self): - """ The variable t depends on a dimension c that has not been configured. + """The variable t depends on a dimension c that has not been configured. Make sure a ValueError is raised because of this.""" dims = DimensionConfig([{"name": "a", "size": 2}, {"name": "b", "size": None}]) vars = VariableConfig( @@ -112,7 +112,7 @@ def test_missing_dim(self): def test_extra_dim(self): """We have configured an extra dimension z that isn't used by any variables. - Make sure a ValueError is raised. """ + Make sure a ValueError is raised.""" dims = DimensionConfig( [ {"name": "a", "size": 2}, diff --git a/test/generic/test_initialize_aggregation_file.py b/test/generic/test_initialize_aggregation_file.py index 30a0f0b..14ca5ad 100644 --- a/test/generic/test_initialize_aggregation_file.py +++ b/test/generic/test_initialize_aggregation_file.py @@ -26,7 +26,7 @@ def test_initialize_basic(self): "datatype": "int8", "attributes": { "_FillValue": 1, - } + }, } ], "global attributes": [], diff --git a/test/mag/magnl2hires_5min/test_magnl2hires.py b/test/mag/magnl2hires_5min/test_magnl2hires.py index d2ca578..149bf3f 100644 --- a/test/mag/magnl2hires_5min/test_magnl2hires.py +++ b/test/mag/magnl2hires_5min/test_magnl2hires.py @@ -17,7 +17,10 @@ def setUp(self): self.files = glob.glob(os.path.join(pwd, "data", "*.nc")) self.config = Config.from_nc(self.files[0]) self.config.dims["time"].update( - {"index_by": "time", "expected_cadence": {"time": 10},} + { + "index_by": "time", + "expected_cadence": {"time": 10}, + } ) def tearDown(self): diff --git a/test/mag/magnl2hires_gap/test_magnl2hires.py b/test/mag/magnl2hires_gap/test_magnl2hires.py index 5704345..9ba77f6 100644 --- a/test/mag/magnl2hires_gap/test_magnl2hires.py +++ b/test/mag/magnl2hires_gap/test_magnl2hires.py @@ -17,7 +17,10 @@ def setUp(self): self.files = glob.glob(os.path.join(pwd, "data", "*.nc")) self.config = Config.from_nc(self.files[0]) self.config.dims["time"].update( - {"index_by": "time", "expected_cadence": {"time": 10},} + { + "index_by": "time", + "expected_cadence": {"time": 10}, + } ) def tearDown(self):