Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixes string fill values converted to "nan" #11

Merged
merged 4 commits into from
Jan 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# 0.8.18 - 2024-01-21

- Bug fix: fixed string variable fill values replaced by the string "nan".

# 0.8.17 - 2023-07-05

- New feature: support `copy_from_alt` list of alternative variable names to use if `name` variable
Expand Down
16 changes: 8 additions & 8 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,18 +54,18 @@
master_doc = "index"

# General information about the project.
project = u"ncagg"
copyright = u"2018, Stefan Codrescu"
author = u"Stefan Codrescu"
project = "ncagg"
copyright = "2018, Stefan Codrescu"
author = "Stefan Codrescu"

# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
version = u""
version = ""
# The full version, including alpha/beta/rc tags.
release = u""
release = ""

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
Expand Down Expand Up @@ -146,15 +146,15 @@
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
(master_doc, "ncagg.tex", u"ncagg Documentation", u"Author", "manual"),
(master_doc, "ncagg.tex", "ncagg Documentation", "Author", "manual"),
]


# -- Options for manual page output ---------------------------------------

# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [(master_doc, "ncagg", u"ncagg Documentation", [author], 1)]
man_pages = [(master_doc, "ncagg", "ncagg Documentation", [author], 1)]


# -- Options for Texinfo output -------------------------------------------
Expand All @@ -166,7 +166,7 @@
(
master_doc,
"ncagg",
u"ncagg Documentation",
"ncagg Documentation",
author,
"ncagg",
"One line description of project.",
Expand Down
14 changes: 6 additions & 8 deletions ncagg/aggregator.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def generate_aggregation_list(config, files_to_aggregate):

def cast_bound(bound):
# type: (float | datetime) -> float
""" Cast a bound to a numerical type for use. Will not be working directly with datetime objects. """
"""Cast a bound to a numerical type for use. Will not be working directly with datetime objects."""
if isinstance(bound, datetime):
units = config.vars[primary_index_by["index_by"]]["attributes"]["units"]
return nc.date2num(bound, units)
Expand All @@ -111,9 +111,8 @@ def cast_bound(bound):
)

# Can continue into the correction loop as long as we have at least cadence_hz, or min and max.
if (
cadence_hz is None
or (first_along_primary is None and last_along_primary is None)
if cadence_hz is None or (
first_along_primary is None and last_along_primary is None
):
return preliminary

Expand Down Expand Up @@ -271,7 +270,6 @@ def evaluate_aggregation_list(config, aggregation_list, to_fullpath, callback=No
vars_unlim.append(v)

with nc.Dataset(to_fullpath, "r+") as nc_out: # type: nc.Dataset

# the vars once don't depend on an unlimited dim so only need to be copied once. Find the first
# InputFileNode to copy from so we don't get fill values. Otherwise, if none exists, which shouldn't
# happen, but oh well, use a fill node.
Expand Down Expand Up @@ -364,8 +362,8 @@ def evaluate_aggregation_list(config, aggregation_list, to_fullpath, callback=No
def initialize_aggregation_file(config, fullpath):
# type: (Config, str) -> None
"""
Based on the configuration, initialize a file in which to write the aggregated output.
Based on the configuration, initialize a file in which to write the aggregated output.

In other words, resurrect a netcdf file that would result in config.

:param config: Aggregation configuration
Expand All @@ -386,7 +384,7 @@ def initialize_aggregation_file(config, fullpath):
fill_value = var_type.type(fill_value)
zlib = True
if np.issubdtype(var_type, str):
# NetCDF started raising RuntimeError when passed compression args on
# NetCDF started raising RuntimeError when passed compression args on
# vlen datatypes. Detect vlens (str for now) and avoid compression.
# Ref: https://github.com/Unidata/netcdf4-python/issues/1205
zlib = False
Expand Down
33 changes: 15 additions & 18 deletions ncagg/aggrelist.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,15 @@ def get_fill_for(variable):
:return: A fill value for variable.
"""
datatype = np.dtype(variable["datatype"])
try:

if np.issubdtype(datatype, np.floating):
return datatype.type(np.nan)
except ValueError:
# for an integer type, there is no concept of nan, this will raise
# ValueError: cannot convert float NaN to integer, so use -9999 instead
# main reason for this complexity is to handle exis integer datatypes
nc_default_fill = datatype.type(nc.default_fillvals[datatype.str[1:]])
return datatype.type(variable["attributes"].get("_FillValue", nc_default_fill))

if np.issubdtype(datatype, str):
return ""

nc_default_fill = datatype.type(nc.default_fillvals[datatype.str[1:]])
return datatype.type(variable["attributes"].get("_FillValue", nc_default_fill))


class VariableNotFoundException(Exception):
Expand Down Expand Up @@ -154,7 +155,6 @@ def data_for(self, var):
result_shape = []

for index, dim in enumerate(var["dimensions"]):

dim_size = self.config.dims[dim]["size"]
dim_unlim = dim_size is None # save dim_unlim because we'll set the size

Expand Down Expand Up @@ -228,7 +228,6 @@ def get_coverage(self):
if d["index_by"] is not None and not d["flatten"]
]
for udim in index_by:

# cadence_hz may be None in which case we'll simply look for fill or invalid values in the index_by
# variable. At the moment, this is hard coded to seek 0's since our main use case is index_by time
# and we don't expect our spacecraft to teleport back to the epoch value :)
Expand Down Expand Up @@ -332,14 +331,14 @@ def cache_dim_sizes(self):
self.dim_sizes[dim["name"]] = self.get_file_internal_aggregation_size(dim)

def get_first_of_index_by(self, udim):
""" Get the first value along udim. """
"""Get the first value along udim."""
first_slice = self.file_internal_aggregation_list[udim["name"]][0]
assert isinstance(first_slice, slice), "Must be a slice!"
assert isinstance(first_slice.start, int), "Must be an int!"
return self.get_index_of_index_by(first_slice.start, udim).item(0)

def get_last_of_index_by(self, udim):
""" Get the last value along udim. """
"""Get the last value along udim."""
last_slice = self.file_internal_aggregation_list[udim["name"]][-1]
assert isinstance(last_slice, slice), "Must be a slice!"
assert isinstance(last_slice.start, int), "Must be an int!"
Expand Down Expand Up @@ -567,22 +566,20 @@ def data_for_netcdf(self, var, nc_in):
)

fill_value = get_fill_for(var)
nc_var = nc_in.variables[name]
dims = [
self.config.dims[d]
for d in var["dimensions"]
if d in nc_in.variables[name].dimensions
self.config.dims[d] for d in var["dimensions"] if d in nc_var.dimensions
]

# step 1: get the sorted data
dim_slices = tuple(
[self.sort_unlim.get(d["name"], slice(None)) for d in dims]
) or slice(None)
nc_in.variables[name].set_auto_mask(False)
nc_var.set_auto_mask(False)
prelim_data = nc_in.variables[name][dim_slices]
if hasattr(nc_in.variables[name], "_FillValue"):
where_to_fill = prelim_data == nc_in.variables[name]._FillValue
if hasattr(nc_var, "_FillValue"):
where_to_fill = prelim_data == nc_var._FillValue
prelim_data[where_to_fill] = fill_value
# prelim_data = np.ma.filled(nc_in.variables[name][dim_slices], fill_value=fill_value)

if len(dims) == 0:
# if this is just a scalar value, return
Expand Down
30 changes: 22 additions & 8 deletions ncagg/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,12 +50,16 @@ def parse_bound_arg(b):
if len(b_split) == 2:
b_split[0] = parse_time(b_split[0][1:])
# friendly cli: for the second bound, ignore whether it starts with a T or not.
b_split[1] = parse_time(b_split[1][1:] if b_split[1].startswith("T") else b_split[1])
b_split[1] = parse_time(
b_split[1][1:] if b_split[1].startswith("T") else b_split[1]
)
elif len(b_split) == 1:
# if there's only one, infer dayfile, monthfile, or yearfile based on the length
if len(b_split[0][1:]) == 4: # infer -bTYYYY:TYYYY+1
b_split[0] = parse_time(b_split[0][1:])
b_split.append(datetime(b_split[0].year + 1, 1, 1) - timedelta(microseconds=1))
b_split.append(
datetime(b_split[0].year + 1, 1, 1) - timedelta(microseconds=1)
)
elif len(b_split[0][1:]) == 6: # infer -bTYYYYMM:TYYYYMM+1
b_split[0] = parse_time(b_split[0][1:])
# datetime month must be in 1..12, so if month+1 == 13, increment year
Expand All @@ -66,16 +70,24 @@ def parse_bound_arg(b):
next_month = 1
else:
next_year = b_split[0].year
b_split.append(datetime(next_year, next_month, 1) - timedelta(microseconds=1))
b_split.append(
datetime(next_year, next_month, 1) - timedelta(microseconds=1)
)
elif len(b_split[0][1:]) == 8: # infer -bTYYYYMMDD:TYYYYMMDD+1
b_split[0] = parse_time(b_split[0][1:])
b_split.append(b_split[0] + timedelta(days=1) - timedelta(microseconds=1))
b_split.append(
b_split[0] + timedelta(days=1) - timedelta(microseconds=1)
)
elif len(b_split[0][1:]) == 10: # infer -bTYYYYMMDDHH:TYYYYMMDDHH+1
b_split[0] = parse_time(b_split[0][1:])
b_split.append(b_split[0] + timedelta(hours=1) - timedelta(microseconds=1))
b_split.append(
b_split[0] + timedelta(hours=1) - timedelta(microseconds=1)
)
elif len(b_split[0][1:]) == 12: # infer -bTYYYYMMDDHHMM:TYYYYMMDDHHMM+1
b_split[0] = parse_time(b_split[0][1:])
b_split.append(b_split[0] + timedelta(minutes=1) - timedelta(microseconds=1))
b_split.append(
b_split[0] + timedelta(minutes=1) - timedelta(microseconds=1)
)
else:
raise click.BadParameter("")
else:
Expand Down Expand Up @@ -131,7 +143,9 @@ def get_src_from_stdin(ctx, param, value):
)
elif not value:
# otherwise, nothing found
raise click.BadParameter("No files provided as argument or via stdin.", ctx=ctx, param=param)
raise click.BadParameter(
"No files provided as argument or via stdin.", ctx=ctx, param=param
)
return value


Expand Down Expand Up @@ -170,7 +184,7 @@ def get_src_from_stdin(ctx, param, value):
)
@click.option("-t", help="Specify a configuration template", type=click.File("r"))
def cli(dst, src, u=None, c=None, b=None, l="WARNING", t=None):
""" Aggregate NetCDF files. """
"""Aggregate NetCDF files."""
logging.getLogger().setLevel(l)
if t is not None: # if given a template...
config = Config.from_dict(json.load(t))
Expand Down
2 changes: 1 addition & 1 deletion ncagg/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ def __setitem__(self, key, value):
super(ConfigDict, self).__setitem__(value["name"], value)

def update(self, *args, **kwargs):
""" Update the ConfigDict, using __setitem__ in order to validate every entry. """
"""Update the ConfigDict, using __setitem__ in order to validate every entry."""
for k, v in dict(*args, **kwargs).items():
self[k] = v

Expand Down
6 changes: 4 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

setup(
name="ncagg",
version="0.8.17",
version="0.8.18",
description="Utility for aggregation of NetCDF data.",
author="Stefan Codrescu",
author_email="[email protected]",
Expand All @@ -27,5 +27,7 @@
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
],
project_urls={"Documentation": "http://ncagg.readthedocs.io/en/latest/",},
project_urls={
"Documentation": "http://ncagg.readthedocs.io/en/latest/",
},
)
4 changes: 2 additions & 2 deletions test/exis/EXISL1bSFEU_remapping/type1_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,9 @@ def tearDown(self):
os.remove(self.nc_out_filename)

def test_basic(self):
""" Ok, so the files in data/type1/ don't have an unlimited dimension, report_number should be
"""Ok, so the files in data/type1/ don't have an unlimited dimension, report_number should be
unlimited so I've made report_number unlimited in the config template type1_config.json.
Let's see if we can aggregate to it. """
Let's see if we can aggregate to it."""
aggregation_list = generate_aggregation_list(self.config, self.files)
self.assertEqual(len(aggregation_list), 3)
evaluate_aggregation_list(self.config, aggregation_list, self.nc_out_filename)
Expand Down
4 changes: 2 additions & 2 deletions test/exis/EXISL1bSFEU_remapping/type3_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,9 @@ def tearDown(self):
os.remove(self.nc_out_filename)

def test_using_product_bounds(self):
""" Ok, so the files in data/type3/ don't have an unlimited report_number dimension.
"""Ok, so the files in data/type3/ don't have an unlimited report_number dimension.
Also, euvsCQualityFlags is missing a report_number dimension, can we create an explicit
dependence on this? """
dependence on this?"""
start_time = datetime(2017, 8, 25, 0, 3, 30) # 2017-08-25T00:03:29.6Z
end_time = datetime(2017, 8, 25, 0, 5, 0) # 2017-08-25T00:04:29.6Z

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,4 +48,3 @@ def test_exis_with_config(self):
data = np.ma.filled(nc_out.variables["SPP_roll_angle"][:], np.nan)
self.assertFalse(np.any(np.isnan(data)))
self.assertEqual(len(data), 2) # one record in each file

22 changes: 12 additions & 10 deletions test/exis/xrsfl2flsum_simple/test_xrsfl2flsum.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import glob
import os
import netCDF4 as nc
import numpy as np


class TestAggregate(unittest.TestCase):
Expand All @@ -19,17 +20,18 @@ def tearDown(self):
os.remove(self.file)

def test_main(self):
"""
Nothing too fancy here, but just making sure that aggregating
a variable of strings works properly.

Previous to version 0.8.5 we had trouble with vlen datatypes.
"""
agg_list = generate_aggregation_list(self.config, self.files)
evaluate_aggregation_list(self.config, agg_list, self.file)

with nc.Dataset(self.file) as nc_in:
status = nc_in.variables["status"]
# there should be no fill values...
# before ncagg v0.8.5 vlen types like string incorrectly aggregated to all fill values.
self.assertFalse(any(status[:] == status._FillValue))
# there should be no fill values in the status variable
# before ncagg v0.8.5 vlen types like string incorrectly aggregated to fills
status = nc_in.variables["status"][:]
self.assertFalse(np.ma.is_masked(status))

# prior to ncagg v0.8.18, there was a bug that converted string fills
# to the string "nan"
flare_class = nc_in.variables["flare_class"][:]
self.assertFalse("nan" in flare_class)
self.assertTrue("" in flare_class)
self.assertTrue("B1.0" in flare_class)
2 changes: 1 addition & 1 deletion test/exis/xrsfl2flx1s_nochop/test_xrsfl2flx1s.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def test_main(self):
record because it was less than 0.5*expected_time_step from the start boundary.

Instead, that first data point should be taken, even if it is exactly on the
boundary, since the boundary isn't a real piece of data.
boundary, since the boundary isn't a real piece of data.

So, the essential piece of this test is to make sure that there remain 86400 records
before and after aggregation.
Expand Down
1 change: 0 additions & 1 deletion test/generic/test_attribute_strategies.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,6 @@ def test_strat_first_filename(self):
self.assertIn(".nc", finalize(self.test_nc))

def test_strat_static(self):

# set the config for a static "license" attribute...
value = "Hello world"
self.handler_kwargs["config"].attrs["license"] = {
Expand Down
Loading
Loading