Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixes string fill values converted to "nan" #11

Merged
merged 4 commits into from
Jan 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# 0.8.18 - 2024-01-21

- Bug fix: fixed string variable fill values replaced by the string "nan".

# 0.8.17 - 2023-07-05

- New feature: support `copy_from_alt` list of alternative variable names to use if `name` variable
Expand Down
16 changes: 8 additions & 8 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,18 +54,18 @@
master_doc = "index"

# General information about the project.
project = u"ncagg"
copyright = u"2018, Stefan Codrescu"
author = u"Stefan Codrescu"
project = "ncagg"
copyright = "2018, Stefan Codrescu"
author = "Stefan Codrescu"

# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
version = u""
version = ""
# The full version, including alpha/beta/rc tags.
release = u""
release = ""

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
Expand Down Expand Up @@ -146,15 +146,15 @@
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
(master_doc, "ncagg.tex", u"ncagg Documentation", u"Author", "manual"),
(master_doc, "ncagg.tex", "ncagg Documentation", "Author", "manual"),
]


# -- Options for manual page output ---------------------------------------

# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [(master_doc, "ncagg", u"ncagg Documentation", [author], 1)]
man_pages = [(master_doc, "ncagg", "ncagg Documentation", [author], 1)]


# -- Options for Texinfo output -------------------------------------------
Expand All @@ -166,7 +166,7 @@
(
master_doc,
"ncagg",
u"ncagg Documentation",
"ncagg Documentation",
author,
"ncagg",
"One line description of project.",
Expand Down
14 changes: 6 additions & 8 deletions ncagg/aggregator.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def generate_aggregation_list(config, files_to_aggregate):

def cast_bound(bound):
# type: (float | datetime) -> float
""" Cast a bound to a numerical type for use. Will not be working directly with datetime objects. """
"""Cast a bound to a numerical type for use. Will not be working directly with datetime objects."""
if isinstance(bound, datetime):
units = config.vars[primary_index_by["index_by"]]["attributes"]["units"]
return nc.date2num(bound, units)
Expand All @@ -111,9 +111,8 @@ def cast_bound(bound):
)

# Can continue into the correction loop as long as we have at least cadence_hz, or min and max.
if (
cadence_hz is None
or (first_along_primary is None and last_along_primary is None)
if cadence_hz is None or (
first_along_primary is None and last_along_primary is None
):
return preliminary

Expand Down Expand Up @@ -271,7 +270,6 @@ def evaluate_aggregation_list(config, aggregation_list, to_fullpath, callback=No
vars_unlim.append(v)

with nc.Dataset(to_fullpath, "r+") as nc_out: # type: nc.Dataset

# the vars once don't depend on an unlimited dim so only need to be copied once. Find the first
# InputFileNode to copy from so we don't get fill values. Otherwise, if none exists, which shouldn't
# happen, but oh well, use a fill node.
Expand Down Expand Up @@ -364,8 +362,8 @@ def evaluate_aggregation_list(config, aggregation_list, to_fullpath, callback=No
def initialize_aggregation_file(config, fullpath):
# type: (Config, str) -> None
"""
Based on the configuration, initialize a file in which to write the aggregated output.
Based on the configuration, initialize a file in which to write the aggregated output.

In other words, resurrect a netcdf file that would result in config.

:param config: Aggregation configuration
Expand All @@ -386,7 +384,7 @@ def initialize_aggregation_file(config, fullpath):
fill_value = var_type.type(fill_value)
zlib = True
if np.issubdtype(var_type, str):
# NetCDF started raising RuntimeError when passed compression args on
# NetCDF started raising RuntimeError when passed compression args on
# vlen datatypes. Detect vlens (str for now) and avoid compression.
# Ref: https://github.com/Unidata/netcdf4-python/issues/1205
zlib = False
Expand Down
33 changes: 15 additions & 18 deletions ncagg/aggrelist.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,15 @@ def get_fill_for(variable):
:return: A fill value for variable.
"""
datatype = np.dtype(variable["datatype"])
try:

if np.issubdtype(datatype, np.floating):
return datatype.type(np.nan)
except ValueError:
# for an integer type, there is no concept of nan, this will raise
# ValueError: cannot convert float NaN to integer, so use -9999 instead
# main reason for this complexity is to handle exis integer datatypes
nc_default_fill = datatype.type(nc.default_fillvals[datatype.str[1:]])
return datatype.type(variable["attributes"].get("_FillValue", nc_default_fill))

if np.issubdtype(datatype, str):
return ""

nc_default_fill = datatype.type(nc.default_fillvals[datatype.str[1:]])
return datatype.type(variable["attributes"].get("_FillValue", nc_default_fill))


class VariableNotFoundException(Exception):
Expand Down Expand Up @@ -154,7 +155,6 @@ def data_for(self, var):
result_shape = []

for index, dim in enumerate(var["dimensions"]):

dim_size = self.config.dims[dim]["size"]
dim_unlim = dim_size is None # save dim_unlim because we'll set the size

Expand Down Expand Up @@ -228,7 +228,6 @@ def get_coverage(self):
if d["index_by"] is not None and not d["flatten"]
]
for udim in index_by:

# cadence_hz may be None in which case we'll simply look for fill or invalid values in the index_by
# variable. At the moment, this is hard coded to seek 0's since our main use case is index_by time
# and we don't expect our spacecraft to teleport back to the epoch value :)
Expand Down Expand Up @@ -332,14 +331,14 @@ def cache_dim_sizes(self):
self.dim_sizes[dim["name"]] = self.get_file_internal_aggregation_size(dim)

def get_first_of_index_by(self, udim):
""" Get the first value along udim. """
"""Get the first value along udim."""
first_slice = self.file_internal_aggregation_list[udim["name"]][0]
assert isinstance(first_slice, slice), "Must be a slice!"
assert isinstance(first_slice.start, int), "Must be an int!"
return self.get_index_of_index_by(first_slice.start, udim).item(0)

def get_last_of_index_by(self, udim):
""" Get the last value along udim. """
"""Get the last value along udim."""
last_slice = self.file_internal_aggregation_list[udim["name"]][-1]
assert isinstance(last_slice, slice), "Must be a slice!"
assert isinstance(last_slice.start, int), "Must be an int!"
Expand Down Expand Up @@ -567,22 +566,20 @@ def data_for_netcdf(self, var, nc_in):
)

fill_value = get_fill_for(var)
nc_var = nc_in.variables[name]
dims = [
self.config.dims[d]
for d in var["dimensions"]
if d in nc_in.variables[name].dimensions
self.config.dims[d] for d in var["dimensions"] if d in nc_var.dimensions
]

# step 1: get the sorted data
dim_slices = tuple(
[self.sort_unlim.get(d["name"], slice(None)) for d in dims]
) or slice(None)
nc_in.variables[name].set_auto_mask(False)
nc_var.set_auto_mask(False)
prelim_data = nc_in.variables[name][dim_slices]
if hasattr(nc_in.variables[name], "_FillValue"):
where_to_fill = prelim_data == nc_in.variables[name]._FillValue
if hasattr(nc_var, "_FillValue"):
where_to_fill = prelim_data == nc_var._FillValue
prelim_data[where_to_fill] = fill_value
# prelim_data = np.ma.filled(nc_in.variables[name][dim_slices], fill_value=fill_value)

if len(dims) == 0:
# if this is just a scalar value, return
Expand Down
30 changes: 22 additions & 8 deletions ncagg/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,12 +50,16 @@ def parse_bound_arg(b):
if len(b_split) == 2:
b_split[0] = parse_time(b_split[0][1:])
# friendly cli: for the second bound, ignore whether it starts with a T or not.
b_split[1] = parse_time(b_split[1][1:] if b_split[1].startswith("T") else b_split[1])
b_split[1] = parse_time(
b_split[1][1:] if b_split[1].startswith("T") else b_split[1]
)
elif len(b_split) == 1:
# if there's only one, infer dayfile, monthfile, or yearfile based on the length
if len(b_split[0][1:]) == 4: # infer -bTYYYY:TYYYY+1
b_split[0] = parse_time(b_split[0][1:])
b_split.append(datetime(b_split[0].year + 1, 1, 1) - timedelta(microseconds=1))
b_split.append(
datetime(b_split[0].year + 1, 1, 1) - timedelta(microseconds=1)
)
elif len(b_split[0][1:]) == 6: # infer -bTYYYYMM:TYYYYMM+1
b_split[0] = parse_time(b_split[0][1:])
# datetime month must be in 1..12, so if month+1 == 13, increment year
Expand All @@ -66,16 +70,24 @@ def parse_bound_arg(b):
next_month = 1
else:
next_year = b_split[0].year
b_split.append(datetime(next_year, next_month, 1) - timedelta(microseconds=1))
b_split.append(
datetime(next_year, next_month, 1) - timedelta(microseconds=1)
)
elif len(b_split[0][1:]) == 8: # infer -bTYYYYMMDD:TYYYYMMDD+1
b_split[0] = parse_time(b_split[0][1:])
b_split.append(b_split[0] + timedelta(days=1) - timedelta(microseconds=1))
b_split.append(
b_split[0] + timedelta(days=1) - timedelta(microseconds=1)
)
elif len(b_split[0][1:]) == 10: # infer -bTYYYYMMDDHH:TYYYYMMDDHH+1
b_split[0] = parse_time(b_split[0][1:])
b_split.append(b_split[0] + timedelta(hours=1) - timedelta(microseconds=1))
b_split.append(
b_split[0] + timedelta(hours=1) - timedelta(microseconds=1)
)
elif len(b_split[0][1:]) == 12: # infer -bTYYYYMMDDHHMM:TYYYYMMDDHHMM+1
b_split[0] = parse_time(b_split[0][1:])
b_split.append(b_split[0] + timedelta(minutes=1) - timedelta(microseconds=1))
b_split.append(
b_split[0] + timedelta(minutes=1) - timedelta(microseconds=1)
)
else:
raise click.BadParameter("")
else:
Expand Down Expand Up @@ -131,7 +143,9 @@ def get_src_from_stdin(ctx, param, value):
)
elif not value:
# otherwise, nothing found
raise click.BadParameter("No files provided as argument or via stdin.", ctx=ctx, param=param)
raise click.BadParameter(
"No files provided as argument or via stdin.", ctx=ctx, param=param
)
return value


Expand Down Expand Up @@ -170,7 +184,7 @@ def get_src_from_stdin(ctx, param, value):
)
@click.option("-t", help="Specify a configuration template", type=click.File("r"))
def cli(dst, src, u=None, c=None, b=None, l="WARNING", t=None):
""" Aggregate NetCDF files. """
"""Aggregate NetCDF files."""
logging.getLogger().setLevel(l)
if t is not None: # if given a template...
config = Config.from_dict(json.load(t))
Expand Down
2 changes: 1 addition & 1 deletion ncagg/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ def __setitem__(self, key, value):
super(ConfigDict, self).__setitem__(value["name"], value)

def update(self, *args, **kwargs):
""" Update the ConfigDict, using __setitem__ in order to validate every entry. """
"""Update the ConfigDict, using __setitem__ in order to validate every entry."""
for k, v in dict(*args, **kwargs).items():
self[k] = v

Expand Down
6 changes: 4 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

setup(
name="ncagg",
version="0.8.17",
version="0.8.18",
description="Utility for aggregation of NetCDF data.",
author="Stefan Codrescu",
author_email="[email protected]",
Expand All @@ -27,5 +27,7 @@
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
],
project_urls={"Documentation": "http://ncagg.readthedocs.io/en/latest/",},
project_urls={
"Documentation": "http://ncagg.readthedocs.io/en/latest/",
},
)
4 changes: 2 additions & 2 deletions test/exis/EXISL1bSFEU_remapping/type1_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,9 @@ def tearDown(self):
os.remove(self.nc_out_filename)

def test_basic(self):
""" Ok, so the files in data/type1/ don't have an unlimited dimension, report_number should be
"""Ok, so the files in data/type1/ don't have an unlimited dimension, report_number should be
unlimited so I've made report_number unlimited in the config template type1_config.json.
Let's see if we can aggregate to it. """
Let's see if we can aggregate to it."""
aggregation_list = generate_aggregation_list(self.config, self.files)
self.assertEqual(len(aggregation_list), 3)
evaluate_aggregation_list(self.config, aggregation_list, self.nc_out_filename)
Expand Down
4 changes: 2 additions & 2 deletions test/exis/EXISL1bSFEU_remapping/type3_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,9 @@ def tearDown(self):
os.remove(self.nc_out_filename)

def test_using_product_bounds(self):
""" Ok, so the files in data/type3/ don't have an unlimited report_number dimension.
"""Ok, so the files in data/type3/ don't have an unlimited report_number dimension.
Also, euvsCQualityFlags is missing a report_number dimension, can we create an explicit
dependence on this? """
dependence on this?"""
start_time = datetime(2017, 8, 25, 0, 3, 30) # 2017-08-25T00:03:29.6Z
end_time = datetime(2017, 8, 25, 0, 5, 0) # 2017-08-25T00:04:29.6Z

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,4 +48,3 @@ def test_exis_with_config(self):
data = np.ma.filled(nc_out.variables["SPP_roll_angle"][:], np.nan)
self.assertFalse(np.any(np.isnan(data)))
self.assertEqual(len(data), 2) # one record in each file

22 changes: 12 additions & 10 deletions test/exis/xrsfl2flsum_simple/test_xrsfl2flsum.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import glob
import os
import netCDF4 as nc
import numpy as np


class TestAggregate(unittest.TestCase):
Expand All @@ -19,17 +20,18 @@ def tearDown(self):
os.remove(self.file)

def test_main(self):
"""
Nothing too fancy here, but just making sure that aggregating
a variable of strings works properly.

Previous to version 0.8.5 we had trouble with vlen datatypes.
"""
agg_list = generate_aggregation_list(self.config, self.files)
evaluate_aggregation_list(self.config, agg_list, self.file)

with nc.Dataset(self.file) as nc_in:
status = nc_in.variables["status"]
# there should be no fill values...
# before ncagg v0.8.5 vlen types like string incorrectly aggregated to all fill values.
self.assertFalse(any(status[:] == status._FillValue))
# there should be no fill values in the status variable
# before ncagg v0.8.5 vlen types like string incorrectly aggregated to fills
status = nc_in.variables["status"][:]
self.assertFalse(np.ma.is_masked(status))

# prior to ncagg v0.8.18, there was a bug that converted string fills
# to the string "nan"
flare_class = nc_in.variables["flare_class"][:]
self.assertFalse("nan" in flare_class)
self.assertTrue("" in flare_class)
self.assertTrue("B1.0" in flare_class)
2 changes: 1 addition & 1 deletion test/exis/xrsfl2flx1s_nochop/test_xrsfl2flx1s.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def test_main(self):
record because it was less than 0.5*expected_time_step from the start boundary.

Instead, that first data point should be taken, even if it is exactly on the
boundary, since the boundary isn't a real piece of data.
boundary, since the boundary isn't a real piece of data.

So, the essential piece of this test is to make sure that there remain 86400 records
before and after aggregation.
Expand Down
1 change: 0 additions & 1 deletion test/generic/test_attribute_strategies.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,6 @@ def test_strat_first_filename(self):
self.assertIn(".nc", finalize(self.test_nc))

def test_strat_static(self):

# set the config for a static "license" attribute...
value = "Hello world"
self.handler_kwargs["config"].attrs["license"] = {
Expand Down
Loading
Loading