Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DAS-2236 - Updates to methods that calculate the points used to create the dimensions #18

Open
wants to merge 16 commits into
base: DAS-2208-SMAP-L3
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
252 changes: 215 additions & 37 deletions hoss/coordinate_utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,13 @@

import numpy as np
from netCDF4 import Dataset

# from numpy import ndarray
from pyproj import CRS, Transformer
from varinfo import VariableFromDmr, VarInfoFromDmr

from hoss.exceptions import (
IncompatibleCoordinateVariables,
InvalidCoordinateData,
InvalidCoordinateDataset,
InvalidCoordinateVariable,
MissingCoordinateVariable,
MissingVariable,
)
Expand Down Expand Up @@ -97,24 +96,28 @@ def get_coordinate_variables(
varinfo: VarInfoFromDmr,
requested_variables: list,
) -> tuple[list, list]:
"""This function returns latitude and longitude variables listed in the
CF-Convention coordinates metadata attribute. It returns them in a specific
order [latitude, longitude]"
"""This function returns latitude and longitude variable names from
latitude and longitude variables listed in the CF-Convention coordinates
metadata attribute. It returns them in a specific
order [latitude_name, longitude_name]"
"""

coordinate_variables_list = varinfo.get_references_for_attribute(
coordinate_variables = varinfo.get_references_for_attribute(
requested_variables, 'coordinates'
)

latitude_coordinate_variables = [
coordinate
for coordinate in coordinate_variables_list
if varinfo.get_variable(coordinate).is_latitude()
for coordinate in coordinate_variables
if varinfo.get_variable(coordinate) is not None
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(Same comment for the similar change in the list comprehension for longitude_coordinate_variables)

Where do you catch any issues if there is a reference to a variable in a coordinates metadata attribute that isn't present in the file? If it's pointing to a latitude or longitude variable that isn't present in the file, at some point that's going to cause a failure when you try to access that variable. Is that gracefully handled somewhere?

(It's likely this is already nicely supported, but it's been a while since the previous PR and I just can't remember)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Separate comment - you have not added unit tests to prove this case is being handled as expected. (Needed both for latitude or longitude)

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  • this method is first called before prefetch. it should not fail if the variables don't exist.
  • if it is present in the variable and not in the file - it will fail during prefetch. That is when it is first accessed from the file. - will check if I have a test on that
  • There is a unit test for the function - will add subtests if the variables don't exist

Copy link
Collaborator Author

@sudha-murthy sudha-murthy Nov 20, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

and varinfo.get_variable(coordinate).is_latitude()
]

longitude_coordinate_variables = [
coordinate
for coordinate in coordinate_variables_list
if varinfo.get_variable(coordinate).is_longitude()
for coordinate in coordinate_variables
if varinfo.get_variable(coordinate) is not None
and varinfo.get_variable(coordinate).is_longitude()
]

return latitude_coordinate_variables, longitude_coordinate_variables
Expand Down Expand Up @@ -162,59 +165,234 @@ def get_coordinate_array(
return coordinate_array


def get_1d_dim_array_data_from_dimvalues(
    dim_values: np.ndarray,
    dim_indices: np.ndarray,
    dim_size: int,
) -> np.ndarray:
    """Build a complete 1-D dimension array of length `dim_size` from two
    valid projected values (`dim_values`) located at the positions given in
    `dim_indices`. The indices must lie in [0, dim_size). The two points
    define a linear resolution, which is extrapolated to both ends of the
    dimension.

    Raises InvalidCoordinateData when the two points cannot define a
    resolution (coincident indices or coincident values).
    """
    delta_value = dim_values[1] - dim_values[0]
    delta_index = dim_indices[1] - dim_indices[0]

    if delta_index == 0 or delta_value == 0:
        raise InvalidCoordinateData(
            'No distinct valid coordinate points - '
            f'dim_index={dim_indices[0]}, dim_value={dim_values[0]}'
        )

    dim_resolution = delta_value / delta_index

    # Extrapolate from each known point out to the first and last cells.
    first_value = dim_values[0] - dim_resolution * dim_indices[0]
    last_value = dim_values[1] + dim_resolution * (dim_size - 1 - dim_indices[1])

    return np.linspace(first_value, last_value, dim_size)


def get_valid_indices(
coordinate_row_col: np.ndarray, coordinate: VariableFromDmr
lat_lon_array: np.ndarray, coordinate: VariableFromDmr
) -> np.ndarray:
"""
Returns indices of a valid array without fill values if the fill
value is provided. If it is not provided, we check for valid values
for latitude and longitude
Returns an array of boolean values
- true, false - indicating a valid value (non-fill, within range)
for a given coordinate variable. A value of True means the
value is valid
- latitude or longitude - or
returns an empty ndarray of size (0,0) for any other variable.

"""
D-Auty marked this conversation as resolved.
Show resolved Hide resolved

# get_attribute_value returns a value of type `str`
coordinate_fill = coordinate.get_attribute_value('_FillValue')
if coordinate_fill is not None:
is_not_fill = ~np.isclose(coordinate_row_col, float(coordinate_fill))
is_not_fill = ~np.isclose(lat_lon_array, float(coordinate_fill))
else:
# Creates an entire array of `True` values.
is_not_fill = np.ones_like(coordinate_row_col, dtype=bool)
is_not_fill = np.ones_like(lat_lon_array, dtype=bool)

if coordinate.is_longitude():
valid_indices = np.where(
np.logical_and(
is_not_fill,
np.logical_and(
coordinate_row_col >= -180.0, coordinate_row_col <= 360.0
),
)
)[0]
valid_indices = np.logical_and(
is_not_fill,
np.logical_and(lat_lon_array >= -180.0, lat_lon_array <= 360.0),
)
elif coordinate.is_latitude():
valid_indices = np.where(
np.logical_and(
is_not_fill,
np.logical_and(coordinate_row_col >= -90.0, coordinate_row_col <= 90.0),
)
)[0]
valid_indices = np.logical_and(
is_not_fill,
np.logical_and(lat_lon_array >= -90.0, lat_lon_array <= 90.0),
)
else:
valid_indices = np.empty((0, 0))
raise InvalidCoordinateDataset(coordinate.full_name_path)

return valid_indices


def get_x_y_values_from_geographic_points(
    points: list[tuple],
    crs: CRS,
) -> list[tuple]:
    """Project a list of (longitude, latitude) points onto the target grid
    described by the PyProj CRS object, returning the projected points as a
    list of tuples in (x, y) order.

    """
    longitudes, latitudes = zip(*points)

    # EPSG:4326 source coordinates are supplied in (latitude, longitude)
    # order to the transformer.
    geographic_to_projected = Transformer.from_crs(4326, crs)
    x_values, y_values = (  # pylint: disable=unpacking-non-sequence
        geographic_to_projected.transform(latitudes, longitudes)
    )

    return list(zip(x_values, y_values))


def get_valid_row_col_pairs(
    lat_arr: np.ndarray,
    lon_arr: np.ndarray,
    lat_coordinate: VariableFromDmr,
    lon_coordinate: VariableFromDmr,
) -> tuple[list, list]:
    """Find one pair of indices maximally spread across a row and one pair
    maximally spread across a column, where each index is valid in both the
    latitude and longitude datasets. The maximal spread gives the greatest
    accuracy when interpolating between the points.
    """
    # A point is usable only where both coordinate arrays hold valid data.
    combined_valid_mask = np.logical_and(
        get_valid_indices(lat_arr, lat_coordinate),
        get_valid_indices(lon_arr, lon_coordinate),
    )

    # Maximally separated valid points within a single row.
    max_x_spread_pts = get_max_x_spread_pts(~combined_valid_mask)

    # Reuse the same row-wise search for columns by transposing the mask,
    # then swap each returned [x, y] pair back into [y, x] order.
    transposed_pts = get_max_x_spread_pts(np.transpose(~combined_valid_mask))
    max_y_spread_pts = [list(np.flip(point)) for point in transposed_pts]

    return max_y_spread_pts, max_x_spread_pts


def get_max_x_spread_pts(
    valid_geospatial_mask: np.ndarray,
) -> list[list]:
    """Return two points, as [y, x] index pairs, that share a row and are
    spread farthest apart within it (greatest delta-x), considering only
    points that are unmasked in `valid_geospatial_mask`. The input must be
    a 2D Numpy boolean mask where True marks INVALID data (e.g. fill or
    out-of-range latitude/longitude values).

    Returns [[y_ind, x_ind], [y_ind, x_ind]].

    Raises InvalidCoordinateData when no valid points exist, or when only
    a single valid point is available in the best row.
    """
    # Array where element [i, j] == j, masked wherever data is invalid.
    column_indices = np.indices(valid_geospatial_mask.shape)[1]
    masked_columns = np.ma.array(column_indices, mask=valid_geospatial_mask)

    if masked_columns.count() == 0:
        raise InvalidCoordinateData("No valid coordinate data")

    # Peak-to-peak per row gives each row's distance between its first and
    # last valid column; argmax selects the row with the widest spread.
    widest_row = np.argmax(masked_columns.ptp(axis=1))

    leftmost_col = np.min(masked_columns[widest_row])
    rightmost_col = np.max(masked_columns[widest_row])

    # A zero spread means the best row holds just one valid point.
    if leftmost_col == rightmost_col:
        raise InvalidCoordinateData("Only one valid point in coordinate data")

    return [[widest_row, leftmost_col], [widest_row, rightmost_col]]


def create_dimension_arrays_from_coordinates(
    prefetch_dataset: Dataset,
    latitude_coordinate: VariableFromDmr,
    longitude_coordinate: VariableFromDmr,
    crs: CRS,
    projected_dimension_names: list,
) -> dict[str, np.ndarray]:
    """Generate artificial 1D dimension scales for each 2D coordinate
    variable:
    1) find two valid geographic grid points per dimension,
    2) project them to an x-y extent with the supplied CRS,
    3) build the full x and y dimension-scale arrays.

    Returns a dict mapping each projected dimension name to its array.
    """
    lat_arr = get_coordinate_array(
        prefetch_dataset,
        latitude_coordinate.full_name_path,
    )
    lon_arr = get_coordinate_array(
        prefetch_dataset,
        longitude_coordinate.full_name_path,
    )

    row_size, col_size = get_row_col_sizes_from_coordinate_datasets(
        lat_arr,
        lon_arr,
    )
    row_indices, col_indices = get_valid_row_col_pairs(
        lat_arr, lon_arr, latitude_coordinate, longitude_coordinate
    )

    projected_y, projected_x = projected_dimension_names
    return {
        projected_y: get_dimension_array_from_geo_points(
            lat_arr, lon_arr, crs, row_indices, row_size, True
        ),
        projected_x: get_dimension_array_from_geo_points(
            lat_arr, lon_arr, crs, col_indices, col_size, False
        ),
    }


def get_dimension_array_from_geo_points(
    lat_arr: np.ndarray,
    lon_arr: np.ndarray,
    crs: CRS,
    dimension_indices: list,
    dimension_size: int,
    is_row=True,
) -> np.ndarray:
    """Reproject the lat/lon points at the given [row, col] index pairs
    with the supplied CRS, then scale the projected x (or y) values into a
    full dimension array of `dimension_size` elements.
    """
    # A row dimension varies along y: take its indices from element 0 of
    # each [row, col] pair and its values from element 1 of the projected
    # (x, y) tuples. A column dimension is the mirror image.
    if is_row:
        index_for_dimension, x_or_y_index = 0, 1
    else:
        index_for_dimension, x_or_y_index = 1, 0

    geographic_points = [
        (lon_arr[row, col], lat_arr[row, col]) for row, col in dimension_indices
    ]
    projected_points = get_x_y_values_from_geographic_points(geographic_points, crs)

    dimension_values = np.transpose(projected_points)[x_or_y_index]
    indices_for_dimension = np.transpose(dimension_indices)[index_for_dimension]

    return get_1d_dim_array_data_from_dimvalues(
        dimension_values, indices_for_dimension, dimension_size
    )
Loading