Skip to content

Commit

Permalink
make docstrings PEP 257 compliant and consistent
Browse files Browse the repository at this point in the history
  • Loading branch information
crvernon committed Dec 19, 2023
1 parent acba721 commit a66ab58
Show file tree
Hide file tree
Showing 19 changed files with 494 additions and 334 deletions.
29 changes: 18 additions & 11 deletions stitches/fx_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,16 @@


def get_lat_name(ds):
"""Get the name for the latitude values (could be either lat or latitude).
"""Get the name for the latitude values in an xarray dataset.
:param ds: xarray dataset of CMIP data.
This function searches for latitude coordinates in the dataset,
which could be named either 'lat' or 'latitude'.
:return: the string name for the latitude variable.
:param ds: The dataset from which to retrieve the latitude coordinate name.
:type ds: xarray.Dataset
:returns: The name of the latitude variable.
:rtype: str
:raises RuntimeError: If no latitude coordinate is found in the dataset.
"""
for lat_name in ["lat", "latitude"]:
if lat_name in ds.coords:
Expand All @@ -19,11 +24,13 @@ def get_lat_name(ds):


def global_mean(ds):
"""Get the weighted global mean for a variable.
:param ds: xarray dataset of CMIP data.
"""
Calculate the weighted global mean for a variable in an xarray dataset.
:return: xarray dataset of the weighted global mean.
:param ds: The xarray dataset of CMIP data.
:type ds: xarray.Dataset
:returns: The xarray dataset of the weighted global mean.
:rtype: xarray.Dataset
"""
lat = ds[get_lat_name(ds)]
weight = np.cos(np.deg2rad(lat))
Expand All @@ -33,11 +40,11 @@ def global_mean(ds):


def get_ds_meta(ds):
"""Get the meta data information from the xarray data set.
:param ds: xarray dataset of CMIP data.
"""
Get the metadata information from an xarray dataset.
:return: pandas dataset of MIP information.
:param ds: xarray dataset of CMIP data.
:return: pandas DataFrame of MIP information.
"""
v = ds.variable_id

Expand Down
83 changes: 45 additions & 38 deletions stitches/fx_match.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,20 +8,22 @@

# Internal fx
def internal_dist(fx_pt, dx_pt, archivedata, tol=0):
"""This function calculates the euclidean distance between the target values (fx and dx)
and the archive values contained in the data frame. It will be used to help select which
of the archive values best matches the target values. To ensure a consistent unit across
all dimensions of the space, dx is updated to be windowsize*dx so that it has units of
degC. This results in a distance metric (Euclidean/l2) in units of degC.
Could _very_ easily make that choice of unit consistency optional via arg and if-statement.
:param fx_pt: a single value of the target fx value
:param dx_pt: a single value of the target dx value
:param archivedata: a data frame of the archive fx and dx values
:param archivedata: a data frame of the archive fx and dx values
:param tol: a tolerance for the neighborhood of matching. defaults to 0 degC - only the nearest-neighbor is returned
:return: a data frame with the target data and the corresponding matched archive data.
"""
Calculate the Euclidean distance between target and archive values.
This function calculates the Euclidean distance between the target values (fx and dx)
and the archive values contained in the dataframe. It is used to select which
archive values best match the target values. To ensure consistent units across
all dimensions, dx is updated to be windowsize*dx with units of degC, resulting
in a distance metric (Euclidean/l2) in units of degC. The choice of unit consistency
could be made optional via an argument and if-statement.
:param fx_pt: A single value of the target fx value.
:param dx_pt: A single value of the target dx value.
:param archivedata: A dataframe of the archive fx and dx values.
:param tol: A tolerance for the neighborhood of matching; defaults to 0 degC,
returning only the nearest neighbor.
:return: A dataframe with the target data and the corresponding matched archive data.
"""

# Check the inputs
Expand Down Expand Up @@ -73,11 +75,11 @@ def internal_dist(fx_pt, dx_pt, archivedata, tol=0):

# Internal fx
def shuffle_function(dt):
"""Randomly shuffle the deck, this should help with the matching process.
:param dt: a data of archive values that will be used in the matching process.
"""
Randomly shuffle the deck to assist with the matching process.
:return: a randomly ordered data frame.
:param dt: A DataFrame of archive values used in the matching process.
:return: A DataFrame with rows in random order.
"""
nrow = dt.shape[0]
out = dt.sample(nrow, replace=False)
Expand All @@ -87,15 +89,18 @@ def shuffle_function(dt):

# Internal fx
def drop_hist_false_duplicates(matched_data):
"""A helper function to remove false duplicate matches in the historical period. For
example, target 1850 gets 1872 data from realization 13 of SSP126 and SSP585.
The metadata of these archive values are different, but the actual data
values are identical because we just pasted in the same historical data to
every Experiment. So this function keeps only the first match.
"""
Remove false duplicate matches in the historical period.
:param matched_data: pandas object returned from match_neighborhood.
This function is used to remove false duplicate matches in the historical period.
For example, if the target year 1850 gets data from 1872 from realization 13 of
SSP126 and SSP585, the metadata of these archive values are different, but the
actual data values are identical because the same historical data was pasted into
every experiment. This function keeps only the first match.
:return: a data frame of matched data with the same structure as the input, with false duplicates in the historical period dropped
:param matched_data: pandas DataFrame returned from match_neighborhood.
:return: DataFrame with the same structure as the input, with false duplicates
in the historical period dropped.
"""

# Subset the idealized runs, since these are not concatenated with the historical time series
Expand Down Expand Up @@ -227,19 +232,21 @@ def drop_hist_false_duplicates(matched_data):
def match_neighborhood(
target_data, archive_data, tol: float = 0, drop_hist_duplicates: bool = True
):
"""This function takes data frames of target and archive data and calculates the euclidean distance between the target values (fx and dx) and the archive values.
:param target_data: a data frame of the target fx and dx values
:param archive_data: a data frame of the archive fx and dx values
:param tol: a tolerance for the neighborhood of matching. defaults to 0 degC - only the nearest-neighbor is returned
:type tol: float
:param drop_hist_duplicates: a Boolean True/False that defaults to True to determine whether to consider historical values across SSP scenarios to be duplicates and therefore all but one dropped from matching (True) or to be distinct points for matching (False).
:type drop_hist_duplicates: bool
:return: a data frame with the target data and the corresponding matched archive data.
"""
Calculate the Euclidean distance between target and archive data.
This function takes data frames of target and archive data and calculates the
Euclidean distance between the target values (fx and dx) and the archive values.
:param target_data: Data frame of the target fx and dx values.
:param archive_data: Data frame of the archive fx and dx values.
:param tol: Tolerance for the neighborhood of matching. Defaults to 0 degC,
meaning only the nearest-neighbor is returned. Must be a float.
:param drop_hist_duplicates: Determines whether to consider historical values
across SSP scenarios as duplicates (True) and drop all but one from matching,
or to consider them as distinct points for matching (False). Defaults to True.
:type drop_hist_duplicates: bool
:return: Data frame with the target data and the corresponding matched archive data.
"""
# Check the inputs of the functions
if util.nrow(target_data) <= 0:
Expand Down
18 changes: 11 additions & 7 deletions stitches/fx_pangeo.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,13 @@


def fetch_pangeo_table():
"""Get a copy of the pangeo archive contents
"""
Fetch the Pangeo CMIP6 archive table of contents as a pandas DataFrame.
Retrieve a copy of the Pangeo CMIP6 archive contents, which includes information
about the available models, sources, experiments, ensembles, and more.
:return: a pandas data frame containing information about the model, source, experiment, ensemble and so on that is available for download on pangeo.
:return: A pandas DataFrame with details on the datasets available for download from Pangeo.
"""

# The URL path that contains the pangeo archive table of contents.
Expand All @@ -20,12 +24,12 @@ def fetch_pangeo_table():


def fetch_nc(zstore: str):
"""Extract data for a single file.
:param zstore: str of the location of the cmip6 data file on pangeo.
:type zstore: str
"""
Extract data for a single file from Pangeo.
:return: an xarray containing cmip6 data downloaded from pangeo.
:param zstore: The location of the CMIP6 data file on Pangeo.
:type zstore: str
:return: An xarray Dataset containing CMIP6 data downloaded from Pangeo.
"""
ds = xr.open_zarr(fsspec.get_mapper(zstore))
ds.sortby("time")
Expand Down
75 changes: 42 additions & 33 deletions stitches/fx_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,14 @@


def calculate_rolling_mean(data, size):
""" "
Calculate the rolling mean for the data frame with a user defined size centered window.
:param data: A data frame of the cmip absolute temperature
:type data: pandas.core.frame.DataFrame
:param size: An integer value for the size of the window to use when calculating the rolling mean
:type size: int
:return: A pandas data frame of the smoothed time series (rolling mean applied)
"""
Calculate the rolling mean for the data frame with a user-defined size centered window.
:param data: A data frame of the CMIP absolute temperature.
:type data: pandas.core.frame.DataFrame
:param size: An integer value for the size of the window to use when calculating the rolling mean.
:type size: int
:return: A pandas data frame of the smoothed time series with the rolling mean applied.
"""
# Check inputs
util.check_columns(
Expand Down Expand Up @@ -61,15 +62,20 @@ def calculate_rolling_mean(data, size):


def chunk_ts(df, n, base_chunk=0):
"""Format a data frame into an array of data frames containing data for n-sized years of successive data.
:param df: data frame of climate data to chunk into different periods
:type df: pandas DataFrame
:param n: the size of the windows to chunk into separate periods
:type n: int
:param base_chunk: a helper argument for creating all of the staggered chunks, defaults to 0 (original behavior)
:type base_chunk: int
:return: pandas DataFrame identical to df with the addition of a chunk column
"""
Format a data frame into an array of data frames with n-sized years of successive data.
This function takes a data frame of climate data and chunks it into separate periods,
each containing data for a span of `n` years. It adds a 'chunk' column to the data frame
to indicate the period each row belongs to.
:param df: Data frame of climate data to chunk into different periods.
:type df: pandas.DataFrame
:param n: The size of the windows to chunk into separate periods.
:type n: int
:param base_chunk: A helper argument for creating staggered chunks, defaults to 0 (original behavior).
:type base_chunk: int
:return: A pandas DataFrame identical to `df` with the addition of a 'chunk' column.
"""

# Check inputs
Expand Down Expand Up @@ -102,11 +108,13 @@ def chunk_ts(df, n, base_chunk=0):


def get_chunk_info(df):
"""Determine the value and the rate of change for each chunk.
:param df: data frame of climate data chunked into different periods
:type df: pandas DataFrame
:return: pandas DataFrame of the chunk information, the start and end years as well as the chunk value (fx)
and the chunk rate of change (dx).
"""
Determine the value and the rate of change for each chunk.
:param df: Data frame of climate data chunked into different periods.
:type df: pandas.DataFrame
:return: A pandas DataFrame with the chunk information, including the start and end years, the chunk value (fx),
and the chunk rate of change (dx).
"""

# Check the inputs
Expand Down Expand Up @@ -189,19 +197,20 @@ def get_chunk_info(df):


def subset_archive(staggered_archive, end_yr_vector):
""" Take a staggered archive with chunked data for a 9 year window following
each year in 1850-2100 and subset to the entries with `end_yr` in
`end_yr_vector`.
:param staggered_archive: A formatted archive with chunked data starting
in each year
:type df: pandas DataFrame
:param end_yr_vector: vector of end_yrs want to subset the archive to.
:return: pandas DataFrame of the subsetted archive, same format just fewer
entries
"""
"""
Subset a staggered archive to entries with `end_yr` in `end_yr_vector`.
This function takes a staggered archive with chunked data for a 9-year window
following each year in 1850-2100 and subsets it to the entries with `end_yr`
in `end_yr_vector`.
:param staggered_archive: A formatted archive with chunked data starting in each year.
:type staggered_archive: pandas.DataFrame
:param end_yr_vector: Vector of end years to subset the archive to.
:type end_yr_vector: list or similar iterable
:return: A pandas DataFrame of the subsetted archive, same format but fewer entries.
:rtype: pandas.DataFrame
"""

out = staggered_archive[staggered_archive['end_yr'].isin(end_yr_vector)].reset_index(drop=True).copy()
return out
Loading

0 comments on commit a66ab58

Please sign in to comment.