diff --git a/pyproject.toml b/pyproject.toml
index d6cc213..05dee15 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,7 +1,7 @@
 [tool.poetry]
 name = "views-transformation-library"
-version = "2.7.1"
+version = "2.7.2"
 description = "A package containing data transformation functions used by the ViEWS team"
 homepage = "https://www.github.com/prio-data/views_transformation_library"
 readme = "README.md"
diff --git a/views_transformation_library/missing.py b/views_transformation_library/missing.py
index bfccf60..8b3c3a0 100644
--- a/views_transformation_library/missing.py
+++ b/views_transformation_library/missing.py
@@ -16,6 +16,15 @@
 from sklearn.linear_model import BayesianRidge  # type: ignore

 def replace_na(df: pd.DataFrame, replacement = 0):
+    """
+    replace_na
+
+    Replaces NaNs in the input dataframe with the specified value (which defaults to zero)
+
+    Arguments:
+        replacement: quantity which will replace NaN (defaults to zero)
+
+    """
     return df.replace(np.nan,replacement)

 def list_totally_missing(df: pd.DataFrame) -> List[str]:
@@ -29,7 +38,6 @@ def list_totally_missing(df: pd.DataFrame) -> List[str]:
     return cols


-
 def fill_groups_with_time_means(df: pd.DataFrame) -> pd.DataFrame:
     """ Fill completely missing groups with time means """

@@ -47,7 +55,6 @@ def fill_groups_with_time_means(df: pd.DataFrame) -> pd.DataFrame:
     return df


-
 def fill_with_group_and_global_means(df: pd.DataFrame) -> pd.DataFrame:
     """ Impute missing values to group-level or global means. """

@@ -62,13 +69,24 @@ def fill_with_group_and_global_means(df: pd.DataFrame) -> pd.DataFrame:
     return df


-
 def extrapolate(
     df: pd.DataFrame,
     limit_direction: str = "both",
     limit_area: Optional[str] = None,
 ) -> pd.DataFrame:
-    """ Interpolate and extrapolate """
+    """
+    extrapolate
+
+    Perform linear interpolation and/or extrapolation over NaNs by spatial unit
+
+    Arguments:
+        limit_direction: 'forward', 'backward', 'both': consecutive NaNs will be filled in this direction
+        limit_area: None, 'inside', 'outside': if 'inside', NaNs will only be filled if bracketed by valid values
+            (i.e. interpolation). If 'outside', NaNs are only filled outside valid values (i.e. extrapolation).
+            If None, both interpolation and extrapolation are performed
+
+    """
+
     return (
         df.sort_index()
         .groupby(level=1)
@@ -115,12 +133,18 @@ def fill(
     limit_direction: Literal["forward", "backward", "both"] = "both",
     limit_area: Optional[Literal["inside", "outside"]] = None,
 ) -> pd.Series:
-    """ Fill column in dataframe with optional direction and area.
+    """
+    fill
+
+    Perform forward and/or backward filling by spatial unit

     Args:
-        s: Pandas series to apply filling to.
-        limit_direction: Direction in which to fill.
-        limit_area: Area to fill. Default None refers to the entire series.
+        limit_direction: 'forward', 'backward', 'both': Direction in which to fill. 'forward' propagates the most
+            recent valid value forward. 'backward' propagates the oldest valid value backwards. 'both' performs a
+            forward propagation, followed by a backward propagation
+        limit_area: None, 'inside', 'outside': if 'inside', NaNs will only be filled if bracketed by valid values.
+            If 'outside', NaNs are only filled outside valid values. If None, no restrictions are applied.
+    """
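A minimal usage sketch for the filling helpers documented above; the two-level (time, unit) index layout, the column name and the toy values are assumptions for illustration, not part of the diff:

```python
import numpy as np
import pandas as pd

from views_transformation_library.missing import extrapolate, replace_na

# Assumed index layout: level 0 is time, level 1 is the spatial unit the library groups on.
idx = pd.MultiIndex.from_product([[1, 2, 3, 4], ["A", "B"]], names=["time", "unit"])
df = pd.DataFrame({"x": [np.nan, 1.0, 2.0, np.nan, np.nan, 3.0, 4.0, np.nan]}, index=idx)

zero_filled = replace_na(df)                        # every NaN becomes 0
inside_only = extrapolate(df, limit_area="inside")  # fill only NaNs bracketed by valid values, per unit
both_ways = extrapolate(df)                         # interpolate and extrapolate in both directions, per unit
```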
+ """ diff --git a/views_transformation_library/views_2.py b/views_transformation_library/views_2.py index 6a25017..85f8de3 100644 --- a/views_transformation_library/views_2.py +++ b/views_transformation_library/views_2.py @@ -10,12 +10,31 @@ import pandas as pd # type: ignore def delta(s: pd.Series, time: int = 1) -> pd.Series: - """ Return the time-delta of s """ + """ + delta + + Returns the time-delta of the input series s, s(t) - s(t-delta) + + Arguments: + + delta: integer specifying how large the time gap should be (defaults to 1) + + """ return s - tlag(s, time=time) def greater_or_equal(s: pd.Series, value: float) -> pd.Series: - """ 1 if s >= value, else 0 """ + """ + greater_or_equal + + Detects where input series is greater than or equal to a threshold value + + Returns 1 if s >= value, else 0 + + Arguments: + value: float specifying threshold + + """ mask = s >= value y = mask.astype(int) @@ -23,7 +42,17 @@ def greater_or_equal(s: pd.Series, value: float) -> pd.Series: return y def smaller_or_equal(s: pd.Series, value: float) -> pd.Series: - """ 1 if s >= value, else 0 """ + """ + smaller_or_equal + + Detects where input series is less than or equal to a threshold value + + Returns 1 if s <= value, else 0 + + Arguments: + value: float specifying threshold + + """ mask = s <= value y = mask.astype(int) @@ -31,7 +60,18 @@ def smaller_or_equal(s: pd.Series, value: float) -> pd.Series: return y def in_range(s: pd.Series, low: float, high: float) -> pd.Series: - """ 1 if low <= s <= high else 0 """ + """ + in_range + + Detects where input series lies between two values. + + Returns 1 if low <= s <= high else 0 + + Arguments: + low: float specifying lower threshold + high: float specifying higher threshold + + """ y_high = smaller_or_equal(s, high) y_low = greater_or_equal(s, low) @@ -40,7 +80,15 @@ def in_range(s: pd.Series, low: float, high: float) -> pd.Series: return y def tlag(s: pd.Series, time: int) -> pd.Series: - """ Time lag """ + """ + tlag + + Shifts input series backwards in time + + Arguments: + time: int specifying how many timesteps to shift backwards by + + """ if time < 1: msg = f"Time below 1 passed to tlag: {time} \n" msg += "Call tlead() instead \n" @@ -50,7 +98,15 @@ def tlag(s: pd.Series, time: int) -> pd.Series: def tlead(s: pd.Series, time: int) -> pd.Series: - """ Time lead """ + """ + tlead + + Shifts input series forwards in time + + Arguments: + time: int specifying how many timesteps to shift forwards by + + """ if time < 1: msg = f"Time below 1 passed to tlead: {time} \n" msg += "Call tlag() instead \n" @@ -59,7 +115,16 @@ def tlead(s: pd.Series, time: int) -> pd.Series: return s.groupby(level=1).shift(-time) def moving_average(s: pd.Series, time: int) -> pd.Series: - """ Moving average """ + """ + moving_average + + Computes moving average over a specified time window + + Arguments: + window: integer size of moving time window over which to average + + """ + if time < 1: msg = f"Time below 1 passed to ma: {time} \n" raise RuntimeError(msg) @@ -77,7 +142,16 @@ def moving_average(s: pd.Series, time: int) -> pd.Series: return y def moving_sum(s: pd.Series, time: int) -> pd.Series: - """ Moving sum """ + """ + moving_sum + + Computes moving sum over a specified time window + + Arguments: + window: integer size of moving time window over which to sum + + """ + if time < 1: msg = f"Time below 1 passed to ms: {time} \n" raise RuntimeError(msg) @@ -96,9 +170,17 @@ def moving_sum(s: pd.Series, time: int) -> pd.Series: def cweq(s: pd.Series, value: float, 
@@ -96,9 +170,17 @@ def moving_sum(s: pd.Series, time: int) -> pd.Series:

 def cweq(s: pd.Series, value: float,
          seed=None) -> pd.Series:
-    """Count while s equals value
+    """
+    cweq

-    @TODO: Seed from series (series of seeds per groupvar?)
+    Moving forwards in time, continue to count the number of timesteps while the input series equals value. If
+    the series ceases to be equal to the specified value, reset the count to zero.
+
+    Seed specifies the value of the count at the beginning of the series (when no prior data are available)
+
+    Arguments:
+        value: float specifying the value of the series to follow
+        seed: assumed count at the beginning of the series (defaults to None)

     """
@@ -112,7 +194,7 @@ def set_seed(count, s, seed, mask):
         As the time count is summed cumulatively we can
         "seed" this counting sum with a starting value.

-        This seed is therefre insterted into the first time period
+        This seed is therefore inserted into the first time period
         of the count IF the country is in peace at that time.
         Being in peace means the count is True, or ==1 as we
         already cast the masks T/F to the counters 1/0.
@@ -160,7 +242,10 @@ def set_seed(count, s, seed, mask):
     return y

 def time_since(s, value=0, seed=None) -> pd.Series:
-    """time since event in s where event is value other than 0.
+    """
+    time_since
+
+    Time since the most recent event in the series, where an event is the series deviating from value.

     In order to compute a variable like
     "time since previous conflict event" we must
     apply a timelag to cweq() to get a series because
@@ -183,35 +268,82 @@ def time_since(s, value=0, seed=None) -> pd.Series:
     rhs variables are time-lagged anyway
     but this is useful for dynamic simulation where
     X and predicted y are simulatenous.

+    Arguments:
+        value: float specifying the value of the series to follow
+        seed: assumed time_since at the beginning of the series (defaults to None)
+
     """
     return cweq(s=tlag(s=s, time=1), value=value, seed=seed)

 def decay(s: pd.Series, halflife: float) -> pd.Series:
-    """Decay function
+    """
+    decay
+
+    Decay function, returning 2**(-s/halflife)

     See half-life formulation at
     https://en.wikipedia.org/wiki/Exponential_decay
+
+    Arguments:
+        halflife: float specifying the time over which decay by a factor of 2 occurs
+
     """
     return 2 ** ((-1 * s) / halflife)

 def mean(s: pd.Series) -> pd.Series:
-    """ Per-groupvar arithmetic mean """
+    """
+    mean
+
+    Computes the arithmetic mean over time for each spatial unit
+
+    Arguments:
+        None
+
+    """
     return s.groupby(level=1).transform("mean")

 def ln(s: pd.Series) -> pd.Series:
-    """ Natural log of s+1 """
+    """
+    ln
+
+    Returns the natural log of s+1
+
+    Arguments:
+        None
+
+    """
+
     return np.log1p(s)

 def demean(s: pd.Series) -> pd.Series:
-    """ demean, s = s - mean_group(s) """
+    """
+    demean
+
+    Computes the difference between each value and the mean of the input, grouped by spatial unit
+
+    Returns s - mean_group(s)
+
+    Arguments:
+        None
+
+    """
+
     s_mean = s.groupby(level=1).transform("mean")
     return s - s_mean

 def rollmax(s: pd.Series, window: int) -> pd.Series:
-    """ Rolling max """
+    """
+    rollmax
+
+    Computes the rolling maximum over a specified time window
+
+    Arguments:
+        window: integer size of the moving time window over which to compute the maximum
+
+    """
     # See https://github.com/pandas-dev/pandas/issues/14013
     y = s.groupby(level=1).apply(
         lambda x: x.rolling(window=window, min_periods=0).max()
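The counters above are typically chained; a hedged sketch of the time_since + decay pattern, where the toy event series and the (time, unit) index names are assumptions, not part of the diff:

```python
import pandas as pd

from views_transformation_library.views_2 import decay, time_since

# Assumed (time, unit) index; non-zero values mark events.
idx = pd.MultiIndex.from_product([range(1, 8), ["A"]], names=["time", "unit"])
events = pd.Series([0, 1, 0, 0, 0, 1, 0], index=idx)

since = time_since(events)           # timesteps since the series last deviated from 0, per unit
weight = decay(since, halflife=2.0)  # 2 ** (-since / 2): halves for every 2 additional timesteps of calm
```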
@@ -220,16 +352,30 @@ def rollmax(s: pd.Series, window: int) -> pd.Series:
     return y

 def onset_possible(s: pd.Series, window: int) -> pd.Series:
-    """Onset possible if no event occured in the preceeding window times """
+    """
+    onset_possible
+
+    Helper function which detects whether an onset (a change from a zero to a non-zero state after at least window
+    zero values) is possible. This function detects whether no event occurred in the preceding window timesteps
+
+    Arguments:
+        window: integer specifying how many zero values must exist before a non-zero value to constitute an onset
+
+    """
+
     # fillna() is so that the first t in a group is always a possible onset
     return (~rollmax(tlag(s, 1).fillna(0), window).astype(bool)).astype(int)

 def onset(s: pd.Series, window: int) -> pd.Series:
-    """Compute onset
+    """
+    onset
+
+    Computes onsets, where an onset occurs if, given the specified window, an onset is possible and the value of s
+    is non-zero
+
+    Arguments:
+        window: integer specifying how many zero values must exist before a non-zero value to constitute an onset

-    A row is defined as an onset if
-    * onset is possible
-    * s is greater than 0
     """
     s_onset_possible = (
         onset_possible(s, window).astype(bool) & s.astype(bool)
@@ -239,7 +385,7 @@ def onset(s: pd.Series, window: int) -> pd.Series:
 def tick_time_since(s_event: pd.Series, s_time_since: pd.Series) -> np.ndarray:
     """Special time since ticker

-    Special case time_sine counter.
+    Special case time_since counter.
     In dynasim we need a fast update of time_since variables.
     The normal time_since(s) is very fast for a single pass but
     for updates at each t it is too slow.
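A small worked example of the onset logic documented above, using an assumed (time, unit) index; with window=2, only an event preceded by two zero timesteps counts as an onset:

```python
import pandas as pd

from views_transformation_library.views_2 import onset, onset_possible

# Assumed index layout: level 0 is time, level 1 is the spatial unit.
idx = pd.MultiIndex.from_product([range(1, 7), ["A"]], names=["time", "unit"])
s = pd.Series([0, 0, 1, 1, 0, 1], index=idx)

possible = onset_possible(s, window=2)  # 1, 1, 1, 0, 0, 0: no event in the preceding two timesteps
onsets = onset(s, window=2)             # 0, 0, 1, 0, 0, 0: the event at t=6 follows only one zero, so no onset
```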