From 77649b5440685c23a0d28d79b74d4ba1a1b9f532 Mon Sep 17 00:00:00 2001 From: Jordan-Day-ONS <57715292+Jday7879@users.noreply.github.com> Date: Fri, 10 Jan 2025 07:35:27 +0000 Subject: [PATCH] 704 type marker back data (#143) * Creating backdata unit test * Testing updating return flag when imputation_flag exists - Updating unit test data - intial fir flag now overwritten with bir ... * frozen change pre-commit fix * update imputation flags from back data * updated filepath for input data * refactored, fillna using forward or backward is depreciated, now using ffill or bfill * replaced fillna(fillmethod) with either ffill or bfill * removed final fill method to either ffill or bfill * Dealing with pandas infer dtype warning * Adding extra unit test cases * question columns are correct for back data cases * Corrected columns in unit test data * update filtering to use defined bool * Docstrings and tidying up old comments --------- Co-authored-by: Wil Roberts --- .../imputation/apply_imputation_link.py | 24 ++- mbs_results/imputation/imputation_flags.py | 109 +++++++++---- mbs_results/imputation/ratio_of_means.py | 150 +++++++++++++++++- mbs_results/staging/data_cleaning.py | 23 +-- .../back_data_testing/C_FIC_FIC_input.csv | 21 +++ .../back_data_testing/C_FIC_FIC_output.csv | 21 +++ .../back_data_testing/FIC_FIC_FIC_input.csv | 13 ++ .../back_data_testing/FIC_FIC_FIC_output.csv | 13 ++ .../FIMC_FIMC_FIMC_input.csv | 13 ++ .../FIMC_FIMC_FIMC_output.csv | 13 ++ .../back_data_testing/FIR_FIR_FIR_input.csv | 13 ++ .../back_data_testing/FIR_FIR_FIR_output.csv | 13 ++ .../back_data_testing/MC_FIMC_FIMC_input.csv | 13 ++ .../back_data_testing/MC_FIMC_FIMC_output.csv | 13 ++ .../back_data_testing/R_FIR_FIR_input.csv | 13 ++ .../back_data_testing/R_FIR_FIR_output.csv | 13 ++ .../data_cleaning/test_run_live_or_frozen.csv | 8 - .../test_run_live_or_frozen_frozen_output.csv | 4 + .../test_run_live_or_frozen_input.csv | 4 + tests/imputation/test_imputation_flags.py | 5 + tests/imputation/test_ratio_of_means.py | 23 ++- .../test_ratio_of_means_back_data.py | 110 +++++++++++++ tests/staging/test_data_cleaning.py | 20 +-- 23 files changed, 573 insertions(+), 79 deletions(-) create mode 100644 tests/data/imputation/back_data_testing/C_FIC_FIC_input.csv create mode 100644 tests/data/imputation/back_data_testing/C_FIC_FIC_output.csv create mode 100644 tests/data/imputation/back_data_testing/FIC_FIC_FIC_input.csv create mode 100644 tests/data/imputation/back_data_testing/FIC_FIC_FIC_output.csv create mode 100644 tests/data/imputation/back_data_testing/FIMC_FIMC_FIMC_input.csv create mode 100644 tests/data/imputation/back_data_testing/FIMC_FIMC_FIMC_output.csv create mode 100644 tests/data/imputation/back_data_testing/FIR_FIR_FIR_input.csv create mode 100644 tests/data/imputation/back_data_testing/FIR_FIR_FIR_output.csv create mode 100644 tests/data/imputation/back_data_testing/MC_FIMC_FIMC_input.csv create mode 100644 tests/data/imputation/back_data_testing/MC_FIMC_FIMC_output.csv create mode 100644 tests/data/imputation/back_data_testing/R_FIR_FIR_input.csv create mode 100644 tests/data/imputation/back_data_testing/R_FIR_FIR_output.csv delete mode 100644 tests/data/staging/data_cleaning/test_run_live_or_frozen.csv create mode 100755 tests/data/staging/data_cleaning/test_run_live_or_frozen_frozen_output.csv create mode 100644 tests/data/staging/data_cleaning/test_run_live_or_frozen_input.csv create mode 100644 tests/imputation/test_ratio_of_means_back_data.py diff --git 
a/mbs_results/imputation/apply_imputation_link.py b/mbs_results/imputation/apply_imputation_link.py index 78517e7c..7bb64e26 100644 --- a/mbs_results/imputation/apply_imputation_link.py +++ b/mbs_results/imputation/apply_imputation_link.py @@ -56,6 +56,14 @@ def create_and_merge_imputation_values( # constructed has to come first to use the result for forward # impute from constructed imputation_config = { + # "backdata": { + # "intermediate_column": "backdata", + # "marker": "backdata", + # # doesn't actually apply a fill so can be forward or back + # "fill_column": target, + # "fill_method": "ffill", + # "link_column": cumulative_forward_link, + # }, "c": { "intermediate_column": "constructed", "marker": "c", @@ -145,9 +153,19 @@ def create_impute(df, group, imputation_spec): fill_column = imputation_spec["fill_column"] fill_method = imputation_spec["fill_method"] link_column = imputation_spec["link_column"] - df[column_name] = ( - df.groupby(group)[fill_column].fillna(method=fill_method) * df[link_column] - ) + imputation_spec["marker"] + + if fill_method == "ffill": + df[column_name] = df.groupby(group)[fill_column].ffill() * df[link_column] + elif fill_method == "bfill": + df[column_name] = df.groupby(group)[fill_column].bfill() * df[link_column] + + if "hold_period_0_values" in df.columns: + df.loc[df["hold_period_0_values"].notnull(), column_name] = df.loc[ + df["hold_period_0_values"].notnull(), "hold_period_0_values" + ] + df.drop(columns="hold_period_0_values", inplace=True) + return df diff --git a/mbs_results/imputation/imputation_flags.py b/mbs_results/imputation/imputation_flags.py index 6d30fa47..b6ed63d8 100644 --- a/mbs_results/imputation/imputation_flags.py +++ b/mbs_results/imputation/imputation_flags.py @@ -9,6 +9,7 @@ def generate_imputation_marker( reference: str, strata: str, auxiliary: str, + back_data_period: str, time_difference=1, **kwargs, ) -> pd.DataFrame: @@ -35,7 +36,10 @@ def generate_imputation_marker( Column name containing strata information (sic). auxiliary : str Column name containing auxiliary data. - time_difference: int + back_data_period : pd.Timestamp + Time period used as the back data period. This periods data + should not be changed + time_difference: int, Optional lookup distance for matched pairs kwargs : mapping, optional A dictionary of keyword arguments passed into func. @@ -47,6 +51,7 @@ def generate_imputation_marker( i.e. the type of imputation method that should be used to fill missing returns. 
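The pandas change behind several hunks in this patch (create_impute above, and imputation_overlaps_mc / flag_rolling_impute below) is the deprecation of the `method` argument to `fillna`; grouped fills are now written as explicit `ffill()` / `bfill()` calls. A minimal sketch of the equivalence, using toy column names rather than the pipeline's:

```python
import pandas as pd

# Toy frame: "ref" and "target" are illustrative names only.
df = pd.DataFrame(
    {
        "ref": [1, 1, 1, 2, 2],
        "target": [10.0, None, None, 5.0, None],
    }
)

# Deprecated in recent pandas releases (emits a FutureWarning):
# df.groupby("ref")["target"].fillna(method="ffill")

# Explicit equivalents used throughout this patch:
forward_filled = df.groupby("ref")["target"].ffill()
backward_filled = df.groupby("ref")["target"].bfill()

print(forward_filled.tolist())   # [10.0, 10.0, 10.0, 5.0, 5.0]
print(backward_filled.tolist())  # [10.0, nan, nan, 5.0, nan]
```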
""" + if f"{target}_man" in df.columns: flags = ["r", "mc", "fir", "bir", "fimc", "fic", "c"] # Check order from Specs @@ -54,10 +59,18 @@ def generate_imputation_marker( flags = ["r", "fir", "bir", "fic", "c"] create_imputation_logical_columns( - df, target, period, reference, strata, auxiliary, time_difference + df, + target, + period, + reference, + strata, + auxiliary, + back_data_period, + time_difference, ) select_cols = [f"{i}_flag_{target}" for i in flags] + df.to_csv("temp.csv") first_condition_met = [np.where(i)[0][0] for i in df[select_cols].values] df[f"imputation_flags_{target}"] = [flags[i] for i in first_condition_met] df.drop(columns=select_cols, inplace=True) @@ -72,6 +85,7 @@ def create_imputation_logical_columns( reference: str, strata: str, auxiliary: str, + back_data_period: str, time_difference: int = 1, ): """ @@ -110,31 +124,57 @@ def create_imputation_logical_columns( df.sort_values([reference, strata, period], inplace=True) - df[f"r_flag_{target}"] = df[target].notna() + if f"imputation_flags_{target}" in df.columns: + # Case where back data is present + backdata_r_mask = df[f"backdata_flags_{target}"] == "r" + backdata_fir_mask = df[f"backdata_flags_{target}"] == "fir" + backdata_fimc_mask = df[f"backdata_flags_{target}"] == "fimc" + backdata_c_mask = df[f"backdata_flags_{target}"] == "c" + backdata_fic_mask = df[f"backdata_flags_{target}"] == "fic" + + else: + df["is_backdata"] = df[reference] != df[reference] + backdata_r_mask = df[reference] != df[reference] + backdata_fir_mask = df[reference] != df[reference] + backdata_fimc_mask = df[reference] != df[reference] + backdata_c_mask = df[reference] != df[reference] + backdata_fic_mask = df[reference] != df[reference] + print(backdata_r_mask) + + # if target na but not back data period OR if backdata flag is 'r' + df[f"r_flag_{target}"] = (df[target].notna() & ~df["is_backdata"]) | backdata_r_mask if f"{target}_man" in df.columns: df[f"mc_flag_{target}"] = df[f"{target}_man"].notna() - df[f"fir_flag_{target}"] = flag_rolling_impute( - df, time_difference, strata, reference, target, period - ) + df[f"fir_flag_{target}"] = ( + flag_rolling_impute(df, time_difference, strata, reference, target, period) + & ~df["is_backdata"] + ) | backdata_fir_mask - df[f"bir_flag_{target}"] = flag_rolling_impute( - df, -time_difference, strata, reference, target, period - ) + df[f"bir_flag_{target}"] = ( + flag_rolling_impute(df, -time_difference, strata, reference, target, period) + & ~df["is_backdata"] + ) | backdata_r_mask if f"{target}_man" in df.columns: - df[f"fimc_flag_{target}"] = flag_rolling_impute( - df, time_difference, strata, reference, f"{target}_man", period + df[f"fimc_flag_{target}"] = ( + flag_rolling_impute( + df, time_difference, strata, reference, f"{target}_man", period + ) + | backdata_fimc_mask ) df = imputation_overlaps_mc(df, target, reference, strata) - construction_conditions = df[target].isna() & df[auxiliary].notna() + construction_conditions = ( + df[target].isna() & df[auxiliary].notna() & ~df["is_backdata"] + ) | backdata_c_mask df[f"c_flag_{target}"] = np.where(construction_conditions, True, False) - df[f"fic_flag_{target}"] = flag_rolling_impute( - df, time_difference, strata, reference, auxiliary, period + df[f"fic_flag_{target}"] = ( + flag_rolling_impute(df, time_difference, strata, reference, auxiliary, period) + | backdata_fic_mask ) return df @@ -173,11 +213,15 @@ def imputation_overlaps_mc(df, target, reference, strata): df[column] = np.where( df[imputation_marker_column] & 
df[f"mc_flag_{target}"], False, None ) - df[column] = ( - df.groupby([strata, reference])[column].fillna( - method=direction_single_string + "fill" - ) - ).fillna(True) + if direction_single_string == "b": + df[column] = ( + df.groupby([strata, reference])[column].bfill().astype(bool) + ).fillna(True) + elif direction_single_string == "f": + df[column] = ( + df.groupby([strata, reference])[column].ffill().astype(bool) + ).fillna(True) + df[imputation_marker_column] = df[imputation_marker_column] & df[column] df.drop( columns=[column], @@ -220,23 +264,28 @@ def flag_rolling_impute( pd.Series """ - if time_difference < 0: - fillmethod = "bfill" - elif time_difference > 0: - fillmethod = "ffill" - df["fill_group"] = ( (df[period] - pd.DateOffset(months=1) != df.shift(1)[period]) | (df[strata].diff(1) != 0) | (df[reference].diff(1) != 0) ).cumsum() - boolean_column = ( - df.groupby(["fill_group"])[target] - .fillna(method=fillmethod) - .notnull() - .mul(df["fill_group"] == df.shift(time_difference)["fill_group"]) - ) + if time_difference < 0: + boolean_column = ( + df.groupby(["fill_group"])[target] + .bfill() + .notnull() + .mul(df["fill_group"] == df.shift(time_difference)["fill_group"]) + ) + + elif time_difference > 0: + boolean_column = ( + df.groupby(["fill_group"])[target] + .ffill() + .notnull() + .mul(df["fill_group"] == df.shift(time_difference)["fill_group"]) + ) + df.drop(columns="fill_group", inplace=True) return boolean_column diff --git a/mbs_results/imputation/ratio_of_means.py b/mbs_results/imputation/ratio_of_means.py index eeee9e8f..66c61f49 100644 --- a/mbs_results/imputation/ratio_of_means.py +++ b/mbs_results/imputation/ratio_of_means.py @@ -219,6 +219,121 @@ def wrap_get_cumulative_links( return df +def process_backdata( + df: pd.DataFrame, target: str, period: str, back_data_period: str +) -> pd.DataFrame: + """ + Function to process the back data. Removes some values from the target column so + that the correct imputation links are calculated. + + Parameters + ---------- + df : pd.DataFrame + original dataframe + target : str + target column name + period : str + period column name + back_data_period : str + back data period value + + Returns + ------- + pd.DataFrame + dataframe with backdata processed and backdata flags copied to separate columns + """ + # Bool for if period is back data + df["is_backdata"] = df[period] == pd.to_datetime(back_data_period, format="%Y%m") + # Copying backdata to separate column + df.loc[df["is_backdata"], f"backdata_{target}"] = df.loc[df["is_backdata"], target] + # Copying flags to separate column + df[f"backdata_flags_{target}"] = df[f"imputation_flags_{target}"].str.lower() + + # moving mc data into manual construction column for MC imputation + df.loc[df[f"backdata_flags_{target}"] == "mc", f"{target}_man"] = df.loc[ + df[f"backdata_flags_{target}"] == "mc", target + ] + df.loc[df[f"backdata_flags_{target}"] == "fimc", f"{target}_man"] = df.loc[ + df[f"backdata_flags_{target}"] == "fimc", target + ] + + # removing all non-return back data from the target column + df.loc[ + (~df[f"backdata_flags_{target}"].isin(["r"])) + & (df[f"backdata_flags_{target}"].notna()), + target, + ] = None + + return df + +
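To make the new back-data handling concrete, here is a small self-contained sketch of what `process_backdata` does for one target column, together with how the back data period is derived (mirroring `calculate_back_data_period` further down). The column names (`date`, `question`, `imputation_flags_question`) follow the new test fixtures, and the `mc`/`fimc` hand-off into the `*_man` column is omitted for brevity:

```python
import pandas as pd

# Back data period: step `revision_period` months back from `current_period`,
# e.g. 202003 with a revision period of 2 gives 202001 (as in the new tests).
current_period, revision_period = "202003", 2
back_data_period = (
    pd.to_datetime(current_period, format="%Y%m")
    - pd.DateOffset(months=revision_period)
).strftime("%Y%m")

df = pd.DataFrame(
    {
        "date": pd.to_datetime(["202001", "202002", "202003"], format="%Y%m"),
        "question": [9836.0, None, None],
        "imputation_flags_question": ["FIR", None, None],
    }
)

# Mark the frozen back data period and keep copies of its values and markers.
df["is_backdata"] = df["date"] == pd.to_datetime(back_data_period, format="%Y%m")
df.loc[df["is_backdata"], "backdata_question"] = df.loc[df["is_backdata"], "question"]
df["backdata_flags_question"] = df["imputation_flags_question"].str.lower()

# Blank flagged back data that is not a genuine return, so the links are built
# from returned values only; reapply_backdata restores the copies afterwards.
not_return = df["backdata_flags_question"].notna() & (
    df["backdata_flags_question"] != "r"
)
df.loc[not_return, "question"] = None

print(df[["date", "question", "backdata_question", "backdata_flags_question"]])
```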
+def reapply_backdata( + df: pd.DataFrame, target: str, dropping: bool = False +) -> pd.DataFrame: + """ + Reapply backdata flags and values to ensure no changes are made to back data. + Will not do anything if back data is not present in the dataframe. + dropping is an optional argument which will drop the copied backdata column. + + Parameters + ---------- + df : pd.DataFrame + original dataframe + target : str + target column name + dropping : bool, optional + if True, the temporary column storing back data will be removed, by default False + + Returns + ------- + pd.DataFrame + original dataframe with back data re-applied. + """ + if f"backdata_flags_{target}" in df.columns: + + is_backdata_not_return = (df[f"backdata_flags_{target}"] != "r") & ( + df["is_backdata"] + ) + df.loc[is_backdata_not_return, target] = df.loc[ + is_backdata_not_return, f"backdata_{target}" + ] + df.loc[is_backdata_not_return, f"imputation_flags_{target}"] = df.loc[ + is_backdata_not_return, f"backdata_flags_{target}" + ] + + if dropping: + df.drop(columns=["is_backdata"], inplace=True) + + return df + + +def replace_fir_backdata(df: pd.DataFrame, target: str) -> pd.DataFrame: + """ + Replaces the target column with back data. + This is removed before calculating the forwards and backwards links to + ensure the correct values are used. + + Parameters + ---------- + df : pd.DataFrame + original dataframe + target : str + target column name + + Returns + ------- + pd.DataFrame + original dataframe with imputed data copied over into the target column. + + """ + if f"backdata_flags_{target}" in df.columns: + df.loc[(df[f"backdata_flags_{target}"].isin(["fir"])), target] = df.loc[ + (df[f"backdata_flags_{target}"].isin(["fir"])), f"backdata_{target}" + ] + + return df + + + def ratio_of_means( df: pd.DataFrame, target: str, @@ -226,6 +341,8 @@ def ratio_of_means( reference: str, strata: str, auxiliary: str, + current_period: str, + revision_period: str, filters: pd.DataFrame = None, manual_constructions: pd.DataFrame = None, imputation_links: Dict[str, str] = {}, @@ -276,6 +393,9 @@ def ratio_of_means( # These arguments are used from the majority of functions # TODO: Consider more elegant solution, or define function arguments explicitly + back_data_period = calculate_back_data_period(current_period, revision_period) + if f"imputation_flags_{target}" in df.columns: + df = process_backdata(df, target, period, back_data_period) default_columns = { "target": target, @@ -283,6 +403,7 @@ "reference": reference, "strata": strata, "auxiliary": auxiliary, + "back_data_period": back_data_period, } if filters is not None: @@ -319,13 +440,12 @@ imputation_types = ("c", "fir", "bir", "fic") df = ( - df # .pipe( - # create_impute_flags, - # **default_columns, - # predictive_auxiliary="f_predictive_auxiliary" - # ) + df + # Pass backdata period to calculate imputation link + .pipe(replace_fir_backdata, target=target) .pipe(generate_imputation_marker, **default_columns) .pipe(wrap_get_cumulative_links, **default_columns) + .pipe(reapply_backdata, target=target) .pipe( create_and_merge_imputation_values, **default_columns, @@ -336,6 +456,7 @@ construction_link="construction_link", imputation_types=imputation_types, ) + .pipe(reapply_backdata, target=target, dropping=True) ) # TODO: Reset index needed because of sorting, perhaps reset index @@ -381,3 +502,22 @@ # TODO: Missing extra columns, default values and if filter was applied, all bool return df + + +def calculate_back_data_period(current_period, revision_period) -> str: + current_period = pd.to_datetime(current_period, format="%Y%m") + back_data_period = ( + (current_period -
pd.DateOffset(months=revision_period)).date().strftime("%Y%m") + ) + return back_data_period + + +if __name__ == "__main__": + from mbs_results.utilities.inputs import load_config + + config = load_config() + bdp = calculate_back_data_period( + current_period=config["current_period"], + revision_period=config["revision_period"], + ) + print(config["current_period"], bdp) diff --git a/mbs_results/staging/data_cleaning.py b/mbs_results/staging/data_cleaning.py index 99ad0794..b9ac7bf2 100644 --- a/mbs_results/staging/data_cleaning.py +++ b/mbs_results/staging/data_cleaning.py @@ -79,7 +79,7 @@ def enforce_datatypes( keep_columns: list, master_column_type_dict: dict, temporarily_remove_cols: list, - **config + **config, ): """ function to change datatypes of columns based on config file @@ -187,7 +187,7 @@ def join_manual_constructions( reference: str, period: str, question_no: str = "question_no", - **config + **config, ): """ joins manual construction data from onto main dataframe @@ -278,9 +278,9 @@ def is_same_dtype(df: pd.DataFrame, df2: pd.DataFrame, col_name: str) -> bool: def run_live_or_frozen( df: pd.DataFrame, target: str or list[str], - error_marker: str, + status: str, state: str = "live", - error_values: List[str] = ["E", "W"], + error_values: List[str] = ["Check needed"], ) -> pd.DataFrame: """ For frozen, therefore target values are converted to null, hence responses @@ -292,15 +292,15 @@ def run_live_or_frozen( Original dataframe. target : str or list[str] Column(s) to treat as non-response. - error_marker : str - Column name with error values. + status : str + Column containing error status. state : str, optional Function config parameter. The default is "live". "live" state won't do - anyting, "frozen" will convert to null the error_values within error_marker + anyting, "frozen" will convert to null the error_values within status error_values : list[str], optional - Values to ignore. The default is ['E', 'W']. + Values to ignore. The default is ['Check needed']. 
Mapping: - E -> 'Check needed' : '201', + 'Check needed' : '201', ("E" or "W" for CSW) 'Clear' : '210', 'Clear - overridden' : '211' @@ -320,8 +320,9 @@ def run_live_or_frozen( ) if state == "frozen": - df["frozen_error"] = df.apply( - lambda x: x[target] if x[error_marker] in (error_values) else None, axis=1 + df[f"live_{target}"] = df[target].copy() + df[target] = df.apply( + lambda x: x[target] if x[status] not in (error_values) else None, axis=1 ) return df diff --git a/tests/data/imputation/back_data_testing/C_FIC_FIC_input.csv b/tests/data/imputation/back_data_testing/C_FIC_FIC_input.csv new file mode 100644 index 00000000..48c0d813 --- /dev/null +++ b/tests/data/imputation/back_data_testing/C_FIC_FIC_input.csv @@ -0,0 +1,21 @@ +identifier,date,group,question,other,imputation_flags_question +120001,202001,100,5240,50,R +120001,202002,100,2490,50 +120001,202003,100,3382,50 +120001,202004,100,4475,50 +120001,202005,100,1316,50 +120002,202001,100,7410,78,R +120002,202002,100,3602,78 +120002,202003,100,4972,78 +120002,202004,100,8838,78 +120002,202005,100,1535,78 +120003,202001,100,4530,94,R +120003,202002,100,7451,94 +120003,202003,100,7586,94 +120003,202004,100,283,94 +120003,202005,100,4416,94 +120004,202001,100,7738.738739,100,C +120004,202002,100,,100 +120004,202003,100,,100 +120004,202004,100,,100 +120004,202005,100,,100 diff --git a/tests/data/imputation/back_data_testing/C_FIC_FIC_output.csv b/tests/data/imputation/back_data_testing/C_FIC_FIC_output.csv new file mode 100644 index 00000000..f9b34c4e --- /dev/null +++ b/tests/data/imputation/back_data_testing/C_FIC_FIC_output.csv @@ -0,0 +1,21 @@ +identifier,date,group,output,imputation_flags_question,forward,backward,construction,count_forward,count_backward,count_construction,default_forward,default_backward,default_construction +120001,202001,100,5240,R,1,1.268552019,77.38738739,0,3,3,true,false,false +120001,202002,100,2490,R,0.788300349,0.849623589,61.0045045,3,3,3,false,false,false +120001,202003,100,3382,R,1.176991804,1.172403648,71.8018018,3,3,3,false,false,false +120001,202004,100,4475,R,0.852948557,1.870923352,61.24324324,3,3,3,false,false,false +120001,202005,100,1316,R,0.53449544,1,32.73423423,3,0,3,false,true,false +120002,202001,100,7410,R,1,1.268552019,77.38738739,0,3,3,true,false,false +120002,202002,100,3602,R,0.788300349,0.849623589,61.0045045,3,3,3,false,false,false +120002,202003,100,4972,R,1.176991804,1.172403648,71.8018018,3,3,3,false,false,false +120002,202004,100,8838,R,0.852948557,1.870923352,61.24324324,3,3,3,false,false,false +120002,202005,100,1535,R,0.53449544,1,32.73423423,3,0,3,false,true,false +120003,202001,100,4530,R,1,1.268552019,77.38738739,0,3,3,true,false,false +120003,202002,100,7451,R,0.788300349,0.849623589,61.0045045,3,3,3,false,false,false +120003,202003,100,7586,R,1.176991804,1.172403648,71.8018018,3,3,3,false,false,false +120003,202004,100,283,R,0.852948557,1.870923352,61.24324324,3,3,3,false,false,false +120003,202005,100,4416,R,0.53449544,1,32.73423423,3,0,3,false,true,false +120004,202001,100,7738.738739,C,1,1.268552019,77.38738739,0,3,3,true,false,false +120004,202002,100,6100.450451,FIC,0.788300349,0.849623589,61.0045045,3,3,3,false,false,false +120004,202003,100,7180.180181,FIC,1.176991804,1.172403648,71.8018018,3,3,3,false,false,false +120004,202004,100,6124.324325,FIC,0.852948557,1.870923352,61.24324324,3,3,3,false,false,false +120004,202005,100,3273.423424,FIC,0.53449544,1,32.73423423,3,0,3,false,true,false diff --git 
a/tests/data/imputation/back_data_testing/FIC_FIC_FIC_input.csv b/tests/data/imputation/back_data_testing/FIC_FIC_FIC_input.csv new file mode 100644 index 00000000..2e147223 --- /dev/null +++ b/tests/data/imputation/back_data_testing/FIC_FIC_FIC_input.csv @@ -0,0 +1,13 @@ +identifier,date,group,question,other,imputation_flags_question +40001,202001,100,4783,35,R +40001,202002,100,7902,35 +40001,202003,100,4911,35 +40002,202001,100,442,63,R +40002,202002,100,3136,63 +40002,202003,100,2115,63 +40003,202001,100,8121,16,R +40003,202002,100,2151,16 +40003,202003,100,1377,16 +40004,202001,100,9836,78,FIC +40004,202002,100,,78 +40004,202003,100,,78 diff --git a/tests/data/imputation/back_data_testing/FIC_FIC_FIC_output.csv b/tests/data/imputation/back_data_testing/FIC_FIC_FIC_output.csv new file mode 100644 index 00000000..2a79af55 --- /dev/null +++ b/tests/data/imputation/back_data_testing/FIC_FIC_FIC_output.csv @@ -0,0 +1,13 @@ +identifier,date,group,output,imputation_flags_question,forward,backward,construction,count_forward,count_backward,count_construction,default_forward,default_backward,default_construction +40001,202001,100,4783,R,1,1.011903859,117.070175438596,0,3,3,TRUE,FALSE,FALSE +40001,202002,100,7902,R,0.988236176,1.569558491,115.692982,3,3,3,false,false,false +40001,202003,100,4911,R,0.637121844,1,73.71052632,3,0,3,false,true,false +40002,202001,100,442,R,1,1.011903859,117.070175438596,0,3,3,TRUE,FALSE,FALSE +40002,202002,100,3136,R,0.988236176,1.569558491,115.692982,3,3,3,false,false,false +40002,202003,100,2115,R,0.637121844,1,73.71052632,3,0,3,false,true,false +40003,202001,100,8121,R,1,1.011903859,117.070175438596,0,3,3,TRUE,FALSE,FALSE +40003,202002,100,2151,R,0.988236176,1.569558491,115.692982,3,3,3,false,false,false +40003,202003,100,1377,R,0.637121844,1,73.71052632,3,0,3,false,true,false +40004,202001,100,9836,FIC,1,1.011903859,117.070175438596,0,3,3,TRUE,FALSE,FALSE +40004,202002,100,9720.291024,FIC,0.988236176,1.569558491,115.692982,3,3,3,false,false,false +40004,202003,100,6193.009741,FIC,0.637121844,1,73.71052632,3,0,3,false,true,false diff --git a/tests/data/imputation/back_data_testing/FIMC_FIMC_FIMC_input.csv b/tests/data/imputation/back_data_testing/FIMC_FIMC_FIMC_input.csv new file mode 100644 index 00000000..87f97339 --- /dev/null +++ b/tests/data/imputation/back_data_testing/FIMC_FIMC_FIMC_input.csv @@ -0,0 +1,13 @@ +identifier,date,group,question,other,imputation_flags_question +40001,202001,100,4783,35,R +40001,202002,100,7902,35 +40001,202003,100,4911,35 +40002,202001,100,442,63,R +40002,202002,100,3136,63 +40002,202003,100,2115,63 +40003,202001,100,8121,16,R +40003,202002,100,2151,16 +40003,202003,100,1377,16 +40004,202001,100,9836,78,FIMC +40004,202002,100,,78 +40004,202003,100,,78 diff --git a/tests/data/imputation/back_data_testing/FIMC_FIMC_FIMC_output.csv b/tests/data/imputation/back_data_testing/FIMC_FIMC_FIMC_output.csv new file mode 100644 index 00000000..7c43f93b --- /dev/null +++ b/tests/data/imputation/back_data_testing/FIMC_FIMC_FIMC_output.csv @@ -0,0 +1,13 @@ +identifier,date,group,output,imputation_flags_question,forward,backward,construction,count_forward,count_backward,count_construction,default_forward,default_backward,default_construction +40001,202001,100,4783,R,1,1.011903859,117.070175438596,0,3,3,TRUE,FALSE,FALSE +40001,202002,100,7902,R,0.988236176,1.569558491,115.692982,3,3,3,false,false,false +40001,202003,100,4911,R,0.637121844,1,73.71052632,3,0,3,false,true,false 
+40002,202001,100,442,R,1,1.011903859,117.070175438596,0,3,3,TRUE,FALSE,FALSE +40002,202002,100,3136,R,0.988236176,1.569558491,115.692982,3,3,3,false,false,false +40002,202003,100,2115,R,0.637121844,1,73.71052632,3,0,3,false,true,false +40003,202001,100,8121,R,1,1.011903859,117.070175438596,0,3,3,TRUE,FALSE,FALSE +40003,202002,100,2151,R,0.988236176,1.569558491,115.692982,3,3,3,false,false,false +40003,202003,100,1377,R,0.637121844,1,73.71052632,3,0,3,false,true,false +40004,202001,100,9836,FIMC,1,1.011903859,117.070175438596,0,3,3,TRUE,FALSE,FALSE +40004,202002,100,9720.291024,FIMC,0.988236176,1.569558491,115.692982,3,3,3,false,false,false +40004,202003,100,6193.009741,FIMC,0.637121844,1,73.71052632,3,0,3,false,true,false diff --git a/tests/data/imputation/back_data_testing/FIR_FIR_FIR_input.csv b/tests/data/imputation/back_data_testing/FIR_FIR_FIR_input.csv new file mode 100644 index 00000000..decf3b8f --- /dev/null +++ b/tests/data/imputation/back_data_testing/FIR_FIR_FIR_input.csv @@ -0,0 +1,13 @@ +identifier,date,group,question,other,imputation_flags_question +40001,202001,100,4783,35,R +40001,202002,100,7902,35 +40001,202003,100,4911,35 +40002,202001,100,442,63,R +40002,202002,100,3136,63 +40002,202003,100,2115,63 +40003,202001,100,8121,16,R +40003,202002,100,2151,16 +40003,202003,100,1377,16 +40004,202001,100,9836,78,FIR +40004,202002,100,,78 +40004,202003,100,,78 diff --git a/tests/data/imputation/back_data_testing/FIR_FIR_FIR_output.csv b/tests/data/imputation/back_data_testing/FIR_FIR_FIR_output.csv new file mode 100644 index 00000000..f8fdf5a2 --- /dev/null +++ b/tests/data/imputation/back_data_testing/FIR_FIR_FIR_output.csv @@ -0,0 +1,13 @@ +identifier,date,group,output,imputation_flags_question,forward,backward,construction,count_forward,count_backward,count_construction,default_forward,default_backward,default_construction +40001,202001,100,4783,R,1,1.011903859,117.070175438596,0,3,3,TRUE,FALSE,FALSE +40001,202002,100,7902,R,0.988236176,1.569558491,115.692982,3,3,3,FALSE,FALSE,FALSE +40001,202003,100,4911,R,0.637121844,1,73.71052632,3,0,3,FALSE,TRUE,FALSE +40002,202001,100,442,R,1,1.011903859,117.070175438596,0,3,3,TRUE,FALSE,FALSE +40002,202002,100,3136,R,0.988236176,1.569558491,115.692982,3,3,3,FALSE,FALSE,FALSE +40002,202003,100,2115,R,0.637121844,1,73.71052632,3,0,3,FALSE,TRUE,FALSE +40003,202001,100,8121,R,1,1.011903859,117.070175438596,0,3,3,TRUE,FALSE,FALSE +40003,202002,100,2151,R,0.988236176,1.569558491,115.692982,3,3,3,FALSE,FALSE,FALSE +40003,202003,100,1377,R,0.637121844,1,73.71052632,3,0,3,FALSE,TRUE,FALSE +40004,202001,100,9836,FIR,1,1.011903859,117.070175438596,0,3,3,TRUE,FALSE,FALSE +40004,202002,100,9720.291024,FIR,0.988236176,1.569558491,115.692982,3,3,3,FALSE,FALSE,FALSE +40004,202003,100,6193.009741,FIR,0.637121844,1,73.71052632,3,0,3,FALSE,TRUE,FALSE diff --git a/tests/data/imputation/back_data_testing/MC_FIMC_FIMC_input.csv b/tests/data/imputation/back_data_testing/MC_FIMC_FIMC_input.csv new file mode 100644 index 00000000..a9a9bb5e --- /dev/null +++ b/tests/data/imputation/back_data_testing/MC_FIMC_FIMC_input.csv @@ -0,0 +1,13 @@ +identifier,date,group,question,other,imputation_flags_question +40001,202001,100,4783,35,R +40001,202002,100,7902,35 +40001,202003,100,4911,35 +40002,202001,100,442,63,R +40002,202002,100,3136,63 +40002,202003,100,2115,63 +40003,202001,100,8121,16,R +40003,202002,100,2151,16 +40003,202003,100,1377,16 +40004,202001,100,9836,78,MC +40004,202002,100,,78 +40004,202003,100,,78 diff --git 
a/tests/data/imputation/back_data_testing/MC_FIMC_FIMC_output.csv b/tests/data/imputation/back_data_testing/MC_FIMC_FIMC_output.csv new file mode 100644 index 00000000..7e03188e --- /dev/null +++ b/tests/data/imputation/back_data_testing/MC_FIMC_FIMC_output.csv @@ -0,0 +1,13 @@ +identifier,date,group,output,imputation_flags_question,forward,backward,construction,count_forward,count_backward,count_construction,default_forward,default_backward,default_construction +40001,202001,100,4783,R,1,1.011903859,117.070175438596,0,3,3,TRUE,FALSE,FALSE +40001,202002,100,7902,R,0.988236176,1.569558491,115.692982,3,3,3,false,false,false +40001,202003,100,4911,R,0.637121844,1,73.71052632,3,0,3,false,true,false +40002,202001,100,442,R,1,1.011903859,117.070175438596,0,3,3,TRUE,FALSE,FALSE +40002,202002,100,3136,R,0.988236176,1.569558491,115.692982,3,3,3,false,false,false +40002,202003,100,2115,R,0.637121844,1,73.71052632,3,0,3,false,true,false +40003,202001,100,8121,R,1,1.011903859,117.070175438596,0,3,3,TRUE,FALSE,FALSE +40003,202002,100,2151,R,0.988236176,1.569558491,115.692982,3,3,3,false,false,false +40003,202003,100,1377,R,0.637121844,1,73.71052632,3,0,3,false,true,false +40004,202001,100,9836,MC,1,1.011903859,117.070175438596,0,3,3,TRUE,FALSE,FALSE +40004,202002,100,9720.291024,FIMC,0.988236176,1.569558491,115.692982,3,3,3,false,false,false +40004,202003,100,6193.009741,FIMC,0.637121844,1,73.71052632,3,0,3,false,true,false diff --git a/tests/data/imputation/back_data_testing/R_FIR_FIR_input.csv b/tests/data/imputation/back_data_testing/R_FIR_FIR_input.csv new file mode 100644 index 00000000..c0a45b1a --- /dev/null +++ b/tests/data/imputation/back_data_testing/R_FIR_FIR_input.csv @@ -0,0 +1,13 @@ +identifier,date,group,question,other,imputation_flags_question +40001,202001,100,4783,35,R +40001,202002,100,7902,35 +40001,202003,100,4911,35 +40002,202001,100,442,63,R +40002,202002,100,3136,63 +40002,202003,100,2115,63 +40003,202001,100,8121,16,R +40003,202002,100,2151,16 +40003,202003,100,1377,16 +40004,202001,100,9836,78,R +40004,202002,100,,78 +40004,202003,100,,78 diff --git a/tests/data/imputation/back_data_testing/R_FIR_FIR_output.csv b/tests/data/imputation/back_data_testing/R_FIR_FIR_output.csv new file mode 100644 index 00000000..a64b050b --- /dev/null +++ b/tests/data/imputation/back_data_testing/R_FIR_FIR_output.csv @@ -0,0 +1,13 @@ +identifier,date,group,output,imputation_flags_question,forward,backward,construction,count_forward,count_backward,count_construction,default_forward,default_backward,default_construction +40001,202001,100,4783,R,1,1.011903859,120.7395833,0,3,4,TRUE,FALSE,FALSE +40001,202002,100,7902,R,0.988236176,1.569558491,115.692982,3,3,3,false,false,false +40001,202003,100,4911,R,0.637121844,1,73.71052632,3,0,3,false,true,false +40002,202001,100,442,R,1,1.011903859,120.7395833,0,3,4,TRUE,FALSE,FALSE +40002,202002,100,3136,R,0.988236176,1.569558491,115.692982,3,3,3,false,false,false +40002,202003,100,2115,R,0.637121844,1,73.71052632,3,0,3,false,true,false +40003,202001,100,8121,R,1,1.011903859,120.7395833,0,3,4,TRUE,FALSE,FALSE +40003,202002,100,2151,R,0.988236176,1.569558491,115.692982,3,3,3,false,false,false +40003,202003,100,1377,R,0.637121844,1,73.71052632,3,0,3,false,true,false +40004,202001,100,9836,R,1,1.011903859,120.7395833,0,3,4,TRUE,FALSE,FALSE +40004,202002,100,9720.291024,FIR,0.988236176,1.569558491,115.692982,3,3,3,false,false,false +40004,202003,100,6193.009741,FIR,0.637121844,1,73.71052632,3,0,3,false,true,false diff --git 
a/tests/data/staging/data_cleaning/test_run_live_or_frozen.csv b/tests/data/staging/data_cleaning/test_run_live_or_frozen.csv deleted file mode 100644 index 02471894..00000000 --- a/tests/data/staging/data_cleaning/test_run_live_or_frozen.csv +++ /dev/null @@ -1,8 +0,0 @@ -target,error,live,frozen,frozen_error -2,C,2,2, -7,E,7,,7 -1,O,1,1, -6,W,6,,6 -3,C,3,3, -5,E,5,,5 -4,W,4,,4 diff --git a/tests/data/staging/data_cleaning/test_run_live_or_frozen_frozen_output.csv b/tests/data/staging/data_cleaning/test_run_live_or_frozen_frozen_output.csv new file mode 100755 index 00000000..4d5ca1f0 --- /dev/null +++ b/tests/data/staging/data_cleaning/test_run_live_or_frozen_frozen_output.csv @@ -0,0 +1,4 @@ +target,status,live_target +2,Clear,2 +,Check needed,7 +1,Clear - overridden,1 diff --git a/tests/data/staging/data_cleaning/test_run_live_or_frozen_input.csv b/tests/data/staging/data_cleaning/test_run_live_or_frozen_input.csv new file mode 100644 index 00000000..bb5411cf --- /dev/null +++ b/tests/data/staging/data_cleaning/test_run_live_or_frozen_input.csv @@ -0,0 +1,4 @@ +target,status +2,Clear +7,Check needed +1,Clear - overridden diff --git a/tests/imputation/test_imputation_flags.py b/tests/imputation/test_imputation_flags.py index cd111b1d..291ec763 100644 --- a/tests/imputation/test_imputation_flags.py +++ b/tests/imputation/test_imputation_flags.py @@ -44,6 +44,7 @@ def test_imputation_marker(self, imputation_flag_test_data): strata="strata", auxiliary="auxiliary", predictive_auxiliary="f_match_auxiliary", + back_data_period=111, ) df_expected_output.drop( columns=[ @@ -53,6 +54,8 @@ def test_imputation_marker(self, imputation_flag_test_data): ], inplace=True, ) + df_output.drop(columns=["is_backdata"], inplace=True) + assert_frame_equal(df_output, df_expected_output) def test_imputation_marker_manual_construction( @@ -79,6 +82,7 @@ def test_imputation_marker_manual_construction( reference="reference", strata="strata", auxiliary="auxiliary", + back_data_period=111, ) df_expected_output.drop( columns=[ @@ -88,5 +92,6 @@ def test_imputation_marker_manual_construction( ], inplace=True, ) + df_output.drop(columns=["is_backdata"], inplace=True) assert_frame_equal(df_output, df_expected_output) diff --git a/tests/imputation/test_ratio_of_means.py b/tests/imputation/test_ratio_of_means.py index ddc7e395..70d70cc3 100644 --- a/tests/imputation/test_ratio_of_means.py +++ b/tests/imputation/test_ratio_of_means.py @@ -57,9 +57,7 @@ ] -pytestmark = pytest.mark.parametrize("base_file_name", scenarios) - - +@pytest.mark.parametrize("base_file_name", scenarios) class TestRatioOfMeans: def test_ratio_of_means(self, base_file_name): @@ -91,6 +89,8 @@ def test_ratio_of_means(self, base_file_name): "backward": "b_link_question", "construction": "construction_link", }, + current_period=202001, + revision_period=10, ) else: actual_output = ratio_of_means( @@ -101,6 +101,8 @@ def test_ratio_of_means(self, base_file_name): strata="group", auxiliary="other", filters=filter_df, + current_period=202001, + revision_period=10, ) actual_output = actual_output.rename( @@ -149,7 +151,6 @@ def test_ratio_of_means(self, base_file_name): errors="ignore", inplace=True, ) - print(expected_output.columns) expected_output = expected_output[actual_output.columns] actual_output = actual_output.sort_values(by=["identifier", "date"]) @@ -166,16 +167,12 @@ def test_ratio_of_means(self, base_file_name): assert_frame_equal(actual_output, expected_output, check_dtype=False) -pytestmark = pytest.mark.parametrize( - "base_file_name", 
scenarios[len(scenarios) - 10 : len(scenarios)] -) - - +@pytest.mark.parametrize("mc_base_file_name", scenarios[-10:]) class TestRatioOfMeansManConstruction: - def test_manual_construction_input(self, base_file_name): - df = pd.read_csv(scenario_path_prefix + base_file_name + "_input.csv") + def test_manual_construction_input(self, mc_base_file_name): + df = pd.read_csv(scenario_path_prefix + mc_base_file_name + "_input.csv") expected_output = pd.read_csv( - scenario_path_prefix + base_file_name + "_output.csv" + scenario_path_prefix + mc_base_file_name + "_output.csv" ) manual_constructions = df.copy()[ @@ -199,6 +196,8 @@ def test_manual_construction_input(self, base_file_name): strata="group", auxiliary="other", manual_constructions=manual_constructions, + current_period=202001, + revision_period=10, ) expected_output["date"] = convert_column_to_datetime(expected_output["date"]) diff --git a/tests/imputation/test_ratio_of_means_back_data.py b/tests/imputation/test_ratio_of_means_back_data.py new file mode 100644 index 00000000..bcc011f9 --- /dev/null +++ b/tests/imputation/test_ratio_of_means_back_data.py @@ -0,0 +1,110 @@ +import pandas as pd +import pytest +from pandas.testing import assert_frame_equal + +from mbs_results.imputation.ratio_of_means import ratio_of_means + +scenario_path_prefix = "tests/data/imputation/back_data_testing/" + +scenarios = [ + "R_FIR_FIR", + "FIR_FIR_FIR", + "C_FIC_FIC", + "FIC_FIC_FIC", + "MC_FIMC_FIMC", + "FIMC_FIMC_FIMC", +] + + +pytestmark = pytest.mark.parametrize("base_file_name", scenarios) + + +class TestRatioOfMeans: + def test_ratio_of_means_back_data(self, base_file_name): + + input_data = pd.read_csv(scenario_path_prefix + base_file_name + "_input.csv") + expected_output = pd.read_csv( + scenario_path_prefix + base_file_name + "_output.csv" + ) + + # Can't use load_format helper, test cases have date instead of period + + input_data["date"] = pd.to_datetime(input_data["date"], format="%Y%m") + expected_output["date"] = pd.to_datetime(expected_output["date"], format="%Y%m") + + actual_output = ratio_of_means( + input_data, + target="question", + period="date", + reference="identifier", + strata="group", + auxiliary="other", + current_period=202003, + revision_period=2, + ) + + actual_output = actual_output.rename( + columns={ + "default_link_b_match_question": "default_backward", + "default_link_f_match_question": "default_forward", + "default_link_flag_construction_matches": "default_construction", + "flag_construction_matches_pair_count": "flag_match_pair_count", + } + ) + + actual_output = actual_output.drop(columns=["other"]) + + # if stays like this we need a function to load expected data + expected_output = expected_output.rename( + columns={ + "output": "question", + "forward": "f_link_question", + "backward": "b_link_question", + "construction": "construction_link", + "count_forward": "f_match_question_pair_count", + "count_backward": "b_match_question_pair_count", + "count_construction": "flag_match_pair_count", + } + ) + + actual_output.drop(columns=["question_man"], errors="ignore", inplace=True) + # Temp work around to drop mc column until its fully integrated + actual_output.drop( + columns=[ + "b_match_filtered_question", + "b_predictive_filtered_question", + "b_link_filtered_question", + "f_predictive_filtered_question", + "f_link_filtered_question", + "filtered_question", + "cumulative_b_link_filtered_question", + "cumulative_f_link_filtered_question", + ], + errors="ignore", + inplace=True, + ) + actual_output.drop( + 
columns=["forward", "backward", "construction"], + errors="ignore", + inplace=True, + ) + actual_output.drop( + columns=["is_backdata", "backdata_flags_question", "backdata_question"], + errors="ignore", + inplace=True, + ) + + expected_output = expected_output[actual_output.columns] + + actual_output = actual_output.sort_values(by=["identifier", "date"]) + expected_output = expected_output.sort_values(by=["identifier", "date"]) + + actual_output = actual_output.reset_index(drop=True) + expected_output = expected_output.reset_index(drop=True) + + expected_output["imputation_flags_question"] = expected_output[ + "imputation_flags_question" + ].str.lower() + expected_output = expected_output.replace({"bi": "bir"}) + + assert_frame_equal(actual_output, expected_output, check_dtype=False) diff --git a/tests/staging/test_data_cleaning.py b/tests/staging/test_data_cleaning.py index 4d530a00..6b8376ae 100644 --- a/tests/staging/test_data_cleaning.py +++ b/tests/staging/test_data_cleaning.py @@ -114,25 +114,25 @@ def test_create_imputation_class(filepath): def test_run_live_or_frozen(filepath): - df = pd.read_csv(filepath / "test_run_live_or_frozen.csv") + df_in = pd.read_csv(filepath / "test_run_live_or_frozen_input.csv") - df_in = df.drop(columns=["frozen", "frozen_error"]) - - live_ouput = run_live_or_frozen(df_in, "target", "error", "live") + expected_frozen_output = pd.read_csv( + filepath / "test_run_live_or_frozen_frozen_output.csv" + ) - frozen_output = run_live_or_frozen(df_in, "target", "error", "frozen") + expected_live_output = df_in.copy() - expected_output_frozen = df.copy() + live_ouput = run_live_or_frozen(df_in, "target", "status", "live") - expected_output_frozen.drop(columns=["frozen"], inplace=True) + frozen_output = run_live_or_frozen(df_in, "target", "status", "frozen") - assert_frame_equal(frozen_output, expected_output_frozen) - assert_frame_equal(live_ouput, df_in) + assert_frame_equal(frozen_output, expected_frozen_output) + assert_frame_equal(live_ouput, expected_live_output) def test_run_live_or_frozen_exception(filepath): - df = pd.read_csv(filepath / "test_run_live_or_frozen.csv") + df = pd.read_csv(filepath / "test_run_live_or_frozen_input.csv") with pytest.raises(ValueError): run_live_or_frozen(df, "target", "error", "love")
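For reference, the behaviour the reworked run_live_or_frozen tests above now pin down: in the frozen state the target is copied to a `live_<target>` column and any value whose status is in `error_values` (default `["Check needed"]`) is set to null, so flagged returns are treated as non-response downstream. A vectorised sketch of that branch (the function itself uses `df.apply`), using the columns from the new test CSVs:

```python
import pandas as pd

df = pd.DataFrame(
    {
        "target": [2, 7, 1],
        "status": ["Clear", "Check needed", "Clear - overridden"],
    }
)

error_values = ["Check needed"]

# Frozen state: keep a copy of the live values, then null anything still
# flagged for checking so it is imputed rather than used as a response.
df["live_target"] = df["target"].copy()
df["target"] = df["target"].where(~df["status"].isin(error_values))

# The "Check needed" row now has target NaN and live_target 7, matching
# tests/data/staging/data_cleaning/test_run_live_or_frozen_frozen_output.csv.
print(df)
```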