From 23a75fffebfe4125fbb147902f707ff82ac1a439 Mon Sep 17 00:00:00 2001 From: Jordan-Day-ONS <57715292+Jday7879@users.noreply.github.com> Date: Thu, 19 Dec 2024 16:07:29 +0000 Subject: [PATCH] 692 create population counts (#137) * creating "create population count functions" * change sampled to is_sampled and is_census * Create unit test, remove script entry point * Adding docstrings * Adding optional functionality to save output instead of returning * Renaming df's and implementing in apply_estimation * Update mbs_results/estimation/apply_estimation.py Co-authored-by: Wil Roberts <47739563+robertswh@users.noreply.github.com> * Update mbs_results/estimation/apply_estimation.py Co-authored-by: Wil Roberts <47739563+robertswh@users.noreply.github.com> * Update tests/estimation/test_create_population_counts.py Co-authored-by: Wil Roberts <47739563+robertswh@users.noreply.github.com> * Update create_population_counts.py * Formatting comments --------- Co-authored-by: Wil Roberts Co-authored-by: Wil Roberts <47739563+robertswh@users.noreply.github.com> --- mbs_results/estimation/apply_estimation.py | 16 +++- .../estimation/create_population_counts.py | 87 +++++++++++++++++++ .../estimation/pre_processing_estimation.py | 4 +- .../calculate_predicted_unit_value.py | 12 ++- .../calculate_ratio_estimation.py | 8 +- .../calculate_winsorised_weight.py | 9 +- .../outlier_detection/winsorisation.py | 1 - .../derive_estimation_variables.csv | 8 +- .../predicted_unit_value_data.csv | 30 +++---- .../predicted_unit_value_output.csv | 30 +++---- .../ratio_estimation_data.csv | 26 +++--- .../ratio_estimation_data_output.csv | 26 +++--- .../winsorised_weight_data.csv | 30 +++---- .../winsorised_weight_data_output.csv | 30 +++---- .../winsorised_weight_data_output.csv | 30 +++---- .../test_create_population_counts.py | 61 +++++++++++++ .../test_pre_processing_estimation.py | 4 +- .../test_calculate_predicted_unit_value.py | 6 +- .../test_calculate_ratio_estimation.py | 6 +- 
.../test_calculate_winsorised_weight.py | 9 +- tests/outlier_detection/test_winsorisation.py | 4 +- 21 files changed, 295 insertions(+), 142 deletions(-) create mode 100644 mbs_results/estimation/create_population_counts.py create mode 100644 tests/estimation/test_create_population_counts.py diff --git a/mbs_results/estimation/apply_estimation.py b/mbs_results/estimation/apply_estimation.py index bd14c978..df02f7fb 100644 --- a/mbs_results/estimation/apply_estimation.py +++ b/mbs_results/estimation/apply_estimation.py @@ -6,6 +6,9 @@ calculate_calibration_factor, calculate_design_weight, ) +from mbs_results.estimation.create_population_counts import ( + create_population_count_output, +) from mbs_results.estimation.pre_processing_estimation import get_estimation_data from mbs_results.staging.data_cleaning import is_census @@ -71,7 +74,13 @@ def apply_estimation( census_df["design_weight"] = 1 census_df["calibration_factor"] = 1 - census_df["sampled"] = 0 + census_df["is_sampled"] = True + census_df["is_census"] = True + # is_census: bool, to distinguish fully sampled (i.e. census) strata from + # non-census strata. Used in outlier detection so census strata are + # not winsorised. + # is_sampled: bool. 
This is used to distinguish sampled refs from non-sampled + # refs in population non_census_df = estimation_data[ ~( @@ -83,6 +92,7 @@ def apply_estimation( non_census_df = calculate_design_weight(non_census_df, period, **config) non_census_df = calculate_calibration_factor(non_census_df, period, **config) + non_census_df["is_census"] = False all_together = pd.concat([non_census_df, census_df], ignore_index=True) @@ -90,6 +100,10 @@ def apply_estimation( estimation_df = pd.concat(estimation_df_list, ignore_index=True) + create_population_count_output( + estimation_df, period, calibration_group, save_output=True, **config + ) + # validate_estimation(estimation_df, **config) return estimation_df diff --git a/mbs_results/estimation/create_population_counts.py b/mbs_results/estimation/create_population_counts.py new file mode 100644 index 00000000..3c00ba2b --- /dev/null +++ b/mbs_results/estimation/create_population_counts.py @@ -0,0 +1,87 @@ +import pandas as pd + + +def calculate_turnover_sum_count( + df: pd.DataFrame, period: str, strata: str, colname: str, **config +) -> pd.DataFrame: + """ + Calculates turnover sum and count and returns an aggregated dataframe + with the given column name prefixed to the sum and count columns + + Parameters + ---------- + df : pd.DataFrame + original dataframe containing frotover. 
Groups by period and strata + period : str + period column name + strata : str + strata column name + colname : str + column name to prefix to the sum and count columns + + Returns + ------- + pd.DataFrame + A grouped dataframe with the sum and count columns prefixed with colname + """ + + df_pop_count = ( + df.groupby([period, strata]) + .agg(summing=("frotover", "sum"), count=("reference", "size")) + .reset_index() + ) + + df_pop_count.rename( + columns={"summing": f"{colname}_turnover_sum", "count": f"{colname}_count"}, + inplace=True, + ) + + return df_pop_count + + +def create_population_count_output( + df: pd.DataFrame, + period: str, + strata: str, + output_path: str = "", + save_output: bool = False, + **config: dict, +) -> pd.DataFrame: + """ + creates the population count output + + Parameters + ---------- + df : pd.DataFrame + original dataframe containing frotover and is_sampled. Groups by period and strata + period : str + period column name + strata : str + strata column name + output_path : str, optional + Output path to save dataframe + save_output : bool, optional + Default False. If True, saves the output to output_path + + Returns + ------- + pd.DataFrame + A grouped dataframe with the sum and count columns prefixed with colname. + Contains both population and sampled sum and counts for output.
+ Returns None if save_output is True + """ + + df_population = calculate_turnover_sum_count( + df, period, strata, colname="population", **config + ) + + df_sampled = calculate_turnover_sum_count( + df.loc[df["is_sampled"]], period, strata, colname="sample", **config + ) + combined = pd.merge(df_population, df_sampled, on=[period, strata]) + + if save_output: + combined.to_csv(output_path + "population_counts.csv", index=False) + return + else: + return combined diff --git a/mbs_results/estimation/pre_processing_estimation.py b/mbs_results/estimation/pre_processing_estimation.py index 06e40a3c..2413a317 100644 --- a/mbs_results/estimation/pre_processing_estimation.py +++ b/mbs_results/estimation/pre_processing_estimation.py @@ -110,8 +110,8 @@ def derive_estimation_variables( ) sample = sample.copy()[[reference, period]] - sample["sampled"] = 1 + sample["is_sampled"] = True return population_frame.merge(sample, on=[reference, period], how="left").fillna( - value={"sampled": 0} + value={"is_sampled": False} ) diff --git a/mbs_results/outlier_detection/calculate_predicted_unit_value.py b/mbs_results/outlier_detection/calculate_predicted_unit_value.py index 014e8860..74025148 100644 --- a/mbs_results/outlier_detection/calculate_predicted_unit_value.py +++ b/mbs_results/outlier_detection/calculate_predicted_unit_value.py @@ -2,7 +2,7 @@ def calculate_predicted_unit_value( - df, group, period, aux, sampled, a_weight, target_variable, nw_ag_flag + df, group, period, aux, is_census, a_weight, target_variable, nw_ag_flag ): """ Calculate predicted unit value @@ -17,8 +17,8 @@ def calculate_predicted_unit_value( Column name containing time period. aux : str Column name containing auxiliary variable (x). - sampled : str - Column name indicating whether it was sampled or not -boolean. + is_census : bool + Column name indicating whether the reference belongs to a cell that is a census. a_weight : str Column name containing the design weight.
target_variable : str @@ -32,7 +32,7 @@ def calculate_predicted_unit_value( A pandas DataFrame with a new column containing the predicted unit value. """ - winsorised = (df[sampled] == 1) & (df[nw_ag_flag] == False) # noqa: E712 + winsorised = (~df[is_census]) & (~df[nw_ag_flag]) filtered_df = df.loc[winsorised] filtered_df["weighted_target_values"] = ( @@ -69,9 +69,7 @@ def calculate_predicted_unit_value( ["sum_weighted_target_values", "sum_weighted_auxiliary_values"], axis=1 ) - non_winsorised = (final_df[sampled] == 0) | ( - final_df[nw_ag_flag] == True # noqa: E712 - ) + non_winsorised = (final_df[is_census]) | (final_df[nw_ag_flag]) final_df["predicted_unit_value"] = final_df["predicted_unit_value"].mask( non_winsorised, np.nan ) diff --git a/mbs_results/outlier_detection/calculate_ratio_estimation.py b/mbs_results/outlier_detection/calculate_ratio_estimation.py index 88354378..6b15b916 100644 --- a/mbs_results/outlier_detection/calculate_ratio_estimation.py +++ b/mbs_results/outlier_detection/calculate_ratio_estimation.py @@ -4,7 +4,7 @@ def calculate_ratio_estimation( df, aux, - sampled, + is_census, a_weight, g_weight, target_variable, @@ -20,8 +20,8 @@ def calculate_ratio_estimation( Original dataframe. aux : str Column name containing auxiliary variable (x). - sampled : str - Column name indicating whether it was sampled or not -boolean. + is_census : bool + Column name indicating whether a reference belongs to a cell that is a census. a_weight : str Column name containing the design weight. 
g_weight : str @@ -48,7 +48,7 @@ def calculate_ratio_estimation( ) df = df.drop("flag_calculation", axis=1) - non_winsorised = (df[sampled] == 0) | (df[nw_ag_flag] == True) # noqa: E712 + non_winsorised = (df[is_census]) | (df[nw_ag_flag]) df["ratio_estimation_treshold"] = df["ratio_estimation_treshold"].mask( non_winsorised, np.nan ) diff --git a/mbs_results/outlier_detection/calculate_winsorised_weight.py b/mbs_results/outlier_detection/calculate_winsorised_weight.py index 3ae9c9bf..acdaac56 100644 --- a/mbs_results/outlier_detection/calculate_winsorised_weight.py +++ b/mbs_results/outlier_detection/calculate_winsorised_weight.py @@ -6,12 +6,11 @@ def calculate_winsorised_weight( group, period, aux, - sampled, + is_census, a_weight, g_weight, target_variable, predicted_unit_value, - l_values, ratio_estimation_treshold, nw_ag_flag, ): @@ -29,7 +28,7 @@ def calculate_winsorised_weight( aux : str Column name containing auxiliary variable (x). sampled : str - Column name indicating whether it was sampled or not -boolean. + Column name indicating whether a reference belongs to a cell that is a census. a_weight : str Column name containing the design weight. g_weight:str @@ -38,8 +37,6 @@ def calculate_winsorised_weight( Column name of the predicted target variable. predicted_unit_value: str column name containing the predicted unit value. - l_values: str - column name containing the l values as provided by methodology. ratio_estimation_treshold: str column name containing the previously calculated ratio estimation threshold. 
nw_ag_flag: str @@ -66,7 +63,7 @@ def calculate_winsorised_weight( df = df.drop(["w", "new_target"], axis=1) - non_winsorised = (df[sampled] == 0) | (df[nw_ag_flag]) + non_winsorised = (df[is_census]) | (df[nw_ag_flag]) division_with_0 = ~non_winsorised & (df[target_variable] == 0) diff --git a/mbs_results/outlier_detection/winsorisation.py b/mbs_results/outlier_detection/winsorisation.py index 9c13faa7..381c5cde 100644 --- a/mbs_results/outlier_detection/winsorisation.py +++ b/mbs_results/outlier_detection/winsorisation.py @@ -94,7 +94,6 @@ def winsorise( g_weight, target_variable, "predicted_unit_value", - l_values, "ratio_estimation_treshold", "nw_ag_flag", ) diff --git a/tests/data/estimation/pre_processing_estimation/derive_estimation_variables.csv b/tests/data/estimation/pre_processing_estimation/derive_estimation_variables.csv index 7f8722ce..bb58561b 100644 --- a/tests/data/estimation/pre_processing_estimation/derive_estimation_variables.csv +++ b/tests/data/estimation/pre_processing_estimation/derive_estimation_variables.csv @@ -1,4 +1,4 @@ -reference,cell_no,auxiliary,period,sampled,calibration_group -11111111111,123456,1111111111111,202401,1,123456 -22222222222,234567,2222222222222,202401,1,123456 -33333333333,345678,3333333333333,202401,0,345678 +reference,cell_no,auxiliary,period,is_sampled,calibration_group +11111111111,123456,1111111111111,202401,True,123456 +22222222222,234567,2222222222222,202401,True,123456 +33333333333,345678,3333333333333,202401,False,345678 diff --git a/tests/data/outlier_detection/calculate_predicted_unit_value/predicted_unit_value_data.csv b/tests/data/outlier_detection/calculate_predicted_unit_value/predicted_unit_value_data.csv index 6b1ecceb..46e1f31c 100755 --- a/tests/data/outlier_detection/calculate_predicted_unit_value/predicted_unit_value_data.csv +++ b/tests/data/outlier_detection/calculate_predicted_unit_value/predicted_unit_value_data.csv @@ -1,15 +1,15 @@ 
-group,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag -101,202401,10,0,1.666666667,1.023809524,12,False -101,202401,23,1,1.666666667,1.023809524,20,False -101,202401,41,1,1.666666667,1.023809524,20,False -101,202402,53,1,1.666666667,1.023809524,40,False -101,202401,12,0,1.666666667,1.023809524,10,False -102,202401,50,1,2.5,1.023809524,60,False -102,202402,40,1,2.5,1.023809524,50,False -102,202401,45,0,2.5,1.023809524,50,False -102,202401,70,0,2.5,1.023809524,60,False -102,202401,86,0,2.5,1.023809524,90,False -103,202401,20,0,0.32,0.004,90,True -103,202401,30,0,0.32,0.004,90,True -104,202401,20,0,,0.004,90,False -104,202401,30,0,,0.004,90,False +group,period,aux,is_census,a_weight,g_weight,target_variable,nw_ag_flag +101,202401,10,True,1.666666667,1.023809524,12,False +101,202401,23,False,1.666666667,1.023809524,20,False +101,202401,41,False,1.666666667,1.023809524,20,False +101,202402,53,False,1.666666667,1.023809524,40,False +101,202401,12,True,1.666666667,1.023809524,10,False +102,202401,50,False,2.5,1.023809524,60,False +102,202402,40,False,2.5,1.023809524,50,False +102,202401,45,True,2.5,1.023809524,50,False +102,202401,70,True,2.5,1.023809524,60,False +102,202401,86,True,2.5,1.023809524,90,False +103,202401,20,True,0.32,0.004,90,True +103,202401,30,True,0.32,0.004,90,True +104,202401,20,True,,0.004,90,False +104,202401,30,True,,0.004,90,False diff --git a/tests/data/outlier_detection/calculate_predicted_unit_value/predicted_unit_value_output.csv b/tests/data/outlier_detection/calculate_predicted_unit_value/predicted_unit_value_output.csv index 8edf80d8..f9e1477c 100755 --- a/tests/data/outlier_detection/calculate_predicted_unit_value/predicted_unit_value_output.csv +++ b/tests/data/outlier_detection/calculate_predicted_unit_value/predicted_unit_value_output.csv @@ -1,15 +1,15 @@ -group,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value -101,202401,10,0,1.666666667,1.023809524,12,False, 
-101,202401,23,1,1.666666667,1.023809524,20,False,14.375 -101,202401,41,1,1.666666667,1.023809524,20,False,25.625 -101,202402,53,1,1.666666667,1.023809524,40,False,40 -101,202401,12,0,1.666666667,1.023809524,10,False, -102,202401,50,1,2.5,1.023809524,60,False,60 -102,202402,40,1,2.5,1.023809524,50,False,50 -102,202401,45,0,2.5,1.023809524,50,False, -102,202401,70,0,2.5,1.023809524,60,False, -102,202401,86,0,2.5,1.023809524,90,False, -103,202401,20,0,0.32,0.004,90,True, -103,202401,30,0,0.32,0.004,90,True, -104,202401,20,0,,0.004,90,False, -104,202401,30,0,,0.004,90,False, +group,period,aux,is_census,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value +101,202401,10,True,1.666666667,1.023809524,12,False, +101,202401,23,False,1.666666667,1.023809524,20,False,14.375 +101,202401,41,False,1.666666667,1.023809524,20,False,25.625 +101,202402,53,False,1.666666667,1.023809524,40,False,40 +101,202401,12,True,1.666666667,1.023809524,10,False, +102,202401,50,False,2.5,1.023809524,60,False,60 +102,202402,40,False,2.5,1.023809524,50,False,50 +102,202401,45,True,2.5,1.023809524,50,False, +102,202401,70,True,2.5,1.023809524,60,False, +102,202401,86,True,2.5,1.023809524,90,False, +103,202401,20,True,0.32,0.004,90,True, +103,202401,30,True,0.32,0.004,90,True, +104,202401,20,True,,0.004,90,False, +104,202401,30,True,,0.004,90,False, diff --git a/tests/data/outlier_detection/calculate_ratio_estimation/ratio_estimation_data.csv b/tests/data/outlier_detection/calculate_ratio_estimation/ratio_estimation_data.csv index 10b3c9b8..5bd7e0ca 100755 --- a/tests/data/outlier_detection/calculate_ratio_estimation/ratio_estimation_data.csv +++ b/tests/data/outlier_detection/calculate_ratio_estimation/ratio_estimation_data.csv @@ -1,13 +1,13 @@ -group,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_value -101,202401,10,0,1.666666667,1.023809524,12,False,, -101,202401,23,1,1.666666667,1.023809524,20,False,14.375,0.5 
-101,202401,41,1,1.666666667,1.023809524,20,False,25.625,0.5 -101,202402,53,1,1.666666667,1.023809524,40,False,40,0.5 -101,202401,12,0,1.666666667,1.023809524,10,False,, -102,202401,50,1,2.5,1.023809524,60,False,60,0.5 -102,202402,40,1,2.5,1.023809524,50,False,50,0.5 -102,202401,45,0,2.5,1.023809524,50,False,, -102,202401,70,0,2.5,1.023809524,60,False,, -102,202401,86,0,2.5,1.023809524,90,False,, -104,202401,20,0,,0.004,90,False,, -104,202401,30,0,,0.004,90,False,, +group,period,aux,is_census,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_value +101,202401,10,True,1.666666667,1.023809524,12,False,, +101,202401,23,False,1.666666667,1.023809524,20,False,14.375,0.5 +101,202401,41,False,1.666666667,1.023809524,20,False,25.625,0.5 +101,202402,53,False,1.666666667,1.023809524,40,False,40,0.5 +101,202401,12,True,1.666666667,1.023809524,10,False,, +102,202401,50,False,2.5,1.023809524,60,False,60,0.5 +102,202402,40,False,2.5,1.023809524,50,False,50,0.5 +102,202401,45,True,2.5,1.023809524,50,False,, +102,202401,70,True,2.5,1.023809524,60,False,, +102,202401,86,True,2.5,1.023809524,90,False,, +104,202401,20,True,,0.004,90,False,, +104,202401,30,True,,0.004,90,False,, diff --git a/tests/data/outlier_detection/calculate_ratio_estimation/ratio_estimation_data_output.csv b/tests/data/outlier_detection/calculate_ratio_estimation/ratio_estimation_data_output.csv index eccf0587..66d3d6f3 100755 --- a/tests/data/outlier_detection/calculate_ratio_estimation/ratio_estimation_data_output.csv +++ b/tests/data/outlier_detection/calculate_ratio_estimation/ratio_estimation_data_output.csv @@ -1,13 +1,13 @@ -group,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_value,ratio_estimation_treshold -101,202401,10,0,1.666666667,1.023809524,12,False,,, -101,202401,23,1,1.666666667,1.023809524,20,False,14.375,0.5,15.0828652 -101,202401,41,1,1.666666667,1.023809524,20,False,25.625,0.5,26.3328652 
-101,202402,53,1,1.666666667,1.023809524,40,False,40,0.5,40.7078652 -101,202401,12,0,1.666666667,1.023809524,10,False,,, -102,202401,50,1,2.5,1.023809524,60,False,60,0.5,60.3206107 -102,202402,40,1,2.5,1.023809524,50,False,50,0.5,50.3206107 -102,202401,45,0,2.5,1.023809524,50,False,,, -102,202401,70,0,2.5,1.023809524,60,False,,, -102,202401,86,0,2.5,1.023809524,90,False,,, -104,202401,20,0,,0.004,90,False,,, -104,202401,30,0,,0.004,90,False,,, +group,period,aux,is_census,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_value,ratio_estimation_treshold +101,202401,10,True,1.666666667,1.023809524,12,False,,, +101,202401,23,False,1.666666667,1.023809524,20,False,14.375,0.5,15.0828652 +101,202401,41,False,1.666666667,1.023809524,20,False,25.625,0.5,26.3328652 +101,202402,53,False,1.666666667,1.023809524,40,False,40,0.5,40.7078652 +101,202401,12,True,1.666666667,1.023809524,10,False,,, +102,202401,50,False,2.5,1.023809524,60,False,60,0.5,60.3206107 +102,202402,40,False,2.5,1.023809524,50,False,50,0.5,50.3206107 +102,202401,45,True,2.5,1.023809524,50,False,,, +102,202401,70,True,2.5,1.023809524,60,False,,, +102,202401,86,True,2.5,1.023809524,90,False,,, +104,202401,20,True,,0.004,90,False,,, +104,202401,30,True,,0.004,90,False,,, diff --git a/tests/data/outlier_detection/calculate_winsorised_weight/winsorised_weight_data.csv b/tests/data/outlier_detection/calculate_winsorised_weight/winsorised_weight_data.csv index 99f54ee9..1d57f86c 100755 --- a/tests/data/outlier_detection/calculate_winsorised_weight/winsorised_weight_data.csv +++ b/tests/data/outlier_detection/calculate_winsorised_weight/winsorised_weight_data.csv @@ -1,15 +1,15 @@ -group,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_value,ratio_estimation_treshold -101,202401,10,0,1.666666667,1.023809524,12,False,,, -101,202401,23,1,1.666666667,1.023809524,20,False,14.375,0.5,15.0828652 -101,202401,41,1,1.666666667,1.023809524,20,False,25.625,0.5,26.3328652 
-101,202402,53,1,1.666666667,1.023809524,40,False,40,0.5,40.7078652 -101,202401,12,0,1.666666667,1.023809524,10,False,,, -102,202401,50,1,2.5,1.023809524,60,False,60,0.5,60.3206107 -102,202402,40,1,2.5,1.023809524,50,False,50,0.5,50.3206107 -102,202401,45,0,2.5,1.023809524,50,False,,, -102,202401,70,0,2.5,1.023809524,60,False,,, -102,202401,86,0,2.5,1.023809524,90,False,,, -104,202401,20,0,,0.004,90,False,,, -104,202401,30,0,,0.004,90,False,,, -104,202401,30,1,1,0.004,90,True,,, -104,202402,30,1,1,2,0,False,0,, +group,period,aux,is_census,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_value,ratio_estimation_treshold +101,202401,10,True,1.666666667,1.023809524,12,False,,, +101,202401,23,False,1.666666667,1.023809524,20,False,14.375,0.5,15.0828652 +101,202401,41,False,1.666666667,1.023809524,20,False,25.625,0.5,26.3328652 +101,202402,53,False,1.666666667,1.023809524,40,False,40,0.5,40.7078652 +101,202401,12,True,1.666666667,1.023809524,10,False,,, +102,202401,50,False,2.5,1.023809524,60,False,60,0.5,60.3206107 +102,202402,40,False,2.5,1.023809524,50,False,50,0.5,50.3206107 +102,202401,45,True,2.5,1.023809524,50,False,,, +102,202401,70,True,2.5,1.023809524,60,False,,, +102,202401,86,True,2.5,1.023809524,90,False,,, +104,202401,20,True,,0.004,90,False,,, +104,202401,30,True,,0.004,90,False,,, +104,202401,30,False,1,0.004,90,True,,, +104,202402,30,False,1,2,0,False,0,, diff --git a/tests/data/outlier_detection/calculate_winsorised_weight/winsorised_weight_data_output.csv b/tests/data/outlier_detection/calculate_winsorised_weight/winsorised_weight_data_output.csv index 5434f2ca..1fcf782f 100755 --- a/tests/data/outlier_detection/calculate_winsorised_weight/winsorised_weight_data_output.csv +++ b/tests/data/outlier_detection/calculate_winsorised_weight/winsorised_weight_data_output.csv @@ -1,15 +1,15 @@ 
-group,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_value,ratio_estimation_treshold,new_target_variable,outlier_weight -101,202401,10,0,1.666666667,1.023809524,12,False,,,,,1 -101,202401,23,1,1.666666667,1.023809524,20,False,14.375,0.5,15.0828652,17.96453488,0.898227 -101,202401,41,1,1.666666667,1.023809524,20,False,25.625,0.5,26.3328652,20,1 -101,202402,53,1,1.666666667,1.023809524,40,False,40,0.5,40.7078652,40,1 -101,202401,12,0,1.666666667,1.023809524,10,False,,,,,1 -102,202401,50,1,2.5,1.023809524,60,False,60,0.5,60.3206107,60,1 -102,202402,40,1,2.5,1.023809524,50,False,50,0.5,50.3206107,50,1 -102,202401,45,0,2.5,1.023809524,50,False,,,,,1 -102,202401,70,0,2.5,1.023809524,60,False,,,,,1 -102,202401,86,0,2.5,1.023809524,90,False,,,,,1 -104,202401,20,0,,0.004,90,False,,,,,1 -104,202401,30,0,,0.004,90,False,,,,,1 -104,202401,30,1,1,0.004,90,True,,,,,1 -104,202402,30,1,1,2,0,False,0,,,,1 +group,period,aux,is_census,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_value,ratio_estimation_treshold,new_target_variable,outlier_weight +101,202401,10,True,1.666666667,1.023809524,12,False,,,,,1 +101,202401,23,False,1.666666667,1.023809524,20,False,14.375,0.5,15.0828652,17.96453488,0.898227 +101,202401,41,False,1.666666667,1.023809524,20,False,25.625,0.5,26.3328652,20,1 +101,202402,53,False,1.666666667,1.023809524,40,False,40,0.5,40.7078652,40,1 +101,202401,12,True,1.666666667,1.023809524,10,False,,,,,1 +102,202401,50,False,2.5,1.023809524,60,False,60,0.5,60.3206107,60,1 +102,202402,40,False,2.5,1.023809524,50,False,50,0.5,50.3206107,50,1 +102,202401,45,True,2.5,1.023809524,50,False,,,,,1 +102,202401,70,True,2.5,1.023809524,60,False,,,,,1 +102,202401,86,True,2.5,1.023809524,90,False,,,,,1 +104,202401,20,True,,0.004,90,False,,,,,1 +104,202401,30,True,,0.004,90,False,,,,,1 +104,202401,30,False,1,0.004,90,True,,,,,1 +104,202402,30,False,1,2,0,False,0,,,,1 diff --git 
a/tests/data/outlier_detection/test_winsorisation/winsorised_weight_data_output.csv b/tests/data/outlier_detection/test_winsorisation/winsorised_weight_data_output.csv index 5434f2ca..1fcf782f 100755 --- a/tests/data/outlier_detection/test_winsorisation/winsorised_weight_data_output.csv +++ b/tests/data/outlier_detection/test_winsorisation/winsorised_weight_data_output.csv @@ -1,15 +1,15 @@ -group,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_value,ratio_estimation_treshold,new_target_variable,outlier_weight -101,202401,10,0,1.666666667,1.023809524,12,False,,,,,1 -101,202401,23,1,1.666666667,1.023809524,20,False,14.375,0.5,15.0828652,17.96453488,0.898227 -101,202401,41,1,1.666666667,1.023809524,20,False,25.625,0.5,26.3328652,20,1 -101,202402,53,1,1.666666667,1.023809524,40,False,40,0.5,40.7078652,40,1 -101,202401,12,0,1.666666667,1.023809524,10,False,,,,,1 -102,202401,50,1,2.5,1.023809524,60,False,60,0.5,60.3206107,60,1 -102,202402,40,1,2.5,1.023809524,50,False,50,0.5,50.3206107,50,1 -102,202401,45,0,2.5,1.023809524,50,False,,,,,1 -102,202401,70,0,2.5,1.023809524,60,False,,,,,1 -102,202401,86,0,2.5,1.023809524,90,False,,,,,1 -104,202401,20,0,,0.004,90,False,,,,,1 -104,202401,30,0,,0.004,90,False,,,,,1 -104,202401,30,1,1,0.004,90,True,,,,,1 -104,202402,30,1,1,2,0,False,0,,,,1 +group,period,aux,is_census,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_value,ratio_estimation_treshold,new_target_variable,outlier_weight +101,202401,10,True,1.666666667,1.023809524,12,False,,,,,1 +101,202401,23,False,1.666666667,1.023809524,20,False,14.375,0.5,15.0828652,17.96453488,0.898227 +101,202401,41,False,1.666666667,1.023809524,20,False,25.625,0.5,26.3328652,20,1 +101,202402,53,False,1.666666667,1.023809524,40,False,40,0.5,40.7078652,40,1 +101,202401,12,True,1.666666667,1.023809524,10,False,,,,,1 +102,202401,50,False,2.5,1.023809524,60,False,60,0.5,60.3206107,60,1 
+102,202402,40,False,2.5,1.023809524,50,False,50,0.5,50.3206107,50,1 +102,202401,45,True,2.5,1.023809524,50,False,,,,,1 +102,202401,70,True,2.5,1.023809524,60,False,,,,,1 +102,202401,86,True,2.5,1.023809524,90,False,,,,,1 +104,202401,20,True,,0.004,90,False,,,,,1 +104,202401,30,True,,0.004,90,False,,,,,1 +104,202401,30,False,1,0.004,90,True,,,,,1 +104,202402,30,False,1,2,0,False,0,,,,1 diff --git a/tests/estimation/test_create_population_counts.py b/tests/estimation/test_create_population_counts.py new file mode 100644 index 00000000..49bb66c7 --- /dev/null +++ b/tests/estimation/test_create_population_counts.py @@ -0,0 +1,61 @@ +import pandas as pd + +from mbs_results.estimation.create_population_counts import ( + calculate_turnover_sum_count, + create_population_count_output, +) + + +def test_calculate_turnover_sum_count(): + # Creating input data + df = pd.DataFrame( + { + "frotover": [1, 2, 3, 4, 5], + "reference": [1, 1, 2, 2, 3], + "period": [1, 1, 1, 1, 1], + "strata": ["A", "A", "B", "B", "C"], + } + ) + + # producing output + output = calculate_turnover_sum_count(df, "period", "strata", "population") + + # creating expected output + expected_output = pd.DataFrame( + { + "period": [1, 1, 1], + "strata": ["A", "B", "C"], + "population_turnover_sum": [3, 7, 5], + "population_count": [2, 2, 1], + } + ) + pd.testing.assert_frame_equal(output, expected_output) + + +def test_create_population_count_output(): + # Creating input data + df = pd.DataFrame( + { + "frotover": [1, 2, 3, 4, 5], + "reference": [1, 1, 2, 2, 3], + "period": [1, 1, 1, 1, 1], + "strata": ["A", "A", "B", "B", "C"], + "is_sampled": [True, False, True, False, True], + } + ) + + # producing output + output = create_population_count_output(df, "period", "strata") + + # creating expected output + expected_output = pd.DataFrame( + { + "period": [1, 1, 1], + "strata": ["A", "B", "C"], + "population_turnover_sum": [3, 7, 5], + "population_count": [2, 2, 1], + "sample_turnover_sum": [1, 3, 5], + 
"sample_count": [1, 1, 1], + } + ) + pd.testing.assert_frame_equal(output, expected_output) diff --git a/tests/estimation/test_pre_processing_estimation.py b/tests/estimation/test_pre_processing_estimation.py index 0d67f564..9fd2b103 100644 --- a/tests/estimation/test_pre_processing_estimation.py +++ b/tests/estimation/test_pre_processing_estimation.py @@ -26,10 +26,10 @@ def test_derive_estimation_variables(self, derive_estimation_variables_data): "cell_no", "calibration_group", "auxiliary", - "sampled", + "is_sampled", ] ] - population_frame = expected.drop(columns=["calibration_group", "sampled"]) + population_frame = expected.drop(columns=["calibration_group", "is_sampled"]) sample = population_frame.loc[:1, ["reference", "period"]] calibration_group_map = expected[["cell_no", "calibration_group"]] diff --git a/tests/outlier_detection/test_calculate_predicted_unit_value.py b/tests/outlier_detection/test_calculate_predicted_unit_value.py index e66b74f0..44580147 100644 --- a/tests/outlier_detection/test_calculate_predicted_unit_value.py +++ b/tests/outlier_detection/test_calculate_predicted_unit_value.py @@ -44,7 +44,7 @@ def test_calculate_predicted_unit_value( "group", "period", "aux", - "sampled", + "is_census", "a_weight", "target_variable", "nw_ag_flag", @@ -56,7 +56,7 @@ def test_calculate_predicted_unit_value( "group", "period", "aux", - "sampled", + "is_census", "a_weight", "target_variable", "nw_ag_flag", @@ -68,7 +68,7 @@ def test_calculate_predicted_unit_value( "group", "period", "aux", - "sampled", + "is_census", "a_weight", "target_variable", "nw_ag_flag", diff --git a/tests/outlier_detection/test_calculate_ratio_estimation.py b/tests/outlier_detection/test_calculate_ratio_estimation.py index 9c344146..22a4421b 100644 --- a/tests/outlier_detection/test_calculate_ratio_estimation.py +++ b/tests/outlier_detection/test_calculate_ratio_estimation.py @@ -42,7 +42,7 @@ def test_calculate_ratio_estimation( expected_output = ratio_estimation_test_output[ [ 
"aux", - "sampled", + "is_census", "a_weight", "g_weight", "target_variable", @@ -56,7 +56,7 @@ def test_calculate_ratio_estimation( input_data = ratio_estimation_test_data[ [ "aux", - "sampled", + "is_census", "a_weight", "g_weight", "target_variable", @@ -69,7 +69,7 @@ def test_calculate_ratio_estimation( actual_output = calculate_ratio_estimation( input_data, "aux", - "sampled", + "is_census", "a_weight", "g_weight", "target_variale", diff --git a/tests/outlier_detection/test_calculate_winsorised_weight.py b/tests/outlier_detection/test_calculate_winsorised_weight.py index 750c0ad5..de5ad9c6 100644 --- a/tests/outlier_detection/test_calculate_winsorised_weight.py +++ b/tests/outlier_detection/test_calculate_winsorised_weight.py @@ -44,12 +44,11 @@ def test_winsorised_weight( "group", "period", "aux", - "sampled", + "is_census", "a_weight", "g_weight", "target_variable", "predicted_unit_value", - "l_value", "ratio_estimation_treshold", "nw_ag_flag", "new_target_variable", @@ -61,12 +60,11 @@ def test_winsorised_weight( "group", "period", "aux", - "sampled", + "is_census", "a_weight", "g_weight", "target_variable", "predicted_unit_value", - "l_value", "ratio_estimation_treshold", "nw_ag_flag", ] @@ -77,12 +75,11 @@ def test_winsorised_weight( "group", "period", "aux", - "sampled", + "is_census", "a_weight", "g_weight", "target_variable", "predicted_unit_value", - "l_value", "ratio_estimation_treshold", "nw_ag_flag", ) diff --git a/tests/outlier_detection/test_winsorisation.py b/tests/outlier_detection/test_winsorisation.py index 70f0de06..c14c7234 100644 --- a/tests/outlier_detection/test_winsorisation.py +++ b/tests/outlier_detection/test_winsorisation.py @@ -28,7 +28,7 @@ def test_winsorised_weight(self, expected_output): "group", "period", "aux", - "sampled", + "is_census", "a_weight", "g_weight", "target_variable", @@ -41,7 +41,7 @@ def test_winsorised_weight(self, expected_output): "group", "period", "aux", - "sampled", + "is_census", "a_weight", "g_weight", 
"target_variable",