692 create population counts (#137)

* creating "create population count functions" * change sampled to is_sampled and is_census * Create unit test, remove script entry point * Adding docstrings * Adding optional functionality to save output instead of returning * Renaming df's and implementing in apply_estimation * Update mbs_results/estimation/apply_estimation.py Co-authored-by: Wil Roberts <[email protected]> * Update mbs_results/estimation/apply_estimation.py Co-authored-by: Wil Roberts <[email protected]> * Update tests/estimation/test_create_population_counts.py Co-authored-by: Wil Roberts <[email protected]> * Update create_population_counts.py * Formatting comments --------- Co-authored-by: Wil Roberts <[email protected]> Co-authored-by: Wil Roberts <[email protected]>
ONSdigital · Dec 19, 2024 · 23a75ff · 23a75ff
1 parent 6c2529d
commit 23a75ff
Show file tree

Hide file tree

Showing 21 changed files with 295 additions and 142 deletions.
diff --git a/mbs_results/estimation/apply_estimation.py b/mbs_results/estimation/apply_estimation.py
@@ -6,6 +6,9 @@
     calculate_calibration_factor,
     calculate_design_weight,
 )
+from mbs_results.estimation.create_population_counts import (
+    create_population_count_output,
+)
 from mbs_results.estimation.pre_processing_estimation import get_estimation_data
 from mbs_results.staging.data_cleaning import is_census
 
@@ -71,7 +74,13 @@ def apply_estimation(
 
         census_df["design_weight"] = 1
         census_df["calibration_factor"] = 1
-        census_df["sampled"] = 0
+        census_df["is_sampled"] = True
+        census_df["is_census"] = True
+        # is_census: bool, to distinguish fully sampled (i.e. census) strata from
+        # non-census strata. Used in outlier detection so census strata are
+        # not winsorised.
+        # is_sampled: bool. This is used to distinguish sampled refs from non-sampled
+        # refs in population
 
         non_census_df = estimation_data[
             ~(
@@ -83,13 +92,18 @@ def apply_estimation(
 
         non_census_df = calculate_design_weight(non_census_df, period, **config)
         non_census_df = calculate_calibration_factor(non_census_df, period, **config)
+        non_census_df["is_census"] = False
 
         all_together = pd.concat([non_census_df, census_df], ignore_index=True)
 
         estimation_df_list.append(all_together)
 
     estimation_df = pd.concat(estimation_df_list, ignore_index=True)
 
+    create_population_count_output(
+        estimation_df, period, calibration_group, save_output=True, **config
+    )
+
     # validate_estimation(estimation_df, **config)
 
     return estimation_df
diff --git a/mbs_results/estimation/create_population_counts.py b/mbs_results/estimation/create_population_counts.py
@@ -0,0 +1,87 @@
+import pandas as pd
+
+
+def calculate_turnover_sum_count(
+    df: pd.DataFrame, period: str, strata: str, colname: str, **config
+) -> pd.DataFrame:
+    """
+    Calculates turnover sum and count and returns an aggregated dataframe
+    with the given column name prefixed to the sum and count columns
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        original dataframe containing frotover. Groups by period and strata
+    period : str
+        period column name
+    strata : str
+        strate column name
+    colname : str
+        column name to prefix to the sum and count columns
+
+    Returns
+    -------
+    pd.DataFrame
+        A grouped dataframe with the sum and count columns prefixed with colname
+    """
+
+    df_pop_count = (
+        df.groupby([period, strata])
+        .agg(summing=("frotover", "sum"), count=("reference", "size"))
+        .reset_index()
+    )
+
+    df_pop_count.rename(
+        columns={"summing": f"{colname}_turnover_sum", "count": f"{colname}_count"},
+        inplace=True,
+    )
+
+    return df_pop_count
+
+
+def create_population_count_output(
+    df: pd.DataFrame,
+    period: str,
+    strata: str,
+    output_path: str = "",
+    save_output: bool = False,
+    **config: dict,
+) -> pd.DataFrame:
+    """
+    creates the population count output
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        original dataframe frotover and sampled. Groups by period and strata
+    period : str
+        period column name
+    strata : str
+        strata column name
+    output_path : str, optional
+        Output path to save dataframe
+    save_output : bool, optional
+        Default False. If True, saves the output to output_path
+
+    Returns
+    -------
+    pd.DataFrame
+        A grouped dataframe with the sum and count columns prefixed with colname.
+        Contains both population and sampled sum and counts for output.
+        Returns none if save_output is True
+    """
+
+    df_population = calculate_turnover_sum_count(
+        df, period, strata, colname="population", **config
+    )
+
+    df_sampled = calculate_turnover_sum_count(
+        df.loc[df["is_sampled"]], period, strata, colname="sample", **config
+    )
+    combined = pd.merge(df_population, df_sampled, on=[period, strata])
+
+    if save_output:
+        combined.to_csv(output_path + "population_counts.csv", index=False)
+        return
+    else:
+        return combined
diff --git a/mbs_results/estimation/pre_processing_estimation.py b/mbs_results/estimation/pre_processing_estimation.py
@@ -110,8 +110,8 @@ def derive_estimation_variables(
     )
 
     sample = sample.copy()[[reference, period]]
-    sample["sampled"] = 1
+    sample["is_sampled"] = True
 
     return population_frame.merge(sample, on=[reference, period], how="left").fillna(
-        value={"sampled": 0}
+        value={"is_sampled": False}
     )
diff --git a/mbs_results/outlier_detection/calculate_predicted_unit_value.py b/mbs_results/outlier_detection/calculate_predicted_unit_value.py
@@ -2,7 +2,7 @@
 
 
 def calculate_predicted_unit_value(
-    df, group, period, aux, sampled, a_weight, target_variable, nw_ag_flag
+    df, group, period, aux, is_census, a_weight, target_variable, nw_ag_flag
 ):
     """
     Calculate predicted unit value
@@ -17,8 +17,8 @@ def calculate_predicted_unit_value(
         Column name containing time period.
     aux : str
         Column name containing auxiliary variable (x).
-    sampled : str
-        Column name indicating whether it was sampled or not -boolean.
+    is_census : str
+        Column name indicating whether the reference belongs to a cell that is a census.
     a_weight : str
         Column name containing the design weight.
     target_variable : str
@@ -32,7 +32,7 @@ def calculate_predicted_unit_value(
         A pandas DataFrame with a new column containing the predicted unit value.
     """
 
-    winsorised = (df[sampled] == 1) & (df[nw_ag_flag] == False)  # noqa: E712
+    winsorised = (~df[is_census]) & (~df[nw_ag_flag])
     filtered_df = df.loc[winsorised]
 
     filtered_df["weighted_target_values"] = (
@@ -69,9 +69,7 @@ def calculate_predicted_unit_value(
         ["sum_weighted_target_values", "sum_weighted_auxiliary_values"], axis=1
     )
 
-    non_winsorised = (final_df[sampled] == 0) | (
-        final_df[nw_ag_flag] == True  # noqa: E712
-    )
+    non_winsorised = (final_df[is_census]) | (final_df[nw_ag_flag])
     final_df["predicted_unit_value"] = final_df["predicted_unit_value"].mask(
         non_winsorised, np.nan
     )

diff --git a/mbs_results/outlier_detection/calculate_ratio_estimation.py b/mbs_results/outlier_detection/calculate_ratio_estimation.py
@@ -4,7 +4,7 @@
 def calculate_ratio_estimation(
     df,
     aux,
-    sampled,
+    is_census,
     a_weight,
     g_weight,
     target_variable,
@@ -20,8 +20,8 @@ def calculate_ratio_estimation(
         Original dataframe.
     aux : str
         Column name containing auxiliary variable (x).
-    sampled : str
-        Column name indicating whether it was sampled or not -boolean.
+    is_census : str
+        Column name indicating whether a reference belongs to a cell that is a census.
     a_weight : str
         Column name containing the design weight.
     g_weight : str
@@ -48,7 +48,7 @@ def calculate_ratio_estimation(
     )
     df = df.drop("flag_calculation", axis=1)
 
-    non_winsorised = (df[sampled] == 0) | (df[nw_ag_flag] == True)  # noqa: E712
+    non_winsorised = (df[is_census]) | (df[nw_ag_flag])
     df["ratio_estimation_treshold"] = df["ratio_estimation_treshold"].mask(
         non_winsorised, np.nan
     )

diff --git a/mbs_results/outlier_detection/calculate_winsorised_weight.py b/mbs_results/outlier_detection/calculate_winsorised_weight.py
@@ -6,12 +6,11 @@ def calculate_winsorised_weight(
     group,
     period,
     aux,
-    sampled,
+    is_census,
     a_weight,
     g_weight,
     target_variable,
     predicted_unit_value,
-    l_values,
     ratio_estimation_treshold,
     nw_ag_flag,
 ):
@@ -29,7 +28,7 @@ def calculate_winsorised_weight(
     aux : str
         Column name containing auxiliary variable (x).
-    sampled : str
+    is_census : str
-        Column name indicating whether it was sampled or not -boolean.
+        Column name indicating whether a reference belongs to a cell that is a census.
     a_weight : str
         Column name containing the design weight.
     g_weight:str
@@ -38,8 +37,6 @@ def calculate_winsorised_weight(
         Column name of the predicted target variable.
     predicted_unit_value: str
         column name containing the predicted unit value.
-    l_values: str
-        column name containing the l values as provided by methodology.
     ratio_estimation_treshold: str
         column name containing the previously calculated ratio estimation threshold.
     nw_ag_flag: str
@@ -66,7 +63,7 @@ def calculate_winsorised_weight(
 
     df = df.drop(["w", "new_target"], axis=1)
 
-    non_winsorised = (df[sampled] == 0) | (df[nw_ag_flag])
+    non_winsorised = (df[is_census]) | (df[nw_ag_flag])
 
     division_with_0 = ~non_winsorised & (df[target_variable] == 0)
 

diff --git a/mbs_results/outlier_detection/winsorisation.py b/mbs_results/outlier_detection/winsorisation.py
@@ -94,7 +94,6 @@ def winsorise(
             g_weight,
             target_variable,
             "predicted_unit_value",
-            l_values,
             "ratio_estimation_treshold",
             "nw_ag_flag",
         )

diff --git a/tests/data/estimation/pre_processing_estimation/derive_estimation_variables.csv b/tests/data/estimation/pre_processing_estimation/derive_estimation_variables.csv
@@ -1,4 +1,4 @@
-reference,cell_no,auxiliary,period,sampled,calibration_group
-11111111111,123456,1111111111111,202401,1,123456
-22222222222,234567,2222222222222,202401,1,123456
-33333333333,345678,3333333333333,202401,0,345678
+reference,cell_no,auxiliary,period,is_sampled,calibration_group
+11111111111,123456,1111111111111,202401,True,123456
+22222222222,234567,2222222222222,202401,True,123456
+33333333333,345678,3333333333333,202401,False,345678
diff --git a/tests/data/outlier_detection/calculate_predicted_unit_value/predicted_unit_value_data.csv b/tests/data/outlier_detection/calculate_predicted_unit_value/predicted_unit_value_data.csv
@@ -1,15 +1,15 @@
-group,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag
-101,202401,10,0,1.666666667,1.023809524,12,False
-101,202401,23,1,1.666666667,1.023809524,20,False
-101,202401,41,1,1.666666667,1.023809524,20,False
-101,202402,53,1,1.666666667,1.023809524,40,False
-101,202401,12,0,1.666666667,1.023809524,10,False
-102,202401,50,1,2.5,1.023809524,60,False
-102,202402,40,1,2.5,1.023809524,50,False
-102,202401,45,0,2.5,1.023809524,50,False
-102,202401,70,0,2.5,1.023809524,60,False
-102,202401,86,0,2.5,1.023809524,90,False
-103,202401,20,0,0.32,0.004,90,True
-103,202401,30,0,0.32,0.004,90,True
-104,202401,20,0,,0.004,90,False
-104,202401,30,0,,0.004,90,False
+group,period,aux,is_census,a_weight,g_weight,target_variable,nw_ag_flag
+101,202401,10,True,1.666666667,1.023809524,12,False
+101,202401,23,False,1.666666667,1.023809524,20,False
+101,202401,41,False,1.666666667,1.023809524,20,False
+101,202402,53,False,1.666666667,1.023809524,40,False
+101,202401,12,True,1.666666667,1.023809524,10,False
+102,202401,50,False,2.5,1.023809524,60,False
+102,202402,40,False,2.5,1.023809524,50,False
+102,202401,45,True,2.5,1.023809524,50,False
+102,202401,70,True,2.5,1.023809524,60,False
+102,202401,86,True,2.5,1.023809524,90,False
+103,202401,20,True,0.32,0.004,90,True
+103,202401,30,True,0.32,0.004,90,True
+104,202401,20,True,,0.004,90,False
+104,202401,30,True,,0.004,90,False
diff --git a/tests/data/outlier_detection/calculate_predicted_unit_value/predicted_unit_value_output.csv b/tests/data/outlier_detection/calculate_predicted_unit_value/predicted_unit_value_output.csv
@@ -1,15 +1,15 @@
-group,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value
-101,202401,10,0,1.666666667,1.023809524,12,False,
-101,202401,23,1,1.666666667,1.023809524,20,False,14.375
-101,202401,41,1,1.666666667,1.023809524,20,False,25.625
-101,202402,53,1,1.666666667,1.023809524,40,False,40
-101,202401,12,0,1.666666667,1.023809524,10,False,
-102,202401,50,1,2.5,1.023809524,60,False,60
-102,202402,40,1,2.5,1.023809524,50,False,50
-102,202401,45,0,2.5,1.023809524,50,False,
-102,202401,70,0,2.5,1.023809524,60,False,
-102,202401,86,0,2.5,1.023809524,90,False,
-103,202401,20,0,0.32,0.004,90,True,
-103,202401,30,0,0.32,0.004,90,True,
-104,202401,20,0,,0.004,90,False,
-104,202401,30,0,,0.004,90,False,
+group,period,aux,is_census,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value
+101,202401,10,True,1.666666667,1.023809524,12,False,
+101,202401,23,False,1.666666667,1.023809524,20,False,14.375
+101,202401,41,False,1.666666667,1.023809524,20,False,25.625
+101,202402,53,False,1.666666667,1.023809524,40,False,40
+101,202401,12,True,1.666666667,1.023809524,10,False,
+102,202401,50,False,2.5,1.023809524,60,False,60
+102,202402,40,False,2.5,1.023809524,50,False,50
+102,202401,45,True,2.5,1.023809524,50,False,
+102,202401,70,True,2.5,1.023809524,60,False,
+102,202401,86,True,2.5,1.023809524,90,False,
+103,202401,20,True,0.32,0.004,90,True,
+103,202401,30,True,0.32,0.004,90,True,
+104,202401,20,True,,0.004,90,False,
+104,202401,30,True,,0.004,90,False,
diff --git a/tests/data/outlier_detection/calculate_ratio_estimation/ratio_estimation_data.csv b/tests/data/outlier_detection/calculate_ratio_estimation/ratio_estimation_data.csv
@@ -1,13 +1,13 @@
-group,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_value
-101,202401,10,0,1.666666667,1.023809524,12,False,,
-101,202401,23,1,1.666666667,1.023809524,20,False,14.375,0.5
-101,202401,41,1,1.666666667,1.023809524,20,False,25.625,0.5
-101,202402,53,1,1.666666667,1.023809524,40,False,40,0.5
-101,202401,12,0,1.666666667,1.023809524,10,False,,
-102,202401,50,1,2.5,1.023809524,60,False,60,0.5
-102,202402,40,1,2.5,1.023809524,50,False,50,0.5
-102,202401,45,0,2.5,1.023809524,50,False,,
-102,202401,70,0,2.5,1.023809524,60,False,,
-102,202401,86,0,2.5,1.023809524,90,False,,
-104,202401,20,0,,0.004,90,False,,
-104,202401,30,0,,0.004,90,False,,
+group,period,aux,is_census,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_value
+101,202401,10,True,1.666666667,1.023809524,12,False,,
+101,202401,23,False,1.666666667,1.023809524,20,False,14.375,0.5
+101,202401,41,False,1.666666667,1.023809524,20,False,25.625,0.5
+101,202402,53,False,1.666666667,1.023809524,40,False,40,0.5
+101,202401,12,True,1.666666667,1.023809524,10,False,,
+102,202401,50,False,2.5,1.023809524,60,False,60,0.5
+102,202402,40,False,2.5,1.023809524,50,False,50,0.5
+102,202401,45,True,2.5,1.023809524,50,False,,
+102,202401,70,True,2.5,1.023809524,60,False,,
+102,202401,86,True,2.5,1.023809524,90,False,,
+104,202401,20,True,,0.004,90,False,,
+104,202401,30,True,,0.004,90,False,,
diff --git a/tests/data/outlier_detection/calculate_ratio_estimation/ratio_estimation_data_output.csv b/tests/data/outlier_detection/calculate_ratio_estimation/ratio_estimation_data_output.csv
@@ -1,13 +1,13 @@
-group,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_value,ratio_estimation_treshold
-101,202401,10,0,1.666666667,1.023809524,12,False,,,
-101,202401,23,1,1.666666667,1.023809524,20,False,14.375,0.5,15.0828652
-101,202401,41,1,1.666666667,1.023809524,20,False,25.625,0.5,26.3328652
-101,202402,53,1,1.666666667,1.023809524,40,False,40,0.5,40.7078652
-101,202401,12,0,1.666666667,1.023809524,10,False,,,
-102,202401,50,1,2.5,1.023809524,60,False,60,0.5,60.3206107
-102,202402,40,1,2.5,1.023809524,50,False,50,0.5,50.3206107
-102,202401,45,0,2.5,1.023809524,50,False,,,
-102,202401,70,0,2.5,1.023809524,60,False,,,
-102,202401,86,0,2.5,1.023809524,90,False,,,
-104,202401,20,0,,0.004,90,False,,,
-104,202401,30,0,,0.004,90,False,,,
+group,period,aux,is_census,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_value,ratio_estimation_treshold
+101,202401,10,True,1.666666667,1.023809524,12,False,,,
+101,202401,23,False,1.666666667,1.023809524,20,False,14.375,0.5,15.0828652
+101,202401,41,False,1.666666667,1.023809524,20,False,25.625,0.5,26.3328652
+101,202402,53,False,1.666666667,1.023809524,40,False,40,0.5,40.7078652
+101,202401,12,True,1.666666667,1.023809524,10,False,,,
+102,202401,50,False,2.5,1.023809524,60,False,60,0.5,60.3206107
+102,202402,40,False,2.5,1.023809524,50,False,50,0.5,50.3206107
+102,202401,45,True,2.5,1.023809524,50,False,,,
+102,202401,70,True,2.5,1.023809524,60,False,,,
+102,202401,86,True,2.5,1.023809524,90,False,,,
+104,202401,20,True,,0.004,90,False,,,
+104,202401,30,True,,0.004,90,False,,,