Skip to content

Commit

Permalink
692 create population counts (#137)
Browse files Browse the repository at this point in the history
* creating "create population count functions"

* change sampled to is_sampled and is_census

* Create unit test, remove script entry point

* Adding docstrings

* Adding optional functionality to save output instead of returning

* Renaming df's and implementing in apply_estimation

* Update mbs_results/estimation/apply_estimation.py

Co-authored-by: Wil Roberts <[email protected]>

* Update mbs_results/estimation/apply_estimation.py

Co-authored-by: Wil Roberts <[email protected]>

* Update tests/estimation/test_create_population_counts.py

Co-authored-by: Wil Roberts <[email protected]>

* Update create_population_counts.py

* Formatting comments

---------

Co-authored-by: Wil Roberts <[email protected]>
Co-authored-by: Wil Roberts <[email protected]>
  • Loading branch information
3 people authored Dec 19, 2024
1 parent 6c2529d commit 23a75ff
Show file tree
Hide file tree
Showing 21 changed files with 295 additions and 142 deletions.
16 changes: 15 additions & 1 deletion mbs_results/estimation/apply_estimation.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@
calculate_calibration_factor,
calculate_design_weight,
)
from mbs_results.estimation.create_population_counts import (
create_population_count_output,
)
from mbs_results.estimation.pre_processing_estimation import get_estimation_data
from mbs_results.staging.data_cleaning import is_census

Expand Down Expand Up @@ -71,7 +74,13 @@ def apply_estimation(

census_df["design_weight"] = 1
census_df["calibration_factor"] = 1
census_df["sampled"] = 0
census_df["is_sampled"] = True
census_df["is_census"] = True
# is_census: bool, to distinguish fully sampled (i.e. census) strata from
# non-census strata. Used in outlier detection so census strata are
# not winsorised.
# is_sampled: bool. This is used to distinguish sampled refs from non-sampled
# refs in population

non_census_df = estimation_data[
~(
Expand All @@ -83,13 +92,18 @@ def apply_estimation(

non_census_df = calculate_design_weight(non_census_df, period, **config)
non_census_df = calculate_calibration_factor(non_census_df, period, **config)
non_census_df["is_census"] = False

all_together = pd.concat([non_census_df, census_df], ignore_index=True)

estimation_df_list.append(all_together)

estimation_df = pd.concat(estimation_df_list, ignore_index=True)

create_population_count_output(
estimation_df, period, calibration_group, save_output=True, **config
)

# validate_estimation(estimation_df, **config)

return estimation_df
87 changes: 87 additions & 0 deletions mbs_results/estimation/create_population_counts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import pandas as pd


def calculate_turnover_sum_count(
    df: pd.DataFrame, period: str, strata: str, colname: str, **config
) -> pd.DataFrame:
    """
    Aggregate the turnover total and row count per period/strata group.

    Parameters
    ----------
    df : pd.DataFrame
        Dataframe containing the ``frotover`` and ``reference`` columns
        together with the period and strata columns.
    period : str
        Period column name.
    strata : str
        Strata column name.
    colname : str
        Prefix given to the output sum and count column names.
    **config
        Additional keyword arguments; accepted but not used by this function.

    Returns
    -------
    pd.DataFrame
        One row per (period, strata) group with the columns
        ``{colname}_turnover_sum`` (sum of ``frotover``) and
        ``{colname}_count`` (number of rows in the group).
    """

    # Final names for the two aggregated columns, prefixed with colname.
    output_names = {
        "summing": f"{colname}_turnover_sum",
        "count": f"{colname}_count",
    }

    grouped = df.groupby([period, strata])
    aggregated = grouped.agg(
        summing=("frotover", "sum"), count=("reference", "size")
    )

    # Bring the grouping keys back as ordinary columns before renaming.
    return aggregated.reset_index().rename(columns=output_names)


def create_population_count_output(
    df: pd.DataFrame,
    period: str,
    strata: str,
    output_path: str = "",
    save_output: bool = False,
    **config: dict,
) -> "pd.DataFrame | None":
    """
    Create the population count output.

    Builds turnover sums and counts per period/strata group for the whole
    population and for the sampled rows only, then combines them side by side.

    Parameters
    ----------
    df : pd.DataFrame
        Dataframe containing ``frotover``, ``reference`` and the boolean
        ``is_sampled`` column, together with the period and strata columns.
    period : str
        Period column name.
    strata : str
        Strata column name.
    output_path : str, optional
        Prefix prepended to ``population_counts.csv`` when saving. It is
        concatenated as-is, so a directory needs its trailing separator.
    save_output : bool, optional
        Default False. If True, saves the output to output_path instead of
        returning it.
    **config : dict
        Additional keyword arguments, forwarded to
        ``calculate_turnover_sum_count``.

    Returns
    -------
    pd.DataFrame or None
        A dataframe with one row per (period, strata) group holding the
        ``population_*`` and ``sample_*`` sum and count columns. Groups with
        no sampled rows are kept, with a sample sum and count of 0.
        Returns None if save_output is True.
    """

    df_population = calculate_turnover_sum_count(
        df, period, strata, colname="population", **config
    )

    df_sampled = calculate_turnover_sum_count(
        df.loc[df["is_sampled"]], period, strata, colname="sample", **config
    )

    # Left merge so that every population group is reported; the default inner
    # join would silently drop groups that contain no sampled rows.
    combined = pd.merge(
        df_population, df_sampled, on=[period, strata], how="left"
    )

    # Groups without any sampled row have no match: report them as zero.
    combined["sample_turnover_sum"] = combined["sample_turnover_sum"].fillna(0)
    combined["sample_count"] = (
        combined["sample_count"].fillna(0).astype("int64")
    )

    if save_output:
        combined.to_csv(output_path + "population_counts.csv", index=False)
        return None

    return combined
4 changes: 2 additions & 2 deletions mbs_results/estimation/pre_processing_estimation.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,8 +110,8 @@ def derive_estimation_variables(
)

sample = sample.copy()[[reference, period]]
sample["sampled"] = 1
sample["is_sampled"] = True

return population_frame.merge(sample, on=[reference, period], how="left").fillna(
value={"sampled": 0}
value={"is_sampled": False}
)
12 changes: 5 additions & 7 deletions mbs_results/outlier_detection/calculate_predicted_unit_value.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@


def calculate_predicted_unit_value(
df, group, period, aux, sampled, a_weight, target_variable, nw_ag_flag
df, group, period, aux, is_census, a_weight, target_variable, nw_ag_flag
):
"""
Calculate predicted unit value
Expand All @@ -17,8 +17,8 @@ def calculate_predicted_unit_value(
Column name containing time period.
aux : str
Column name containing auxiliary variable (x).
sampled : str
Column name indicating whether it was sampled or not -boolean.
is_census : str
Column name indicating whether the reference belongs to a cell that is a census.
a_weight : str
Column name containing the design weight.
target_variable : str
Expand All @@ -32,7 +32,7 @@ def calculate_predicted_unit_value(
A pandas DataFrame with a new column containing the predicted unit value.
"""

winsorised = (df[sampled] == 1) & (df[nw_ag_flag] == False) # noqa: E712
winsorised = (~df[is_census]) & (~df[nw_ag_flag])
filtered_df = df.loc[winsorised]

filtered_df["weighted_target_values"] = (
Expand Down Expand Up @@ -69,9 +69,7 @@ def calculate_predicted_unit_value(
["sum_weighted_target_values", "sum_weighted_auxiliary_values"], axis=1
)

non_winsorised = (final_df[sampled] == 0) | (
final_df[nw_ag_flag] == True # noqa: E712
)
non_winsorised = (final_df[is_census]) | (final_df[nw_ag_flag])
final_df["predicted_unit_value"] = final_df["predicted_unit_value"].mask(
non_winsorised, np.nan
)
Expand Down
8 changes: 4 additions & 4 deletions mbs_results/outlier_detection/calculate_ratio_estimation.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
def calculate_ratio_estimation(
df,
aux,
sampled,
is_census,
a_weight,
g_weight,
target_variable,
Expand All @@ -20,8 +20,8 @@ def calculate_ratio_estimation(
Original dataframe.
aux : str
Column name containing auxiliary variable (x).
sampled : str
Column name indicating whether it was sampled or not -boolean.
is_census : str
Column name indicating whether a reference belongs to a cell that is a census.
a_weight : str
Column name containing the design weight.
g_weight : str
Expand All @@ -48,7 +48,7 @@ def calculate_ratio_estimation(
)
df = df.drop("flag_calculation", axis=1)

non_winsorised = (df[sampled] == 0) | (df[nw_ag_flag] == True) # noqa: E712
non_winsorised = (df[is_census]) | (df[nw_ag_flag])
df["ratio_estimation_treshold"] = df["ratio_estimation_treshold"].mask(
non_winsorised, np.nan
)
Expand Down
9 changes: 3 additions & 6 deletions mbs_results/outlier_detection/calculate_winsorised_weight.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,11 @@ def calculate_winsorised_weight(
group,
period,
aux,
sampled,
is_census,
a_weight,
g_weight,
target_variable,
predicted_unit_value,
l_values,
ratio_estimation_treshold,
nw_ag_flag,
):
Expand All @@ -29,7 +28,7 @@ def calculate_winsorised_weight(
aux : str
Column name containing auxiliary variable (x).
is_census : str
Column name indicating whether it was sampled or not -boolean.
Column name indicating whether a reference belongs to a cell that is a census.
a_weight : str
Column name containing the design weight.
g_weight:str
Expand All @@ -38,8 +37,6 @@ def calculate_winsorised_weight(
Column name of the predicted target variable.
predicted_unit_value: str
column name containing the predicted unit value.
l_values: str
column name containing the l values as provided by methodology.
ratio_estimation_treshold: str
column name containing the previously calculated ratio estimation threshold.
nw_ag_flag: str
Expand All @@ -66,7 +63,7 @@ def calculate_winsorised_weight(

df = df.drop(["w", "new_target"], axis=1)

non_winsorised = (df[sampled] == 0) | (df[nw_ag_flag])
non_winsorised = (df[is_census]) | (df[nw_ag_flag])

division_with_0 = ~non_winsorised & (df[target_variable] == 0)

Expand Down
1 change: 0 additions & 1 deletion mbs_results/outlier_detection/winsorisation.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,6 @@ def winsorise(
g_weight,
target_variable,
"predicted_unit_value",
l_values,
"ratio_estimation_treshold",
"nw_ag_flag",
)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
reference,cell_no,auxiliary,period,sampled,calibration_group
11111111111,123456,1111111111111,202401,1,123456
22222222222,234567,2222222222222,202401,1,123456
33333333333,345678,3333333333333,202401,0,345678
reference,cell_no,auxiliary,period,is_sampled,calibration_group
11111111111,123456,1111111111111,202401,True,123456
22222222222,234567,2222222222222,202401,True,123456
33333333333,345678,3333333333333,202401,False,345678
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
group,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag
101,202401,10,0,1.666666667,1.023809524,12,False
101,202401,23,1,1.666666667,1.023809524,20,False
101,202401,41,1,1.666666667,1.023809524,20,False
101,202402,53,1,1.666666667,1.023809524,40,False
101,202401,12,0,1.666666667,1.023809524,10,False
102,202401,50,1,2.5,1.023809524,60,False
102,202402,40,1,2.5,1.023809524,50,False
102,202401,45,0,2.5,1.023809524,50,False
102,202401,70,0,2.5,1.023809524,60,False
102,202401,86,0,2.5,1.023809524,90,False
103,202401,20,0,0.32,0.004,90,True
103,202401,30,0,0.32,0.004,90,True
104,202401,20,0,,0.004,90,False
104,202401,30,0,,0.004,90,False
group,period,aux,is_census,a_weight,g_weight,target_variable,nw_ag_flag
101,202401,10,True,1.666666667,1.023809524,12,False
101,202401,23,False,1.666666667,1.023809524,20,False
101,202401,41,False,1.666666667,1.023809524,20,False
101,202402,53,False,1.666666667,1.023809524,40,False
101,202401,12,True,1.666666667,1.023809524,10,False
102,202401,50,False,2.5,1.023809524,60,False
102,202402,40,False,2.5,1.023809524,50,False
102,202401,45,True,2.5,1.023809524,50,False
102,202401,70,True,2.5,1.023809524,60,False
102,202401,86,True,2.5,1.023809524,90,False
103,202401,20,True,0.32,0.004,90,True
103,202401,30,True,0.32,0.004,90,True
104,202401,20,True,,0.004,90,False
104,202401,30,True,,0.004,90,False
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
group,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value
101,202401,10,0,1.666666667,1.023809524,12,False,
101,202401,23,1,1.666666667,1.023809524,20,False,14.375
101,202401,41,1,1.666666667,1.023809524,20,False,25.625
101,202402,53,1,1.666666667,1.023809524,40,False,40
101,202401,12,0,1.666666667,1.023809524,10,False,
102,202401,50,1,2.5,1.023809524,60,False,60
102,202402,40,1,2.5,1.023809524,50,False,50
102,202401,45,0,2.5,1.023809524,50,False,
102,202401,70,0,2.5,1.023809524,60,False,
102,202401,86,0,2.5,1.023809524,90,False,
103,202401,20,0,0.32,0.004,90,True,
103,202401,30,0,0.32,0.004,90,True,
104,202401,20,0,,0.004,90,False,
104,202401,30,0,,0.004,90,False,
group,period,aux,is_census,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value
101,202401,10,True,1.666666667,1.023809524,12,False,
101,202401,23,False,1.666666667,1.023809524,20,False,14.375
101,202401,41,False,1.666666667,1.023809524,20,False,25.625
101,202402,53,False,1.666666667,1.023809524,40,False,40
101,202401,12,True,1.666666667,1.023809524,10,False,
102,202401,50,False,2.5,1.023809524,60,False,60
102,202402,40,False,2.5,1.023809524,50,False,50
102,202401,45,True,2.5,1.023809524,50,False,
102,202401,70,True,2.5,1.023809524,60,False,
102,202401,86,True,2.5,1.023809524,90,False,
103,202401,20,True,0.32,0.004,90,True,
103,202401,30,True,0.32,0.004,90,True,
104,202401,20,True,,0.004,90,False,
104,202401,30,True,,0.004,90,False,
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
group,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_value
101,202401,10,0,1.666666667,1.023809524,12,False,,
101,202401,23,1,1.666666667,1.023809524,20,False,14.375,0.5
101,202401,41,1,1.666666667,1.023809524,20,False,25.625,0.5
101,202402,53,1,1.666666667,1.023809524,40,False,40,0.5
101,202401,12,0,1.666666667,1.023809524,10,False,,
102,202401,50,1,2.5,1.023809524,60,False,60,0.5
102,202402,40,1,2.5,1.023809524,50,False,50,0.5
102,202401,45,0,2.5,1.023809524,50,False,,
102,202401,70,0,2.5,1.023809524,60,False,,
102,202401,86,0,2.5,1.023809524,90,False,,
104,202401,20,0,,0.004,90,False,,
104,202401,30,0,,0.004,90,False,,
group,period,aux,is_census,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_value
101,202401,10,True,1.666666667,1.023809524,12,False,,
101,202401,23,False,1.666666667,1.023809524,20,False,14.375,0.5
101,202401,41,False,1.666666667,1.023809524,20,False,25.625,0.5
101,202402,53,False,1.666666667,1.023809524,40,False,40,0.5
101,202401,12,True,1.666666667,1.023809524,10,False,,
102,202401,50,False,2.5,1.023809524,60,False,60,0.5
102,202402,40,False,2.5,1.023809524,50,False,50,0.5
102,202401,45,True,2.5,1.023809524,50,False,,
102,202401,70,True,2.5,1.023809524,60,False,,
102,202401,86,True,2.5,1.023809524,90,False,,
104,202401,20,True,,0.004,90,False,,
104,202401,30,True,,0.004,90,False,,
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
group,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_value,ratio_estimation_treshold
101,202401,10,0,1.666666667,1.023809524,12,False,,,
101,202401,23,1,1.666666667,1.023809524,20,False,14.375,0.5,15.0828652
101,202401,41,1,1.666666667,1.023809524,20,False,25.625,0.5,26.3328652
101,202402,53,1,1.666666667,1.023809524,40,False,40,0.5,40.7078652
101,202401,12,0,1.666666667,1.023809524,10,False,,,
102,202401,50,1,2.5,1.023809524,60,False,60,0.5,60.3206107
102,202402,40,1,2.5,1.023809524,50,False,50,0.5,50.3206107
102,202401,45,0,2.5,1.023809524,50,False,,,
102,202401,70,0,2.5,1.023809524,60,False,,,
102,202401,86,0,2.5,1.023809524,90,False,,,
104,202401,20,0,,0.004,90,False,,,
104,202401,30,0,,0.004,90,False,,,
group,period,aux,is_census,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_value,ratio_estimation_treshold
101,202401,10,True,1.666666667,1.023809524,12,False,,,
101,202401,23,False,1.666666667,1.023809524,20,False,14.375,0.5,15.0828652
101,202401,41,False,1.666666667,1.023809524,20,False,25.625,0.5,26.3328652
101,202402,53,False,1.666666667,1.023809524,40,False,40,0.5,40.7078652
101,202401,12,True,1.666666667,1.023809524,10,False,,,
102,202401,50,False,2.5,1.023809524,60,False,60,0.5,60.3206107
102,202402,40,False,2.5,1.023809524,50,False,50,0.5,50.3206107
102,202401,45,True,2.5,1.023809524,50,False,,,
102,202401,70,True,2.5,1.023809524,60,False,,,
102,202401,86,True,2.5,1.023809524,90,False,,,
104,202401,20,True,,0.004,90,False,,,
104,202401,30,True,,0.004,90,False,,,
Loading

0 comments on commit 23a75ff

Please sign in to comment.