From 23a75fffebfe4125fbb147902f707ff82ac1a439 Mon Sep 17 00:00:00 2001
From: Jordan-Day-ONS <57715292+Jday7879@users.noreply.github.com>
Date: Thu, 19 Dec 2024 16:07:29 +0000
Subject: [PATCH] 692 create population counts (#137)

* creating "create population count functions"

* change sampled to is_sampled and is_census

* Create unit test, remove script entry point

* Adding docstrings

* Adding optional functionality to save output instead of returning

* Renaming df's and implementing in apply_estimation

* Update mbs_results/estimation/apply_estimation.py

Co-authored-by: Wil Roberts <47739563+robertswh@users.noreply.github.com>

* Update mbs_results/estimation/apply_estimation.py

Co-authored-by: Wil Roberts <47739563+robertswh@users.noreply.github.com>

* Update tests/estimation/test_create_population_counts.py

Co-authored-by: Wil Roberts <47739563+robertswh@users.noreply.github.com>

* Update create_population_counts.py

* Formatting comments

---------

Co-authored-by: Wil Roberts <Wil.Roberts@ons.gov.uk>
Co-authored-by: Wil Roberts <47739563+robertswh@users.noreply.github.com>
---
 mbs_results/estimation/apply_estimation.py    | 16 +++-
 .../estimation/create_population_counts.py    | 87 +++++++++++++++++++
 .../estimation/pre_processing_estimation.py   |  4 +-
 .../calculate_predicted_unit_value.py         | 12 ++-
 .../calculate_ratio_estimation.py             |  8 +-
 .../calculate_winsorised_weight.py            |  9 +-
 .../outlier_detection/winsorisation.py        |  1 -
 .../derive_estimation_variables.csv           |  8 +-
 .../predicted_unit_value_data.csv             | 30 +++----
 .../predicted_unit_value_output.csv           | 30 +++----
 .../ratio_estimation_data.csv                 | 26 +++---
 .../ratio_estimation_data_output.csv          | 26 +++---
 .../winsorised_weight_data.csv                | 30 +++----
 .../winsorised_weight_data_output.csv         | 30 +++----
 .../winsorised_weight_data_output.csv         | 30 +++----
 .../test_create_population_counts.py          | 61 +++++++++++++
 .../test_pre_processing_estimation.py         |  4 +-
 .../test_calculate_predicted_unit_value.py    |  6 +-
 .../test_calculate_ratio_estimation.py        |  6 +-
 .../test_calculate_winsorised_weight.py       |  9 +-
 tests/outlier_detection/test_winsorisation.py |  4 +-
 21 files changed, 295 insertions(+), 142 deletions(-)
 create mode 100644 mbs_results/estimation/create_population_counts.py
 create mode 100644 tests/estimation/test_create_population_counts.py

diff --git a/mbs_results/estimation/apply_estimation.py b/mbs_results/estimation/apply_estimation.py
index bd14c978..df02f7fb 100644
--- a/mbs_results/estimation/apply_estimation.py
+++ b/mbs_results/estimation/apply_estimation.py
@@ -6,6 +6,9 @@
     calculate_calibration_factor,
     calculate_design_weight,
 )
+from mbs_results.estimation.create_population_counts import (
+    create_population_count_output,
+)
 from mbs_results.estimation.pre_processing_estimation import get_estimation_data
 from mbs_results.staging.data_cleaning import is_census
 
@@ -71,7 +74,13 @@ def apply_estimation(
 
         census_df["design_weight"] = 1
         census_df["calibration_factor"] = 1
-        census_df["sampled"] = 0
+        census_df["is_sampled"] = True
+        census_df["is_census"] = True
+        # is_census: bool, to distinguish fully sampled (i.e. census) strata from
+        # non-census strata. Used in outlier detection so census strata are
+        # not winsorised.
+        # is_sampled: bool, to distinguish sampled refs from non-sampled refs in
+        # the population.
 
         non_census_df = estimation_data[
             ~(
@@ -83,6 +92,7 @@ def apply_estimation(
 
         non_census_df = calculate_design_weight(non_census_df, period, **config)
         non_census_df = calculate_calibration_factor(non_census_df, period, **config)
+        non_census_df["is_census"] = False
 
         all_together = pd.concat([non_census_df, census_df], ignore_index=True)
 
@@ -90,6 +100,10 @@ def apply_estimation(
 
     estimation_df = pd.concat(estimation_df_list, ignore_index=True)
 
+    create_population_count_output(
+        estimation_df, period, calibration_group, save_output=True, **config
+    )
+
     # validate_estimation(estimation_df, **config)
 
     return estimation_df
diff --git a/mbs_results/estimation/create_population_counts.py b/mbs_results/estimation/create_population_counts.py
new file mode 100644
index 00000000..3c00ba2b
--- /dev/null
+++ b/mbs_results/estimation/create_population_counts.py
@@ -0,0 +1,87 @@
+import pandas as pd
+
+
+def calculate_turnover_sum_count(
+    df: pd.DataFrame, period: str, strata: str, colname: str, **config
+) -> pd.DataFrame:
+    """
+    Calculates turnover sum and count and returns an aggregated dataframe
+    with the given column name prefixed to the sum and count columns
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        dataframe containing frotover and reference; grouped by period and strata
+    period : str
+        period column name
+    strata : str
+        strata column name
+    colname : str
+        column name to prefix to the sum and count columns
+
+    Returns
+    -------
+    pd.DataFrame
+        A grouped dataframe with the sum and count columns prefixed with colname
+    """
+
+    df_pop_count = (
+        df.groupby([period, strata])
+        .agg(summing=("frotover", "sum"), count=("reference", "size"))
+        .reset_index()
+    )
+
+    df_pop_count.rename(
+        columns={"summing": f"{colname}_turnover_sum", "count": f"{colname}_count"},
+        inplace=True,
+    )
+
+    return df_pop_count
+
+
+def create_population_count_output(
+    df: pd.DataFrame,
+    period: str,
+    strata: str,
+    output_path: str = "",
+    save_output: bool = False,
+    **config: dict,
+) -> pd.DataFrame:
+    """
+    Creates the population count output.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        dataframe with frotover, reference and is_sampled; grouped by period and strata
+    period : str
+        period column name
+    strata : str
+        strata column name
+    output_path : str, optional
+        Prefix joined directly onto "population_counts.csv" when saving
+    save_output : bool, optional
+        Default False. If True, saves the output to output_path
+
+    Returns
+    -------
+    pd.DataFrame
+        A grouped dataframe with population_ and sample_ prefixed turnover sum and
+        count columns, merged on period and strata.
+        Returns None if save_output is True
+    """
+
+    df_population = calculate_turnover_sum_count(
+        df, period, strata, colname="population", **config
+    )
+
+    df_sampled = calculate_turnover_sum_count(
+        df.loc[df["is_sampled"]], period, strata, colname="sample", **config
+    )
+    combined = pd.merge(df_population, df_sampled, on=[period, strata])
+
+    if save_output:
+        combined.to_csv(output_path + "population_counts.csv", index=False)
+        return
+    else:
+        return combined
diff --git a/mbs_results/estimation/pre_processing_estimation.py b/mbs_results/estimation/pre_processing_estimation.py
index 06e40a3c..2413a317 100644
--- a/mbs_results/estimation/pre_processing_estimation.py
+++ b/mbs_results/estimation/pre_processing_estimation.py
@@ -110,8 +110,8 @@ def derive_estimation_variables(
     )
 
     sample = sample.copy()[[reference, period]]
-    sample["sampled"] = 1
+    sample["is_sampled"] = True
 
     return population_frame.merge(sample, on=[reference, period], how="left").fillna(
-        value={"sampled": 0}
+        value={"is_sampled": False}
     )
diff --git a/mbs_results/outlier_detection/calculate_predicted_unit_value.py b/mbs_results/outlier_detection/calculate_predicted_unit_value.py
index 014e8860..74025148 100644
--- a/mbs_results/outlier_detection/calculate_predicted_unit_value.py
+++ b/mbs_results/outlier_detection/calculate_predicted_unit_value.py
@@ -2,7 +2,7 @@
 
 
 def calculate_predicted_unit_value(
-    df, group, period, aux, sampled, a_weight, target_variable, nw_ag_flag
+    df, group, period, aux, is_census, a_weight, target_variable, nw_ag_flag
 ):
     """
     Calculate predicted unit value
@@ -17,8 +17,8 @@ def calculate_predicted_unit_value(
         Column name containing time period.
     aux : str
         Column name containing auxiliary variable (x).
-    sampled : str
-        Column name indicating whether it was sampled or not -boolean.
+    is_census : str
+        Column name indicating whether the reference belongs to a cell that is a census.
     a_weight : str
         Column name containing the design weight.
     target_variable : str
@@ -32,7 +32,7 @@ def calculate_predicted_unit_value(
         A pandas DataFrame with a new column containing the predicted unit value.
     """
 
-    winsorised = (df[sampled] == 1) & (df[nw_ag_flag] == False)  # noqa: E712
+    winsorised = (~df[is_census]) & (~df[nw_ag_flag])
     filtered_df = df.loc[winsorised]
 
     filtered_df["weighted_target_values"] = (
@@ -69,9 +69,7 @@ def calculate_predicted_unit_value(
         ["sum_weighted_target_values", "sum_weighted_auxiliary_values"], axis=1
     )
 
-    non_winsorised = (final_df[sampled] == 0) | (
-        final_df[nw_ag_flag] == True  # noqa: E712
-    )
+    non_winsorised = (final_df[is_census]) | (final_df[nw_ag_flag])
     final_df["predicted_unit_value"] = final_df["predicted_unit_value"].mask(
         non_winsorised, np.nan
     )
diff --git a/mbs_results/outlier_detection/calculate_ratio_estimation.py b/mbs_results/outlier_detection/calculate_ratio_estimation.py
index 88354378..6b15b916 100644
--- a/mbs_results/outlier_detection/calculate_ratio_estimation.py
+++ b/mbs_results/outlier_detection/calculate_ratio_estimation.py
@@ -4,7 +4,7 @@
 def calculate_ratio_estimation(
     df,
     aux,
-    sampled,
+    is_census,
     a_weight,
     g_weight,
     target_variable,
@@ -20,8 +20,8 @@ def calculate_ratio_estimation(
         Original dataframe.
     aux : str
         Column name containing auxiliary variable (x).
-    sampled : str
-        Column name indicating whether it was sampled or not -boolean.
+    is_census : str
+        Column name indicating whether a reference belongs to a cell that is a census.
     a_weight : str
         Column name containing the design weight.
     g_weight : str
@@ -48,7 +48,7 @@ def calculate_ratio_estimation(
     )
     df = df.drop("flag_calculation", axis=1)
 
-    non_winsorised = (df[sampled] == 0) | (df[nw_ag_flag] == True)  # noqa: E712
+    non_winsorised = (df[is_census]) | (df[nw_ag_flag])
     df["ratio_estimation_treshold"] = df["ratio_estimation_treshold"].mask(
         non_winsorised, np.nan
     )
diff --git a/mbs_results/outlier_detection/calculate_winsorised_weight.py b/mbs_results/outlier_detection/calculate_winsorised_weight.py
index 3ae9c9bf..acdaac56 100644
--- a/mbs_results/outlier_detection/calculate_winsorised_weight.py
+++ b/mbs_results/outlier_detection/calculate_winsorised_weight.py
@@ -6,12 +6,11 @@ def calculate_winsorised_weight(
     group,
     period,
     aux,
-    sampled,
+    is_census,
     a_weight,
     g_weight,
     target_variable,
     predicted_unit_value,
-    l_values,
     ratio_estimation_treshold,
     nw_ag_flag,
 ):
@@ -29,7 +28,7 @@ def calculate_winsorised_weight(
     aux : str
         Column name containing auxiliary variable (x).
     sampled : str
-        Column name indicating whether it was sampled or not -boolean.
+        Column name indicating whether a reference belongs to a cell that is a census.
     a_weight : str
         Column name containing the design weight.
     g_weight:str
@@ -38,8 +37,6 @@ def calculate_winsorised_weight(
         Column name of the predicted target variable.
     predicted_unit_value: str
         column name containing the predicted unit value.
-    l_values: str
-        column name containing the l values as provided by methodology.
     ratio_estimation_treshold: str
         column name containing the previously calculated ratio estimation threshold.
     nw_ag_flag: str
@@ -66,7 +63,7 @@ def calculate_winsorised_weight(
 
     df = df.drop(["w", "new_target"], axis=1)
 
-    non_winsorised = (df[sampled] == 0) | (df[nw_ag_flag])
+    non_winsorised = (df[is_census]) | (df[nw_ag_flag])
 
     division_with_0 = ~non_winsorised & (df[target_variable] == 0)
 
diff --git a/mbs_results/outlier_detection/winsorisation.py b/mbs_results/outlier_detection/winsorisation.py
index 9c13faa7..381c5cde 100644
--- a/mbs_results/outlier_detection/winsorisation.py
+++ b/mbs_results/outlier_detection/winsorisation.py
@@ -94,7 +94,6 @@ def winsorise(
             g_weight,
             target_variable,
             "predicted_unit_value",
-            l_values,
             "ratio_estimation_treshold",
             "nw_ag_flag",
         )
diff --git a/tests/data/estimation/pre_processing_estimation/derive_estimation_variables.csv b/tests/data/estimation/pre_processing_estimation/derive_estimation_variables.csv
index 7f8722ce..bb58561b 100644
--- a/tests/data/estimation/pre_processing_estimation/derive_estimation_variables.csv
+++ b/tests/data/estimation/pre_processing_estimation/derive_estimation_variables.csv
@@ -1,4 +1,4 @@
-reference,cell_no,auxiliary,period,sampled,calibration_group
-11111111111,123456,1111111111111,202401,1,123456
-22222222222,234567,2222222222222,202401,1,123456
-33333333333,345678,3333333333333,202401,0,345678
+reference,cell_no,auxiliary,period,is_sampled,calibration_group
+11111111111,123456,1111111111111,202401,True,123456
+22222222222,234567,2222222222222,202401,True,123456
+33333333333,345678,3333333333333,202401,False,345678
diff --git a/tests/data/outlier_detection/calculate_predicted_unit_value/predicted_unit_value_data.csv b/tests/data/outlier_detection/calculate_predicted_unit_value/predicted_unit_value_data.csv
index 6b1ecceb..46e1f31c 100755
--- a/tests/data/outlier_detection/calculate_predicted_unit_value/predicted_unit_value_data.csv
+++ b/tests/data/outlier_detection/calculate_predicted_unit_value/predicted_unit_value_data.csv
@@ -1,15 +1,15 @@
-group,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag
-101,202401,10,0,1.666666667,1.023809524,12,False
-101,202401,23,1,1.666666667,1.023809524,20,False
-101,202401,41,1,1.666666667,1.023809524,20,False
-101,202402,53,1,1.666666667,1.023809524,40,False
-101,202401,12,0,1.666666667,1.023809524,10,False
-102,202401,50,1,2.5,1.023809524,60,False
-102,202402,40,1,2.5,1.023809524,50,False
-102,202401,45,0,2.5,1.023809524,50,False
-102,202401,70,0,2.5,1.023809524,60,False
-102,202401,86,0,2.5,1.023809524,90,False
-103,202401,20,0,0.32,0.004,90,True
-103,202401,30,0,0.32,0.004,90,True
-104,202401,20,0,,0.004,90,False
-104,202401,30,0,,0.004,90,False
+group,period,aux,is_census,a_weight,g_weight,target_variable,nw_ag_flag
+101,202401,10,True,1.666666667,1.023809524,12,False
+101,202401,23,False,1.666666667,1.023809524,20,False
+101,202401,41,False,1.666666667,1.023809524,20,False
+101,202402,53,False,1.666666667,1.023809524,40,False
+101,202401,12,True,1.666666667,1.023809524,10,False
+102,202401,50,False,2.5,1.023809524,60,False
+102,202402,40,False,2.5,1.023809524,50,False
+102,202401,45,True,2.5,1.023809524,50,False
+102,202401,70,True,2.5,1.023809524,60,False
+102,202401,86,True,2.5,1.023809524,90,False
+103,202401,20,True,0.32,0.004,90,True
+103,202401,30,True,0.32,0.004,90,True
+104,202401,20,True,,0.004,90,False
+104,202401,30,True,,0.004,90,False
diff --git a/tests/data/outlier_detection/calculate_predicted_unit_value/predicted_unit_value_output.csv b/tests/data/outlier_detection/calculate_predicted_unit_value/predicted_unit_value_output.csv
index 8edf80d8..f9e1477c 100755
--- a/tests/data/outlier_detection/calculate_predicted_unit_value/predicted_unit_value_output.csv
+++ b/tests/data/outlier_detection/calculate_predicted_unit_value/predicted_unit_value_output.csv
@@ -1,15 +1,15 @@
-group,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value
-101,202401,10,0,1.666666667,1.023809524,12,False,
-101,202401,23,1,1.666666667,1.023809524,20,False,14.375
-101,202401,41,1,1.666666667,1.023809524,20,False,25.625
-101,202402,53,1,1.666666667,1.023809524,40,False,40
-101,202401,12,0,1.666666667,1.023809524,10,False,
-102,202401,50,1,2.5,1.023809524,60,False,60
-102,202402,40,1,2.5,1.023809524,50,False,50
-102,202401,45,0,2.5,1.023809524,50,False,
-102,202401,70,0,2.5,1.023809524,60,False,
-102,202401,86,0,2.5,1.023809524,90,False,
-103,202401,20,0,0.32,0.004,90,True,
-103,202401,30,0,0.32,0.004,90,True,
-104,202401,20,0,,0.004,90,False,
-104,202401,30,0,,0.004,90,False,
+group,period,aux,is_census,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value
+101,202401,10,True,1.666666667,1.023809524,12,False,
+101,202401,23,False,1.666666667,1.023809524,20,False,14.375
+101,202401,41,False,1.666666667,1.023809524,20,False,25.625
+101,202402,53,False,1.666666667,1.023809524,40,False,40
+101,202401,12,True,1.666666667,1.023809524,10,False,
+102,202401,50,False,2.5,1.023809524,60,False,60
+102,202402,40,False,2.5,1.023809524,50,False,50
+102,202401,45,True,2.5,1.023809524,50,False,
+102,202401,70,True,2.5,1.023809524,60,False,
+102,202401,86,True,2.5,1.023809524,90,False,
+103,202401,20,True,0.32,0.004,90,True,
+103,202401,30,True,0.32,0.004,90,True,
+104,202401,20,True,,0.004,90,False,
+104,202401,30,True,,0.004,90,False,
diff --git a/tests/data/outlier_detection/calculate_ratio_estimation/ratio_estimation_data.csv b/tests/data/outlier_detection/calculate_ratio_estimation/ratio_estimation_data.csv
index 10b3c9b8..5bd7e0ca 100755
--- a/tests/data/outlier_detection/calculate_ratio_estimation/ratio_estimation_data.csv
+++ b/tests/data/outlier_detection/calculate_ratio_estimation/ratio_estimation_data.csv
@@ -1,13 +1,13 @@
-group,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_value
-101,202401,10,0,1.666666667,1.023809524,12,False,,
-101,202401,23,1,1.666666667,1.023809524,20,False,14.375,0.5
-101,202401,41,1,1.666666667,1.023809524,20,False,25.625,0.5
-101,202402,53,1,1.666666667,1.023809524,40,False,40,0.5
-101,202401,12,0,1.666666667,1.023809524,10,False,,
-102,202401,50,1,2.5,1.023809524,60,False,60,0.5
-102,202402,40,1,2.5,1.023809524,50,False,50,0.5
-102,202401,45,0,2.5,1.023809524,50,False,,
-102,202401,70,0,2.5,1.023809524,60,False,,
-102,202401,86,0,2.5,1.023809524,90,False,,
-104,202401,20,0,,0.004,90,False,,
-104,202401,30,0,,0.004,90,False,,
+group,period,aux,is_census,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_value
+101,202401,10,True,1.666666667,1.023809524,12,False,,
+101,202401,23,False,1.666666667,1.023809524,20,False,14.375,0.5
+101,202401,41,False,1.666666667,1.023809524,20,False,25.625,0.5
+101,202402,53,False,1.666666667,1.023809524,40,False,40,0.5
+101,202401,12,True,1.666666667,1.023809524,10,False,,
+102,202401,50,False,2.5,1.023809524,60,False,60,0.5
+102,202402,40,False,2.5,1.023809524,50,False,50,0.5
+102,202401,45,True,2.5,1.023809524,50,False,,
+102,202401,70,True,2.5,1.023809524,60,False,,
+102,202401,86,True,2.5,1.023809524,90,False,,
+104,202401,20,True,,0.004,90,False,,
+104,202401,30,True,,0.004,90,False,,
diff --git a/tests/data/outlier_detection/calculate_ratio_estimation/ratio_estimation_data_output.csv b/tests/data/outlier_detection/calculate_ratio_estimation/ratio_estimation_data_output.csv
index eccf0587..66d3d6f3 100755
--- a/tests/data/outlier_detection/calculate_ratio_estimation/ratio_estimation_data_output.csv
+++ b/tests/data/outlier_detection/calculate_ratio_estimation/ratio_estimation_data_output.csv
@@ -1,13 +1,13 @@
-group,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_value,ratio_estimation_treshold
-101,202401,10,0,1.666666667,1.023809524,12,False,,,
-101,202401,23,1,1.666666667,1.023809524,20,False,14.375,0.5,15.0828652
-101,202401,41,1,1.666666667,1.023809524,20,False,25.625,0.5,26.3328652
-101,202402,53,1,1.666666667,1.023809524,40,False,40,0.5,40.7078652
-101,202401,12,0,1.666666667,1.023809524,10,False,,,
-102,202401,50,1,2.5,1.023809524,60,False,60,0.5,60.3206107
-102,202402,40,1,2.5,1.023809524,50,False,50,0.5,50.3206107
-102,202401,45,0,2.5,1.023809524,50,False,,,
-102,202401,70,0,2.5,1.023809524,60,False,,,
-102,202401,86,0,2.5,1.023809524,90,False,,,
-104,202401,20,0,,0.004,90,False,,,
-104,202401,30,0,,0.004,90,False,,,
+group,period,aux,is_census,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_value,ratio_estimation_treshold
+101,202401,10,True,1.666666667,1.023809524,12,False,,,
+101,202401,23,False,1.666666667,1.023809524,20,False,14.375,0.5,15.0828652
+101,202401,41,False,1.666666667,1.023809524,20,False,25.625,0.5,26.3328652
+101,202402,53,False,1.666666667,1.023809524,40,False,40,0.5,40.7078652
+101,202401,12,True,1.666666667,1.023809524,10,False,,,
+102,202401,50,False,2.5,1.023809524,60,False,60,0.5,60.3206107
+102,202402,40,False,2.5,1.023809524,50,False,50,0.5,50.3206107
+102,202401,45,True,2.5,1.023809524,50,False,,,
+102,202401,70,True,2.5,1.023809524,60,False,,,
+102,202401,86,True,2.5,1.023809524,90,False,,,
+104,202401,20,True,,0.004,90,False,,,
+104,202401,30,True,,0.004,90,False,,,
diff --git a/tests/data/outlier_detection/calculate_winsorised_weight/winsorised_weight_data.csv b/tests/data/outlier_detection/calculate_winsorised_weight/winsorised_weight_data.csv
index 99f54ee9..1d57f86c 100755
--- a/tests/data/outlier_detection/calculate_winsorised_weight/winsorised_weight_data.csv
+++ b/tests/data/outlier_detection/calculate_winsorised_weight/winsorised_weight_data.csv
@@ -1,15 +1,15 @@
-group,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_value,ratio_estimation_treshold
-101,202401,10,0,1.666666667,1.023809524,12,False,,,
-101,202401,23,1,1.666666667,1.023809524,20,False,14.375,0.5,15.0828652
-101,202401,41,1,1.666666667,1.023809524,20,False,25.625,0.5,26.3328652
-101,202402,53,1,1.666666667,1.023809524,40,False,40,0.5,40.7078652
-101,202401,12,0,1.666666667,1.023809524,10,False,,,
-102,202401,50,1,2.5,1.023809524,60,False,60,0.5,60.3206107
-102,202402,40,1,2.5,1.023809524,50,False,50,0.5,50.3206107
-102,202401,45,0,2.5,1.023809524,50,False,,,
-102,202401,70,0,2.5,1.023809524,60,False,,,
-102,202401,86,0,2.5,1.023809524,90,False,,,
-104,202401,20,0,,0.004,90,False,,,
-104,202401,30,0,,0.004,90,False,,,
-104,202401,30,1,1,0.004,90,True,,,
-104,202402,30,1,1,2,0,False,0,,
+group,period,aux,is_census,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_value,ratio_estimation_treshold
+101,202401,10,True,1.666666667,1.023809524,12,False,,,
+101,202401,23,False,1.666666667,1.023809524,20,False,14.375,0.5,15.0828652
+101,202401,41,False,1.666666667,1.023809524,20,False,25.625,0.5,26.3328652
+101,202402,53,False,1.666666667,1.023809524,40,False,40,0.5,40.7078652
+101,202401,12,True,1.666666667,1.023809524,10,False,,,
+102,202401,50,False,2.5,1.023809524,60,False,60,0.5,60.3206107
+102,202402,40,False,2.5,1.023809524,50,False,50,0.5,50.3206107
+102,202401,45,True,2.5,1.023809524,50,False,,,
+102,202401,70,True,2.5,1.023809524,60,False,,,
+102,202401,86,True,2.5,1.023809524,90,False,,,
+104,202401,20,True,,0.004,90,False,,,
+104,202401,30,True,,0.004,90,False,,,
+104,202401,30,False,1,0.004,90,True,,,
+104,202402,30,False,1,2,0,False,0,,
diff --git a/tests/data/outlier_detection/calculate_winsorised_weight/winsorised_weight_data_output.csv b/tests/data/outlier_detection/calculate_winsorised_weight/winsorised_weight_data_output.csv
index 5434f2ca..1fcf782f 100755
--- a/tests/data/outlier_detection/calculate_winsorised_weight/winsorised_weight_data_output.csv
+++ b/tests/data/outlier_detection/calculate_winsorised_weight/winsorised_weight_data_output.csv
@@ -1,15 +1,15 @@
-group,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_value,ratio_estimation_treshold,new_target_variable,outlier_weight
-101,202401,10,0,1.666666667,1.023809524,12,False,,,,,1
-101,202401,23,1,1.666666667,1.023809524,20,False,14.375,0.5,15.0828652,17.96453488,0.898227
-101,202401,41,1,1.666666667,1.023809524,20,False,25.625,0.5,26.3328652,20,1
-101,202402,53,1,1.666666667,1.023809524,40,False,40,0.5,40.7078652,40,1
-101,202401,12,0,1.666666667,1.023809524,10,False,,,,,1
-102,202401,50,1,2.5,1.023809524,60,False,60,0.5,60.3206107,60,1
-102,202402,40,1,2.5,1.023809524,50,False,50,0.5,50.3206107,50,1
-102,202401,45,0,2.5,1.023809524,50,False,,,,,1
-102,202401,70,0,2.5,1.023809524,60,False,,,,,1
-102,202401,86,0,2.5,1.023809524,90,False,,,,,1
-104,202401,20,0,,0.004,90,False,,,,,1
-104,202401,30,0,,0.004,90,False,,,,,1
-104,202401,30,1,1,0.004,90,True,,,,,1
-104,202402,30,1,1,2,0,False,0,,,,1
+group,period,aux,is_census,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_value,ratio_estimation_treshold,new_target_variable,outlier_weight
+101,202401,10,True,1.666666667,1.023809524,12,False,,,,,1
+101,202401,23,False,1.666666667,1.023809524,20,False,14.375,0.5,15.0828652,17.96453488,0.898227
+101,202401,41,False,1.666666667,1.023809524,20,False,25.625,0.5,26.3328652,20,1
+101,202402,53,False,1.666666667,1.023809524,40,False,40,0.5,40.7078652,40,1
+101,202401,12,True,1.666666667,1.023809524,10,False,,,,,1
+102,202401,50,False,2.5,1.023809524,60,False,60,0.5,60.3206107,60,1
+102,202402,40,False,2.5,1.023809524,50,False,50,0.5,50.3206107,50,1
+102,202401,45,True,2.5,1.023809524,50,False,,,,,1
+102,202401,70,True,2.5,1.023809524,60,False,,,,,1
+102,202401,86,True,2.5,1.023809524,90,False,,,,,1
+104,202401,20,True,,0.004,90,False,,,,,1
+104,202401,30,True,,0.004,90,False,,,,,1
+104,202401,30,False,1,0.004,90,True,,,,,1
+104,202402,30,False,1,2,0,False,0,,,,1
diff --git a/tests/data/outlier_detection/test_winsorisation/winsorised_weight_data_output.csv b/tests/data/outlier_detection/test_winsorisation/winsorised_weight_data_output.csv
index 5434f2ca..1fcf782f 100755
--- a/tests/data/outlier_detection/test_winsorisation/winsorised_weight_data_output.csv
+++ b/tests/data/outlier_detection/test_winsorisation/winsorised_weight_data_output.csv
@@ -1,15 +1,15 @@
-group,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_value,ratio_estimation_treshold,new_target_variable,outlier_weight
-101,202401,10,0,1.666666667,1.023809524,12,False,,,,,1
-101,202401,23,1,1.666666667,1.023809524,20,False,14.375,0.5,15.0828652,17.96453488,0.898227
-101,202401,41,1,1.666666667,1.023809524,20,False,25.625,0.5,26.3328652,20,1
-101,202402,53,1,1.666666667,1.023809524,40,False,40,0.5,40.7078652,40,1
-101,202401,12,0,1.666666667,1.023809524,10,False,,,,,1
-102,202401,50,1,2.5,1.023809524,60,False,60,0.5,60.3206107,60,1
-102,202402,40,1,2.5,1.023809524,50,False,50,0.5,50.3206107,50,1
-102,202401,45,0,2.5,1.023809524,50,False,,,,,1
-102,202401,70,0,2.5,1.023809524,60,False,,,,,1
-102,202401,86,0,2.5,1.023809524,90,False,,,,,1
-104,202401,20,0,,0.004,90,False,,,,,1
-104,202401,30,0,,0.004,90,False,,,,,1
-104,202401,30,1,1,0.004,90,True,,,,,1
-104,202402,30,1,1,2,0,False,0,,,,1
+group,period,aux,is_census,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_value,ratio_estimation_treshold,new_target_variable,outlier_weight
+101,202401,10,True,1.666666667,1.023809524,12,False,,,,,1
+101,202401,23,False,1.666666667,1.023809524,20,False,14.375,0.5,15.0828652,17.96453488,0.898227
+101,202401,41,False,1.666666667,1.023809524,20,False,25.625,0.5,26.3328652,20,1
+101,202402,53,False,1.666666667,1.023809524,40,False,40,0.5,40.7078652,40,1
+101,202401,12,True,1.666666667,1.023809524,10,False,,,,,1
+102,202401,50,False,2.5,1.023809524,60,False,60,0.5,60.3206107,60,1
+102,202402,40,False,2.5,1.023809524,50,False,50,0.5,50.3206107,50,1
+102,202401,45,True,2.5,1.023809524,50,False,,,,,1
+102,202401,70,True,2.5,1.023809524,60,False,,,,,1
+102,202401,86,True,2.5,1.023809524,90,False,,,,,1
+104,202401,20,True,,0.004,90,False,,,,,1
+104,202401,30,True,,0.004,90,False,,,,,1
+104,202401,30,False,1,0.004,90,True,,,,,1
+104,202402,30,False,1,2,0,False,0,,,,1
diff --git a/tests/estimation/test_create_population_counts.py b/tests/estimation/test_create_population_counts.py
new file mode 100644
index 00000000..49bb66c7
--- /dev/null
+++ b/tests/estimation/test_create_population_counts.py
@@ -0,0 +1,61 @@
+import pandas as pd
+
+from mbs_results.estimation.create_population_counts import (
+    calculate_turnover_sum_count,
+    create_population_count_output,
+)
+
+
+def test_calculate_turnover_sum_count():
+    # Creating input data
+    df = pd.DataFrame(
+        {
+            "frotover": [1, 2, 3, 4, 5],
+            "reference": [1, 1, 2, 2, 3],
+            "period": [1, 1, 1, 1, 1],
+            "strata": ["A", "A", "B", "B", "C"],
+        }
+    )
+
+    # producing output
+    output = calculate_turnover_sum_count(df, "period", "strata", "population")
+
+    # creating expected output
+    expected_output = pd.DataFrame(
+        {
+            "period": [1, 1, 1],
+            "strata": ["A", "B", "C"],
+            "population_turnover_sum": [3, 7, 5],
+            "population_count": [2, 2, 1],
+        }
+    )
+    pd.testing.assert_frame_equal(output, expected_output)
+
+
+def test_create_population_count_output():
+    # Creating input data
+    df = pd.DataFrame(
+        {
+            "frotover": [1, 2, 3, 4, 5],
+            "reference": [1, 1, 2, 2, 3],
+            "period": [1, 1, 1, 1, 1],
+            "strata": ["A", "A", "B", "B", "C"],
+            "is_sampled": [True, False, True, False, True],
+        }
+    )
+
+    # producing output
+    output = create_population_count_output(df, "period", "strata")
+
+    # creating expected output
+    expected_output = pd.DataFrame(
+        {
+            "period": [1, 1, 1],
+            "strata": ["A", "B", "C"],
+            "population_turnover_sum": [3, 7, 5],
+            "population_count": [2, 2, 1],
+            "sample_turnover_sum": [1, 3, 5],
+            "sample_count": [1, 1, 1],
+        }
+    )
+    pd.testing.assert_frame_equal(output, expected_output)
diff --git a/tests/estimation/test_pre_processing_estimation.py b/tests/estimation/test_pre_processing_estimation.py
index 0d67f564..9fd2b103 100644
--- a/tests/estimation/test_pre_processing_estimation.py
+++ b/tests/estimation/test_pre_processing_estimation.py
@@ -26,10 +26,10 @@ def test_derive_estimation_variables(self, derive_estimation_variables_data):
                 "cell_no",
                 "calibration_group",
                 "auxiliary",
-                "sampled",
+                "is_sampled",
             ]
         ]
-        population_frame = expected.drop(columns=["calibration_group", "sampled"])
+        population_frame = expected.drop(columns=["calibration_group", "is_sampled"])
         sample = population_frame.loc[:1, ["reference", "period"]]
 
         calibration_group_map = expected[["cell_no", "calibration_group"]]
diff --git a/tests/outlier_detection/test_calculate_predicted_unit_value.py b/tests/outlier_detection/test_calculate_predicted_unit_value.py
index e66b74f0..44580147 100644
--- a/tests/outlier_detection/test_calculate_predicted_unit_value.py
+++ b/tests/outlier_detection/test_calculate_predicted_unit_value.py
@@ -44,7 +44,7 @@ def test_calculate_predicted_unit_value(
                 "group",
                 "period",
                 "aux",
-                "sampled",
+                "is_census",
                 "a_weight",
                 "target_variable",
                 "nw_ag_flag",
@@ -56,7 +56,7 @@ def test_calculate_predicted_unit_value(
                 "group",
                 "period",
                 "aux",
-                "sampled",
+                "is_census",
                 "a_weight",
                 "target_variable",
                 "nw_ag_flag",
@@ -68,7 +68,7 @@ def test_calculate_predicted_unit_value(
             "group",
             "period",
             "aux",
-            "sampled",
+            "is_census",
             "a_weight",
             "target_variable",
             "nw_ag_flag",
diff --git a/tests/outlier_detection/test_calculate_ratio_estimation.py b/tests/outlier_detection/test_calculate_ratio_estimation.py
index 9c344146..22a4421b 100644
--- a/tests/outlier_detection/test_calculate_ratio_estimation.py
+++ b/tests/outlier_detection/test_calculate_ratio_estimation.py
@@ -42,7 +42,7 @@ def test_calculate_ratio_estimation(
         expected_output = ratio_estimation_test_output[
             [
                 "aux",
-                "sampled",
+                "is_census",
                 "a_weight",
                 "g_weight",
                 "target_variable",
@@ -56,7 +56,7 @@ def test_calculate_ratio_estimation(
         input_data = ratio_estimation_test_data[
             [
                 "aux",
-                "sampled",
+                "is_census",
                 "a_weight",
                 "g_weight",
                 "target_variable",
@@ -69,7 +69,7 @@ def test_calculate_ratio_estimation(
         actual_output = calculate_ratio_estimation(
             input_data,
             "aux",
-            "sampled",
+            "is_census",
             "a_weight",
             "g_weight",
             "target_variale",
diff --git a/tests/outlier_detection/test_calculate_winsorised_weight.py b/tests/outlier_detection/test_calculate_winsorised_weight.py
index 750c0ad5..de5ad9c6 100644
--- a/tests/outlier_detection/test_calculate_winsorised_weight.py
+++ b/tests/outlier_detection/test_calculate_winsorised_weight.py
@@ -44,12 +44,11 @@ def test_winsorised_weight(
                 "group",
                 "period",
                 "aux",
-                "sampled",
+                "is_census",
                 "a_weight",
                 "g_weight",
                 "target_variable",
                 "predicted_unit_value",
-                "l_value",
                 "ratio_estimation_treshold",
                 "nw_ag_flag",
                 "new_target_variable",
@@ -61,12 +60,11 @@ def test_winsorised_weight(
                 "group",
                 "period",
                 "aux",
-                "sampled",
+                "is_census",
                 "a_weight",
                 "g_weight",
                 "target_variable",
                 "predicted_unit_value",
-                "l_value",
                 "ratio_estimation_treshold",
                 "nw_ag_flag",
             ]
@@ -77,12 +75,11 @@ def test_winsorised_weight(
             "group",
             "period",
             "aux",
-            "sampled",
+            "is_census",
             "a_weight",
             "g_weight",
             "target_variable",
             "predicted_unit_value",
-            "l_value",
             "ratio_estimation_treshold",
             "nw_ag_flag",
         )
diff --git a/tests/outlier_detection/test_winsorisation.py b/tests/outlier_detection/test_winsorisation.py
index 70f0de06..c14c7234 100644
--- a/tests/outlier_detection/test_winsorisation.py
+++ b/tests/outlier_detection/test_winsorisation.py
@@ -28,7 +28,7 @@ def test_winsorised_weight(self, expected_output):
                 "group",
                 "period",
                 "aux",
-                "sampled",
+                "is_census",
                 "a_weight",
                 "g_weight",
                 "target_variable",
@@ -41,7 +41,7 @@ def test_winsorised_weight(self, expected_output):
             "group",
             "period",
             "aux",
-            "sampled",
+            "is_census",
             "a_weight",
             "g_weight",
             "target_variable",