Correct design and calibration values (#108)

* Add reusable function
* Add test data
* Add unit test
* Add TODO to use this function in other parts of the pipeline
ONSdigital · Oct 22, 2024 · 8294415 · 8294415
1 parent cc1cde3
commit 8294415
Show file tree

Hide file tree

Showing 3 changed files with 99 additions and 0 deletions.
diff --git a/mbs_results/utilities/data_cleaning.py b/mbs_results/utilities/data_cleaning.py
@@ -373,3 +373,74 @@ def create_imputation_class(
     )
 
     return df
+
+
+# TODO: Can be used when we set defaults in other parts of the pipeline
+def correct_values(
+    df: pd.DataFrame,
+    columns_to_correct: "List[str] | str",  # `List[str] or str` evaluates to List[str]
+    condition_column: str,
+    condition_values: List[int],
+    replace_with: int,
+) -> pd.DataFrame:
+    """
+    Set values in dataframe column(s) on the rows that meet a condition.
+
+    The correction is applied to a copy of `df`. It is applied only when
+    `condition_column` and every column in `columns_to_correct` already exist
+    in `df`; otherwise the copy is returned unchanged, so that missing columns
+    are never created as a side effect.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Original dataframe to apply the correction.
+    columns_to_correct : List(str) or str
+        Column(s) to set values. Any list-like of column names (list, tuple,
+        pd.Index, ...) or a single column name is accepted.
+    condition_column : str
+        Column for the condition.
+    condition_values : List(int)
+        Values of condition_column which mark the rows to correct.
+    replace_with : int
+        Value to set if condition is met.
+
+    Returns
+    -------
+    df : pd.DataFrame
+        Copy of the input dataframe with values replaced.
+
+    Examples
+    --------
+    >>> df = pd.DataFrame({'a': [0, 1, 2, 3, 4],
+    ...                    'b': [5, 6, 7, 8, 9],
+    ...                    'band': [1, 2, 3, 4, 5]})
+    >>> df
+       a  b  band
+    0  0  5     1
+    1  1  6     2
+    2  2  7     3
+    3  3  8     4
+    4  4  9     5
+
+    >>> df2 = correct_values(df, ["a", "b"], "band", [4, 5], 1)
+    >>> df2
+       a  b  band
+    0  0  5     1
+    1  1  6     2
+    2  2  7     3
+    3  1  1     4
+    4  1  1     5
+    """
+
+    df_temp = df.copy()  # to avoid changing input df
+
+    if pd.api.types.is_list_like(columns_to_correct):
+        # Materialise as a plain list: `tuple + list` raises TypeError and
+        # `pd.Index + list` concatenates the strings element-wise, which made
+        # the existence check below fail silently. A list is also the only
+        # list-like that .loc always reads as "several column labels".
+        target_columns = list(columns_to_correct)
+        check_columns = target_columns + [condition_column]
+    else:
+        target_columns = columns_to_correct
+        check_columns = [columns_to_correct, condition_column]
+
+    # Update value only if columns exist
+    if set(check_columns).issubset(df.columns):
+
+        rows_to_correct = df[condition_column].isin(condition_values)
+
+        df_temp.loc[rows_to_correct, target_columns] = replace_with
+
+    return df_temp
diff --git a/tests/test_correct_values.csv b/tests/test_correct_values.csv
@@ -0,0 +1,10 @@
+band_no,value_1,value_2,value_3,expected_value_1,expected_value_2,expected_value_3
+1,10,,12,10,,12
+2,20,21,,20,21,
+3,,31,32,,31,32
+4,40,41,42,1,1,42
+5,50,51,52,1,1,52
+6,60,61,62,60,61,62
+7,70,62,72,70,62,72
+,80,81,82,80,81,82
+4,,91,92,1,1,92
diff --git a/tests/test_utilities/test_data_cleaning.py b/tests/test_utilities/test_data_cleaning.py
@@ -6,6 +6,7 @@
 
 from mbs_results.utilities.data_cleaning import (
     clean_and_merge,
+    correct_values,
     create_imputation_class,
     enforce_datatypes,
     run_live_or_frozen,
@@ -116,3 +117,20 @@ def test_run_live_or_frozen_exception():
 
     with pytest.raises(ValueError):
         run_live_or_frozen(df, "target", "error", "love")
+
+
+def test_correct_values():
+    """Check correct_values against the fixture in tests/test_correct_values.csv.
+
+    The fixture holds the input columns next to their expected_* counterparts;
+    value_1 and value_2 are corrected to 1 on rows where band_no is 4 or 5,
+    while value_3 is passed through untouched.
+    """
+
+    fixture = pd.read_csv(Path("tests") / "test_correct_values.csv")
+
+    input_columns = ["band_no", "value_1", "value_2", "value_3"]
+    expected_columns = [
+        "band_no",
+        "expected_value_1",
+        "expected_value_2",
+        "expected_value_3",
+    ]
+
+    df_in = fixture[input_columns]
+
+    # Relabel the expected_* columns so both frames share the same header
+    expected_output = fixture[expected_columns].set_axis(df_in.columns, axis=1)
+
+    actual_output = correct_values(
+        df_in,
+        ["value_1", "value_2"],
+        "band_no",
+        [4, 5],
+        1,
+    )
+
+    assert_frame_equal(actual_output, expected_output)