Skip to content

Commit

Permalink
Correct design and calibration values (#108)
Browse files Browse the repository at this point in the history
*Add reusable function
*Add test data
*Add unit test
*Add TODO to use this function in other parts of the pipeline
  • Loading branch information
AntonZogk authored Oct 22, 2024
1 parent cc1cde3 commit 8294415
Show file tree
Hide file tree
Showing 3 changed files with 99 additions and 0 deletions.
71 changes: 71 additions & 0 deletions mbs_results/utilities/data_cleaning.py
Original file line number Diff line number Diff line change
Expand Up @@ -373,3 +373,74 @@ def create_imputation_class(
)

return df


# TODO: Use this function wherever default values are set in other parts of the pipeline
def correct_values(
    df: pd.DataFrame,
    # NOTE(review): `List[str] or str` evaluates to just `List[str]` at
    # definition time; `Union[List[str], str]` is what is meant. Left as is
    # because the module's typing imports are not visible from this block.
    columns_to_correct: List[str] or str,
    condition_column: str,
    condition_values: List[int],
    replace_with: int,
) -> pd.DataFrame:
    """
    Set values in one or more columns for the rows that meet a condition.

    Rows whose ``condition_column`` value is in ``condition_values`` get
    ``replace_with`` written to every column in ``columns_to_correct``.
    The columns are checked for existence first, so that the assignment
    never creates a new column; if any of them (or ``condition_column``)
    is missing, an unmodified copy of ``df`` is returned.

    Parameters
    ----------
    df : pd.DataFrame
        Original dataframe to apply the correction. It is not modified.
    columns_to_correct : List(str) or str
        Column(s) to set values. Any list-like of column names (list,
        tuple, ``pd.Index``, ...) or a single column name is accepted.
    condition_column : str
        Column for the condition.
    condition_values : List(int)
        Values which exist in condition_column.
    replace_with : int
        Value to set if condition is met.

    Returns
    -------
    df : pd.DataFrame
        Copy of the dataframe with values replaced.

    Examples
    --------
    >>> df = pd.DataFrame({'a': [0, 1, 2, 3, 4],
    ...                    'b': [5, 6, 7, 8, 9],
    ...                    'band': [1, 2, 3, 4, 5]})
    >>> df
       a  b  band
    0  0  5     1
    1  1  6     2
    2  2  7     3
    3  3  8     4
    4  4  9     5
    >>> df2 = correct_values(df, ["a", "b"], "band", [4, 5], 1)
    >>> df2
       a  b  band
    0  0  5     1
    1  1  6     2
    2  2  7     3
    3  1  1     4
    4  1  1     5
    """

    df_temp = df.copy()  # to avoid changing input df

    # Normalise to a plain list once. Concatenating a tuple / Index with a
    # list either raises or adds element-wise, and a tuple handed to .loc
    # would be read as a single key rather than several column labels.
    if pd.api.types.is_list_like(columns_to_correct):
        target_columns = list(columns_to_correct)
    else:
        target_columns = [columns_to_correct]

    required_columns = target_columns + [condition_column]

    # Update values only if every column exists; otherwise .loc assignment
    # would silently create the missing ones.
    if not set(required_columns).issubset(df.columns):
        return df_temp

    rows_to_correct = df[condition_column].isin(condition_values)
    df_temp.loc[rows_to_correct, target_columns] = replace_with

    return df_temp
10 changes: 10 additions & 0 deletions tests/test_correct_values.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
band_no,value_1,value_2,value_3,expected_value_1,expected_value_2,expected_value_3
1,10,,12,10,,12
2,20,21,,20,21,
3,,31,32,,31,32
4,40,41,42,1,1,42
5,50,51,52,1,1,52
6,60,61,62,60,61,62
7,70,62,72,70,62,72
,80,81,82,80,81,82
4,,91,92,1,1,92
18 changes: 18 additions & 0 deletions tests/test_utilities/test_data_cleaning.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from mbs_results.utilities.data_cleaning import (
clean_and_merge,
correct_values,
create_imputation_class,
enforce_datatypes,
run_live_or_frozen,
Expand Down Expand Up @@ -116,3 +117,20 @@ def test_run_live_or_frozen_exception():

with pytest.raises(ValueError):
run_live_or_frozen(df, "target", "error", "love")


def test_correct_values():
    """Check correct_values against the input/expected columns of the CSV fixture."""

    input_columns = ["band_no", "value_1", "value_2", "value_3"]
    expected_columns = [
        "band_no",
        "expected_value_1",
        "expected_value_2",
        "expected_value_3",
    ]

    # The fixture holds the inputs and the expected results side by side.
    fixture = pd.read_csv(Path("tests") / "test_correct_values.csv")

    df_in = fixture[input_columns]

    # Give the expected columns the input names so the frames are comparable.
    expected_output = fixture[expected_columns].rename(
        columns=dict(zip(expected_columns, input_columns))
    )

    actual_output = correct_values(
        df_in,
        ["value_1", "value_2"],
        "band_no",
        [4, 5],
        1,
    )

    assert_frame_equal(actual_output, expected_output)

0 comments on commit 8294415

Please sign in to comment.