Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

683 filter out questions in staging #133

Merged
merged 5 commits into from
Dec 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion mbs_results/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,6 @@

"census_extra_calibration_group": [5043, 5113, 5123, 5203, 5233,
5403, 5643, 5763, 5783, 5903, 6073],
"filter_out_questions": [11, 12 ,146]

"calibration_group" : "calibration_group"
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was flagged by the hooks and is not related to this task: `calibration_group` was defined twice in config.json, so I removed the second definition to make the hooks pass.

}
40 changes: 40 additions & 0 deletions mbs_results/staging/data_cleaning.py
Original file line number Diff line number Diff line change
Expand Up @@ -408,3 +408,43 @@ def is_census(calibration_group: pd.Series, extra_bands: List) -> pd.Series:
rule_extra_bands = calibration_group.isin(extra_bands)

return rule_band_4_5 | rule_extra_bands


def filter_out_questions(
    df: pd.DataFrame, column: str, questions_to_filter: List[int], save_full_path: str
) -> pd.DataFrame:
    """
    Remove the questions listed in `questions_to_filter` from `df`.

    The removed rows are saved as a csv file at `save_full_path`.

    Parameters
    ----------
    df : pd.DataFrame
        Original dataframe.
    column : str
        Column name to search for questions.
    questions_to_filter : List[int]
        List of questions to remove.
    save_full_path : str
        Full path to save removed values, e.g. `folder1/folder2/mydata.csv`.

    Returns
    -------
    keep_questions_df : pd.DataFrame
        Original dataframe without the `questions_to_filter` questions, with
        the index reset to start from 0.

    Raises
    ------
    ValueError
        If `save_full_path` does not end with ".csv".
    """
    # Validate the path first so a bad path fails before any data is touched.
    if not save_full_path.endswith(".csv"):
        raise ValueError(
            "Function argument {} is not a csv file.".format(save_full_path)
        )

    # Build the membership mask once and reuse it for both halves of the split.
    is_filtered_question = df[column].isin(questions_to_filter)

    df[is_filtered_question].to_csv(save_full_path, index=False)

    # Return a new frame instead of resetting the index in place on the
    # result of a boolean selection.
    return df[~is_filtered_question].reset_index(drop=True)
15 changes: 14 additions & 1 deletion mbs_results/staging/stage_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,11 @@
import pandas as pd

from mbs_results.staging.create_missing_questions import create_missing_questions
from mbs_results.staging.data_cleaning import enforce_datatypes, run_live_or_frozen
from mbs_results.staging.data_cleaning import (
enforce_datatypes,
filter_out_questions,
run_live_or_frozen,
)
from mbs_results.staging.dfs_from_spp import get_dfs_from_spp
from mbs_results.utilities.utils import read_colon_separated_file

Expand Down Expand Up @@ -163,6 +167,15 @@ def stage_dataframe(config: dict) -> pd.DataFrame:
contributors, on=[reference, period], suffixes=["_res", "_con"], how="left"
)

df = filter_out_questions(
df=df,
column=config["question_no"],
questions_to_filter=config["filter_out_questions"],
save_full_path=config["output_path"]
+ config["mbs_file_name"]
+ "filter_out_questions.csv",
)

warnings.warn("add live or frozen after fixing error marker column in config")
df = run_live_or_frozen(
df,
Expand Down
11 changes: 11 additions & 0 deletions tests/data/staging/data_cleaning/test_filter_out_questions.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
question,values
11,10000
42,1001
43,1002
46,1003
12,1004
47,1005
90,1006
40,1007
49,1008
146,1009
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
question,values
42,1001
43,1002
46,1003
47,1005
90,1006
40,1007
49,1008
27 changes: 27 additions & 0 deletions tests/staging/test_data_cleaning.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from pathlib import Path
from unittest.mock import patch

import pandas as pd
import pytest
Expand All @@ -8,6 +9,7 @@
clean_and_merge,
create_imputation_class,
enforce_datatypes,
filter_out_questions,
is_census,
run_live_or_frozen,
)
Expand Down Expand Up @@ -162,3 +164,28 @@ def test_is_census(filepath):
actual_output.name = "is_census"

assert_series_equal(actual_output, expected_output)


@patch("pandas.DataFrame.to_csv")  # mock pandas export csv function
def test_filter_out_questions(mock_to_csv, filepath):
    """Filtered questions are dropped from the output and exported once."""
    export_path = "export.csv"

    input_df = pd.read_csv(filepath / "test_filter_out_questions.csv")
    expected_df = pd.read_csv(filepath / "test_filter_out_questions_expected.csv")

    result_df = filter_out_questions(input_df, "question", [11, 12, 146], export_path)

    # The export must happen exactly once, to the given path, without the index.
    mock_to_csv.assert_called_once_with(export_path, index=False)

    assert_frame_equal(result_df, expected_df)


def test_filter_out_questions_save_full_path_errror():
    """A save path that is not a csv file must raise ValueError."""
    non_csv_path = "export.txt"

    with pytest.raises(ValueError):
        filter_out_questions("dummy", "dummy", "dummy", non_csv_path)
Loading