"""Combine qv and cp csv files and export selected periods as a json file."""

import fnmatch
from os import listdir
from os.path import isfile, join

import pandas as pd

from utils import convert_column_to_datetime


def get_patern_df(filepath: str, pattern: str) -> pd.DataFrame:
    """Load as one pd dataframe all csv files whose name matches a pattern.

    Parameters
    ----------
    filepath : str
        Filepath to folder containing desired files.
    pattern : str
        Shell-style wildcard pattern (as understood by ``fnmatch``), for
        example ``"qv*.csv"``, used to select files in the folder by name.

    Returns
    -------
    pd.DataFrame
        Dataframe containing data from all selected files, concatenated in
        sorted filename order with a fresh index.

    Raises
    ------
    ValueError
        If no file in ``filepath`` matches ``pattern``.
    """
    filenames = [
        filename for filename in listdir(filepath) if isfile(join(filepath, filename))
    ]
    # listdir returns entries in arbitrary order; sorting keeps the row order
    # of the concatenated dataframe reproducible between runs and machines.
    filenames = sorted(fnmatch.filter(filenames, pattern))

    if not filenames:
        # pd.concat would also raise ValueError on an empty list, but with a
        # message that does not say which folder or pattern was at fault.
        raise ValueError(f"No files matching {pattern!r} found in {filepath!r}")

    df_list = [pd.read_csv(join(filepath, filename)) for filename in filenames]

    return pd.concat(df_list, ignore_index=True)


def get_qv_and_cp_data(cp_path: str, qv_path: str) -> pd.DataFrame:
    """Read and join qv and cp data.

    Parameters
    ----------
    cp_path : str
        Filepath to folder containing cp data (files named ``cp*.csv``).
    qv_path : str
        Filepath to folder containing qv data (files named ``qv*.csv``).

    Returns
    -------
    pd.DataFrame
        Dataframe containing every qv row, left-joined with the cp data on
        the ``period`` and ``reference`` columns.
    """
    qv_df = get_patern_df(qv_path, "qv*.csv")
    cp_df = get_patern_df(cp_path, "cp*.csv")

    return pd.merge(qv_df, cp_df, how="left", on=["period", "reference"])


def csw_to_spp(
    cp_path: str,
    qv_path: str,
    output_path: str,
    column_map: dict,
    period: str,
    period_range: int,
) -> None:
    """Combine cp and qv files, filter and rename columns, and save as json.

    The output is written to
    ``{output_path}_{YYYYMM of period}_{period_range}.json``.

    Parameters
    ----------
    cp_path : str
        Filepath to folder containing cp data.
    qv_path : str
        Filepath to folder containing qv data.
    output_path : str
        Filepath prefix for the json file; the period and period range are
        appended to it to build the final file name.
    column_map : dict
        Dictionary containing desired columns from qv and cp data as keys and
        their desired names as values.
    period : str
        Date to filter output on (YYYY-MM-DD). This is the latest period kept.
    period_range : int
        Number of months, counting back from and including ``period``, to
        include in the output.
    """
    qv_and_cp = get_qv_and_cp_data(cp_path, qv_path)

    qv_and_cp["period"] = convert_column_to_datetime(qv_and_cp["period"])

    end_period = pd.Timestamp(period)
    # Exclusive lower bound: with period_range=3 and period 2023-03-01 the
    # window keeps 2023-01, 2023-02 and 2023-03.
    window_start = end_period - pd.DateOffset(months=period_range)

    in_window = (qv_and_cp["period"] > window_start) & (
        qv_and_cp["period"] <= end_period
    )
    # Take an explicit copy so the column assignment below modifies an
    # independent dataframe instead of a slice of the merged data.
    qv_and_cp = qv_and_cp.loc[in_window].copy()

    qv_and_cp["period"] = qv_and_cp["period"].dt.strftime("%Y%m")

    qv_and_cp = qv_and_cp[list(column_map)].rename(columns=column_map)

    qv_and_cp.to_json(
        f"{output_path}_{end_period.strftime('%Y%m')}_{period_range}.json"
    )


# Source column name -> output column name, used by the manual run below.
col_mapping = {
    "reference": "reference",
    "period": "period",
    "error_mkr": "status",
    "question_no": "questioncode",
    "returned_value": "response",
    "adjusted_value": "adjustedresponse",
}

# Machine-specific input folder used by the manual run below.
filepath = (
    "C:/Users/daviel9/Office for National Statistics/Legacy Uplift - MBS/"
    "MBS_Anonymised-Adjusted_Responses-Disclosive_Contributor_List_Applied-"
    "20240813T1530Z"
)


if __name__ == "__main__":
    # Manual check: only executed when the file is run as a script, so that
    # importing this module does not read or write the hard-coded locations.
    csw_to_spp(filepath, filepath, "D:/test", col_mapping, "2023-03-01", 3)

    df = pd.read_json("D:/test_202303_3.json")
    print(df.head())
    print(df.tail())