-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
2 changed files
with
118 additions
and
18 deletions.
There are no files selected for viewing
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,118 @@ | ||
import fnmatch | ||
from os import listdir | ||
from os.path import isfile, join | ||
import pandas as pd | ||
|
||
from utils import convert_column_to_datetime | ||
|
||
def get_patern_df(
    filepath: str,
    pattern: str
) -> pd.DataFrame:
    """Load every csv file in a folder whose name matches a pattern.

    Parameters
    ----------
    filepath : str
        Filepath to folder containing desired files.
    pattern : str
        Shell-style wildcard pattern (``fnmatch`` syntax, e.g. ``"qv*.csv"``)
        used to filter files in the folder based on name. This is not a
        regular expression.

    Returns
    -------
    pd.DataFrame
        Dataframe containing data from all selected files, stacked in
        filename order with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If no file in the folder matches the pattern.
    """
    # Sorted so the row order of the result does not depend on the arbitrary
    # order in which the operating system lists the directory.
    filenames = sorted(
        filename
        for filename in fnmatch.filter(listdir(filepath), pattern)
        if isfile(join(filepath, filename))
    )

    # pd.concat raises an unhelpful "No objects to concatenate" on an empty
    # list; fail with the same exception type but say what was searched for.
    if not filenames:
        raise ValueError(
            f"No files matching {pattern!r} found in {filepath!r}"
        )

    df_list = [pd.read_csv(join(filepath, filename)) for filename in filenames]
    return pd.concat(df_list, ignore_index=True)
|
||
def get_qv_and_cp_data(
    cp_path: str,
    qv_path: str,
) -> pd.DataFrame:
    """Load the qv and cp csv files and left-join cp onto qv.

    Parameters
    ----------
    cp_path : str
        Filepath to folder containing the ``cp*.csv`` files.
    qv_path : str
        Filepath to folder containing the ``qv*.csv`` files.

    Returns
    -------
    pd.DataFrame
        Every qv row, with the cp columns attached where ``period`` and
        ``reference`` match.
    """
    join_keys = ["period", "reference"]

    # qv is read first and is the left-hand side of the join, so all of its
    # rows survive even when there is no matching cp row.
    responses = get_patern_df(qv_path, "qv*.csv")
    contributors = get_patern_df(cp_path, "cp*.csv")

    return responses.merge(contributors, how="left", on=join_keys)
|
||
def csw_to_spp(
    cp_path: str,
    qv_path: str,
    output_path: str,
    column_map: dict,
    period: str,
    period_range: int
) -> None:
    """Combine cp and qv files, filter to a period window, select and rename
    columns based on a mapping, and save the output as a json file.

    Parameters
    ----------
    cp_path : str
        Filepath to folder containing cp data.
    qv_path : str
        Filepath to folder containing qv data.
    output_path : str
        Prefix of the json file to write. The file is saved as
        ``{output_path}_{YYYYMM}_{period_range}.json``, where ``YYYYMM`` is
        taken from ``period``.
    column_map : dict
        Dictionary containing desired columns from qv and cp data as keys and
        their desired names as values.
    period : str
        Date to filter output on (YYYY-MM-DD).
    period_range : int
        Number of months, counting back from and including ``period``, to
        include in the output.
    """
    qv_and_cp = get_qv_and_cp_data(cp_path, qv_path)

    # NOTE(review): convert_column_to_datetime is defined in utils; presumably
    # it returns a datetime64 series (the .dt accessor below relies on it).
    qv_and_cp["period"] = convert_column_to_datetime(qv_and_cp["period"])

    period = pd.Timestamp(period)

    # Half-open window (period - period_range months, period].
    window_start = period - pd.DateOffset(months=period_range)
    in_window = (qv_and_cp["period"] > window_start) & (qv_and_cp["period"] <= period)

    # Take an explicit copy: assigning a column on a boolean-mask slice
    # triggers pandas' SettingWithCopyWarning.
    qv_and_cp = qv_and_cp.loc[in_window].copy()
    qv_and_cp["period"] = qv_and_cp["period"].dt.strftime("%Y%m")

    # A plain list (not a dict_keys view) is the documented indexer type.
    qv_and_cp = qv_and_cp[list(column_map)].rename(columns=column_map)

    qv_and_cp.to_json(f"{output_path}_{period.strftime('%Y%m')}_{period_range}.json")
|
||
# Source column name -> name used in the json output.
col_mapping = dict(
    reference="reference",
    period="period",
    error_mkr="status",
    question_no="questioncode",
    returned_value="response",
    adjusted_value="adjustedresponse",
)

# Folder passed as both the cp and the qv location in the call below.
filepath = (
    "C:/Users/daviel9/Office for National Statistics/Legacy Uplift - MBS/"
    "MBS_Anonymised-Adjusted_Responses-Disclosive_Contributor_List_Applied-20240813T1530Z"
)

# Writes D:/test_202303_3.json (prefix + period as YYYYMM + period_range).
csw_to_spp(
    cp_path=filepath,
    qv_path=filepath,
    output_path="D:/test",
    column_map=col_mapping,
    period="2023-03-01",
    period_range=3,
)

# Read the file back and print its first and last rows as a quick check.
df = pd.read_json("D:/test_202303_3.json")
for preview in (df.head(), df.tail()):
    print(preview)