From d42bef9af6a7cda7753202b2df2607c5cbf8dd9f Mon Sep 17 00:00:00 2001
From: daviel9 <Luke.Davies@ons.gov.uk>
Date: Thu, 31 Oct 2024 14:24:50 +0000
Subject: [PATCH] Create function and sub functions

---
 mbs_results/csw_to_spp_converter.py           |  18 ---
 mbs_results/utilities/csw_to_spp_converter.py | 118 ++++++++++++++++++
 2 files changed, 118 insertions(+), 18 deletions(-)
 delete mode 100644 mbs_results/csw_to_spp_converter.py
 create mode 100644 mbs_results/utilities/csw_to_spp_converter.py

diff --git a/mbs_results/csw_to_spp_converter.py b/mbs_results/csw_to_spp_converter.py
deleted file mode 100644
index 67d932bc..00000000
--- a/mbs_results/csw_to_spp_converter.py
+++ /dev/null
@@ -1,18 +0,0 @@
-import glob
-
-import pandas as pd
-
-
-def csw_to_spp(filepath):
-
-    files = glob.glob(filepath + "qv*") + glob.glob(filepath + "cp*")
-
-    li = []
-
-    for f in files:
-
-        temp_df = pd.read_csv(f)
-
-        li.append(temp_df)
-
-        print(f"Successfully created dataframe for {f} with shape {temp_df.shape}")
diff --git a/mbs_results/utilities/csw_to_spp_converter.py b/mbs_results/utilities/csw_to_spp_converter.py
new file mode 100644
index 00000000..38896cf6
--- /dev/null
+++ b/mbs_results/utilities/csw_to_spp_converter.py
@@ -0,0 +1,118 @@
+import fnmatch
+from os import listdir
+from os.path import isfile, join
+import pandas as pd
+
+from utils import convert_column_to_datetime
+
+def get_patern_df(filepath: str, pattern: str) -> pd.DataFrame:
+    """Load every csv file in a folder whose name matches a pattern.
+
+    Parameters
+    ----------
+    filepath : str
+        Filepath to folder containing desired files.
+    pattern : str
+        fnmatch-style wildcard (e.g. "qv*.csv", not a regex) matched on names.
+
+    Returns
+    -------
+    pd.DataFrame
+        Dataframe containing data from all selected files, read in sorted
+        filename order so the row order is reproducible.
+
+    Raises
+    ------
+    ValueError
+        If no file in the folder matches the pattern.
+    """
+    names = [name for name in listdir(filepath) if isfile(join(filepath, name))]
+    matched = sorted(fnmatch.filter(names, pattern))
+    if not matched:
+        raise ValueError(f"No files matching {pattern!r} found in {filepath!r}")
+    frames = [pd.read_csv(join(filepath, name)) for name in matched]
+    return pd.concat(frames, ignore_index=True)
+
+def get_qv_and_cp_data(
+    cp_path: str,
+    qv_path: str,
+) -> pd.DataFrame:
+    """Load the qv and cp csv files and left-join cp onto qv.
+
+    Parameters
+    ----------
+    cp_path : str
+        Folder holding the "cp*.csv" files.
+    qv_path : str
+        Folder holding the "qv*.csv" files.
+
+    Returns
+    -------
+    pd.DataFrame
+        Every qv row, with the cp columns attached where "period" and
+        "reference" match.
+    """
+    # qv is read first, then cp; the join keeps qv on the left.
+    qv_frame = get_patern_df(qv_path, "qv*.csv")
+    cp_frame = get_patern_df(cp_path, "cp*.csv")
+    join_keys = ["period", "reference"]
+
+    return qv_frame.merge(cp_frame, how="left", on=join_keys)
+
+def csw_to_spp(
+    cp_path: str,
+    qv_path: str,
+    output_path: str,
+    column_map: dict,
+    period: str,
+    period_range: int,
+) -> None:
+    """Combines cp and qv files, filters and renames columns based on a mapping, and
+    then saves the output as a json file.
+
+    Parameters
+    ----------
+    cp_path : str
+        Filepath to folder containing cp data.
+    qv_path : str
+        Filepath to folder containing qv data.
+    output_path : str
+        Prefix of the json file to save; "_{YYYYMM}_{period_range}.json" is appended.
+    column_map : dict
+        Dictionary containing desired columns from qv and cp data as keys and their
+        desired names as values.
+    period : str
+        Date to filter output on (YYYY-MM-DD).
+    period_range : int
+        Number of months from the period and previous to include in the output.
+    """
+    qv_and_cp = get_qv_and_cp_data(cp_path, qv_path)
+    qv_and_cp["period"] = convert_column_to_datetime(qv_and_cp["period"])
+    period = pd.Timestamp(period)
+    window_start = period - pd.DateOffset(months=period_range)
+
+    # Copy the filtered rows so the assignment below writes to an independent
+    # frame rather than a slice of the merged data (SettingWithCopyWarning).
+    in_window = (qv_and_cp["period"] > window_start) & (qv_and_cp["period"] <= period)
+    qv_and_cp = qv_and_cp[in_window].copy()
+    qv_and_cp["period"] = qv_and_cp["period"].dt.strftime("%Y%m")
+
+    qv_and_cp = qv_and_cp[list(column_map)].rename(columns=column_map)
+    qv_and_cp.to_json(f"{output_path}_{period.strftime('%Y%m')}_{period_range}.json")
+
+col_mapping = {
+    "reference": "reference",
+    "period": "period",
+    "error_mkr": "status",
+    "question_no": "questioncode",
+    "returned_value": "response",
+    "adjusted_value": "adjustedresponse",
+}
+
+# Manual run against local data; guarded so importing this module has no side effects.
+if __name__ == "__main__":
+    filepath = "C:/Users/daviel9/Office for National Statistics/Legacy Uplift - MBS/MBS_Anonymised-Adjusted_Responses-Disclosive_Contributor_List_Applied-20240813T1530Z"
+    csw_to_spp(filepath, filepath, "D:/test", col_mapping, "2023-03-01", 3)
+    df = pd.read_json("D:/test_202303_3.json")
+    print(df.head())
+    print(df.tail())