From 9a8489da5b2b2fa45a5ab3cf67c67800c4bb6063 Mon Sep 17 00:00:00 2001
From: Struan Donald
Date: Thu, 7 Jul 2022 12:27:58 +0100
Subject: [PATCH 1/2] add local authority data utility functions

functions for:

* matching council name to local authority code
* adding region and county based on local authority code
* adding gss code based on local authority code
* adding type, mapit code etc

These all take a pandas dataframe as an argument and return the same.
---
 src/data_common/local_authority/__init__.py | 112 ++++++++++++++++++++
 1 file changed, 112 insertions(+)
 create mode 100644 src/data_common/local_authority/__init__.py

diff --git a/src/data_common/local_authority/__init__.py b/src/data_common/local_authority/__init__.py
new file mode 100644
index 0000000..d28bbbf
--- /dev/null
+++ b/src/data_common/local_authority/__init__.py
@@ -0,0 +1,112 @@
+from os.path import join
+
+import pandas as pd
+
+from ..dataset import get_dataset_df
+
+
+def fix_council_name(council: str) -> str:
+    """Normalise a council name for matching by stripping common suffixes."""
+    return (
+        council.replace("council", "")
+        .replace(" - unitary", "")
+        .replace("(unitary)", "")
+        .strip()
+    )
+
+
+def _get_current_authorities() -> pd.DataFrame:
+    """Return the current local authorities lookup table."""
+    return get_dataset_df(
+        repo="uk_local_authority_names_and_codes",
+        package="uk_la_past_current",
+        version="1",
+        file="uk_local_authorities_current.csv",
+    )
+
+
+def _add_columns_from_code(df: pd.DataFrame, columns: dict) -> pd.DataFrame:
+    """Add lookup columns to df, keyed on local-authority-code.
+
+    columns maps a column name in the lookup table to the name the
+    values should be stored under in df.
+    """
+    name_to_code = _get_current_authorities()
+
+    rows = len(df["council"])
+    for dest in columns.values():
+        df[dest] = pd.Series([None] * rows, index=df.index)
+
+    for index, row in df.iterrows():
+        authority_code = row["local-authority-code"]
+        if not pd.isnull(authority_code):
+            authority_match = name_to_code[
+                name_to_code["local-authority-code"] == authority_code
+            ]
+            for source, dest in columns.items():
+                df.at[index, dest] = authority_match[source].values[0]
+
+    return df
+
+
+def add_local_authority_code(df: pd.DataFrame) -> pd.DataFrame:
+    """Add the local-authority-code column, matched on council name."""
+    name_to_code = get_dataset_df(
+        repo="uk_local_authority_names_and_codes",
+        package="uk_la_past_current",
+        version="1",
+        file="lookup_name_to_registry.csv",
+    )
+    df["council_lower"] = df["council"].str.lower().apply(fix_council_name)
+    name_to_code["council_lower"] = (
+        name_to_code["la-name"].str.lower().apply(fix_council_name)
+    )
+    df = df.merge(name_to_code, on="council_lower", how="left")
+
+    # local-authority-code is in last position, move it to the start of the dataframe
+    cols = list(df.columns)
+    cols.insert(0, cols.pop(-1))
+    df = df[cols]
+    df = df.drop(columns=["council_lower", "la-name"])
+    return df
+
+
+def add_region_and_county(df: pd.DataFrame) -> pd.DataFrame:
+    """Add region and county columns, keyed on local-authority-code."""
+    return _add_columns_from_code(df, {"region": "region", "county-la": "county"})
+
+
+def add_gss_codes(df: pd.DataFrame) -> pd.DataFrame:
+    """Add the gss_code column, keyed on local-authority-code."""
+    return _add_columns_from_code(df, {"gss-code": "gss_code"})
+
+
+def add_extra_authority_info(df: pd.DataFrame) -> pd.DataFrame:
+    """Merge in authority type, wdtk id, mapit code, nation and gss code."""
+    name_to_code = _get_current_authorities()
+
+    extra_df = name_to_code[
+        [
+            "local-authority-code",
+            "local-authority-type",
+            "wdtk-id",
+            "mapit-area-code",
+            "nation",
+            "gss-code",
+        ]
+    ]
+
+    # the info sheet may contain updated version of columns previously
+    # loaded to sheet, need to drop them before the merge
+    # ignore errors in case columns are not present
+    columns_to_drop = [x for x in extra_df.columns if x != "local-authority-code"]
+    df = df.drop(columns=columns_to_drop, errors="ignore")
+
+    # merge two dataframes using the authority code as the common reference
+    extra_df = extra_df.merge(df, on="local-authority-code", how="left")
+
+    # devolved nations do not distinguish authority types here
+    is_non_english = extra_df["nation"].isin(["Wales", "Scotland", "Northern Ireland"])
+    extra_df.loc[is_non_english, "local-authority-type"] = "UA"
+
+    return extra_df

From 701ab3f566874f66af183bbb2a3f112eef86786f Mon Sep 17 00:00:00 2001
From: Struan Donald
Date: Thu, 7 Jul 2022 13:28:25 +0100
Subject: [PATCH 2/2] add CSV utility functions

* get google sheet as a csv
* replace headers in a csv file

Both of these write out a CSV file as the result.
---
 src/data_common/csv/__init__.py | 38 +++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)
 create mode 100644 src/data_common/csv/__init__.py

diff --git a/src/data_common/csv/__init__.py b/src/data_common/csv/__init__.py
new file mode 100644
index 0000000..499d9a8
--- /dev/null
+++ b/src/data_common/csv/__init__.py
@@ -0,0 +1,38 @@
+import ssl
+
+import requests
+import urllib3
+
+import pandas as pd
+
+# NOTE(review): this disables SSL certificate verification process-wide,
+# which is a security risk -- confirm it is still required for the sources used
+ssl._create_default_https_context = ssl._create_unverified_context
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
+
+def get_google_sheet_as_csv(key, outfile, sheet_name=None):
+    """Fetch a google sheet as CSV and write it to outfile."""
+    sheet_url = f"https://docs.google.com/spreadsheets/d/{key}/gviz/tq?tqx=out:csv"
+    if sheet_name is not None:
+        sheet_url = f"{sheet_url}&sheet={sheet_name}"
+    r = requests.get(sheet_url)
+    # fail loudly rather than silently writing an error page out as csv
+    r.raise_for_status()
+
+    with open(outfile, "wb") as f:
+        f.write(r.content)
+
+
+def replace_csv_headers(csv_file, new_headers, drop_empty_columns=True, outfile=None):
+    """Rewrite csv_file with new_headers, defaulting to replacing in place."""
+    if outfile is None:
+        outfile = csv_file
+
+    df = pd.read_csv(csv_file)
+    if drop_empty_columns:
+        df = df.dropna(axis="columns", how="all")
+
+    df.columns = new_headers
+    # pass the path so pandas opens and closes the file handle itself
+    df.to_csv(outfile, index=False, header=True)