# Reconstructed from the patch "add local authority data utility functions"
# (commit 9a8489da5b2b2fa45a5ab3cf67c67800c4bb6063, Struan Donald,
# Thu, 7 Jul 2022), which creates src/data_common/local_authority/__init__.py.
"""
Local authority data utility functions.

Functions for:

* matching council name to local authority code
* adding region and county based on local authority code
* adding gss code based on local authority code
* adding type, mapit code etc

These all take a pandas dataframe as an argument and return the same.
"""

from os.path import join

import pandas as pd

from ..dataset import get_dataset_df


def _current_authorities() -> pd.DataFrame:
    """Load the ``uk_local_authorities_current.csv`` table."""
    return get_dataset_df(
        repo="uk_local_authority_names_and_codes",
        package="uk_la_past_current",
        version="1",
        file="uk_local_authorities_current.csv",
    )


def _values_for_codes(
    df: pd.DataFrame, authorities: pd.DataFrame, column: str
) -> pd.Series:
    """
    Look up ``authorities[column]`` for each ``local-authority-code`` in df.

    Returns an object-dtype Series aligned to ``df.index``. Rows with a
    missing code, or a code that is not in ``authorities``, get ``None``.
    """
    # keep the first row per code, matching a "first match wins" lookup
    unique = authorities.drop_duplicates(subset="local-authority-code")
    by_code = dict(zip(unique["local-authority-code"], unique[column]))
    values = [
        None if pd.isnull(code) else by_code.get(code)
        for code in df["local-authority-code"]
    ]
    # dtype=object stops pandas turning the None placeholders into NaN
    return pd.Series(values, index=df.index, dtype=object)


def fix_council_name(council: str) -> str:
    """
    Normalise a council name so differently written names can be matched.

    Removes the substrings "council", " - unitary" and "(unitary)" and
    strips surrounding whitespace. The replacements are case-sensitive, so
    callers lower-case the name first. Non-string values (e.g. NaN for a
    missing name) are returned unchanged rather than raising.
    """
    if not isinstance(council, str):
        return council
    return (
        council.replace("council", "")
        .replace(" - unitary", "")
        .replace("(unitary)", "")
        .strip()
    )


def add_local_authority_code(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add the local-authority-code to the dataframe

    Matches the normalised, lower-cased ``council`` column of ``df`` against
    the normalised ``la-name`` column of the name lookup. Returns a new
    dataframe with ``local-authority-code`` as the first column; councils
    without a match get a missing code. ``df`` itself is not modified.
    """

    name_to_code = get_dataset_df(
        repo="uk_local_authority_names_and_codes",
        package="uk_la_past_current",
        version="1",
        file="lookup_name_to_registry.csv",
    )

    # Build the join key on derived frames so the temporary column never
    # leaks into the caller's dataframe or the loaded lookup table.
    lookup = name_to_code.assign(
        council_lower=name_to_code["la-name"].str.lower().apply(fix_council_name)
    )
    # Several spellings of one council can normalise to the same key; without
    # this, each such spelling would duplicate the caller's row in the merge.
    lookup = lookup.drop_duplicates(subset=["council_lower", "local-authority-code"])

    keyed = df.assign(
        council_lower=df["council"].str.lower().apply(fix_council_name)
    )
    merged = keyed.merge(lookup, on="council_lower", how="left")
    merged = merged.drop(columns=["council_lower", "la-name"])

    # move local-authority-code to the start of the dataframe, selecting it
    # by name rather than assuming it is the last column
    code_column = "local-authority-code"
    ordered = [code_column] + [col for col in merged.columns if col != code_column]
    return merged[ordered]


def add_region_and_county(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add ``region`` and ``county`` columns based on ``local-authority-code``.

    Values come from the ``region`` and ``county-la`` columns of the current
    authorities table. Rows with a missing code, or a code not present in
    that table, get ``None``. The columns are added to ``df`` in place and
    the same dataframe is returned.
    """
    authorities = _current_authorities()

    df["region"] = _values_for_codes(df, authorities, "region")
    df["county"] = _values_for_codes(df, authorities, "county-la")

    return df


def add_gss_codes(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add a ``gss_code`` column based on ``local-authority-code``.

    Values come from the ``gss-code`` column of the current authorities
    table. Rows with a missing code, or a code not present in that table,
    get ``None``. The column is added to ``df`` in place and the same
    dataframe is returned.
    """
    authorities = _current_authorities()

    df["gss_code"] = _values_for_codes(df, authorities, "gss-code")

    return df


def add_extra_authority_info(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add authority type, WhatDoTheyKnow id, MapIt area code, nation and GSS code.

    The current authorities table is the left side of the merge, so the
    result has a row for every authority in that table (with the extra
    columns first), and rows of ``df`` whose code is not in the table are
    not carried over. Authorities in Wales, Scotland and Northern Ireland
    have their ``local-authority-type`` set to "UA". ``df`` is not modified.
    """
    extra_df = _current_authorities()[
        [
            "local-authority-code",
            "local-authority-type",
            "wdtk-id",
            "mapit-area-code",
            "nation",
            "gss-code",
        ]
    ]

    # the info sheet may contain updated version of columns previously
    # loaded to sheet, need to drop them before the merge
    # ignore errors in case columns are not present
    columns_to_drop = [x for x in extra_df.columns if x != "local-authority-code"]
    df = df.drop(columns=columns_to_drop, errors="ignore")

    # merge two dataframes using the authority_code as the common reference
    extra_df = extra_df.merge(df, on="local-authority-code", how="left")

    is_non_english = extra_df["nation"].isin(["Wales", "Scotland", "Northern Ireland"])
    extra_df.loc[is_non_english, "local-authority-type"] = "UA"

    return extra_df