Integrate hdx lib #2

Merged 1 commit on Nov 21, 2024
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -0,0 +1,5 @@
# Changelog

## 1.0
- load merged hdx and UN M49 data
- extend `django-countries` with region, sub-region, SIDS, LLDC, and LDC data
75 changes: 40 additions & 35 deletions README.md
@@ -1,59 +1,56 @@
# django-countries-regions
# django-countries-hdx

Adds region and subregion data to django-countries.
This lib adds extra M49 data to django-countries.

It uses [hdx-python-country]() with its default data augmented by additional UN data to provide SIDS, LLDC, and LDC grouping data.

## Installation

Install this library using `pip`:
```bash
pip install django-countries-regions
pip install django-countries-hdx
```
## Usage

Extends [django-countries](https://pypi.org/project/django-countries/) to add region and sub-region data (as defined by the [UN M49 Standard](https://en.wikipedia.org/wiki/UN_M49)).

It adds extra properties to a `Country` for the region (id and name), sub-region (id and name), SIDS, LDC and LLDC.
It also contains helper methods to retrieve the countries in a region or sub-region.


```python
In [1]: from django_countries.fields import Country
In [2]: from django_countries_regions import regions

In [3]: Country('NZ').region
Out[3]: '009'

In [4]: Country("NZ").region_name
Out[4]: 'Oceania'

In [5]: Country('NZ').subregion
Out[5]: '053'

In [6]: Country("NZ").subregion_name
Out[6]: 'Australia and New Zealand'

In [7]: regions.region_name('009')
Out[7]: 'Oceania'

In [8]: regions.subregion_name('053')
Out[8]: 'Australia and New Zealand'

In [9]: regions.countries_by_region('009')
Out[9]:
['AS',
 'AU',
 'CK',
 # …
]

In [10]: regions.countries_by_subregion('053')
Out[10]: ['AU', 'NZ', 'NF']
```

```python
>>> from django_countries.fields import Country
>>> from django_countries_hdx import regions
>>> Country('NZ').region
9
>>> Country("NZ").region_name
'Oceania'
>>> Country('NZ').subregion
53
>>> Country("NZ").subregion_name
'Australia and New Zealand'
>>> Country("AF").ldc
True
>>> Country("AF").lldc
True
>>> Country("AI").sids
True
>>> regions.get_region_name(9)
'Oceania'
>>> regions.get_region_name(53)
'Australia and New Zealand'
>>> regions.countries_by_region(9)
['AS',
 'AU',
 'CK',
 # …
]
>>> regions.countries_by_subregion(53)
['AU', 'NZ', 'NF']
```
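The grouping helpers also pair naturally with `django-countries` model fields. Below is a minimal sketch of that idea; the `Supplier` model is hypothetical and only `CountryField` and `regions.countries_by_region()` come from the libraries shown above.

```python
# Hypothetical model, used only to illustrate filtering by UN M49 region.
from django.db import models
from django_countries.fields import CountryField

from django_countries_hdx import regions


class Supplier(models.Model):
    name = models.CharField(max_length=100)
    country = CountryField()


# CountryField stores ISO 3166-1 alpha-2 codes, so the code lists returned by
# the helper methods can be used directly in an __in lookup.
oceania_suppliers = Supplier.objects.filter(
    country__in=regions.countries_by_region(9)
)
```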

## Development

To contribute to this library, first checkout the code. Then create a new virtual environment:
```bash
cd django-countries-regions
cd django-countries-hdx
python -m venv .venv
source .venv/bin/activate
```
@@ -65,3 +62,11 @@ To run the tests:
```bash
pytest
```
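A test built on the examples above might look like the sketch below. It is illustrative only; the expected values are taken from the usage section of this README rather than from the test suite in this PR.

```python
# Illustrative pytest-style checks based on the README usage examples.
from django_countries.fields import Country

from django_countries_hdx import regions


def test_nz_region_data():
    assert Country("NZ").region == 9
    assert Country("NZ").region_name == "Oceania"
    assert Country("NZ").subregion_name == "Australia and New Zealand"


def test_subregion_membership():
    assert "NZ" in regions.countries_by_subregion(53)
```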

## Data updates

The data is a static file supplied with the lib. You can use the `data/merge.py` script to update this data.

Download the latest UN data to `data/unsd_methodology.csv` and run the script from the `data` dir. It will read the default `hdx` data and augment it with the UN data.

The merged result is then saved into this library, where it can be read back by the `hdx` lib.
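For a quick sanity check of the regenerated file, the merged CSV can be read back with pandas. This is only a sketch: the output path shown here assumes `merge.py` wrote `hdx_plus_m49.csv` into the `django_countries_hdx` package directory (where `script_dir_plus_file` resolves it), and the column names are the ones the script writes.

```python
import pandas as pd

# Row 0 holds the human-readable headers and row 1 the HXL tag row written by
# merge.py, so skip the tag row and keep every value as a string.
df = pd.read_csv(
    "django_countries_hdx/hdx_plus_m49.csv",  # assumed output location
    skiprows=[1],
    dtype=str,
    keep_default_na=False,
)
print(df[["ISO 3166-1 Alpha 2-Codes", "LDC", "LLDC", "SIDS"]].head())
```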
124 changes: 124 additions & 0 deletions data/merge.py
@@ -0,0 +1,124 @@
from pathlib import Path

import pandas as pd
from hdx.location.country import Country
from hdx.utilities.path import script_dir_plus_file

from django_countries_hdx import Regions


def merge_data_sources():
try:
# Resolve all file paths
hdx_file = Path(
script_dir_plus_file(
"Countries & Territories Taxonomy MVP - C&T Taxonomy with HXL Tags.csv",
Country,
)
)
unsd_file = Path(__file__).parent.resolve() / "unsd_methodology.csv"
output_file = Path(
script_dir_plus_file(
"hdx_plus_m49.csv",
Regions,
)
)

# Verify input files exist
if not hdx_file.exists():
raise FileNotFoundError(f"HDX file not found: {hdx_file}")
if not unsd_file.exists():
raise FileNotFoundError(f"UNSD file not found: {unsd_file}")

print(f"Reading HDX data from {hdx_file}")

# Read the entire HDX file to get headers.
# There are two rows of headers, the original headers and the hxl tags row
with open(hdx_file, "r") as f:
headers = f.readline().strip().split(",")
hxl_tags = f.readline().strip().split(",")

# Read the data with the correct headers, treat everything as a string by default
# so Pandas doesn't get clever and convert integers to floats
hdx_df = pd.read_csv(
hdx_file,
header=None,
names=headers,
skiprows=2,
dtype=str,
keep_default_na=False,
)

# Convert only the numeric columns we need
numeric_columns = {
"m49 numerical code": 'Int64',
"Latitude": 'float64',
"Longitude": 'float64',
"Region Code": 'Int64',
"Sub-region Code": 'Int64',
"Intermediate Region Code": 'Int64'
}

for col, dtype in numeric_columns.items():
if col in hdx_df.columns:
hdx_df[col] = pd.to_numeric(hdx_df[col], errors="coerce").astype(dtype)

print(f"Reading UNSD data from {unsd_file}")

# Read and process UNSD data
unsd_df = pd.read_csv(
unsd_file,
delimiter=";",
dtype=str,
keep_default_na=False,
)

unsd_df["LDC"] = unsd_df["Least Developed Countries (LDC)"] == "x"
unsd_df["LLDC"] = unsd_df["Land Locked Developing Countries (LLDC)"] == "x"
unsd_df["SIDS"] = unsd_df["Small Island Developing States (SIDS)"] == "x"

print("Merging data")

        # The ISO2 column name as it appears in the HDX headers
        # (could also be derived dynamically, e.g. next(col for col in headers if "ISO" in col and "2" in col))
        iso2_col = "ISO 3166-1 Alpha 2-Codes"

# Merge the dataframes using the ISO2 column and then drop the duplicate column
merged_df = hdx_df.merge(
unsd_df[["ISO-alpha2 Code", "LDC", "LLDC", "SIDS"]],
left_on=iso2_col,
right_on="ISO-alpha2 Code",
how="left"
)

merged_df = merged_df.drop(columns=["ISO-alpha2 Code"])

# Coerce empty values to False
for col in ["LDC", "LLDC", "SIDS"]:
merged_df[col] = merged_df[col].astype("boolean").fillna(False)

print(f"Writing output to {output_file}")

# Define the new columns and their HXL tags
new_columns = ["LDC", "LLDC", "SIDS"]
new_hxl_tags = ["#meta+bool+ldc", "#meta+bool+lldc", "#meta+bool+sids"]

headers.extend(new_columns)
hxl_tags.extend(new_hxl_tags)

# Write the output file and append the data
with open(output_file, "w", newline="") as f:
f.write(",".join(headers) + "\n")
f.write(",".join(hxl_tags) + "\n")

merged_df.to_csv(output_file, mode="a", index=False, header=False)

print(f"Successfully merged country data to {output_file}")
exit(0)
except Exception as e:
print(f"Error merging country data {e}")
exit(1)


if __name__ == '__main__':
merge_data_sources()