Skip to content

Commit

Permalink
Tidy up pipelines
Browse files Browse the repository at this point in the history
  • Loading branch information
luke-strange committed Jun 27, 2024
1 parent 92424da commit 57faf06
Show file tree
Hide file tree
Showing 6 changed files with 95 additions and 49 deletions.
File renamed without changes.
Binary file modified pipelines/__pycache__/util.cpython-312.pyc
Binary file not shown.
41 changes: 23 additions & 18 deletions pipelines/datacity/community-interest/extract-company-numbers.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -7,61 +7,66 @@
"outputs": [],
"source": [
"import os\n",
"os.chdir('../../..')"
"os.chdir('../../..')\n",
"from pipelines.util import *"
]
},
{
"cell_type": "code",
"execution_count": 2,
"cell_type": "markdown",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from pipelines.util import *"
"Read in the full list of CICs"
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/5k/p_091y6n6rscp__0p77_ynv80000gn/T/ipykernel_15931/3015610111.py:1: DtypeWarning: Columns (3) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" l = pd.read_csv('working/datacity/community-interest/community-interest-active-companies.csv')\n"
"/var/folders/5k/p_091y6n6rscp__0p77_ynv80000gn/T/ipykernel_3474/3107646130.py:1: DtypeWarning: Columns (3) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" l = pd.read_csv('working/datacity/community-interest/community-interest-company.csv')\n"
]
}
],
"source": [
"l = pd.read_csv('working/datacity/community-interest/community-interest-active-companies.csv')"
"l = pd.read_csv('working/datacity/community-interest/community-interest-company.csv')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Remove the leading `GB-COH-` in all values in the `id` column"
]
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"l['id'] = l['id'].str.replace('GB-COH-', '')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"cell_type": "markdown",
"metadata": {},
"outputs": [],
"source": [
"l['id'].to_csv('working/datacity/community-interest/active_ids_only.csv', index=False, header=False)"
"Going to write the file to a `csv`. This can be copied and pasted into the Data City platform to analyse all CICs in the UK. "
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": []
"source": [
"l['id'].to_csv('working/datacity/community-interest/ids_only.csv', index=False, header=False)"
]
}
],
"metadata": {
Expand All @@ -80,7 +85,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
"version": "3.12.4"
}
},
"nbformat": 4,
Expand Down
33 changes: 27 additions & 6 deletions pipelines/datacity/community-interest/prepare.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
"outputs": [],
"source": [
"import os\n",
"os.chdir('../../..')"
"os.chdir('../../..')\n",
"from pipelines.util import *"
]
},
{
Expand All @@ -16,10 +17,9 @@
"metadata": {},
"outputs": [],
"source": [
"from pipelines.util import *\n",
"import pandas as pd\n",
"\n",
"CIC_DATADIR = os.path.join(SRC_DIR, 'themes/purpose-social-impact/community-interest/_data/')\n",
"\n",
"# Load the SIC code lookup file.\n",
"sic_lookup = pd.read_csv(os.path.join(CIC_DATADIR, 'SIC_section_lookup.csv'))"
]
},
Expand All @@ -30,13 +30,27 @@
"outputs": [],
"source": [
"def sic_code_bar_chart(IN, OUT):\n",
" # Read the csv located at `IN`\n",
" d = pd.read_csv(f'{IN}')\n",
"\n",
" # Sort values according to count, highest at the top\n",
" d.sort_values(by='Count', ascending=False, inplace=True)\n",
" # take the top 6 largest counts only\n",
"\n",
" # Take the top 6 largest counts only\n",
" d = d.head(6)\n",
"\n",
" # Add the code's full name.\n",
" d = d.merge(sic_lookup, 'inner', on='SIC')\n",
" d.to_csv(os.path.join(CIC_DATADIR, f'{OUT}'), index=False)"
"\n",
" d.to_csv(os.path.join(CIC_DATADIR, f'{OUT}'), index=False)\n",
" return"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Make a csv of business counts by sic section to power a bar chart"
]
},
{
Expand All @@ -48,6 +62,13 @@
"sic_code_bar_chart('working/datacity/community-interest/Business_counts_by_SIC_section.csv', 'business_counts_by_sic_section.csv')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Make a csv of employee counts by sic section to power a bar chart"
]
},
{
"cell_type": "code",
"execution_count": 5,
Expand Down
39 changes: 15 additions & 24 deletions pipelines/datacity/limited-by-guarantee/prepare.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -7,55 +7,46 @@
"outputs": [],
"source": [
"import os\n",
"os.chdir('../../..')"
"os.chdir('../../..')\n",
"from pipelines.util import *"
]
},
{
"cell_type": "code",
"execution_count": 2,
"cell_type": "markdown",
"metadata": {},
"outputs": [],
"source": [
"from pipelines.util import *\n",
"import pandas as pd\n",
"\n",
"LBG_DATADIR = os.path.join(SRC_DIR, 'themes/purpose-social-impact/limited-by-guarantee/_data')\n",
"CIC_DATADIR = os.path.join(SRC_DIR, 'themes/purpose-social-impact/community-interest/_data/')\n",
"sic_lookup = pd.read_csv(os.path.join(CIC_DATADIR, 'SIC_section_lookup.csv'))"
"Directory to put the output data into"
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def sic_code_bar_chart(IN, OUT):\n",
" d = pd.read_csv(f'{IN}')\n",
" # Sort values according to count, highest at the top\n",
" d.sort_values(by='Count', ascending=False, inplace=True)\n",
" # take the top 6 largest counts only\n",
" d = d.head(6)\n",
" d = d.merge(sic_lookup, 'inner', on='SICHLU')\n",
" d.to_csv(os.path.join(LBG_DATADIR, f'{OUT}'), index=False)"
"LBG_DATADIR = os.path.join(SRC_DIR, 'themes/purpose-social-impact/limited-by-guarantee/_data')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"sic_code_bar_chart('working/datacity/limited-by-guarantee/SICHLU_sector_counts.csv', 'business_counts_by_sic_section.csv')"
"IN = 'working/datacity/limited-by-guarantee/SICHLU_sector_counts.csv'\n",
"fname = 'business_counts_by_sic_section.csv'\n",
"sic_code_bar_chart(IN, LBG_DATADIR, fname)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"sic_code_bar_chart('working/datacity/limited-by-guarantee/Employees_by_SICHLU_sector.csv', 'employees_by_sic_section.csv')"
"IN = 'working/datacity/limited-by-guarantee/Employees_by_SICHLU_sector.csv'\n",
"fname = 'employees_by_sic_section.csv'\n",
"sic_code_bar_chart(IN, LBG_DATADIR, fname)"
]
}
],
Expand All @@ -75,7 +66,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
"version": "3.12.4"
}
},
"nbformat": 4,
Expand Down
31 changes: 30 additions & 1 deletion pipelines/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@
SRC_DIR = os.path.join(TOP, 'src')
METADATA_DIR = os.path.join(TOP, 'metadata')

# Load the SIC-code lookup file.
sic_lookup = pd.read_csv(os.path.join(METADATA_DIR, 'SIC_section_lookup.csv'))

def etl_load(working, fname):
'''load `fname`.csv from `working`.'''
assert fname[-3:] == 'csv', 'Not a csv file'
Expand Down Expand Up @@ -109,4 +112,30 @@ def remote_parquet_as_dataframe(query):

def edd_last_updated_next_updated(id):
    '''Print the id, description, last_update and next_update fields for one EDD series.

    id: EDD series identifier; interpolated into the remote DuckDB-style query
        against the EDD data dictionary csv on GitHub.

    Returns None — the resulting frame is printed, not returned.
    NOTE(review): `id` is interpolated unescaped into the query string;
    only call this with trusted identifiers.
    '''
    data = remote_parquet_as_dataframe(
        f"SELECT id, \"desc\", last_update, next_update FROM 'https://raw.githubusercontent.com/economic-analytics/edd/main/data/edd_dict.csv' WHERE id=='{id}';"
    )
    # Print for interactive/notebook use; deliberately no return value.
    print(data)

def sic_code_bar_chart(IN, OUTDIR, FNAME, top=6):
    '''
    Read the raw Data City data split by SIC section and write a csv
    formatted to power a bar chart.

    IN: path to the raw data file, as downloaded from TDC. Must contain
        `Count` and `SICHLU` columns.
    OUTDIR: directory to store the output file in.
    FNAME: name for the output file.
    top: how many of the largest categories to keep (default 6).

    Raises ValueError if `top` exceeds the number of rows in the input.
    Returns None; the output is written to os.path.join(OUTDIR, FNAME).
    '''
    # Read the csv located at `IN` (no f-string needed — IN is already a path).
    d = pd.read_csv(IN)

    # Sort values according to count, highest at the top.
    d.sort_values(by='Count', ascending=False, inplace=True)

    # Keep only the `top` largest counts. Raise rather than `assert`:
    # asserts are stripped when Python runs with -O, silently skipping
    # the validation.
    if top > len(d):
        raise ValueError(f'top={top} exceeds the {len(d)} rows available in {IN}')
    d = d.head(top)

    # Add the code's full name from the module-level SIC section lookup.
    d = d.merge(sic_lookup, how='inner', on='SICHLU')

    d.to_csv(os.path.join(OUTDIR, FNAME), index=False)

0 comments on commit 57faf06

Please sign in to comment.