Skip to content

Commit

Permalink
Tidy up pipelines
Browse files Browse the repository at this point in the history
  • Loading branch information
luke-strange committed Jun 27, 2024
1 parent 92424da commit 57faf06
Show file tree
Hide file tree
Showing 6 changed files with 95 additions and 49 deletions.
File renamed without changes.
Binary file modified pipelines/__pycache__/util.cpython-312.pyc
Binary file not shown.
41 changes: 23 additions & 18 deletions pipelines/datacity/community-interest/extract-company-numbers.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -7,61 +7,66 @@
"outputs": [],
"source": [
"import os\n",
"os.chdir('../../..')"
"os.chdir('../../..')\n",
"from pipelines.util import *"
]
},
{
"cell_type": "code",
"execution_count": 2,
"cell_type": "markdown",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from pipelines.util import *"
"Read in the full list of CICs"
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/5k/p_091y6n6rscp__0p77_ynv80000gn/T/ipykernel_15931/3015610111.py:1: DtypeWarning: Columns (3) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" l = pd.read_csv('working/datacity/community-interest/community-interest-active-companies.csv')\n"
"/var/folders/5k/p_091y6n6rscp__0p77_ynv80000gn/T/ipykernel_3474/3107646130.py:1: DtypeWarning: Columns (3) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" l = pd.read_csv('working/datacity/community-interest/community-interest-company.csv')\n"
]
}
],
"source": [
"l = pd.read_csv('working/datacity/community-interest/community-interest-active-companies.csv')"
"l = pd.read_csv('working/datacity/community-interest/community-interest-company.csv')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Remove the leading `GB-COH-` in all values in the `id` column"
]
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"l['id'] = l['id'].str.replace('GB-COH-', '')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"cell_type": "markdown",
"metadata": {},
"outputs": [],
"source": [
"l['id'].to_csv('working/datacity/community-interest/active_ids_only.csv', index=False, header=False)"
"Going to write the file to a `csv`. This can be copied and pasted into the Data City platform to analyse all CICs in the UK. "
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": []
"source": [
"l['id'].to_csv('working/datacity/community-interest/ids_only.csv', index=False, header=False)"
]
}
],
"metadata": {
Expand All @@ -80,7 +85,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
"version": "3.12.4"
}
},
"nbformat": 4,
Expand Down
33 changes: 27 additions & 6 deletions pipelines/datacity/community-interest/prepare.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
"outputs": [],
"source": [
"import os\n",
"os.chdir('../../..')"
"os.chdir('../../..')\n",
"from pipelines.util import *"
]
},
{
Expand All @@ -16,10 +17,9 @@
"metadata": {},
"outputs": [],
"source": [
"from pipelines.util import *\n",
"import pandas as pd\n",
"\n",
"CIC_DATADIR = os.path.join(SRC_DIR, 'themes/purpose-social-impact/community-interest/_data/')\n",
"\n",
"# Load the SIC code lookup file.\n",
"sic_lookup = pd.read_csv(os.path.join(CIC_DATADIR, 'SIC_section_lookup.csv'))"
]
},
Expand All @@ -30,13 +30,27 @@
"outputs": [],
"source": [
"def sic_code_bar_chart(IN, OUT):\n",
" # Read the csv located at `IN`\n",
" d = pd.read_csv(f'{IN}')\n",
"\n",
" # Sort values according to count, highest at the top\n",
" d.sort_values(by='Count', ascending=False, inplace=True)\n",
" # take the top 6 largest counts only\n",
"\n",
" # Take the top 6 largest counts only\n",
" d = d.head(6)\n",
"\n",
" # Add the code's full name.\n",
" d = d.merge(sic_lookup, 'inner', on='SIC')\n",
" d.to_csv(os.path.join(CIC_DATADIR, f'{OUT}'), index=False)"
"\n",
" d.to_csv(os.path.join(CIC_DATADIR, f'{OUT}'), index=False)\n",
" return"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Make a csv of business counts by sic section to power a bar chart"
]
},
{
Expand All @@ -48,6 +62,13 @@
"sic_code_bar_chart('working/datacity/community-interest/Business_counts_by_SIC_section.csv', 'business_counts_by_sic_section.csv')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Make a csv of employee counts by sic section to power a bar chart"
]
},
{
"cell_type": "code",
"execution_count": 5,
Expand Down
39 changes: 15 additions & 24 deletions pipelines/datacity/limited-by-guarantee/prepare.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -7,55 +7,46 @@
"outputs": [],
"source": [
"import os\n",
"os.chdir('../../..')"
"os.chdir('../../..')\n",
"from pipelines.util import *"
]
},
{
"cell_type": "code",
"execution_count": 2,
"cell_type": "markdown",
"metadata": {},
"outputs": [],
"source": [
"from pipelines.util import *\n",
"import pandas as pd\n",
"\n",
"LBG_DATADIR = os.path.join(SRC_DIR, 'themes/purpose-social-impact/limited-by-guarantee/_data')\n",
"CIC_DATADIR = os.path.join(SRC_DIR, 'themes/purpose-social-impact/community-interest/_data/')\n",
"sic_lookup = pd.read_csv(os.path.join(CIC_DATADIR, 'SIC_section_lookup.csv'))"
"Directory to put the output data into"
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def sic_code_bar_chart(IN, OUT):\n",
" d = pd.read_csv(f'{IN}')\n",
" # Sort values according to count, highest at the top\n",
" d.sort_values(by='Count', ascending=False, inplace=True)\n",
" # take the top 6 largest counts only\n",
" d = d.head(6)\n",
" d = d.merge(sic_lookup, 'inner', on='SICHLU')\n",
" d.to_csv(os.path.join(LBG_DATADIR, f'{OUT}'), index=False)"
"LBG_DATADIR = os.path.join(SRC_DIR, 'themes/purpose-social-impact/limited-by-guarantee/_data')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"sic_code_bar_chart('working/datacity/limited-by-guarantee/SICHLU_sector_counts.csv', 'business_counts_by_sic_section.csv')"
"IN = 'working/datacity/limited-by-guarantee/SICHLU_sector_counts.csv'\n",
"fname = 'business_counts_by_sic_section.csv'\n",
"sic_code_bar_chart(IN, LBG_DATADIR, fname)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"sic_code_bar_chart('working/datacity/limited-by-guarantee/Employees_by_SICHLU_sector.csv', 'employees_by_sic_section.csv')"
"IN = 'working/datacity/limited-by-guarantee/Employees_by_SICHLU_sector.csv'\n",
"fname = 'employees_by_sic_section.csv'\n",
"sic_code_bar_chart(IN, LBG_DATADIR, fname)"
]
}
],
Expand All @@ -75,7 +66,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
"version": "3.12.4"
}
},
"nbformat": 4,
Expand Down
31 changes: 30 additions & 1 deletion pipelines/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@
SRC_DIR = os.path.join(TOP, 'src')
METADATA_DIR = os.path.join(TOP, 'metadata')

# Load the SIC-code lookup file.
sic_lookup = pd.read_csv(os.path.join(METADATA_DIR, 'SIC_section_lookup.csv'))

def etl_load(working, fname):
'''load `fname`.csv from `working`.'''
assert fname[-3:] == 'csv', 'Not a csv file'
Expand Down Expand Up @@ -109,4 +112,30 @@ def remote_parquet_as_dataframe(query):

def edd_last_updated_next_updated(id):
    '''Print the id, description, last_update and next_update fields for one EDD series.

    id: EDD series identifier; interpolated into the remote DuckDB-style query
        against the EDD data dictionary csv on GitHub.

    Returns None — the resulting frame is printed, not returned.
    NOTE(review): `id` is interpolated unescaped into the query string;
    only call this with trusted identifiers.
    '''
    data = remote_parquet_as_dataframe(
        f"SELECT id, \"desc\", last_update, next_update FROM 'https://raw.githubusercontent.com/economic-analytics/edd/main/data/edd_dict.csv' WHERE id=='{id}';"
    )
    # Print for interactive/notebook use; deliberately no return value.
    print(data)

def sic_code_bar_chart(IN, OUTDIR, FNAME, top=6):
    '''
    Read the raw Data City data split by SIC section and write a csv
    formatted to power a bar chart.

    IN: path to the raw data file, as downloaded from TDC. Must contain
        `Count` and `SICHLU` columns.
    OUTDIR: directory to store the output file in.
    FNAME: name for the output file.
    top: how many of the largest categories to keep (default 6).

    Raises ValueError if `top` exceeds the number of rows in the input.
    Returns None; the output is written to os.path.join(OUTDIR, FNAME).
    '''
    # Read the csv located at `IN` (no f-string needed — IN is already a path).
    d = pd.read_csv(IN)

    # Sort values according to count, highest at the top.
    d.sort_values(by='Count', ascending=False, inplace=True)

    # Keep only the `top` largest counts. Raise rather than `assert`:
    # asserts are stripped when Python runs with -O, silently skipping
    # the validation.
    if top > len(d):
        raise ValueError(f'top={top} exceeds the {len(d)} rows available in {IN}')
    d = d.head(top)

    # Add the code's full name from the module-level SIC section lookup.
    d = d.merge(sic_lookup, how='inner', on='SICHLU')

    d.to_csv(os.path.join(OUTDIR, FNAME), index=False)

0 comments on commit 57faf06

Please sign in to comment.