
Commit

Update employment page
luke-strange committed Jun 11, 2024
1 parent 97859ee commit 246fd8a
Showing 12 changed files with 948 additions and 54 deletions.
31 changes: 22 additions & 9 deletions pipelines/people/dvc.lock
@@ -33,21 +33,25 @@ stages:
deps:
- path: ../../pipelines/util.py
hash: md5
md5: 36ef11ec36aad93bd1f7b596d8f635ce
size: 1301
md5: 17aacff551ae438adc3bd465dbecdd05
size: 1420
- path: ../../working/cs/cs-true-north.csv
hash: md5
md5: af140a62c1f7e4fffc6c5b797be97290
size: 1480442
- path: employment.py
hash: md5
md5: 4f9a0b99b9d722b105eba138f5989adc
size: 640
md5: 6ac3d1ef02e1a99afa6c4800d3fd1046
size: 1301
outs:
- path: ../../src/themes/people-skills-future/_data/unemployment.csv
- path: ../../src/themes/people-skills-future/_data/economic_inactivity.csv
hash: md5
md5: 7e4807e59f4b6f80201ca37f68a28319
size: 3566
- path: ../../src/themes/people-skills-future/_data/employment.csv
hash: md5
md5: b60d9a2d17a38edc13552071bf2187c0
size: 2943
md5: 04fb8c28d58910c5beb477f72bc00a30
size: 3572
qualifications:
cmd: PYTHONPATH=../.. python qualifications.py
deps:
@@ -88,12 +92,21 @@ stages:
hash: md5
checksum: '"14ccdbb3a4299537b8eb04d5d83a1ac0f8eed61d0847d29812fa42b3b7c0b694"'
size: 367480
- path: https://raw.githubusercontent.com/open-innovations/yff-data-pipelines/main/data/processed/yff/neet-factors.csv
hash: md5
checksum: '"7334c58ffd7f27be60a5f4bac77d8b7fa378686479246ecde1a4bc63235850de"'
size: 555225
- path: neet.py
hash: md5
md5: 46288d019d453a255f52a168a00e1c54
size: 1549
md5: 8c17c33ee343cf2a7bb71c0d8f96728a
size: 1106
outs:
- path: ../../src/themes/people-skills-future/_data/neet.csv
hash: md5
md5: 9388a8604edf7c27235d8d27aeede084
size: 160
- path: ../../src/themes/people-skills-future/_data/risk_of_neet_by_la.csv
hash: md5
md5: 121be03fe40ba8f11044310423afd4bb
size: 23551
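
For context on what dvc.lock is recording here: each dependency and output is pinned by an md5 checksum and a byte size, so DVC can tell when a stage needs to re-run. A minimal sketch of computing that kind of fingerprint for a local file (illustrative only, not DVC's internal code path, and it does not cover the ETag checksums used for the https deps; the path is a placeholder):

import hashlib
import os

def fingerprint(path):
    # md5 of the file contents plus its size on disk - the two fields
    # dvc.lock stores for each local path above
    with open(path, 'rb') as f:
        digest = hashlib.md5(f.read()).hexdigest()
    return {'md5': digest, 'size': os.path.getsize(path)}

print(fingerprint('employment.py'))

Run from pipelines/people against the updated script, this should reproduce the md5/size pair shown for employment.py above.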
7 changes: 6 additions & 1 deletion pipelines/people/dvc.yaml
@@ -20,7 +20,9 @@ stages:
- ${TOP}/pipelines/util.py
- ${TOP}/working/cs/cs-true-north.csv
outs:
- ${TOP}/src/themes/people-skills-future/_data/unemployment.csv:
- ${TOP}/src/themes/people-skills-future/_data/employment.csv:
cache: false
- ${TOP}/src/themes/people-skills-future/_data/economic_inactivity.csv:
cache: false
qualifications:
cmd: PYTHONPATH=${TOP} python qualifications.py
@@ -36,6 +38,9 @@
deps:
- neet.py
- https://raw.githubusercontent.com/open-innovations/yff-data-pipelines/main/data/processed/neet.csv
- https://raw.githubusercontent.com/open-innovations/yff-data-pipelines/main/data/processed/yff/neet-factors.csv
outs:
- ${TOP}/src/themes/people-skills-future/_data/neet.csv:
cache: false
- ${TOP}/src/themes/people-skills-future/_data/risk_of_neet_by_la.csv:
cache: false
19 changes: 17 additions & 2 deletions pipelines/people/employment.py
@@ -12,6 +12,21 @@
# convert the iso dates to unix
data = etl.addfield(data, 'unix_timestamp', iso_to_unix)

etl_write(data, os.path.join(TOP, 'src/themes/people-skills-future/_data/unemployment.csv'))
data = etl.addfield(data, 'decimal_date', decimal_date)

print("Got unemployment data")
etl_write(data, os.path.join(TOP, 'src/themes/people-skills-future/_data/employment.csv'))

ei_data = etl_load(WDIR, "cs/cs-true-north.csv")

ei_data = etl.select(ei_data, "{variable_name} == '% who are economically inactive - aged 16-64' and {measures_name} == 'Variable' ")

ei_data = etl.cut(ei_data, 'date', 'geography_code', 'value')

ei_data = etl.recast(ei_data, key='date', variablefield='geography_code', valuefield='value')

# convert the iso dates to unix
ei_data = etl.addfield(ei_data, 'unix_timestamp', iso_to_unix)

ei_data = etl.addfield(ei_data, 'decimal_date', decimal_date)

etl_write(ei_data, os.path.join(TOP, 'src/themes/people-skills-future/_data/economic_inactivity.csv'))
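
The new economic-inactivity block above follows the same petl pattern as the employment series: filter to the variable of interest, cut down to date / geography_code / value, then recast from long to wide so each geography code becomes a column. A self-contained sketch of that reshape on made-up rows (the values and geography codes are illustrative, not taken from cs-true-north.csv):

import petl as etl

# Hypothetical long-format rows standing in for working/cs/cs-true-north.csv
table = [
    ['date', 'geography_code', 'value', 'variable_name', 'measures_name'],
    ['2024-01-01', 'E92000001', 21.5, '% who are economically inactive - aged 16-64', 'Variable'],
    ['2024-01-01', 'K02000001', 21.9, '% who are economically inactive - aged 16-64', 'Variable'],
    ['2024-02-01', 'E92000001', 21.4, '% who are economically inactive - aged 16-64', 'Variable'],
    ['2024-02-01', 'K02000001', 21.8, '% who are economically inactive - aged 16-64', 'Variable'],
]

ei = etl.select(table, "{variable_name} == '% who are economically inactive - aged 16-64' and {measures_name} == 'Variable'")
ei = etl.cut(ei, 'date', 'geography_code', 'value')
# recast pivots geography codes into columns, giving one row per date
ei = etl.recast(ei, key='date', variablefield='geography_code', valuefield='value')
print(etl.look(ei))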
20 changes: 7 additions & 13 deletions pipelines/people/neet.py
@@ -2,27 +2,21 @@
import duckdb
import pandas as pd

URL = "https://raw.githubusercontent.com/open-innovations/yff-data-pipelines/main/data/processed/neet.csv"
HEADLINE_URL = "https://raw.githubusercontent.com/open-innovations/yff-data-pipelines/main/data/processed/neet.csv"

LOCAL_AUTHORITY_URL = "https://raw.githubusercontent.com/open-innovations/yff-data-pipelines/main/data/processed/yff/neet-factors.csv"

def total_neet_16_24():
con = duckdb.connect()
data = con.execute(f"SELECT date, sheet, age, measure, value FROM '{URL}' WHERE sheet=='People - SA' AND age=='Aged 16-24' AND measure=='People who were NEET as a percentage of people in relevant population group'").fetch_df()
data = con.execute(f"SELECT date, sheet, age, measure, value FROM '{HEADLINE_URL}' WHERE sheet=='People - SA' AND age=='Aged 16-24' AND measure=='People who were NEET as a percentage of people in relevant population group'").fetch_df()
data = data.tail(1).set_index('date')
data.to_csv(os.path.join(SRC_DIR, 'themes/people-skills-future/_data/neet.csv'))
return

def neet_by_local_authority():
data = pd.read_csv(os.path.join(WDIR, 'neet/ud_neet_characteristics.csv'))
# combine all codes into one column. Uses LA code if exists, the region, then country.
data['geography_code'] = data['new_la_code'].combine_first(data['region_code']).combine_first(data['country_code'])
# data = data[data['new_la_code'].notnull()]

#drop un-used columns
data.drop(columns=['time_identifier', 'country_name', 'country_code', 'region_code', 'region_name', 'old_la_code', 'geographic_level'], inplace=True)
data = data[(data['Age']=='16-17') & (data['Characteristic']=='Total') & (data['time_period']==max(data['time_period']))]
data.set_index('time_period', inplace=True)
data.index.rename('date', inplace=True)
data.to_csv(os.path.join(SRC_DIR, 'themes/people-skills-future/_data/most_recent_neet_by_la.csv'))
con = duckdb.connect()
data = con.execute(f"SELECT * FROM '{LOCAL_AUTHORITY_URL}' WHERE variable=='Total Score'").fetchdf()
data.to_csv(os.path.join(SRC_DIR, 'themes/people-skills-future/_data/risk_of_neet_by_la.csv'), index=False)

if __name__ == "__main__":
total_neet_16_24()
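
The rewritten neet_by_local_authority mirrors total_neet_16_24: it queries a remote CSV directly with DuckDB instead of reading a locally staged file. A standalone sketch of that pattern, writing to a local filename rather than the site's _data directory (the output path here is a placeholder):

import duckdb

LOCAL_AUTHORITY_URL = (
    "https://raw.githubusercontent.com/open-innovations/"
    "yff-data-pipelines/main/data/processed/yff/neet-factors.csv"
)

con = duckdb.connect()
# DuckDB can query the CSV straight over HTTPS; filter to the overall risk score
data = con.execute(
    f"SELECT * FROM '{LOCAL_AUTHORITY_URL}' WHERE variable = 'Total Score'"
).fetch_df()
data.to_csv('risk_of_neet_by_la.csv', index=False)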
4 changes: 4 additions & 0 deletions pipelines/util.py
@@ -28,6 +28,10 @@ def iso_to_unix(row):
dt = datetime.fromisoformat(iso_date)
return int(dt.timestamp())

def decimal_date(row):
timestamp = row['unix_timestamp']
return round((timestamp / (86400*365.25)) + 1970, 2)

def slugify_column_names(headers):
return [slugify(header, separator='_') for header in headers]

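The decimal_date helper added to util.py turns the Unix timestamp produced by iso_to_unix into an approximate decimal year (seconds divided by 86400*365.25 seconds per average year, offset by 1970). A quick sanity check of that arithmetic, inlining the formula for an illustrative date:

from datetime import datetime, timezone

# 2024-06-10T00:00:00Z as a Unix timestamp
ts = int(datetime(2024, 6, 10, tzinfo=timezone.utc).timestamp())  # 1717977600
decimal_year = round((ts / (86400 * 365.25)) + 1970, 2)
print(decimal_year)  # 2024.44 - roughly 44% of the way through 2024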
