From abae37489b140bfc79f8f3016e2f27410389c06e Mon Sep 17 00:00:00 2001 From: luke-strange <92686634+luke-strange@users.noreply.github.com> Date: Wed, 26 Jun 2024 16:18:45 +0100 Subject: [PATCH] Update GVA --- pipelines/sustainable/dvc.lock | 10 +++++----- pipelines/sustainable/gva.py | 23 +++++++++++++---------- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/pipelines/sustainable/dvc.lock b/pipelines/sustainable/dvc.lock index fdce8db..82fd6a5 100644 --- a/pipelines/sustainable/dvc.lock +++ b/pipelines/sustainable/dvc.lock @@ -5,12 +5,12 @@ stages: deps: - path: ../../pipelines/util.py hash: md5 - md5: 539e317fb1ff73675eba0177fe0ff274 - size: 2174 + md5: 4339ffbc976b069992b6f61e482b567f + size: 3489 - path: gva.py hash: md5 - md5: 0fab1bda58bb99b15f1f1804b953e9bf - size: 1209 + md5: 84fa23b347b950328a05de2f33cdf263 + size: 981 - path: https://github.com/economic-analytics/edd/raw/main/data/parquet/RGVA_LAD.parquet hash: md5 checksum: '"c1be64ed511e603676b92efb34eb6f06a7543ca07cd563f7e52cee03c7cec9fa"' @@ -22,5 +22,5 @@ stages: size: 6215 - path: ../../src/themes/sustainable-growth/gva/index.vto hash: md5 - md5: 7f9e4d5a23e34fa6c3635d57321159dc + md5: 868d4a98e1aa9660c82f20eb75b82733 size: 1906 diff --git a/pipelines/sustainable/gva.py b/pipelines/sustainable/gva.py index 6ecd7d6..cddf585 100644 --- a/pipelines/sustainable/gva.py +++ b/pipelines/sustainable/gva.py @@ -2,22 +2,25 @@ from pipelines.util import * URL = 'https://github.com/economic-analytics/edd/raw/main/data/parquet/RGVA_LAD.parquet' +query = f"SELECT \"dates.date\" AS date, \"variable.name\", \"geography.code\", \"industry.name\", value FROM '{URL}' WHERE \"industry.name\"=='All industries';" def gva_by_local_authority(): - con = duckdb.connect() + data = remote_parquet_as_dataframe(query) - # @TODO write a generalised function of below - # @TODO figure out why MAX('date') won't work - data = con.execute(f"SELECT \"dates.date\" AS date, \"variable.name\", \"geography.code\", \"industry.name\", value FROM '{URL}' WHERE \"industry.name\"=='All industries';").fetchdf() + # filter the frame + data = data[(data['variable.name'] == 'GVA Current Prices £m')] - # possible to rewrite below in the query, - # which may be quicker, but not sure if it's - # worth it given its easier for me to do in pandas. - data = data[(data['variable.name'] == 'GVA Current Prices £m') & (data['date'] == max(data['date']))] + # get the most recent date + data = most_recent_date(data, 'date') + + # pivot the frame data = data.pivot(columns='variable.name', values='value', index='geography.code') - #data.rename(columns={'GVA Current Prices £m': 'gva_current_prices', 'GVA Constant Prices £m': 'gva_constant_prices'}, inplace=True) + + # write to csv data.to_csv(os.path.join(SRC_DIR, 'themes/sustainable-growth/gva/_data/gva_lad.csv')) + return if __name__ == "__main__": gva_by_local_authority() - time_updated(os.path.join(SRC_DIR, 'themes/sustainable-growth/gva/index.vto'), 'nicetheme:') \ No newline at end of file + time_updated(os.path.join(SRC_DIR, 'themes/sustainable-growth/gva/index.vto'), 'nicetheme:') + edd_last_updated_next_updated(id='RGVA_LAD') \ No newline at end of file