From abae37489b140bfc79f8f3016e2f27410389c06e Mon Sep 17 00:00:00 2001
From: luke-strange <92686634+luke-strange@users.noreply.github.com>
Date: Wed, 26 Jun 2024 16:18:45 +0100
Subject: [PATCH] Update GVA pipeline to use query and most-recent-date helper functions

---
 pipelines/sustainable/dvc.lock | 10 +++++-----
 pipelines/sustainable/gva.py   | 23 +++++++++++++----------
 2 files changed, 18 insertions(+), 15 deletions(-)

diff --git a/pipelines/sustainable/dvc.lock b/pipelines/sustainable/dvc.lock
index fdce8db..82fd6a5 100644
--- a/pipelines/sustainable/dvc.lock
+++ b/pipelines/sustainable/dvc.lock
@@ -5,12 +5,12 @@ stages:
     deps:
     - path: ../../pipelines/util.py
       hash: md5
-      md5: 539e317fb1ff73675eba0177fe0ff274
-      size: 2174
+      md5: 4339ffbc976b069992b6f61e482b567f
+      size: 3489
     - path: gva.py
       hash: md5
-      md5: 0fab1bda58bb99b15f1f1804b953e9bf
-      size: 1209
+      md5: 84fa23b347b950328a05de2f33cdf263
+      size: 981
     - path: https://github.com/economic-analytics/edd/raw/main/data/parquet/RGVA_LAD.parquet
       hash: md5
       checksum: '"c1be64ed511e603676b92efb34eb6f06a7543ca07cd563f7e52cee03c7cec9fa"'
@@ -22,5 +22,5 @@ stages:
       size: 6215
     - path: ../../src/themes/sustainable-growth/gva/index.vto
       hash: md5
-      md5: 7f9e4d5a23e34fa6c3635d57321159dc
+      md5: 868d4a98e1aa9660c82f20eb75b82733
       size: 1906
diff --git a/pipelines/sustainable/gva.py b/pipelines/sustainable/gva.py
index 6ecd7d6..cddf585 100644
--- a/pipelines/sustainable/gva.py
+++ b/pipelines/sustainable/gva.py
@@ -2,22 +2,25 @@
 from pipelines.util import *
 
 URL = 'https://github.com/economic-analytics/edd/raw/main/data/parquet/RGVA_LAD.parquet'
+query = f"SELECT \"dates.date\" AS date, \"variable.name\", \"geography.code\", \"industry.name\", value FROM '{URL}' WHERE \"industry.name\"=='All industries';"
 
 def gva_by_local_authority():
-    con = duckdb.connect()
+    data = remote_parquet_as_dataframe(query)
 
-    # @TODO write a generalised function of below
-    # @TODO figure out why MAX('date') won't work
-    data = con.execute(f"SELECT \"dates.date\" AS date, \"variable.name\", \"geography.code\", \"industry.name\", value FROM '{URL}' WHERE \"industry.name\"=='All industries';").fetchdf()
+    # filter the frame
+    data = data[(data['variable.name'] == 'GVA Current Prices £m')]
 
-    # possible to rewrite below in the query, 
-    # which may be quicker, but not sure if it's
-    # worth it given its easier for me to do in pandas.
-    data = data[(data['variable.name'] == 'GVA Current Prices £m') & (data['date'] == max(data['date']))]
+    # get the most recent date
+    data = most_recent_date(data, 'date')
+    
+    # pivot the frame
     data = data.pivot(columns='variable.name', values='value', index='geography.code')
-    #data.rename(columns={'GVA Current Prices £m': 'gva_current_prices', 'GVA Constant Prices £m': 'gva_constant_prices'}, inplace=True)
+    
+    # write to csv
     data.to_csv(os.path.join(SRC_DIR, 'themes/sustainable-growth/gva/_data/gva_lad.csv'))
+    return 
 
 if __name__ == "__main__":
     gva_by_local_authority()
-    time_updated(os.path.join(SRC_DIR, 'themes/sustainable-growth/gva/index.vto'), 'nicetheme:')
\ No newline at end of file
+    time_updated(os.path.join(SRC_DIR, 'themes/sustainable-growth/gva/index.vto'), 'nicetheme:')
+    edd_last_updated_next_updated(id='RGVA_LAD')
\ No newline at end of file