From bc8c5a3d6c6ae0af006ccde7cb422aa07749bf4a Mon Sep 17 00:00:00 2001
From: Jesus Fernandez <Jesus.Fernandez@unican.es>
Date: Tue, 22 Oct 2024 20:28:29 +0200
Subject: [PATCH] Add storage_estimate.py

see #40
---
 storage_estimate.py | 86 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 86 insertions(+)
 create mode 100644 storage_estimate.py

diff --git a/storage_estimate.py b/storage_estimate.py
new file mode 100644
index 0000000..aa2097c
--- /dev/null
+++ b/storage_estimate.py
@@ -0,0 +1,86 @@
+import pandas as pd
+import numpy as np
+from icecream import ic
+
+precision_factor = 4 # float
+compression_factor = 0.6
+bytes_to_TB = 1.0e-12
+priorities = ['CORE'] #, 'TIER1', 'TIER2']
+
+# Number of time records per year
+frequency_factor = {'mon': 12, 'day': 365, '6hr': 365*4, '1hr': 365*24}
+
+# Number of years depending on the experiment. Minimal periods considered here
+# as the evaluation can be extended beyond 2020 and the historical could start
+# in 1951. See https://cordex.org/wp-content/uploads/2021/05/CORDEX-CMIP6_exp_design_RCM.pdf
+experiment_factor = {'evaluation': 2020-1980+1, 'historical': 2014-1961+1}
+experiment_factor_default = 2100-2015+1
+
+domains = pd.read_csv(
+  'https://raw.githubusercontent.com/WCRP-CORDEX/domain-tables/refs/heads/main/CORDEX-CMIP5_rotated_grids.csv',
+  usecols=['domain_id', 'nlon', 'nlat']
+)
+domains['ngridcells'] = domains['nlon'] * domains['nlat']
+ngridcells = (domains
+  .drop(columns=['nlon', 'nlat'])
+  .set_index('domain_id')
+  .to_dict()
+  .get('ngridcells')
+)
+# Some fixes for missing domains
+ngridcells['AUS-20i'] = ngridcells['AUS-25'] #!!
+ngridcells['MENA-25'] = ngridcells['MNA-25']
+ngridcells['MED-25'] = ngridcells['MED-12']/4
+ngridcells['SEA-12'] = ngridcells['SEA-25']*4
+
+plans = pd.read_csv(
+  'https://raw.githubusercontent.com/WCRP-CORDEX/simulation-status/refs/heads/main/CMIP6_downscaling_plans.csv',
+  usecols=['domain', 'institute', 'experiment', 'status']
+) #.query('institute == "Ouranos"')
+
+simulation_count = plans.pivot_table(
+  index = 'domain',
+  columns= 'experiment',
+  aggfunc='size',
+  fill_value = 0
+).drop(columns = ['TBD','no plans','selected'])
+ic(simulation_count)
+
+dreq = pd.read_csv(
+  'https://raw.githubusercontent.com/WCRP-CORDEX/data-request-table/refs/heads/main/data-request/dreq_default.csv'
+)
+variable_count = dreq.pivot_table(
+  index = 'frequency',
+  columns= 'priority',
+  aggfunc='size',
+  fill_value = 0
+)
+ic(variable_count)
+
+freq_factor = variable_count.index.map(lambda x: frequency_factor.get(x, 0))
+variable_records_per_yr = (variable_count
+  .mul(freq_factor, axis=0)
+)
+ic(variable_records_per_yr)
+
+# Just the these variable records
+print(f'/!\ Considering just {priorities} vars.)')
+nrecords_factor = variable_records_per_yr[priorities].values.sum()
+
+ngridcell_factor = simulation_count.index.map(lambda x: ngridcells.get(str(x)))
+exp_factor = simulation_count.columns.map(lambda x: experiment_factor.get(x, experiment_factor_default))
+
+size_TB = (simulation_count
+  .mul(ngridcell_factor, axis=0)
+  .mul(exp_factor, axis=1)
+  .mul(nrecords_factor)
+  .mul(precision_factor)
+  .mul(compression_factor)
+  .mul(bytes_to_TB)
+  .round(1)
+)
+ic(size_TB)
+
+ic(size_TB.T.sum())
+
+print(f'Total CORDEX-CMIP6 estimated size is: {np.nansum(size_TB.values):.0f} TB')