Skip to content

Commit

Permalink
Merge pull request #138 from aradhakrishnanGFDL/CompVal
Browse files Browse the repository at this point in the history
Comp val (In progress)
  • Loading branch information
aradhakrishnanGFDL authored Jul 17, 2024
2 parents 78a681f + e2356bf commit 14e6e14
Show file tree
Hide file tree
Showing 2 changed files with 122 additions and 1 deletion.
1 change: 0 additions & 1 deletion cats/gfdl_template.json
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,6 @@
},
{
"column_name": "chunk_freq",
"vocabulary": "https://raw.githubusercontent.com/NOAA-GFDL/CMIP6_CVs/master/GFDL_chunk_freq.json",
"required": false
},
{
Expand Down
122 changes: 122 additions & 0 deletions tests/compval
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
#!/usr/bin/env python

import click
import json
from jsondiff import diff
import pandas as pd
import sys
import os
import re
import math

@click.command()
@click.argument('json_path',nargs=1,required=True)
@click.argument('cv_dir_path',nargs=1,required=False)
def main(json_path=None, cv_dir_path=None):

''' This test validates catalogs against CMIP6 or GFDL controlled vocabulary (CV) as provided by particular JSON schemas per vocabulary type. CMIP6 CV's are found in the WCRP-CMIP/CMIP6_CVs github repository. GFDL CV's are found in the NOAA-GFDL/CMIP6_CVs github repository.
JSON_PATH = Path to generated catalog JSON schema
CV_DIR_PATH = Path to CMIP6 CV Repository.
USAGE:
To validate against GFDL CV's: compval <json_path>
(Uses CV found in json's "vocabulary" field)
To validate against CMIP CV's: compval <json_path> <cv_dir_path>
(Must clone WCRP-CMIP/CMIP6_CVs github directory. CV is found automatically given the path to this directory) '''
bad_vocab = []
nan_list = []
vocab_list = []

#Open catalog json
j = json.load(open(json_path))

#Get CSV from JSON and open it
csv_path = str(j["catalog_file"])
catalog = pd.read_csv(csv_path)

if cv_dir_path:
#Gather all CV's from CMIP6_CV Dir
for filename in os.listdir(cv_dir_path):
if str(filename).startswith("CMIP6"):
cv_type = re.search('CMIP6_(.*).json',str(filename)).group(1)

#Open JSON
try:
cv = json.load(open(cv_dir_path+'/CMIP6_'+cv_type+'.json'))
except:
sys.exit("Unable to open json: " + cv_type)

#Ignoring directory structure for now
if list(cv.keys())[0] == 'DRS':
continue
#vals = list(cv.values())
#dir_structure = list(vals[0].values())[2]

else:
vals = list(cv.values())

#Sometimes the CV formats are a little different :)
if isinstance(vals[0],list):
for vocab in vals[0]:
vocab_list.append(vocab)

else:
for vocab in vals[0].keys():
vocab_list.append(vocab)

#Sometimes our catalogs don't include the cv type as a column
if cv_type in catalog:

for y in catalog[cv_type]:

#If there's NaN's this whole thing will break so let's get those out of the way
if not isinstance(y,str) and math.isnan(y):
#Add to the nan list
if cv_type not in nan_list:
nan_list.append(cv_type)
continue
else:
continue

#Check for bad vocab
if y not in vocab_list and y not in bad_vocab:
bad_vocab.append(y)

else:
#Parse through the JSON and find which CV is needed

for x in j["attributes"]:

#Checks to see if the vocabulary field is filled out. If so, use the vocab. Also, we avoid chunk_freq because it's currently broken.
if list(x.values())[1]:
cv_url = list(x.values())[1]
try:
df = pd.read_json(cv_url)
except:
sys.exit("Unable to open json: " + list(x.values())[0])

#This will probably break if formatting is different. Will adjust if necessary.
for y in df[list(x.values())[0]].keys():
vocab_list.append(y)

#Look for "bad vocab" in the CSV
for z in catalog[list(x.values())[0]]:
if z not in vocab_list and z not in bad_vocab:
if not isinstance(z,str) and math.isnan(z):
continue
else:
bad_vocab.append(z)

if nan_list:
print("WARNING: NaN's found in: " + str(nan_list))

if bad_vocab:
print("Found inconsistent value(s): " + str(bad_vocab))
else:
print("Check passed.")

if __name__ == '__main__':
main()

0 comments on commit 14e6e14

Please sign in to comment.