-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'main' into dv-WS_uploader_performance
- Loading branch information
Showing
7 changed files
with
546 additions
and
9 deletions.
There are no files selected for viewing
209 changes: 209 additions & 0 deletions
209
src/common/collection_column_specs/parse_sample_template.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,209 @@ | ||
""" | ||
usage: parse_sample_template.py [-h] --input_yaml INPUT_YAML --core_yaml CORE_YAML --output_yaml OUTPUT_YAML | ||
Parse the YAML structure of a sample template file and the core YAML file to generate a YAML file containing | ||
specifications for sample collection columns. | ||
The sample template file can be downloaded from | ||
https://github.com/kbase/sample_service_validator_config/tree/master/templates | ||
The core yaml file can be downloaded from | ||
https://github.com/kbase/sample_service_validator_config/blob/master/vocabularies/core.yml | ||
PLEASE NOTE: | ||
users should manually examine the output yaml file and make sure the output is correct | ||
then manually copy the output to src/common/collection_column_specs/samples-[collection].yml | ||
In other words, the output of this script is to save the user's time when constructing the sample column | ||
specifications and is not expected to be completely accurate. | ||
Discrepancies have been observed in the contextual information for certain sample fields, | ||
such as description, display name, etc., between the two sample templates under the same field name. | ||
These inconsistencies should be addressed either through manual correction or by transitioning to the unified | ||
metadata_validation.yml file. | ||
options: | ||
-h, --help show this help message and exit | ||
--input_yaml INPUT_YAML | ||
sample template YAML file | ||
--core_yaml CORE_YAML | ||
core sample YAML file | ||
--output_yaml OUTPUT_YAML | ||
output YAML file | ||
TODO: | ||
metadata_validation.yml file might be another source of information for constructing the column specifications. | ||
We should consider using this file instead of combining the sample template and core yaml files if we are parsing more samples. | ||
https://github.com/kbase/sample_service_validator_config/blob/master/metadata_validation.yml | ||
""" | ||
import argparse | ||
from datetime import datetime | ||
|
||
import yaml | ||
|
||
# Sample fields currently available in the collection service.
# parse_sample_spec() drops every parsed entry whose key is not listed here.
_CURRENT_SAMPLES = [
    'enigma:collection_time',
    'enigma:experiment_name',
    'enigma:well_name',
    'env_package',
    'material',
    'enigma:date',
    'enigma:time_zone',
    'latitude',
    'longitude',
    'sample_template',
    'sesar:igsn',
    'sesar:material',
    'sesar:field_name',
    'other_names',
    'sesar:collection_method',
    'sesar:collection_method_description',
    'purpose',
    'sesar:physiographic_feature_primary',
    'sesar:physiographic_feature_name',
    'country',
    'sesar:field_program_cruise',
    'sesar:collector_chief_scientist',
    'sesar:collection_date',
    'sesar:archive_contact_current',
]

# String-typed keys that get the "ngram" filter strategy; every other string
# key falls back to "identity" (see _string_type).
_NGRAM_KEY = [
    'other_names',
    'purpose',
    'country',
    'sesar:collection_method_description',
    'sesar:archive_contact_current',
    'sesar:collector_chief_scientist',
    'sesar:collection_method',
    'sesar:field_program_cruise',
    'sesar:material',
    'sesar:physiographic_feature_name',
    'sesar:physiographic_feature_primary',
    'enigma:experiment_name',
    'enigma:well_name',
]

# Descriptions from the sample service that need to be corrected; these values
# take precedence over the description found in the parsed YAML files.
_CUSTOM_DESCRIPTION = {
    'sesar:igsn': 'International Geo Sample Number.',
    'longitude': ('Longitude of the location where the sample was collected '
                  'in WGS 84 coordinate system.'),
    'latitude': ('Latitude of the location where the sample was collected '
                 'in WGS 84 coordinate system.'),
}

# Shared sample attributes with distinct display names across sample services
# (sesar and enigma); these values override the template's column name.
_CUSTOM_DISPLAY_NAME = {
    'longitude': 'Longitude',
    'latitude': 'Latitude',
}
|
||
|
||
def _is_date_string(example_value, key):
    """Return True if *example_value* matches one of the known date formats.

    :param example_value: the string to test.
    :param key: the sample field key; not used by the check itself, accepted
        so the signature stays the same for existing callers.
    :return: True when ``datetime.strptime`` accepts the value with any of the
        supported formats, False otherwise.
    """
    # https://github.com/kbase/collections/blob/main/src/loaders/common/loader_helper.py#L52
    formats_to_try = ["%Y/%m/%d", "%Y-%m-%d", "%m/%d/%y", "%Y-%m-%dT%H:%M:%S%z"]
    for date_format in formats_to_try:
        try:
            datetime.strptime(example_value, date_format)
        except ValueError:
            # not this format - try the next one
            continue
        return True
    # explicit boolean result instead of implicitly returning None
    return False
|
||
|
||
def _string_type(example_value, key):
    """Infer the column type specification from an example value.

    :param example_value: the example taken from the YAML file. May be a
        string, an int, a float, or a date/datetime object.
    :param key: the sample field key; decides the filter strategy of string
        columns and is included in the error message.
    :return: a dict with a ``type`` entry ("date", "int", "float" or
        "string"); string columns additionally carry ``filter_strategy``
        ("ngram" when the key is in ``_NGRAM_KEY``, otherwise "identity").
    :raises ValueError: if the example value is of an unsupported type.
    """
    # Local import: the module itself only imports the ``datetime`` class.
    from datetime import date

    if isinstance(example_value, str):
        # handle a situation like '2; 10' - IOW use the 1st of multiple examples
        # https://github.com/kbase/sample_service_validator_config/blob/master/templates/enigma_template.yml#L309C5-L309C19
        example_value = example_value.split(';')[0].strip()
        if _is_date_string(example_value, key):
            return {"type": "date"}
        # try the narrower numeric type first, then the wider one
        for type_name, converter in (("int", int), ("float", float)):
            try:
                converter(example_value)
            except ValueError:
                continue
            return {"type": type_name}
        return {"type": "string",
                "filter_strategy": "ngram" if key in _NGRAM_KEY else "identity"}
    if isinstance(example_value, date):
        # yaml.safe_load converts unquoted ISO dates / timestamps into
        # date / datetime objects (datetime is a subclass of date), so they
        # never reach the string branch above.
        return {"type": "date"}
    if isinstance(example_value, int):
        return {"type": "int"}
    if isinstance(example_value, float):
        return {"type": "float"}
    raise ValueError(f'Unknown type for {example_value} for key {key}')
|
||
|
||
def _parse_input_yaml(input_yaml):
    """Build column specifications from a sample template YAML file.

    Every entry of the file's ``Columns`` mapping becomes one dict holding the
    field key, the type information inferred from the entry's ``example``,
    the display name, the capitalized category and the description.

    :param input_yaml: path of the sample template YAML file.
    :return: list of column specification dicts, in file order.
    """
    with open(input_yaml, 'r') as template_file:
        template = yaml.safe_load(template_file)

    specs = []
    for display_name, attrs in template['Columns'].items():
        # The key is the first parameter of the first transformation when the
        # column defines transformations, otherwise its first alias.
        if 'transformations' in attrs:
            key = attrs['transformations'][0]['parameters'][0]
        else:
            key = attrs['aliases'][0]

        spec = {'key': key}
        spec.update(_string_type(attrs['example'], key))
        # custom overrides win over the values found in the template
        spec['display_name'] = _CUSTOM_DISPLAY_NAME.get(key, display_name)
        spec['category'] = attrs['category'].capitalize()
        spec['description'] = _CUSTOM_DESCRIPTION.get(key, attrs['definition'])
        specs.append(spec)

    return specs
|
||
|
||
def _parse_core_yaml(core_yaml):
    """Build column specifications from the core sample YAML file.

    Every entry of the file's ``terms`` mapping becomes one dict holding the
    term name as key, the type information inferred from the term's first
    example, its title as display name, a fixed category and the description.

    :param core_yaml: path of the core sample YAML file.
    :return: list of column specification dicts, in file order.
    """
    with open(core_yaml, 'r') as core_file:
        core = yaml.safe_load(core_file)

    return [
        {
            'key': term,
            **_string_type(attrs['examples'][0], term),
            'display_name': attrs['title'],
            # all core terms share the same category
            'category': 'description'.capitalize(),
            # a custom description wins over the one from the core file
            'description': _CUSTOM_DESCRIPTION.get(term, attrs['description']),
        }
        for term, attrs in core['terms'].items()
    ]
|
||
|
||
def parse_sample_spec(input_yaml, core_yaml, output_yaml):
    """Merge template and core column specifications and write them as YAML.

    Template entries come first; core entries are appended only when their key
    is not already provided by the template. Entries whose key is not listed
    in ``_CURRENT_SAMPLES`` are dropped before the result is written.

    :param input_yaml: path of the sample template YAML file.
    :param core_yaml: path of the core sample YAML file.
    :param output_yaml: path of the YAML file to write.
    """
    template_specs = _parse_input_yaml(input_yaml)
    core_specs = _parse_core_yaml(core_yaml)

    # the template definition wins when both files describe the same key
    seen_keys = {spec['key'] for spec in template_specs}
    merged = list(template_specs)
    for spec in core_specs:
        if spec['key'] not in seen_keys:
            merged.append(spec)

    selected = [spec for spec in merged if spec['key'] in _CURRENT_SAMPLES]
    with open(output_yaml, 'w') as out_file:
        yaml.dump(selected, out_file, default_flow_style=False, sort_keys=False)
|
||
|
||
if __name__ == "__main__":
    # Command line entry point: all three file arguments are mandatory.
    desc = """
Parse the YAML structure of a sample template file and the core YAML file to generate a YAML file containing
specifications for sample collection columns.
The sample template file can be downloaded from
https://github.com/kbase/sample_service_validator_config/tree/master/templates
The core yaml file can be downloaded from
https://github.com/kbase/sample_service_validator_config/blob/master/vocabularies/core.yml
PLEASE NOTE:
users should manually examine the output yaml file and make sure the output is correct
then manually copy the output to src/common/collection_column_specs/samples-[collection].yml
In other words, the output of this script is to save the user's time when constructing the sample column
specifications and is not expected to be completely accurate.
Observed discrepancies in the contextual information for certain sample fields,
such as description, display name, etc., between the two sample templates under the same field name.
These inconsistencies should be addressed either through manual correction or by transitioning to the unified
metadata_validation.yml file.
"""
    # RawTextHelpFormatter keeps the line breaks of the description as written
    parser = argparse.ArgumentParser(description=desc, formatter_class=argparse.RawTextHelpFormatter)
    for flag, help_text in (
            ('--input_yaml', 'sample template YAML file'),
            ('--core_yaml', 'core sample YAML file'),
            ('--output_yaml', 'output YAML file'),
    ):
        parser.add_argument(flag, help=help_text, required=True)
    cli_args = parser.parse_args()

    parse_sample_spec(cli_args.input_yaml, cli_args.core_yaml, cli_args.output_yaml)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,124 @@ | ||
columns: | ||
- key: coll | ||
type: string | ||
filter_strategy: identity | ||
non_visible: true | ||
|
||
- key: load_ver | ||
type: string | ||
filter_strategy: identity | ||
non_visible: true | ||
|
||
- key: _mtchsel | ||
type: string | ||
filter_strategy: identity | ||
non_visible: true | ||
|
||
- key: kbase_ids | ||
type: string | ||
filter_strategy: identity | ||
non_visible: true | ||
display_name: Related KBase IDs | ||
category: Identifiers | ||
description: | ||
|
||
- key: genome_count | ||
type: int | ||
display_name: Genome Count | ||
category: Statistics | ||
description: | ||
|
||
- key: kbase_sample_id | ||
type: string | ||
filter_strategy: identity | ||
display_name: KBase Sample ID | ||
category: Identifiers | ||
description: | ||
|
||
- key: kbase_display_name | ||
type: string | ||
filter_strategy: ngram | ||
display_name: KBase Name | ||
category: Identifiers | ||
description: | ||
|
||
# sample attributes currently accessible in the ENIGMA collection | ||
# parsed using parse_sample_spec in parse_sample_template.py | ||
- key: enigma:collection_time | ||
type: string | ||
filter_strategy: identity | ||
display_name: Collection Time | ||
category: Collection | ||
description: Collection Time | ||
- key: enigma:date | ||
type: date | ||
display_name: Date | ||
category: Collection | ||
description: Date (YYYY-MM-DD) | ||
- key: env_package | ||
type: string | ||
filter_strategy: identity | ||
display_name: Environmental Package | ||
category: Description | ||
description: Environmental Package (MIxS vocabulary) | ||
- key: enigma:experiment_name | ||
type: string | ||
filter_strategy: ngram | ||
display_name: Experiment Name | ||
category: Description | ||
description: Experiment Name | ||
- key: latitude | ||
type: float | ||
display_name: Latitude | ||
category: Geolocation | ||
description: Latitude of the location where the sample was collected in WGS 84 coordinate | ||
system. | ||
- key: longitude | ||
type: float | ||
display_name: Longitude | ||
category: Geolocation | ||
description: Longitude of the location where the sample was collected in WGS 84 | ||
coordinate system. | ||
- key: material | ||
type: string | ||
filter_strategy: identity | ||
display_name: Material | ||
category: Description | ||
description: Material from ENVO (child of ENVO:00010483) | ||
- key: other_names | ||
type: string | ||
filter_strategy: ngram | ||
display_name: Other Names | ||
category: Description | ||
description: Other name(s) used for the sample. | ||
- key: enigma:time_zone | ||
type: string | ||
filter_strategy: identity | ||
display_name: Time Zone | ||
category: Collection | ||
description: Time Zone (relative to UTC) | ||
- key: enigma:well_name | ||
type: string | ||
filter_strategy: ngram | ||
display_name: Well Name | ||
category: Description | ||
description: Well Name (ID) | ||
- key: country | ||
type: string | ||
filter_strategy: ngram | ||
display_name: Country | ||
category: Geolocation | ||
description: Country where the sample was collected | ||
- key: purpose | ||
type: string | ||
filter_strategy: ngram | ||
display_name: Purpose | ||
category: Description | ||
description: Purpose of sample | ||
- key: sample_template | ||
type: string | ||
filter_strategy: identity | ||
display_name: Template | ||
category: Description | ||
description: Template Format | ||
|
Oops, something went wrong.