Skip to content

Commit

Permalink
Create vessel_ETL.py in etl directory
Browse files Browse the repository at this point in the history
  • Loading branch information
Anita Caron authored Feb 22, 2024
1 parent 8d0781b commit 4932307
Showing 1 changed file with 80 additions and 0 deletions.
80 changes: 80 additions & 0 deletions src/scripts/etl/vessel_ETL.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import argparse

import pandas as pd
import numpy as np
from utils import extract, generate_id, load, read, search_id, transform_id


def transform_pattern(data: pd.DataFrame) -> pd.DataFrame:
    """Build the pattern table, one row per distinct ``VesselBaseName``.

    Rows without a ``VesselBaseName`` are skipped, then duplicates on that
    column are dropped (first occurrence kept). Each remaining row yields a
    record with these fields:

    * ``defined_class`` -- ``'VCCF:'`` followed by the next value drawn from
      ``generate_id(1000000, 1999999)``.
    * ``label`` -- the vessel name with trailing whitespace removed.
    * ``human_label`` -- the label followed by ``' (Human)'``.
    * ``parent`` -- the ``UBERON`` value when present; otherwise the
      ``VesselTypeID`` when it is a string that does not contain ``'fma'``;
      otherwise left empty.
    * ``location`` -- the ``BodyPart`` value.
    * ``xrefs`` -- ``'|'``-joined references: the ``ReferenceURL`` (only if
      it contains ``'http'`` and does not contain ``'UBERON'``) and
      ``'DOI:<ReferenceDOI>'`` when a DOI is present.
    * ``synonym`` -- the ``FMALabel`` value.
    * ``synonym_xrefs`` -- ``transform_id(FMA)`` when ``FMA`` is present.
    * ``taxon`` -- the constant NCBITaxon_9606 IRI.

    Args:
        data: Frame holding at least the columns ``VesselBaseName``,
            ``VesselTypeID``, ``UBERON``, ``BodyPart``, ``ReferenceURL``,
            ``ReferenceDOI``, ``FMALabel`` and ``FMA``.

    Returns:
        A new frame with a fixed column set and order, even when ``data``
        is empty or no row provides an optional field.
    """
    # Fixed output schema so the result does not depend on which optional
    # keys happen to be present in the first record.
    columns_pattern = [
        "defined_class", "label", "human_label", "parent", "location",
        "xrefs", "synonym", "synonym_xrefs", "taxon",
    ]

    # A row without a name cannot produce a label; drop it up front instead
    # of failing on ``float.rstrip`` inside the loop.
    data = data.dropna(subset=['VesselBaseName'])
    data = data.drop_duplicates(['VesselBaseName'])

    vccf_id = generate_id(1000000, 1999999)
    data_pattern = []
    for _, row in data.iterrows():
        name = row['VesselBaseName'].rstrip()
        r = {
            "defined_class": f'VCCF:{next(vccf_id)}',
            "label": name,
            "human_label": f'{name} (Human)',
        }

        # Prefer the UBERON term as parent. Fall back to the vessel type ID
        # unless it is an FMA identifier. The isinstance guard keeps a
        # missing (NaN) VesselTypeID from raising TypeError on ``in``.
        vessel_type_id = row['VesselTypeID']
        if not pd.isna(row['UBERON']):
            r['parent'] = row['UBERON']
        elif isinstance(vessel_type_id, str) and 'fma' not in vessel_type_id:
            r['parent'] = vessel_type_id

        r['location'] = row['BodyPart']

        references = []
        # Only string URLs are usable; NaN/None mean "no URL".
        reference_url = row['ReferenceURL']
        if (
            isinstance(reference_url, str)
            and 'http' in reference_url
            and 'UBERON' not in reference_url
        ):
            references.append(reference_url)
        reference_doi = row['ReferenceDOI']
        if not pd.isna(reference_doi):
            references.append(f'DOI:{reference_doi}')

        r['xrefs'] = '|'.join(references)

        r['synonym'] = row['FMALabel']
        if not pd.isna(row['FMA']):
            r['synonym_xrefs'] = transform_id(row['FMA'])
        r['taxon'] = "http://purl.obolibrary.org/obo/NCBITaxon_9606"
        data_pattern.append(r)

    # Keys absent from every record become all-NaN columns, and an empty
    # record list still yields the full header.
    return pd.DataFrame(data_pattern, columns=columns_pattern)


def transform_template(
    pattern_data: pd.DataFrame,
    template_data: pd.DataFrame
) -> pd.DataFrame:
    """Build the template table linking each vessel to the one it branches from.

    The first record is a fixed header row that maps ``Vessel`` to ``ID``
    and ``branches from`` to ``SC 'connecting branch of' some %``
    (presumably the ROBOT template directive row -- confirm against the
    consumer of the output file). After it comes one record per row of
    ``template_data``, where both the ``VesselBaseName`` and the
    ``BranchesFrom`` values are resolved through
    ``search_id(pattern_data, ...)``.

    Args:
        pattern_data: Table passed as the first argument to ``search_id``.
        template_data: Frame with ``VesselBaseName`` and ``BranchesFrom``
            columns.

    Returns:
        A frame with the columns ``Vessel`` and ``branches from``.
    """
    header = {"Vessel": "ID", "branches from": "SC 'connecting branch of' some %"}
    branch_records = [
        {
            'Vessel': search_id(pattern_data, entry['VesselBaseName']),
            'branches from': search_id(pattern_data, entry['BranchesFrom']),
        }
        for _, entry in template_data.iterrows()
    ]
    return pd.DataFrame.from_records([header, *branch_records])


def main(args):
    """Run the vessel ETL from ``args.input`` to the two output files.

    Reads the input once, derives the pattern table and writes it to
    ``args.pattern``, then derives the template table (which needs the
    pattern table for ID lookup) and writes it to ``args.template``.

    Args:
        args: Object exposing the attributes ``input``, ``pattern`` and
            ``template`` (as produced by the script's argument parser).
    """
    pattern_columns = [
        "VesselBaseName", "VesselType", "VesselTypeID", "ReferenceURL", "ReferenceDOI",
        "BodyPart", "BodyPartID", "UBERON", "UBERONLabel", "FMALabel", "FMA"
    ]
    template_columns = ["BranchesFrom", "VesselBaseName"]

    source = read(args.input)

    # Pattern table first: the template step resolves IDs against it.
    pattern_table = transform_pattern(
        extract(data=source, columns_extract=pattern_columns)
    )
    load(pattern_table, args.pattern)

    template_table = transform_template(
        pattern_table,
        extract(data=source, columns_extract=template_columns),
    )
    load(template_table, args.template)


if __name__ == '__main__':
    # Command-line entry point: declare the three file options, parse the
    # command line and hand the resulting namespace to main().
    parser = argparse.ArgumentParser()
    for flag, description in (
        ('--input', 'vessel input file'),
        ('--pattern', 'dos-dp pattern output file'),
        ('--template', 'robot template output file'),
    ):
        parser.add_argument(flag, help=description)

    args = parser.parse_args()
    main(args)

0 comments on commit 4932307

Please sign in to comment.