-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
✨ Dataservice study generator to aid in dev + test
- Loading branch information
Showing
10 changed files
with
1,082 additions
and
0 deletions.
There are no files selected for viewing
Empty file.
Empty file.
Empty file.
81 changes: 81 additions & 0 deletions
81
creator/studies/data_generator/ingest_package/extract_configs/biospec.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
""" | ||
Extract config for bio manifest generated by | ||
creator.ingest_runs.data_generator.study_generator | ||
Contains minimal data needed to build Kids First Data Service entities: | ||
- family | ||
- participant | ||
- biospecimen | ||
See documentation at | ||
https://kids-first.github.io/kf-lib-data-ingest/tutorial/extract.html for | ||
information on writing extract config files. | ||
""" | ||
|
||
from kf_lib_data_ingest.common import constants # noqa F401 | ||
from kf_lib_data_ingest.common.concept_schema import CONCEPT | ||
from kf_lib_data_ingest.etl.extract.operations import * | ||
|
||
# Relative file URL of the bio manifest read by this extract config.
source_data_url = "file://../data/bio_manifest.tsv" | ||
|
||
# No special read parameters are supplied for this manifest.
source_data_read_params = {} | ||
|
||
# (Optional) You can set a custom read function with | ||
# source_data_read_func | ||
|
||
# Extract operations. Each entry names the manifest column it reads (in_col,
# where one is given) and the concept column it writes (out_col).
operations = [ | ||
# Family identifiers
keep_map( | ||
in_col="kf_id_family", | ||
out_col=CONCEPT.FAMILY.TARGET_SERVICE_ID, | ||
), | ||
keep_map( | ||
in_col="family_id", | ||
out_col=CONCEPT.FAMILY.ID, | ||
), | ||
# Participant identifiers
keep_map( | ||
in_col="kf_id_participant", | ||
out_col=CONCEPT.PARTICIPANT.TARGET_SERVICE_ID, | ||
), | ||
keep_map( | ||
in_col="participant_id", | ||
out_col=CONCEPT.PARTICIPANT.ID, | ||
), | ||
# Translate the manifest's "Male"/"Female" labels to the library's
# gender constants.
value_map( | ||
in_col="gender", | ||
m={ | ||
"Male": constants.GENDER.MALE, | ||
"Female": constants.GENDER.FEMALE, | ||
}, | ||
out_col=CONCEPT.PARTICIPANT.GENDER, | ||
), | ||
# Biospecimen identifiers
keep_map( | ||
in_col="kf_id_biospecimen", | ||
out_col=CONCEPT.BIOSPECIMEN.TARGET_SERVICE_ID, | ||
), | ||
keep_map( | ||
in_col="sample_id", | ||
out_col=CONCEPT.BIOSPECIMEN.ID, | ||
), | ||
# Volume and concentration are passed through float().
value_map( | ||
in_col="volume", | ||
m=lambda x: float(x), | ||
out_col=CONCEPT.BIOSPECIMEN.VOLUME_UL, | ||
), | ||
value_map( | ||
in_col="concentration", | ||
m=lambda x: float(x), | ||
out_col=CONCEPT.BIOSPECIMEN.CONCENTRATION_MG_PER_ML, | ||
), | ||
keep_map( | ||
in_col="tissue_type", | ||
out_col=CONCEPT.BIOSPECIMEN.TISSUE_TYPE, | ||
), | ||
# Constants (no input column): Broad's KF ID as the sequencing center and
# DNA as the analyte.
constant_map( | ||
m=constants.SEQUENCING.CENTER.BROAD.KF_ID, | ||
out_col=CONCEPT.SEQUENCING.CENTER.TARGET_SERVICE_ID, | ||
), | ||
constant_map( | ||
m=constants.SEQUENCING.ANALYTE.DNA, | ||
out_col=CONCEPT.BIOSPECIMEN.ANALYTE, | ||
), | ||
] |
72 changes: 72 additions & 0 deletions
72
creator/studies/data_generator/ingest_package/extract_configs/genomic.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
""" | ||
Extract config for sequencing manifest generated by | ||
creator.ingest_runs.data_generator.study_generator | ||
Contains minimal data needed to build Kids First Data Service entities: | ||
- sequencing_experiment | ||
- genomic_file (source /unharmonized only) | ||
- biospecimen_genomic_file | ||
- sequencing_experiment_genomic_file | ||
See documentation at | ||
https://kids-first.github.io/kf-lib-data-ingest/tutorial/extract.html for | ||
information on writing extract config files. | ||
""" | ||
import os | ||
from kf_lib_data_ingest.common import constants, pandas_utils # noqa F401 | ||
from kf_lib_data_ingest.common.concept_schema import CONCEPT | ||
from kf_lib_data_ingest.etl.extract.operations import * | ||
from kf_lib_data_ingest.common.io import read_df | ||
|
||
# "data" directory located beside the directory that contains this file
# (parent of this file's directory, joined with "data").
# NOTE(review): DATA_DIR is not referenced anywhere else in the visible part
# of this file - confirm whether it is still needed.
DATA_DIR = ( | ||
os.path.join( | ||
os.path.dirname(os.path.dirname(__file__)), | ||
"data" | ||
) | ||
) | ||
# Relative file URL of the sequencing manifest read by this extract config.
source_data_url = "file://../data/sequencing_manifest.tsv" | ||
|
||
|
||
# TODO (Optional) Fill in special loading parameters here | ||
source_data_read_params = {} | ||
|
||
# TODO (Optional) You can set a custom read function with | ||
# source_data_read_func | ||
|
||
|
||
# Extract operations. Each entry names the manifest column it reads (in_col,
# where one is given) and the concept column it writes (out_col).
operations = [ | ||
keep_map( | ||
in_col="project_id", | ||
out_col=CONCEPT.SEQUENCING.ID, | ||
), | ||
keep_map( | ||
in_col="sample_id", | ||
out_col=CONCEPT.BIOSPECIMEN.ID, | ||
), | ||
keep_map( | ||
in_col="experiment_strategy", | ||
out_col=CONCEPT.SEQUENCING.STRATEGY, | ||
), | ||
# Source genomic file KF ID | ||
keep_map( | ||
in_col="kf_id_source_genomic_file", | ||
out_col=CONCEPT.GENOMIC_FILE.TARGET_SERVICE_ID, | ||
), | ||
keep_map( | ||
in_col="source_path", | ||
out_col=CONCEPT.GENOMIC_FILE.ID, | ||
), | ||
# Constants (no input column): files are marked as not harmonized, with a
# GRCh38 reference genome and paired-end sequencing.
constant_map( | ||
m=False, | ||
out_col=CONCEPT.GENOMIC_FILE.HARMONIZED, | ||
), | ||
constant_map( | ||
m=constants.SEQUENCING.REFERENCE_GENOME.GRCH38, | ||
out_col=CONCEPT.GENOMIC_FILE.REFERENCE_GENOME, | ||
), | ||
constant_map( | ||
m=True, | ||
out_col=CONCEPT.SEQUENCING.PAIRED_END, | ||
), | ||
] |
148 changes: 148 additions & 0 deletions
148
creator/studies/data_generator/ingest_package/extract_configs/s3_scrape_config.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,148 @@ | ||
""" | ||
Extract config for the s3 object manifest generated by | ||
creator.ingest_runs.data_generator.study_generator | ||
Contains minimal data needed to build Kids First Data Service entities: | ||
- genomic_file (source /unharmonized only) | ||
See documentation at | ||
https://kids-first.github.io/kf-lib-data-ingest/tutorial/extract.html for | ||
information on writing extract config files. | ||
""" | ||
from kf_lib_data_ingest.common import constants | ||
from kf_lib_data_ingest.common.constants import GENOMIC_FILE, COMMON | ||
from kf_lib_data_ingest.common.concept_schema import CONCEPT | ||
from kf_lib_data_ingest.etl.extract.operations import ( | ||
keep_map, | ||
row_map, | ||
value_map, | ||
constant_map, | ||
) | ||
|
||
|
||
def genomic_file_ext(x):
    """
    Return the longest key of FILE_EXT_FORMAT_MAP that ``x`` ends with.

    Picking the longest suffix lets a compound extension such as
    ".g.vcf.gz.tbi" win over the shorter ".vcf.gz.tbi" it also ends with.
    Returns None when ``x`` ends with none of the known extensions.
    """
    candidates = (ext for ext in FILE_EXT_FORMAT_MAP if x.endswith(ext))
    return max(candidates, key=len, default=None)
|
||
|
||
# File-name suffix -> genomic file format constant.
# Several suffixes overlap (e.g. ".bam" / ".hgv.bam", ".vcf.gz" / ".g.vcf.gz");
# genomic_file_ext resolves this by choosing the longest matching suffix.
FILE_EXT_FORMAT_MAP = { | ||
".fq": GENOMIC_FILE.FORMAT.FASTQ, | ||
".fastq": GENOMIC_FILE.FORMAT.FASTQ, | ||
".fq.gz": GENOMIC_FILE.FORMAT.FASTQ, | ||
".fastq.gz": GENOMIC_FILE.FORMAT.FASTQ, | ||
".bam": GENOMIC_FILE.FORMAT.BAM, | ||
".hgv.bam": GENOMIC_FILE.FORMAT.BAM, | ||
".cram": GENOMIC_FILE.FORMAT.CRAM, | ||
".bam.bai": GENOMIC_FILE.FORMAT.BAI, | ||
".bai": GENOMIC_FILE.FORMAT.BAI, | ||
".cram.crai": GENOMIC_FILE.FORMAT.CRAI, | ||
".crai": GENOMIC_FILE.FORMAT.CRAI, | ||
".g.vcf.gz": GENOMIC_FILE.FORMAT.GVCF, | ||
".g.vcf.gz.tbi": GENOMIC_FILE.FORMAT.TBI, | ||
".vcf.gz": GENOMIC_FILE.FORMAT.VCF, | ||
".vcf": GENOMIC_FILE.FORMAT.VCF, | ||
".vcf.gz.tbi": GENOMIC_FILE.FORMAT.TBI, | ||
".peddy.html": GENOMIC_FILE.FORMAT.HTML, | ||
".md5": COMMON.OTHER, | ||
} | ||
|
||
# Genomic file format -> data type. A few entries are keyed by file extension
# instead of format; data_type falls back to an extension lookup when the
# format lookup yields nothing.
DATA_TYPES = { | ||
GENOMIC_FILE.FORMAT.FASTQ: GENOMIC_FILE.DATA_TYPE.UNALIGNED_READS, | ||
GENOMIC_FILE.FORMAT.BAM: GENOMIC_FILE.DATA_TYPE.ALIGNED_READS, | ||
GENOMIC_FILE.FORMAT.CRAM: GENOMIC_FILE.DATA_TYPE.ALIGNED_READS, | ||
GENOMIC_FILE.FORMAT.BAI: GENOMIC_FILE.DATA_TYPE.ALIGNED_READS_INDEX, | ||
GENOMIC_FILE.FORMAT.CRAI: GENOMIC_FILE.DATA_TYPE.ALIGNED_READS_INDEX, | ||
GENOMIC_FILE.FORMAT.VCF: GENOMIC_FILE.DATA_TYPE.VARIANT_CALLS, | ||
GENOMIC_FILE.FORMAT.GVCF: GENOMIC_FILE.DATA_TYPE.GVCF, | ||
GENOMIC_FILE.FORMAT.HTML: COMMON.OTHER, | ||
# Different TBI types share the same format in FILE_EXT_FORMAT_MAP above | ||
".g.vcf.gz.tbi": GENOMIC_FILE.DATA_TYPE.GVCF_INDEX, | ||
".vcf.gz.tbi": GENOMIC_FILE.DATA_TYPE.VARIANT_CALLS_INDEX, | ||
".md5": COMMON.OTHER, | ||
} | ||
|
||
|
||
def filter_df_by_file_ext(df):
    """
    Tag each row with its genomic file format and drop unrecognized rows.

    The format is derived from the "Key" column via file_format and stored
    in the CONCEPT.GENOMIC_FILE.FILE_FORMAT column of ``df`` (the passed-in
    frame is modified in place). Only rows whose format is not null are
    returned, i.e. rows whose extension is a key of FILE_EXT_FORMAT_MAP.
    """
    format_col = CONCEPT.GENOMIC_FILE.FILE_FORMAT
    df[format_col] = df["Key"].apply(file_format)
    has_known_format = df[format_col].notnull()
    return df[has_known_format]
|
||
|
||
# Relative file URL of the S3 object manifest read by this extract config.
source_data_url = "file://../data/s3_source_gf_manifest.tsv" | ||
|
||
# Register filter_df_by_file_ext (keeps only rows with a recognized file
# extension) under the name do_after_read.
# NOTE(review): presumably the ingest library calls this hook on the
# DataFrame right after reading the source data - confirm against its docs.
do_after_read = filter_df_by_file_ext | ||
|
||
|
||
def s3_url(row):
    """
    Build the s3://<bucket>/<key> URL for one manifest row.

    ``row`` must support item lookup for "Bucket" and "Key".
    """
    bucket = row["Bucket"]
    key = row["Key"]
    return "s3://{}/{}".format(bucket, key)
|
||
|
||
def file_format(x):
    """
    Look up the genomic file format for ``x``.

    The extension found by genomic_file_ext is used as the key into
    FILE_EXT_FORMAT_MAP; None is returned when there is no entry for it.
    """
    ext = genomic_file_ext(x)
    return FILE_EXT_FORMAT_MAP.get(ext)
|
||
|
||
def data_type(x):
    """
    Look up the genomic file data type for ``x``.

    DATA_TYPES is first queried with the file format. Some data types share
    a format, so when that lookup gives nothing (or a falsy value) the file
    extension itself is used as the key instead.
    """
    by_format = DATA_TYPES.get(file_format(x))
    if by_format:
        return by_format
    return DATA_TYPES.get(genomic_file_ext(x))
|
||
|
||
def fname(key):
    """
    Return the part of ``key`` after its last "/".

    A key without any "/" is returned unchanged; a key ending in "/" yields
    an empty string.
    """
    _, _, name = key.rpartition("/")
    return name
|
||
|
||
# Extract operations for the S3 object manifest. Each entry writes the concept
# column named by out_col.
operations = [ | ||
# The s3:// URL built from Bucket and Key is used both as the genomic file
# ID and, wrapped in a one-element list, as its URL list.
row_map(out_col=CONCEPT.GENOMIC_FILE.ID, m=s3_url), | ||
row_map( | ||
out_col=CONCEPT.GENOMIC_FILE.URL_LIST, m=lambda row: [s3_url(row)] | ||
), | ||
value_map(out_col=CONCEPT.GENOMIC_FILE.FILE_NAME, in_col="Key", m=fname), | ||
keep_map(in_col="Size", out_col=CONCEPT.GENOMIC_FILE.SIZE), | ||
# Hash dict with one entry: the lower-cased S3_ETAG constant as key and
# the ETag with its double quotes removed as value.
value_map( | ||
in_col="ETag", | ||
out_col=CONCEPT.GENOMIC_FILE.HASH_DICT, | ||
m=lambda x: {constants.FILE.HASH.S3_ETAG.lower(): x.replace('"', "")}, | ||
), | ||
constant_map( | ||
out_col=CONCEPT.GENOMIC_FILE.AVAILABILITY, | ||
m=constants.GENOMIC_FILE.AVAILABILITY.IMMEDIATE, | ||
), | ||
# The FILE_FORMAT column is filled in by filter_df_by_file_ext, so it is
# mapped onto itself here.
keep_map( | ||
in_col=CONCEPT.GENOMIC_FILE.FILE_FORMAT, | ||
out_col=CONCEPT.GENOMIC_FILE.FILE_FORMAT, | ||
), | ||
value_map( | ||
in_col="Key", | ||
out_col=CONCEPT.GENOMIC_FILE.DATA_TYPE, | ||
m=data_type, | ||
), | ||
] |
24 changes: 24 additions & 0 deletions
24
creator/studies/data_generator/ingest_package/ingest_package_config.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
""" Ingest Package Config """ | ||
|
||
from kf_lib_data_ingest.common.concept_schema import CONCEPT | ||
|
||
# The list of entities that will be loaded into the target service. These | ||
# should be class_name values of your target API config's target entity | ||
# classes. | ||
target_service_entities = [ | ||
"family", | ||
"participant", | ||
"biospecimen", | ||
"sequencing_experiment", | ||
"genomic_file", | ||
"biospecimen_genomic_file", | ||
"sequencing_experiment_genomic_file", | ||
] | ||
|
||
# All paths are relative to the directory this file is in | ||
extract_config_dir = "extract_configs" | ||
|
||
# Module that provides the transform function for this package.
transform_function_path = "transform_module.py" | ||
|
||
# Kids First Study ID | ||
# NOTE(review): left empty here - presumably filled in elsewhere before the
# package is ingested; confirm.
study = "" |
31 changes: 31 additions & 0 deletions
31
creator/studies/data_generator/ingest_package/transform_module.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
""" | ||
Transform module to merge data generated by | ||
creator.ingest_runs.data_generator.study_generator | ||
""" | ||
|
||
from kf_lib_data_ingest.common.concept_schema import CONCEPT # noqa F401 | ||
|
||
# Use these merge funcs, not pandas.merge | ||
from kf_lib_data_ingest.common.pandas_utils import ( # noqa F401 | ||
merge_wo_duplicates, | ||
outer_merge, | ||
) | ||
from kf_lib_data_ingest.config import DEFAULT_KEY | ||
|
||
|
||
def transform_function(mapped_df_dict):
    """
    Combine the three extract outputs into a single DataFrame.

    The S3 scrape output is first merged with the sequencing manifest output
    on the genomic file ID; the biospecimen manifest output is then merged
    with that result on the biospecimen ID. ``mapped_df_dict`` is keyed by
    extract config file name.

    Returns a dict holding the merged DataFrame under DEFAULT_KEY.
    """
    s3_df = mapped_df_dict['s3_scrape_config.py']
    sequencing_df = mapped_df_dict['genomic.py']
    genomic_files = merge_wo_duplicates(
        s3_df, sequencing_df, on=CONCEPT.GENOMIC_FILE.ID
    )

    biospecimens = mapped_df_dict['biospec.py']
    merged = merge_wo_duplicates(
        biospecimens, genomic_files, on=CONCEPT.BIOSPECIMEN.ID
    )
    return {DEFAULT_KEY: merged}
Oops, something went wrong.