Skip to content

Commit

Permalink
✨ Dataservice study generator to aid in dev + test
Browse files Browse the repository at this point in the history
  • Loading branch information
znatty22 committed Mar 25, 2021
1 parent 2629251 commit 47ee73e
Show file tree
Hide file tree
Showing 10 changed files with 1,082 additions and 0 deletions.
Empty file.
Empty file.
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
"""
Extract config for bio manifest generated by
creator.studies.data_generator.study_generator
Contains minimal data needed to build Kids First Data Service entities:
- family
- participant
- biospecimen
See documentation at
https://kids-first.github.io/kf-lib-data-ingest/tutorial/extract.html for
information on writing extract config files.
"""

from kf_lib_data_ingest.common import constants # noqa F401
from kf_lib_data_ingest.common.concept_schema import CONCEPT
from kf_lib_data_ingest.etl.extract.operations import *

# Location of the generated bio manifest (a TSV file) that this extract
# config reads.
source_data_url = "file://../data/bio_manifest.tsv"

# No special read parameters are supplied for this manifest.
source_data_read_params = {}

# (Optional) You can set a custom read function with
# source_data_read_func

# Column mappings from the bio manifest to standard concepts. Each entry maps
# one manifest column (in_col) to a CONCEPT attribute (out_col); the two
# constant_map entries at the end take no input column.
operations = [
    # --- Family ---
    keep_map(
        in_col="kf_id_family",
        out_col=CONCEPT.FAMILY.TARGET_SERVICE_ID,
    ),
    keep_map(
        in_col="family_id",
        out_col=CONCEPT.FAMILY.ID,
    ),
    # --- Participant ---
    keep_map(
        in_col="kf_id_participant",
        out_col=CONCEPT.PARTICIPANT.TARGET_SERVICE_ID,
    ),
    keep_map(
        in_col="participant_id",
        out_col=CONCEPT.PARTICIPANT.ID,
    ),
    # Translate the manifest's "Male"/"Female" labels into the ingest
    # library's GENDER constants.
    value_map(
        in_col="gender",
        m={
            "Male": constants.GENDER.MALE,
            "Female": constants.GENDER.FEMALE,
        },
        out_col=CONCEPT.PARTICIPANT.GENDER,
    ),
    # --- Biospecimen ---
    keep_map(
        in_col="kf_id_biospecimen",
        out_col=CONCEPT.BIOSPECIMEN.TARGET_SERVICE_ID,
    ),
    keep_map(
        in_col="sample_id",
        out_col=CONCEPT.BIOSPECIMEN.ID,
    ),
    # Volume and concentration values are passed through float().
    value_map(
        in_col="volume",
        m=lambda x: float(x),
        out_col=CONCEPT.BIOSPECIMEN.VOLUME_UL,
    ),
    value_map(
        in_col="concentration",
        m=lambda x: float(x),
        out_col=CONCEPT.BIOSPECIMEN.CONCENTRATION_MG_PER_ML,
    ),
    keep_map(
        in_col="tissue_type",
        out_col=CONCEPT.BIOSPECIMEN.TISSUE_TYPE,
    ),
    # Fixed values that are not read from the manifest: the sequencing
    # center is the Broad constant and the analyte is DNA.
    constant_map(
        m=constants.SEQUENCING.CENTER.BROAD.KF_ID,
        out_col=CONCEPT.SEQUENCING.CENTER.TARGET_SERVICE_ID,
    ),
    constant_map(
        m=constants.SEQUENCING.ANALYTE.DNA,
        out_col=CONCEPT.BIOSPECIMEN.ANALYTE,
    ),
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
"""
Extract config for sequencing manifest generated by
creator.studies.data_generator.study_generator
Contains minimal data needed to build Kids First Data Service entities:
- sequencing_experiment
- genomic_file (source/unharmonized only)
- biospecimen_genomic_file
- sequencing_experiment_genomic_file
See documentation at
https://kids-first.github.io/kf-lib-data-ingest/tutorial/extract.html for
information on writing extract config files.
"""
import os
from kf_lib_data_ingest.common import constants, pandas_utils # noqa F401
from kf_lib_data_ingest.common.concept_schema import CONCEPT
from kf_lib_data_ingest.etl.extract.operations import *
from kf_lib_data_ingest.common.io import read_df

# The "data" directory located one level above the directory that contains
# this config file.
DATA_DIR = os.path.join(
    os.path.dirname(os.path.dirname(__file__)),
    "data",
)
# Location of the generated sequencing manifest (a TSV file) that this
# extract config reads.
source_data_url = "file://../data/sequencing_manifest.tsv"


# No special read parameters are supplied for this manifest.
source_data_read_params = {}

# (Optional) You can set a custom read function with
# source_data_read_func


# Column mappings from the sequencing manifest to standard concepts. The
# constant_map entries take no input column and supply fixed values.
operations = [
    keep_map(
        in_col="project_id",
        out_col=CONCEPT.SEQUENCING.ID,
    ),
    keep_map(
        in_col="sample_id",
        out_col=CONCEPT.BIOSPECIMEN.ID,
    ),
    keep_map(
        in_col="experiment_strategy",
        out_col=CONCEPT.SEQUENCING.STRATEGY,
    ),
    # Source genomic file KF ID
    keep_map(
        in_col="kf_id_source_genomic_file",
        out_col=CONCEPT.GENOMIC_FILE.TARGET_SERVICE_ID,
    ),
    # The source path is used as the genomic file's identifier.
    keep_map(
        in_col="source_path",
        out_col=CONCEPT.GENOMIC_FILE.ID,
    ),
    # Fixed values: files are marked as not harmonized, the reference genome
    # is GRCh38, and sequencing is marked as paired-end.
    constant_map(
        m=False,
        out_col=CONCEPT.GENOMIC_FILE.HARMONIZED,
    ),
    constant_map(
        m=constants.SEQUENCING.REFERENCE_GENOME.GRCH38,
        out_col=CONCEPT.GENOMIC_FILE.REFERENCE_GENOME,
    ),
    constant_map(
        m=True,
        out_col=CONCEPT.SEQUENCING.PAIRED_END,
    ),
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
"""
Extract config for the s3 object manifest generated by
creator.studies.data_generator.study_generator
Contains minimal data needed to build Kids First Data Service entities:
- genomic_file (source/unharmonized only)
See documentation at
https://kids-first.github.io/kf-lib-data-ingest/tutorial/extract.html for
information on writing extract config files.
"""
from kf_lib_data_ingest.common import constants
from kf_lib_data_ingest.common.constants import GENOMIC_FILE, COMMON
from kf_lib_data_ingest.common.concept_schema import CONCEPT
from kf_lib_data_ingest.etl.extract.operations import (
keep_map,
row_map,
value_map,
constant_map,
)


def genomic_file_ext(x):
    """
    Return the longest key of FILE_EXT_FORMAT_MAP that `x` ends with.

    Several known extensions can match the same name (e.g. ".bai" and
    ".bam.bai"), so the longest matching suffix is chosen. Returns None when
    no known extension matches.
    """
    candidates = (ext for ext in FILE_EXT_FORMAT_MAP if x.endswith(ext))
    return max(candidates, key=len, default=None)


# Maps a file-name suffix to a genomic file format constant.
# Overlapping suffixes (e.g. ".bai" and ".bam.bai", ".vcf.gz" and
# ".g.vcf.gz") are listed separately; genomic_file_ext resolves the overlap
# by choosing the longest suffix that matches.
FILE_EXT_FORMAT_MAP = {
    ".fq": GENOMIC_FILE.FORMAT.FASTQ,
    ".fastq": GENOMIC_FILE.FORMAT.FASTQ,
    ".fq.gz": GENOMIC_FILE.FORMAT.FASTQ,
    ".fastq.gz": GENOMIC_FILE.FORMAT.FASTQ,
    ".bam": GENOMIC_FILE.FORMAT.BAM,
    ".hgv.bam": GENOMIC_FILE.FORMAT.BAM,
    ".cram": GENOMIC_FILE.FORMAT.CRAM,
    ".bam.bai": GENOMIC_FILE.FORMAT.BAI,
    ".bai": GENOMIC_FILE.FORMAT.BAI,
    ".cram.crai": GENOMIC_FILE.FORMAT.CRAI,
    ".crai": GENOMIC_FILE.FORMAT.CRAI,
    ".g.vcf.gz": GENOMIC_FILE.FORMAT.GVCF,
    ".g.vcf.gz.tbi": GENOMIC_FILE.FORMAT.TBI,
    ".vcf.gz": GENOMIC_FILE.FORMAT.VCF,
    ".vcf": GENOMIC_FILE.FORMAT.VCF,
    ".vcf.gz.tbi": GENOMIC_FILE.FORMAT.TBI,
    ".peddy.html": GENOMIC_FILE.FORMAT.HTML,
    ".md5": COMMON.OTHER,
}

# Maps a genomic file format constant to a data type constant. The last three
# entries are keyed by file extension instead of format; data_type() tries
# the format lookup first and falls back to the extension lookup.
DATA_TYPES = {
    GENOMIC_FILE.FORMAT.FASTQ: GENOMIC_FILE.DATA_TYPE.UNALIGNED_READS,
    GENOMIC_FILE.FORMAT.BAM: GENOMIC_FILE.DATA_TYPE.ALIGNED_READS,
    GENOMIC_FILE.FORMAT.CRAM: GENOMIC_FILE.DATA_TYPE.ALIGNED_READS,
    GENOMIC_FILE.FORMAT.BAI: GENOMIC_FILE.DATA_TYPE.ALIGNED_READS_INDEX,
    GENOMIC_FILE.FORMAT.CRAI: GENOMIC_FILE.DATA_TYPE.ALIGNED_READS_INDEX,
    GENOMIC_FILE.FORMAT.VCF: GENOMIC_FILE.DATA_TYPE.VARIANT_CALLS,
    GENOMIC_FILE.FORMAT.GVCF: GENOMIC_FILE.DATA_TYPE.GVCF,
    GENOMIC_FILE.FORMAT.HTML: COMMON.OTHER,
    # Different TBI types share the same format in FILE_EXT_FORMAT_MAP above
    ".g.vcf.gz.tbi": GENOMIC_FILE.DATA_TYPE.GVCF_INDEX,
    ".vcf.gz.tbi": GENOMIC_FILE.DATA_TYPE.VARIANT_CALLS_INDEX,
    ".md5": COMMON.OTHER,
}


def filter_df_by_file_ext(df):
    """
    Drop rows whose "Key" does not end in a known file extension.

    The file format derived from each "Key" is stored on `df` in the
    CONCEPT.GENOMIC_FILE.FILE_FORMAT column (the input frame is modified in
    place), and only the rows where that format is not null are returned.
    """
    format_col = CONCEPT.GENOMIC_FILE.FILE_FORMAT
    df[format_col] = df["Key"].apply(file_format)
    has_known_format = df[format_col].notnull()
    return df[has_known_format]


# Location of the generated S3 object manifest (a TSV file) that this extract
# config reads.
source_data_url = "file://../data/s3_source_gf_manifest.tsv"

# NOTE(review): presumably the ingest library calls this hook on the
# DataFrame right after reading it, so rows with unrecognized file extensions
# are dropped before `operations` run - confirm against the extract docs.
do_after_read = filter_df_by_file_ext


def s3_url(row):
    """
    Build the s3://<bucket>/<key> URL for a manifest row.

    `row` must support item access for "Bucket" and "Key".
    """
    bucket = row["Bucket"]
    key = row["Key"]
    return f"s3://{bucket}/{key}"


def file_format(x):
    """
    Return the genomic file format for file name/key `x`.

    The extension is resolved with genomic_file_ext and then looked up in
    FILE_EXT_FORMAT_MAP; the result is None when the lookup finds nothing.
    """
    ext = genomic_file_ext(x)
    return FILE_EXT_FORMAT_MAP.get(ext)


def data_type(x):
    """
    Return the genomic file data type for file name/key `x`.

    The file format is looked up in DATA_TYPES first. Some data types share
    a format, so when that lookup yields nothing the file extension itself
    is used as the DATA_TYPES key instead.
    """
    by_format = DATA_TYPES.get(file_format(x))
    if by_format:
        return by_format
    return DATA_TYPES.get(genomic_file_ext(x))


def fname(key):
    """
    Return the part of `key` after its last "/" (the whole key when it
    contains no "/").
    """
    _, _, filename = key.rpartition("/")
    return filename


# Column mappings from the S3 object manifest to genomic file concepts.
operations = [
    # The S3 URL built from Bucket and Key is used both as the genomic
    # file's identifier and as the single entry of its URL list.
    row_map(out_col=CONCEPT.GENOMIC_FILE.ID, m=s3_url),
    row_map(
        out_col=CONCEPT.GENOMIC_FILE.URL_LIST, m=lambda row: [s3_url(row)]
    ),
    # File name is the part of the key after the last "/".
    value_map(out_col=CONCEPT.GENOMIC_FILE.FILE_NAME, in_col="Key", m=fname),
    keep_map(in_col="Size", out_col=CONCEPT.GENOMIC_FILE.SIZE),
    # Hash dict with one entry: the lower-cased S3_ETAG constant as the key
    # and the ETag with its double quotes removed as the value.
    value_map(
        in_col="ETag",
        out_col=CONCEPT.GENOMIC_FILE.HASH_DICT,
        m=lambda x: {constants.FILE.HASH.S3_ETAG.lower(): x.replace('"', "")},
    ),
    # Fixed value, not read from the manifest.
    constant_map(
        out_col=CONCEPT.GENOMIC_FILE.AVAILABILITY,
        m=constants.GENOMIC_FILE.AVAILABILITY.IMMEDIATE,
    ),
    # This column is not in the raw manifest; filter_df_by_file_ext adds it
    # to the DataFrame.
    keep_map(
        in_col=CONCEPT.GENOMIC_FILE.FILE_FORMAT,
        out_col=CONCEPT.GENOMIC_FILE.FILE_FORMAT,
    ),
    value_map(
        in_col="Key",
        out_col=CONCEPT.GENOMIC_FILE.DATA_TYPE,
        m=data_type,
    ),
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
""" Ingest Package Config """

from kf_lib_data_ingest.common.concept_schema import CONCEPT

# The list of entities that will be loaded into the target service. These
# should be class_name values of your target API config's target entity
# classes.
target_service_entities = [
    "family",
    "participant",
    "biospecimen",
    "sequencing_experiment",
    "genomic_file",
    "biospecimen_genomic_file",
    "sequencing_experiment_genomic_file",
]

# All paths are relative to the directory this file is in
extract_config_dir = "extract_configs"

transform_function_path = "transform_module.py"

# Kids First Study ID
# NOTE(review): left empty here; presumably filled in for each generated
# study before the package is ingested - confirm against the generator.
study = ""
31 changes: 31 additions & 0 deletions creator/studies/data_generator/ingest_package/transform_module.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
"""
Transform module to merge data generated by
creator.studies.data_generator.study_generator
"""

from kf_lib_data_ingest.common.concept_schema import CONCEPT # noqa F401

# Use these merge funcs, not pandas.merge
from kf_lib_data_ingest.common.pandas_utils import ( # noqa F401
merge_wo_duplicates,
outer_merge,
)
from kf_lib_data_ingest.config import DEFAULT_KEY


def transform_function(mapped_df_dict):
    """
    Merge the extracted S3, sequencing and biospecimen data into one table.

    :param mapped_df_dict: extract output keyed by extract config file name;
        must contain 's3_scrape_config.py', 'genomic.py' and 'biospec.py'
    :returns: dict with a single entry, DEFAULT_KEY -> merged DataFrame
    """
    # Step 1: join S3 objects to sequencing manifest rows on genomic file ID
    s3_objects = mapped_df_dict['s3_scrape_config.py']
    sequencing = mapped_df_dict['genomic.py']
    genomic_files = merge_wo_duplicates(
        s3_objects, sequencing, on=CONCEPT.GENOMIC_FILE.ID
    )

    # Step 2: join biospecimen rows to the result on biospecimen ID
    biospecimens = mapped_df_dict['biospec.py']
    merged = merge_wo_duplicates(
        biospecimens, genomic_files, on=CONCEPT.BIOSPECIMEN.ID
    )

    return {DEFAULT_KEY: merged}
Loading

0 comments on commit 47ee73e

Please sign in to comment.