✨ Dataservice study generator to aid in dev + test

kids-first · Mar 25, 2021 · 47ee73e · 47ee73e
1 parent 2629251
commit 47ee73e
Show file tree

Hide file tree

Showing 10 changed files with 1,082 additions and 0 deletions.
diff --git a/creator/studies/data_generator/__init__.py b/creator/studies/data_generator/__init__.py
diff --git a/creator/studies/data_generator/ingest_package/__init__.py b/creator/studies/data_generator/ingest_package/__init__.py
diff --git a/creator/studies/data_generator/ingest_package/extract_configs/__init__.py b/creator/studies/data_generator/ingest_package/extract_configs/__init__.py
diff --git a/creator/studies/data_generator/ingest_package/extract_configs/biospec.py b/creator/studies/data_generator/ingest_package/extract_configs/biospec.py
@@ -0,0 +1,81 @@
+"""
+Extract config for the bio manifest generated by
+creator.studies.data_generator.study_generator
+
+Contains minimal data needed to build Kids First Data Service entities:
+- family
+- participant
+- biospecimen
+
+See documentation at
+https://kids-first.github.io/kf-lib-data-ingest/tutorial/extract.html for
+information on writing extract config files.
+"""
+
+from kf_lib_data_ingest.common import constants  # noqa F401
+from kf_lib_data_ingest.common.concept_schema import CONCEPT
+from kf_lib_data_ingest.etl.extract.operations import *
+
+source_data_url = "file://../data/bio_manifest.tsv"
+
+# No custom read parameters are set
+source_data_read_params = {}
+
+# (Optional) A custom read function can be set with
+# source_data_read_func
+
+# Each operation maps one bio manifest column (or a constant) to a concept
+operations = [
+    # Family columns
+    keep_map(
+        out_col=CONCEPT.FAMILY.TARGET_SERVICE_ID,
+        in_col="kf_id_family",
+    ),
+    keep_map(out_col=CONCEPT.FAMILY.ID, in_col="family_id"),
+    # Participant columns
+    keep_map(
+        out_col=CONCEPT.PARTICIPANT.TARGET_SERVICE_ID,
+        in_col="kf_id_participant",
+    ),
+    keep_map(out_col=CONCEPT.PARTICIPANT.ID, in_col="participant_id"),
+    value_map(
+        out_col=CONCEPT.PARTICIPANT.GENDER,
+        in_col="gender",
+        m={
+            "Male": constants.GENDER.MALE,
+            "Female": constants.GENDER.FEMALE,
+        },
+    ),
+    # Biospecimen columns
+    keep_map(
+        out_col=CONCEPT.BIOSPECIMEN.TARGET_SERVICE_ID,
+        in_col="kf_id_biospecimen",
+    ),
+    keep_map(out_col=CONCEPT.BIOSPECIMEN.ID, in_col="sample_id"),
+    # volume and concentration are converted with float()
+    value_map(
+        out_col=CONCEPT.BIOSPECIMEN.VOLUME_UL,
+        in_col="volume",
+        m=lambda value: float(value),
+    ),
+    value_map(
+        out_col=CONCEPT.BIOSPECIMEN.CONCENTRATION_MG_PER_ML,
+        in_col="concentration",
+        m=lambda value: float(value),
+    ),
+    keep_map(out_col=CONCEPT.BIOSPECIMEN.TISSUE_TYPE, in_col="tissue_type"),
+    # Constant values (not read from a manifest column)
+    constant_map(
+        out_col=CONCEPT.SEQUENCING.CENTER.TARGET_SERVICE_ID,
+        m=constants.SEQUENCING.CENTER.BROAD.KF_ID,
+    ),
+    constant_map(
+        out_col=CONCEPT.BIOSPECIMEN.ANALYTE,
+        m=constants.SEQUENCING.ANALYTE.DNA,
+    ),
+]
diff --git a/creator/studies/data_generator/ingest_package/extract_configs/genomic.py b/creator/studies/data_generator/ingest_package/extract_configs/genomic.py
@@ -0,0 +1,72 @@
+"""
+Extract config for the sequencing manifest generated by
+creator.studies.data_generator.study_generator
+
+Contains minimal data needed to build Kids First Data Service entities:
+- sequencing_experiment
+- genomic_file (source/unharmonized only)
+- biospecimen_genomic_file
+- sequencing_experiment_genomic_file
+
+See documentation at
+https://kids-first.github.io/kf-lib-data-ingest/tutorial/extract.html for
+information on writing extract config files.
+"""
+import os
+from kf_lib_data_ingest.common import constants, pandas_utils  # noqa F401
+from kf_lib_data_ingest.common.concept_schema import CONCEPT
+from kf_lib_data_ingest.etl.extract.operations import *
+from kf_lib_data_ingest.common.io import read_df
+
+# <directory containing this file>/../data
+# NOTE(review): not referenced anywhere else in this module as shown
+DATA_DIR = os.path.join(
+    os.path.dirname(os.path.dirname(__file__)), "data"
+)
+source_data_url = "file://../data/sequencing_manifest.tsv"
+
+
+# No special loading parameters are set
+source_data_read_params = {}
+
+# (Optional) A custom read function can be set with
+# source_data_read_func
+
+
+# Each operation maps one sequencing manifest column (or a constant) to a
+# concept
+operations = [
+    keep_map(out_col=CONCEPT.SEQUENCING.ID, in_col="project_id"),
+    keep_map(out_col=CONCEPT.BIOSPECIMEN.ID, in_col="sample_id"),
+    keep_map(
+        out_col=CONCEPT.SEQUENCING.STRATEGY,
+        in_col="experiment_strategy",
+    ),
+    # Source genomic file KF ID
+    keep_map(
+        out_col=CONCEPT.GENOMIC_FILE.TARGET_SERVICE_ID,
+        in_col="kf_id_source_genomic_file",
+    ),
+    keep_map(out_col=CONCEPT.GENOMIC_FILE.ID, in_col="source_path"),
+    # Constant values (not read from a manifest column)
+    constant_map(
+        out_col=CONCEPT.GENOMIC_FILE.HARMONIZED,
+        m=False,
+    ),
+    constant_map(
+        out_col=CONCEPT.GENOMIC_FILE.REFERENCE_GENOME,
+        m=constants.SEQUENCING.REFERENCE_GENOME.GRCH38,
+    ),
+    constant_map(
+        out_col=CONCEPT.SEQUENCING.PAIRED_END,
+        m=True,
+    ),
+]
diff --git a/creator/studies/data_generator/ingest_package/extract_configs/s3_scrape_config.py b/creator/studies/data_generator/ingest_package/extract_configs/s3_scrape_config.py
@@ -0,0 +1,148 @@
+"""
+Extract config for the s3 object manifest generated by
+creator.studies.data_generator.study_generator
+
+Contains minimal data needed to build Kids First Data Service entities:
+- genomic_file (source/unharmonized only)
+
+See documentation at
+https://kids-first.github.io/kf-lib-data-ingest/tutorial/extract.html for
+information on writing extract config files.
+"""
+from kf_lib_data_ingest.common import constants
+from kf_lib_data_ingest.common.constants import GENOMIC_FILE, COMMON
+from kf_lib_data_ingest.common.concept_schema import CONCEPT
+from kf_lib_data_ingest.etl.extract.operations import (
+    keep_map,
+    row_map,
+    value_map,
+    constant_map,
+)
+
+
+def genomic_file_ext(x):
+    """
+    Return the longest key of FILE_EXT_FORMAT_MAP that `x` ends with, or
+    None when `x` ends with none of them
+    """
+    # Longest match wins so that e.g. ".bam.bai" is preferred over ".bai"
+    candidates = (ext for ext in FILE_EXT_FORMAT_MAP if x.endswith(ext))
+    return max(candidates, key=len, default=None)
+
+
+# File name suffix -> genomic file format.
+# Some suffixes are themselves suffixes of others (".bai" / ".bam.bai",
+# ".vcf.gz" / ".g.vcf.gz"); genomic_file_ext resolves these by taking the
+# longest matching key.
+FILE_EXT_FORMAT_MAP = {
+    ".fq": GENOMIC_FILE.FORMAT.FASTQ,
+    ".fastq": GENOMIC_FILE.FORMAT.FASTQ,
+    ".fq.gz": GENOMIC_FILE.FORMAT.FASTQ,
+    ".fastq.gz": GENOMIC_FILE.FORMAT.FASTQ,
+    ".bam": GENOMIC_FILE.FORMAT.BAM,
+    ".hgv.bam": GENOMIC_FILE.FORMAT.BAM,
+    ".cram": GENOMIC_FILE.FORMAT.CRAM,
+    ".bam.bai": GENOMIC_FILE.FORMAT.BAI,
+    ".bai": GENOMIC_FILE.FORMAT.BAI,
+    ".cram.crai": GENOMIC_FILE.FORMAT.CRAI,
+    ".crai": GENOMIC_FILE.FORMAT.CRAI,
+    ".g.vcf.gz": GENOMIC_FILE.FORMAT.GVCF,
+    ".g.vcf.gz.tbi": GENOMIC_FILE.FORMAT.TBI,
+    ".vcf.gz": GENOMIC_FILE.FORMAT.VCF,
+    ".vcf": GENOMIC_FILE.FORMAT.VCF,
+    ".vcf.gz.tbi": GENOMIC_FILE.FORMAT.TBI,
+    ".peddy.html": GENOMIC_FILE.FORMAT.HTML,
+    ".md5": COMMON.OTHER,
+}
+
+# Genomic file format -> data type.
+# The last three entries are keyed by file name suffix rather than format;
+# data_type falls back to a suffix lookup when the format lookup finds
+# nothing.
+DATA_TYPES = {
+    GENOMIC_FILE.FORMAT.FASTQ: GENOMIC_FILE.DATA_TYPE.UNALIGNED_READS,
+    GENOMIC_FILE.FORMAT.BAM: GENOMIC_FILE.DATA_TYPE.ALIGNED_READS,
+    GENOMIC_FILE.FORMAT.CRAM: GENOMIC_FILE.DATA_TYPE.ALIGNED_READS,
+    GENOMIC_FILE.FORMAT.BAI: GENOMIC_FILE.DATA_TYPE.ALIGNED_READS_INDEX,
+    GENOMIC_FILE.FORMAT.CRAI: GENOMIC_FILE.DATA_TYPE.ALIGNED_READS_INDEX,
+    GENOMIC_FILE.FORMAT.VCF: GENOMIC_FILE.DATA_TYPE.VARIANT_CALLS,
+    GENOMIC_FILE.FORMAT.GVCF: GENOMIC_FILE.DATA_TYPE.GVCF,
+    GENOMIC_FILE.FORMAT.HTML: COMMON.OTHER,
+    # Different TBI types share the same format in FILE_EXT_FORMAT_MAP above
+    ".g.vcf.gz.tbi": GENOMIC_FILE.DATA_TYPE.GVCF_INDEX,
+    ".vcf.gz.tbi": GENOMIC_FILE.DATA_TYPE.VARIANT_CALLS_INDEX,
+    ".md5": COMMON.OTHER,
+}
+
+
+def filter_df_by_file_ext(df):
+    """
+    Keep only the rows whose "Key" ends with one of the extensions in
+    FILE_EXT_FORMAT_MAP.
+
+    The file format looked up for each "Key" is stored on `df` itself in the
+    CONCEPT.GENOMIC_FILE.FILE_FORMAT column before filtering.
+
+    :param df: DataFrame with a "Key" column
+    :returns: the rows of `df` whose file format is not null
+    """
+    format_col = CONCEPT.GENOMIC_FILE.FILE_FORMAT
+    df[format_col] = df["Key"].apply(file_format)
+    has_known_format = df[format_col].notnull()
+    return df[has_known_format]
+
+
+source_data_url = "file://../data/s3_source_gf_manifest.tsv"
+
+# NOTE(review): the hook name suggests the ingest library calls this on the
+# DataFrame right after reading the source data - confirm against the
+# kf-lib-data-ingest docs
+do_after_read = filter_df_by_file_ext
+
+
+def s3_url(row):
+    """
+    Build the s3://<Bucket>/<Key> URL from a row's "Bucket" and "Key" values
+    """
+    bucket = row["Bucket"]
+    key = row["Key"]
+    return f"s3://{bucket}/{key}"
+
+
+def file_format(x):
+    """
+    Return the genomic file format for `x`: find its extension with
+    genomic_file_ext, then look that extension up in FILE_EXT_FORMAT_MAP.
+    Returns None when the extension is not in the map
+    """
+    ext = genomic_file_ext(x)
+    return FILE_EXT_FORMAT_MAP.get(ext)
+
+
+def data_type(x):
+    """
+    Return the genomic file data type for `x`.
+
+    The file format is looked up in DATA_TYPES first. Some data types share
+    a format, so when that lookup gives nothing the file extension itself is
+    used as the DATA_TYPES key instead.
+    """
+    by_format = DATA_TYPES.get(file_format(x))
+    if by_format:
+        return by_format
+    return DATA_TYPES.get(genomic_file_ext(x))
+
+
+def fname(key):
+    """
+    Return the part of the key after its last "/" (the whole key when it
+    contains no "/")
+    """
+    _, _, file_name = key.rpartition("/")
+    return file_name
+
+
+# Each operation maps S3 object manifest columns (or a constant) to a
+# genomic file concept
+operations = [
+    # The S3 URL is used both as the file ID and as its only URL
+    row_map(m=s3_url, out_col=CONCEPT.GENOMIC_FILE.ID),
+    row_map(
+        m=lambda row: [s3_url(row)],
+        out_col=CONCEPT.GENOMIC_FILE.URL_LIST,
+    ),
+    value_map(
+        in_col="Key",
+        m=fname,
+        out_col=CONCEPT.GENOMIC_FILE.FILE_NAME,
+    ),
+    keep_map(in_col="Size", out_col=CONCEPT.GENOMIC_FILE.SIZE),
+    # Double quotes are stripped from the ETag value
+    value_map(
+        in_col="ETag",
+        m=lambda etag: {
+            constants.FILE.HASH.S3_ETAG.lower(): etag.replace('"', "")
+        },
+        out_col=CONCEPT.GENOMIC_FILE.HASH_DICT,
+    ),
+    constant_map(
+        m=constants.GENOMIC_FILE.AVAILABILITY.IMMEDIATE,
+        out_col=CONCEPT.GENOMIC_FILE.AVAILABILITY,
+    ),
+    # This column is populated by filter_df_by_file_ext (do_after_read)
+    keep_map(
+        in_col=CONCEPT.GENOMIC_FILE.FILE_FORMAT,
+        out_col=CONCEPT.GENOMIC_FILE.FILE_FORMAT,
+    ),
+    value_map(
+        in_col="Key",
+        m=data_type,
+        out_col=CONCEPT.GENOMIC_FILE.DATA_TYPE,
+    ),
+]
diff --git a/creator/studies/data_generator/ingest_package/ingest_package_config.py b/creator/studies/data_generator/ingest_package/ingest_package_config.py
@@ -0,0 +1,24 @@
+""" Ingest Package Config """
+
+from kf_lib_data_ingest.common.concept_schema import CONCEPT
+
+# The list of entities that will be loaded into the target service. These
+# should be class_name values of your target API config's target entity
+# classes.
+target_service_entities = [
+    "family",
+    "participant",
+    "biospecimen",
+    "sequencing_experiment",
+    "genomic_file",
+    "biospecimen_genomic_file",
+    "sequencing_experiment_genomic_file",
+]
+
+# All paths are relative to the directory this file is in
+extract_config_dir = "extract_configs"
+
+transform_function_path = "transform_module.py"
+
+# Kids First Study ID
+# NOTE(review): left empty here; presumably filled in for the generated
+# study before ingest - confirm against the study generator
+study = ""
diff --git a/creator/studies/data_generator/ingest_package/transform_module.py b/creator/studies/data_generator/ingest_package/transform_module.py
@@ -0,0 +1,31 @@
+"""
+Transform module to merge data generated by
+creator.studies.data_generator.study_generator
+"""
+
+from kf_lib_data_ingest.common.concept_schema import CONCEPT  # noqa F401
+
+# Use these merge funcs, not pandas.merge
+from kf_lib_data_ingest.common.pandas_utils import (  # noqa F401
+    merge_wo_duplicates,
+    outer_merge,
+)
+from kf_lib_data_ingest.config import DEFAULT_KEY
+
+
+def transform_function(mapped_df_dict):
+    """
+    Merge the outputs of the three extract configs into one DataFrame.
+
+    The S3 scrape output and the sequencing manifest output are merged on the
+    genomic file ID; the biospecimen manifest output is then merged with that
+    result on the biospecimen ID.
+
+    :param mapped_df_dict: DataFrames keyed by extract config file name
+    :returns: dict holding the merged DataFrame under DEFAULT_KEY
+    """
+    s3_objects = mapped_df_dict['s3_scrape_config.py']
+    sequencing = mapped_df_dict['genomic.py']
+    genomic_files = merge_wo_duplicates(
+        s3_objects, sequencing, on=CONCEPT.GENOMIC_FILE.ID
+    )
+
+    biospecimens = mapped_df_dict['biospec.py']
+    merged = merge_wo_duplicates(
+        biospecimens, genomic_files, on=CONCEPT.BIOSPECIMEN.ID
+    )
+
+    return {DEFAULT_KEY: merged}