From b3555fce8a7b843c0837ac5bc7bbeb1e716742c1 Mon Sep 17 00:00:00 2001
From: Miguel Brown
Date: Mon, 16 Oct 2023 10:15:46 -0400
Subject: [PATCH 1/2] :hammer: added file_type created from reannotation

---
 STUDY_CONFIGS/pbta_all_case_meta_config.json | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/STUDY_CONFIGS/pbta_all_case_meta_config.json b/STUDY_CONFIGS/pbta_all_case_meta_config.json
index 88896a9..c79dfc0 100644
--- a/STUDY_CONFIGS/pbta_all_case_meta_config.json
+++ b/STUDY_CONFIGS/pbta_all_case_meta_config.json
@@ -221,17 +221,17 @@
     "manifests": {
         "cbtn": {
             "table": "bix_genomics_file.sd_bhjxbdqk-genomics_file_manifest",
-            "file_type": ["RSEM_gene","annofuse_filtered_fusions_tsv","annotated_public_outputs","ctrlfreec_pval","ctrlfreec_info","ctrlfreec_bam_seg"],
+            "file_type": ["RSEM_gene","annofuse_filtered_fusions_tsv","annotated_public_outputs","consensus_public_outputs","ctrlfreec_pval","ctrlfreec_info","ctrlfreec_bam_seg"],
             "out_file": "cbtn_genomics_file_manifest.txt"
         },
         "x01": {
             "table": "bix_genomics_file.sd_bhjxbdqk_x01-genomics_file_manifest",
-            "file_type": ["RSEM_gene","annofuse_filtered_fusions_tsv","annotated_public_outputs","ctrlfreec_pval","ctrlfreec_info","ctrlfreec_bam_seg"],
+            "file_type": ["RSEM_gene","annofuse_filtered_fusions_tsv","annotated_public_outputs","consensus_public_outputs","ctrlfreec_pval","ctrlfreec_info","ctrlfreec_bam_seg"],
            "out_file": "x01_genomics_file_manifest.txt"
         },
         "cbtn_extra": {
             "table": "bix_genomics_file.sd_bhjxbdqk_x01_extra-genomics_file_manifest",
-            "file_type": ["RSEM_gene","annofuse_filtered_fusions_tsv","annotated_public_outputs","ctrlfreec_pval","ctrlfreec_info","ctrlfreec_bam_seg"],
+            "file_type": ["RSEM_gene","annofuse_filtered_fusions_tsv","annotated_public_outputs","consensus_public_outputs","ctrlfreec_pval","ctrlfreec_info","ctrlfreec_bam_seg"],
             "out_file": "cbtn_extra_genomics_file_manifest.txt"
         },
         "pnoc": {

From 8d468218d77c561f055948df515c32ee69a0fe63 Mon Sep 17 00:00:00 2001
From: Miguel Brown
Date: Thu, 19 Oct 2023 09:54:19 -0400
Subject: [PATCH 2/2] :heavy_minus_sign: removed many outdated refs and scripts

---
 REFS/cell_line_supplemental.txt | 36 --
 .../1a_merge_cavatica_manifests.py | 40 --
 .../1b_query_file_manifest.py | 88 -----
 .../2_query_ds_by_bs_id.py | 173 ---------
 .../3b_create_cbio_id_fname_tbl.py | 84 ----
 .../2021-Sep-7_ETL_DEPRECATED/4_merge_maf.py | 88 -----
 .../5a_cnv_genome2gene.py | 71 ----
 .../2021-Sep-7_ETL_DEPRECATED/5b_merge_cnv.py | 120 ------
 .../5c_cnv_discrete.py | 129 -------
 .../2021-Sep-7_ETL_DEPRECATED/5d_merge_seg.py | 73 ----
 .../6_merge_rename_rsem.py | 127 ------
 .../7_convert_fusion.py | 99 -----
 .../8_organize_upload_packages.py | 218 -----------
 .../sample_id_builder_helper.py | 68 ----
 scripts/dev/get_files_from_manifest.py | 77 ----
 scripts/v1_etl_deprecated/0_ETL_wrapper.py | 318 ---------------
 scripts/v1_etl_deprecated/1_query_cavatica.py | 86 -----
 .../v1_etl_deprecated/6b_z_score_transform.py | 60 ---
 .../v1_etl_deprecated/7_createLoadingFiles.R | 365 ------------------
 scripts/v1_etl_deprecated/CNALoadingHelper.R | 89 -----
 scripts/v1_etl_deprecated/RNALoadingHelper.R | 171 --------
 scripts/v1_etl_deprecated/add_case_lists.py | 81 ----
 .../entrez_id_fusion_format.R | 147 -------
 .../v1_etl_deprecated/merge_star_fusion.py | 44 ---
 utilities/add_fusion_to_sample_sheets.py | 39 --
 utilities/adjust_patient_inputs.py | 194 ----------
 utilities/append_add_on_cols.py | 55 ---
 utilities/get_survival_from_ds.py | 59 ---
 utilities/match_pick_samples_from_files.py | 30 --
 utilities/mount_link_sbfs.py | 55 ---
 utilities/null_cols_from_maf.py | 32 --
 utilities/patch_pt_id_update.py | 36 --
 utilities/patch_sample_id.py | 49 ---
 utilities/patch_survival_to_data_patient.py | 77 ----
 utilities/rsem_replace_data_rna.py | 55 ---
 utilities/subset_table_by_id.py | 51 ---
 utilities/swap_samples.py | 68 ----
 37 files changed, 3652 deletions(-)
 delete mode 100644 REFS/cell_line_supplemental.txt
 delete mode 100755 scripts/2021-Sep-7_ETL_DEPRECATED/1a_merge_cavatica_manifests.py
 delete mode 100755 scripts/2021-Sep-7_ETL_DEPRECATED/1b_query_file_manifest.py
 delete mode 100755 scripts/2021-Sep-7_ETL_DEPRECATED/2_query_ds_by_bs_id.py
 delete mode 100755 scripts/2021-Sep-7_ETL_DEPRECATED/3b_create_cbio_id_fname_tbl.py
 delete mode 100755 scripts/2021-Sep-7_ETL_DEPRECATED/4_merge_maf.py
 delete mode 100755 scripts/2021-Sep-7_ETL_DEPRECATED/5a_cnv_genome2gene.py
 delete mode 100755 scripts/2021-Sep-7_ETL_DEPRECATED/5b_merge_cnv.py
 delete mode 100755 scripts/2021-Sep-7_ETL_DEPRECATED/5c_cnv_discrete.py
 delete mode 100644 scripts/2021-Sep-7_ETL_DEPRECATED/5d_merge_seg.py
 delete mode 100755 scripts/2021-Sep-7_ETL_DEPRECATED/6_merge_rename_rsem.py
 delete mode 100755 scripts/2021-Sep-7_ETL_DEPRECATED/7_convert_fusion.py
 delete mode 100755 scripts/2021-Sep-7_ETL_DEPRECATED/8_organize_upload_packages.py
 delete mode 100755 scripts/2021-Sep-7_ETL_DEPRECATED/sample_id_builder_helper.py
 delete mode 100644 scripts/dev/get_files_from_manifest.py
 delete mode 100755 scripts/v1_etl_deprecated/0_ETL_wrapper.py
 delete mode 100755 scripts/v1_etl_deprecated/1_query_cavatica.py
 delete mode 100644 scripts/v1_etl_deprecated/6b_z_score_transform.py
 delete mode 100644 scripts/v1_etl_deprecated/7_createLoadingFiles.R
 delete mode 100644 scripts/v1_etl_deprecated/CNALoadingHelper.R
 delete mode 100644 scripts/v1_etl_deprecated/RNALoadingHelper.R
 delete mode 100755 scripts/v1_etl_deprecated/add_case_lists.py
 delete mode 100644 scripts/v1_etl_deprecated/entrez_id_fusion_format.R
 delete mode 100755 scripts/v1_etl_deprecated/merge_star_fusion.py
 delete mode 100644 utilities/add_fusion_to_sample_sheets.py
 delete mode 100755 utilities/adjust_patient_inputs.py
 delete mode 100644 utilities/append_add_on_cols.py
 delete mode 100755 utilities/get_survival_from_ds.py
 delete mode 100644 utilities/match_pick_samples_from_files.py
 delete mode 100644 utilities/mount_link_sbfs.py
 delete mode 100755 utilities/null_cols_from_maf.py
 delete mode 100644 utilities/patch_pt_id_update.py
 delete mode 100644 utilities/patch_sample_id.py
 delete mode 100755 utilities/patch_survival_to_data_patient.py
 delete mode 100644 utilities/rsem_replace_data_rna.py
 delete mode 100755 utilities/subset_table_by_id.py
 delete mode 100644 utilities/swap_samples.py

diff --git a/REFS/cell_line_supplemental.txt b/REFS/cell_line_supplemental.txt
deleted file mode 100644
index 4cbdcce..0000000
--- a/REFS/cell_line_supplemental.txt
+++ /dev/null
@@ -1,36 +0,0 @@
-BS_RXP2ZRQT susp
-BS_40MP5BWR susp
-BS_TX8C5VAJ adh
-BS_GXTFW99H adh
-BS_M659G06J adh
-BS_5GNQC2FF adh
-BS_ERFMPQN3 adh
-BS_PGK832G2 adh
-BS_DDC2WVJY susp
-BS_ST5P69KR susp
-BS_AFBPM6CN susp
-BS_PNYN0AYD susp
-BS_TF5TTEXH susp
-BS_E60JZ9Z3 susp
-BS_XMP9XNR9 adh
-BS_P9JP6JFA adh
-BS_HM5GFJN8 susp
-BS_2A162JH9 susp
-BS_KVPJVJR7 susp
-BS_K5WG6WGM susp
-BS_PKZ1HWNB adh
-BS_M8EA6R2A adh
-BS_68TZMZH1 adh
-BS_0RQ4P069 adh
-BS_853PNV7P adh
-BS_DRY58DTF adh
-BS_JGKRN7NA susp
-BS_MX23ZY0Y susp
-BS_QWM9BPDY adh
-BS_BWBDH9GM adh
-BS_CZRA594T adh
-BS_59ZJWJTF adh
-BS_FJEZ3ASV susp
-BS_8ZD6J47V susp
-BS_ERAWW3H7 susp
-BS_QYPHA40N susp
diff --git a/scripts/2021-Sep-7_ETL_DEPRECATED/1a_merge_cavatica_manifests.py b/scripts/2021-Sep-7_ETL_DEPRECATED/1a_merge_cavatica_manifests.py
deleted file mode 100755
index fcce9b2..0000000
--- a/scripts/2021-Sep-7_ETL_DEPRECATED/1a_merge_cavatica_manifests.py
+++ /dev/null
@@ -1,40 +0,0 @@
-#!/usr/bin/env python3
-
-import sys
-import argparse
-import concurrent.futures
-import re
-import json
-import pdb
-
-parser = argparse.ArgumentParser(description='Get all relevant analyzed file outputs from cavatica manifest.')
-parser.add_argument('-m', '--manifest', action='store', dest='manifest', help='cavatica manifest file csv list')
-
-args = parser.parse_args()
-out_head = 'id,name,project,Kids First Biospecimen ID Tumor,Kids First Biospecimen ID Normal,Kids First Biospecimen ID\n'
-out = open('1a_cavatica_merged_manifest.csv', 'w')
-out.write(out_head)
-for manifests in args.manifest.split(','):
-    meta = open(manifests)
-    rel_head = ["Kids First Biospecimen ID Tumor", "Kids First Biospecimen ID Normal", "Kids First Biospecimen ID"]
-    head = next(meta)
-    ind = []
-
-    # get index positions of where relevant bs ids would be form tumor dna, normal dna, and rna
-    header = head.rstrip('\n').split(',')
-    for i in range(len(rel_head)):
-        try:
-            ind.append(header.index(rel_head[i]))
-        except:
-            sys.stderr.write(rel_head[i] + " not in this manifest, assiging 0\n")
-            ind.append(0)
-    for line in meta:
-        info = line.rstrip('\n').split(',')
-        out.write(','.join(info[0:3]))
-        for idx in ind:
-            if idx == 0:
-                out.write(',')
-            else:
-                out.write(',' + info[idx])
-        out.write('\n')
-out.close()
\ No newline at end of file
diff --git a/scripts/2021-Sep-7_ETL_DEPRECATED/1b_query_file_manifest.py b/scripts/2021-Sep-7_ETL_DEPRECATED/1b_query_file_manifest.py
deleted file mode 100755
index 1f9d860..0000000
--- a/scripts/2021-Sep-7_ETL_DEPRECATED/1b_query_file_manifest.py
+++ /dev/null
@@ -1,88 +0,0 @@
-#!/usr/bin/env python3
-
-import sys
-import argparse
-import concurrent.futures
-import re
-import json
-
-parser = argparse.ArgumentParser(description='Get all relevant analyzed file outputs from cavatica manifest.')
-parser.add_argument('-o', '--output', action='store', dest='output', help='output basename name')
-parser.add_argument('-m', '--manifest', action='store', dest='manifest', help='cavatica csv manifest file, may be from step 1a')
-parser.add_argument('-b', '--blacklist', action='store', dest='blacklist', help='Optional bs id blacklist')
-parser.add_argument('-c', '--config', action='store', dest='config_file', help='json config file with data types and '
-                    'data locations')
-args = parser.parse_args()
-with open(args.config_file) as f:
-    config_data = json.load(f)
-meta = open(args.manifest)
-rel_head = {"Kids First Biospecimen ID Tumor": "tum", "Kids First Biospecimen ID Normal": "norm", "Kids First Biospecimen ID": "rna"}
-head = next(meta)
-ind = {}
-# get index positions of where relevant bs ids would be form tumor dna, normal dna, and rna
-header = head.rstrip('\n').split(',')
-for i in range(len(header)):
-    if header[i] in rel_head:
-        ind[rel_head[header[i]]] = i
-
-old_out = open(args.output + '_task_list.txt', 'w')
-rna_ext_dict = config_data['rna_ext_list']
-rna_ext_list = []
-for key in rna_ext_dict:
-    rna_ext_list.append(rna_ext_dict[key])
-dna_ext_dict = config_data['dna_ext_list']
-dna_ext_list = []
-for key in dna_ext_dict:
-    dna_ext_list.append(dna_ext_dict[key])
-
-skip_dict = {}
-if args.blacklist is not None:
-    blist = open(args.blacklist)
-    for line in blist:
-        skip_dict[line.rstrip('\n')] = 0
-
-old_out.write('T/CL BS ID\tNorm BS ID\tTask Name\tTask ID\tAnalyte Type\tRelevant Outputs\tSource Project\n')
-cav_dict = {}
-for line in meta:
-    info = line.rstrip('\n').split(',')
-    parts = info[1].split('.')
-    ext = ".".join(parts[1:])
-    skip = 0
-
-    for key in ind:
-        if info[ind[key]] in skip_dict:
-            skip = 1
-            skip_dict[info[ind[key]]] = 1
-            sys.stderr.write('BS ID in blacklist. Skipping ' + line)
-    file_id = info[0]
-    fname = info[1]
-    project = info[2]
-    if ext in rna_ext_list and skip == 0:
-        bs_id = info[ind['rna']]
-        if bs_id not in cav_dict:
-            cav_dict[bs_id] = {}
-            cav_dict[bs_id]['atype'] = "RNA"
-            cav_dict[bs_id]['fname'] = []
-            cav_dict[bs_id]['bs_id'] = bs_id
-            cav_dict[bs_id]['project'] = []
-        cav_dict[bs_id]['project'].append(project)
-        cav_dict[bs_id]['fname'].append(fname)
-    elif ext in dna_ext_list and skip == 0:
-        t_bs_id = info[ind['tum']]
-        n_bs_id = info[ind['norm']]
-        dkey = t_bs_id + n_bs_id
-        if dkey not in cav_dict:
-            cav_dict[dkey] = {}
-            cav_dict[dkey]['atype'] = "DNA"
-            cav_dict[dkey]['fname'] = []
-            cav_dict[dkey]['t_bs_id'] = t_bs_id
-            cav_dict[dkey]['n_bs_id'] = n_bs_id
-            cav_dict[dkey]['project'] = []
-        cav_dict[dkey]['project'].append(project)
-        cav_dict[dkey]['fname'].append(fname)
-for tid in cav_dict:
-    if cav_dict[tid]['atype'] == 'RNA':
-        old_out.write("\t".join((cav_dict[tid]['bs_id'], "NA","RNA_TASK", tid, "RNA", ",".join(cav_dict[tid]['fname']), ",".join(cav_dict[tid]['project']))) + '\n')
-    else:
-        old_out.write("\t".join((cav_dict[tid]['t_bs_id'], cav_dict[tid]['n_bs_id'],"DNA_TASK", tid, "DNA", ",".join(cav_dict[tid]['fname']), ','.join(cav_dict[tid]['project']))) + '\n')
-old_out.close()
\ No newline at end of file
diff --git a/scripts/2021-Sep-7_ETL_DEPRECATED/2_query_ds_by_bs_id.py b/scripts/2021-Sep-7_ETL_DEPRECATED/2_query_ds_by_bs_id.py
deleted file mode 100755
index 69644b0..0000000
--- a/scripts/2021-Sep-7_ETL_DEPRECATED/2_query_ds_by_bs_id.py
+++ /dev/null
@@ -1,173 +0,0 @@
-#!/usr/bin/env python3
-
-import sys
-from requests import request
-import argparse
-import concurrent.futures
-import json
-from time import sleep
-import pdb
-
-
-def mt_query_calls(line):
-    try:
-        info = line.rstrip('\n').split('\t')
-        (tum_bs_id, norm_bs_id) = (info[0], info[1])
-        norm_str = ''
-        if norm_bs_id != 'NA' and norm_bs_id not in norm_seen:
-            bs_info = query_dataservice_bs_id(url, norm_bs_id, bs_attrs, pt_attrs, dx_attrs, outcome_attrs)
-            norm_str = norm_bs_id + '\t' + '\t'.join(bs_info) + '\n'
-            norm_seen[norm_bs_id] = 1
-        bs_info = query_dataservice_bs_id(url, tum_bs_id, bs_attrs, pt_attrs, dx_attrs, outcome_attrs)
-        tum_str = tum_bs_id + '\t' + '\t'.join(bs_info) + '\n'
-        return tum_str, norm_str
-    except Exception as e:
-        sys.stderr.write('Error ' + str(e) + ' occurred while trying to process ' + line)
-        exit(1)
-
-
-def get_obj(src_url):
-    """
-    GET requested object from url. It will occassionally fail so pausing, and trying again remedies the situation
-    """
-    try:
-        req_info = request('GET', src_url)
-        return req_info
-    except Exception as e:
-        sys.stderr.write('Got error ' + str(e) + ',could not retrieve info from ' + src_url + ', pausing and retrying\n')
-        sys.stderr.flush()
-        sleep(1)
-        req_info = request('GET', src_url)
-        return req_info
-
-
-def process_attr_dict(attr_dict, info_dict):
-    """
-    Iterate through a requested attribute dictionary and append to an output string (a list that wil be converted to tsv)
-    """
-    temp = []
-    for attr in attr_dict:
-        res = info_dict.json()['results'][attr]
-        if res is None:
-            res = 'NULL'
-        temp.append(res)
-    return temp
-
-
-def query_dataservice_bs_id(url, bs_id, bs_attrs, pt_attrs, dx_attrs, outcome_attrs):
-
-    bs_url = url + '/biospecimens/' + bs_id
-    bs_info = get_obj(bs_url)
-    result = []
-    if bs_info.json()['_status']['code'] == 404:
-        result.append(bs_info.json()['_status']['message'])
-        sys.stderr.write(bs_id + ' not found!\n')
-        sys.stderr.flush()
-        return result
-    elif not bs_info.json()['results']['visible']:
-        sys.stderr.write(bs_id + ' was set to hide. skipping!\n')
-        sys.stderr.flush()
-        return result
-    dx_url = url + bs_info.json()['_links']['diagnoses']
-    dx_dict = {}
-    # dx can sometimes have multiple values, so dict is used to store them all
-    dx_dict = {}
-    dx_obj = get_obj(dx_url) if len(dx_attrs) > 0 else 'NoDX'
-
-    pt_url = url + bs_info.json()['_links']['participant']
-    pt_info = get_obj(pt_url)
-    result.append(pt_info.json()['results']['kf_id'])
-
-    outcome_url = url + pt_info.json()['_links']['outcomes']
-    outcome_info = get_obj(outcome_url)
-
-    result.extend(process_attr_dict(bs_attrs, bs_info))
-    result.extend(process_attr_dict(pt_attrs, pt_info))
-
-    for attr in bs_attrs:
-        res = bs_info.json()['results'][attr]
-        if res is None:
-            res = 'NULL'
-        result.append(str(res))
-    for attr in pt_attrs:
-        res = pt_info.json()['results'][attr]
-        if res is None:
-            res = 'NULL'
-        result.append(res)
-    for attr in dx_attrs:
-        dx_dict[attr] = []
-        for cur_res in dx_obj.json()['results']:
-            dx_dict[attr].append(str(cur_res[attr]))
-        result.append(';'.join(dx_dict[attr]))
-    # default to last outcome in list
-    try:
-        if len(outcome_info.json()['results']):
-            for attr in outcome_attrs:
-                res = outcome_info.json()['results'][-1][attr]
-                if res is None:
-                    res = 'NULL'
-                result.append(str(res))
-        else:
-            for attr in outcome_attrs:
-                res = 'NULL'
-                result.append(str(res))
-            sys.stderr.write("WARN: " + pt_info.json()['results']['kf_id'] + " has no outcome data\n")
-    except Exception as e:
-        sys.stderr.write(str(e) + "\n")
-    return result
-
-
-parser = argparse.ArgumentParser(description='Script to walk through data service and grab all relevant metadata'
-                                 ' by bs id.')
-parser.add_argument('-u', '--kf-url', action='store', dest='url',
-                    help='Kids First data service url', default='https://kf-api-dataservice.kidsfirstdrc.org')
-parser.add_argument('-c', '--cavatica', action='store', dest='cav',
-                    help='file with task info from cavatica (see step 1)')
-parser.add_argument('-j', '--config', action='store', dest='config_file', help='json config file with data types and '
-                    'data locations')
-
-args = parser.parse_args()
-with open(args.config_file) as f:
-    config_data = json.load(f)
-file_list = args.cav
-bs_attrs = ['external_aliquot_id', 'analyte_type', 'source_text_tissue_type', 'source_text_tumor_descriptor',
-            'composition', 'external_sample_id']
-pt_attrs = ['external_id', 'gender', 'ethnicity', 'race']
-outcome_attrs = ['age_at_event_days', 'vital_status'] -dx_attrs = ['source_text_diagnosis', 'source_text_tumor_location', 'age_at_event_days'] -url = args.url - -norm_out_fh = open('norm_bs_ds_info.txt', 'w') -tum_out_fh = open('tum_bs_ds_info.txt', 'w') -norm_out_fh.write('BS_ID\tPT_ID\t' + '\t'.join(bs_attrs) + '\t' + '\t'.join(pt_attrs)) -tum_out_fh.write('BS_ID\tPT_ID\t' + '\t'.join(bs_attrs) + '\t' + '\t'.join(pt_attrs)) -if len(dx_attrs) > 0: - norm_out_fh.write('\t' + '\t'.join(dx_attrs)) - tum_out_fh.write('\t' + '\t'.join(dx_attrs)) -# tack on outcome data, renaming the age field -norm_out_fh.write('\toutcome_age_at_event_days\tvital_status\n') -tum_out_fh.write('\toutcome_age_at_event_days\tvital_status\n') - - -x = 1 -m = 100 -task_fh = open(file_list) -head = next(task_fh) -norm_seen = {} -with concurrent.futures.ThreadPoolExecutor(config_data['threads']) as executor: - results = {executor.submit(mt_query_calls, entry): entry for entry in task_fh} - for result in concurrent.futures.as_completed(results): - if x % m == 0: - sys.stderr.write('Processed ' + str(x) + ' lines\n') - sys.stderr.flush() - (tum_info, norm_info) = result.result() - tum_out_fh.write(tum_info) - if norm_info != '': - norm_out_fh.write(norm_info) - x += 1 -# debug mode -# for entry in task_fh: -# mt_query_calls(entry) -norm_out_fh.close() -tum_out_fh.close() -sys.stderr.write('Done!') diff --git a/scripts/2021-Sep-7_ETL_DEPRECATED/3b_create_cbio_id_fname_tbl.py b/scripts/2021-Sep-7_ETL_DEPRECATED/3b_create_cbio_id_fname_tbl.py deleted file mode 100755 index db4f000..0000000 --- a/scripts/2021-Sep-7_ETL_DEPRECATED/3b_create_cbio_id_fname_tbl.py +++ /dev/null @@ -1,84 +0,0 @@ -#!/usr/bin/env python3 -import argparse -import sys -import os -import json -import subprocess - - -def process_ds(dsname, cav_dict, out): - try: - # last item in find will likely be empty after split - x = 0 - parts = dsname.split('/') - cbio_proj = parts[-2] - # project/disease name should be name of directory hosting datasheet - sys.stderr.write('Processing ' + parts[-2] + ' project' + '\n') - cur_ds = open(dsname) - for i in range(0, 4, 1): - next(cur_ds) - ds_head = next(cur_ds) - ds_header = ds_head.rstrip('\n').split('\t') - sa_idx = ds_header.index('SAMPLE_ID') - sp_idx = ds_header.index('SPECIMEN_ID') - ns_idx = ds_header.index('MATCHED_NORMAL_SPECIMEN_ID') - for entry in cur_ds: - meta = entry.rstrip('\n').split('\t') - check = meta[sp_idx].split(';') - for bs_id in check: - # can't tell if RNA or DNA from datasheet, but DNA will be tum + norm bs id, RNA tum + NA - id1 = bs_id + "\t" + meta[ns_idx] - id2 = bs_id + "\tNA" - key = id1 - if key in cav_dict: - for ftype in cav_dict[key]: - out.write("\t".join([cbio_proj, key, ftype, meta[sa_idx], meta[ns_idx], cav_dict[key][ftype]]) + "\n") - elif id2 in cav_dict: - key = id2 - for ftype in cav_dict[key]: - out.write("\t".join([cbio_proj, key, ftype, meta[sa_idx], 'NA', cav_dict[key][ftype]]) + "\n") - else: - sys.stderr.write('WARNING: ' + id1 + ' nor ' + id2 + ' found in data sheet entry\n' + entry + 'Check inputs!\n') - except Exception as e: - print(e) - sys.exit() - -parser = argparse.ArgumentParser(description='Create temp table with cbio ids, bs ids, and file locations and types for downstream file merging. 
Should be run one level above datasheets dir.') -parser.add_argument('-c', '--cavatica', action='store', dest='cav', - help='file with task info from cavatica (see step 1b)') - -args = parser.parse_args() - -cav_dict = {} -manifest = open(args.cav) -head = next(manifest) -for line in manifest: - info = line.rstrip('\n').split('\t') - bs_ids = info[0] + "\t" + info[1] - fnames = info[-2] - atype = info[4] - cav_dict[bs_ids] = {} - # will likely have to add fusion ext soon - for fname in fnames.split(','): - ftype = 'rsem' - if atype == 'DNA': - if fname[-3:] == 'maf': - ftype = 'maf' - elif fname[-3:] == 'seg': - ftype = 'seg' - else: - ftype = "cnv" - elif fname[-3:] == 'tsv': - ftype = 'fusion' - cav_dict[bs_ids][ftype] = fname - -flist = subprocess.check_output('find ./datasheets -name data_clinical_sample.txt', shell=True) - -ds_list = flist.decode().split('\n') -if ds_list[-1] == '': - ds_list.pop() -out = open('cbio_id_fname_table.txt', 'w') -out.write('Cbio_project\tT_CL_BS_ID\tNorm_BS_ID\tFile_Type\tCbio_Tumor_Name\tCbio_Matched_Normal_Name\tFile_Name\n') -for dpath in ds_list: - process_ds(dpath, cav_dict, out) -out.close() \ No newline at end of file diff --git a/scripts/2021-Sep-7_ETL_DEPRECATED/4_merge_maf.py b/scripts/2021-Sep-7_ETL_DEPRECATED/4_merge_maf.py deleted file mode 100755 index 3b0f21a..0000000 --- a/scripts/2021-Sep-7_ETL_DEPRECATED/4_merge_maf.py +++ /dev/null @@ -1,88 +0,0 @@ -#!/usr/bin/env python3 -import sys -import os -import argparse -import json -import subprocess -from get_file_metadata_helper import get_file_metadata -import concurrent.futures - - -def process_maf(maf_fn, new_maf, maf_exc, tum_id, norm_id, tid_idx, h_idx, nid_idx, v_idx, eid_idx): - cur_maf = open(maf_fn) - next(cur_maf) - next(cur_maf) - for maf in cur_maf: - data = maf.rstrip('\n').split('\t') - # Want to allow TERT promoter as exception to exlcusion rules - if data[v_idx] not in maf_exc or (data[h_idx] == "TERT" and data[v_idx] == "5'Flank"): - data[tid_idx] = tum_id - data[nid_idx] = norm_id - data.pop(eid_idx) - new_maf.write('\t'.join(data) + '\n') - cur_maf.close() - - -def process_tbl(cbio_dx, file_meta_dict, tid_idx, h_idx, nid_idx, v_idx, eid_idx, print_head): - try: - x = 0 - # project/disease name should be name of directory hosting datasheet - sys.stderr.write('Processing ' + cbio_dx + ' project' + '\n') - new_maf = open(out_dir + cbio_dx + ".maf", 'w') - new_maf.write(print_head) - for cbio_tum_id in file_meta_dict[cbio_dx]: - cbio_norm_id = file_meta_dict[cbio_dx][cbio_tum_id]['cbio_norm_id'] - fname = file_meta_dict[cbio_dx][cbio_tum_id]['fname'] - sys.stderr.write('Found relevant maf to process for ' + ' ' + cbio_tum_id + ' ' + cbio_norm_id + ' ' - + file_meta_dict[cbio_dx][cbio_tum_id]['kf_tum_id'] + ' ' + file_meta_dict[cbio_dx][cbio_tum_id]['kf_norm_id'] + ' ' + fname + '\n') - sys.stderr.flush() - process_maf(maf_dir + fname, new_maf, maf_exc, cbio_tum_id, cbio_norm_id, tid_idx, h_idx, nid_idx, v_idx, eid_idx) - x += 1 - sys.stderr.write('Completed processing ' + str(x) + ' entries in ' + cbio_dx + '\n') - new_maf.close() - except Exception as e: - print(e) - sys.exit() - - -parser = argparse.ArgumentParser(description='Merge and filter mafs using cavatica task info and datasheets.') -parser.add_argument('-t', '--table', action='store', dest='table', - help='Table with cbio project, kf bs ids, cbio IDs, and file names') -parser.add_argument('-i', '--header', action='store', dest='header', help='File with maf header only') -parser.add_argument('-m', '--maf-dir', 
action='store', dest='maf_dir', help='maf file directory') -parser.add_argument('-j', '--config', action='store', dest='config_file', help='json config file with data types and ' - 'data locations') - -args = parser.parse_args() -with open(args.config_file) as f: - config_data = json.load(f) -# get maf file ext -maf_dir = args.maf_dir -if maf_dir[-1] != '/': - maf_dir += '/' -file_meta_dict = get_file_metadata(args.table, 'maf') -head_fh = open(args.header) - -print_head = next(head_fh) -cur_head = next(head_fh) -cur_header = cur_head.rstrip('\n').split('\t') -eid_idx = cur_header.index('Entrez_Gene_Id') -tid_idx = cur_header.index('Tumor_Sample_Barcode') -nid_idx = cur_header.index('Matched_Norm_Sample_Barcode') -v_idx = cur_header.index('Variant_Classification') -h_idx = cur_header.index('Hugo_Symbol') -cur_header.pop(eid_idx) - -head_fh.close() -print_head += '\t'.join(cur_header) + '\n' -maf_exc = {"Silent": 0, "Intron": 0, "IGR": 0, "3'UTR": 0, "5'UTR": 0, "3'Flank": 0, "5'Flank": 0, "RNA": 0} -out_dir = 'merged_mafs/' -try: - os.mkdir(out_dir) -except: - sys.stderr.write('output dir already exists\n') - -with concurrent.futures.ProcessPoolExecutor(config_data['cpus']) as executor: - results = {executor.submit(process_tbl, cbio_dx, file_meta_dict, tid_idx, h_idx, nid_idx, v_idx, eid_idx, print_head): cbio_dx for cbio_dx in file_meta_dict} - -sys.stderr.write('Done, check logs\n') diff --git a/scripts/2021-Sep-7_ETL_DEPRECATED/5a_cnv_genome2gene.py b/scripts/2021-Sep-7_ETL_DEPRECATED/5a_cnv_genome2gene.py deleted file mode 100755 index 6ae58e7..0000000 --- a/scripts/2021-Sep-7_ETL_DEPRECATED/5a_cnv_genome2gene.py +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/env python3 -import sys -import os -import argparse -import json -import subprocess -import concurrent.futures - - -def process_cnv(cpath): - try: - if cpath != '': - bedtools = config_data['bedtools'] - cnv_cp_only = config_data['cp_only_script'] - bed_file = config_data['bed_genes'] - hugo_tsv = config_data['hugo_tsv'] - sys.stderr.write('Processing ' + cpath + '\n') - root = os.path.basename(cpath).split('.')[0] - limit = config_data['cnv_min_len'] - temp_len = root + '.CNVs_' + str(limit) + "_filtered.bed" - len_fh = open(out_dir + temp_len, 'w') - in_cnv = open(cpath) - skip = next(in_cnv) - for cnv in in_cnv: - ct_dict['total_cnvs'] += 1 - cnv_data = cnv.rstrip('\n').split('\t') - if int(cnv_data[2]) - int(cnv_data[1]) >= limit: - len_fh.write("\t".join(cnv_data[0:5]) + "\n") - else: - ct_dict['short_cnvs'] += 1 - # sys.stderr.write("Entry too short " + cnv) - len_fh.close() - temp = root + '.CNVs.Genes' - to_genes_cmd = bedtools + ' intersect -a ' + out_dir + temp_len + ' -b ' + bed_file + ' -wb > ' + out_dir + temp - subprocess.call(to_genes_cmd, shell=True) - out_gene_cnv_only = 'perl ' + cnv_cp_only + ' ' + hugo_tsv + ' ' + out_dir + temp + ' > ' + out_dir + temp + '.copy_number' - subprocess.call(out_gene_cnv_only, shell=True) - rm_tmp = 'rm ' + out_dir + temp + ' ' + out_dir + temp_len - subprocess.call(rm_tmp, shell=True) - except Exception as e: - sys.stderr.write(str(e)) - sys.exit(1) - - -parser = argparse.ArgumentParser(description='Convert controlFreeC cnv genome coords to gene level') -parser.add_argument('-d', '--cnv-dir', action='store', dest='cnv_dir', help='cnv as genome coords file directory') -parser.add_argument('-j', '--config', action='store', dest='config_file', help='json config file with data types and ' - 'data locations') - -args = parser.parse_args() -with open(args.config_file) as f: - config_data = 
json.load(f) - -limit = config_data['cnv_min_len'] -suffix = config_data['dna_ext_list']['copy_number'] -f_cmd = 'find ' + args.cnv_dir + ' -name "*.' + suffix + '"' -sys.stderr.write('Getting cnv list ' + f_cmd + '\n') -flist = subprocess.check_output(f_cmd, shell=True) -out_dir = 'converted_cnvs/' -try: - os.mkdir(out_dir) -except: - sys.stderr.write('output dir already exists\n') - -ct_dict = {'total_cnvs': 0, 'short_cnvs': 0} -with concurrent.futures.ThreadPoolExecutor(config_data['cpus']) as executor: - results = {executor.submit(process_cnv, cpath): cpath for cpath in flist.decode().split('\n')} -sys.stderr.write(str(ct_dict['total_cnvs']) + ' total cnv calls processed\n') -sys.stderr.write(str(ct_dict['short_cnvs']) + ' calls dropped for not meeting min bp size ' + str(limit) + '\n') -sys.stderr.write('Done, check logs\n') - diff --git a/scripts/2021-Sep-7_ETL_DEPRECATED/5b_merge_cnv.py b/scripts/2021-Sep-7_ETL_DEPRECATED/5b_merge_cnv.py deleted file mode 100755 index 86892d7..0000000 --- a/scripts/2021-Sep-7_ETL_DEPRECATED/5b_merge_cnv.py +++ /dev/null @@ -1,120 +0,0 @@ -#!/usr/bin/env python3 -import sys -import sevenbridges as sbg -from sevenbridges.http.error_handlers import rate_limit_sleeper, maintenance_sleeper -import os -import argparse -import json -import subprocess -import concurrent.futures -import pandas as pd -import re -from get_file_metadata_helper import get_file_metadata - - -def process_cnv(cnv_fn, cur_cnv_dict, samp_id): - for entry in open(cnv_fn): - (gene, gid, value) = entry.rstrip('\n').split('\t') - if gene not in cur_cnv_dict: - cur_cnv_dict[gene] = {} - if samp_id in cur_cnv_dict[gene]: - sys.stderr.write('ERROR: Sample ID ' + samp_id + ' already processed. Back to the drawing board!') - exit(1) - else: - cur_cnv_dict[gene][samp_id] = value - return cur_cnv_dict - -def get_ploidy(obj): - info = obj.content().split('\n') - for datum in info: - try: - if datum.startswith('Output_Ploidy'): - (key, value) = datum.split('\t') - return value - except Exception as e: - sys.stderr.write("WARN: " + str(e) + " entry could not be split\n") - return '2' # default if ploidy can't be found - - -def process_table(cbio_dx, file_meta_dict): - try: - - # project/disease name should be name of directory hosting datasheet - sys.stderr.write('Processing ' + cbio_dx + ' project' + '\n') - new_cnv = open(out_dir + cbio_dx + '.predicted_cnv.txt', 'w') - cur_cnv_dict = {} - s_list = [] - ploidy_dict = {} - for cbio_tum_id in file_meta_dict[cbio_dx]: - orig_fname = file_meta_dict[cbio_dx][cbio_tum_id]['fname'] - kf_bs_id = file_meta_dict[cbio_dx][cbio_tum_id]['kf_tum_id'] - parts = re.search('^(.*)\.' 
+ orig_suffix, orig_fname) - gene_fname = parts.group(1) + w_gene_suffix - sys.stderr.write('Found relevant cnv to process ' + ' ' + file_meta_dict[cbio_dx][cbio_tum_id]['kf_tum_id'] + ' ' - + file_meta_dict[cbio_dx][cbio_tum_id]['kf_norm_id'] + ' ' + gene_fname + '\n') - sys.stderr.flush() - s_list.append(cbio_tum_id) - if manifest is not None: - ploidy = get_ploidy(api.files.get(manifest.loc[kf_bs_id]['id'])) - ploidy_dict[cbio_tum_id] = ploidy - else: - ploidy_dict[cbio_tum_id] = '2' - cur_cnv_dict = process_cnv(cnv_dir + gene_fname, cur_cnv_dict, cbio_tum_id) - new_cnv.write('Hugo_Symbol\t' + '\t'.join(s_list) + '\n') - for gene in cur_cnv_dict: - new_cnv.write(gene) - for samp in s_list: - if samp in cur_cnv_dict[gene]: - new_cnv.write('\t' + cur_cnv_dict[gene][samp]) - else: - # pdb.set_trace() - new_cnv.write('\t' + ploidy_dict[samp]) - new_cnv.write('\n') - new_cnv.close() - except Exception as e: - print(e) - sys.exit(1) - - -parser = argparse.ArgumentParser(description='Merge cnv files using cavatica task info and datasheets.') -parser.add_argument('-t', '--table', action='store', dest='table', - help='Table with cbio project, kf bs ids, cbio IDs, and file names') -parser.add_argument('-n', '--cnv-dir-gene', action='store', dest='cnv_dir', help='cnv as gene file directory') -parser.add_argument('-m', '--info_manifest', action='store', dest='manifest', help='cavatica cfree info file manifest') -parser.add_argument('-j', '--config', action='store', dest='config_file', help='json config file with data types and ' - 'data locations') -parser.add_argument('-p', '--profile', action='store', dest='profile', help='cavatica profile name. requires ' - '.sevenbridges/credentials file be present') - - -args = parser.parse_args() -with open(args.config_file) as f: - config_data = json.load(f) -cnv_dir = args.cnv_dir -if cnv_dir[-1] != '/': - cnv_dir += '/' -manifest = None -if args.manifest is not None: - manifest = pd.read_csv(args.manifest) - manifest.set_index(['Kids First Biospecimen ID Tumor'], inplace=True) -else: - sys.stderr.write("No info file given, will assume ploidy 2\n") - -config = sbg.Config(profile=args.profile) -api = sbg.Api(config=config, error_handlers=[ - sbg.http.error_handlers.rate_limit_sleeper, - sbg.http.error_handlers.maintenance_sleeper]) - - -orig_suffix = config_data['dna_ext_list']['copy_number'] -w_gene_suffix = '.CNVs.Genes.copy_number' -out_dir = 'merged_cnvs/' -try: - os.mkdir(out_dir) -except: - sys.stderr.write('output dir already exists\n') -file_meta_dict = get_file_metadata(args.table, 'cnv') -with concurrent.futures.ProcessPoolExecutor(config_data['cpus']) as executor: - results = {executor.submit(process_table, cbio_dx, file_meta_dict): cbio_dx for cbio_dx in file_meta_dict} - -sys.stderr.write('Done, check logs\n') diff --git a/scripts/2021-Sep-7_ETL_DEPRECATED/5c_cnv_discrete.py b/scripts/2021-Sep-7_ETL_DEPRECATED/5c_cnv_discrete.py deleted file mode 100755 index 2525d5b..0000000 --- a/scripts/2021-Sep-7_ETL_DEPRECATED/5c_cnv_discrete.py +++ /dev/null @@ -1,129 +0,0 @@ -#!/usr/bin/env python3 - -import sevenbridges as sbg -import sys -import argparse -from sevenbridges.http.error_handlers import rate_limit_sleeper, maintenance_sleeper -import concurrent.futures -import json -import subprocess -import re -from get_file_metadata_helper import get_file_metadata -import pandas as pd -import numpy as np - -parser = argparse.ArgumentParser(description='Convert merged cnv values to discrete coded values.') -parser.add_argument('-d', '--merged_cnv_dir', 
action='store', dest='merged_cnv_dir', help='merged cnv dir') -parser.add_argument('-j', '--config', action='store', dest='config_file', help='json config file with data types and ' - 'data locations') -parser.add_argument('-m', '--info_manifest', action='store', dest='manifest', help='cavatica cfree info file manifest') -parser.add_argument('-t', '--table', action='store', dest='table', - help='Table with cbio project, kf bs ids, cbio IDs, and file names') -parser.add_argument('-p', '--profile', action='store', dest='profile', help='cavatica profile name. requires ' - '.sevenbridges/credentials file be present') - - -def mt_adjust_cn(obj): - try: - # If there is a info file manifest, get ploidy. Else skip and assume ploidy 2 - checkpoint = obj - if manifest is not None: - bs_id = obj.resource.metadata['Kids First Biospecimen ID Tumor'] - info = obj.resource.content().split('\n') - checkpoint = obj.resource.id - else: - bs_id = obj - samp_id = bs_cbio_dict[bs_id] - ploidy = 2 - if manifest is not None: - for datum in info: - try: - if datum.startswith('Output_Ploidy'): - (key, value) = datum.split('\t') - ploidy = int(value) - break - except Exception as e: - sys.stderr.write("WARN: " + str(e) + " entry could not be split\n") - data[samp_id] = data[samp_id] - ploidy - min_cn = ploidy * -1 - # adjust discrete low range only in ploidy > 2 - if min_cn != -2: - data[samp_id] = np.where(data[samp_id].between((min_cn + 1),-2), -1, data[samp_id]) - data[samp_id] = np.where(data[samp_id] == min_cn, -2, data[samp_id]) - data[samp_id] = np.where(data[samp_id].between(2, high_gain), 1, data[samp_id]) - data[samp_id] = np.where(data[samp_id] > high_gain, 2, data[samp_id]) - return [0, checkpoint] - - except Exception as E: - sys.stderr.write(str(E) + "\n") - sys.stderr.write('Error processing sample entry for ' + checkpoint + '\n') - return [1, checkpoint] - - -args = parser.parse_args() -with open(args.config_file) as f: - config_data = json.load(f) - -config = sbg.Config(profile=args.profile) -api = sbg.Api(config=config, error_handlers=[ - sbg.http.error_handlers.rate_limit_sleeper, - sbg.http.error_handlers.maintenance_sleeper]) - -flist = subprocess.check_output('find ' + args.merged_cnv_dir + ' -name *.predicted_cnv.txt', shell=True) - -fname_list = flist.decode().split('\n') -file_meta_dict = get_file_metadata(args.table, 'cnv') -manifest = None -if args.manifest is not None: - manifest = pd.read_csv(args.manifest) - manifest.set_index(['Kids First Biospecimen ID Tumor'], inplace=True) -else: - sys.stderr.write("No info file given, will assume ploidy 2\n") -if fname_list[-1] == '': - fname_list.pop() -for fname in fname_list: - parts = re.search('^' + args.merged_cnv_dir + '/(.*).predicted_cnv.txt', fname) - cbio_dx = parts.group(1) - data = pd.read_csv(fname, sep="\t") - data.set_index('Hugo_Symbol') - # sample list would be cbio ids - samp_list = list(data.columns)[1:] - bulk_ids = [] - bs_cbio_dict = {} - #fid_dict = {} - for samp_id in samp_list: - bs_id = file_meta_dict[cbio_dx][samp_id]['kf_tum_id'] - bs_cbio_dict[bs_id] = samp_id - if manifest is not None: - bulk_ids.append(manifest.loc[bs_id]['id']) - file_objs = [] - max_j = 100 - total = len(bulk_ids) - for i in range(0, total, max_j): - uset = i + max_j - sys.stderr.write('Processing ' + str(uset) + ' set\n') - if uset > total: - uset = total - subset = api.files.bulk_get(files=bulk_ids[i:uset]) - file_objs.extend(subset) - - high_gain = config_data['cnv_high_gain'] - - x = 1 - m = 50 - with 
concurrent.futures.ThreadPoolExecutor(config_data['threads']) as executor: - if manifest is not None: - results = {executor.submit(mt_adjust_cn, obj): obj for obj in file_objs} - else: - results = {executor.submit(mt_adjust_cn, bs_id): bs_id for bs_id in bs_cbio_dict} - for result in concurrent.futures.as_completed(results): - if result.result()[0] == 1: - 'Had trouble processing object ' + result.result([1] + '\n') - sys.exit(1) - if x % m == 0: - sys.stderr.write('Processed ' + str(x) + ' samples\n') - sys.stderr.flush() - x += 1 - sys.stderr.write('Conversion completed. Writing results to file\n') - new_fname = cbio_dx = args.merged_cnv_dir + '/' + parts.group(1) + '.discrete_cnvs.txt' - data.to_csv(new_fname, sep='\t', mode='w', index=False) diff --git a/scripts/2021-Sep-7_ETL_DEPRECATED/5d_merge_seg.py b/scripts/2021-Sep-7_ETL_DEPRECATED/5d_merge_seg.py deleted file mode 100644 index 5a64ef9..0000000 --- a/scripts/2021-Sep-7_ETL_DEPRECATED/5d_merge_seg.py +++ /dev/null @@ -1,73 +0,0 @@ -#!/usr/bin/env python3 -import sys -import os -import argparse -import json -import subprocess -from get_file_metadata_helper import get_file_metadata -import concurrent.futures -import pdb - - -def process_seg(cur_seg, new_seg, cbio_tum_id, limit): - cur_seg = open(cur_seg) - next(cur_seg) - for seg in cur_seg: - data = seg.rstrip('\n').split('\t') - data[0] = cbio_tum_id - # cbio does not like leading chr in chromosome name - data[1] = data[1].replace("chr", "") - new_seg.write("\t".join(data) + "\n") - - -def process_tbl(cbio_dx, file_meta_dict, limit): - try: - x = 0 - # project/disease name should be name of directory hosting datasheet - sys.stderr.write('Processing ' + cbio_dx + ' project' + '\n') - new_seg = open(out_dir + cbio_dx + ".merged_seg.txt", 'w') - new_seg.write(seg_file_header) - for cbio_tum_id in file_meta_dict[cbio_dx]: - cbio_norm_id = file_meta_dict[cbio_dx][cbio_tum_id]['cbio_norm_id'] - fname = file_meta_dict[cbio_dx][cbio_tum_id]['fname'] - sys.stderr.write("Found relevant seg to process for {} {} {} {} {}\n".format(cbio_tum_id,cbio_norm_id,file_meta_dict[cbio_dx][cbio_tum_id]['kf_tum_id'],file_meta_dict[cbio_dx][cbio_tum_id]['kf_norm_id'],fname)) - sys.stderr.flush() - process_seg(seg_dir + fname, new_seg, cbio_tum_id, limit) - x += 1 - sys.stderr.write('Completed processing ' + str(x) + ' entries in ' + cbio_dx + '\n') - new_seg.close() - except Exception as e: - sys.stderr.write(e) - sys.exit() - - -parser = argparse.ArgumentParser(description='Merge seg files') -parser.add_argument('-t', '--table', action='store', dest='table', - help='Table with cbio project, kf bs ids, cbio IDs, and file names') -parser.add_argument('-m', '--seg-dir', action='store', dest='seg_dir', help='seg file directory') -parser.add_argument('-j', '--config', action='store', dest='config_file', help='json config file with data types and ' - 'data locations') - -args = parser.parse_args() -if __name__ == "__main__": - with open(args.config_file) as f: - config_data = json.load(f) - seg_file_header = "ID\tchrom\tloc.start\tloc.end\tnum.mark\tseg.mean\n" - # get maf file ext - seg_dir = args.seg_dir - if seg_dir[-1] != '/': - seg_dir += '/' - file_meta_dict = get_file_metadata(args.table, 'seg') - # not sure about this, seg files might not allow for droppig regions... 
- limit = config_data['cnv_min_len'] - - out_dir = 'merged_cnvs/' - try: - os.mkdir(out_dir) - except: - sys.stderr.write('output dir already exists\n') - - with concurrent.futures.ProcessPoolExecutor(config_data['cpus']) as executor: - results = {executor.submit(process_tbl, cbio_dx, file_meta_dict, limit): cbio_dx for cbio_dx in file_meta_dict} - - sys.stderr.write('Done, check logs\n') diff --git a/scripts/2021-Sep-7_ETL_DEPRECATED/6_merge_rename_rsem.py b/scripts/2021-Sep-7_ETL_DEPRECATED/6_merge_rename_rsem.py deleted file mode 100755 index ab5496e..0000000 --- a/scripts/2021-Sep-7_ETL_DEPRECATED/6_merge_rename_rsem.py +++ /dev/null @@ -1,127 +0,0 @@ -#!/usr/bin/env python3 - -import sys -import argparse -import os -import concurrent.futures -import gzip -import pandas as pd -import numpy as np -from scipy import stats -import pdb - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Merge rsem files using cavatica file info.') - parser.add_argument('-t', '--table', action='store', dest='table', - help='Table with cbio project, kf bs ids, cbio IDs, and file names') - parser.add_argument('-r', '--rsem-dir', action='store', dest='rsem_dir', help='rsem file directory') - args = parser.parse_args() - - - def mt_collate_df(rsem_file): - try: - sample = rna_subset.loc[rna_subset['File_Name'] == rsem_file, 'Cbio_Tumor_Name'].iloc[0] - if sample not in seen_dict: - current = pd.read_csv(rsem_dir + rsem_file, sep="\t", index_col=0) - cur_subset = current[['FPKM']] - cur_subset.rename(columns={"FPKM": sample}, inplace=True) - df_list.append(cur_subset) - seen_dict[sample] = 1 - else: - sys.stderr.write(sample + " was already seen, likely due to mulitple dx, skipping!\n") - return 1 - except Exception as e: - sys.stderr.write(str(e) + "\nFailed to process " + rsem_file) - - - rsem_dir = args.rsem_dir - - if rsem_dir[-1] != '/': - rsem_dir += '/' - - # file_meta_dict = get_file_metadata(args.table, 'rsem') - out_dir = 'merged_rsem/' - try: - os.mkdir(out_dir) - except: - sys.stderr.write('output dir already exists\n') - sys.stderr.flush() - - # Use cbio table to create master table to choose represntative gene symbol, calc z scores, output both - all_file_meta = pd.read_csv(args.table, sep="\t") - rna_subset = all_file_meta.loc[all_file_meta['File_Type'] == 'rsem'] - rsem_list = rna_subset['File_Name'].to_list() - # init_tbl = pd.read_csv(rsem_dir + rsem_list[0], sep="\t", index_col=0) - # Get cbio name to rename columns in master table - # master_tbl = init_tbl[['FPKM']] - # master_tbl.rename(columns={"FPKM": sample}, inplace=True) - sys.stderr.write('Creating merged rsem table\n') - x = 1 - m = 50 - - df_list = [] - # some samples have more than one dx, need only process once - seen_dict = {} - with concurrent.futures.ThreadPoolExecutor(16) as executor: - results = {executor.submit(mt_collate_df, rsem_list[i]): rsem_list[i] for i in range(len(rsem_list))} - for result in concurrent.futures.as_completed(results): - if x % m == 0: - sys.stderr.write('Merged ' + str(x) + ' files\n') - sys.stderr.flush() - x += 1 - master_tbl = pd.concat(df_list, axis=1) - del df_list - - sys.stderr.write('Merge complete, picking representative gene transcripts\n') - sys.stderr.flush() - master_tbl.reset_index(inplace=True) - # drop ens_id_sym and replace with just sym - gene_id_sym_list = master_tbl['gene_id'].to_list() - gene_sym_list = [] - for entry in gene_id_sym_list: - parts = entry.split('_') - gene_sym = '_'.join(parts[1:]) - gene_sym_list.append(gene_sym) - 
master_tbl['Hugo_Symbol'] = gene_sym_list - master_tbl.drop(columns =["gene_id"], inplace = True) - sample_list=list(master_tbl.columns.values) - # remove value with Hugo_Symbol - h_idx = sample_list.index('Hugo_Symbol') - sample_list.pop(h_idx) - # look for repeated instances of gene symbol, use highest average exp as rep value - dup_hugo_tbl = master_tbl[master_tbl.duplicated(['Hugo_Symbol'])] - rpt_sym = dup_hugo_tbl.Hugo_Symbol.unique() - for sym in rpt_sym: - gene_eval = master_tbl.loc[master_tbl['Hugo_Symbol'] == sym] - mean_expr = gene_eval[sample_list].mean(axis=1).sort_values(ascending=False) - to_dump = list(mean_expr.index)[1:] - master_tbl.drop(to_dump, inplace=True) - master_tbl.set_index('Hugo_Symbol', inplace=True) - gene_sym_list = master_tbl.index - project_list = rna_subset.Cbio_project.unique() - - sys.stderr.write('Outputting FPKM expression results\n') - sys.stderr.flush() - for project in project_list: - sub_sample_list = list(rna_subset.loc[rna_subset['Cbio_project'] == project, 'Cbio_Tumor_Name']) - expr_fname = out_dir + project + '.rsem_merged.txt' - master_tbl[sub_sample_list].to_csv(expr_fname, sep='\t', mode='w', index=True) - - sys.stderr.write('Calculating z scores\n') - sys.stderr.flush() - - z_scored = stats.zscore(np.log2(np.array(master_tbl + 1)), axis = 1) - del master_tbl - master_zscore_log = pd.DataFrame(z_scored, index=gene_sym_list, columns=sample_list) - # this may be memory-intensive for some insane reason... - sys.stderr.write("Replacing NaN with 0\n") - sys.stderr.flush() - master_zscore_log.fillna(0, inplace=True) - sys.stderr.write('Outputting z scored results\n') - sys.stderr.flush() - - for project in project_list: - sub_sample_list = list(rna_subset.loc[rna_subset['Cbio_project'] == project, 'Cbio_Tumor_Name']) - zscore_fname = out_dir + project + '.rsem_merged_zscore.txt' - master_zscore_log[sub_sample_list].to_csv(zscore_fname, sep='\t', mode='w', index=True, float_format='%.4f') \ No newline at end of file diff --git a/scripts/2021-Sep-7_ETL_DEPRECATED/7_convert_fusion.py b/scripts/2021-Sep-7_ETL_DEPRECATED/7_convert_fusion.py deleted file mode 100755 index a56bb12..0000000 --- a/scripts/2021-Sep-7_ETL_DEPRECATED/7_convert_fusion.py +++ /dev/null @@ -1,99 +0,0 @@ -import sys -import argparse -import json -import os -import pandas as pd -import numpy as np - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Convert openPBTA fusion table OR list of annofuse files to cbio format.') - parser.add_argument('-t', '--table', action='store', dest='table', - help='Table with cbio project, kf bs ids, cbio IDs, and file names') - parser.add_argument('-f', '--fusion-results', action='store', dest='fusion_results', help='openPBTA fusion file OR annoFuse results dir') - parser.add_argument('-m', '--mode', action='store', dest='mode', help='describe source, pbta or annofuse', required=True) - parser.add_argument('-s', '--center-file', action='store', dest='sq_file', help='File with BS IDs and sequencing centers. Should have headered columns: BS_ID\tSQ_Value') - parser.add_argument('-o', '--out-dir', action='store', dest='out_dir', default='merged_fusion/', help='Result output dir. Default is merged_fusion') - args = parser.parse_args() - if args.mode != "pbta" and args.mode != "annofuse": - sys.stderr.write("-m mode argument must be one of pbta or annofuse. It is case sensitive. 
You put " + args.mode + "\n") - exit(1) - def init_cbio_master(fusion_results, mode, rna_metadata): - if mode == 'pbta': - fusion_data = pd.read_csv(fusion_results, sep="\t") - return fusion_data[['Gene1A', 'Sample', 'FusionName', 'CalledBy', 'Fusion_Type']] - else: - flist = rna_metadata.File_Name - frame_list = [] - for i in range(0, len(flist), 1): - ann_file = pd.read_csv(fusion_results + "/" + flist[i], sep="\t") - frame_list.append(ann_file) - concat_frame = pd.concat(frame_list) - del frame_list - fusion_data = concat_frame[['Gene1A', 'Sample', 'FusionName', 'Caller', 'Fusion_Type']] - del concat_frame - fusion_data = fusion_data.groupby(['Gene1A', 'Sample', 'FusionName', 'Fusion_Type'])['Caller'].apply(', '.join).reset_index() - fusion_data['Caller'] = fusion_data['Caller'].str.upper() - return fusion_data - - - out_dir = args.out_dir - try: - os.mkdir(out_dir) - except: - sys.stderr.write('output dir already exists\n') - sys.stderr.flush() - - # deal only with RNA metadata - r_ext = 'rsem' # openPBTA data would cross-reference with expression results otherwise... - sq_info = pd.read_csv(args.sq_file, sep="\t") - all_file_meta = pd.read_csv(args.table, sep="\t") - if args.mode == 'annofuse': - r_ext = 'fusion' - rna_subset = all_file_meta.loc[all_file_meta['File_Type'] == r_ext] - rna_subset.rename(columns={"T_CL_BS_ID": "Sample"}, inplace=True) - rna_subset.set_index('Sample', inplace=True) - project_list = rna_subset.Cbio_project.unique() - cbio_master = init_cbio_master(args.fusion_results, args.mode, rna_subset) - - # Get relevant columns - cbio_master.set_index('Sample', inplace=True) - sq_info.rename(columns={"BS_ID": "Sample"}, inplace=True) - sq_info.set_index('Sample', inplace=True) - # get cbio IDs and seq center - cbio_master = pd.merge(cbio_master, rna_subset[['Cbio_Tumor_Name']], on='Sample', how='inner') - cbio_master = pd.merge(cbio_master, sq_info[['SQ_Value']], on='Sample', how='inner') - # drop bs ids - cbio_master.reset_index(inplace=True) - cbio_master.drop('Sample', axis=1, inplace=True) - if args.mode == 'pbta': - cbio_master.rename(columns={"Gene1A": "Hugo_Symbol", "FusionName": "Fusion", "CalledBy": "Method", - "Fusion_Type": "Frame", "Cbio_Tumor_Name": "Tumor_Sample_Barcode", "SQ_Value": "Center"}, inplace=True) - else: - cbio_master.rename(columns={"Gene1A": "Hugo_Symbol", "FusionName": "Fusion", "Caller": "Method", - "Fusion_Type": "Frame", "Cbio_Tumor_Name": "Tumor_Sample_Barcode", "SQ_Value": "Center"}, inplace=True) - - cbio_master['DNA_support'] = "no" - cbio_master['RNA_support'] = "yes" - # create blank entrez ID column so that 3' gene names can be searched - cbio_master['Entrez_Gene_Id'] = "" - # Reorder table - order_list = ['Hugo_Symbol', 'Entrez_Gene_Id', 'Center', 'Tumor_Sample_Barcode', 'Fusion','DNA_support', 'RNA_support', 'Method', 'Frame'] - cbio_master = cbio_master[order_list] - cbio_master.set_index('Hugo_Symbol', inplace=True) - for project in project_list: - sub_sample_list = list(rna_subset.loc[rna_subset['Cbio_project'] == project, 'Cbio_Tumor_Name']) - fus_fname = out_dir + project + '.fusions.txt' - fus_file = open(fus_fname, 'w') - fus_tbl = cbio_master[cbio_master.Tumor_Sample_Barcode.isin(sub_sample_list)] - fus_tbl.reset_index(inplace=True) - fus_head = "\t".join(fus_tbl.columns) + "\n" - fus_file.write(fus_head) - fus_data = list(fus_tbl.values) - # "hack" to allow 3' end of fusion to be searched - for data in fus_data: - fus_file.write("\t".join(data) + "\n") - (a, b) = data[4].split('--') - if a != b: - data[0] = b - 
fus_file.write("\t".join(data) + "\n") - fus_file.close() diff --git a/scripts/2021-Sep-7_ETL_DEPRECATED/8_organize_upload_packages.py b/scripts/2021-Sep-7_ETL_DEPRECATED/8_organize_upload_packages.py deleted file mode 100755 index 21846a6..0000000 --- a/scripts/2021-Sep-7_ETL_DEPRECATED/8_organize_upload_packages.py +++ /dev/null @@ -1,218 +0,0 @@ -#!/usr/bin/env python3 - -import sys -import argparse -import os -import concurrent.futures -import re -import json -import subprocess -import pdb - - -parser = argparse.ArgumentParser(description='Create cases lists, meta files, and organize data for cbio upload.' - ' It is assumed you are at the dir level of all input data files') -parser.add_argument('-o', '--output_dir', action='store', dest='out_dir', help='output directory name') -parser.add_argument('-d', '--dx-file', action='store', dest='dx_file', help='Dx index file with cbio short and long names') -parser.add_argument('-c', '--config', action='store', dest='config_file', help='json config file with meta information; see REFS/case_meta_config.json example') -args = parser.parse_args() - -def process_meta_study(meta_data, output_dir, study): - meta_study = open(output_dir + "meta_study.txt", "w") - meta_study.write("type_of_cancer: " + study + "\n") - # default is dx value, otherwise make it "cancer_study_identifier" value from config, in not empty - canc_study_id = study - if meta_data["cancer_study_identifier"] != "": - canc_study_id = meta_data["cancer_study_identifier"] - canc_study_id += meta_data['dir_suffix'] - meta_study.write("cancer_study_identifier: " + canc_study_id + "\n") - if "reference_genome" in meta_data: - meta_study.write("reference_genome: " + meta_data["reference_genome"] + "\n") - name = dx_dict[study] - if meta_data["name_append"] != "": - name += " " + meta_data["name_append"] - meta_study.write("name: " + name + "\n") - desc = meta_data["description"][0] - if len(meta_data["description"]) > 1: - desc += " " + dx_dict[study] + " " + meta_data["description"][1] - meta_study.write("description: " + desc + "\nshort_name: " + canc_study_id + "\ngroups: " + meta_data["groups"] + "\n") - meta_study.close() - return canc_study_id - - -def process_meta_data(meta_data, output_dir, canc_study_id, study): - for dtype in meta_data["dtypes"]: - try: - # pointer for easier readability and dict key traciing - cur_data = meta_data["dtypes"][dtype] - cbio_name = cur_data["cbio_name"] - parts = cbio_name.split("_") - attr_dict = cur_data["meta_file_attr"] - meta_name = "meta_" + "_".join(parts[1:]) - if attr_dict["datatype"] == "FUSION": - meta_name = "meta_" + attr_dict["datatype"] + ".txt" - meta_data_file = open(output_dir + meta_name, "w") - meta_data_file.write("cancer_study_identifier: " + canc_study_id + "\n") - for mkey in attr_dict: - meta_data_file.write(mkey + ": " + attr_dict[mkey] + "\n") - meta_data_file.write("data_filename: " + cbio_name + "\n") - meta_data_file.close() - # create data_ links to data - cmd = "ln -s " + cwd + meta_data["dir"] + "/" + study + "." 
+ cur_data["ext"] + " " + output_dir + cbio_name - subprocess.call(cmd, shell=True) - except Exception as e: - sys.stderr.write(str(e) + " failed processing meta data file\n") - pdb.set_trace() - hold = 1 - - -def process_clinical_data(meta_data, output_dir, canc_study_id, study): - for dtype in meta_data["dtypes"]: - # pointer for easier readability and dict key tracing - try: - cur_data = meta_data["dtypes"][dtype] - cbio_name = cur_data["cbio_name"] - parts = cbio_name.split("_") - meta_name = "meta_" + "_".join(parts[1:]) - meta_data_file = open(output_dir + meta_name, "w") - meta_data_file.write("cancer_study_identifier: " + canc_study_id + "\n") - attr_dict = cur_data["meta_file_attr"] - for mkey in attr_dict: - meta_data_file.write(mkey + ": " + attr_dict[mkey] + "\n") - meta_data_file.write("data_filename: " + cbio_name + "\n") - meta_data_file.close() - # create data_ links to data - cmd = "ln -s " + cwd + meta_data["dir"] + "/" + study + "/" + cbio_name + " " + output_dir + cbio_name - subprocess.call(cmd, shell=True) - except Exception as e: - sys.stderr.write(str(e) + " failed processing meta data file\n") - pdb.set_trace() - hold = 1 - - -def write_case_list(case_key, attr_dict, sample_list, case_dir): - case_file = open(case_dir + case_key + ".txt", "w") - case_file.write("cancer_study_identifier: " + canc_study_id + "\n") - key_list = list(attr_dict) - case_file.write(key_list[0] + ": " + canc_study_id + "_" + attr_dict[key_list[0]] + "\n") - for i in range(1, len(key_list), 1): - to_write = key_list[i] + ": " + attr_dict[key_list[i]] - if key_list[i] == "case_list_description": - to_write += " (" + str(len(sample_list)) + ")" - case_file.write(to_write + "\n") - case_file.write("case_list_ids: " + "\t".join(sample_list) + "\n") - case_file.close() - -def create_case_lists(data_dict, output_dir, canc_study_id, study): - case_dir = output_dir + "case_lists/" - try: - os.mkdir(case_dir) - except: - sys.stderr.write(case_dir + ' already exists.\n') - - muts_list = [] - cna_list = [] - rna_list = [] - fusion_list = [] - - muts_fname = output_dir + config_data["merged_mafs"]["dtypes"]["mutation"]["cbio_name"] - muts_file = open(muts_fname) - head = next(muts_file) - head = next(muts_file) - header = head.rstrip('\n').split('\t') - s_idx = header.index('Tumor_Sample_Barcode') - for line in muts_file: - data = line.rstrip('\n').split('\t') - muts_list.append(data[s_idx]) - muts_file.close() - muts_list = [*{*muts_list}] - if data_dict["merged_cnvs"] == 1: - cna_fname = output_dir + config_data["merged_cnvs"]["dtypes"]["linear"]["cbio_name"] - cna_file = open(cna_fname) - head = next(cna_file) - # assumes header is Hugo_symbols\tsample_name1\tsamplename2 etc, if entrez ID, will need to change! - cna_list = head.rstrip('\n').split('\t')[1:] - cna_file.close() - if data_dict["merged_rsem"] == 1: - # assumes header is Hugo_symbols\tsample_name1\tsamplename2 etc, if entrez ID, will need to change! 
- rna_fname = output_dir + config_data["merged_rsem"]["dtypes"]["counts"]["cbio_name"] - rna_file = open(rna_fname) - head = next(rna_file) - rna_list = head.rstrip('\n').split('\t')[1:] - fusion_list = rna_list - # muts_plus_fusion = muts_list + fusion_list - # muts_plus_fusion = [*{*muts_plus_fusion}] - all_cases = muts_list - write_case_list('cases_sequenced', config_data['cases_sequenced'], muts_list, case_dir) - # write_case_list('cases_sequenced', config_data['cases_sequenced'], muts_list, case_dir) - if len(cna_list) > 0: - write_case_list('cases_cna', config_data['cases_cna'], cna_list, case_dir) - all_cases += cna_list - muts_plus_cna = list(set(muts_list) & set(cna_list)) - write_case_list('cases_cnaseq', config_data['cases_cnaseq'], cna_list, case_dir) - if len(rna_list) > 0: - write_case_list('cases_RNA_Seq_v2_mRNA', config_data['cases_RNA_Seq_v2_mRNA'], rna_list, case_dir) - all_cases += rna_list - if "merged_fusion" in data_keys and data_keys["merged_fusion"] == 1: - write_case_list('cases_sv', config_data['cases_sv'], rna_list, case_dir) - - # loading mutations is a minimum, so if cna exists...3 way file can be made - if len(cna_list) > 0: - three_way = list(set(muts_list) & set(cna_list) & set(rna_list)) - write_case_list('cases_3way_complete', config_data['cases_3way_complete'], three_way, case_dir) - all_cases = [*{*all_cases}] - write_case_list('cases_all', config_data['cases_all'], all_cases, case_dir) - - -cwd = os.getcwd() + "/" -with open(args.config_file) as f: - config_data = json.load(f) - -dx_file = open(args.dx_file) -dx_dict = {} -head = next(dx_file) -for line in dx_file: - (ds_dx, cbio_short, cbio_long) = line.rstrip('\n').split('\t') - dx_dict[cbio_short] = cbio_long -dx_file.close() - -out_dir = args.out_dir -if out_dir[-1] != "/": - out_dir += "/" -try: - os.mkdir(out_dir) -except: - sys.stderr.write(out_dir + ' already exists.\n') - -for dx in dx_dict: - try: - if os.path.isdir(config_data["data_sheets"]["dir"] + "/" + dx): - cur_dir = "" - if config_data["study"]["cancer_study_identifier"] == "": - cur_dir = out_dir + dx - if config_data['study']['dir_suffix'] != "": - cur_dir += config_data['study']['dir_suffix'] - else: - cur_dir = out_dir + config_data["study"]["cancer_study_identifier"] - cur_dir += "/" - try: - os.mkdir(cur_dir) - except: - sys.stderr.write(cur_dir + ' already exists.\n') - sys.stderr.write("Creating meta study file for " + dx + "\n") - canc_study_id = process_meta_study(config_data['study'], cur_dir, dx) - data_keys = {"merged_mafs": 0, "merged_cnvs": 0, "merged_rsem": 0, "merged_fusion": 0} - for key in data_keys: - if key in config_data and config_data[key]["dir"] != "": - data_keys[key] = 1 - process_meta_data(config_data[key], cur_dir, canc_study_id, dx) - sys.stderr.write("Creating meta data files and links for " + key + "\n") - else: - sys.stderr.write("Skipping meta files for " + key + "\n") - sys.stderr.write("Creating clinical meta sheets and links\n") - process_clinical_data(config_data["data_sheets"], cur_dir, canc_study_id, dx) - create_case_lists(data_keys, cur_dir, canc_study_id, dx) - else: - sys.stderr.write("No datasheets for " + dx + ", skipping!\n") - except Exception as e: - sys.stderr.write(str(e) + "\nerror processing files for " + dx + "!\n") diff --git a/scripts/2021-Sep-7_ETL_DEPRECATED/sample_id_builder_helper.py b/scripts/2021-Sep-7_ETL_DEPRECATED/sample_id_builder_helper.py deleted file mode 100755 index eb71ecc..0000000 --- a/scripts/2021-Sep-7_ETL_DEPRECATED/sample_id_builder_helper.py +++ /dev/null @@ 
-1,68 +0,0 @@ -#!/usr/bin/env python3 -import pdb - - -import sys - - -def build_samp_id(style, t_header, info): - if style == 'cbttc_dna_std': - # sample ID will consist of kf external_sample_id with .SUFFIX removed - sid_idx = t_header.index('external_sample_id') - samp_id = info[sid_idx].split('.')[0] - return samp_id - - elif style == 'cbttc_norm_std': - # sample ID will consist of kf external_sample_id with .SUFFIX removed - sid_idx = t_header.index('external_sample_id') - try: - samp_id = info[sid_idx].split('.')[:-1][0] - except Exception as e: - sys.stderr.write(str(e) + "; old formula not working, try direct\n") - samp_id = info[sid_idx] - return samp_id - - elif style == 'cbttc_rna_std': - # sample ID will consist of kf external_sample_id with .SUFFIX removed - sid_idx = t_header.index('external_sample_id') - samp_id = info[sid_idx].split('.')[0] - return samp_id - - elif style == 'pnoc_dna_wgs': - # sample ID will consist of kf external_sample_id with .SUFFIX removed and external_aliquot_id added - sid_idx = t_header.index('external_sample_id') - ali_idx = t_header.index('external_aliquot_id') - samp_id = info[sid_idx].split('.')[0] + '-' + info[ali_idx] - return samp_id - - elif style == 'pnoc_norm': - # sample ID will consist of kf external_sample_id with .SUFFIX removed and external_aliquot_id added - sid_idx = t_header.index('external_sample_id') - ali_idx = t_header.index('external_aliquot_id') - # Different styles for PNOC003 vs 008 - # 008 style has naked 7316 ID, tack on aliquot to differentiate from tumor - samp_id = info[sid_idx] + '-' + info[ali_idx] - - if info[sid_idx][-1] == "S": - # 003 style ending in .WGS or WXS - samp_id = info[sid_idx].split('.')[:-1][0] - return samp_id - - elif style == 'pnoc_rna_wes': - # sample ID will consist of kf external_sample_id with .SUFFIX removed - sid_idx = t_header.index('external_sample_id') - ali_idx = t_header.index('external_aliquot_id') - samp_id = info[sid_idx] + '_' + info[ali_idx] - return samp_id - - elif style == 'maris_dna_wgs': - # sample ID will consist of kf external_id, splitting on - and using only the last part - sid_idx = t_header.index('external_id') - samp_id = info[sid_idx].split("-")[2] - return samp_id - elif style == 'maris_norm_std': - # sample ID will consist of kf external_id, splitting on - and using only the last part - sid_idx = t_header.index('external_id') - samp_id = info[sid_idx].split("-")[2] - return samp_id - diff --git a/scripts/dev/get_files_from_manifest.py b/scripts/dev/get_files_from_manifest.py deleted file mode 100644 index 1647f03..0000000 --- a/scripts/dev/get_files_from_manifest.py +++ /dev/null @@ -1,77 +0,0 @@ -#!/usr/bin/env python3 - -import argparse -import pdb -import sys -import boto3 -import concurrent.futures -import urllib3 -import os -import pandas as pd - - -def mt_type_download(file_type): - """ - Download files from each desired file type at the same time - """ - sys.stderr.write("Downloading " + file_type + " files\n") - sys.stderr.flush() - sub_file_list = list(selected.loc[selected['file_type'] == file_type, 's3_path']) - try: - os.mkdir(file_type) - except Exception as e: - sys.stderr.write(str(e) + " error while making directory for " + file_type + "\n") - for loc in sub_file_list: - out = file_type + "/" + loc.split("/")[-1] - parse_url = urllib3.util.parse_url(loc) - try: - dl_client.download_file(Bucket=parse_url.host, Key=parse_url.path.lstrip("/"), Filename=out) - except Exception as e: - sys.stderr.write(str(e) + " could not download from " + loc + "\n") - 
sys.stderr.flush() - sys.stderr.write("Completed downloading files for " + file_type + "\n") - sys.stderr.flush() - - -parser = argparse.ArgumentParser(description='Get all files for a project.') -parser.add_argument('-m', '--manifest-list', action='store', dest='manifest', help='csv list of of genomic file location manifests') -parser.add_argument('-f', '--file-types', action='store', dest='fts', help='csv list of workflow types to download') -parser.add_argument('-p', '--profile', action='store', dest='profile', help='aws profile name') - -args = parser.parse_args() -# concat multiple possible manifests -sys.stderr.write("Concatenating manifests\n") -sys.stderr.flush() -manifest_list = args.manifest.split(",") -manifest_concat = pd.DataFrame() -for manifest in manifest_list: - sys.stderr.write("Processing " + manifest + "\n") - current = pd.read_csv(manifest, sep=None) - if manifest_concat.empty: - manifest_concat = current.copy() - else: - manifest_concat = manifest_concat.append(current, ignore_index=True) - -file_types = args.fts.split(",") -# subset concatenated manifests -sys.stderr.write("Subsetting concatenated manifest\n") -sys.stderr.flush() -# change col names to lower case for input compatibility -manifest_concat.columns= manifest_concat.columns.str.lower() -selected = manifest_concat[manifest_concat['file_type'].isin(file_types)] -# remove vcfs as we only want mafs -pattern_del = ".vcf.gz" -filter = selected['file_name'].str.contains(pattern_del) -selected = selected[~filter] - -out_file = "manifest_subset.tsv" -selected.to_csv(out_file, sep='\t', mode='w', index=False) - -# download files by type -session = boto3.Session(profile_name=args.profile) -dl_client = session.client('s3') -# pdb.set_trace() - -with concurrent.futures.ThreadPoolExecutor(16) as executor: - results = {executor.submit(mt_type_download, ftype): ftype for ftype in file_types} - diff --git a/scripts/v1_etl_deprecated/0_ETL_wrapper.py b/scripts/v1_etl_deprecated/0_ETL_wrapper.py deleted file mode 100755 index 08c1d64..0000000 --- a/scripts/v1_etl_deprecated/0_ETL_wrapper.py +++ /dev/null @@ -1,318 +0,0 @@ -#!/usr/bin/env python3 - -import sys -import subprocess -import json -import argparse -import os -from argparse import RawTextHelpFormatter - - -def check_vars(var_list, step_sub_list): - flag = 0 - for var in var_list: - if var not in config_data: - flag = 1 - sys.stderr.write('ERROR: ' + var + ' not defined in ' + config_file + ' for steps ' + step_sub_list + '\n') - if flag: - sys.exit(1) - - -def config_precheck(): - if 'all' in steps: - var_list = ['threads', 'cpus'] - check_vars(var_list, 'all') - if '1' in steps or '4' in steps or '5a' in steps or '5b' in steps or '6' in steps: - var_list = ['cpus'] - check_vars(var_list, '1, 4, 5a, 5b, 6') - if '2' in steps: - var_list = ['threads'] - check_vars(var_list, '2') - if 'all' in steps or '1' in steps or '3' in steps or '7' in steps: - var_list = ['rna_flag'] - check_vars(var_list, 'all, 1, 7') - if 'all' in steps or '1' in steps: - var_list = ['cav_dna_projects', 'tn_order'] - check_vars(var_list, 'all, 1') - if config_data['rna_flag']: - check_vars(['cav_rna_projects'], 'all, 1') - if 'all' in steps or '1' in steps or '7' in steps: - var_list = ['cna_flag'] - check_vars(var_list, 'all, 1, 7') - if 'all' in steps or '3' in steps: - var_list = ['dna_samp_id_style', 'dx_tbl_fn', 'ds_pt_desc', 'ds_samp_desc'] - if config_data['rna_flag']: - var_list.append('rna_samp_id_style') - check_vars(var_list, 'all, 3') - input_config_files = 
[config_data['dx_tbl_fn']] - check_files(input_config_files, 'CONFIG', '3') - if 'all' in steps or '5a' in steps: - var_list = ['bedtools', 'cp_only_script', 'bed_genes', 'hugo_tsv'] - check_vars(var_list, 'all, 5a') - input_config_files = [config_data['cp_only_script'], config_data['bed_genes'], - config_data['hugo_tsv']] - check_files(input_config_files, 'CONFIG', '5a') - if 'all' in steps or '6' in steps: - var_list = ['ens_gene_list', 'mapFileLoc'] - check_vars(var_list, 'all, 6') - input_config_files = [config_data['ens_gene_list'], config_data['mapFileLoc']] - check_files(input_config_files, 'CONFIG', '6') - if 'all' in steps or '7' in steps: - var_list = ['dx_tbl_fn', 'data_dir', 'cancStudyID', 'group', 'study_short_desc', 'study_desc_p1', - 'study_desc_p2'] - if config_data['rna_flag']: - var_list.extend(['fpkmFileLoc', 'mapFileLoc', 'genesFileLoc']) - check_vars(var_list, 'all, 7') - if config_data['rna_flag']: - input_config_files = [config_data['mapFileLoc'], config_data['genesFileLoc']] - check_files(input_config_files, 'CONFIG', '7') - input_config_dirs = [config_data['data_dir']] - check_dirs(input_config_dirs, 'CONFIG', '7') - - -def check_files(file_list, ftype, step): - flag = 0 - for fn in file_list: - if fn is None: - if step[0] != '7' and ftype == 'INPUT': - sys.stderr.write('ERROR: Missing an input file completely. Try running ' + script_dir + step + ' -h\n') - else: - sys.stderr.write('ERROR: No file defined of type ' + ftype + ' for step ' + step + '\n') - flag = 1 - elif not os.path.isfile(fn): - sys.stderr.write('ERROR: ' + ftype + ' file ' + fn + ' not found. Check file names and config file\n') - flag = 1 - if flag == 1: - sys.stderr.write('Missing files for step(s) ' + step + '. Exiting.') - sys.exit(1) - - -def check_dirs(dir_list, ftype, step): - flag = 0 - for dn in dir_list: - if dn is None: - if step[0] != '7' and ftype == 'INPUT': - sys.stderr.write('ERROR: Missing an input dir completely. Try running ' + script_dir + step + ' -h\n') - else: - sys.stderr.write('ERROR: No dir defined of type ' + ftype + ' for step ' + step + '\n') - flag = 1 - elif not os.path.isdir(dn): - sys.stderr.write('ERROR: ' + ftype + ' directory ' + dn - + ' not found. Check directory names and config file\n') - flag = 1 - if flag == 1: - sys.stderr.write('Missing directories for step(s) ' + step + '. Exiting.\n') - sys.exit(1) - - -parser = argparse.ArgumentParser(description='Wrapper script to run all or some of ETL component scripts.', - formatter_class=RawTextHelpFormatter) -parser.add_argument('-x', '--steps', action='store', dest='steps', help='csv list of steps to execute, valid choices ' - 'are: "all", "1", "2", "3", "4", "5a", "5b", "6", "7". Refers to scripts:\n' - '1_query_cavatica.py\n' - '2_query_ds_by_bs_id.py\n' - '3_create_data_sheets.py\n' - '4_merge_maf.py\n' - '5a_cnv_genome2gene.py\n' - '5b_merge_cnv.py\n' - '6_merge_rsem.py\n' - '7_createLoadingFiles.R') -parser.add_argument('-c', '--cavatica', action='store', dest='cav', help='cavatica output file name, relevant steps: ' - '1, 2, 3, 4, 5b, 6') -parser.add_argument('-p', '--profile', action='store', dest='profile', help='cavatica profile name. requires ' - '.sevenbridges/credentials file be present. Relevant steps: 1, 6') -parser.add_argument('-j', '--config', action='store', dest='config_file', help='json config file with data types and ' - 'data locations') -parser.add_argument('-u', '--kf-url', action='store', dest='url', - help='Kids First data service url, i.e. https://kf-api-dataservice.kidsfirstdrc.org/. 
' - 'Relevant step: 2') -parser.add_argument('-t', '--tumor', action='store', dest='tumor', - help='file with tumor metadata (see step 2). Relevant step: 3, not needed if step 2 run') -parser.add_argument('-n', '--normal', action='store', dest='normal', help='file with normal metadata (see step 2)' - ' Relevant step: 3, not needed if step 2 run') -parser.add_argument('-s', '--cell-line-supplement', action='store', dest='cl_supp', help='supplemental file with cell ' - 'line meta data - bs idtype. Relevant step: 2, OPTIONAL') -parser.add_argument('-i', '--header', action='store', dest='header', help='File with maf header only. Relevant step: 4') -parser.add_argument('-m', '--maf-dir', action='store', dest='maf_dir', help='maf file directory. Relevant step: 4') -parser.add_argument('-d', '--cnv-dir', action='store', dest='cnv_dir', help='cnv as genome coords file directory.' - ' Relevant step: 5a') -parser.add_argument('-e', '--cnv-dir-gene', action='store', dest='cnv_dir_gene', help='cnv as gene file directory.' - ' Relevant step: 5b, not needed if step 5a run') -parser.add_argument('-r', '--rsem-dir', action='store', dest='rsem_dir', help='rsem file directory. Relevant step: 6') - -if len(sys.argv) == 1: - parser.print_help() - sys.exit(1) - -args = parser.parse_args() - -steps = args.steps.split(',') -valid_steps = ['all', '1', '2', '3', '4', '5a', '5b', '6', '7'] -s_flag = 0 -for step in steps: - if step not in valid_steps: - sys.stderr.write('Invalid step detected: ' + step + '\n') - s_flag = 1 -if s_flag: - sys.stderr.write('Invalid step or steps used. Please specify all or list as csv, i.e. 1,2,3 (no spaces!)\n') - sys.exit(1) - -config_file = args.config_file -init_in = [config_file] - -if config_file is not None: - sys.stderr.write('Doing a quick check to see if config file ' + config_file + ' exists\n') -else: - sys.stderr.write('No config file set! Need file for param -j or --config\n') - sys.exit(1) -check_files(init_in, 'WRAP', '0') -with open(config_file) as f: - config_data = json.load(f) -script_dir = config_data['script_dir'] -if script_dir[-1] != '/': - script_dir += '/' -sys.stderr.write('Doing a quick check to see if script dir ' + script_dir + ' exists\n') -check_dirs([script_dir], 'WRAP', '0') -sys.stderr.write('Doing a quick check to see if relevant vars in config file are set and files and dirs actually ' - 'exist for step(s) ' + args.steps + '\n') -config_precheck() - -if 'all' in steps or '1' in steps: - step1 = '1_query_cavatica.py' - cmd = script_dir + step1 + ' --output ' + args.cav + ' --profile ' + args.profile + ' --config ' \ - + config_file + ' 2> step1.errs' - sys.stderr.write('Running step 1: ' + cmd + '\n') - sys.stderr.flush() - try: - subprocess.check_call(cmd, shell=True) - except Exception as e: - print(e) - sys.stderr.write('Failed at step 1. Maybe check cavatica profile and if sevenbridges api is installed?\n') - sys.exit(1) - outputs = [args.cav] - check_files(outputs, 'OUTPUT', '1') - -if 'all' in steps or '2' in steps: - step2 = '2_query_ds_by_bs_id.py' - inputs = [args.cav] - check_files(inputs, 'INPUT', step2) - cmd = script_dir + step2 + ' --kf-url ' + args.url + ' --cavatica ' + args.cav + ' --config ' \ - + config_file + ' 2> step2.errs' - sys.stderr.write('Running step 2: ' + cmd + '\n') - sys.stderr.flush() - try: - subprocess.check_call(cmd, shell=True) - except Exception as e: - print(e) - sys.stderr.write('Failed at step 2.
Maybe the url is wrong or inaccessible?\n') - sys.exit(1) - outputs = ['tum_bs_ds_info.txt', 'norm_bs_ds_info.txt'] - check_files(outputs, 'OUTPUT', '2') - -if 'all' in steps or '3' in steps: - tumor = args.tumor - normal = args.normal - step3 = '3_create_data_sheets.py' - if '2' in steps or 'all' in steps: - tumor = 'tum_bs_ds_info.txt' - normal = 'norm_bs_ds_info.txt' - inputs = [tumor, normal, args.cav] - check_files(inputs, 'INPUT', step3) - cmd = script_dir + step3 + ' --cavatica ' + args.cav + ' --config ' + config_file + ' --tumor ' \ - + tumor + ' --normal ' + normal - if args.cl_supp is not None: - cmd += ' --cell-line-supplement ' + args.cl_supp - cmd += ' 2> step3.errs' - sys.stderr.write('Running step 3: ' + cmd + '\n') - sys.stderr.flush() - subprocess.call(cmd, shell=True) - outputs = ['datasheets'] - check_dirs(outputs, 'OUTPUT', '3') - -if 'all' in steps or '4' in steps: - step4 = '4_merge_maf.py' - input_files = [args.header, args.cav] - check_files(input_files, 'INPUT', step4) - input_dirs = [args.maf_dir] - check_dirs(input_dirs, 'INPUT', step4) - cmd = script_dir + step4 + ' --header ' + args.header + ' --cavatica ' + args.cav + ' --config ' \ - + config_file + ' --maf-dir ' + args.maf_dir + ' 2> step4.errs' - sys.stderr.write('Running step 4: ' + cmd + '\n') - sys.stderr.flush() - subprocess.call(cmd, shell=True) - output_dirs = ['merged_mafs'] - check_dirs(output_dirs, 'OUTPUT', '4') - -if 'all' in steps or '5a' in steps: - step5a = '5a_cnv_genome2gene.py' - if config_data['cna_flag']: - input_dirs = [args.cnv_dir] - check_dirs(input_dirs, 'INPUT', step5a) - cmd = script_dir + step5a + ' --config ' + config_file + ' --cnv-dir ' + args.cnv_dir \ - + ' 2> step5a.errs' - sys.stderr.write('Running step 5a: ' + cmd + '\n') - sys.stderr.flush() - try: - subprocess.check_call(cmd, shell=True) - except Exception as e: - print(e) - sys.stderr.write('Failed at step 5a. Maybe bedtools is improperly defined?\n') - sys.exit(1) - output_dirs = ['converted_cnvs'] - check_dirs(output_dirs, 'OUTPUT', '5a') - else: - sys.stderr.write('cna_flag set to 0. Check that you have CNA data to process and set to 1 in the config file\n') - sys.stderr.flush() - -if 'all' in steps or '5b' in steps: - step5b = '5b_merge_cnv.py' - if config_data['cna_flag']: - input_files = [args.cav] - check_files(input_files, 'INPUT', step5b) - cnv_gene_dir = args.cnv_dir_gene - if 'all' in steps or '5a' in steps: - cnv_gene_dir = 'converted_cnvs' - input_dirs = [cnv_gene_dir] - check_dirs(input_dirs, 'INPUT', step5b) - cmd = script_dir + step5b + ' --cavatica ' + args.cav + ' --config ' + config_file + ' --cnv-dir-gene ' \ - + cnv_gene_dir + ' 2> step5b.errs' - sys.stderr.write('Running step 5b: ' + cmd + '\n') - sys.stderr.flush() - subprocess.call(cmd, shell=True) - output_dirs = ['merged_cnvs'] - check_dirs(output_dirs, 'OUTPUT', '5b') - else: - sys.stderr.write('cna_flag set to 0. 
Check that you have CNA data to process and set to 1 in the config file\n') - sys.stderr.flush() - -if 'all' in steps or '6' in steps: - step6 = '6_merge_rsem.py' - if config_data['rna_flag']: - input_files = [args.cav] - check_files(input_files, 'INPUT', step6) - input_dirs = [args.rsem_dir] - check_dirs(input_dirs, 'INPUT', step6) - cmd = script_dir + step6 + ' --cavatica ' + args.cav + ' --config ' + config_file + ' --rsem-dir ' \ - + args.rsem_dir + ' --profile ' + args.profile + ' 2> step6.errs' - sys.stderr.write('Running step 6: ' + cmd + '\n') - sys.stderr.flush() - subprocess.call(cmd, shell=True) - output_dirs = ['merged_rsem'] - check_dirs(output_dirs, 'OUTPUT', '6') - else: - sys.stderr.write('rna_flag set to 0. Check that you have RNA data to process and set to 1 in the config file\n') - sys.stderr.flush() - -if 'all' in steps or '7' in steps: - input_files = [] - if config_data['rna_flag']: - input_files.append(config_data['fpkmFileLoc']) - check_files(input_files, 'INPUT', '7') - cmd = 'Rscript ' + script_dir + '7_createLoadingFiles.R ' + config_file + ' 2> step7.errs > step7.out' - sys.stderr.write('Running step 7: ' + cmd + '\n') - sys.stderr.flush() - subprocess.call(cmd, shell=True) - output_dirs = ['processed'] - check_dirs(output_dirs, 'OUTPUT', '7') - -sys.stderr.write('Pipe complete, check logs\n') diff --git a/scripts/v1_etl_deprecated/1_query_cavatica.py b/scripts/v1_etl_deprecated/1_query_cavatica.py deleted file mode 100755 index 45b6de8..0000000 --- a/scripts/v1_etl_deprecated/1_query_cavatica.py +++ /dev/null @@ -1,86 +0,0 @@ -#!/usr/bin/env python3 - -import sevenbridges as sbg -import sys -import argparse -import concurrent.futures -from sevenbridges.http.error_handlers import rate_limit_sleeper, maintenance_sleeper -import re -import json - - -def process_project_tasks(project, task, atype, cna_flag=None): - task_name = task.name.rstrip('\n') - task_id = task.id - if atype == 'DNA' and 'strelka2_vep_maf' in task.outputs and task.status == 'COMPLETED' and \ - re.search(config_data['dna_task_keyword'], task.name): - try: - (bs_id1, bs_id2) = re.findall('BS_[A-Z0-9]*',task.name) - tum_bs_id = bs_id1 - norm_bs_id = bs_id2 - if config_data['tn_order'] == 'NT': - tum_bs_id = bs_id2 - norm_bs_id = bs_id1 - outs = task.outputs['strelka2_vep_maf'].name - if cna_flag == 1: - outs += ',' + task.outputs['cnv_pval'].name - return '\t'.join((tum_bs_id, norm_bs_id, task_name, task_id, atype, outs, project)) + '\n' - except Exception as e: - sys.stderr.write(str(e) + '\n') - sys.stderr.write('Failed to process ' + task.id + ' ' + task.name + ', skipping\n') - elif atype == 'RNA': - try: - if task.outputs is not None and 'RSEM_gene' in task.outputs: - bs_id = task.outputs['RSEM_gene'].metadata['Kids First Biospecimen ID'] - outs = task.outputs['RSEM_gene'].name - return '\t'.join((bs_id, 'NA', task_name, task_id, atype, outs, project)) + '\n' - - except Exception as e: - sys.stderr.write(str(e) + '\n') - sys.stderr.write('Task ' + task_name + ' with ID ' + task_id + ' failed to gather all info\n') - sys.stderr.flush() - return None - - -def get_file_info(api, project, atype, out_fh, cna_flag=None): - proj_tasks = api.tasks.query(project=project).all() - sys.stderr.write('Processing project ' + project + '\n') - x = 1 - n = 20 - with concurrent.futures.ThreadPoolExecutor(config_data['cpus']) as executor: - results = {executor.submit(process_project_tasks, project, task, atype, cna_flag): task for task in proj_tasks} - for result in concurrent.futures.as_completed(results): - 
if x % n == 0: - sys.stderr.write('Processed ' + str(x) + ' tasks\n') - sys.stderr.flush() - outstr = result.result() - if outstr is not None: - out_fh.write(outstr) - x += 1 - - -parser = argparse.ArgumentParser(description='Get all relevant analyzed file outputs from projects in cavatica.') -parser.add_argument('-o', '--output', action='store', dest='out', help='output file name') -parser.add_argument('-p', '--profile', action='store', dest='profile', help='cavatica profile name. requires ' - '.sevenbridges/credentials file be present') -parser.add_argument('-c', '--config', action='store', dest='config_file', help='json config file with data types and ' - 'data locations') - -args = parser.parse_args() -config = sbg.Config(profile=args.profile) -api = sbg.Api(config=config, error_handlers=[ - sbg.http.error_handlers.rate_limit_sleeper, - sbg.http.error_handlers.maintenance_sleeper]) -with open(args.config_file) as f: - config_data = json.load(f) -dna_types = config_data['cav_dna_projects'] -rna_types = config_data['cav_rna_projects'] -out_fh = open(args.out, 'w') -out_fh.write('T/CL BS ID\tNorm BS ID\tTask Name\tTask ID\tAnalyte Type\tRelevant Outputs\tSource Project\n') -for project in dna_types: - get_file_info(api, project, 'DNA', out_fh, config_data['cna_flag']) -for project in rna_types: - get_file_info(api, project, 'RNA', out_fh) -out_fh.close() -sys.stderr.write('Complete\n') - diff --git a/scripts/v1_etl_deprecated/6b_z_score_transform.py b/scripts/v1_etl_deprecated/6b_z_score_transform.py deleted file mode 100644 index d482227..0000000 --- a/scripts/v1_etl_deprecated/6b_z_score_transform.py +++ /dev/null @@ -1,60 +0,0 @@ -import sys -import argparse -import os -import concurrent.futures -import json -import subprocess -import pandas as pd -import numpy as np -from scipy import stats - - -parser = argparse.ArgumentParser(description='Merge rsem files using cavatica file info.') -parser.add_argument('-d', '--merged_rsem_dir', action='store', dest='merged_rsem_dir', help='merged rsem dir') -parser.add_argument('-t', '--table', action='store', dest='table', - help='Table with cbio project, kf bs ids, cbio IDs, and file names, ' + - 'give only if you want to normalize first across all tables, then split') -parser.add_argument('-j', '--config', action='store', dest='config_file', help='json config file with data types ' - 'and data locations') -args = parser.parse_args() - -with open(args.config_file) as f: - config_data = json.load(f) - -flist = subprocess.check_output('find ' + args.merged_rsem_dir + ' -name *.rsem_merged.txt', shell=True) -fname_list = flist.decode().split('\n') -if fname_list[-1] == '': - fname_list.pop() - -if args.table == None: - for fname in fname_list: - data = pd.read_csv(fname, sep="\t", index_col=0) - header = list(data.columns) - gene_list = list(data.index.values) - if len(header) > 2: - new_fname = fname.replace('.rsem_merged.txt', '.rsem_merged_zscore.txt') - z_scored = stats.zscore(np.array(data), axis=1) - new_data = pd.DataFrame(z_scored, index=gene_list, columns=header) - new_data.fillna('NA', inplace=True) - sys.stderr.write('Outputting results to ' + new_fname + '\n') - new_data.to_csv(new_fname, sep='\t', mode='w', index=True) - else: - sys.stderr.write('Only 1 sample found in ' + fname + ', skipping!\n') -else: - mega_tbl = pd.read_csv(fname_list[0], sep="\t", index_col=0) - mega_header = list(mega_tbl.columns) - gene_list = list(mega_tbl.index.values) - sample_info = pd.read_csv(args.table, sep="\t") - for i in range(1, len(fname_list), 1): 
- data = pd.read_csv(fname_list[i], sep="\t", index_col=0) - mega_header.extend(list(data.columns)) - mega_tbl = pd.concat([mega_tbl, data]) - log_tf = np.log(np.array(mega_tbl)) - z_scored = stats.zscore(log_tf, axis = 1) - mega_df = pd.DataFrame(z_scored, index=gene_list, columns=header) - cbio_proj = sample_info.Cbio_project.unique() - for proj in cbio_proj: - new_fname = proj + ".rsem_merged_zscore.txt" - cur = mega_df[[proj]] - cur.fillna('NA', inplace=True) - cur.to_csv(new_fname, sep='\t', mode='w', index=True) \ No newline at end of file diff --git a/scripts/v1_etl_deprecated/7_createLoadingFiles.R b/scripts/v1_etl_deprecated/7_createLoadingFiles.R deleted file mode 100644 index 724c489..0000000 --- a/scripts/v1_etl_deprecated/7_createLoadingFiles.R +++ /dev/null @@ -1,365 +0,0 @@ -#!/usr/bin/env Rscript -############################################## -#Purpose: Code to create Pedcbioportal loading files -#Date: 10/5/2018 -#Author: Pichai Raman, Modified by Miguel Brown -############################################## - -############################################## -#Rough outline -#1. We will start with the clinical file and update it with meta file -#2. We will find corresponding MAF files and create MAF meta file -#3. We will then find corresponding RNA-Seq files and update -############################################## - -#Set Mutation, CNA, and Datasheets Directory -args = commandArgs(trailingOnly=TRUE) -if (length(args)==0) { - stop("Please supply a json config file\n", call.=FALSE) -} -library("rjson") -config_data <- fromJSON(file=args[1]) -data_dir = config_data$data_dir -mutDir <- paste(data_dir, "/merged_mafs/", sep = "") -cnaDir <- paste(data_dir, "/merged_cnvs/", sep = "") -hasRNA = config_data$rna_flag -if (hasRNA){ - fusion_dir <- paste(data_dir, config_data$fusion_dir, sep = "") -} -hasCNA = config_data$cna_flag -dataSheetsDir <- paste(data_dir, "/datasheets/", sep = "") -diseaseMappingFile <- paste(data_dir, "/", config_data$dx_tbl_fn, sep = ""); - - -#Source RNA-Seq Functions -script_dir = config_data$script_dir -# source("RNALoadingHelper.R") -source(paste(script_dir,"/CNALoadingHelper.R", sep = "")) - -#Create Meta File for MAF files -createMutationsMeta <- function(myLoc=NULL, cancStudyID=NULL, myDesc=NULL) -{ - #Create File - myFile <- paste(myLoc, "/meta_mutations_extended.txt", sep=""); - file.create(myFile); - - #Add Cancer Study - cancStudyIDLine <- paste("cancer_study_identifier: ", cancStudyID, sep="") - write(cancStudyIDLine, myFile, append=TRUE) - - #Add stable id - stableIDTypeLine <- "stable_id: mutations" - write(stableIDTypeLine, myFile, append=TRUE) - - #Add Profile Name - profileNameLine <- "profile_name: Mutations" - write(profileNameLine, myFile, append=TRUE) - - #Add Profile Description - profileDescLine <- paste("profile_description: ", myDesc, sep="") - write(profileDescLine, myFile, append=TRUE) - - #Add Genetic Alteration Type - genAltTypeLine <- "genetic_alteration_type: MUTATION_EXTENDED"; - write(genAltTypeLine, myFile, append=TRUE) - - #Add Data Type - dataTypeLine <- "datatype: MAF" - write(dataTypeLine, myFile, append=TRUE) - - #Add Show Profile in analysis Tab - showProfileInTabLine <- "show_profile_in_analysis_tab: true" - write(showProfileInTabLine, myFile, append=TRUE) - - #Add Data_filename - dataFileLine <- "data_filename: data_mutations_extended.txt" - write(dataFileLine, myFile, append=TRUE) - - print(paste("Done Creation Mutations Meta for", cancStudyID, sep = " ")); -} - -#Create Meta File for MAF files -createMetaStudy 
<- function(myLoc=NULL, myTypeOfCancer=NULL, cancStudyID=NULL, myName=NULL, myDesc=NULL, myShortName=NULL, myGroups=NULL) -{ - #Create File - myFile <- paste(myLoc, "/meta_study.txt", sep=""); - file.create(myFile); - - #Add Type of Cancer - typeOfCancerLine <- paste("type_of_cancer: ", myTypeOfCancer, sep="") - write(typeOfCancerLine, myFile, append=TRUE) - - #Add Cancer Study - cancStudyIDLine <- paste("cancer_study_identifier: ", cancStudyID, sep="") - write(cancStudyIDLine, myFile, append=TRUE) - - #Add Profile Name - nameLine <- paste("name: ", myName, sep="") - write(nameLine, myFile, append=TRUE) - - #Add Profile Description - descLine <- paste("description: ", myDesc, sep="") - write(descLine, myFile, append=TRUE) - - #Add Data_filename - shortNameLine <- paste("short_name: ", myShortName, sep="") - write(shortNameLine, myFile, append=TRUE) - - #Add group - groupLine <- paste("groups: ", myGroups, sep="") - write(groupLine, myFile, append=TRUE) - - print(paste("Done Creation Meta Study for", cancStudyID, sep = " ")); -} - -#Create Meta Clinical Sample -createMetaSample <- function(myLoc=NULL, cancStudyID=NULL) -{ - #Create File - myFile <- paste(myLoc, "/meta_clinical_sample.txt", sep=""); - file.create(myFile); - - #Add Cancer Study - cancStudyIDLine <- paste("cancer_study_identifier: ", cancStudyID, sep="") - write(cancStudyIDLine, myFile, append=TRUE) - - #Add Genetic Alteration type - genAltLine <- "genetic_alteration_type: CLINICAL" - write(genAltLine, myFile, append=TRUE) - - #Add Profile Description - datTypeLine <- "datatype: SAMPLE_ATTRIBUTES" - write(datTypeLine, myFile, append=TRUE) - - #Add Data_filename - datFileNameLine <- "data_filename: data_clinical_sample.txt" - write(datFileNameLine, myFile, append=TRUE) - print(paste("Done Creation Meta Sample for", cancStudyID, sep=" ")); -} - -#Create Meta Clinical Patient -createMetaClinical <- function(myLoc=NULL, cancStudyID=NULL) -{ - #Create File - myFile <- paste(myLoc, "/meta_clinical_patient.txt", sep=""); - file.create(myFile); - - #Add Cancer Study - cancStudyIDLine <- paste("cancer_study_identifier: ", cancStudyID, sep="") - write(cancStudyIDLine, myFile, append=TRUE) - - #Add Genetic Alteration type - genAltLine <- "genetic_alteration_type: CLINICAL" - write(genAltLine, myFile, append=TRUE) - - #Add Profile Description - datTypeLine <- "datatype: PATIENT_ATTRIBUTES" - write(datTypeLine, myFile, append=TRUE) - - #Add Data_filename - datFileNameLine <- "data_filename: data_clinical_patient.txt" - write(datFileNameLine, myFile, append=TRUE) - print(paste("Done Creation Meta Patient for", cancStudyID, sep=" ")); -} - - -createCaseLists <- function(myLoc=NULL, cancStudyID=NULL, cancStableID=NULL, caseListName=NULL, caseListDesc=NULL, caseListCat=NULL, myCaseListIDs=NULL) -{ - #Create File - myFile <- paste(myLoc, sep=""); - file.create(myFile); - - #Add Cancer Study - cancStudyIDLine <- paste("cancer_study_identifier: ", cancStudyID, sep="") - write(cancStudyIDLine, myFile, append=TRUE) - - #Add Cancer Stable ID - cancStableIDLine <- paste("stable_id: ", cancStableID, sep="") - write(cancStableIDLine, myFile, append=TRUE) - - #Add Case List Name - caseNameLine <- paste("case_list_name: ", caseListName, sep="") - write(caseNameLine, myFile, append=TRUE) - - #Add Case List Description - caseDescLine <- paste("case_list_description: ", caseListDesc, sep="") - write(caseDescLine, myFile, append=TRUE) - - #Add Case List Category - caseCatLine <- paste("case_list_category: ", caseListCat, sep="") - write(caseCatLine, myFile, 
append=TRUE) - - #Add list ids - myCaseListIDs <- paste(myCaseListIDs, collapse="\t") - caseListIDsLine <- paste("case_list_ids: ", myCaseListIDs, sep="") - write(caseListIDsLine, myFile, append=TRUE) - - print(paste("Done Creation of case list for", cancStudyID, sep=" ")); - -} - - - -createStudyAll <- function(myDisease=NULL, - myName=NULL, - myDescMS=NULL, - myDescMM=config_data$mut_desc, - myDescMRF="profile_description: Expression levels from RNA-Seq (FPKM)", - myDescMRZ="profile_description: Expression levels from RNA-Seq (z-scores)", - myDescMC="profile_description: Predicted copy number values from WGS (Continuous)", - myDescMCD="profile_description: Predicted copy number values from WGS (Discrete)") -{ - myDiseaseStudyFolder <- paste("./processed/", config_data$cancStudyID, sep="") - myCancStudyID <- config_data$cancStudyID; - if (config_data$prepend_cancStudyID_flag){ - myDiseaseStudyFolder <- paste("./processed/", myDisease, "_", config_data$cancStudyID, sep="") - myCancStudyID <- paste(myDisease, "_", config_data$cancStudyID, sep=""); - } - caseListFolder <- paste(myDiseaseStudyFolder, "/case_lists", sep="") - - #Create Directory - dir.create(myDiseaseStudyFolder, recursive = TRUE) - dir.create(caseListFolder, recursive = TRUE); - #Move clinical and patient files - system(paste("cp ", dataSheetsDir, myDisease, "/data* ", myDiseaseStudyFolder, sep="")); - - #Move MAF File - system(paste("cp ", mutDir, myDisease, ".strelka.vep.filtered.maf ", myDiseaseStudyFolder, "/data_mutations_extended.txt", sep="")); - - if(hasCNA){ - #Move CNA File - system(paste("cp ", cnaDir, myDisease, ".predicted_cnv.txt ", myDiseaseStudyFolder, "/data_linear_CNA.txt", sep="")); - system(paste("cp ", cnaDir, myDisease, ".discrete_cnvs.txt ", myDiseaseStudyFolder, "/data_CNA.txt", sep="")); - - #Create CNV meta files - createCNVMeta(myDiseaseStudyFolder, cancStudyID=myCancStudyID, myDescMC); - createCNVDiscreteMeta(myDiseaseStudyFolder, cancStudyID=myCancStudyID, myDescMCD); - #Case List CNA - tmpData <- read.delim(paste(myDiseaseStudyFolder, "/data_linear_CNA.txt", sep="")); - tmpSamps <- as.character(colnames(tmpData)); - tmpSamps <- tmpSamps[2:length(tmpSamps)] - tmpSamps <- gsub("^X", "", tmpSamps) - tmpSamps <- gsub("\\.", "-", tmpSamps) - createCaseLists(paste(myDiseaseStudyFolder, "/case_lists/cases_cna.txt", sep=""), - cancStudyID=myCancStudyID, - cancStableID=paste(myCancStudyID, "_cna", sep=""), - caseListName="Tumor Samples with CNA data", - caseListDesc=paste("All tumors with CNA data (", length(tmpSamps), " samples)", sep=""), - caseListCat="all_cases_with_cna_data", - myCaseListIDs=tmpSamps) - } - - #Create Study met file - createMetaStudy(myDiseaseStudyFolder, - myTypeOfCancer=myDisease, - cancStudyID=myCancStudyID, - myName=myName, - myDesc=myDescMS, - myShortName=myCancStudyID, - myGroups=config_data$group - ); - - #Create mutation meta file - createMutationsMeta(myDiseaseStudyFolder, - cancStudyID=myCancStudyID, - myDesc=myDescMM); - - if(hasRNA) - { - #Create Expression meta files - z_check = createRNASeqExpMatrix(loc=myDiseaseStudyFolder, myDisease=myDisease, dataSheetsDir=dataSheetsDir, res=res); - createExpressionMeta(myDiseaseStudyFolder, cancStudyID=myCancStudyID, myDescMRF); - if (z_check){ - createExpressionZMeta(myDiseaseStudyFolder, cancStudyID=myCancStudyID, myDescMRZ); - } - #Copy fusions files - # system(paste("cp ", fusion_dir, myDisease, "/data_fusions.txt ", myDiseaseStudyFolder, "/data_fusions.txt", sep="")); - # system(paste("cp ", fusion_dir, myDisease, 
"/meta_FUSION.txt ", myDiseaseStudyFolder, "/meta_FUSION.txt", sep="")); - } - - - #Create meta sample & Clinical - createMetaSample(myDiseaseStudyFolder, cancStudyID=myCancStudyID); - createMetaClinical(myDiseaseStudyFolder, cancStudyID=myCancStudyID); - - #Create case lists - #Case List All - tmpData <- read.delim(paste(myDiseaseStudyFolder, "/data_clinical_sample.txt", sep=""), skip=4); - tmpSamps <- as.character(unique(tmpData[,"SAMPLE_ID"])); - createCaseLists(paste(myDiseaseStudyFolder, "/case_lists/cases_all.txt", sep=""), - cancStudyID=myCancStudyID, - cancStableID=paste(myCancStudyID, "_all", sep=""), - caseListName="All Tumors", - caseListDesc=paste("All tumor samples (", length(tmpSamps), " samples)", sep=""), - caseListCat="all_cases_in_study", - myCaseListIDs=tmpSamps) - print("Done Creation cases all"); - - #Case List Mutations - print("Creating mutations case list"); - tmpData <- read.delim(paste(mutDir, myDisease, ".strelka.vep.filtered.maf", sep=""), skip=1, row.names=NULL); - tmpSamps <- as.character(unique(tmpData[,"Tumor_Sample_Barcode"])); - print("Read in mutations"); - #For now, fusions are lumped into mutation data, need to concat together - if (hasRNA){ - print("RNA flag given, reading fusions as mutations"); - tmpData1 <- read.delim(paste(fusion_dir, myDisease, "/data_fusions.txt", sep=""), row.names=NULL); - tmpSamps1 <- as.character(unique(tmpData[,"Tumor_Sample_Barcode"])); - tmpSamps <- unique(rbind(tmpSamps, tmpSamps1)) - print("Done reading fusions"); - } - createCaseLists(paste(myDiseaseStudyFolder, "/case_lists/cases_sequenced.txt", sep=""), - cancStudyID=myCancStudyID, - cancStableID=paste(myCancStudyID, "_sequenced", sep=""), - caseListName="All Sequenced Tumors", - caseListDesc=paste("All sequenced tumor samples (", length(tmpSamps), " samples)", sep=""), - caseListCat="all_cases_with_mutation_data", - myCaseListIDs=tmpSamps) - print("Created sequenced case list"); - - if(hasRNA) - { - #Case List RNA - print("Creating RNA case list"); - tmpData <- read.delim(paste(myDiseaseStudyFolder, "/data_rna_seq_v2_mrna.txt", sep="")); - tmpSamps <- as.character(colnames(tmpData)); - tmpSamps <- tmpSamps[2:length(tmpSamps)] - tmpSamps <- gsub("^X", "", tmpSamps) - tmpSamps <- gsub("\\.", "-", tmpSamps) - createCaseLists(paste(myDiseaseStudyFolder, "/case_lists/cases_RNA_Seq_v2_mRNA.txt", sep=""), - cancStudyID=myCancStudyID, - cancStableID=paste(myCancStudyID, "_rna_seq_v2_mrna", sep=""), - caseListName="Tumor Samples with mRNA data (RNA Seq V2)", - caseListDesc=paste("All samples with mRNA expression data (", length(tmpSamps), " samples)", sep=""), - caseListCat="all_cases_with_mrna_rnaseq_data", - myCaseListIDs=tmpSamps) - } - print("Created RNA case list"); -} - - -#Create studies -diseaseMapping <- read.delim(diseaseMappingFile, header=T, stringsAsFactors=F) -colnames(diseaseMapping) <- c("Disease", "Abbr", "Disease2"); -diseaseMapping <- diseaseMapping[!duplicated(diseaseMapping[,2]),] -rownames(diseaseMapping) <- diseaseMapping[,"Abbr"]; -myFiles <- list.files(dataSheetsDir) - -if(hasRNA) -{ - #Create Expression meta files - source(paste(script_dir,"/RNALoadingHelper.R", sep = "")) - res = init_res(config_data$fpkmFileLoc, config_data$mapFileLoc, config_data$genesFileLoc) -} -for(i in 1:length(myFiles)) -{ - print(i); - myAbbrev <- myFiles[i]; - myDisease <- as.character(diseaseMapping[myAbbrev,3]); - createStudyAll(myDisease=myAbbrev, - myName=paste(myDisease, config_data$study_short_desc, sep=" "), - myDescMS=paste(config_data$study_desc_p1, myDisease, 
config_data$study_desc_p2, sep=" ") - ) - -} - diff --git a/scripts/v1_etl_deprecated/CNALoadingHelper.R b/scripts/v1_etl_deprecated/CNALoadingHelper.R deleted file mode 100644 index df1fbec..0000000 --- a/scripts/v1_etl_deprecated/CNALoadingHelper.R +++ /dev/null @@ -1,89 +0,0 @@ -############################################## -#Purpose: Code to create RNA-Seq raw data per disease -#Date: 9/5/2018 -#Author: Pichai Raman -############################################## - -#Create Meta File for MAF files -createCNVMeta <- function(myLoc=NULL, cancStudyID=NULL, myDesc=NULL) -{ - #Create File - myFile <- paste(myLoc, "/meta_linear_CNA.txt", sep=""); - file.create(myFile); - - #Add Cancer Study - cancStudyIDLine <- paste("cancer_study_identifier: ", cancStudyID, sep="") - write(cancStudyIDLine, myFile, append=TRUE) - - #Add stable id - stableIDTypeLine <- "stable_id: linear_CNA" - write(stableIDTypeLine, myFile, append=TRUE) - - #Add Profile Name - profileNameLine <- "profile_name: copy-number values" - write(profileNameLine, myFile, append=TRUE) - - #Add Profile Description - profileDescLine <- myDesc - write(profileDescLine, myFile, append=TRUE) - - #Add Genetic Alteration Type - genAltTypeLine <- "genetic_alteration_type: COPY_NUMBER_ALTERATION"; - write(genAltTypeLine, myFile, append=TRUE) - - #Add Data Type - dataTypeLine <- "datatype: CONTINUOUS" - write(dataTypeLine, myFile, append=TRUE) - - #Add Show Profile in analysis Tab - showProfileInTabLine <- "show_profile_in_analysis_tab: false" - write(showProfileInTabLine, myFile, append=TRUE) - - #Add Data_filename - dataFileLine <- "data_filename: data_linear_CNA.txt" - write(dataFileLine, myFile, append=TRUE) - - print("Done Creation CNV Meta"); -} - -#Create Meta File for MAF files -createCNVDiscreteMeta <- function(myLoc=NULL, cancStudyID=NULL, myDesc=NULL) -{ - #Create File - myFile <- paste(myLoc, "/meta_CNA.txt", sep=""); - file.create(myFile); - - #Add Cancer Study - cancStudyIDLine <- paste("cancer_study_identifier: ", cancStudyID, sep="") - write(cancStudyIDLine, myFile, append=TRUE) - - #Add stable id - stableIDTypeLine <- "stable_id: cna" - write(stableIDTypeLine, myFile, append=TRUE) - - #Add Profile Name - profileNameLine <- "profile_name: Binned copy-number values" - write(profileNameLine, myFile, append=TRUE) - - #Add Profile Description - profileDescLine <- myDesc - write(profileDescLine, myFile, append=TRUE) - - #Add Genetic Alteration Type - genAltTypeLine <- "genetic_alteration_type: COPY_NUMBER_ALTERATION"; - write(genAltTypeLine, myFile, append=TRUE) - - #Add Data Type - dataTypeLine <- "datatype: DISCRETE" - write(dataTypeLine, myFile, append=TRUE) - - #Add Show Profile in analysis Tab - showProfileInTabLine <- "show_profile_in_analysis_tab: true" - write(showProfileInTabLine, myFile, append=TRUE) - - #Add Data_filename - dataFileLine <- "data_filename: data_CNA.txt" - write(dataFileLine, myFile, append=TRUE) - - print("Done Creation CNV Meta Discrete"); -} diff --git a/scripts/v1_etl_deprecated/RNALoadingHelper.R b/scripts/v1_etl_deprecated/RNALoadingHelper.R deleted file mode 100644 index 5d8ef18..0000000 --- a/scripts/v1_etl_deprecated/RNALoadingHelper.R +++ /dev/null @@ -1,171 +0,0 @@ -############################################## -#Purpose: Code to create RNA-Seq raw data per disease -#Date: 10/5/2018 -#Author: Pichai Raman -############################################## - - -#Read in mapping file -#ExpressionFile - -init_res <- function(fpkmFileLoc=NULL, mapFileLoc=NULL, genesFileLoc=NULL) -{ - 
print(paste("Reading inputs for expression data")) - res = read.table(file=fpkmFileLoc, header=TRUE, sep="\t", check.names = FALSE) - cBioGenes <- levels(read.delim(genesFileLoc)[,2]); - - #Format RNA-Seq Rows - res[,"median"] <- apply(res[3:ncol(res)], FUN=median, MARGIN=1) - res <- res[order(-res[,"median"]),] - res <- res[!duplicated(as.character(res[,2])),] - rownames(res) <- res[,2]; - res <- res[-1:-2]; - res <- res[intersect(rownames(res), cBioGenes),] - res <- res[-ncol(res)] - - #load mapping file and rename columns - mappingFile <- read.delim(mapFileLoc, header=F, stringsAsFactors=F) - rownames(mappingFile) <- mappingFile[,2]; - res <- res[, intersect(colnames(res), rownames(mappingFile))] - colnames(res) <- mappingFile[colnames(res),1] - return(res) - -} - -myZ <- function(x) -{ - x <- (x-mean(x))/sd(x); - return(x); -} - -createRNASeqExpMatrix <- function(loc=NULL, myDisease=NULL, dataSheetsDir=NULL, res=NULL) -{ - print(paste("Reading datasheet info for", myDisease, sep=" ")) - # z score out check - writeOut <- 0 - #Filter and write out - tmpData <- read.delim(paste(dataSheetsDir, myDisease, "/data_clinical_sample.txt", sep=""), skip=4); - tmpSamps <- as.character(tmpData[,"SAMPLE_ID"]); - tmpMat <- data.frame(res[,intersect(tmpSamps, colnames(res))]); - if(ncol(tmpMat)==1) - { - colnames(tmpMat)<- intersect(tmpSamps, colnames(res)); - rownames(tmpMat) <- rownames(res); - } - if(length(intersect(tmpSamps, colnames(res)))>0) - { - dataFPKM <- data.frame(rownames(tmpMat), tmpMat); - - #Write out expression matrix - colnames(dataFPKM)[1] <- c("Hugo_Symbol") - colnames(dataFPKM) <- gsub("^X", "", colnames(dataFPKM)) - colnames(dataFPKM) <- gsub("\\.", "-", colnames(dataFPKM)) - write.table(dataFPKM, paste(loc, "/data_rna_seq_v2_mrna.txt", sep=""), sep="\t", row.names=F, quote=F) - - #Write out Z-score - - if (length(intersect(tmpSamps, colnames(res))) > 1){ - tmpMat <- data.frame(log2(tmpMat+1)); - tmpMat <- data.frame(t(apply(tmpMat, FUN=myZ, MARGIN=1))) - dataZ <- data.frame(rownames(tmpMat), tmpMat); - colnames(dataZ) <- colnames(dataFPKM) - write.table(dataZ, paste(loc, "/data_rna_seq_v2_mrna_median_Zscores.txt", sep=""), sep="\t", row.names=F, quote=F) - writeOut <- 1 - print(paste("Finished writing out Expression Files for", myDisease, sep=" ")); - } - else{ - print(paste("One or fewer samples for", myDisease, "detected, skipping Z score output!", sep=" ")) - } - } - return(writeOut) -} - -#Create Meta File for RNA exp files -createExpressionMeta <- function(myLoc=NULL, cancStudyID=NULL, myDesc=NULL) -{ - #Create File - myFile <- paste(myLoc, "/meta_rna_seq_v2_mrna.txt", sep=""); - file.create(myFile); - - #Add Cancer Study - cancStudyIDLine <- paste("cancer_study_identifier: ", cancStudyID, sep="") - write(cancStudyIDLine, myFile, append=TRUE) - - #Add stable id - stableIDTypeLine <- "stable_id: rna_seq_mrna" - write(stableIDTypeLine, myFile, append=TRUE) - - #Add Profile Name - profileNameLine <- "profile_name: mRNA expression" - write(profileNameLine, myFile, append=TRUE) - - #Add Profile Description - profileDescLine <- myDesc - write(profileDescLine, myFile, append=TRUE) - - #Add Genetic Alteration Type - genAltTypeLine <- "genetic_alteration_type: MRNA_EXPRESSION"; - write(genAltTypeLine, myFile, append=TRUE) - - #Add Data Type - dataTypeLine <- "datatype: CONTINUOUS" - write(dataTypeLine, myFile, append=TRUE) - - #Add Show Profile in analysis Tab - showProfileInTabLine <- "show_profile_in_analysis_tab: false" - write(showProfileInTabLine, myFile, append=TRUE) - - #Add 
Data_filename - dataFileLine <- "data_filename: data_rna_seq_v2_mrna.txt" - write(dataFileLine, myFile, append=TRUE) - - print(paste("Done Creation RNA Expression Meta for", cancStudyID, sep=" ")); -} - -#Create Meta File for RNA exp files -createExpressionZMeta <- function(myLoc=NULL, cancStudyID=NULL, myDesc=NULL) -{ - #Create File - myFile <- paste(myLoc, "/meta_rna_seq_v2_mrna_median_Zscores.txt", sep=""); - file.create(myFile); - - #Add Cancer Study - cancStudyIDLine <- paste("cancer_study_identifier: ", cancStudyID, sep="") - write(cancStudyIDLine, myFile, append=TRUE) - - #Add stable id - stableIDTypeLine <- "stable_id: rna_seq_mrna_median_Zscores" - write(stableIDTypeLine, myFile, append=TRUE) - - #Add Profile Name - profileNameLine <- "profile_name: mRNA expression z-scores" - write(profileNameLine, myFile, append=TRUE) - - #Add Profile Description - profileDescLine <- myDesc; - write(profileDescLine, myFile, append=TRUE) - - #Add Genetic Alteration Type - genAltTypeLine <- "genetic_alteration_type: MRNA_EXPRESSION"; - write(genAltTypeLine, myFile, append=TRUE) - - #Add Data Type - dataTypeLine <- "datatype: Z-SCORE" - write(dataTypeLine, myFile, append=TRUE) - - #Add Show Profile in analysis Tab - showProfileInTabLine <- "show_profile_in_analysis_tab: true" - write(showProfileInTabLine, myFile, append=TRUE) - - #Add Data_filename - dataFileLine <- "data_filename: data_rna_seq_v2_mrna_median_Zscores.txt" - write(dataFileLine, myFile, append=TRUE) - - print(paste("Done Creation Expression Z Meta", cancStudyID, sep=" ")); -} - - - - - - diff --git a/scripts/v1_etl_deprecated/add_case_lists.py b/scripts/v1_etl_deprecated/add_case_lists.py deleted file mode 100755 index d974066..0000000 --- a/scripts/v1_etl_deprecated/add_case_lists.py +++ /dev/null @@ -1,81 +0,0 @@ -#!/usr/bin/env python3 - -import sys -import argparse -import os -import subprocess -import pdb - -def process_col_samps(h_cnt, s_col, fname, samp_dict): - cur = open(fname) - temp = {} - for x in range(h_cnt): - skip = next(cur) - for row in cur: - data = row.rstrip('\n').split('\t') - samp = data[s_col] - if samp not in temp: - temp[samp] = 1 - if samp not in samp_dict: - samp_dict[samp] = 1 - else: - samp_dict[samp] += 1 - cur.close() - return samp_dict - -def process_row_samps(s_col, fname, samp_dict): - cur = open(fname) - head = next(cur) - header = head.rstrip('\n').split('\t') - for x in range(s_col, len(header), 1): - if header[x] not in samp_dict: - samp_dict[header[x]] = 1 - else: - samp_dict[header[x]] += 1 - cur.close() - return samp_dict - - -parser = argparse.ArgumentParser(description='Add case lists to existing projects') -parser.add_argument('-t', '--type', action='store', dest='ctype', help='case_list type to gen, i.e. cnaseq, 3way_complete, SCRIPT SHOULD BE RUN RIGHT AT STUDY DIR LEVEL') - -args = parser.parse_args() -if len(sys.argv) == 1: - parser.print_help() - sys.exit(1) - -# files to search for case building -clist_fdict = {'cnaseq': {'fnames': ['data_mutations_extended.txt', 'data_CNA.txt'], - 'name': 'Tumor samples with mutation and CNA data','desc': 'All tumor samples with mutation and CNA data','cat': 'all_cases_with_mutation_and_cna_data'} , - '3way_complete': { 'fnames': ['data_mutations_extended.txt', 'data_CNA.txt','data_rna_seq_v2_mrna.txt'], - 'name': 'Tumor samples with mutation, CNA and mRNA data','desc': 'All tumor samples with mutation, CNA, and mRNA data','cat': 'all_cases_with_mutation_and_cna_and_mrna_data'}} - -f_cmd = 'find .
-name ' + clist_fdict[args.ctype]['fnames'][0] -result = subprocess.check_output(f_cmd, shell=True) -muts_list = result.decode().split('\n') -for mutFile in muts_list: - if mutFile != '': - s_dict = {} - project = os.path.dirname(mutFile) - project = project.replace('./','') - sys.stderr.write("Generating case lists for project " + project + "\n") - sys.stderr.write("Processing " + mutFile + "\n") - s_dict = process_col_samps(2, 13, mutFile, s_dict) - for i in range(1, len(clist_fdict[args.ctype]['fnames'])): - fname = project + "/" + clist_fdict[args.ctype]['fnames'][i] - sys.stderr.write("Processing " + fname + "\n") - s_dict = process_row_samps(1, fname, s_dict) - - new_case_list = open(project + "/case_lists/cases_" + args.ctype + ".txt", 'w') - new_case_list.write('cancer_study_identifier: ' + project + '\nstable_id: ' + project + '_' + args.ctype + '\ncase_list_name: ' + clist_fdict[args.ctype]['name'] + '\n') - count = len(clist_fdict[args.ctype]['fnames']) - s_list = [] - for samp in s_dict: - if s_dict[samp] == count: - s_list.append(samp) - total = len(s_list) - new_case_list.write('case_list_description: ' + clist_fdict[args.ctype]['desc'] + ' (' + str(total) + ' samples)\ncase_list_category: ' + clist_fdict[args.ctype]['cat'] + '\ncase_list_ids: ' + '\t'.join(s_list)) - new_case_list.close() - sys.stderr.write("Completed creating case list for " + project + "\n") - else: - sys.stderr.write(clist_fdict[args.ctype]['fnames'][0] + " path was empty str. skipping!\n") diff --git a/scripts/v1_etl_deprecated/entrez_id_fusion_format.R b/scripts/v1_etl_deprecated/entrez_id_fusion_format.R deleted file mode 100644 index 5a3825b..0000000 --- a/scripts/v1_etl_deprecated/entrez_id_fusion_format.R +++ /dev/null @@ -1,147 +0,0 @@ -suppressPackageStartupMessages(library("readr")) -suppressPackageStartupMessages(library("dplyr")) -suppressPackageStartupMessages(library("optparse")) -suppressPackageStartupMessages(library(org.Hs.eg.db)) -hs <- org.Hs.eg.db - - -option_list <- list( - make_option(c("-f", "--fusionfile"),type="character", - help="Merged fusion calls from [STARfusion | Arriba] QC filtered for pedcbio"), - make_option(c("-a", "--approvedHugoSymbols"), type="character", - help="Hugo symbol approved gene name list"), - make_option(c("-m","--manifest"),type="character", - help="manifest with Tumor_Sample_Barcode\tCenter"), - make_option(c("-o","--outputfile"),type="character", - help="Standardized fusion calls from [STARfusion | Arriba] (.TSV)") -) - -# Get command line options, if help option encountered print help and exit, -opt <- parse_args(OptionParser(option_list=option_list)) -inputfile <- opt$fusionfile -approved <- opt$approvedHugoSymbols -manifest<-opt$manifest -outputfile <- opt$outputfile - - - -total<-read_tsv(inputfile) %>% as.data.frame() -genes<-c(total$Gene1A,total$Gene1B,total$Gene2A,total$Gene2B) -manifest<-read_tsv(manifest) %>% as.data.frame() %>% unique() - - -my.symbols<-genes -entrez<-select(hs, keys=my.symbols, columns= c("ENTREZID", "SYMBOL"),keytype = "SYMBOL") -nomatch<-entrez[is.na(entrez$ENTREZID),] -#genes without entrez id -#nomatch - -match<-read_tsv(approved) %>% as.data.frame() - - -my.symbols <- total$Gene1A -if (!all(is.na(my.symbols))){ - Gene1A_entrez<-select(hs, - keys = my.symbols, - columns = c("ENTREZID", "SYMBOL"), - keytype = "SYMBOL") - Gene1A_entrez[is.na(Gene1A_entrez$ENTREZID),"ENTREZID"]="" -} else {Gene1A_entrez=data.frame("ENTREZID"="","SYMBOL"="",stringsAsFactors = FALSE)} - -my.symbols <- total$Gene2A -if (!all(is.na(my.symbols))){ -
Gene2A_entrez<-select(hs, - keys = my.symbols, - columns = c("ENTREZID", "SYMBOL"), - keytype = "SYMBOL") - Gene2A_entrez[is.na(Gene2A_entrez$ENTREZID),"ENTREZID"]="" -} else {Gene2A_entrez=data.frame("ENTREZID"="","SYMBOL"="",stringsAsFactors = FALSE)} - -my.symbols <- total$Gene1B -if (!all(is.na(my.symbols))){ - Gene1B_entrez<-select(hs, - keys = my.symbols, - columns = c("ENTREZID", "SYMBOL"), - keytype = "SYMBOL") - Gene1B_entrez[is.na(Gene1B_entrez$ENTREZID),"ENTREZID"]="" -} else {Gene1B_entrez=data.frame("ENTREZID"="","SYMBOL"="",stringsAsFactors = FALSE)} - -my.symbols <- total$Gene2B -if (!all(is.na(my.symbols))){ - Gene2B_entrez<-select(hs, - keys = my.symbols, - columns = c("ENTREZID", "SYMBOL"), - keytype = "SYMBOL") - - Gene2B_entrez[is.na(Gene2B_entrez$ENTREZID),"ENTREZID"]="" -}else {Gene2B_entrez=data.frame("ENTREZID"="","SYMBOL"="",stringsAsFactors = FALSE)} - - -# total$Gene1A_entrez<-Gene1A_entrez$ENTREZID -# total$Gene1B_entrez<-Gene1B_entrez$ENTREZID -# total$Gene2A_entrez<-Gene2A_entrez$ENTREZID -# total$Gene2B_entrez<-Gene2B_entrez$ENTREZID - -total$Gene1A[is.na(total$Gene1A)]<-"" -total$Gene2A[is.na(total$Gene2A)]<-"" -total$Gene1B[is.na(total$Gene1B)]<-"" -total$Gene2B[is.na(total$Gene2B)]<-"" - -total<-total %>% - left_join(unique(Gene1A_entrez), by=c("Gene1A"="SYMBOL")) %>% rename("ENTREZID"="Gene1A_entrez") %>% - left_join(unique(Gene2A_entrez), by=c("Gene2A"="SYMBOL")) %>% rename("ENTREZID"="Gene2A_entrez") %>% - left_join(unique(Gene1B_entrez), by=c("Gene1B"="SYMBOL")) %>% rename("ENTREZID"="Gene1B_entrez") %>% - left_join(unique(Gene2B_entrez), by=c("Gene2B"="SYMBOL")) %>% rename("ENTREZID"="Gene2B_entrez") - - - - -#aggregate -fusion_format<-data.frame("Hugo_Symbol"=total$Gene1A,"Entrez_Gene_Id"=total$Gene1A_entrez,"Tumor_Sample_Barcode"=total$Sample,"Fusion"=total$FusionName,"DNA_support"=rep("no",nrow(total)),"RNA_support"=rep("yes",nrow(total)),"Method"=total$Caller,"Frame"=total$Fusion_Type) -fusion_format<-rbind(fusion_format,data.frame("Hugo_Symbol"=total$Gene2A,"Entrez_Gene_Id"=total$Gene2A_entrez,"Tumor_Sample_Barcode"=total$Sample,"Fusion"=total$FusionName,"DNA_support"=rep("no",nrow(total)),"RNA_support"=rep("yes",nrow(total)),"Method"=total$Caller,"Frame"=total$Fusion_Type)) -fusion_format<-rbind(fusion_format,data.frame("Hugo_Symbol"=total$Gene1B,"Entrez_Gene_Id"=total$Gene1B_entrez,"Tumor_Sample_Barcode"=total$Sample,"Fusion"=total$FusionName,"DNA_support"=rep("no",nrow(total)),"RNA_support"=rep("yes",nrow(total)),"Method"=total$Caller,"Frame"=total$Fusion_Type)) -fusion_format<-rbind(fusion_format,data.frame("Hugo_Symbol"=total$Gene2B,"Entrez_Gene_Id"=total$Gene2B_entrez,"Tumor_Sample_Barcode"=total$Sample,"Fusion"=total$FusionName,"DNA_support"=rep("no",nrow(total)),"RNA_support"=rep("yes",nrow(total)),"Method"=total$Caller,"Frame"=total$Fusion_Type)) - - -fusion_format<-fusion_format %>% left_join(manifest,by=c("Tumor_Sample_Barcode"="Tumor_Sample_Barcode")) - -fusion_format<-fusion_format[!is.na(fusion_format$Hugo_Symbol),] -fusion_format<-unique(fusion_format) - -#filter other -if (nrow(fusion_format[grep("other",fusion_format$Frame),])>0){ -fusion_format<-fusion_format[-grep("other",fusion_format$Frame),] -} -#filter intergenic -if (nrow(fusion_format[grep("/",fusion_format$Fusion),])>0){ -fusion_format<-fusion_format[-grep("/", fusion_format$Fusion),] -} - - -#format for entrez== NA -fusion_format[is.na(fusion_format$Entrez_Gene_Id),"Entrez_Gene_Id"]<-"" -fusion_format<-unique(fusion_format) - -#remove blank Hugo Symbol rows caused by 
Gene2A/2B being empty for STARfusion and geneic arriba calls -fusion_format<-fusion_format[-which(fusion_format$Hugo_Symbol==""),] - -#ensure order of cols correct before print -col_order = c("Hugo_Symbol", "Entrez_Gene_Id", "Center", "Tumor_Sample_Barcode", "Fusion", "DNA_support", "RNA_support", "Method", "Frame") -fusion_format = fusion_format[, col_order] -write.table(fusion_format,outputfile,quote = FALSE,row.names = FALSE,sep="\t") - -# #get Center -# #TGEN -# fusion_format[grep("C0",fusion_format$Tumor_Sample_Barcode),"Center"]<-"TGEN" -# #BGI -# #from Yuankun "all the fq source input are from BGI" -# manifest<-read.delim("~/Downloads/1562961569658-manifest.csv",sep=",",stringsAsFactors=F) -# manifest$Tumor_Sample_Barcode<-sub("_1.fq.gz","",manifest$name) -# manifest$Tumor_Sample_Barcode<-sub("_2.fq.gz","",manifest$Tumor_Sample_Barcode) -# manifest$Tumor_Sample_Barcode<-sub("_RNA","",manifest$Tumor_Sample_Barcode) -# manifest$Tumor_Sample_Barcode<-sub("7316_","7316-",manifest$Tumor_Sample_Barcode) -# fusion_format[which(fusion_format$Tumor_Sample_Barcode %in% manifest$Tumor_Sample_Barcode),"Center"]<-"BGI" -# #NANT -# fusion_format[-which(fusion_format$Center %in% c("TGEN","BGI")),"Center"]<-"NANT" - - diff --git a/scripts/v1_etl_deprecated/merge_star_fusion.py b/scripts/v1_etl_deprecated/merge_star_fusion.py deleted file mode 100755 index 9786c29..0000000 --- a/scripts/v1_etl_deprecated/merge_star_fusion.py +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/env python3 -import sys -import os -import argparse -import json - -parser = argparse.ArgumentParser(description='Merge and add sample id to STAR Fusion output.') -parser.add_argument('-d', '--dir', action='store', dest='dir', help='star fusion file directory') -parser.add_argument('-j', '--config', action='store', dest='config_file', help='json config file with data types and ' - 'data locations') -parser.add_argument('-i', '--header', action='store', dest='header', help='File with fusion header only') -parser.add_argument('-m', '--manifest', action='store', dest='manifest', help='star fusion cavatica manifest file') - -args = parser.parse_args() -with open(args.config_file) as f: - config_data = json.load(f) -h_file = open(args.header) -header = next(h_file) -header = "tumor_id\t" + header -merged = open('merged_fusions.tsv', 'w') -merged.write(header) -map_dict = {} -for line in open(config_data['mapFileLoc']): - (samp_id, bs_id) = line.rstrip('\n').split('\t') - map_dict[bs_id] = samp_id - - - -manifest = open(args.manifest) -head = next(manifest) -header = head.rstrip('\n').split(',') -b_idx = header.index('Kids First Biospecimen ID') - -for line in manifest: - info = line.rstrip('\n').split(',') - bs_id = info[b_idx] - if bs_id in map_dict: - samp_id = map_dict[bs_id] - floc = args.dir + "/" + info[1] - fus = open(floc) - skip = next(fus) - for fdata in fus: - merged.write(samp_id + "\t" + fdata) -merged.close() \ No newline at end of file diff --git a/utilities/add_fusion_to_sample_sheets.py b/utilities/add_fusion_to_sample_sheets.py deleted file mode 100644 index 45db571..0000000 --- a/utilities/add_fusion_to_sample_sheets.py +++ /dev/null @@ -1,39 +0,0 @@ -import sys -import argparse -import pandas as pd -import os - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Add fusion flag to data clinical sample') - parser.add_argument('-f', '--fusion-dir', action='store', dest='fusion_dir', - help='Dir with fusions') - parser.add_argument('-d', '--datasheet-list', action='store', dest='datasheet_list', 
diff --git a/utilities/add_fusion_to_sample_sheets.py b/utilities/add_fusion_to_sample_sheets.py
deleted file mode 100644
index 45db571..0000000
--- a/utilities/add_fusion_to_sample_sheets.py
+++ /dev/null
@@ -1,39 +0,0 @@
-import sys
-import argparse
-import pandas as pd
-import os
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description='Add fusion flag to data clinical sample')
-    parser.add_argument('-f', '--fusion-dir', action='store', dest='fusion_dir',
-                        help='Dir with fusions')
-    parser.add_argument('-d', '--datasheet-list', action='store', dest='datasheet_list',
-                        help='cbio data sample sheet list')
-    parser.add_argument('-o', '--out-dir', action='store', dest='out_dir', help='output dir - don\'t make it the same as the current datasheet location!')
-
-    args = parser.parse_args()
-    added_header = ['WITH_FUSION_DATA',
-                    'Indicator that sample has fusions associated with it',
-                    'STRING',
-                    '1',
-                    'WITH_FUSION_DATA']
-    os.mkdir(args.out_dir)
-    datasheet_list = open(args.datasheet_list)
-    for fname in datasheet_list:
-        datasheet = open(fname.rstrip('\n'))
-        parts = fname.rstrip('\n').split("/")
-        fusion_data = pd.read_csv(args.fusion_dir + "/" + parts[-2] + ".fusions.txt", sep="\t")
-        sample_list = fusion_data.Tumor_Sample_Barcode.unique()
-        cur_dir = args.out_dir + "/" + parts[-2]
-        os.mkdir(cur_dir)
-        out = open(cur_dir + "/" + parts[-1], "w")
-        for i in range(0, len(added_header)):
-            head = next(datasheet)
-            out.write(head.rstrip('\n') + '\t' + added_header[i] + '\n')
-        for line in datasheet:
-            out.write(line.rstrip('\n'))
-            data = line.split('\t')
-            if data[1] in sample_list:
-                out.write('\tYES\n')
-            else:
-                out.write('\tNO\n')
-        out.close()
\ No newline at end of file
diff --git a/utilities/adjust_patient_inputs.py b/utilities/adjust_patient_inputs.py
deleted file mode 100755
index 4c03347..0000000
--- a/utilities/adjust_patient_inputs.py
+++ /dev/null
@@ -1,194 +0,0 @@
-#!/usr/bin/env python3
-
-import sys
-import argparse
-import os
-import re
-from shutil import copyfile
-import concurrent.futures
-# import pdb
-
-
-def process_case_files(old_cdir, new_cdir, case_files, rm_sample_list):
-    f = 0
-    for fn in case_files:
-        old_cf = open(old_cdir + '/' + fn)
-        new_cf = open(new_cdir + '/' + fn, 'w')
-        new_s_list = []
-        # print standard lines
-        standard = next(old_cf)
-        new_cf.write(standard)
-        standard = next(old_cf)
-        new_cf.write(standard)
-        standard = next(old_cf)
-        new_cf.write(standard)
-        # hold lines to change description
-        desc = next(old_cf)
-        buffer = next(old_cf)
-        entry = next(old_cf)
-        info = entry.rstrip('\n').split()
-        for i in range(1, len(info)):
-            if info[i] not in rm_sample_list:
-                new_s_list.append(info[i])
-        f = len(new_s_list)
-        desc = re.sub(r'\d+', str(f), desc)
-        new_cf.write(desc + buffer + info[0] + ' ' + '\t'.join(new_s_list) + '\n')
-        old_cf.close()
-        new_cf.close()
-
-    return f
-
-
-def process_data_clinical(old_ds, new_ds, rm_patient_list):
-    for entry in old_ds:
-        info = entry.split('\t')
-        if info[0] not in rm_patient_list:
-            new_ds.write(entry)
-
-
-def process_samp_in_head(old_file, new_file, rm_sample_list):
-    try:
-        old_head = next(old_file)
-        good_i = []
-        head_split = old_head.rstrip('\n').split('\t')
-        new_head = []
-        new_head.append(head_split[0])
-        for i in range(1, len(head_split)):
-            if head_split[i] not in rm_sample_list:
-                good_i.append(i)
-                new_head.append(head_split[i])
-        new_file.write('\t'.join(new_head) + '\n')
-        for data in old_file:
-            datum = data.rstrip('\n').split('\t')
-            new_out = datum[0]
-            for i in good_i:
-                new_out += '\t' + datum[i]
-            new_file.write(new_out + '\n')
-    except Exception as e:
-        print(e)
-        exit(1)
-
-
-def process_samp_in_tbl(old_file, new_file, rm_sample_list):
-    # pdb.set_trace()
-    head = next(old_file)
-    new_file.write(head)
-    for data in old_file:
-        datum = data.split('\t')
-        if datum[13] not in rm_sample_list:
-            new_file.write(data)
-
-
-def cleanup_all(dname, rm_sample_list, rm_patient_list, new_out):
-    try:
-        # redo case list files
-        new_cdir = new_out + '/case_lists'
-        os.mkdir(new_cdir)
-        old_cdir = dname + '/case_lists'
-        case_files = os.listdir(old_cdir)
-        n_samp = process_case_files(old_cdir, new_cdir, case_files, rm_sample_list)
-        sys.stderr.write('Reprocessed case lists for ' + dname + '\n')
-        sys.stderr.flush()
-        if n_samp == 0:
-            sys.stderr.write('All samples in ' + dname + ' were REMOVED!\n')
-        else:
-            # redo data_clinical sheets
-            new_dss = open(new_out + '/data_clinical_sample.txt', 'w')
-            old_dss = open(dname + '/data_clinical_sample.txt')
-            process_data_clinical(old_dss, new_dss, rm_patient_list)
-            new_dss.close()
-
-            new_dps = open(new_out + '/data_clinical_patient.txt', 'w')
-            old_dps = open(dname + '/data_clinical_patient.txt')
-            process_data_clinical(old_dps, new_dps, rm_patient_list)
-            new_dps.close()
-            sys.stderr.write('Reprocessed data clinical sheets for ' + dname + '\n')
-            sys.stderr.flush()
-
-            samp_in_head_fn = ['data_CNA.txt', 'data_linear_CNA.txt', 'data_rna_seq_v2_mrna_median_Zscores.txt',
-                               'data_rna_seq_v2_mrna.txt']
-            for fn in samp_in_head_fn:
-                old_fh = open(dname + '/' + fn)
-                new_fh = open(new_out + '/' + fn, 'w')
-                process_samp_in_head(old_fh, new_fh, rm_sample_list)
-                new_fh.close()
-                sys.stderr.write('Reprocessed ' + fn + ' for ' + dname + '\n')
-                sys.stderr.flush()
-
-            samp_in_tbl = ['data_mutations_extended.txt']
-            for fn in samp_in_tbl:
-                old_fh = open(dname + '/' + fn)
-                new_fh = open(new_out + '/' + fn, 'w')
-                process_samp_in_tbl(old_fh, new_fh, rm_sample_list)
-                new_fh.close()
-                sys.stderr.write('Reprocessed ' + fn + ' for ' + dname + '\n')
-                sys.stderr.flush()
-            # copy over all meta tables
-            meta_list = os.listdir(dname)
-            for fn in meta_list:
-                if fn[0:5] == 'meta_':
-                    copyfile((dname + '/' + fn), (new_out + '/' + fn))
-                    sys.stderr.write('Copied meta file ' + fn + '\n')
-                    sys.stderr.flush()
-    except Exception as e:
-        print(e)
-
-
-def check_data_sheet(dname, exc_list, outdir):
-    try:
-        cur = open(dname + '/data_clinical_sample.txt')
-        f = 0
-        rm_slist = []
-        rm_plist = []
-        for line in cur:
-            info = line.rstrip('\n').split('\t')
-            if info[0] in exc_list:
-                f = 1
-                rm_slist.append(info[1])
-                if info[0] not in rm_plist:
-                    rm_plist.append(info[0])
-        cur.close()
-        if f == 0:
-            sys.stderr.write('No samples to remove from ' + dname + '\n')
-            sys.stderr.flush()
-        else:
-            sys.stderr.write(dname + ' has ' + str(len(rm_slist)) + ' samples to be excluded: ' + '\t'.join(rm_slist)
-                             + ' from patients ' + '\t'.join(rm_plist) + '\n')
-            sys.stderr.flush()
-            parts = dname.split('/')
-            new_out = outdir + '/' + parts[-1]
-            os.mkdir(new_out)
-            cleanup_all(dname, rm_slist, rm_plist, new_out)
-        return dname
-    except Exception as e:
-        print(e)
-        exit(1)
-
-
-parser = argparse.ArgumentParser(description='Use a list of patient IDs to remove their samples and rebuild a project.')
-parser.add_argument('-l', '--list', action='store', dest='plist', help='new-line separated patient list')
-parser.add_argument('-d', '--dir', action='store', dest='dir', help='data dir to search')
-parser.add_argument('-p', '--pattern', action='store', dest='pattern', help='dir search pattern - a suffix or prefix')
-parser.add_argument('-o', '--outdir', action='store', dest='outdir', help='output directory for projects that change')
-
-args = parser.parse_args()
-
-# read in patient list
-exc_list = []
-
-for line in open(args.plist):
-    exc_list.append(line.rstrip('\n'))
-
-# get dir list
-all_files = os.listdir(args.dir)
-dir_list = []
-pattern = '.*' + args.pattern + '.*'
-for fn in all_files:
-    if re.match(pattern, fn):
-        dir_list.append(args.dir + '/' + fn)
-# for pdir in dir_list:
-#     check_data_sheet(pdir, exc_list, args.outdir)
-with concurrent.futures.ProcessPoolExecutor(8) as executor:
-    results = {executor.submit(check_data_sheet, pdir, exc_list, args.outdir): pdir for pdir in dir_list}
-    for result in concurrent.futures.as_completed(results):
-        sys.stderr.write('Completed processing ' + result.result() + '\n')
diff --git a/utilities/append_add_on_cols.py b/utilities/append_add_on_cols.py
deleted file mode 100644
index b1bab39..0000000
--- a/utilities/append_add_on_cols.py
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/usr/bin/env python3
-
-import sys
-import argparse
-import os
-
-parser = argparse.ArgumentParser(description='Quickly add cols to data_clinical sheet')
-parser.add_argument('-d', '--datasheet', action='store', dest='data', help='datasheet to add to. Output will be to stdout')
-parser.add_argument('-i', '--col_id', action='store', dest='cid', help='Name of column to use for finding ID')
-parser.add_argument('-c', '--header', action='store', dest='header', help='tsv with cbio headers to add')
-parser.add_argument('-a', '--add', action='store', dest='add', help='csv with BS_ID or PT_ID key and values to add, in same order as header')
-
-args = parser.parse_args()
-
-# tack on new header cols
-datasheet = open(args.data)
-header = open(args.header)
-labels = ""
-for head in header:
-    entry = next(datasheet)
-    sys.stdout.write(entry.rstrip('\n') + "\t" + head)
-    labels = entry
-header.close()
-labels = labels.rstrip('\n').split('\t')
-cid = labels.index(args.cid)
-# store IDs and vals
-value_add = open(args.add)
-skip = next(value_add)
-add_dict = {}
-blanks = ""
-for entry in value_add:
-    data = entry.rstrip('\n').split(',')
-    add_dict[data[0]] = '\t'.join(data[1:])
-    # init blanks for when no new value to be added
-    if blanks == "":
-        for i in range(1, len(data), 1):
-            blanks += "\t"
-value_add.close()
-
-for line in datasheet:
-    info = line.rstrip('\n').split('\t')
-    sys.stdout.write(line.rstrip('\n'))
-    f = 0
-    c = 0
-    # IDs might be separated by ;
-    check = info[cid].split(";")
-    for i in range(len(check)):
-        if check[i] in add_dict:
-            f = 1
-            c = i
-    if f:
-        sys.stdout.write("\t" + add_dict[check[c]] + "\n")
-    else:
-        sys.stdout.write(blanks + "\n")
-datasheet.close()
diff --git a/utilities/get_survival_from_ds.py b/utilities/get_survival_from_ds.py
deleted file mode 100755
index 335b607..0000000
--- a/utilities/get_survival_from_ds.py
+++ /dev/null
@@ -1,59 +0,0 @@
-#!/usr/bin/env python3
-
-import sys
-from requests import request
-import argparse
-import concurrent.futures
-import json
-from time import sleep
-import pdb
-
-def query_ds(pt_id):
-    try:
-        pt_id = pt_id.rstrip('\n')
-        outcome_url = url + "outcomes?participant_id=" + pt_id
-        out_info = request('GET', outcome_url)
-        statuses = []
-        for result in out_info.json()['results']:
-            if result['vital_status'] is not None:
-                statuses.append([pt_id, result['vital_status'], str(result['age_at_event_days'])])
-        return statuses
-    except Exception as e:
-        sys.stderr.write(str(e))
-        sys.stderr.write('Could not get outcome info for ' + pt_id + "\n")
-        sys.exit(1)
-
-
-parser = argparse.ArgumentParser(description='Get vital status by PT ID')
-parser.add_argument('-u', '--kf-url', action='store', dest='url',
-                    help='Kids First data service url, i.e. https://kf-api-dataservice.kidsfirstdrc.org/')
-parser.add_argument('-p', '--pt-list', action='store', dest='pt_list',
-                    help='pt_id list')
-
-args = parser.parse_args()
-
-url = args.url
-pt_file = open(args.pt_list)
-out = open('outcomes.txt', 'w')
-
-x = 1
-m = 100
-
-with concurrent.futures.ThreadPoolExecutor(16) as executor:
-    results = {executor.submit(query_ds, entry): entry for entry in pt_file}
-    for result in concurrent.futures.as_completed(results):
-        if x % m == 0:
-            sys.stderr.write('Processed ' + str(x) + ' lines\n')
-            sys.stderr.flush()
-        outcomes = result.result()
-        try:
-            if len(outcomes) > 0:
-                for outcome in outcomes:
-                    out.write("\t".join(outcome) + "\n")
-            else:
-                sys.stderr.write('No outcome for a patient\n')
-        except Exception as e:
-            sys.stderr.write(str(e))
-            pdb.set_trace()
-            hold = 1
-        x += 1
diff --git a/utilities/match_pick_samples_from_files.py b/utilities/match_pick_samples_from_files.py
deleted file mode 100644
index 93f6c06..0000000
--- a/utilities/match_pick_samples_from_files.py
+++ /dev/null
@@ -1,30 +0,0 @@
-import pandas as pd
-import argparse
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description="Script to pick out samples from CNVS file when sample type in sample map file is DNA")
-
-    parser.add_argument('-ID','--sample_map',help="File to read in sample ID")
-    parser.add_argument('-cnv','--cnvs',help="File to pick cnv samples")
-    parser.add_argument('-o','--output_file',help="Name of output file")
-
-    args=parser.parse_args()
-
-    sample_map = pd.read_csv(args.sample_map, sep='\t')
-
-    #predicted = pd.read_csv("openpbta.predicted_cnv.txt", sep='\t')
-    predicted = pd.read_csv(args.cnvs, sep='\t')
-    cbio_ID_DNA=sample_map.loc[sample_map['Sample Type']=='DNA']['Cbio ID']
-
-    # add zero-filled columns for DNA samples missing from the CNV matrix
-    missing_IDs=[]
-    for col in cbio_ID_DNA:
-        check=0
-        for row in predicted.columns:
-            if row == col:
-                check=1
-        if check!=1:
-            missing_IDs.append(col)
-            predicted[col]=0
-
-    with open(args.output_file,'w') as write_tsv:
-        write_tsv.write(predicted.to_csv(sep='\t', index=False))
diff --git a/utilities/mount_link_sbfs.py b/utilities/mount_link_sbfs.py
deleted file mode 100644
index 3add267..0000000
--- a/utilities/mount_link_sbfs.py
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/usr/bin/env python3
-import sys
-import os
-import argparse
-import json
-import subprocess
-
-parser = argparse.ArgumentParser(description='Mount sbfs, create soft links based on file type')
-parser.add_argument('-m', '--manifest', action='store', dest='manifest', help='cavatica csv manifest file')
-parser.add_argument('-c', '--config', action='store', dest='config_file', help='json config file with data types and '
-                                                                               'data locations')
-parser.add_argument('-p', '--profile', action='store', dest='profile', help='cavatica profile name. requires '
-                                                                            '.sevenbridges/credentials file be present')
-args = parser.parse_args()
-with open(args.config_file) as f:
-    config_data = json.load(f)
-os.makedirs('mafs', exist_ok=True)
-if config_data['cna_flag'] == 1:
-    os.makedirs('cnvs', exist_ok=True)
-if config_data['rna_flag'] == 1:
-    os.makedirs('rsem', exist_ok=True)
-
-# create mounts and make links
-m_dict = {}
-manifest = open(args.manifest)
-# just last two cols required
-head = next(manifest)
-cwd = os.getcwd() + "/"
-for line in manifest:
-    info = line.rstrip('\n').split('\t')
-    fnames = info[-2].split(',')
-    projects = info[-1]
-    project = projects.split(',')
-    atype = info[4]
-    for i in range(len(project)):
-        (u,p) = project[i].split('/')
-        if p not in m_dict:
-            os.mkdir(p)
-            cmd = 'sbfs mount ' + p + ' --project ' + project[i] + ' --profile ' + args.profile + ' --read-only'
-            subprocess.call(cmd, shell=True)
-            mdir = cwd + p + "/projects/" + project[i] + "/"
-            m_dict[p] = mdir
-        cmd = 'ln -s ' + m_dict[p] + fnames[i]
-        if atype == 'DNA':
-            if fnames[i][-3:] == 'maf':
-                cmd += " mafs"
-            else:
-                cmd += " cnvs"
-        else:
-            cmd += " rsem"
-        subprocess.call(cmd, shell=True)
-out = open('sbfs_mount.txt', 'w')
-for p in m_dict:
-    out.write(m_dict[p] + '\n')
-out.close()
\ No newline at end of file
diff --git a/utilities/null_cols_from_maf.py b/utilities/null_cols_from_maf.py
deleted file mode 100755
index a75d9c9..0000000
--- a/utilities/null_cols_from_maf.py
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/usr/bin/env python3
-
-import sys
-import argparse
-
-parser = argparse.ArgumentParser(description='Null out columns from a MAF file')
-parser.add_argument('-c', '--col-names-file', action='store', dest='col_file', help='File with the newline-separated header names to null out')
-parser.add_argument('-m', '--maf_in', action='store', dest='maf_in', help='MAF file with cols to null. Output will be to stdout')
-
-args = parser.parse_args()
-
-col_file = args.col_file
-maf = args.maf_in
-
-col_names = []
-for line in open(col_file):
-    col_names.append(line.rstrip('\n'))
-
-col_idx = []
-maf_rd = open(maf)
-head = next(maf_rd)
-sys.stdout.write(head)
-head = next(maf_rd)
-header = head.rstrip('\n').split('\t')
-for col in col_names:
-    col_idx.append(header.index(col))
-sys.stdout.write(head)
-for data in maf_rd:
-    entry = data.rstrip('\n').split('\t')
-    for i in col_idx:
-        entry[i] = ""
-    sys.stdout.write("\t".join(entry) + "\n")
diff --git a/utilities/patch_pt_id_update.py b/utilities/patch_pt_id_update.py
deleted file mode 100644
index a313404..0000000
--- a/utilities/patch_pt_id_update.py
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/usr/bin/env python3
-
-import sys
-import argparse
-import os
-import json
-
-parser = argparse.ArgumentParser(description='Quickly patch KF PT ID update for PNOC WES')
-parser.add_argument('-d', '--data', action='store', dest='data', help='Patient file with patient IDs and external IDs')
-parser.add_argument('-s', '--sample', action='store', dest='sample', help='Sample file with patient IDs and external IDs')
-parser.add_argument('-j', '--json', action='store', dest='json', help='Input json file')
-
-args = parser.parse_args()
-with open(args.json) as f:
-    pt_data = json.load(f)
-pt_id_dict = {}
-pt_sheet = open('data_clinical_patient.txt', 'w')
-for line in open(args.data):
-    info = line.rstrip('\n').split('\t')
-    if info[1] in pt_data['participant']:
-        sys.stderr.write('Replacing old id ' + info[0] + ' with ' + pt_data['participant'][info[1]] + '\n')
-        pt_id_dict[info[0]] = pt_data['participant'][info[1]]
-        info[0] = pt_data['participant'][info[1]]
-    pt_sheet.write('\t'.join(info) + '\n')
-pt_sheet.close()
-sys.stderr.write('Finished with patient file.\n')
-samp_sheet = open('data_clinical_sample.txt', 'w')
-
-for line in open(args.sample):
-    info = line.rstrip('\n').split('\t')
-    if info[0] in pt_id_dict:
-        sys.stderr.write('Replacing old id ' + info[0] + ' with ' + pt_id_dict[info[0]] + '\n')
-        info[0] = pt_id_dict[info[0]]
-    samp_sheet.write('\t'.join(info) + '\n')
-samp_sheet.close()
-sys.stderr.write('Finished with sample file.\n')
diff --git a/utilities/patch_sample_id.py b/utilities/patch_sample_id.py
deleted file mode 100644
index d751614..0000000
--- a/utilities/patch_sample_id.py
+++ /dev/null
@@ -1,49 +0,0 @@
-#!/usr/bin/env python3
-
-import sys
-import argparse
-import os
-import re
-
-parser = argparse.ArgumentParser(description='Quickly patch sample id from step2 with data warehouse entries')
-parser.add_argument('-i', '--info', action='store', dest='info', help='A sample info sheet from step 2')
-parser.add_argument('-d', '--data', action='store', dest='data', help='Data Warehouse tsv with biospecimen ID, master aliquot id, and sample id')
-
-new_samp = {}
-args = parser.parse_args()
-
-data_in = open(args.data)
-head = next(data_in)
-header = head.rstrip('\n').split('\t')
-b_idx = header.index('kf_biospecimen_id')
-s_idx = header.index('sample_id')
-a_idx = header.index('master_parent_aliquot_id')
-
-for line in data_in:
-    meta = line.rstrip('\n').split('\t')
-    new_samp[meta[b_idx]] = meta[s_idx] + "_" + meta[a_idx]
-data_in.close()
-
-info_in = open(args.info)
-head = next(info_in)
-header = head.rstrip('\n').split('\t')
-b_idx = header.index('BS_ID')
-s_idx = header.index('external_sample_id')
-a_idx = header.index('analyte_type')
-
-sys.stdout.write(head)
-for line in info_in:
-    meta = line.rstrip('\n').split('\t')
-    if meta[b_idx] in new_samp:
-        meta[s_idx] = new_samp[meta[b_idx]]
-    else:
-        # modify original to fit new
-        sys.stderr.write("WARN: Entry for " + meta[b_idx] + " " + meta[s_idx] + " not found, reformatting\n")
-        meta[s_idx] = re.sub(r"-[TN]-", "_", meta[s_idx])
-        meta[s_idx] = re.sub(r"\..*$", "", meta[s_idx])
-        meta[s_idx] = re.sub(r"-[TN]$", "", meta[s_idx])
-    if meta[a_idx] == "Not Applicable":
-        meta[a_idx] = "DNA"
-    print("\t".join(meta))
-info_in.close()
-
diff --git a/utilities/patch_survival_to_data_patient.py b/utilities/patch_survival_to_data_patient.py
deleted file mode 100755
index 6a0a3b2..0000000
--- a/utilities/patch_survival_to_data_patient.py
+++ /dev/null
@@ -1,77 +0,0 @@
-#!/usr/bin/env python3
-
-import sys
-import argparse
-import os
-import math
-import pdb
-
-parser = argparse.ArgumentParser(description='Add survival data to the data_clinical_patient.txt sheet. Run from dir level of disease subdirs')
-parser.add_argument('-d', '--data', action='store', dest='data',
-                    help='File with patient IDs and survival data, can be gotten by running get_survival_from_ds.py')
-parser.add_argument('-l', '--list', action='store', dest='flist', help='Data file list')
-parser.add_argument('-m', '--tum-bs-id', action='store', dest='tum_file', help='Tum bs id file from step 2 to properly calculate survival')
-parser.add_argument('-o', '--outdir', action='store', dest='outdir', help='output dir - MAKE THIS DIFFERENT FROM INPUT DIR')
-parser.add_argument('-t', '--id-type', action='store', dest='itype', help='kf or ext')
-parser.add_argument('-f', '--dfs-flag', action='store', dest='dflag', help='1 if DFS data available, 0 if not')
-
-
-def calc_os_months(surv_data_val):
-    (status, status_days) = surv_data_val.split('\t')
-    # calc months basically as average length of a month
-    as_months = status_days
-    if as_months != "NA":
-        as_months = math.floor(int(status_days)/(365.25/12))
-    return status + '\t' + str(as_months)
-
-
-args = parser.parse_args()
-
-added_header = ['OS_STATUS\tOS_MONTHS\tDFS_STATUS\tDFS_MONTHS',
-                'Overall patient survival status\tOverall survival in months since initial diagnosis\tDisease free status since initial treatment\tDisease free (months) since initial treatment',
-                'STRING\tNUMBER\tSTRING\tNUMBER',
-                '1\t1\t1\t1',
-                'OS_STATUS\tOS_MONTHS\tDFS_STATUS\tDFS_MONTHS']
-if int(args.dflag) == 0:
-    added_header = ['OS_STATUS\tOS_MONTHS',
-                    'Overall patient survival status\tOverall survival in months since initial diagnosis',
-                    'STRING\tNUMBER',
-                    '1\t1',
-                    'OS_STATUS\tOS_MONTHS']
-
-if not os.path.isdir(args.outdir):
-    os.mkdir(args.outdir)
-surv_dict = {}
-surv_data = open(args.data)
-skip_head = next(surv_data)
-# input format: pt ID <tab> OS status <tab> age in days at that status
-for line in surv_data:
-    data = line.rstrip('\n').split('\t')
-    surv_dict[data[0]] = '\t'.join([data[1], data[2]])
-surv_data.close()
-id_idx = {'kf': 0, 'ext': 1}
-for fname in open(args.flist):
-    fname = fname.rstrip('\n')
-    subdir = os.path.dirname(fname)
-    newdir = args.outdir + '/' + subdir
-    if not os.path.isdir(newdir):
-        os.makedirs(newdir, exist_ok=True)
-    cur = open(fname)
-    newdata = open(newdir + '/data_clinical_patient.txt', 'w')
-    # pdb.set_trace()
-    for i in range(0, len(added_header)):
-        head = next(cur)
-        newdata.write(head.rstrip('\n') + '\t' + added_header[i] + '\n')
-    for line in cur:
-        newdata.write(line.rstrip('\n'))
-        data = line.split('\t')
-        pt_id = data[id_idx[args.itype]]
-
-        if pt_id in surv_dict:
-            surv_data_value = calc_os_months(surv_dict[pt_id])
-            newdata.write('\t' + surv_data_value + '\n')
-        else:
-            newdata.write('\t\t\t\t\n')
-            sys.stderr.write('No survival entry for ' + pt_id + ' in ' + fname + '\n')
-    cur.close()
-    newdata.close()
diff --git a/utilities/rsem_replace_data_rna.py b/utilities/rsem_replace_data_rna.py
deleted file mode 100644
index b7b4e62..0000000
--- a/utilities/rsem_replace_data_rna.py
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/usr/bin/env python3
-
-import sys
-import argparse
-import os
-
-parser = argparse.ArgumentParser(description='Updates outdated data in data_rna_seq sheet from new rsem_merged file')
-parser.add_argument('-m', '--mapFileLoc', action='store', dest='map', help='RNA sample name map file')
-parser.add_argument('-r', '--rsem', action='store', dest='rsem', help='rsem file with new data')
-parser.add_argument('-d', '--data', action='store', dest='data', help='data_rna_seq_v2_mrna.txt sheet to update')
-parser.add_argument('-o', '--outdir', action='store', dest='outdir', help='outputdir')
-
-args = parser.parse_args()
-map_dict = {}
-for line in open(args.map):
-    (new_name, old_name) = line.rstrip('\n').split('\t')
-    map_dict[old_name] = new_name
-
-rsem_data = {}
-
-rsem_file = open(args.rsem)
-head = next(rsem_file)
-slist = []
-header = head.rstrip('\n').split('\t')
-for line in rsem_file:
-    info = line.rstrip('\n').split('\t')
-    rsem_data[info[1]] = {}
-    for i in range(2, len(info)):
-        samp = map_dict[header[i]]
-        slist.append(samp)
-        rsem_data[info[1]][samp] = info[i]
-
-rsem_file.close()
-
-outdir = args.outdir + '/' + os.path.dirname(args.data)
-if not os.path.isdir(outdir):
-    os.mkdir(outdir)
-update = open(outdir + '/data_rna_seq_v2_mrna.txt', 'w')
-
-old_data = open(args.data)
-head = next(old_data)
-update.write(head)
-header = head.rstrip('\n').split('\t')
-
-h_i = []
-for i in range(1, len(header)):
-    if header[i] in slist:
-        h_i.append(i)
-        sys.stderr.write('Found sample ' + header[i] + ' to update at index ' + str(i) + '\n')
-for line in old_data:
-    info = line.rstrip('\n').split('\t')
-    for i in h_i:
-        info[i] = rsem_data[info[0]][header[i]]
-    update.write('\t'.join(info) + '\n')
-update.close()
diff --git a/utilities/subset_table_by_id.py b/utilities/subset_table_by_id.py
deleted file mode 100755
index 3bbe6a2..0000000
--- a/utilities/subset_table_by_id.py
+++ /dev/null
@@ -1,51 +0,0 @@
-#!/usr/bin/env python3
-
-import sys
-import argparse
-import os
-import json
-import pdb
-
-parser = argparse.ArgumentParser(description='Subset a table using a newline-separated list of IDs')
-parser.add_argument('-i', '--input-file', action='store', dest='in_file', help='File to subset')
-parser.add_argument('-l', '--id-list', action='store', dest='id_list', help='list of ids separated by new line')
-parser.add_argument('-c', '--columns', action='store', dest='cols', help='csv cols to search')
-parser.add_argument('-n', '--num-header-lines', action='store', dest='n', help='number of header lines')
-parser.add_argument('-s', '--separator', action='store', dest='s', help='separator type - \'comma\' or \'tab\'')
-parser.add_argument('-o', '--out-dir', action='store', dest='o', help='output dir name - should be different if file being processed is in same dir')
-parser.add_argument('-x', '--extra_split', action='store', dest='x', help='split id name on ;, special case')
-
-args = parser.parse_args()
-id_file = open(args.id_list)
-id_dict = {}
-sep = '\t'
-if args.s == "comma":
-    sep = ','
-n = int(args.n)
-for line in id_file:
-    id_dict[line.rstrip('\n')] = 0
-
-cols = list(map(int, args.cols.split(',')))
-#pdb.set_trace()
-out_dir = args.o
-new_fn = out_dir + os.path.basename(args.in_file)
-new_out = open(new_fn, 'w')
-in_file = open(args.in_file)
-for i in range(n):
-    header = next(in_file)
-    new_out.write(header)
-for line in in_file:
-    info = line.rstrip('\n').split(sep)
-    if args.x is None:
-        for i in cols:
-            if info[i] in id_dict:
-                id_dict[info[i]] = 1
-                new_out.write(line)
-    else:
-        for i in cols:
-            subset = info[i].split(';')
-            for j in subset:
-                if j in id_dict:
-                    id_dict[info[i]] = 1
-                    new_out.write(line)
-                    break
diff --git a/utilities/swap_samples.py b/utilities/swap_samples.py
deleted file mode 100644
index ab9e5f7..0000000
--- a/utilities/swap_samples.py
+++ /dev/null
@@ -1,68 +0,0 @@
-#!/usr/bin/env python3
-
-import sys
-import argparse
-import os
-
-parser = argparse.ArgumentParser(description='Swaps data between two projects. Run from dir level of disease subdirs')
-parser.add_argument('-a', '--datainput1', action='store', dest='d1', help='First sheet with sample to swap')
-parser.add_argument('-b', '--datainput2', action='store', dest='d2', help='Second sheet with sample to swap')
-parser.add_argument('-x', '--sample1', action='store', dest='x', help='sample 1 name to swap with...')
-parser.add_argument('-y', '--sample2', action='store', dest='y', help='sample 2 name to swap')
-parser.add_argument('-o', '--outdir', action='store', dest='outdir', help='outputdir')
-parser.add_argument('-f', '--flag', action='store', dest='f', help='0 to keep original sample names, 1 to swap '
-                                                                   'names too')
-
-args = parser.parse_args()
-d1_out_dir = args.outdir + '/' + os.path.dirname(args.d1)
-d2_out_dir = args.outdir + '/' + os.path.dirname(args.d2)
-if not os.path.isdir(d1_out_dir):
-    os.mkdir(d1_out_dir)
-if not os.path.isdir(d2_out_dir):
-    os.mkdir(d2_out_dir)
-
-d1_out_fh = open(d1_out_dir + '/' + os.path.basename(args.d1), 'w')
-d2_out_fh = open(d2_out_dir + '/' + os.path.basename(args.d2), 'w')
-
-d1_in = open(args.d1)
-d2_in = open(args.d2)
-# find sample one to swap
-h1 = next(d1_in)
-h1_info = h1.rstrip('\n').split('\t')
-h1_i = 0
-for i in range(0, len(h1_info)):
-    if h1_info[i] == args.x:
-        h1_i = i
-        sys.stderr.write('Found sample ' + args.x + ' to swap in ' + args.d1 + ' at position ' + str(i) + '\n')
-        if args.f == '1':
-            h1_info[i] = args.y
-            sys.stderr.write('Swapped header name to ' + args.y + '\n')
-        break
-d1_out_fh.write('\t'.join(h1_info) + '\n')
-
-#find sample 2 to swap
-h2 = next(d2_in)
-h2_info = h2.rstrip('\n').split('\t')
-h2_i = 0
-for i in range(0, len(h2_info)):
-    if h2_info[i] == args.y:
-        h2_i = i
-        sys.stderr.write('Found sample ' + args.y + ' to swap in ' + args.d2 + ' at position ' + str(i) + '\n')
-        if args.f == '1':
-            h2_info[i] = args.x
-            sys.stderr.write('Swapped header name to ' + args.x + '\n')
-        break
-d2_out_fh.write('\t'.join(h2_info) + '\n')
-
-for line1 in d1_in:
-    line2 = next(d2_in)
-
-    info1 = line1.rstrip('\n').split('\t')
-    info2 = line2.rstrip('\n').split('\t')
-    t1 = info1[h1_i]
-    info1[h1_i] = info2[h2_i]
-    info2[h2_i] = t1
-    d1_out_fh.write('\t'.join(info1) + '\n')
-    d2_out_fh.write('\t'.join(info2) + '\n')
-d1_out_fh.close()
-d2_out_fh.close()
\ No newline at end of file
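
For reference, below is a minimal sketch (an editor's illustration, not part of the removed script or this patch) of the column swap that utilities/swap_samples.py performed on two tab-separated cBioPortal matrices. It assumes both sheets have the same number of body rows; the paths and sample names in the commented call are hypothetical.

#!/usr/bin/env python3
# Sketch of the swap_samples.py behavior: exchange one sample column between
# two tab-separated matrices. Header names are swapped only when
# swap_names=True, mirroring the removed script's -f 1 flag.
import csv

def swap_column(file_a, file_b, samp_a, samp_b, out_a, out_b, swap_names=False):
    with open(file_a) as fa, open(file_b) as fb:
        rows_a = list(csv.reader(fa, delimiter='\t'))
        rows_b = list(csv.reader(fb, delimiter='\t'))
    ia = rows_a[0].index(samp_a)  # position of sample A in sheet A's header
    ib = rows_b[0].index(samp_b)  # position of sample B in sheet B's header
    if swap_names:
        rows_a[0][ia], rows_b[0][ib] = rows_b[0][ib], rows_a[0][ia]
    # swap data values row by row; assumes equal row counts, as the original did
    for ra, rb in zip(rows_a[1:], rows_b[1:]):
        ra[ia], rb[ib] = rb[ib], ra[ia]
    for rows, out in ((rows_a, out_a), (rows_b, out_b)):
        with open(out, 'w', newline='') as fh:
            csv.writer(fh, delimiter='\t').writerows(rows)

# Hypothetical usage:
# swap_column('proj1/data_CNA.txt', 'proj2/data_CNA.txt', 'SAMPLE_X', 'SAMPLE_Y',
#             'out/proj1_data_CNA.txt', 'out/proj2_data_CNA.txt', swap_names=True)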