From b3555fce8a7b843c0837ac5bc7bbeb1e716742c1 Mon Sep 17 00:00:00 2001
From: Miguel Brown
Date: Mon, 16 Oct 2023 10:15:46 -0400
Subject: [PATCH 1/2] :hammer: added file_type created from reannotation

---
 STUDY_CONFIGS/pbta_all_case_meta_config.json | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/STUDY_CONFIGS/pbta_all_case_meta_config.json b/STUDY_CONFIGS/pbta_all_case_meta_config.json
index 88896a9..c79dfc0 100644
--- a/STUDY_CONFIGS/pbta_all_case_meta_config.json
+++ b/STUDY_CONFIGS/pbta_all_case_meta_config.json
@@ -221,17 +221,17 @@
     "manifests": {
         "cbtn": {
             "table": "bix_genomics_file.sd_bhjxbdqk-genomics_file_manifest",
-            "file_type": ["RSEM_gene","annofuse_filtered_fusions_tsv","annotated_public_outputs","ctrlfreec_pval","ctrlfreec_info","ctrlfreec_bam_seg"],
+            "file_type": ["RSEM_gene","annofuse_filtered_fusions_tsv","annotated_public_outputs","consensus_public_outputs","ctrlfreec_pval","ctrlfreec_info","ctrlfreec_bam_seg"],
             "out_file": "cbtn_genomics_file_manifest.txt"
         },
         "x01": {
             "table": "bix_genomics_file.sd_bhjxbdqk_x01-genomics_file_manifest",
-            "file_type": ["RSEM_gene","annofuse_filtered_fusions_tsv","annotated_public_outputs","ctrlfreec_pval","ctrlfreec_info","ctrlfreec_bam_seg"],
+            "file_type": ["RSEM_gene","annofuse_filtered_fusions_tsv","annotated_public_outputs","consensus_public_outputs","ctrlfreec_pval","ctrlfreec_info","ctrlfreec_bam_seg"],
            "out_file": "x01_genomics_file_manifest.txt"
         },
         "cbtn_extra": {
             "table": "bix_genomics_file.sd_bhjxbdqk_x01_extra-genomics_file_manifest",
-            "file_type": ["RSEM_gene","annofuse_filtered_fusions_tsv","annotated_public_outputs","ctrlfreec_pval","ctrlfreec_info","ctrlfreec_bam_seg"],
+            "file_type": ["RSEM_gene","annofuse_filtered_fusions_tsv","annotated_public_outputs","consensus_public_outputs","ctrlfreec_pval","ctrlfreec_info","ctrlfreec_bam_seg"],
             "out_file": "cbtn_extra_genomics_file_manifest.txt"
         },
         "pnoc": {

From 8d468218d77c561f055948df515c32ee69a0fe63 Mon Sep 17 00:00:00 2001
From: Miguel Brown
Date: Thu, 19 Oct 2023 09:54:19 -0400
Subject: [PATCH 2/2] :heavy_minus_sign: removed many outdated refs and scripts

---
 REFS/cell_line_supplemental.txt | 36 --
 .../1a_merge_cavatica_manifests.py | 40 --
 .../1b_query_file_manifest.py | 88 -----
 .../2_query_ds_by_bs_id.py | 173 ---------
 .../3b_create_cbio_id_fname_tbl.py | 84 ----
 .../2021-Sep-7_ETL_DEPRECATED/4_merge_maf.py | 88 -----
 .../5a_cnv_genome2gene.py | 71 ----
 .../2021-Sep-7_ETL_DEPRECATED/5b_merge_cnv.py | 120 ------
 .../5c_cnv_discrete.py | 129 -------
 .../2021-Sep-7_ETL_DEPRECATED/5d_merge_seg.py | 73 ----
 .../6_merge_rename_rsem.py | 127 ------
 .../7_convert_fusion.py | 99 -----
 .../8_organize_upload_packages.py | 218 -----------
 .../sample_id_builder_helper.py | 68 ----
 scripts/dev/get_files_from_manifest.py | 77 ----
 scripts/v1_etl_deprecated/0_ETL_wrapper.py | 318 ---------------
 scripts/v1_etl_deprecated/1_query_cavatica.py | 86 -----
 .../v1_etl_deprecated/6b_z_score_transform.py | 60 ---
 .../v1_etl_deprecated/7_createLoadingFiles.R | 365 ------------------
 scripts/v1_etl_deprecated/CNALoadingHelper.R | 89 -----
 scripts/v1_etl_deprecated/RNALoadingHelper.R | 171 --------
 scripts/v1_etl_deprecated/add_case_lists.py | 81 ----
 .../entrez_id_fusion_format.R | 147 -------
 .../v1_etl_deprecated/merge_star_fusion.py | 44 ---
 utilities/add_fusion_to_sample_sheets.py | 39 --
 utilities/adjust_patient_inputs.py | 194 ----------
 utilities/append_add_on_cols.py | 55 ---
 utilities/get_survival_from_ds.py | 59 ---
 utilities/match_pick_samples_from_files.py | 30 --
 utilities/mount_link_sbfs.py | 55 ---
 utilities/null_cols_from_maf.py | 32 --
 utilities/patch_pt_id_update.py | 36 --
 utilities/patch_sample_id.py | 49 ---
 utilities/patch_survival_to_data_patient.py | 77 ----
 utilities/rsem_replace_data_rna.py | 55 ---
 utilities/subset_table_by_id.py | 51 ---
 utilities/swap_samples.py | 68 ----
 37 files changed, 3652 deletions(-)
 delete mode 100644 REFS/cell_line_supplemental.txt
 delete mode 100755 scripts/2021-Sep-7_ETL_DEPRECATED/1a_merge_cavatica_manifests.py
 delete mode 100755 scripts/2021-Sep-7_ETL_DEPRECATED/1b_query_file_manifest.py
 delete mode 100755 scripts/2021-Sep-7_ETL_DEPRECATED/2_query_ds_by_bs_id.py
 delete mode 100755 scripts/2021-Sep-7_ETL_DEPRECATED/3b_create_cbio_id_fname_tbl.py
 delete mode 100755 scripts/2021-Sep-7_ETL_DEPRECATED/4_merge_maf.py
 delete mode 100755 scripts/2021-Sep-7_ETL_DEPRECATED/5a_cnv_genome2gene.py
 delete mode 100755 scripts/2021-Sep-7_ETL_DEPRECATED/5b_merge_cnv.py
 delete mode 100755 scripts/2021-Sep-7_ETL_DEPRECATED/5c_cnv_discrete.py
 delete mode 100644 scripts/2021-Sep-7_ETL_DEPRECATED/5d_merge_seg.py
 delete mode 100755 scripts/2021-Sep-7_ETL_DEPRECATED/6_merge_rename_rsem.py
 delete mode 100755 scripts/2021-Sep-7_ETL_DEPRECATED/7_convert_fusion.py
 delete mode 100755 scripts/2021-Sep-7_ETL_DEPRECATED/8_organize_upload_packages.py
 delete mode 100755 scripts/2021-Sep-7_ETL_DEPRECATED/sample_id_builder_helper.py
 delete mode 100644 scripts/dev/get_files_from_manifest.py
 delete mode 100755 scripts/v1_etl_deprecated/0_ETL_wrapper.py
 delete mode 100755 scripts/v1_etl_deprecated/1_query_cavatica.py
 delete mode 100644 scripts/v1_etl_deprecated/6b_z_score_transform.py
 delete mode 100644 scripts/v1_etl_deprecated/7_createLoadingFiles.R
 delete mode 100644 scripts/v1_etl_deprecated/CNALoadingHelper.R
 delete mode 100644 scripts/v1_etl_deprecated/RNALoadingHelper.R
 delete mode 100755 scripts/v1_etl_deprecated/add_case_lists.py
 delete mode 100644 scripts/v1_etl_deprecated/entrez_id_fusion_format.R
 delete mode 100755 scripts/v1_etl_deprecated/merge_star_fusion.py
 delete mode 100644 utilities/add_fusion_to_sample_sheets.py
 delete mode 100755 utilities/adjust_patient_inputs.py
 delete mode 100644 utilities/append_add_on_cols.py
 delete mode 100755 utilities/get_survival_from_ds.py
 delete mode 100644 utilities/match_pick_samples_from_files.py
 delete mode 100644 utilities/mount_link_sbfs.py
 delete mode 100755 utilities/null_cols_from_maf.py
 delete mode 100644 utilities/patch_pt_id_update.py
 delete mode 100644 utilities/patch_sample_id.py
 delete mode 100755 utilities/patch_survival_to_data_patient.py
 delete mode 100644 utilities/rsem_replace_data_rna.py
 delete mode 100755 utilities/subset_table_by_id.py
 delete mode 100644 utilities/swap_samples.py

diff --git a/REFS/cell_line_supplemental.txt b/REFS/cell_line_supplemental.txt
deleted file mode 100644
index 4cbdcce..0000000
--- a/REFS/cell_line_supplemental.txt
+++ /dev/null
@@ -1,36 +0,0 @@
-BS_RXP2ZRQT susp
-BS_40MP5BWR susp
-BS_TX8C5VAJ adh
-BS_GXTFW99H adh
-BS_M659G06J adh
-BS_5GNQC2FF adh
-BS_ERFMPQN3 adh
-BS_PGK832G2 adh
-BS_DDC2WVJY susp
-BS_ST5P69KR susp
-BS_AFBPM6CN susp
-BS_PNYN0AYD susp
-BS_TF5TTEXH susp
-BS_E60JZ9Z3 susp
-BS_XMP9XNR9 adh
-BS_P9JP6JFA adh
-BS_HM5GFJN8 susp
-BS_2A162JH9 susp
-BS_KVPJVJR7 susp
-BS_K5WG6WGM susp
-BS_PKZ1HWNB adh
-BS_M8EA6R2A adh
-BS_68TZMZH1 adh
-BS_0RQ4P069 adh
-BS_853PNV7P adh
-BS_DRY58DTF adh
-BS_JGKRN7NA susp
-BS_MX23ZY0Y susp
-BS_QWM9BPDY adh
-BS_BWBDH9GM adh
-BS_CZRA594T adh
-BS_59ZJWJTF adh
-BS_FJEZ3ASV susp
-BS_8ZD6J47V susp
-BS_ERAWW3H7 susp
-BS_QYPHA40N susp
diff --git a/scripts/2021-Sep-7_ETL_DEPRECATED/1a_merge_cavatica_manifests.py b/scripts/2021-Sep-7_ETL_DEPRECATED/1a_merge_cavatica_manifests.py
deleted file mode 100755
index fcce9b2..0000000
--- a/scripts/2021-Sep-7_ETL_DEPRECATED/1a_merge_cavatica_manifests.py
+++ /dev/null
@@ -1,40 +0,0 @@
-#!/usr/bin/env python3
-
-import sys
-import argparse
-import concurrent.futures
-import re
-import json
-import pdb
-
-parser = argparse.ArgumentParser(description='Get all relevant analyzed file outputs from cavatica manifest.')
-parser.add_argument('-m', '--manifest', action='store', dest='manifest', help='cavatica manifest file csv list')
-
-args = parser.parse_args()
-out_head = 'id,name,project,Kids First Biospecimen ID Tumor,Kids First Biospecimen ID Normal,Kids First Biospecimen ID\n'
-out = open('1a_cavatica_merged_manifest.csv', 'w')
-out.write(out_head)
-for manifests in args.manifest.split(','):
-    meta = open(manifests)
-    rel_head = ["Kids First Biospecimen ID Tumor", "Kids First Biospecimen ID Normal", "Kids First Biospecimen ID"]
-    head = next(meta)
-    ind = []
-
-    # get index positions of where relevant bs ids would be form tumor dna, normal dna, and rna
-    header = head.rstrip('\n').split(',')
-    for i in range(len(rel_head)):
-        try:
-            ind.append(header.index(rel_head[i]))
-        except:
-            sys.stderr.write(rel_head[i] + " not in this manifest, assiging 0\n")
-            ind.append(0)
-    for line in meta:
-        info = line.rstrip('\n').split(',')
-        out.write(','.join(info[0:3]))
-        for idx in ind:
-            if idx == 0:
-                out.write(',')
-            else:
-                out.write(',' + info[idx])
-        out.write('\n')
-out.close()
\ No newline at end of file
diff --git a/scripts/2021-Sep-7_ETL_DEPRECATED/1b_query_file_manifest.py b/scripts/2021-Sep-7_ETL_DEPRECATED/1b_query_file_manifest.py
deleted file mode 100755
index 1f9d860..0000000
--- a/scripts/2021-Sep-7_ETL_DEPRECATED/1b_query_file_manifest.py
+++ /dev/null
@@ -1,88 +0,0 @@
-#!/usr/bin/env python3
-
-import sys
-import argparse
-import concurrent.futures
-import re
-import json
-
-parser = argparse.ArgumentParser(description='Get all relevant analyzed file outputs from cavatica manifest.')
-parser.add_argument('-o', '--output', action='store', dest='output', help='output basename name')
-parser.add_argument('-m', '--manifest', action='store', dest='manifest', help='cavatica csv manifest file, may be from step 1a')
-parser.add_argument('-b', '--blacklist', action='store', dest='blacklist', help='Optional bs id blacklist')
-parser.add_argument('-c', '--config', action='store', dest='config_file', help='json config file with data types and '
-                    'data locations')
-args = parser.parse_args()
-with open(args.config_file) as f:
-    config_data = json.load(f)
-meta = open(args.manifest)
-rel_head = {"Kids First Biospecimen ID Tumor": "tum", "Kids First Biospecimen ID Normal": "norm", "Kids First Biospecimen ID": "rna"}
-head = next(meta)
-ind = {}
-# get index positions of where relevant bs ids would be form tumor dna, normal dna, and rna
-header = head.rstrip('\n').split(',')
-for i in range(len(header)):
-    if header[i] in rel_head:
-        ind[rel_head[header[i]]] = i
-
-old_out = open(args.output + '_task_list.txt', 'w')
-rna_ext_dict = config_data['rna_ext_list']
-rna_ext_list = []
-for key in rna_ext_dict:
-    rna_ext_list.append(rna_ext_dict[key])
-dna_ext_dict = config_data['dna_ext_list']
-dna_ext_list = []
-for key in dna_ext_dict:
-    dna_ext_list.append(dna_ext_dict[key])
-
-skip_dict = {}
-if args.blacklist is not None:
-    blist = open(args.blacklist)
-    for line in blist:
-        skip_dict[line.rstrip('\n')] = 0
-
-old_out.write('T/CL BS ID\tNorm BS ID\tTask Name\tTask ID\tAnalyte Type\tRelevant Outputs\tSource Project\n')
-cav_dict = {}
-for line in meta:
-    info = line.rstrip('\n').split(',')
-    parts = info[1].split('.')
-    ext = ".".join(parts[1:])
-    skip = 0
-
-    for key in ind:
-        if info[ind[key]] in skip_dict:
-            skip = 1
-            skip_dict[info[ind[key]]] = 1
-            sys.stderr.write('BS ID in blacklist. Skipping ' + line)
-    file_id = info[0]
-    fname = info[1]
-    project = info[2]
-    if ext in rna_ext_list and skip == 0:
-        bs_id = info[ind['rna']]
-        if bs_id not in cav_dict:
-            cav_dict[bs_id] = {}
-            cav_dict[bs_id]['atype'] = "RNA"
-            cav_dict[bs_id]['fname'] = []
-            cav_dict[bs_id]['bs_id'] = bs_id
-            cav_dict[bs_id]['project'] = []
-        cav_dict[bs_id]['project'].append(project)
-        cav_dict[bs_id]['fname'].append(fname)
-    elif ext in dna_ext_list and skip == 0:
-        t_bs_id = info[ind['tum']]
-        n_bs_id = info[ind['norm']]
-        dkey = t_bs_id + n_bs_id
-        if dkey not in cav_dict:
-            cav_dict[dkey] = {}
-            cav_dict[dkey]['atype'] = "DNA"
-            cav_dict[dkey]['fname'] = []
-            cav_dict[dkey]['t_bs_id'] = t_bs_id
-            cav_dict[dkey]['n_bs_id'] = n_bs_id
-            cav_dict[dkey]['project'] = []
-        cav_dict[dkey]['project'].append(project)
-        cav_dict[dkey]['fname'].append(fname)
-for tid in cav_dict:
-    if cav_dict[tid]['atype'] == 'RNA':
-        old_out.write("\t".join((cav_dict[tid]['bs_id'], "NA","RNA_TASK", tid, "RNA", ",".join(cav_dict[tid]['fname']), ",".join(cav_dict[tid]['project']))) + '\n')
-    else:
-        old_out.write("\t".join((cav_dict[tid]['t_bs_id'], cav_dict[tid]['n_bs_id'],"DNA_TASK", tid, "DNA", ",".join(cav_dict[tid]['fname']), ','.join(cav_dict[tid]['project']))) + '\n')
-old_out.close()
\ No newline at end of file
diff --git a/scripts/2021-Sep-7_ETL_DEPRECATED/2_query_ds_by_bs_id.py b/scripts/2021-Sep-7_ETL_DEPRECATED/2_query_ds_by_bs_id.py
deleted file mode 100755
index 69644b0..0000000
--- a/scripts/2021-Sep-7_ETL_DEPRECATED/2_query_ds_by_bs_id.py
+++ /dev/null
@@ -1,173 +0,0 @@
-#!/usr/bin/env python3
-
-import sys
-from requests import request
-import argparse
-import concurrent.futures
-import json
-from time import sleep
-import pdb
-
-
-def mt_query_calls(line):
-    try:
-        info = line.rstrip('\n').split('\t')
-        (tum_bs_id, norm_bs_id) = (info[0], info[1])
-        norm_str = ''
-        if norm_bs_id != 'NA' and norm_bs_id not in norm_seen:
-            bs_info = query_dataservice_bs_id(url, norm_bs_id, bs_attrs, pt_attrs, dx_attrs, outcome_attrs)
-            norm_str = norm_bs_id + '\t' + '\t'.join(bs_info) + '\n'
-            norm_seen[norm_bs_id] = 1
-        bs_info = query_dataservice_bs_id(url, tum_bs_id, bs_attrs, pt_attrs, dx_attrs, outcome_attrs)
-        tum_str = tum_bs_id + '\t' + '\t'.join(bs_info) + '\n'
-        return tum_str, norm_str
-    except Exception as e:
-        sys.stderr.write('Error ' + str(e) + ' occurred while trying to process ' + line)
-        exit(1)
-
-
-def get_obj(src_url):
-    """
-    GET requested object from url. It will occassionally fail so pausing, and trying again remedies the situation
-    """
-    try:
-        req_info = request('GET', src_url)
-        return req_info
-    except Exception as e:
-        sys.stderr.write('Got error ' + str(e) + ',could not retrieve info from ' + src_url + ', pausing and retrying\n')
-        sys.stderr.flush()
-        sleep(1)
-        req_info = request('GET', src_url)
-        return req_info
-
-
-def process_attr_dict(attr_dict, info_dict):
-    """
-    Iterate through a requested attribute dictionary and append to an output string (a list that wil be converted to tsv)
-    """
-    temp = []
-    for attr in attr_dict:
-        res = info_dict.json()['results'][attr]
-        if res is None:
-            res = 'NULL'
-        temp.append(res)
-    return temp
-
-
-def query_dataservice_bs_id(url, bs_id, bs_attrs, pt_attrs, dx_attrs, outcome_attrs):
-
-    bs_url = url + '/biospecimens/' + bs_id
-    bs_info = get_obj(bs_url)
-    result = []
-    if bs_info.json()['_status']['code'] == 404:
-        result.append(bs_info.json()['_status']['message'])
-        sys.stderr.write(bs_id + ' not found!\n')
-        sys.stderr.flush()
-        return result
-    elif not bs_info.json()['results']['visible']:
-        sys.stderr.write(bs_id + ' was set to hide. skipping!\n')
-        sys.stderr.flush()
-        return result
-    dx_url = url + bs_info.json()['_links']['diagnoses']
-    dx_dict = {}
-    # dx can sometimes have multiple values, so dict is used to store them all
-    dx_dict = {}
-    dx_obj = get_obj(dx_url) if len(dx_attrs) > 0 else 'NoDX'
-
-    pt_url = url + bs_info.json()['_links']['participant']
-    pt_info = get_obj(pt_url)
-    result.append(pt_info.json()['results']['kf_id'])
-
-    outcome_url = url + pt_info.json()['_links']['outcomes']
-    outcome_info = get_obj(outcome_url)
-
-    result.extend(process_attr_dict(bs_attrs, bs_info))
-    result.extend(process_attr_dict(pt_attrs, pt_info))
-
-    for attr in bs_attrs:
-        res = bs_info.json()['results'][attr]
-        if res is None:
-            res = 'NULL'
-        result.append(str(res))
-    for attr in pt_attrs:
-        res = pt_info.json()['results'][attr]
-        if res is None:
-            res = 'NULL'
-        result.append(res)
-    for attr in dx_attrs:
-        dx_dict[attr] = []
-        for cur_res in dx_obj.json()['results']:
-            dx_dict[attr].append(str(cur_res[attr]))
-        result.append(';'.join(dx_dict[attr]))
-    # default to last outcome in list
-    try:
-        if len(outcome_info.json()['results']):
-            for attr in outcome_attrs:
-                res = outcome_info.json()['results'][-1][attr]
-                if res is None:
-                    res = 'NULL'
-                result.append(str(res))
-        else:
-            for attr in outcome_attrs:
-                res = 'NULL'
-                result.append(str(res))
-            sys.stderr.write("WARN: " + pt_info.json()['results']['kf_id'] + " has no outcome data\n")
-    except Exception as e:
-        sys.stderr.write(str(e) + "\n")
-    return result
-
-
-parser = argparse.ArgumentParser(description='Script to walk through data service and grab all relevant metadata'
-                                 ' by bs id.')
-parser.add_argument('-u', '--kf-url', action='store', dest='url',
-                    help='Kids First data service url', default='https://kf-api-dataservice.kidsfirstdrc.org')
-parser.add_argument('-c', '--cavatica', action='store', dest='cav',
-                    help='file with task info from cavatica (see step 1)')
-parser.add_argument('-j', '--config', action='store', dest='config_file', help='json config file with data types and '
-                    'data locations')
-
-args = parser.parse_args()
-with open(args.config_file) as f:
-    config_data = json.load(f)
-file_list = args.cav
-bs_attrs = ['external_aliquot_id', 'analyte_type', 'source_text_tissue_type', 'source_text_tumor_descriptor',
-            'composition', 'external_sample_id']
-pt_attrs = ['external_id', 'gender', 'ethnicity', 'race']
-outcome_attrs = ['age_at_event_days', 'vital_status'] -dx_attrs = ['source_text_diagnosis', 'source_text_tumor_location', 'age_at_event_days'] -url = args.url - -norm_out_fh = open('norm_bs_ds_info.txt', 'w') -tum_out_fh = open('tum_bs_ds_info.txt', 'w') -norm_out_fh.write('BS_ID\tPT_ID\t' + '\t'.join(bs_attrs) + '\t' + '\t'.join(pt_attrs)) -tum_out_fh.write('BS_ID\tPT_ID\t' + '\t'.join(bs_attrs) + '\t' + '\t'.join(pt_attrs)) -if len(dx_attrs) > 0: - norm_out_fh.write('\t' + '\t'.join(dx_attrs)) - tum_out_fh.write('\t' + '\t'.join(dx_attrs)) -# tack on outcome data, renaming the age field -norm_out_fh.write('\toutcome_age_at_event_days\tvital_status\n') -tum_out_fh.write('\toutcome_age_at_event_days\tvital_status\n') - - -x = 1 -m = 100 -task_fh = open(file_list) -head = next(task_fh) -norm_seen = {} -with concurrent.futures.ThreadPoolExecutor(config_data['threads']) as executor: - results = {executor.submit(mt_query_calls, entry): entry for entry in task_fh} - for result in concurrent.futures.as_completed(results): - if x % m == 0: - sys.stderr.write('Processed ' + str(x) + ' lines\n') - sys.stderr.flush() - (tum_info, norm_info) = result.result() - tum_out_fh.write(tum_info) - if norm_info != '': - norm_out_fh.write(norm_info) - x += 1 -# debug mode -# for entry in task_fh: -# mt_query_calls(entry) -norm_out_fh.close() -tum_out_fh.close() -sys.stderr.write('Done!') diff --git a/scripts/2021-Sep-7_ETL_DEPRECATED/3b_create_cbio_id_fname_tbl.py b/scripts/2021-Sep-7_ETL_DEPRECATED/3b_create_cbio_id_fname_tbl.py deleted file mode 100755 index db4f000..0000000 --- a/scripts/2021-Sep-7_ETL_DEPRECATED/3b_create_cbio_id_fname_tbl.py +++ /dev/null @@ -1,84 +0,0 @@ -#!/usr/bin/env python3 -import argparse -import sys -import os -import json -import subprocess - - -def process_ds(dsname, cav_dict, out): - try: - # last item in find will likely be empty after split - x = 0 - parts = dsname.split('/') - cbio_proj = parts[-2] - # project/disease name should be name of directory hosting datasheet - sys.stderr.write('Processing ' + parts[-2] + ' project' + '\n') - cur_ds = open(dsname) - for i in range(0, 4, 1): - next(cur_ds) - ds_head = next(cur_ds) - ds_header = ds_head.rstrip('\n').split('\t') - sa_idx = ds_header.index('SAMPLE_ID') - sp_idx = ds_header.index('SPECIMEN_ID') - ns_idx = ds_header.index('MATCHED_NORMAL_SPECIMEN_ID') - for entry in cur_ds: - meta = entry.rstrip('\n').split('\t') - check = meta[sp_idx].split(';') - for bs_id in check: - # can't tell if RNA or DNA from datasheet, but DNA will be tum + norm bs id, RNA tum + NA - id1 = bs_id + "\t" + meta[ns_idx] - id2 = bs_id + "\tNA" - key = id1 - if key in cav_dict: - for ftype in cav_dict[key]: - out.write("\t".join([cbio_proj, key, ftype, meta[sa_idx], meta[ns_idx], cav_dict[key][ftype]]) + "\n") - elif id2 in cav_dict: - key = id2 - for ftype in cav_dict[key]: - out.write("\t".join([cbio_proj, key, ftype, meta[sa_idx], 'NA', cav_dict[key][ftype]]) + "\n") - else: - sys.stderr.write('WARNING: ' + id1 + ' nor ' + id2 + ' found in data sheet entry\n' + entry + 'Check inputs!\n') - except Exception as e: - print(e) - sys.exit() - -parser = argparse.ArgumentParser(description='Create temp table with cbio ids, bs ids, and file locations and types for downstream file merging. 
Should be run one level above datasheets dir.') -parser.add_argument('-c', '--cavatica', action='store', dest='cav', - help='file with task info from cavatica (see step 1b)') - -args = parser.parse_args() - -cav_dict = {} -manifest = open(args.cav) -head = next(manifest) -for line in manifest: - info = line.rstrip('\n').split('\t') - bs_ids = info[0] + "\t" + info[1] - fnames = info[-2] - atype = info[4] - cav_dict[bs_ids] = {} - # will likely have to add fusion ext soon - for fname in fnames.split(','): - ftype = 'rsem' - if atype == 'DNA': - if fname[-3:] == 'maf': - ftype = 'maf' - elif fname[-3:] == 'seg': - ftype = 'seg' - else: - ftype = "cnv" - elif fname[-3:] == 'tsv': - ftype = 'fusion' - cav_dict[bs_ids][ftype] = fname - -flist = subprocess.check_output('find ./datasheets -name data_clinical_sample.txt', shell=True) - -ds_list = flist.decode().split('\n') -if ds_list[-1] == '': - ds_list.pop() -out = open('cbio_id_fname_table.txt', 'w') -out.write('Cbio_project\tT_CL_BS_ID\tNorm_BS_ID\tFile_Type\tCbio_Tumor_Name\tCbio_Matched_Normal_Name\tFile_Name\n') -for dpath in ds_list: - process_ds(dpath, cav_dict, out) -out.close() \ No newline at end of file diff --git a/scripts/2021-Sep-7_ETL_DEPRECATED/4_merge_maf.py b/scripts/2021-Sep-7_ETL_DEPRECATED/4_merge_maf.py deleted file mode 100755 index 3b0f21a..0000000 --- a/scripts/2021-Sep-7_ETL_DEPRECATED/4_merge_maf.py +++ /dev/null @@ -1,88 +0,0 @@ -#!/usr/bin/env python3 -import sys -import os -import argparse -import json -import subprocess -from get_file_metadata_helper import get_file_metadata -import concurrent.futures - - -def process_maf(maf_fn, new_maf, maf_exc, tum_id, norm_id, tid_idx, h_idx, nid_idx, v_idx, eid_idx): - cur_maf = open(maf_fn) - next(cur_maf) - next(cur_maf) - for maf in cur_maf: - data = maf.rstrip('\n').split('\t') - # Want to allow TERT promoter as exception to exlcusion rules - if data[v_idx] not in maf_exc or (data[h_idx] == "TERT" and data[v_idx] == "5'Flank"): - data[tid_idx] = tum_id - data[nid_idx] = norm_id - data.pop(eid_idx) - new_maf.write('\t'.join(data) + '\n') - cur_maf.close() - - -def process_tbl(cbio_dx, file_meta_dict, tid_idx, h_idx, nid_idx, v_idx, eid_idx, print_head): - try: - x = 0 - # project/disease name should be name of directory hosting datasheet - sys.stderr.write('Processing ' + cbio_dx + ' project' + '\n') - new_maf = open(out_dir + cbio_dx + ".maf", 'w') - new_maf.write(print_head) - for cbio_tum_id in file_meta_dict[cbio_dx]: - cbio_norm_id = file_meta_dict[cbio_dx][cbio_tum_id]['cbio_norm_id'] - fname = file_meta_dict[cbio_dx][cbio_tum_id]['fname'] - sys.stderr.write('Found relevant maf to process for ' + ' ' + cbio_tum_id + ' ' + cbio_norm_id + ' ' - + file_meta_dict[cbio_dx][cbio_tum_id]['kf_tum_id'] + ' ' + file_meta_dict[cbio_dx][cbio_tum_id]['kf_norm_id'] + ' ' + fname + '\n') - sys.stderr.flush() - process_maf(maf_dir + fname, new_maf, maf_exc, cbio_tum_id, cbio_norm_id, tid_idx, h_idx, nid_idx, v_idx, eid_idx) - x += 1 - sys.stderr.write('Completed processing ' + str(x) + ' entries in ' + cbio_dx + '\n') - new_maf.close() - except Exception as e: - print(e) - sys.exit() - - -parser = argparse.ArgumentParser(description='Merge and filter mafs using cavatica task info and datasheets.') -parser.add_argument('-t', '--table', action='store', dest='table', - help='Table with cbio project, kf bs ids, cbio IDs, and file names') -parser.add_argument('-i', '--header', action='store', dest='header', help='File with maf header only') -parser.add_argument('-m', '--maf-dir', 
action='store', dest='maf_dir', help='maf file directory') -parser.add_argument('-j', '--config', action='store', dest='config_file', help='json config file with data types and ' - 'data locations') - -args = parser.parse_args() -with open(args.config_file) as f: - config_data = json.load(f) -# get maf file ext -maf_dir = args.maf_dir -if maf_dir[-1] != '/': - maf_dir += '/' -file_meta_dict = get_file_metadata(args.table, 'maf') -head_fh = open(args.header) - -print_head = next(head_fh) -cur_head = next(head_fh) -cur_header = cur_head.rstrip('\n').split('\t') -eid_idx = cur_header.index('Entrez_Gene_Id') -tid_idx = cur_header.index('Tumor_Sample_Barcode') -nid_idx = cur_header.index('Matched_Norm_Sample_Barcode') -v_idx = cur_header.index('Variant_Classification') -h_idx = cur_header.index('Hugo_Symbol') -cur_header.pop(eid_idx) - -head_fh.close() -print_head += '\t'.join(cur_header) + '\n' -maf_exc = {"Silent": 0, "Intron": 0, "IGR": 0, "3'UTR": 0, "5'UTR": 0, "3'Flank": 0, "5'Flank": 0, "RNA": 0} -out_dir = 'merged_mafs/' -try: - os.mkdir(out_dir) -except: - sys.stderr.write('output dir already exists\n') - -with concurrent.futures.ProcessPoolExecutor(config_data['cpus']) as executor: - results = {executor.submit(process_tbl, cbio_dx, file_meta_dict, tid_idx, h_idx, nid_idx, v_idx, eid_idx, print_head): cbio_dx for cbio_dx in file_meta_dict} - -sys.stderr.write('Done, check logs\n') diff --git a/scripts/2021-Sep-7_ETL_DEPRECATED/5a_cnv_genome2gene.py b/scripts/2021-Sep-7_ETL_DEPRECATED/5a_cnv_genome2gene.py deleted file mode 100755 index 6ae58e7..0000000 --- a/scripts/2021-Sep-7_ETL_DEPRECATED/5a_cnv_genome2gene.py +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/env python3 -import sys -import os -import argparse -import json -import subprocess -import concurrent.futures - - -def process_cnv(cpath): - try: - if cpath != '': - bedtools = config_data['bedtools'] - cnv_cp_only = config_data['cp_only_script'] - bed_file = config_data['bed_genes'] - hugo_tsv = config_data['hugo_tsv'] - sys.stderr.write('Processing ' + cpath + '\n') - root = os.path.basename(cpath).split('.')[0] - limit = config_data['cnv_min_len'] - temp_len = root + '.CNVs_' + str(limit) + "_filtered.bed" - len_fh = open(out_dir + temp_len, 'w') - in_cnv = open(cpath) - skip = next(in_cnv) - for cnv in in_cnv: - ct_dict['total_cnvs'] += 1 - cnv_data = cnv.rstrip('\n').split('\t') - if int(cnv_data[2]) - int(cnv_data[1]) >= limit: - len_fh.write("\t".join(cnv_data[0:5]) + "\n") - else: - ct_dict['short_cnvs'] += 1 - # sys.stderr.write("Entry too short " + cnv) - len_fh.close() - temp = root + '.CNVs.Genes' - to_genes_cmd = bedtools + ' intersect -a ' + out_dir + temp_len + ' -b ' + bed_file + ' -wb > ' + out_dir + temp - subprocess.call(to_genes_cmd, shell=True) - out_gene_cnv_only = 'perl ' + cnv_cp_only + ' ' + hugo_tsv + ' ' + out_dir + temp + ' > ' + out_dir + temp + '.copy_number' - subprocess.call(out_gene_cnv_only, shell=True) - rm_tmp = 'rm ' + out_dir + temp + ' ' + out_dir + temp_len - subprocess.call(rm_tmp, shell=True) - except Exception as e: - sys.stderr.write(str(e)) - sys.exit(1) - - -parser = argparse.ArgumentParser(description='Convert controlFreeC cnv genome coords to gene level') -parser.add_argument('-d', '--cnv-dir', action='store', dest='cnv_dir', help='cnv as genome coords file directory') -parser.add_argument('-j', '--config', action='store', dest='config_file', help='json config file with data types and ' - 'data locations') - -args = parser.parse_args() -with open(args.config_file) as f: - config_data = 
json.load(f) - -limit = config_data['cnv_min_len'] -suffix = config_data['dna_ext_list']['copy_number'] -f_cmd = 'find ' + args.cnv_dir + ' -name "*.' + suffix + '"' -sys.stderr.write('Getting cnv list ' + f_cmd + '\n') -flist = subprocess.check_output(f_cmd, shell=True) -out_dir = 'converted_cnvs/' -try: - os.mkdir(out_dir) -except: - sys.stderr.write('output dir already exists\n') - -ct_dict = {'total_cnvs': 0, 'short_cnvs': 0} -with concurrent.futures.ThreadPoolExecutor(config_data['cpus']) as executor: - results = {executor.submit(process_cnv, cpath): cpath for cpath in flist.decode().split('\n')} -sys.stderr.write(str(ct_dict['total_cnvs']) + ' total cnv calls processed\n') -sys.stderr.write(str(ct_dict['short_cnvs']) + ' calls dropped for not meeting min bp size ' + str(limit) + '\n') -sys.stderr.write('Done, check logs\n') - diff --git a/scripts/2021-Sep-7_ETL_DEPRECATED/5b_merge_cnv.py b/scripts/2021-Sep-7_ETL_DEPRECATED/5b_merge_cnv.py deleted file mode 100755 index 86892d7..0000000 --- a/scripts/2021-Sep-7_ETL_DEPRECATED/5b_merge_cnv.py +++ /dev/null @@ -1,120 +0,0 @@ -#!/usr/bin/env python3 -import sys -import sevenbridges as sbg -from sevenbridges.http.error_handlers import rate_limit_sleeper, maintenance_sleeper -import os -import argparse -import json -import subprocess -import concurrent.futures -import pandas as pd -import re -from get_file_metadata_helper import get_file_metadata - - -def process_cnv(cnv_fn, cur_cnv_dict, samp_id): - for entry in open(cnv_fn): - (gene, gid, value) = entry.rstrip('\n').split('\t') - if gene not in cur_cnv_dict: - cur_cnv_dict[gene] = {} - if samp_id in cur_cnv_dict[gene]: - sys.stderr.write('ERROR: Sample ID ' + samp_id + ' already processed. Back to the drawing board!') - exit(1) - else: - cur_cnv_dict[gene][samp_id] = value - return cur_cnv_dict - -def get_ploidy(obj): - info = obj.content().split('\n') - for datum in info: - try: - if datum.startswith('Output_Ploidy'): - (key, value) = datum.split('\t') - return value - except Exception as e: - sys.stderr.write("WARN: " + str(e) + " entry could not be split\n") - return '2' # default if ploidy can't be found - - -def process_table(cbio_dx, file_meta_dict): - try: - - # project/disease name should be name of directory hosting datasheet - sys.stderr.write('Processing ' + cbio_dx + ' project' + '\n') - new_cnv = open(out_dir + cbio_dx + '.predicted_cnv.txt', 'w') - cur_cnv_dict = {} - s_list = [] - ploidy_dict = {} - for cbio_tum_id in file_meta_dict[cbio_dx]: - orig_fname = file_meta_dict[cbio_dx][cbio_tum_id]['fname'] - kf_bs_id = file_meta_dict[cbio_dx][cbio_tum_id]['kf_tum_id'] - parts = re.search('^(.*)\.' 
+ orig_suffix, orig_fname) - gene_fname = parts.group(1) + w_gene_suffix - sys.stderr.write('Found relevant cnv to process ' + ' ' + file_meta_dict[cbio_dx][cbio_tum_id]['kf_tum_id'] + ' ' - + file_meta_dict[cbio_dx][cbio_tum_id]['kf_norm_id'] + ' ' + gene_fname + '\n') - sys.stderr.flush() - s_list.append(cbio_tum_id) - if manifest is not None: - ploidy = get_ploidy(api.files.get(manifest.loc[kf_bs_id]['id'])) - ploidy_dict[cbio_tum_id] = ploidy - else: - ploidy_dict[cbio_tum_id] = '2' - cur_cnv_dict = process_cnv(cnv_dir + gene_fname, cur_cnv_dict, cbio_tum_id) - new_cnv.write('Hugo_Symbol\t' + '\t'.join(s_list) + '\n') - for gene in cur_cnv_dict: - new_cnv.write(gene) - for samp in s_list: - if samp in cur_cnv_dict[gene]: - new_cnv.write('\t' + cur_cnv_dict[gene][samp]) - else: - # pdb.set_trace() - new_cnv.write('\t' + ploidy_dict[samp]) - new_cnv.write('\n') - new_cnv.close() - except Exception as e: - print(e) - sys.exit(1) - - -parser = argparse.ArgumentParser(description='Merge cnv files using cavatica task info and datasheets.') -parser.add_argument('-t', '--table', action='store', dest='table', - help='Table with cbio project, kf bs ids, cbio IDs, and file names') -parser.add_argument('-n', '--cnv-dir-gene', action='store', dest='cnv_dir', help='cnv as gene file directory') -parser.add_argument('-m', '--info_manifest', action='store', dest='manifest', help='cavatica cfree info file manifest') -parser.add_argument('-j', '--config', action='store', dest='config_file', help='json config file with data types and ' - 'data locations') -parser.add_argument('-p', '--profile', action='store', dest='profile', help='cavatica profile name. requires ' - '.sevenbridges/credentials file be present') - - -args = parser.parse_args() -with open(args.config_file) as f: - config_data = json.load(f) -cnv_dir = args.cnv_dir -if cnv_dir[-1] != '/': - cnv_dir += '/' -manifest = None -if args.manifest is not None: - manifest = pd.read_csv(args.manifest) - manifest.set_index(['Kids First Biospecimen ID Tumor'], inplace=True) -else: - sys.stderr.write("No info file given, will assume ploidy 2\n") - -config = sbg.Config(profile=args.profile) -api = sbg.Api(config=config, error_handlers=[ - sbg.http.error_handlers.rate_limit_sleeper, - sbg.http.error_handlers.maintenance_sleeper]) - - -orig_suffix = config_data['dna_ext_list']['copy_number'] -w_gene_suffix = '.CNVs.Genes.copy_number' -out_dir = 'merged_cnvs/' -try: - os.mkdir(out_dir) -except: - sys.stderr.write('output dir already exists\n') -file_meta_dict = get_file_metadata(args.table, 'cnv') -with concurrent.futures.ProcessPoolExecutor(config_data['cpus']) as executor: - results = {executor.submit(process_table, cbio_dx, file_meta_dict): cbio_dx for cbio_dx in file_meta_dict} - -sys.stderr.write('Done, check logs\n') diff --git a/scripts/2021-Sep-7_ETL_DEPRECATED/5c_cnv_discrete.py b/scripts/2021-Sep-7_ETL_DEPRECATED/5c_cnv_discrete.py deleted file mode 100755 index 2525d5b..0000000 --- a/scripts/2021-Sep-7_ETL_DEPRECATED/5c_cnv_discrete.py +++ /dev/null @@ -1,129 +0,0 @@ -#!/usr/bin/env python3 - -import sevenbridges as sbg -import sys -import argparse -from sevenbridges.http.error_handlers import rate_limit_sleeper, maintenance_sleeper -import concurrent.futures -import json -import subprocess -import re -from get_file_metadata_helper import get_file_metadata -import pandas as pd -import numpy as np - -parser = argparse.ArgumentParser(description='Convert merged cnv values to discrete coded values.') -parser.add_argument('-d', '--merged_cnv_dir', 
action='store', dest='merged_cnv_dir', help='merged cnv dir') -parser.add_argument('-j', '--config', action='store', dest='config_file', help='json config file with data types and ' - 'data locations') -parser.add_argument('-m', '--info_manifest', action='store', dest='manifest', help='cavatica cfree info file manifest') -parser.add_argument('-t', '--table', action='store', dest='table', - help='Table with cbio project, kf bs ids, cbio IDs, and file names') -parser.add_argument('-p', '--profile', action='store', dest='profile', help='cavatica profile name. requires ' - '.sevenbridges/credentials file be present') - - -def mt_adjust_cn(obj): - try: - # If there is a info file manifest, get ploidy. Else skip and assume ploidy 2 - checkpoint = obj - if manifest is not None: - bs_id = obj.resource.metadata['Kids First Biospecimen ID Tumor'] - info = obj.resource.content().split('\n') - checkpoint = obj.resource.id - else: - bs_id = obj - samp_id = bs_cbio_dict[bs_id] - ploidy = 2 - if manifest is not None: - for datum in info: - try: - if datum.startswith('Output_Ploidy'): - (key, value) = datum.split('\t') - ploidy = int(value) - break - except Exception as e: - sys.stderr.write("WARN: " + str(e) + " entry could not be split\n") - data[samp_id] = data[samp_id] - ploidy - min_cn = ploidy * -1 - # adjust discrete low range only in ploidy > 2 - if min_cn != -2: - data[samp_id] = np.where(data[samp_id].between((min_cn + 1),-2), -1, data[samp_id]) - data[samp_id] = np.where(data[samp_id] == min_cn, -2, data[samp_id]) - data[samp_id] = np.where(data[samp_id].between(2, high_gain), 1, data[samp_id]) - data[samp_id] = np.where(data[samp_id] > high_gain, 2, data[samp_id]) - return [0, checkpoint] - - except Exception as E: - sys.stderr.write(str(E) + "\n") - sys.stderr.write('Error processing sample entry for ' + checkpoint + '\n') - return [1, checkpoint] - - -args = parser.parse_args() -with open(args.config_file) as f: - config_data = json.load(f) - -config = sbg.Config(profile=args.profile) -api = sbg.Api(config=config, error_handlers=[ - sbg.http.error_handlers.rate_limit_sleeper, - sbg.http.error_handlers.maintenance_sleeper]) - -flist = subprocess.check_output('find ' + args.merged_cnv_dir + ' -name *.predicted_cnv.txt', shell=True) - -fname_list = flist.decode().split('\n') -file_meta_dict = get_file_metadata(args.table, 'cnv') -manifest = None -if args.manifest is not None: - manifest = pd.read_csv(args.manifest) - manifest.set_index(['Kids First Biospecimen ID Tumor'], inplace=True) -else: - sys.stderr.write("No info file given, will assume ploidy 2\n") -if fname_list[-1] == '': - fname_list.pop() -for fname in fname_list: - parts = re.search('^' + args.merged_cnv_dir + '/(.*).predicted_cnv.txt', fname) - cbio_dx = parts.group(1) - data = pd.read_csv(fname, sep="\t") - data.set_index('Hugo_Symbol') - # sample list would be cbio ids - samp_list = list(data.columns)[1:] - bulk_ids = [] - bs_cbio_dict = {} - #fid_dict = {} - for samp_id in samp_list: - bs_id = file_meta_dict[cbio_dx][samp_id]['kf_tum_id'] - bs_cbio_dict[bs_id] = samp_id - if manifest is not None: - bulk_ids.append(manifest.loc[bs_id]['id']) - file_objs = [] - max_j = 100 - total = len(bulk_ids) - for i in range(0, total, max_j): - uset = i + max_j - sys.stderr.write('Processing ' + str(uset) + ' set\n') - if uset > total: - uset = total - subset = api.files.bulk_get(files=bulk_ids[i:uset]) - file_objs.extend(subset) - - high_gain = config_data['cnv_high_gain'] - - x = 1 - m = 50 - with 
concurrent.futures.ThreadPoolExecutor(config_data['threads']) as executor: - if manifest is not None: - results = {executor.submit(mt_adjust_cn, obj): obj for obj in file_objs} - else: - results = {executor.submit(mt_adjust_cn, bs_id): bs_id for bs_id in bs_cbio_dict} - for result in concurrent.futures.as_completed(results): - if result.result()[0] == 1: - 'Had trouble processing object ' + result.result([1] + '\n') - sys.exit(1) - if x % m == 0: - sys.stderr.write('Processed ' + str(x) + ' samples\n') - sys.stderr.flush() - x += 1 - sys.stderr.write('Conversion completed. Writing results to file\n') - new_fname = cbio_dx = args.merged_cnv_dir + '/' + parts.group(1) + '.discrete_cnvs.txt' - data.to_csv(new_fname, sep='\t', mode='w', index=False) diff --git a/scripts/2021-Sep-7_ETL_DEPRECATED/5d_merge_seg.py b/scripts/2021-Sep-7_ETL_DEPRECATED/5d_merge_seg.py deleted file mode 100644 index 5a64ef9..0000000 --- a/scripts/2021-Sep-7_ETL_DEPRECATED/5d_merge_seg.py +++ /dev/null @@ -1,73 +0,0 @@ -#!/usr/bin/env python3 -import sys -import os -import argparse -import json -import subprocess -from get_file_metadata_helper import get_file_metadata -import concurrent.futures -import pdb - - -def process_seg(cur_seg, new_seg, cbio_tum_id, limit): - cur_seg = open(cur_seg) - next(cur_seg) - for seg in cur_seg: - data = seg.rstrip('\n').split('\t') - data[0] = cbio_tum_id - # cbio does not like leading chr in chromosome name - data[1] = data[1].replace("chr", "") - new_seg.write("\t".join(data) + "\n") - - -def process_tbl(cbio_dx, file_meta_dict, limit): - try: - x = 0 - # project/disease name should be name of directory hosting datasheet - sys.stderr.write('Processing ' + cbio_dx + ' project' + '\n') - new_seg = open(out_dir + cbio_dx + ".merged_seg.txt", 'w') - new_seg.write(seg_file_header) - for cbio_tum_id in file_meta_dict[cbio_dx]: - cbio_norm_id = file_meta_dict[cbio_dx][cbio_tum_id]['cbio_norm_id'] - fname = file_meta_dict[cbio_dx][cbio_tum_id]['fname'] - sys.stderr.write("Found relevant seg to process for {} {} {} {} {}\n".format(cbio_tum_id,cbio_norm_id,file_meta_dict[cbio_dx][cbio_tum_id]['kf_tum_id'],file_meta_dict[cbio_dx][cbio_tum_id]['kf_norm_id'],fname)) - sys.stderr.flush() - process_seg(seg_dir + fname, new_seg, cbio_tum_id, limit) - x += 1 - sys.stderr.write('Completed processing ' + str(x) + ' entries in ' + cbio_dx + '\n') - new_seg.close() - except Exception as e: - sys.stderr.write(e) - sys.exit() - - -parser = argparse.ArgumentParser(description='Merge seg files') -parser.add_argument('-t', '--table', action='store', dest='table', - help='Table with cbio project, kf bs ids, cbio IDs, and file names') -parser.add_argument('-m', '--seg-dir', action='store', dest='seg_dir', help='seg file directory') -parser.add_argument('-j', '--config', action='store', dest='config_file', help='json config file with data types and ' - 'data locations') - -args = parser.parse_args() -if __name__ == "__main__": - with open(args.config_file) as f: - config_data = json.load(f) - seg_file_header = "ID\tchrom\tloc.start\tloc.end\tnum.mark\tseg.mean\n" - # get maf file ext - seg_dir = args.seg_dir - if seg_dir[-1] != '/': - seg_dir += '/' - file_meta_dict = get_file_metadata(args.table, 'seg') - # not sure about this, seg files might not allow for droppig regions... 
- limit = config_data['cnv_min_len'] - - out_dir = 'merged_cnvs/' - try: - os.mkdir(out_dir) - except: - sys.stderr.write('output dir already exists\n') - - with concurrent.futures.ProcessPoolExecutor(config_data['cpus']) as executor: - results = {executor.submit(process_tbl, cbio_dx, file_meta_dict, limit): cbio_dx for cbio_dx in file_meta_dict} - - sys.stderr.write('Done, check logs\n') diff --git a/scripts/2021-Sep-7_ETL_DEPRECATED/6_merge_rename_rsem.py b/scripts/2021-Sep-7_ETL_DEPRECATED/6_merge_rename_rsem.py deleted file mode 100755 index ab5496e..0000000 --- a/scripts/2021-Sep-7_ETL_DEPRECATED/6_merge_rename_rsem.py +++ /dev/null @@ -1,127 +0,0 @@ -#!/usr/bin/env python3 - -import sys -import argparse -import os -import concurrent.futures -import gzip -import pandas as pd -import numpy as np -from scipy import stats -import pdb - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Merge rsem files using cavatica file info.') - parser.add_argument('-t', '--table', action='store', dest='table', - help='Table with cbio project, kf bs ids, cbio IDs, and file names') - parser.add_argument('-r', '--rsem-dir', action='store', dest='rsem_dir', help='rsem file directory') - args = parser.parse_args() - - - def mt_collate_df(rsem_file): - try: - sample = rna_subset.loc[rna_subset['File_Name'] == rsem_file, 'Cbio_Tumor_Name'].iloc[0] - if sample not in seen_dict: - current = pd.read_csv(rsem_dir + rsem_file, sep="\t", index_col=0) - cur_subset = current[['FPKM']] - cur_subset.rename(columns={"FPKM": sample}, inplace=True) - df_list.append(cur_subset) - seen_dict[sample] = 1 - else: - sys.stderr.write(sample + " was already seen, likely due to mulitple dx, skipping!\n") - return 1 - except Exception as e: - sys.stderr.write(str(e) + "\nFailed to process " + rsem_file) - - - rsem_dir = args.rsem_dir - - if rsem_dir[-1] != '/': - rsem_dir += '/' - - # file_meta_dict = get_file_metadata(args.table, 'rsem') - out_dir = 'merged_rsem/' - try: - os.mkdir(out_dir) - except: - sys.stderr.write('output dir already exists\n') - sys.stderr.flush() - - # Use cbio table to create master table to choose represntative gene symbol, calc z scores, output both - all_file_meta = pd.read_csv(args.table, sep="\t") - rna_subset = all_file_meta.loc[all_file_meta['File_Type'] == 'rsem'] - rsem_list = rna_subset['File_Name'].to_list() - # init_tbl = pd.read_csv(rsem_dir + rsem_list[0], sep="\t", index_col=0) - # Get cbio name to rename columns in master table - # master_tbl = init_tbl[['FPKM']] - # master_tbl.rename(columns={"FPKM": sample}, inplace=True) - sys.stderr.write('Creating merged rsem table\n') - x = 1 - m = 50 - - df_list = [] - # some samples have more than one dx, need only process once - seen_dict = {} - with concurrent.futures.ThreadPoolExecutor(16) as executor: - results = {executor.submit(mt_collate_df, rsem_list[i]): rsem_list[i] for i in range(len(rsem_list))} - for result in concurrent.futures.as_completed(results): - if x % m == 0: - sys.stderr.write('Merged ' + str(x) + ' files\n') - sys.stderr.flush() - x += 1 - master_tbl = pd.concat(df_list, axis=1) - del df_list - - sys.stderr.write('Merge complete, picking representative gene transcripts\n') - sys.stderr.flush() - master_tbl.reset_index(inplace=True) - # drop ens_id_sym and replace with just sym - gene_id_sym_list = master_tbl['gene_id'].to_list() - gene_sym_list = [] - for entry in gene_id_sym_list: - parts = entry.split('_') - gene_sym = '_'.join(parts[1:]) - gene_sym_list.append(gene_sym) - 
master_tbl['Hugo_Symbol'] = gene_sym_list - master_tbl.drop(columns =["gene_id"], inplace = True) - sample_list=list(master_tbl.columns.values) - # remove value with Hugo_Symbol - h_idx = sample_list.index('Hugo_Symbol') - sample_list.pop(h_idx) - # look for repeated instances of gene symbol, use highest average exp as rep value - dup_hugo_tbl = master_tbl[master_tbl.duplicated(['Hugo_Symbol'])] - rpt_sym = dup_hugo_tbl.Hugo_Symbol.unique() - for sym in rpt_sym: - gene_eval = master_tbl.loc[master_tbl['Hugo_Symbol'] == sym] - mean_expr = gene_eval[sample_list].mean(axis=1).sort_values(ascending=False) - to_dump = list(mean_expr.index)[1:] - master_tbl.drop(to_dump, inplace=True) - master_tbl.set_index('Hugo_Symbol', inplace=True) - gene_sym_list = master_tbl.index - project_list = rna_subset.Cbio_project.unique() - - sys.stderr.write('Outputting FPKM expression results\n') - sys.stderr.flush() - for project in project_list: - sub_sample_list = list(rna_subset.loc[rna_subset['Cbio_project'] == project, 'Cbio_Tumor_Name']) - expr_fname = out_dir + project + '.rsem_merged.txt' - master_tbl[sub_sample_list].to_csv(expr_fname, sep='\t', mode='w', index=True) - - sys.stderr.write('Calculating z scores\n') - sys.stderr.flush() - - z_scored = stats.zscore(np.log2(np.array(master_tbl + 1)), axis = 1) - del master_tbl - master_zscore_log = pd.DataFrame(z_scored, index=gene_sym_list, columns=sample_list) - # this may be memory-intensive for some insane reason... - sys.stderr.write("Replacing NaN with 0\n") - sys.stderr.flush() - master_zscore_log.fillna(0, inplace=True) - sys.stderr.write('Outputting z scored results\n') - sys.stderr.flush() - - for project in project_list: - sub_sample_list = list(rna_subset.loc[rna_subset['Cbio_project'] == project, 'Cbio_Tumor_Name']) - zscore_fname = out_dir + project + '.rsem_merged_zscore.txt' - master_zscore_log[sub_sample_list].to_csv(zscore_fname, sep='\t', mode='w', index=True, float_format='%.4f') \ No newline at end of file diff --git a/scripts/2021-Sep-7_ETL_DEPRECATED/7_convert_fusion.py b/scripts/2021-Sep-7_ETL_DEPRECATED/7_convert_fusion.py deleted file mode 100755 index a56bb12..0000000 --- a/scripts/2021-Sep-7_ETL_DEPRECATED/7_convert_fusion.py +++ /dev/null @@ -1,99 +0,0 @@ -import sys -import argparse -import json -import os -import pandas as pd -import numpy as np - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Convert openPBTA fusion table OR list of annofuse files to cbio format.') - parser.add_argument('-t', '--table', action='store', dest='table', - help='Table with cbio project, kf bs ids, cbio IDs, and file names') - parser.add_argument('-f', '--fusion-results', action='store', dest='fusion_results', help='openPBTA fusion file OR annoFuse results dir') - parser.add_argument('-m', '--mode', action='store', dest='mode', help='describe source, pbta or annofuse', required=True) - parser.add_argument('-s', '--center-file', action='store', dest='sq_file', help='File with BS IDs and sequencing centers. Should have headered columns: BS_ID\tSQ_Value') - parser.add_argument('-o', '--out-dir', action='store', dest='out_dir', default='merged_fusion/', help='Result output dir. Default is merged_fusion') - args = parser.parse_args() - if args.mode != "pbta" and args.mode != "annofuse": - sys.stderr.write("-m mode argument must be one of pbta or annofuse. It is case sensitive. 
You put " + args.mode + "\n") - exit(1) - def init_cbio_master(fusion_results, mode, rna_metadata): - if mode == 'pbta': - fusion_data = pd.read_csv(fusion_results, sep="\t") - return fusion_data[['Gene1A', 'Sample', 'FusionName', 'CalledBy', 'Fusion_Type']] - else: - flist = rna_metadata.File_Name - frame_list = [] - for i in range(0, len(flist), 1): - ann_file = pd.read_csv(fusion_results + "/" + flist[i], sep="\t") - frame_list.append(ann_file) - concat_frame = pd.concat(frame_list) - del frame_list - fusion_data = concat_frame[['Gene1A', 'Sample', 'FusionName', 'Caller', 'Fusion_Type']] - del concat_frame - fusion_data = fusion_data.groupby(['Gene1A', 'Sample', 'FusionName', 'Fusion_Type'])['Caller'].apply(', '.join).reset_index() - fusion_data['Caller'] = fusion_data['Caller'].str.upper() - return fusion_data - - - out_dir = args.out_dir - try: - os.mkdir(out_dir) - except: - sys.stderr.write('output dir already exists\n') - sys.stderr.flush() - - # deal only with RNA metadata - r_ext = 'rsem' # openPBTA data would cross-reference with expression results otherwise... - sq_info = pd.read_csv(args.sq_file, sep="\t") - all_file_meta = pd.read_csv(args.table, sep="\t") - if args.mode == 'annofuse': - r_ext = 'fusion' - rna_subset = all_file_meta.loc[all_file_meta['File_Type'] == r_ext] - rna_subset.rename(columns={"T_CL_BS_ID": "Sample"}, inplace=True) - rna_subset.set_index('Sample', inplace=True) - project_list = rna_subset.Cbio_project.unique() - cbio_master = init_cbio_master(args.fusion_results, args.mode, rna_subset) - - # Get relevant columns - cbio_master.set_index('Sample', inplace=True) - sq_info.rename(columns={"BS_ID": "Sample"}, inplace=True) - sq_info.set_index('Sample', inplace=True) - # get cbio IDs and seq center - cbio_master = pd.merge(cbio_master, rna_subset[['Cbio_Tumor_Name']], on='Sample', how='inner') - cbio_master = pd.merge(cbio_master, sq_info[['SQ_Value']], on='Sample', how='inner') - # drop bs ids - cbio_master.reset_index(inplace=True) - cbio_master.drop('Sample', axis=1, inplace=True) - if args.mode == 'pbta': - cbio_master.rename(columns={"Gene1A": "Hugo_Symbol", "FusionName": "Fusion", "CalledBy": "Method", - "Fusion_Type": "Frame", "Cbio_Tumor_Name": "Tumor_Sample_Barcode", "SQ_Value": "Center"}, inplace=True) - else: - cbio_master.rename(columns={"Gene1A": "Hugo_Symbol", "FusionName": "Fusion", "Caller": "Method", - "Fusion_Type": "Frame", "Cbio_Tumor_Name": "Tumor_Sample_Barcode", "SQ_Value": "Center"}, inplace=True) - - cbio_master['DNA_support'] = "no" - cbio_master['RNA_support'] = "yes" - # create blank entrez ID column so that 3' gene names can be searched - cbio_master['Entrez_Gene_Id'] = "" - # Reorder table - order_list = ['Hugo_Symbol', 'Entrez_Gene_Id', 'Center', 'Tumor_Sample_Barcode', 'Fusion','DNA_support', 'RNA_support', 'Method', 'Frame'] - cbio_master = cbio_master[order_list] - cbio_master.set_index('Hugo_Symbol', inplace=True) - for project in project_list: - sub_sample_list = list(rna_subset.loc[rna_subset['Cbio_project'] == project, 'Cbio_Tumor_Name']) - fus_fname = out_dir + project + '.fusions.txt' - fus_file = open(fus_fname, 'w') - fus_tbl = cbio_master[cbio_master.Tumor_Sample_Barcode.isin(sub_sample_list)] - fus_tbl.reset_index(inplace=True) - fus_head = "\t".join(fus_tbl.columns) + "\n" - fus_file.write(fus_head) - fus_data = list(fus_tbl.values) - # "hack" to allow 3' end of fusion to be searched - for data in fus_data: - fus_file.write("\t".join(data) + "\n") - (a, b) = data[4].split('--') - if a != b: - data[0] = b - 
fus_file.write("\t".join(data) + "\n") - fus_file.close() diff --git a/scripts/2021-Sep-7_ETL_DEPRECATED/8_organize_upload_packages.py b/scripts/2021-Sep-7_ETL_DEPRECATED/8_organize_upload_packages.py deleted file mode 100755 index 21846a6..0000000 --- a/scripts/2021-Sep-7_ETL_DEPRECATED/8_organize_upload_packages.py +++ /dev/null @@ -1,218 +0,0 @@ -#!/usr/bin/env python3 - -import sys -import argparse -import os -import concurrent.futures -import re -import json -import subprocess -import pdb - - -parser = argparse.ArgumentParser(description='Create cases lists, meta files, and organize data for cbio upload.' - ' It is assumed you are at the dir level of all input data files') -parser.add_argument('-o', '--output_dir', action='store', dest='out_dir', help='output directory name') -parser.add_argument('-d', '--dx-file', action='store', dest='dx_file', help='Dx index file with cbio short and long names') -parser.add_argument('-c', '--config', action='store', dest='config_file', help='json config file with meta information; see REFS/case_meta_config.json example') -args = parser.parse_args() - -def process_meta_study(meta_data, output_dir, study): - meta_study = open(output_dir + "meta_study.txt", "w") - meta_study.write("type_of_cancer: " + study + "\n") - # default is dx value, otherwise make it "cancer_study_identifier" value from config, in not empty - canc_study_id = study - if meta_data["cancer_study_identifier"] != "": - canc_study_id = meta_data["cancer_study_identifier"] - canc_study_id += meta_data['dir_suffix'] - meta_study.write("cancer_study_identifier: " + canc_study_id + "\n") - if "reference_genome" in meta_data: - meta_study.write("reference_genome: " + meta_data["reference_genome"] + "\n") - name = dx_dict[study] - if meta_data["name_append"] != "": - name += " " + meta_data["name_append"] - meta_study.write("name: " + name + "\n") - desc = meta_data["description"][0] - if len(meta_data["description"]) > 1: - desc += " " + dx_dict[study] + " " + meta_data["description"][1] - meta_study.write("description: " + desc + "\nshort_name: " + canc_study_id + "\ngroups: " + meta_data["groups"] + "\n") - meta_study.close() - return canc_study_id - - -def process_meta_data(meta_data, output_dir, canc_study_id, study): - for dtype in meta_data["dtypes"]: - try: - # pointer for easier readability and dict key traciing - cur_data = meta_data["dtypes"][dtype] - cbio_name = cur_data["cbio_name"] - parts = cbio_name.split("_") - attr_dict = cur_data["meta_file_attr"] - meta_name = "meta_" + "_".join(parts[1:]) - if attr_dict["datatype"] == "FUSION": - meta_name = "meta_" + attr_dict["datatype"] + ".txt" - meta_data_file = open(output_dir + meta_name, "w") - meta_data_file.write("cancer_study_identifier: " + canc_study_id + "\n") - for mkey in attr_dict: - meta_data_file.write(mkey + ": " + attr_dict[mkey] + "\n") - meta_data_file.write("data_filename: " + cbio_name + "\n") - meta_data_file.close() - # create data_ links to data - cmd = "ln -s " + cwd + meta_data["dir"] + "/" + study + "." 
+ cur_data["ext"] + " " + output_dir + cbio_name - subprocess.call(cmd, shell=True) - except Exception as e: - sys.stderr.write(str(e) + " failed processing meta data file\n") - pdb.set_trace() - hold = 1 - - -def process_clinical_data(meta_data, output_dir, canc_study_id, study): - for dtype in meta_data["dtypes"]: - # pointer for easier readability and dict key tracing - try: - cur_data = meta_data["dtypes"][dtype] - cbio_name = cur_data["cbio_name"] - parts = cbio_name.split("_") - meta_name = "meta_" + "_".join(parts[1:]) - meta_data_file = open(output_dir + meta_name, "w") - meta_data_file.write("cancer_study_identifier: " + canc_study_id + "\n") - attr_dict = cur_data["meta_file_attr"] - for mkey in attr_dict: - meta_data_file.write(mkey + ": " + attr_dict[mkey] + "\n") - meta_data_file.write("data_filename: " + cbio_name + "\n") - meta_data_file.close() - # create data_ links to data - cmd = "ln -s " + cwd + meta_data["dir"] + "/" + study + "/" + cbio_name + " " + output_dir + cbio_name - subprocess.call(cmd, shell=True) - except Exception as e: - sys.stderr.write(str(e) + " failed processing meta data file\n") - pdb.set_trace() - hold = 1 - - -def write_case_list(case_key, attr_dict, sample_list, case_dir): - case_file = open(case_dir + case_key + ".txt", "w") - case_file.write("cancer_study_identifier: " + canc_study_id + "\n") - key_list = list(attr_dict) - case_file.write(key_list[0] + ": " + canc_study_id + "_" + attr_dict[key_list[0]] + "\n") - for i in range(1, len(key_list), 1): - to_write = key_list[i] + ": " + attr_dict[key_list[i]] - if key_list[i] == "case_list_description": - to_write += " (" + str(len(sample_list)) + ")" - case_file.write(to_write + "\n") - case_file.write("case_list_ids: " + "\t".join(sample_list) + "\n") - case_file.close() - -def create_case_lists(data_dict, output_dir, canc_study_id, study): - case_dir = output_dir + "case_lists/" - try: - os.mkdir(case_dir) - except: - sys.stderr.write(case_dir + ' already exists.\n') - - muts_list = [] - cna_list = [] - rna_list = [] - fusion_list = [] - - muts_fname = output_dir + config_data["merged_mafs"]["dtypes"]["mutation"]["cbio_name"] - muts_file = open(muts_fname) - head = next(muts_file) - head = next(muts_file) - header = head.rstrip('\n').split('\t') - s_idx = header.index('Tumor_Sample_Barcode') - for line in muts_file: - data = line.rstrip('\n').split('\t') - muts_list.append(data[s_idx]) - muts_file.close() - muts_list = [*{*muts_list}] - if data_dict["merged_cnvs"] == 1: - cna_fname = output_dir + config_data["merged_cnvs"]["dtypes"]["linear"]["cbio_name"] - cna_file = open(cna_fname) - head = next(cna_file) - # assumes header is Hugo_symbols\tsample_name1\tsamplename2 etc, if entrez ID, will need to change! - cna_list = head.rstrip('\n').split('\t')[1:] - cna_file.close() - if data_dict["merged_rsem"] == 1: - # assumes header is Hugo_symbols\tsample_name1\tsamplename2 etc, if entrez ID, will need to change! 
- rna_fname = output_dir + config_data["merged_rsem"]["dtypes"]["counts"]["cbio_name"] - rna_file = open(rna_fname) - head = next(rna_file) - rna_list = head.rstrip('\n').split('\t')[1:] - fusion_list = rna_list - # muts_plus_fusion = muts_list + fusion_list - # muts_plus_fusion = [*{*muts_plus_fusion}] - all_cases = muts_list - write_case_list('cases_sequenced', config_data['cases_sequenced'], muts_list, case_dir) - # write_case_list('cases_sequenced', config_data['cases_sequenced'], muts_list, case_dir) - if len(cna_list) > 0: - write_case_list('cases_cna', config_data['cases_cna'], cna_list, case_dir) - all_cases += cna_list - muts_plus_cna = list(set(muts_list) & set(cna_list)) - write_case_list('cases_cnaseq', config_data['cases_cnaseq'], cna_list, case_dir) - if len(rna_list) > 0: - write_case_list('cases_RNA_Seq_v2_mRNA', config_data['cases_RNA_Seq_v2_mRNA'], rna_list, case_dir) - all_cases += rna_list - if "merged_fusion" in data_keys and data_keys["merged_fusion"] == 1: - write_case_list('cases_sv', config_data['cases_sv'], rna_list, case_dir) - - # loading mutations is a minimum, so if cna exists...3 way file can be made - if len(cna_list) > 0: - three_way = list(set(muts_list) & set(cna_list) & set(rna_list)) - write_case_list('cases_3way_complete', config_data['cases_3way_complete'], three_way, case_dir) - all_cases = [*{*all_cases}] - write_case_list('cases_all', config_data['cases_all'], all_cases, case_dir) - - -cwd = os.getcwd() + "/" -with open(args.config_file) as f: - config_data = json.load(f) - -dx_file = open(args.dx_file) -dx_dict = {} -head = next(dx_file) -for line in dx_file: - (ds_dx, cbio_short, cbio_long) = line.rstrip('\n').split('\t') - dx_dict[cbio_short] = cbio_long -dx_file.close() - -out_dir = args.out_dir -if out_dir[-1] != "/": - out_dir += "/" -try: - os.mkdir(out_dir) -except: - sys.stderr.write(out_dir + ' already exists.\n') - -for dx in dx_dict: - try: - if os.path.isdir(config_data["data_sheets"]["dir"] + "/" + dx): - cur_dir = "" - if config_data["study"]["cancer_study_identifier"] == "": - cur_dir = out_dir + dx - if config_data['study']['dir_suffix'] != "": - cur_dir += config_data['study']['dir_suffix'] - else: - cur_dir = out_dir + config_data["study"]["cancer_study_identifier"] - cur_dir += "/" - try: - os.mkdir(cur_dir) - except: - sys.stderr.write(cur_dir + ' already exists.\n') - sys.stderr.write("Creating meta study file for " + dx + "\n") - canc_study_id = process_meta_study(config_data['study'], cur_dir, dx) - data_keys = {"merged_mafs": 0, "merged_cnvs": 0, "merged_rsem": 0, "merged_fusion": 0} - for key in data_keys: - if key in config_data and config_data[key]["dir"] != "": - data_keys[key] = 1 - process_meta_data(config_data[key], cur_dir, canc_study_id, dx) - sys.stderr.write("Creating meta data files and links for " + key + "\n") - else: - sys.stderr.write("Skipping meta files for " + key + "\n") - sys.stderr.write("Creating clinical meta sheets and links\n") - process_clinical_data(config_data["data_sheets"], cur_dir, canc_study_id, dx) - create_case_lists(data_keys, cur_dir, canc_study_id, dx) - else: - sys.stderr.write("No datasheets for " + dx + ", skipping!\n") - except Exception as e: - sys.stderr.write(str(e) + "\nerror processing files for " + dx + "!\n") diff --git a/scripts/2021-Sep-7_ETL_DEPRECATED/sample_id_builder_helper.py b/scripts/2021-Sep-7_ETL_DEPRECATED/sample_id_builder_helper.py deleted file mode 100755 index eb71ecc..0000000 --- a/scripts/2021-Sep-7_ETL_DEPRECATED/sample_id_builder_helper.py +++ /dev/null @@ 
-1,68 +0,0 @@ -#!/usr/bin/env python3 -import pdb - - -import sys - - -def build_samp_id(style, t_header, info): - if style == 'cbttc_dna_std': - # sample ID will consist of kf external_sample_id with .SUFFIX removed - sid_idx = t_header.index('external_sample_id') - samp_id = info[sid_idx].split('.')[0] - return samp_id - - elif style == 'cbttc_norm_std': - # sample ID will consist of kf external_sample_id with .SUFFIX removed - sid_idx = t_header.index('external_sample_id') - try: - samp_id = info[sid_idx].split('.')[:-1][0] - except Exception as e: - sys.stderr.write(str(e) + "; old formula not working, try direct\n") - samp_id = info[sid_idx] - return samp_id - - elif style == 'cbttc_rna_std': - # sample ID will consist of kf external_sample_id with .SUFFIX removed - sid_idx = t_header.index('external_sample_id') - samp_id = info[sid_idx].split('.')[0] - return samp_id - - elif style == 'pnoc_dna_wgs': - # sample ID will consist of kf external_sample_id with .SUFFIX removed and external_aliquot_id added - sid_idx = t_header.index('external_sample_id') - ali_idx = t_header.index('external_aliquot_id') - samp_id = info[sid_idx].split('.')[0] + '-' + info[ali_idx] - return samp_id - - elif style == 'pnoc_norm': - # sample ID will consist of kf external_sample_id with .SUFFIX removed and external_aliquot_id added - sid_idx = t_header.index('external_sample_id') - ali_idx = t_header.index('external_aliquot_id') - # Different styles for PNOC003 vs 008 - # 008 style has naked 7316 ID, tack on aliquot to differentiate from tumor - samp_id = info[sid_idx] + '-' + info[ali_idx] - - if info[sid_idx][-1] == "S": - # 003 style ending in .WGS or WXS - samp_id = info[sid_idx].split('.')[:-1][0] - return samp_id - - elif style == 'pnoc_rna_wes': - # sample ID will consist of kf external_sample_id with .SUFFIX removed - sid_idx = t_header.index('external_sample_id') - ali_idx = t_header.index('external_aliquot_id') - samp_id = info[sid_idx] + '_' + info[ali_idx] - return samp_id - - elif style == 'maris_dna_wgs': - # sample ID will consist of kf external_id, splitting on - and using only the last part - sid_idx = t_header.index('external_id') - samp_id = info[sid_idx].split("-")[2] - return samp_id - elif style == 'maris_norm_std': - # sample ID will consist of kf external_id, splitting on - and using only the last part - sid_idx = t_header.index('external_id') - samp_id = info[sid_idx].split("-")[2] - return samp_id - diff --git a/scripts/dev/get_files_from_manifest.py b/scripts/dev/get_files_from_manifest.py deleted file mode 100644 index 1647f03..0000000 --- a/scripts/dev/get_files_from_manifest.py +++ /dev/null @@ -1,77 +0,0 @@ -#!/usr/bin/env python3 - -import argparse -import pdb -import sys -import boto3 -import concurrent.futures -import urllib3 -import os -import pandas as pd - - -def mt_type_download(file_type): - """ - Download files from each desired file type at the same time - """ - sys.stderr.write("Downloading " + file_type + " files\n") - sys.stderr.flush() - sub_file_list = list(selected.loc[selected['file_type'] == file_type, 's3_path']) - try: - os.mkdir(file_type) - except Exception as e: - sys.stderr.write(str(e) + " error while making directory for " + file_type + "\n") - for loc in sub_file_list: - out = file_type + "/" + loc.split("/")[-1] - parse_url = urllib3.util.parse_url(loc) - try: - dl_client.download_file(Bucket=parse_url.host, Key=parse_url.path.lstrip("/"), Filename=out) - except Exception as e: - sys.stderr.write(str(e) + " could not download from " + loc + "\n") - 
sys.stderr.flush() - sys.stderr.write("Completed downloading files for " + file_type + "\n") - sys.stderr.flush() - - -parser = argparse.ArgumentParser(description='Get all files for a project.') -parser.add_argument('-m', '--manifest-list', action='store', dest='manifest', help='csv list of of genomic file location manifests') -parser.add_argument('-f', '--file-types', action='store', dest='fts', help='csv list of workflow types to download') -parser.add_argument('-p', '--profile', action='store', dest='profile', help='aws profile name') - -args = parser.parse_args() -# concat multiple possible manifests -sys.stderr.write("Concatenating manifests\n") -sys.stderr.flush() -manifest_list = args.manifest.split(",") -manifest_concat = pd.DataFrame() -for manifest in manifest_list: - sys.stderr.write("Processing " + manifest + "\n") - current = pd.read_csv(manifest, sep=None) - if manifest_concat.empty: - manifest_concat = current.copy() - else: - manifest_concat = manifest_concat.append(current, ignore_index=True) - -file_types = args.fts.split(",") -# subset concatenated manifests -sys.stderr.write("Subsetting concatenated manifest\n") -sys.stderr.flush() -# change col names to lower case for input compatibility -manifest_concat.columns= manifest_concat.columns.str.lower() -selected = manifest_concat[manifest_concat['file_type'].isin(file_types)] -# remove vcfs as we only want mafs -pattern_del = ".vcf.gz" -filter = selected['file_name'].str.contains(pattern_del) -selected = selected[~filter] - -out_file = "manifest_subset.tsv" -selected.to_csv(out_file, sep='\t', mode='w', index=False) - -# download files by type -session = boto3.Session(profile_name=args.profile) -dl_client = session.client('s3') -# pdb.set_trace() - -with concurrent.futures.ThreadPoolExecutor(16) as executor: - results = {executor.submit(mt_type_download, ftype): ftype for ftype in file_types} - diff --git a/scripts/v1_etl_deprecated/0_ETL_wrapper.py b/scripts/v1_etl_deprecated/0_ETL_wrapper.py deleted file mode 100755 index 08c1d64..0000000 --- a/scripts/v1_etl_deprecated/0_ETL_wrapper.py +++ /dev/null @@ -1,318 +0,0 @@ -#!/usr/bin/env python3 - -import sys -import subprocess -import json -import argparse -import os -from argparse import RawTextHelpFormatter - - -def check_vars(var_list, step_sub_list): - flag = 0 - for var in var_list: - if var not in config_data: - flag = 1 - sys.stderr.write('ERROR: ' + var + ' not defined in ' + config_file + ' for steps ' + step_sub_list + '\n') - if flag: - sys.exit(1) - - -def config_precheck(): - if 'all' in steps: - var_list = ['threads', 'cpus'] - check_vars(var_list, 'all') - if '1' in steps or '4' in steps or '5a' in steps or '5b' in steps or '6' in steps: - var_list = ['cpus'] - check_vars(var_list, '1, 4, 5a, 5b, 6') - if '2' in steps: - var_list = ['threads'] - check_vars(var_list, '2') - if 'all' in steps or '1' in steps or '3' in steps or '7' in steps: - var_list = ['rna_flag'] - check_vars(var_list, 'all, 1, 7') - if 'all' in steps or '1' in steps: - var_list = ['cav_dna_projects', 'tn_order'] - check_vars(var_list, 'all, 1') - if config_data['rna_flag']: - check_vars(['cav_rna_projects'], 'all, 1') - if 'all' in steps or '1' in steps or '7' in steps: - var_list = ['cna_flag'] - check_vars(var_list, 'all, 1, 7') - if 'all' in steps or '3' in steps: - var_list = ['dna_samp_id_style', 'dx_tbl_fn', 'ds_pt_desc', 'ds_samp_desc'] - if config_data['rna_flag']: - var_list.append('rna_samp_id_style') - check_vars(var_list, 'all, 3') - input_config_files = 
[config_data['dx_tbl_fn']] - check_files(input_config_files, 'CONFIG', '3') - if 'all' in steps or '5a' in steps: - var_list = ['bedtools', 'cp_only_script', 'bed_genes', 'hugo_tsv'] - check_vars(var_list, 'all, 5a') - input_config_files = [config_data['cp_only_script'], config_data['bed_genes'], - config_data['hugo_tsv']] - check_files(input_config_files, 'CONFIG', '5a') - if 'all' in steps or '6' in steps: - var_list = ['ens_gene_list', 'mapFileLoc'] - check_vars(var_list, 'all, 6') - input_config_files = [config_data['ens_gene_list'], config_data['mapFileLoc']] - check_files(input_config_files, 'CONFIG', '6') - if 'all' in steps or '7' in steps: - var_list = ['dx_tbl_fn', 'data_dir', 'cancStudyID', 'group', 'study_short_desc', 'study_desc_p1', - 'study_desc_p2'] - if config_data['rna_flag']: - var_list.extend(['fpkmFileLoc', 'mapFileLoc', 'genesFileLoc']) - check_vars(var_list, 'all, 7') - if config_data['rna_flag']: - input_config_files = [config_data['mapFileLoc'], config_data['genesFileLoc']] - check_files(input_config_files, 'CONFIG', '7') - input_config_dirs = [config_data['data_dir']] - check_dirs(input_config_dirs, 'CONFIG', '7') - - -def check_files(file_list, ftype, step): - flag = 0 - for fn in file_list: - if fn is None: - if step[0] != '7' and ftype == 'INPUT': - sys.stderr.write('ERROR: Missing an input file completely. Try running ' + script_dir + step + ' -h\n') - else: - sys.stderr.write('ERROR: No file defined of type ' + ftype + ' for step ' + step + '\n') - flag = 1 - elif not os.path.isfile(fn): - sys.stderr.write('ERROR: ' + ftype + ' file ' + fn + ' not found. Check file names and config file\n') - flag = 1 - if flag == 1: - sys.stderr.write('Missing files for step(s) ' + step + '. Exiting.') - sys.exit(1) - - -def check_dirs(dir_list, ftype, step): - flag = 0 - for dn in dir_list: - if dn is None: - if step[0] != '7' and ftype == 'INPUT': - sys.stderr.write('ERROR: Missing an input dir completely. Try running ' + script_dir + step + ' -h\n') - else: - sys.stderr.write('ERROR: No dir defined of type ' + ftype + ' for step ' + step + '\n') - flag = 1 - elif not os.path.isdir(dn): - sys.stderr.write('ERROR: ' + ftype + ' directory ' + dn - + ' not found. Check directory names and config file\n') - flag = 1 - if flag == 1: - sys.stderr.write('Missing directories for step(s) ' + step + '. Exiting.\n') - sys.exit(1) - - -parser = argparse.ArgumentParser(description='Wrapper script to run all or some of ETL component scripts.', - formatter_class=RawTextHelpFormatter) -parser.add_argument('-x', '--steps', action='store', dest='steps', help='csv list of steps to execute, valid choices ' - 'are: "all", "1", "2", "3", "4", "5a", "5b", "6", "7". Refers to scripts:\n' - '1_query_cavatica.py\n' - '2_query_ds_by_bs_id.py\n' - '3_create_data_sheets.py\n' - '4_merge_maf.py\n' - '5a_cnv_genome2gene.py\n' - '5b_merge_cnv.py\n' - '6_merge_rsem.py\n' - '7_createLoadingFiles.R') -parser.add_argument('-c', '--cavatica', action='store', dest='cav', help='cavatica output file name, relevant steps: ' - '1, 2, 3, 4, 5b, 6') -parser.add_argument('-p', '--profile', action='store', dest='profile', help='cavatica profile name. requires ' - '.sevenbridges/credentials file be present. Relevant steps: 1, 6') -parser.add_argument('-j', '--config', action='store', dest='config_file', help='json config file with data types and ' - 'data locations') -parser.add_argument('-u', '--kf-url', action='store', dest='url', - help='Kids First data service url, i.e. https://kf-api-dataservice.kidsfirstdrc.org/. 
' - 'Relevant step: 2') -parser.add_argument('-t', '--tumor', action='store', dest='tumor', - help='file with tumor metadata (see step 2). Relevant step: 3, not needed if step 2 run') -parser.add_argument('-n', '--normal', action='store', dest='normal', help='file with normal metadata (see step 2)' - ' Relevant step: 3, not needed if step 2 run') -parser.add_argument('-s', '--cell-line-supplement', action='store', dest='cl_supp', help='supplemental file with cell ' - 'line meta data - bs idtype. Relevant step: 2, OPTIONAL') -parser.add_argument('-i', '--header', action='store', dest='header', help='File with maf header only. Relevant step: 4') -parser.add_argument('-m', '--maf-dir', action='store', dest='maf_dir', help='maf file directory. Relevant step: 4') -parser.add_argument('-d', '--cnv-dir', action='store', dest='cnv_dir', help='cnv as genome coords file directory.' - ' Relevant step: 5a') -parser.add_argument('-e', '--cnv-dir-gene', action='store', dest='cnv_dir_gene', help='cnv as gene file directory.' - ' Relevant step: 5b, not needed if step 5a run') -parser.add_argument('-r', '--rsem-dir', action='store', dest='rsem_dir', help='rsem file directory. Relevant step: 6') - -if len(sys.argv) == 1: - parser.print_help() - sys.exit(1) - -args = parser.parse_args() - -steps = args.steps.split(',') -valid_steps = ['all', '1', '2', '3', '4', '5a', '5b', '6', '7'] -s_flag = 0 -for step in steps: - if step not in valid_steps: - sys.stderr.write('Invalid step detected: ' + step + '\n') - s_flag = 1 -if s_flag: - sys.stderr.write('Invalid step or steps used. Please specify all or list as csv, i.e. 1,2,3 (no spaces!)\n') - sys.exit(1) - -config_file = args.config_file -init_in = [config_file] - -if config_file is not None: - sys.stderr.write('Doing a quick check to see if config file ' + config_file + ' exists\n') -else: - sys.stderr.write('No config file set! Need file for param -j or --config\n') - sys.exit(1) -check_files(init_in, 'WRAP', '0') -with open(config_file) as f: - config_data = json.load(f) -script_dir = config_data['script_dir'] -if script_dir[-1] != '/': - script_dir += '/' -sys.stderr.write('Doing a quick check to see if script dir ' + script_dir + ' exists\n') -check_dirs([script_dir], 'WRAP', '0') -sys.stderr.write('Doing a quick check to see if relevant vars in config file are set and files and dirs actually ' - 'exist for step(s) ' + args.steps + '\n') -config_precheck() - -if 'all' in steps or '1' in steps: - step1 = '1_query_cavatica.py' - cmd = script_dir + step1 + ' --output ' + args.cav + ' --profile ' + args.profile + ' --config ' \ - + config_file + ' 2> step1.errs' - sys.stderr.write('Running step 1: ' + cmd + '\n') - sys.stderr.flush() - try: - subprocess.check_call(cmd, shell=True) - except Exception as e: - print(e) - sys.stderr.write('Failed at step 1. Maybe check cavatica profile and if sevenbridges api is installed?\n') - sys.exit(1) - outputs = [args.cav] - check_files(outputs, 'OUTPUT', '1') - -if 'all' in steps or '2' in steps: - step2 = '2_query_ds_by_bs_id.py' - inputs = [args.cav] - check_files(inputs, 'INPUT', step2) - cmd = script_dir + step2 + ' --kf-url ' + args.url + ' --cavatica ' + args.cav + ' --config ' \ - + config_file + ' 2> step2.errs' - sys.stderr.write('Running step 2: ' + cmd + '\n') - sys.stderr.flush() - try: - subprocess.check_call(cmd, shell=True) - except Exception as e: - print(e) - sys.stderr.write('Failed at step 2.
Maybe the url is wrong or inaccessible?\n') - sys.exit(1) - outputs = ['tum_bs_ds_info.txt', 'norm_bs_ds_info.txt'] - check_files(outputs, 'OUTPUT', '2') - -if 'all' in steps or '3' in steps: - tumor = args.tumor - normal = args.normal - step3 = '3_create_data_sheets.py' - if '2' in steps or 'all' in steps: - tumor = 'tum_bs_ds_info.txt' - normal = 'norm_bs_ds_info.txt' - inputs = [tumor, normal, args.cav] - check_files(inputs, 'INPUT', step3) - cmd = script_dir + step3 + ' --cavatica ' + args.cav + ' --config ' + config_file + ' --tumor ' \ - + tumor + ' --normal ' + normal - if args.cl_supp is not None: - cmd += ' --cell-line-supplement ' + args.cl_supp - cmd += ' 2> step3.errs' - sys.stderr.write('Running step 3: ' + cmd + '\n') - sys.stderr.flush() - subprocess.call(cmd, shell=True) - outputs = ['datasheets'] - check_dirs(outputs, 'OUTPUT', '3') - -if 'all' in steps or '4' in steps: - step4 = '4_merge_maf.py' - input_files = [args.header, args.cav] - check_files(input_files, 'INPUT', step4) - input_dirs = [args.maf_dir] - check_dirs(input_dirs, 'INPUT', step4) - cmd = script_dir + step4 + ' --header ' + args.header + ' --cavatica ' + args.cav + ' --config ' \ - + config_file + ' --maf-dir ' + args.maf_dir + ' 2> step4.errs' - sys.stderr.write('Running step 4: ' + cmd + '\n') - sys.stderr.flush() - subprocess.call(cmd, shell=True) - output_dirs = ['merged_mafs'] - check_dirs(output_dirs, 'OUTPUT', '4') - -if 'all' in steps or '5a' in steps: - step5a = '5a_cnv_genome2gene.py' - if config_data['cna_flag']: - input_dirs = [args.cnv_dir] - check_dirs(input_dirs, 'INPUT', step5a) - cmd = script_dir + step5a + ' --config ' + config_file + ' --cnv-dir ' + args.cnv_dir \ - + ' 2> step5a.errs' - sys.stderr.write('Running step 5a: ' + cmd + '\n') - sys.stderr.flush() - try: - subprocess.check_call(cmd, shell=True) - except Exception as e: - print(e) - sys.stderr.write('Failed at step 5a. Maybe bedtools is improperly defined?\n') - sys.exit(1) - output_dirs = ['converted_cnvs'] - check_dirs(output_dirs, 'OUTPUT', '5a') - else: - sys.stderr.write('cna_flag set to 0. Check that you have CNA data to process and set to 1 in the config file\n') - sys.stderr.flush() - -if 'all' in steps or '5b' in steps: - step5b = '5b_merge_cnv.py' - if config_data['cna_flag']: - input_files = [args.cav] - check_files(input_files, 'INPUT', step5b) - cnv_gene_dir = args.cnv_dir_gene - if 'all' in steps or '5a' in steps: - cnv_gene_dir = 'converted_cnvs' - input_dirs = [cnv_gene_dir] - check_dirs(input_dirs, 'INPUT', step5b) - cmd = script_dir + step5b + ' --cavatica ' + args.cav + ' --config ' + config_file + ' --cnv-dir-gene ' \ - + cnv_gene_dir + ' 2> step5b.errs' - sys.stderr.write('Running step 5b: ' + cmd + '\n') - sys.stderr.flush() - subprocess.call(cmd, shell=True) - output_dirs = ['merged_cnvs'] - check_dirs(output_dirs, 'OUTPUT', '5b') - else: - sys.stderr.write('cna_flag set to 0. 
Check that you have CNA data to process and set to 1 in the config file\n') - sys.stderr.flush() - -if 'all' in steps or '6' in steps: - step6 = '6_merge_rsem.py' - if config_data['rna_flag']: - input_files = [args.cav] - check_files(input_files, 'INPUT', step6) - input_dirs = [args.rsem_dir] - check_dirs(input_dirs, 'INPUT', step6) - cmd = script_dir + step6 + ' --cavatica ' + args.cav + ' --config ' + config_file + ' --rsem-dir ' \ - + args.rsem_dir + ' --profile ' + args.profile + ' 2> step6.errs' - sys.stderr.write('Running step 6: ' + cmd + '\n') - sys.stderr.flush() - subprocess.call(cmd, shell=True) - output_dirs = ['merged_rsem'] - check_dirs(output_dirs, 'OUTPUT', '6') - else: - sys.stderr.write('rna_flag set to 0. Check that you have RNA data to process and set to 1 in the config file\n') - sys.stderr.flush() - -if 'all' in steps or '7' in steps: - input_files = [] - if config_data['rna_flag']: - input_files.append(config_data['fpkmFileLoc']) - check_files(input_files, 'INPUT', '7') - cmd = 'Rscript ' + script_dir + '7_createLoadingFiles.R ' + config_file + ' 2> step7.errs > step7.out' - sys.stderr.write('Running step 7: ' + cmd + '\n') - sys.stderr.flush() - subprocess.call(cmd, shell=True) - output_dirs = ['processed'] - check_dirs(output_dirs, 'OUTPUT', '7') - -sys.stderr.write('Pipe complete, check logs\n') diff --git a/scripts/v1_etl_deprecated/1_query_cavatica.py b/scripts/v1_etl_deprecated/1_query_cavatica.py deleted file mode 100755 index 45b6de8..0000000 --- a/scripts/v1_etl_deprecated/1_query_cavatica.py +++ /dev/null @@ -1,86 +0,0 @@ -#!/usr/bin/env python3 - -import sevenbridges as sbg -import sys -import argparse -import concurrent.futures -from sevenbridges.http.error_handlers import rate_limit_sleeper, maintenance_sleeper -import re -import json - - -def process_project_tasks(project, task, atype, cna_flag=None): - task_name = task.name.rstrip('\n') - task_id = task.id - if atype == 'DNA' and 'strelka2_vep_maf' in task.outputs and task.status == 'COMPLETED' and \ - re.search(config_data['dna_task_keyword'], task.name): - try: - (bs_id1, bs_id2) = re.findall('BS_[A-Z0-9]*',task.name) - tum_bs_id = bs_id1 - norm_bs_id = bs_id2 - if config_data['tn_order'] == 'NT': - tum_bs_id = bs_id2 - norm_bs_id = bs_id1 - outs = task.outputs['strelka2_vep_maf'].name - if cna_flag == 1: - outs += ',' + task.outputs['cnv_pval'].name - return '\t'.join((tum_bs_id, norm_bs_id, task_name, task_id, atype, outs, project)) + '\n' - except Exception as e: - sys.stderr.write(str(e) + '\n') - sys.stderr.write('Failed to process ' + task.id + ' ' + task.name + ', skipping\n') - elif atype == 'RNA': - try: - if task.outputs is not None and 'RSEM_gene' in task.outputs: - bs_id = task.outputs['RSEM_gene'].metadata['Kids First Biospecimen ID'] - outs = task.outputs['RSEM_gene'].name - return '\t'.join((bs_id, 'NA', task_name, task_id, atype, outs, project)) + '\n' - - except Exception as e: - sys.stderr.write(str(e) + '\n') - sys.stderr.write('Task ' + task_name + ' with ID ' + task_id + ' failed to gather all info\n') - sys.stderr.flush() - return None - - -def get_file_info(api, project, atype, out_fh, cna_flag=None): - proj_tasks = api.tasks.query(project=project).all() - sys.stderr.write('Processing project ' + project + '\n') - x = 1 - n = 20 - with concurrent.futures.ThreadPoolExecutor(config_data['cpus']) as executor: - results = {executor.submit(process_project_tasks, project, task, atype, cna_flag): task for task in proj_tasks} - for result in concurrent.futures.as_completed(results): - 
if x % n == 0: - sys.stderr.write('Processed ' + str(x) + ' tasks\n') - sys.stderr.flush() - outstr = result.result() - if outstr is not None: - out_fh.write(outstr) - x += 1 - - -parser = argparse.ArgumentParser(description='Get all relevant analyzed file outputs from projects in cavatica.') -parser.add_argument('-o', '--output', action='store', dest='out', help='output file name') -parser.add_argument('-p', '--profile', action='store', dest='profile', help='cavatica profile name. requires ' - '.sevenbridges/credentials file be present') -parser.add_argument('-c', '--config', action='store', dest='config_file', help='json config file with data types and ' - 'data locations') - -args = parser.parse_args() -config = sbg.Config(profile=args.profile) -api = sbg.Api(config=config, error_handlers=[ - sbg.http.error_handlers.rate_limit_sleeper, - sbg.http.error_handlers.maintenance_sleeper]) -with open(args.config_file) as f: - config_data = json.load(f) -dna_types = config_data['cav_dna_projects'] -rna_types = config_data['cav_rna_projects'] -out_fh = open(args.out, 'w') -out_fh.write('T/CL BS ID\tNorm BS ID\tTask Name\tTask ID\tAnalyte Type\tRelevant Outputs\tSource Project\n') -for project in dna_types: - get_file_info(api, project, 'DNA', out_fh, config_data['cna_flag']) -for project in rna_types: - get_file_info(api, project, 'RNA', out_fh) -out_fh.close() -sys.stderr.write('Complete\n') - diff --git a/scripts/v1_etl_deprecated/6b_z_score_transform.py b/scripts/v1_etl_deprecated/6b_z_score_transform.py deleted file mode 100644 index d482227..0000000 --- a/scripts/v1_etl_deprecated/6b_z_score_transform.py +++ /dev/null @@ -1,60 +0,0 @@ -import sys -import argparse -import os -import concurrent.futures -import json -import subprocess -import pandas as pd -import numpy as np -from scipy import stats - - -parser = argparse.ArgumentParser(description='Merge rsem files using cavatica file info.') -parser.add_argument('-d', '--merged_rsem_dir', action='store', dest='merged_rsem_dir', help='merged rsem dir') -parser.add_argument('-t', '--table', action='store', dest='table', - help='Table with cbio project, kf bs ids, cbio IDs, and file names, ' + - 'give only if you want to normalize first across all tables, then split') -parser.add_argument('-j', '--config', action='store', dest='config_file', help='json config file with data types ' - 'and data locations') -args = parser.parse_args() - -with open(args.config_file) as f: - config_data = json.load(f) - -flist = subprocess.check_output('find ' + args.merged_rsem_dir + ' -name *.rsem_merged.txt', shell=True) -fname_list = flist.decode().split('\n') -if fname_list[-1] == '': - fname_list.pop() - -if args.table == None: - for fname in fname_list: - data = pd.read_csv(fname, sep="\t", index_col=0) - header = list(data.columns) - gene_list = list(data.index.values) - if len(header) > 2: - new_fname = fname.replace('.rsem_merged.txt', '.rsem_merged_zscore.txt') - z_scored = stats.zscore(np.array(data), axis=1) - new_data = pd.DataFrame(z_scored, index=gene_list, columns=header) - new_data.fillna('NA', inplace=True) - sys.stderr.write('Outputting results to ' + new_fname + '\n') - new_data.to_csv(new_fname, sep='\t', mode='w', index=True) - else: - sys.stderr.write('Only 1 sample found in ' + fname + ', skipping!\n') -else: - mega_tbl = pd.read_csv(fname_list[0], sep="\t", index_col=0) - mega_header = list(mega_tbl.columns) - gene_list = list(mega_tbl.index.values) - sample_info = pd.read_csv(args.table, sep="\t") - for i in range(1, len(fname_list), 1): 
- data = pd.read_csv(fname_list[i], sep="\t", index_col=0) - mega_header.extend(list(data.columns)) - mega_tbl = pd.concat([mega_tbl, data]) - log_tf = np.log(np.array(mega_tbl)) - z_scored = stats.zscore(log_tf, axis = 1) - mega_df = pd.DataFrame(z_scored, index=gene_list, columns=header) - cbio_proj = sample_info.Cbio_project.unique() - for proj in cbio_proj: - new_fname = proj + ".rsem_merged_zscore.txt" - cur = mega_df[[proj]] - cur.fillna('NA', inplace=True) - cur.to_csv(new_fname, sep='\t', mode='w', index=True) \ No newline at end of file diff --git a/scripts/v1_etl_deprecated/7_createLoadingFiles.R b/scripts/v1_etl_deprecated/7_createLoadingFiles.R deleted file mode 100644 index 724c489..0000000 --- a/scripts/v1_etl_deprecated/7_createLoadingFiles.R +++ /dev/null @@ -1,365 +0,0 @@ -#!/usr/bin/env Rscript -############################################## -#Purpose: Code to create Pedcbioportal loading files -#Date: 10/5/2018 -#Author: Pichai Raman, Modified by Miguel Brown -############################################## - -############################################## -#Rough outline -#1. We will start with the clinical file and update it with meta file -#2. We will find corresponding MAF files and create MAF meta file -#3. We will then find corresponding RNA-Seq files and update -############################################## - -#Set Mutation, CNA, and Datasheets Directory -args = commandArgs(trailingOnly=TRUE) -if (length(args)==0) { - stop("Please supply a json config file\n", call.=FALSE) -} -library("rjson") -config_data <- fromJSON(file=args[1]) -data_dir = config_data$data_dir -mutDir <- paste(data_dir, "/merged_mafs/", sep = "") -cnaDir <- paste(data_dir, "/merged_cnvs/", sep = "") -hasRNA = config_data$rna_flag -if (hasRNA){ - fusion_dir <- paste(data_dir, config_data$fusion_dir, sep = "") -} -hasCNA = config_data$cna_flag -dataSheetsDir <- paste(data_dir, "/datasheets/", sep = "") -diseaseMappingFile <- paste(data_dir, "/", config_data$dx_tbl_fn, sep = ""); - - -#Source RNA-Seq Functions -script_dir = config_data$script_dir -# source("RNALoadingHelper.R") -source(paste(script_dir,"/CNALoadingHelper.R", sep = "")) - -#Create Meta File for MAF files -createMutationsMeta <- function(myLoc=NULL, cancStudyID=NULL, myDesc=NULL) -{ - #Create File - myFile <- paste(myLoc, "/meta_mutations_extended.txt", sep=""); - file.create(myFile); - - #Add Cancer Study - cancStudyIDLine <- paste("cancer_study_identifier: ", cancStudyID, sep="") - write(cancStudyIDLine, myFile, append=TRUE) - - #Add stable id - stableIDTypeLine <- "stable_id: mutations" - write(stableIDTypeLine, myFile, append=TRUE) - - #Add Profile Name - profileNameLine <- "profile_name: Mutations" - write(profileNameLine, myFile, append=TRUE) - - #Add Profile Description - profileDescLine <- paste("profile_description: ", myDesc, sep="") - write(profileDescLine, myFile, append=TRUE) - - #Add Genetic Alteration Type - genAltTypeLine <- "genetic_alteration_type: MUTATION_EXTENDED"; - write(genAltTypeLine, myFile, append=TRUE) - - #Add Data Type - dataTypeLine <- "datatype: MAF" - write(dataTypeLine, myFile, append=TRUE) - - #Add Show Profile in analysis Tab - showProfileInTabLine <- "show_profile_in_analysis_tab: true" - write(showProfileInTabLine, myFile, append=TRUE) - - #Add Data_filename - dataFileLine <- "data_filename: data_mutations_extended.txt" - write(dataFileLine, myFile, append=TRUE) - - print(paste("Done Creation Mutations Meta for", cancStudyID, sep = " ")); -} - -#Create Meta File for MAF files -createMetaStudy 
<- function(myLoc=NULL, myTypeOfCancer=NULL, cancStudyID=NULL, myName=NULL, myDesc=NULL, myShortName=NULL, myGroups=NULL) -{ - #Create File - myFile <- paste(myLoc, "/meta_study.txt", sep=""); - file.create(myFile); - - #Add Type of Cancer - typeOfCancerLine <- paste("type_of_cancer: ", myTypeOfCancer, sep="") - write(typeOfCancerLine, myFile, append=TRUE) - - #Add Cancer Study - cancStudyIDLine <- paste("cancer_study_identifier: ", cancStudyID, sep="") - write(cancStudyIDLine, myFile, append=TRUE) - - #Add Profile Name - nameLine <- paste("name: ", myName, sep="") - write(nameLine, myFile, append=TRUE) - - #Add Profile Description - descLine <- paste("description: ", myDesc, sep="") - write(descLine, myFile, append=TRUE) - - #Add Data_filename - shortNameLine <- paste("short_name: ", myShortName, sep="") - write(shortNameLine, myFile, append=TRUE) - - #Add group - groupLine <- paste("groups: ", myGroups, sep="") - write(groupLine, myFile, append=TRUE) - - print(paste("Done Creation Meta Study for", cancStudyID, sep = " ")); -} - -#Create Meta Clinical Sample -createMetaSample <- function(myLoc=NULL, cancStudyID=NULL) -{ - #Create File - myFile <- paste(myLoc, "/meta_clinical_sample.txt", sep=""); - file.create(myFile); - - #Add Cancer Study - cancStudyIDLine <- paste("cancer_study_identifier: ", cancStudyID, sep="") - write(cancStudyIDLine, myFile, append=TRUE) - - #Add Genetic Alteration type - genAltLine <- "genetic_alteration_type: CLINICAL" - write(genAltLine, myFile, append=TRUE) - - #Add Profile Description - datTypeLine <- "datatype: SAMPLE_ATTRIBUTES" - write(datTypeLine, myFile, append=TRUE) - - #Add Data_filename - datFileNameLine <- "data_filename: data_clinical_sample.txt" - write(datFileNameLine, myFile, append=TRUE) - print(paste("Done Creation Meta Sample for", cancStudyID, sep=" ")); -} - -#Create Meta Clinical Patient -createMetaClinical <- function(myLoc=NULL, cancStudyID=NULL) -{ - #Create File - myFile <- paste(myLoc, "/meta_clinical_patient.txt", sep=""); - file.create(myFile); - - #Add Cancer Study - cancStudyIDLine <- paste("cancer_study_identifier: ", cancStudyID, sep="") - write(cancStudyIDLine, myFile, append=TRUE) - - #Add Genetic Alteration type - genAltLine <- "genetic_alteration_type: CLINICAL" - write(genAltLine, myFile, append=TRUE) - - #Add Profile Description - datTypeLine <- "datatype: PATIENT_ATTRIBUTES" - write(datTypeLine, myFile, append=TRUE) - - #Add Data_filename - datFileNameLine <- "data_filename: data_clinical_patient.txt" - write(datFileNameLine, myFile, append=TRUE) - print(paste("Done Creation Meta Patient for", cancStudyID, sep=" ")); -} - - -createCaseLists <- function(myLoc=NULL, cancStudyID=NULL, cancStableID=NULL, caseListName=NULL, caseListDesc=NULL, caseListCat=NULL, myCaseListIDs=NULL) -{ - #Create File - myFile <- paste(myLoc, sep=""); - file.create(myFile); - - #Add Cancer Study - cancStudyIDLine <- paste("cancer_study_identifier: ", cancStudyID, sep="") - write(cancStudyIDLine, myFile, append=TRUE) - - #Add Cancer Stable ID - cancStableIDLine <- paste("stable_id: ", cancStableID, sep="") - write(cancStableIDLine, myFile, append=TRUE) - - #Add Case List Name - caseNameLine <- paste("case_list_name: ", caseListName, sep="") - write(caseNameLine, myFile, append=TRUE) - - #Add Case List Description - caseDescLine <- paste("case_list_description: ", caseListDesc, sep="") - write(caseDescLine, myFile, append=TRUE) - - #Add Case List Category - caseCatLine <- paste("case_list_category: ", caseListCat, sep="") - write(caseCatLine, myFile, 
append=TRUE) - - #Add list ids - myCaseListIDs <- paste(myCaseListIDs, collapse="\t") - caseListIDsLine <- paste("case_list_ids: ", myCaseListIDs, sep="") - write(caseListIDsLine, myFile, append=TRUE) - - print(paste("Done Creation of case list for", cancStudyID, sep=" ")); - -} - - - -createStudyAll <- function(myDisease=NULL, - myName=NULL, - myDescMS=NULL, - myDescMM=config_data$mut_desc, - myDescMRF="profile_description: Expression levels from RNA-Seq (FPKM)", - myDescMRZ="profile_description: Expression levels from RNA-Seq (z-scores)", - myDescMC="profile_description: Predicted copy number values from WGS (Continuous)", - myDescMCD="profile_description: Predicted copy number values from WGS (Discrete)") -{ - myDiseaseStudyFolder <- paste("./processed/", config_data$cancStudyID, sep="") - myCancStudyID <- config_data$cancStudyID; - if (config_data$prepend_cancStudyID_flag){ - myDiseaseStudyFolder <- paste("./processed/", myDisease, "_", config_data$cancStudyID, sep="") - myCancStudyID <- paste(myDisease, "_", config_data$cancStudyID, sep=""); - } - caseListFolder <- paste(myDiseaseStudyFolder, "/case_lists", sep="") - - #Create Directory - dir.create(myDiseaseStudyFolder, recursive = TRUE) - dir.create(caseListFolder, recursive = TRUE); - #Move clinical and patient files - system(paste("cp ", dataSheetsDir, myDisease, "/data* ", myDiseaseStudyFolder, sep="")); - - #Move MAF File - system(paste("cp ", mutDir, myDisease, ".strelka.vep.filtered.maf ", myDiseaseStudyFolder, "/data_mutations_extended.txt", sep="")); - - if(hasCNA){ - #Move CNA File - system(paste("cp ", cnaDir, myDisease, ".predicted_cnv.txt ", myDiseaseStudyFolder, "/data_linear_CNA.txt", sep="")); - system(paste("cp ", cnaDir, myDisease, ".discrete_cnvs.txt ", myDiseaseStudyFolder, "/data_CNA.txt", sep="")); - - #Create CNV meta files - createCNVMeta(myDiseaseStudyFolder, cancStudyID=myCancStudyID, myDescMC); - createCNVDiscreteMeta(myDiseaseStudyFolder, cancStudyID=myCancStudyID, myDescMCD); - #Case List CNA - tmpData <- read.delim(paste(myDiseaseStudyFolder, "/data_linear_CNA.txt", sep="")); - tmpSamps <- as.character(colnames(tmpData)); - tmpSamps <- tmpSamps[2:length(tmpSamps)] - tmpSamps <- gsub("^X", "", tmpSamps) - tmpSamps <- gsub("\\.", "-", tmpSamps) - createCaseLists(paste(myDiseaseStudyFolder, "/case_lists/cases_cna.txt", sep=""), - cancStudyID=myCancStudyID, - cancStableID=paste(myCancStudyID, "_cna", sep=""), - caseListName="Tumor Samples with CNA data", - caseListDesc=paste("All tumors with CNA data (", length(tmpSamps), " samples)", sep=""), - caseListCat="all_cases_with_cna_data", - myCaseListIDs=tmpSamps) - } - - #Create Study met file - createMetaStudy(myDiseaseStudyFolder, - myTypeOfCancer=myDisease, - cancStudyID=myCancStudyID, - myName=myName, - myDesc=myDescMS, - myShortName=myCancStudyID, - myGroups=config_data$group - ); - - #Create mutation meta file - createMutationsMeta(myDiseaseStudyFolder, - cancStudyID=myCancStudyID, - myDesc=myDescMM); - - if(hasRNA) - { - #Create Expression meta files - z_check = createRNASeqExpMatrix(loc=myDiseaseStudyFolder, myDisease=myDisease, dataSheetsDir=dataSheetsDir, res=res); - createExpressionMeta(myDiseaseStudyFolder, cancStudyID=myCancStudyID, myDescMRF); - if (z_check){ - createExpressionZMeta(myDiseaseStudyFolder, cancStudyID=myCancStudyID, myDescMRZ); - } - #Copy fusions files - # system(paste("cp ", fusion_dir, myDisease, "/data_fusions.txt ", myDiseaseStudyFolder, "/data_fusions.txt", sep="")); - # system(paste("cp ", fusion_dir, myDisease, 
"/meta_FUSION.txt ", myDiseaseStudyFolder, "/meta_FUSION.txt", sep="")); - } - - - #Create meta sample & Clinical - createMetaSample(myDiseaseStudyFolder, cancStudyID=myCancStudyID); - createMetaClinical(myDiseaseStudyFolder, cancStudyID=myCancStudyID); - - #Create case lists - #Case List All - tmpData <- read.delim(paste(myDiseaseStudyFolder, "/data_clinical_sample.txt", sep=""), skip=4); - tmpSamps <- as.character(unique(tmpData[,"SAMPLE_ID"])); - createCaseLists(paste(myDiseaseStudyFolder, "/case_lists/cases_all.txt", sep=""), - cancStudyID=myCancStudyID, - cancStableID=paste(myCancStudyID, "_all", sep=""), - caseListName="All Tumors", - caseListDesc=paste("All tumor samples (", length(tmpSamps), " samples)", sep=""), - caseListCat="all_cases_in_study", - myCaseListIDs=tmpSamps) - print("Done Creation cases all"); - - #Case List Mutations - print("Creating mutations case list"); - tmpData <- read.delim(paste(mutDir, myDisease, ".strelka.vep.filtered.maf", sep=""), skip=1, row.names=NULL); - tmpSamps <- as.character(unique(tmpData[,"Tumor_Sample_Barcode"])); - print("Read in mutations"); - #For now, fusions are lumped into mutation data, need to concat together - if (hasRNA){ - print("RNA flag given, reading fusions as mutations"); - tmpData1 <- read.delim(paste(fusion_dir, myDisease, "/data_fusions.txt", sep=""), row.names=NULL); - tmpSamps1 <- as.character(unique(tmpData[,"Tumor_Sample_Barcode"])); - tmpSamps <- unique(rbind(tmpSamps, tmpSamps1)) - print("Done reading fusions"); - } - createCaseLists(paste(myDiseaseStudyFolder, "/case_lists/cases_sequenced.txt", sep=""), - cancStudyID=myCancStudyID, - cancStableID=paste(myCancStudyID, "_sequenced", sep=""), - caseListName="All Sequenced Tumors", - caseListDesc=paste("All sequenced tumor samples (", length(tmpSamps), " samples)", sep=""), - caseListCat="all_cases_with_mutation_data", - myCaseListIDs=tmpSamps) - print("Created sequenced case list"); - - if(hasRNA) - { - #Case List RNA - print("Creating RNA case list"); - tmpData <- read.delim(paste(myDiseaseStudyFolder, "/data_rna_seq_v2_mrna.txt", sep="")); - tmpSamps <- as.character(colnames(tmpData)); - tmpSamps <- tmpSamps[2:length(tmpSamps)] - tmpSamps <- gsub("^X", "", tmpSamps) - tmpSamps <- gsub("\\.", "-", tmpSamps) - createCaseLists(paste(myDiseaseStudyFolder, "/case_lists/cases_RNA_Seq_v2_mRNA.txt", sep=""), - cancStudyID=myCancStudyID, - cancStableID=paste(myCancStudyID, "_rna_seq_v2_mrna", sep=""), - caseListName="Tumor Samples with mRNA data (RNA Seq V2)", - caseListDesc=paste("All samples with mRNA expression data (", length(tmpSamps), " samples)", sep=""), - caseListCat="all_cases_with_mrna_rnaseq_data", - myCaseListIDs=tmpSamps) - } - print("Created RNA case list"); -} - - -#Create studies -diseaseMapping <- read.delim(diseaseMappingFile, header=T, stringsAsFactors=F) -colnames(diseaseMapping) <- c("Disease", "Abbr", "Disease2"); -diseaseMapping <- diseaseMapping[!duplicated(diseaseMapping[,2]),] -rownames(diseaseMapping) <- diseaseMapping[,"Abbr"]; -myFiles <- list.files(dataSheetsDir) - -if(hasRNA) -{ - #Create Expression meta files - source(paste(script_dir,"/RNALoadingHelper.R", sep = "")) - res = init_res(config_data$fpkmFileLoc, config_data$mapFileLoc, config_data$genesFileLoc) -} -for(i in 1:length(myFiles)) -{ - print(i); - myAbbrev <- myFiles[i]; - myDisease <- as.character(diseaseMapping[myAbbrev,3]); - createStudyAll(myDisease=myAbbrev, - myName=paste(myDisease, config_data$study_short_desc, sep=" "), - myDescMS=paste(config_data$study_desc_p1, myDisease, 
config_data$study_desc_p2, sep=" ") - ) - -} - diff --git a/scripts/v1_etl_deprecated/CNALoadingHelper.R b/scripts/v1_etl_deprecated/CNALoadingHelper.R deleted file mode 100644 index df1fbec..0000000 --- a/scripts/v1_etl_deprecated/CNALoadingHelper.R +++ /dev/null @@ -1,89 +0,0 @@ -############################################## -#Purpose: Code to create RNA-Seq raw data per disease -#Date: 9/5/2018 -#Author: Pichai Raman -############################################## - -#Create Meta File for MAF files -createCNVMeta <- function(myLoc=NULL, cancStudyID=NULL, myDesc=NULL) -{ - #Create File - myFile <- paste(myLoc, "/meta_linear_CNA.txt", sep=""); - file.create(myFile); - - #Add Cancer Study - cancStudyIDLine <- paste("cancer_study_identifier: ", cancStudyID, sep="") - write(cancStudyIDLine, myFile, append=TRUE) - - #Add stable id - stableIDTypeLine <- "stable_id: linear_CNA" - write(stableIDTypeLine, myFile, append=TRUE) - - #Add Profile Name - profileNameLine <- "profile_name: copy-number values" - write(profileNameLine, myFile, append=TRUE) - - #Add Profile Description - profileDescLine <- myDesc - write(profileDescLine, myFile, append=TRUE) - - #Add Genetic Alteration Type - genAltTypeLine <- "genetic_alteration_type: COPY_NUMBER_ALTERATION"; - write(genAltTypeLine, myFile, append=TRUE) - - #Add Data Type - dataTypeLine <- "datatype: CONTINUOUS" - write(dataTypeLine, myFile, append=TRUE) - - #Add Show Profile in analysis Tab - showProfileInTabLine <- "show_profile_in_analysis_tab: false" - write(showProfileInTabLine, myFile, append=TRUE) - - #Add Data_filename - dataFileLine <- "data_filename: data_linear_CNA.txt" - write(dataFileLine, myFile, append=TRUE) - - print("Done Creation CNV Meta"); -} - -#Create Meta File for MAF files -createCNVDiscreteMeta <- function(myLoc=NULL, cancStudyID=NULL, myDesc=NULL) -{ - #Create File - myFile <- paste(myLoc, "/meta_CNA.txt", sep=""); - file.create(myFile); - - #Add Cancer Study - cancStudyIDLine <- paste("cancer_study_identifier: ", cancStudyID, sep="") - write(cancStudyIDLine, myFile, append=TRUE) - - #Add stable id - stableIDTypeLine <- "stable_id: cna" - write(stableIDTypeLine, myFile, append=TRUE) - - #Add Profile Name - profileNameLine <- "profile_name: Binned copy-number values" - write(profileNameLine, myFile, append=TRUE) - - #Add Profile Description - profileDescLine <- myDesc - write(profileDescLine, myFile, append=TRUE) - - #Add Genetic Alteration Type - genAltTypeLine <- "genetic_alteration_type: COPY_NUMBER_ALTERATION"; - write(genAltTypeLine, myFile, append=TRUE) - - #Add Data Type - dataTypeLine <- "datatype: DISCRETE" - write(dataTypeLine, myFile, append=TRUE) - - #Add Show Profile in analysis Tab - showProfileInTabLine <- "show_profile_in_analysis_tab: true" - write(showProfileInTabLine, myFile, append=TRUE) - - #Add Data_filename - dataFileLine <- "data_filename: data_CNA.txt" - write(dataFileLine, myFile, append=TRUE) - - print("Done Creation CNV Meta Discrete"); -} diff --git a/scripts/v1_etl_deprecated/RNALoadingHelper.R b/scripts/v1_etl_deprecated/RNALoadingHelper.R deleted file mode 100644 index 5d8ef18..0000000 --- a/scripts/v1_etl_deprecated/RNALoadingHelper.R +++ /dev/null @@ -1,171 +0,0 @@ -############################################## -#Purpose: Code to create RNA-Seq raw data per disease -#Date: 10/5/2018 -#Author: Pichai Raman -############################################## - - -#Read in mapping file -#ExpressionFile - -init_res <- function(fpkmFileLoc=NULL, mapFileLoc=NULL, genesFileLoc=NULL) -{ - 
print(paste("Reading inputs for expression data")) - res = read.table(file=fpkmFileLoc, header=TRUE, sep="\t", check.names = FALSE) - cBioGenes <- levels(read.delim(genesFileLoc)[,2]); - - #Format RNA-Seq Rows - res[,"median"] <- apply(res[3:ncol(res)], FUN=median, MARGIN=1) - res <- res[order(-res[,"median"]),] - res <- res[!duplicated(as.character(res[,2])),] - rownames(res) <- res[,2]; - res <- res[-1:-2]; - res <- res[intersect(rownames(res), cBioGenes),] - res <- res[-ncol(res)] - - #load mapping file and rename columns - mappingFile <- read.delim(mapFileLoc, header=F, stringsAsFactors=F) - rownames(mappingFile) <- mappingFile[,2]; - res <- res[, intersect(colnames(res), rownames(mappingFile))] - colnames(res) <- mappingFile[colnames(res),1] - return(res) - -} - -myZ <- function(x) -{ - x <- (x-mean(x))/sd(x); - return(x); -} - -createRNASeqExpMatrix <- function(loc=NULL, myDisease=NULL, dataSheetsDir=NULL, res=NULL) -{ - print(paste("Reading datasheet info for", myDisease, sep=" ")) - # z score out check - writeOut <- 0 - #Filter and write out - tmpData <- read.delim(paste(dataSheetsDir, myDisease, "/data_clinical_sample.txt", sep=""), skip=4); - tmpSamps <- as.character(tmpData[,"SAMPLE_ID"]); - tmpMat <- data.frame(res[,intersect(tmpSamps, colnames(res))]); - if(ncol(tmpMat)==1) - { - colnames(tmpMat)<- intersect(tmpSamps, colnames(res)); - rownames(tmpMat) <- rownames(res); - } - if(length(intersect(tmpSamps, colnames(res)))>0) - { - dataFPKM <- data.frame(rownames(tmpMat), tmpMat); - - #Write out expression matrix - colnames(dataFPKM)[1] <- c("Hugo_Symbol") - colnames(dataFPKM) <- gsub("^X", "", colnames(dataFPKM)) - colnames(dataFPKM) <- gsub("\\.", "-", colnames(dataFPKM)) - write.table(dataFPKM, paste(loc, "/data_rna_seq_v2_mrna.txt", sep=""), sep="\t", row.names=F, quote=F) - - #Write out Z-score - - if (length(intersect(tmpSamps, colnames(res))) > 1){ - tmpMat <- data.frame(log2(tmpMat+1)); - tmpMat <- data.frame(t(apply(tmpMat, FUN=myZ, MARGIN=1))) - dataZ <- data.frame(rownames(tmpMat), tmpMat); - colnames(dataZ) <- colnames(dataFPKM) - write.table(dataZ, paste(loc, "/data_rna_seq_v2_mrna_median_Zscores.txt", sep=""), sep="\t", row.names=F, quote=F) - writeOut <- 1 - print(paste("Finished writing out Expression Files for", myDisease, sep=" ")); - } - else{ - print(paste("One or fewer samples for", myDisease, "detected, skipping Z score output!", sep=" ")) - } - } - return(writeOut) -} - -#Create Meta File for RNA exp files -createExpressionMeta <- function(myLoc=NULL, cancStudyID=NULL, myDesc=NULL) -{ - #Create File - myFile <- paste(myLoc, "/meta_rna_seq_v2_mrna.txt", sep=""); - file.create(myFile); - - #Add Cancer Study - cancStudyIDLine <- paste("cancer_study_identifier: ", cancStudyID, sep="") - write(cancStudyIDLine, myFile, append=TRUE) - - #Add stable id - stableIDTypeLine <- "stable_id: rna_seq_mrna" - write(stableIDTypeLine, myFile, append=TRUE) - - #Add Profile Name - profileNameLine <- "profile_name: mRNA expression" - write(profileNameLine, myFile, append=TRUE) - - #Add Profile Description - profileDescLine <- myDesc - write(profileDescLine, myFile, append=TRUE) - - #Add Genetic Alteration Type - genAltTypeLine <- "genetic_alteration_type: MRNA_EXPRESSION"; - write(genAltTypeLine, myFile, append=TRUE) - - #Add Data Type - dataTypeLine <- "datatype: CONTINUOUS" - write(dataTypeLine, myFile, append=TRUE) - - #Add Show Profile in analysis Tab - showProfileInTabLine <- "show_profile_in_analysis_tab: false" - write(showProfileInTabLine, myFile, append=TRUE) - - #Add 
Data_filename - dataFileLine <- "data_filename: data_rna_seq_v2_mrna.txt" - write(dataFileLine, myFile, append=TRUE) - - print(paste("Done Creation RNA Expression Meta for", cancStudyID, sep=" ")); -} - -#Create Meta File for RNA exp files -createExpressionZMeta <- function(myLoc=NULL, cancStudyID=NULL, myDesc=NULL) -{ - #Create File - myFile <- paste(myLoc, "/meta_rna_seq_v2_mrna_median_Zscores.txt", sep=""); - file.create(myFile); - - #Add Cancer Study - cancStudyIDLine <- paste("cancer_study_identifier: ", cancStudyID, sep="") - write(cancStudyIDLine, myFile, append=TRUE) - - #Add stable id - stableIDTypeLine <- "stable_id: rna_seq_mrna_median_Zscores" - write(stableIDTypeLine, myFile, append=TRUE) - - #Add Profile Name - profileNameLine <- "profile_name: mRNA expression z-scores" - write(profileNameLine, myFile, append=TRUE) - - #Add Profile Description - profileDescLine <- myDesc; - write(profileDescLine, myFile, append=TRUE) - - #Add Genetic Alteration Type - genAltTypeLine <- "genetic_alteration_type: MRNA_EXPRESSION"; - write(genAltTypeLine, myFile, append=TRUE) - - #Add Data Type - dataTypeLine <- "datatype: Z-SCORE" - write(dataTypeLine, myFile, append=TRUE) - - #Add Show Profile in analysis Tab - showProfileInTabLine <- "show_profile_in_analysis_tab: true" - write(showProfileInTabLine, myFile, append=TRUE) - - #Add Data_filename - dataFileLine <- "data_filename: data_rna_seq_v2_mrna_median_Zscores.txt" - write(dataFileLine, myFile, append=TRUE) - - print(paste("Done Creation Expression Z Meta", cancStudyID, sep=" ")); -} - - - - - - diff --git a/scripts/v1_etl_deprecated/add_case_lists.py b/scripts/v1_etl_deprecated/add_case_lists.py deleted file mode 100755 index d974066..0000000 --- a/scripts/v1_etl_deprecated/add_case_lists.py +++ /dev/null @@ -1,81 +0,0 @@ -#!/usr/bin/env python3 - -import sys -import argparse -import os -import subprocess -import pdb - -def process_col_samps(h_cnt, s_col, fname, samp_dict): - cur = open(fname) - temp = {} - for x in range(h_cnt): - skip = next(cur) - for row in cur: - data = row.rstrip('\n').split('\t') - samp = data[s_col] - if samp not in temp: - temp[samp] = 1 - if samp not in samp_dict: - samp_dict[samp] = 1 - else: - samp_dict[samp] += 1 - cur.close() - return samp_dict - -def process_row_samps(s_col, fname, samp_dict): - cur = open(fname) - head = next(cur) - header = head.rstrip('\n').split('\t') - for x in range(s_col, len(header), 1): - if header[x] not in samp_dict: - samp_dict[header[x]] = 1 - else: - samp_dict[header[x]] += 1 - cur.close() - return samp_dict - - -parser = argparse.ArgumentParser(description='Add case lists to existing projects') -parser.add_argument('-t', '--type', action='store', dest='ctype', help='case_list type to gen, i.e. cnaseq, 3way_complete, SCRIPT SHOULD BE RUN RIGHT AT STUDY DIR LEVEL') - -args = parser.parse_args() -if len(sys.argv) == 1: - parser.print_help() - sys.exit(1) - -# files to search for case building -clist_fdict = {'cnaseq': {'fnames': ['data_mutations_extended.txt', 'data_CNA.txt'], - 'name': 'Tumor samples with mutation and CNA data','desc': 'All tumor samples with mutation and CNA data','cat': 'all_cases_with_mutation_and_cna_data'} , - '3way_complete': { 'fnames': ['data_mutations_extended.txt', 'data_CNA.txt','data_rna_seq_v2_mrna.txt'], - 'name': 'Tumor samples with mutation, CNA and mRNA data','desc': 'All tumor samples with mutation, CNA, and mRNA data','cat': 'all_cases_with_mutation_and_cna_and_mrna_data'}} - -f_cmd = 'find .
-name ' + clist_fdict[args.ctype]['fnames'][0] -result = subprocess.check_output(f_cmd, shell=True) -muts_list = result.decode().split('\n') -for mutFile in muts_list: - if mutFile != '': - s_dict = {} - project = os.path.dirname(mutFile) - project = project.replace('./','') - sys.stderr.write("Generating case lists for project " + project + "\n") - sys.stderr.write("Processing " + mutFile + "\n") - s_dict = process_col_samps(2, 13, mutFile, s_dict) - for i in range(1, len(clist_fdict[args.ctype]['fnames'])): - fname = project + "/" + clist_fdict[args.ctype]['fnames'][i] - sys.stderr.write("Processing " + fname + "\n") - s_dict = process_row_samps(1, fname, s_dict) - - new_case_list = open(project + "/case_lists/cases_" + args.ctype + ".txt", 'w') - new_case_list.write('cancer_study_identifier: ' + project + '\nstable_id: ' + project + '_' + args.ctype + '\ncase_list_name: ' + clist_fdict[args.ctype]['name'] + '\n') - count = len(clist_fdict[args.ctype]['fnames']) - s_list = [] - for samp in s_dict: - if s_dict[samp] == count: - s_list.append(samp) - total = len(s_list) - new_case_list.write('case_list_description: ' + clist_fdict[args.ctype]['desc'] + ' (' + str(total) + ' samples)\ncase_list_category: ' + clist_fdict[args.ctype]['cat'] + '\ncase_list_ids: ' + '\t'.join(s_list)) - new_case_list.close() - sys.stderr.write("Completed creating case list for " + project + "\n") - else: - sys.stderr.write(clist_fdict[args.ctype]['fnames'][0] + " path was empty str. skipping!\n") diff --git a/scripts/v1_etl_deprecated/entrez_id_fusion_format.R b/scripts/v1_etl_deprecated/entrez_id_fusion_format.R deleted file mode 100644 index 5a3825b..0000000 --- a/scripts/v1_etl_deprecated/entrez_id_fusion_format.R +++ /dev/null @@ -1,147 +0,0 @@ -suppressPackageStartupMessages(library("readr")) -suppressPackageStartupMessages(library("dplyr")) -suppressPackageStartupMessages(library("optparse")) -suppressPackageStartupMessages(library(org.Hs.eg.db)) -hs <- org.Hs.eg.db - - -option_list <- list( - make_option(c("-f", "--fusionfile"),type="character", - help="Merged fusion calls from [STARfusion | Arriba] QC filtered for pedcbio"), - make_option(c("-a", "--approvedHugoSymbols"), type="character", - help="Hugo symbol approved gene name list"), - make_option(c("-m","--manifest"),type="character", - help="manifest with Tumor_Sample_Barcode\tCenter"), - make_option(c("-o","--outputfile"),type="character", - help="Standardized fusion calls from [STARfusion | Arriba] (.TSV)") -) - -# Get command line options, if help option encountered print help and exit, -opt <- parse_args(OptionParser(option_list=option_list)) -inputfile <- opt$fusionfile -approved <- opt$approvedHugoSymbols -manifest<-opt$manifest -outputfile <- opt$outputfile - - - -total<-read_tsv(inputfile) %>% as.data.frame() -genes<-c(total$Gene1A,total$Gene1B,total$Gene2A,total$Gene2B) -manifest<-read_tsv(manifest) %>% as.data.frame() %>% unique() - - -my.symbols<-genes -entrez<-select(hs, keys=my.symbols, columns= c("ENTREZID", "SYMBOL"),keytype = "SYMBOL") -nomatch<-entrez[is.na(entrez$ENTREZID),] -#genes without entrez id -#nomatch - -match<-read_tsv(approved) %>% as.data.frame() - - -my.symbols <- total$Gene1A -if (!all(is.na(my.symbols))){ - Gene1A_entrez<-select(hs, - keys = my.symbols, - columns = c("ENTREZID", "SYMBOL"), - keytype = "SYMBOL") - Gene1A_entrez[is.na(Gene1A_entrez$ENTREZID),"ENTREZID"]="" -} else {Gene1A_entrez=data.frame("ENTREZID"="","SYMBOL"="",stringsAsFactors = FALSE)} - -my.symbols <- total$Gene2A -if (!all(is.na(my.symbols))){ -
Gene2A_entrez<-select(hs, - keys = my.symbols, - columns = c("ENTREZID", "SYMBOL"), - keytype = "SYMBOL") - Gene2A_entrez[is.na(Gene2A_entrez$ENTREZID),"ENTREZID"]="" -} else {Gene2A_entrez=data.frame("ENTREZID"="","SYMBOL"="",stringsAsFactors = FALSE)} - -my.symbols <- total$Gene1B -if (!all(is.na(my.symbols))){ - Gene1B_entrez<-select(hs, - keys = my.symbols, - columns = c("ENTREZID", "SYMBOL"), - keytype = "SYMBOL") - Gene1B_entrez[is.na(Gene1B_entrez$ENTREZID),"ENTREZID"]="" -} else {Gene1B_entrez=data.frame("ENTREZID"="","SYMBOL"="",stringsAsFactors = FALSE)} - -my.symbols <- total$Gene2B -if (!all(is.na(my.symbols))){ - Gene2B_entrez<-select(hs, - keys = my.symbols, - columns = c("ENTREZID", "SYMBOL"), - keytype = "SYMBOL") - - Gene2B_entrez[is.na(Gene2B_entrez$ENTREZID),"ENTREZID"]="" -}else {Gene2B_entrez=data.frame("ENTREZID"="","SYMBOL"="",stringsAsFactors = FALSE)} - - -# total$Gene1A_entrez<-Gene1A_entrez$ENTREZID -# total$Gene1B_entrez<-Gene1B_entrez$ENTREZID -# total$Gene2A_entrez<-Gene2A_entrez$ENTREZID -# total$Gene2B_entrez<-Gene2B_entrez$ENTREZID - -total$Gene1A[is.na(total$Gene1A)]<-"" -total$Gene2A[is.na(total$Gene2A)]<-"" -total$Gene1B[is.na(total$Gene1B)]<-"" -total$Gene2B[is.na(total$Gene2B)]<-"" - -total<-total %>% - left_join(unique(Gene1A_entrez), by=c("Gene1A"="SYMBOL")) %>% rename("ENTREZID"="Gene1A_entrez") %>% - left_join(unique(Gene2A_entrez), by=c("Gene2A"="SYMBOL")) %>% rename("ENTREZID"="Gene2A_entrez") %>% - left_join(unique(Gene1B_entrez), by=c("Gene1B"="SYMBOL")) %>% rename("ENTREZID"="Gene1B_entrez") %>% - left_join(unique(Gene2B_entrez), by=c("Gene2B"="SYMBOL")) %>% rename("ENTREZID"="Gene2B_entrez") - - - - -#aggregate -fusion_format<-data.frame("Hugo_Symbol"=total$Gene1A,"Entrez_Gene_Id"=total$Gene1A_entrez,"Tumor_Sample_Barcode"=total$Sample,"Fusion"=total$FusionName,"DNA_support"=rep("no",nrow(total)),"RNA_support"=rep("yes",nrow(total)),"Method"=total$Caller,"Frame"=total$Fusion_Type) -fusion_format<-rbind(fusion_format,data.frame("Hugo_Symbol"=total$Gene2A,"Entrez_Gene_Id"=total$Gene2A_entrez,"Tumor_Sample_Barcode"=total$Sample,"Fusion"=total$FusionName,"DNA_support"=rep("no",nrow(total)),"RNA_support"=rep("yes",nrow(total)),"Method"=total$Caller,"Frame"=total$Fusion_Type)) -fusion_format<-rbind(fusion_format,data.frame("Hugo_Symbol"=total$Gene1B,"Entrez_Gene_Id"=total$Gene1B_entrez,"Tumor_Sample_Barcode"=total$Sample,"Fusion"=total$FusionName,"DNA_support"=rep("no",nrow(total)),"RNA_support"=rep("yes",nrow(total)),"Method"=total$Caller,"Frame"=total$Fusion_Type)) -fusion_format<-rbind(fusion_format,data.frame("Hugo_Symbol"=total$Gene2B,"Entrez_Gene_Id"=total$Gene2B_entrez,"Tumor_Sample_Barcode"=total$Sample,"Fusion"=total$FusionName,"DNA_support"=rep("no",nrow(total)),"RNA_support"=rep("yes",nrow(total)),"Method"=total$Caller,"Frame"=total$Fusion_Type)) - - -fusion_format<-fusion_format %>% left_join(manifest,by=c("Tumor_Sample_Barcode"="Tumor_Sample_Barcode")) - -fusion_format<-fusion_format[!is.na(fusion_format$Hugo_Symbol),] -fusion_format<-unique(fusion_format) - -#filter other -if (nrow(fusion_format[grep("other",fusion_format$Frame),])>0){ -fusion_format<-fusion_format[-grep("other",fusion_format$Frame),] -} -#filter intergenic -if (nrow(fusion_format[grep("/",fusion_format$Fusion),])>0){ -fusion_format<-fusion_format[-grep("/", fusion_format$Fusion),] -} - - -#format for entrez== NA -fusion_format[is.na(fusion_format$Entrez_Gene_Id),"Entrez_Gene_Id"]<-"" -fusion_format<-unique(fusion_format) - -#remove blank Hugo Symbol rows caused by 
Gene2A/2B being empty for STARfusion and geneic arriba calls -fusion_format<-fusion_format[-which(fusion_format$Hugo_Symbol==""),] - -#ensure order of cols correct before print -col_order = c("Hugo_Symbol", "Entrez_Gene_Id", "Center", "Tumor_Sample_Barcode", "Fusion", "DNA_support", "RNA_support", "Method", "Frame") -fusion_format = fusion_format[, col_order] -write.table(fusion_format,outputfile,quote = FALSE,row.names = FALSE,sep="\t") - -# #get Center -# #TGEN -# fusion_format[grep("C0",fusion_format$Tumor_Sample_Barcode),"Center"]<-"TGEN" -# #BGI -# #from Yuankun "all the fq source input are from BGI" -# manifest<-read.delim("~/Downloads/1562961569658-manifest.csv",sep=",",stringsAsFactors=F) -# manifest$Tumor_Sample_Barcode<-sub("_1.fq.gz","",manifest$name) -# manifest$Tumor_Sample_Barcode<-sub("_2.fq.gz","",manifest$Tumor_Sample_Barcode) -# manifest$Tumor_Sample_Barcode<-sub("_RNA","",manifest$Tumor_Sample_Barcode) -# manifest$Tumor_Sample_Barcode<-sub("7316_","7316-",manifest$Tumor_Sample_Barcode) -# fusion_format[which(fusion_format$Tumor_Sample_Barcode %in% manifest$Tumor_Sample_Barcode),"Center"]<-"BGI" -# #NANT -# fusion_format[-which(fusion_format$Center %in% c("TGEN","BGI")),"Center"]<-"NANT" - - diff --git a/scripts/v1_etl_deprecated/merge_star_fusion.py b/scripts/v1_etl_deprecated/merge_star_fusion.py deleted file mode 100755 index 9786c29..0000000 --- a/scripts/v1_etl_deprecated/merge_star_fusion.py +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/env python3 -import sys -import os -import argparse -import json - -parser = argparse.ArgumentParser(description='Merge and add sample id to STAR Fusion output.') -parser.add_argument('-d', '--dir', action='store', dest='dir', help='star fusion file directory') -parser.add_argument('-j', '--config', action='store', dest='config_file', help='json config file with data types and ' - 'data locations') -parser.add_argument('-i', '--header', action='store', dest='header', help='File with fusion header only') -parser.add_argument('-m', '--manifest', action='store', dest='manifest', help='star fusion cavatica manifest file') - -args = parser.parse_args() -with open(args.config_file) as f: - config_data = json.load(f) -h_file = open(args.header) -header = next(h_file) -header = "tumor_id\t" + header -merged = open('merged_fusions.tsv', 'w') -merged.write(header) -map_dict = {} -for line in open(config_data['mapFileLoc']): - (samp_id, bs_id) = line.rstrip('\n').split('\t') - map_dict[bs_id] = samp_id - - - -manifest = open(args.manifest) -head = next(manifest) -header = head.rstrip('\n').split(',') -b_idx = header.index('Kids First Biospecimen ID') - -for line in manifest: - info = line.rstrip('\n').split(',') - bs_id = info[b_idx] - if bs_id in map_dict: - samp_id = map_dict[bs_id] - floc = args.dir + "/" + info[1] - fus = open(floc) - skip = next(fus) - for fdata in fus: - merged.write(samp_id + "\t" + fdata) -merged.close() \ No newline at end of file diff --git a/utilities/add_fusion_to_sample_sheets.py b/utilities/add_fusion_to_sample_sheets.py deleted file mode 100644 index 45db571..0000000 --- a/utilities/add_fusion_to_sample_sheets.py +++ /dev/null @@ -1,39 +0,0 @@ -import sys -import argparse -import pandas as pd -import os - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Add fusion flag to data clinical sample') - parser.add_argument('-f', '--fusion-dir', action='store', dest='fusion_dir', - help='Dir with fusions') - parser.add_argument('-d', '--datasheet-list', action='store', dest='datasheet_list', 
diff --git a/utilities/add_fusion_to_sample_sheets.py b/utilities/add_fusion_to_sample_sheets.py
deleted file mode 100644
index 45db571..0000000
--- a/utilities/add_fusion_to_sample_sheets.py
+++ /dev/null
@@ -1,39 +0,0 @@
-import sys
-import argparse
-import pandas as pd
-import os
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description='Add fusion flag to data clinical sample')
-    parser.add_argument('-f', '--fusion-dir', action='store', dest='fusion_dir',
-                        help='Dir with fusions')
-    parser.add_argument('-d', '--datasheet-list', action='store', dest='datasheet_list',
-                        help='cbio data sample sheet list')
-    parser.add_argument('-o', '--out-dir', action='store', dest='out_dir', help='output dir - don\'t make it the same as the current datasheet location!')
-
-    args = parser.parse_args()
-    added_header = ['WITH_FUSION_DATA',
-                    'Indicator that sample has fusions associated with it',
-                    'STRING',
-                    '1',
-                    'WITH_FUSION_DATA']
-    os.mkdir(args.out_dir)
-    datasheet_list = open(args.datasheet_list)
-    for fname in datasheet_list:
-        datasheet = open(fname.rstrip('\n'))
-        parts = fname.rstrip('\n').split("/")
-        fusion_data = pd.read_csv(args.fusion_dir + "/" + parts[-2] + ".fusions.txt", sep="\t")
-        sample_list = fusion_data.Tumor_Sample_Barcode.unique()
-        cur_dir = args.out_dir + "/" + parts[-2]
-        os.mkdir(cur_dir)
-        out = open(cur_dir + "/" + parts[-1], "w")
-        for i in range(0, len(added_header)):
-            head = next(datasheet)
-            out.write(head.rstrip('\n') + '\t' + added_header[i] + '\n')
-        for line in datasheet:
-            out.write(line.rstrip('\n'))
-            data = line.split('\t')
-            if data[1] in sample_list:
-                out.write('\tYES\n')
-            else:
-                out.write('\tNO\n')
-        out.close()
\ No newline at end of file
diff --git a/utilities/adjust_patient_inputs.py b/utilities/adjust_patient_inputs.py
deleted file mode 100755
index 4c03347..0000000
--- a/utilities/adjust_patient_inputs.py
+++ /dev/null
@@ -1,194 +0,0 @@
-#!/usr/bin/env python3
-
-import sys
-import argparse
-import os
-import re
-from shutil import copyfile
-import concurrent.futures
-# import pdb
-
-
-def process_case_files(old_cdir, new_cdir, case_files, rm_sample_list):
-    f = 0
-    for fn in case_files:
-        old_cf = open(old_cdir + '/' + fn)
-        new_cf = open(new_cdir + '/' + fn, 'w')
-        new_s_list = []
-        # print standard lines
-        standard = next(old_cf)
-        new_cf.write(standard)
-        standard = next(old_cf)
-        new_cf.write(standard)
-        standard = next(old_cf)
-        new_cf.write(standard)
-        # hold lines to change description
-        desc = next(old_cf)
-        buffer = next(old_cf)
-        entry = next(old_cf)
-        info = entry.rstrip('\n').split()
-        for i in range(1, len(info)):
-            if info[i] not in rm_sample_list:
-                new_s_list.append(info[i])
-        f = len(new_s_list)
-        desc = re.sub(r'\d+', str(f), desc)
-        new_cf.write(desc + buffer + info[0] + ' ' + '\t'.join(new_s_list) + '\n')
-        old_cf.close()
-        new_cf.close()
-
-    return f
-
-
-def process_data_clinical(old_ds, new_ds, rm_patient_list):
-    for entry in old_ds:
-        info = entry.split('\t')
-        if info[0] not in rm_patient_list:
-            new_ds.write(entry)
-
-
-def process_samp_in_head(old_file, new_file, rm_sample_list):
-    try:
-        old_head = next(old_file)
-        good_i = []
-        head_split = old_head.rstrip('\n').split('\t')
-        new_head = []
-        new_head.append(head_split[0])
-        for i in range(1, len(head_split)):
-            if head_split[i] not in rm_sample_list:
-                good_i.append(i)
-                new_head.append(head_split[i])
-        new_file.write('\t'.join(new_head) + '\n')
-        for data in old_file:
-            datum = data.rstrip('\n').split('\t')
-            new_out = datum[0]
-            for i in good_i:
-                new_out += '\t' + datum[i]
-            new_file.write(new_out + '\n')
-    except Exception as e:
-        print(e)
-        exit(1)
-
-
-def process_samp_in_tbl(old_file, new_file, rm_sample_list):
-    # pdb.set_trace()
-    head = next(old_file)
-    new_file.write(head)
-    for data in old_file:
-        datum = data.split('\t')
-        if datum[13] not in rm_sample_list:
-            new_file.write(data)
-
-
-def cleanup_all(dname, rm_sample_list, rm_patient_list, new_out):
-    try:
-        # redo case list files
-        new_cdir = new_out + '/case_lists'
-        os.mkdir(new_cdir)
-        old_cdir = dname + '/case_lists'
-        case_files = os.listdir(old_cdir)
-        n_samp = process_case_files(old_cdir, new_cdir, case_files, rm_sample_list)
-        sys.stderr.write('Reprocessed case lists for ' + dname + '\n')
-        sys.stderr.flush()
-        if n_samp == 0:
-            sys.stderr.write('All samples in ' + dname + ' were REMOVED!\n')
-        else:
-            # redo data_clinical sheets
-            new_dss = open(new_out + '/data_clinical_sample.txt', 'w')
-            old_dss = open(dname + '/data_clinical_sample.txt')
-            process_data_clinical(old_dss, new_dss, rm_patient_list)
-            new_dss.close()
-
-            new_dps = open(new_out + '/data_clinical_patient.txt', 'w')
-            old_dps = open(dname + '/data_clinical_patient.txt')
-            process_data_clinical(old_dps, new_dps, rm_patient_list)
-            new_dps.close()
-            sys.stderr.write('Reprocessed data clinical sheets for ' + dname + '\n')
-            sys.stderr.flush()
-
-            samp_in_head_fn = ['data_CNA.txt', 'data_linear_CNA.txt', 'data_rna_seq_v2_mrna_median_Zscores.txt',
-                               'data_rna_seq_v2_mrna.txt']
-            for fn in samp_in_head_fn:
-                old_fh = open(dname + '/' + fn)
-                new_fh = open(new_out + '/' + fn, 'w')
-                process_samp_in_head(old_fh, new_fh, rm_sample_list)
-                new_fh.close()
-                sys.stderr.write('Reprocessed ' + fn + ' for ' + dname + '\n')
-                sys.stderr.flush()
-
-            samp_in_tbl = ['data_mutations_extended.txt']
-            for fn in samp_in_tbl:
-                old_fh = open(dname + '/' + fn)
-                new_fh = open(new_out + '/' + fn, 'w')
-                process_samp_in_tbl(old_fh, new_fh, rm_sample_list)
-                new_fh.close()
-                sys.stderr.write('Reprocessed ' + fn + ' for ' + dname + '\n')
-                sys.stderr.flush()
-            # copy over all meta tables
-            meta_list = os.listdir(dname)
-            for fn in meta_list:
-                if fn[0:5] == 'meta_':
-                    copyfile((dname + '/' + fn), (new_out + '/' + fn))
-                    sys.stderr.write('Copied meta file ' + fn + '\n')
-                    sys.stderr.flush()
-    except Exception as e:
-        print(e)
-
-
-def check_data_sheet(dname, exc_list, outdir):
-    try:
-        cur = open(dname + '/data_clinical_sample.txt')
-        f = 0
-        rm_slist = []
-        rm_plist = []
-        for line in cur:
-            info = line.rstrip('\n').split('\t')
-            if info[0] in exc_list:
-                f = 1
-                rm_slist.append(info[1])
-                if info[0] not in rm_plist:
-                    rm_plist.append(info[0])
-        cur.close()
-        if f == 0:
-            sys.stderr.write('No samples to remove from ' + dname + '\n')
-            sys.stderr.flush()
-        else:
-            sys.stderr.write(dname + ' has ' + str(len(rm_slist)) + ' samples to be excluded: ' + '\t'.join(rm_slist)
-                             + ' from patients ' + '\t'.join(rm_plist) + '\n')
-            sys.stderr.flush()
-            parts = dname.split('/')
-            new_out = outdir + '/' + parts[-1]
-            os.mkdir(new_out)
-            cleanup_all(dname, rm_slist, rm_plist, new_out)
-        return dname
-    except Exception as e:
-        print(e)
-        exit(1)
-
-
-parser = argparse.ArgumentParser(description='Use a list of patient IDs to remove their samples and rebuild a project.')
-parser.add_argument('-l', '--list', action='store', dest='plist', help='new-line separated patient list')
-parser.add_argument('-d', '--dir', action='store', dest='dir', help='data dir to search')
-parser.add_argument('-p', '--pattern', action='store', dest='pattern', help='dir search pattern - a suffix or prefix')
-parser.add_argument('-o', '--outdir', action='store', dest='outdir', help='output directory for projects that change')
-
-args = parser.parse_args()
-
-# read in patient list
-exc_list = []
-
-for line in open(args.plist):
-    exc_list.append(line.rstrip('\n'))
-
-# get dir list
-all_files = os.listdir(args.dir)
-dir_list = []
-pattern = '.*' + args.pattern + '.*'
-for fn in all_files:
-    if re.match(pattern, fn):
-        dir_list.append(args.dir + '/' + fn)
-# for pdir in dir_list:
-#     check_data_sheet(pdir, exc_list, args.outdir)
-with concurrent.futures.ProcessPoolExecutor(8) as executor:
-    results = {executor.submit(check_data_sheet, pdir, exc_list, args.outdir): pdir for pdir in dir_list}
-    for result in concurrent.futures.as_completed(results):
-        sys.stderr.write('Completed processing ' + result.result() + '\n')
diff --git a/utilities/append_add_on_cols.py b/utilities/append_add_on_cols.py
deleted file mode 100644
index b1bab39..0000000
--- a/utilities/append_add_on_cols.py
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/usr/bin/env python3
-
-import sys
-import argparse
-import os
-
-parser = argparse.ArgumentParser(description='Quickly add cols to data_clinical sheet')
-parser.add_argument('-d', '--datasheet', action='store', dest='data', help='datasheet to add to. Output will be to stdout')
-parser.add_argument('-i', '--col_id', action='store', dest='cid', help='Name of column to use for finding ID')
-parser.add_argument('-c', '--header', action='store', dest='header', help='tsv with cbio headers to add')
-parser.add_argument('-a', '--add', action='store', dest='add', help='csv with BS_ID or PT_ID key and values to add, in same order as header')
-
-args = parser.parse_args()
-
-# tack on new header cols
-datasheet = open(args.data)
-header = open(args.header)
-labels = ""
-for head in header:
-    entry = next(datasheet)
-    sys.stdout.write(entry.rstrip('\n') + "\t" + head)
-    labels = entry
-header.close()
-labels = labels.rstrip('\n').split('\t')
-cid = labels.index(args.cid)
-# store IDs and vals
-value_add = open(args.add)
-skip = next(value_add)
-add_dict = {}
-blanks = ""
-for entry in value_add:
-    data = entry.rstrip('\n').split(',')
-    add_dict[data[0]] = '\t'.join(data[1:])
-    # init blanks for when no new value to be added
-    if blanks == "":
-        for i in range(1, len(data), 1):
-            blanks += "\t"
-value_add.close()
-
-for line in datasheet:
-    info = line.rstrip('\n').split('\t')
-    sys.stdout.write(line.rstrip('\n'))
-    f = 0
-    c = 0
-    # IDs might be separated by ;
-    check = info[cid].split(";")
-    for i in range(len(check)):
-        if check[i] in add_dict:
-            f = 1
-            c = i
-    if f:
-        sys.stdout.write("\t" + add_dict[check[c]] + "\n")
-    else:
-        sys.stdout.write(blanks + "\n")
-datasheet.close()
diff --git a/utilities/get_survival_from_ds.py b/utilities/get_survival_from_ds.py
deleted file mode 100755
index 335b607..0000000
--- a/utilities/get_survival_from_ds.py
+++ /dev/null
@@ -1,59 +0,0 @@
-#!/usr/bin/env python3
-
-import sys
-from requests import request
-import argparse
-import concurrent.futures
-import json
-from time import sleep
-import pdb
-
-def query_ds(pt_id):
-    try:
-        pt_id = pt_id.rstrip('\n')
-        outcome_url = url + "outcomes?participant_id=" + pt_id
-        out_info = request('GET', outcome_url)
-        statuses = []
-        for result in out_info.json()['results']:
-            if result['vital_status'] is not None:
-                statuses.append([pt_id, result['vital_status'], str(result['age_at_event_days'])])
-        return statuses
-    except Exception as e:
-        sys.stderr.write(str(e))
-        sys.stderr.write('Could not get outcome info for ' + pt_id + "\n")
-        sys.exit(1)
-
-
-parser = argparse.ArgumentParser(description='Get vital status by PT ID')
-parser.add_argument('-u', '--kf-url', action='store', dest='url',
-                    help='Kids First data service url, i.e. https://kf-api-dataservice.kidsfirstdrc.org/')
-parser.add_argument('-p', '--pt-list', action='store', dest='pt_list',
-                    help='pt_id list')
-
-args = parser.parse_args()
-
-url = args.url
-pt_file = open(args.pt_list)
-out = open('outcomes.txt', 'w')
-
-x = 1
-m = 100
-
-with concurrent.futures.ThreadPoolExecutor(16) as executor:
-    results = {executor.submit(query_ds, entry): entry for entry in pt_file}
-    for result in concurrent.futures.as_completed(results):
-        if x % m == 0:
-            sys.stderr.write('Processed ' + str(x) + ' lines\n')
-            sys.stderr.flush()
-        outcomes = result.result()
-        try:
-            if len(outcomes) > 0:
-                for outcome in outcomes:
-                    out.write("\t".join(outcome) + "\n")
-            else:
-                sys.stderr.write('No outcome for a patient\n')
-        except Exception as e:
-            sys.stderr.write(str(e))
-            pdb.set_trace()
-            hold = 1
-        x += 1
diff --git a/utilities/match_pick_samples_from_files.py b/utilities/match_pick_samples_from_files.py
deleted file mode 100644
index 93f6c06..0000000
--- a/utilities/match_pick_samples_from_files.py
+++ /dev/null
@@ -1,30 +0,0 @@
-import pandas as pd
-import argparse
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description="Script to pick out samples from CNVS file when sample type in sample map file is DNA")
-
-    parser.add_argument('-ID','--sample_map',help="File to read in sample ID")
-    parser.add_argument('-cnv','--cnvs',help="File to pick cnv samples")
-    parser.add_argument('-o','--output_file',help="Name of output file")
-
-    args=parser.parse_args()
-
-    sample_map = pd.read_csv(args.sample_map, sep='\t')
-
-    #predicted = pd.read_csv("openpbta.predicted_cnv.txt", sep='\t')
-    predicted = pd.read_csv(args.cnvs, sep='\t')
-    cbio_ID_DNA=sample_map.loc[sample_map['Sample Type']=='DNA']['Cbio ID']
-
-    # add zero-filled columns for DNA samples missing from the CNV matrix
-    missing_IDs=[]
-    for col in cbio_ID_DNA:
-        check=0
-        for row in predicted.columns:
-            if row == col:
-                check=1
-        if check!=1:
-            missing_IDs.append(col)
-            predicted[col]=0
-
-    with open(args.output_file,'w') as write_tsv:
-        write_tsv.write(predicted.to_csv(sep='\t', index=False))
diff --git a/utilities/mount_link_sbfs.py b/utilities/mount_link_sbfs.py
deleted file mode 100644
index 3add267..0000000
--- a/utilities/mount_link_sbfs.py
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/usr/bin/env python3
-import sys
-import os
-import argparse
-import json
-import subprocess
-
-parser = argparse.ArgumentParser(description='Mount sbfs, create soft links based on file type')
-parser.add_argument('-m', '--manifest', action='store', dest='manifest', help='cavatica csv manifest file')
-parser.add_argument('-c', '--config', action='store', dest='config_file', help='json config file with data types and '
-                                                                               'data locations')
-parser.add_argument('-p', '--profile', action='store', dest='profile', help='cavatica profile name. requires '
-                                                                            '.sevenbridges/credentials file be present')
-args = parser.parse_args()
-with open(args.config_file) as f:
-    config_data = json.load(f)
-os.makedirs('mafs', exist_ok=True)
-if config_data['cna_flag'] == 1:
-    os.makedirs('cnvs', exist_ok=True)
-if config_data['rna_flag'] == 1:
-    os.makedirs('rsem', exist_ok=True)
-
-# create mounts and make links
-m_dict = {}
-manifest = open(args.manifest)
-# just last two cols required
-head = next(manifest)
-cwd = os.getcwd() + "/"
-for line in manifest:
-    info = line.rstrip('\n').split('\t')
-    fnames = info[-2].split(',')
-    projects = info[-1]
-    project = projects.split(',')
-    atype = info[4]
-    for i in range(len(project)):
-        (u,p) = project[i].split('/')
-        if p not in m_dict:
-            os.mkdir(p)
-            cmd = 'sbfs mount ' + p + ' --project ' + project[i] + ' --profile ' + args.profile + ' --read-only'
-            subprocess.call(cmd, shell=True)
-            mdir = cwd + p + "/projects/" + project[i] + "/"
-            m_dict[p] = mdir
-        cmd = 'ln -s ' + m_dict[p] + fnames[i]
-        if atype == 'DNA':
-            if fnames[i][-3:] == 'maf':
-                cmd += " mafs"
-            else:
-                cmd += " cnvs"
-        else:
-            cmd += " rsem"
-        subprocess.call(cmd, shell=True)
-out = open('sbfs_mount.txt', 'w')
-for p in m_dict:
-    out.write(m_dict[p] + '\n')
-out.close()
\ No newline at end of file
diff --git a/utilities/null_cols_from_maf.py b/utilities/null_cols_from_maf.py
deleted file mode 100755
index a75d9c9..0000000
--- a/utilities/null_cols_from_maf.py
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/usr/bin/env python3
-
-import sys
-import argparse
-
-parser = argparse.ArgumentParser(description='Null out columns from a MAF file')
-parser.add_argument('-c', '--col-names-file', action='store', dest='col_file', help='File with the newline-separated header names to null out')
-parser.add_argument('-m', '--maf_in', action='store', dest='maf_in', help='MAF file with cols to null. Output will be to stdout')
-
-args = parser.parse_args()
-
-col_file = args.col_file
-maf = args.maf_in
-
-col_names = []
-for line in open(col_file):
-    col_names.append(line.rstrip('\n'))
-
-col_idx = []
-maf_rd = open(maf)
-head = next(maf_rd)
-sys.stdout.write(head)
-head = next(maf_rd)
-header = head.rstrip('\n').split('\t')
-for col in col_names:
-    col_idx.append(header.index(col))
-sys.stdout.write(head)
-for data in maf_rd:
-    entry = data.rstrip('\n').split('\t')
-    for i in col_idx:
-        entry[i] = ""
-    sys.stdout.write("\t".join(entry) + "\n")
diff --git a/utilities/patch_pt_id_update.py b/utilities/patch_pt_id_update.py
deleted file mode 100644
index a313404..0000000
--- a/utilities/patch_pt_id_update.py
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/usr/bin/env python3
-
-import sys
-import argparse
-import os
-import json
-
-parser = argparse.ArgumentParser(description='Quickly patch KF PT ID update for PNOC WES')
-parser.add_argument('-d', '--data', action='store', dest='data', help='Patient file with patient IDs and external IDs')
-parser.add_argument('-s', '--sample', action='store', dest='sample', help='Sample file with patient IDs and external IDs')
-parser.add_argument('-j', '--json', action='store', dest='json', help='Input json file')
-
-args = parser.parse_args()
-with open(args.json) as f:
-    pt_data = json.load(f)
-pt_id_dict = {}
-pt_sheet = open('data_clinical_patient.txt', 'w')
-for line in open(args.data):
-    info = line.rstrip('\n').split('\t')
-    if info[1] in pt_data['participant']:
-        sys.stderr.write('Replacing old id ' + info[0] + ' with ' + pt_data['participant'][info[1]] + '\n')
-        pt_id_dict[info[0]] = pt_data['participant'][info[1]]
-        info[0] = pt_data['participant'][info[1]]
-    pt_sheet.write('\t'.join(info) + '\n')
-pt_sheet.close()
-sys.stderr.write('Finished with patient file.\n')
-samp_sheet = open('data_clinical_sample.txt', 'w')
-
-for line in open(args.sample):
-    info = line.rstrip('\n').split('\t')
-    if info[0] in pt_id_dict:
-        sys.stderr.write('Replacing old id ' + info[0] + ' with ' + pt_id_dict[info[0]] + '\n')
-        info[0] = pt_id_dict[info[0]]
-    samp_sheet.write('\t'.join(info) + '\n')
-samp_sheet.close()
-sys.stderr.write('Finished with sample file.\n')
diff --git a/utilities/patch_sample_id.py b/utilities/patch_sample_id.py
deleted file mode 100644
index d751614..0000000
--- a/utilities/patch_sample_id.py
+++ /dev/null
@@ -1,49 +0,0 @@
-#!/usr/bin/env python3
-
-import sys
-import argparse
-import os
-import re
-
-parser = argparse.ArgumentParser(description='Quickly patch sample id from step2 with data warehouse entries')
-parser.add_argument('-i', '--info', action='store', dest='info', help='A sample info sheet from step 2')
-parser.add_argument('-d', '--data', action='store', dest='data', help='Data Warehouse tsv with biospecimen ID, master aliquot id, and sample id')
-
-new_samp = {}
-args = parser.parse_args()
-
-data_in = open(args.data)
-head = next(data_in)
-header = head.rstrip('\n').split('\t')
-b_idx = header.index('kf_biospecimen_id')
-s_idx = header.index('sample_id')
-a_idx = header.index('master_parent_aliquot_id')
-
-for line in data_in:
-    meta = line.rstrip('\n').split('\t')
-    new_samp[meta[b_idx]] = meta[s_idx] + "_" + meta[a_idx]
-data_in.close()
-
-info_in = open(args.info)
-head = next(info_in)
-header = head.rstrip('\n').split('\t')
-b_idx = header.index('BS_ID')
-s_idx = header.index('external_sample_id')
-a_idx = header.index('analyte_type')
-
-sys.stdout.write(head)
-for line in info_in:
-    meta = line.rstrip('\n').split('\t')
-    if meta[b_idx] in new_samp:
-        meta[s_idx] = new_samp[meta[b_idx]]
-    else:
-        # modify original to fit new
-        sys.stderr.write("WARN: Entry for " + meta[b_idx] + " " + meta[s_idx] + " not found, reformatting\n")
-        meta[s_idx] = re.sub(r"-[TN]-", "_", meta[s_idx])
-        meta[s_idx] = re.sub(r"\..*$", "", meta[s_idx])
-        meta[s_idx] = re.sub(r"-[TN]$", "", meta[s_idx])
-    if meta[a_idx] == "Not Applicable":
-        meta[a_idx] = "DNA"
-    print("\t".join(meta))
-info_in.close()
-
diff --git a/utilities/patch_survival_to_data_patient.py b/utilities/patch_survival_to_data_patient.py
deleted file mode 100755
index 6a0a3b2..0000000
--- a/utilities/patch_survival_to_data_patient.py
+++ /dev/null
@@ -1,77 +0,0 @@
-#!/usr/bin/env python3
-
-import sys
-import argparse
-import os
-import math
-import pdb
-
-parser = argparse.ArgumentParser(description='Add survival data to the data_clinical_patient.txt sheet. Run from dir level of disease subdirs')
-parser.add_argument('-d', '--data', action='store', dest='data',
-                    help='File with patient IDs and survival data, can be gotten by running get_survival_from_ds.py')
-parser.add_argument('-l', '--list', action='store', dest='flist', help='Data file list')
-parser.add_argument('-m', '--tum-bs-id', action='store', dest='tum_file', help='Tum bs id file from step 2 to properly calculate survival')
-parser.add_argument('-o', '--outdir', action='store', dest='outdir', help='output dir - MAKE THIS DIFFERENT FROM INPUT DIR')
-parser.add_argument('-t', '--id-type', action='store', dest='itype', help='kf or ext')
-parser.add_argument('-f', '--dfs-flag', action='store', dest='dflag', help='1 if DFS data available, 0 if not')
-
-
-def calc_os_months(surv_data_val):
-    (status, status_days) = surv_data_val.split('\t')
-    # calc months basically as average length of a month
-    as_months = status_days
-    if as_months != "NA":
-        as_months = math.floor(int(status_days)/(365.25/12))
-    return status + '\t' + str(as_months)
-
-
-args = parser.parse_args()
-
-added_header = ['OS_STATUS\tOS_MONTHS\tDFS_STATUS\tDFS_MONTHS',
-                'Overall patient survival status\tOverall survival in months since initial diagnosis\tDisease free status since initial treatment\tDisease free (months) since initial treatment',
-                'STRING\tNUMBER\tSTRING\tNUMBER',
-                '1\t1\t1\t1',
-                'OS_STATUS\tOS_MONTHS\tDFS_STATUS\tDFS_MONTHS']
-if int(args.dflag) == 0:
-    added_header = ['OS_STATUS\tOS_MONTHS',
-                    'Overall patient survival status\tOverall survival in months since initial diagnosis',
-                    'STRING\tNUMBER',
-                    '1\t1',
-                    'OS_STATUS\tOS_MONTHS']
-
-if not os.path.isdir(args.outdir):
-    os.mkdir(args.outdir)
-surv_dict = {}
-surv_data = open(args.data)
-skip_head = next(surv_data)
-# input format: pt ID <tab> OS status <tab> age in days at that status
-for line in surv_data:
-    data = line.rstrip('\n').split('\t')
-    surv_dict[data[0]] = '\t'.join([data[1], data[2]])
-surv_data.close()
-id_idx = {'kf': 0, 'ext': 1}
-for fname in open(args.flist):
-    fname = fname.rstrip('\n')
-    subdir = os.path.dirname(fname)
-    newdir = args.outdir + '/' + subdir
-    if not os.path.isdir(newdir):
-        os.makedirs(newdir, exist_ok=True)
-    cur = open(fname)
-    newdata = open(newdir + '/data_clinical_patient.txt', 'w')
-    # pdb.set_trace()
-    for i in range(0, len(added_header)):
-        head = next(cur)
-        newdata.write(head.rstrip('\n') + '\t' + added_header[i] + '\n')
-    for line in cur:
-        newdata.write(line.rstrip('\n'))
-        data = line.split('\t')
-        pt_id = data[id_idx[args.itype]]
-
-        if pt_id in surv_dict:
-            surv_data_value = calc_os_months(surv_dict[pt_id])
-            newdata.write('\t' + surv_data_value + '\n')
-        else:
-            newdata.write('\t\t\t\t\n')
-            sys.stderr.write('No survival entry for ' + pt_id + ' in ' + fname + '\n')
-    cur.close()
-    newdata.close()
diff --git a/utilities/rsem_replace_data_rna.py b/utilities/rsem_replace_data_rna.py
deleted file mode 100644
index b7b4e62..0000000
--- a/utilities/rsem_replace_data_rna.py
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/usr/bin/env python3
-
-import sys
-import argparse
-import os
-
-parser = argparse.ArgumentParser(description='Updates outdated data in data_rna_seq sheet from new rsem_merged file')
-parser.add_argument('-m', '--mapFileLoc', action='store', dest='map', help='RNA sample name map file')
-parser.add_argument('-r', '--rsem', action='store', dest='rsem', help='rsem file with new data')
-parser.add_argument('-d', '--data', action='store', dest='data', help='data_rna_seq_v2_mrna.txt sheet to update')
-parser.add_argument('-o', '--outdir', action='store', dest='outdir', help='outputdir')
-
-args = parser.parse_args()
-map_dict = {}
-for line in open(args.map):
-    (new_name, old_name) = line.rstrip('\n').split('\t')
-    map_dict[old_name] = new_name
-
-rsem_data = {}
-
-rsem_file = open(args.rsem)
-head = next(rsem_file)
-slist = []
-header = head.rstrip('\n').split('\t')
-for line in rsem_file:
-    info = line.rstrip('\n').split('\t')
-    rsem_data[info[1]] = {}
-    for i in range(2, len(info)):
-        samp = map_dict[header[i]]
-        slist.append(samp)
-        rsem_data[info[1]][samp] = info[i]
-
-rsem_file.close()
-
-outdir = args.outdir + '/' + os.path.dirname(args.data)
-if not os.path.isdir(outdir):
-    os.mkdir(outdir)
-update = open(outdir + '/data_rna_seq_v2_mrna.txt', 'w')
-
-old_data = open(args.data)
-head = next(old_data)
-update.write(head)
-header = head.rstrip('\n').split('\t')
-
-h_i = []
-for i in range(1, len(header)):
-    if header[i] in slist:
-        h_i.append(i)
-        sys.stderr.write('Found sample ' + header[i] + ' to update at index ' + str(i) + '\n')
-for line in old_data:
-    info = line.rstrip('\n').split('\t')
-    for i in h_i:
-        info[i] = rsem_data[info[0]][header[i]]
-    update.write('\t'.join(info) + '\n')
-update.close()
diff --git a/utilities/subset_table_by_id.py b/utilities/subset_table_by_id.py
deleted file mode 100755
index 3bbe6a2..0000000
--- a/utilities/subset_table_by_id.py
+++ /dev/null
@@ -1,51 +0,0 @@
-#!/usr/bin/env python3
-
-import sys
-import argparse
-import os
-import json
-import pdb
-
-parser = argparse.ArgumentParser(description='Subset a table using a newline-separated list of IDs')
-parser.add_argument('-i', '--input-file', action='store', dest='in_file', help='File to subset')
-parser.add_argument('-l', '--id-list', action='store', dest='id_list', help='list of ids separated by new line')
-parser.add_argument('-c', '--columns', action='store', dest='cols', help='csv cols to search')
-parser.add_argument('-n', '--num-header-lines', action='store', dest='n', help='number of header lines')
-parser.add_argument('-s', '--separator', action='store', dest='s', help='separator type - \'comma\' or \'tab\'')
-parser.add_argument('-o', '--out-dir', action='store', dest='o', help='output dir name - should be different if file being processed is in same dir')
-parser.add_argument('-x', '--extra_split', action='store', dest='x', help='split id name on ;, special case')
-
-args = parser.parse_args()
-id_file = open(args.id_list)
-id_dict = {}
-sep = '\t'
-if args.s == "comma":
-    sep = ','
-n = int(args.n)
-for line in id_file:
-    id_dict[line.rstrip('\n')] = 0
-
-cols = list(map(int, args.cols.split(',')))
-#pdb.set_trace()
-out_dir = args.o
-new_fn = out_dir + os.path.basename(args.in_file)
-new_out = open(new_fn, 'w')
-in_file = open(args.in_file)
-for i in range(n):
-    header = next(in_file)
-    new_out.write(header)
-for line in in_file:
-    info = line.rstrip('\n').split(sep)
-    if args.x is None:
-        for i in cols:
-            if info[i] in id_dict:
-                id_dict[info[i]] = 1
-                new_out.write(line)
-    else:
-        for i in cols:
-            subset = info[i].split(';')
-            for j in subset:
-                if j in id_dict:
-                    id_dict[info[i]] = 1
-                    new_out.write(line)
-                    break
diff --git a/utilities/swap_samples.py b/utilities/swap_samples.py
deleted file mode 100644
index ab9e5f7..0000000
--- a/utilities/swap_samples.py
+++ /dev/null
@@ -1,68 +0,0 @@
-#!/usr/bin/env python3
-
-import sys
-import argparse
-import os
-
-parser = argparse.ArgumentParser(description='Swaps data between two projects. Run from dir level of disease subdirs')
-parser.add_argument('-a', '--datainput1', action='store', dest='d1', help='First sheet with sample to swap')
-parser.add_argument('-b', '--datainput2', action='store', dest='d2', help='Second sheet with sample to swap')
-parser.add_argument('-x', '--sample1', action='store', dest='x', help='sample 1 name to swap with...')
-parser.add_argument('-y', '--sample2', action='store', dest='y', help='sample 2 name to swap')
-parser.add_argument('-o', '--outdir', action='store', dest='outdir', help='outputdir')
-parser.add_argument('-f', '--flag', action='store', dest='f', help='0 to keep original sample names, 1 to swap '
-                                                                   'names too')
-
-args = parser.parse_args()
-d1_out_dir = args.outdir + '/' + os.path.dirname(args.d1)
-d2_out_dir = args.outdir + '/' + os.path.dirname(args.d2)
-if not os.path.isdir(d1_out_dir):
-    os.mkdir(d1_out_dir)
-if not os.path.isdir(d2_out_dir):
-    os.mkdir(d2_out_dir)
-
-d1_out_fh = open(d1_out_dir + '/' + os.path.basename(args.d1), 'w')
-d2_out_fh = open(d2_out_dir + '/' + os.path.basename(args.d2), 'w')
-
-d1_in = open(args.d1)
-d2_in = open(args.d2)
-# find sample one to swap
-h1 = next(d1_in)
-h1_info = h1.rstrip('\n').split('\t')
-h1_i = 0
-for i in range(0, len(h1_info)):
-    if h1_info[i] == args.x:
-        h1_i = i
-        sys.stderr.write('Found sample ' + args.x + ' to swap in ' + args.d1 + ' at position ' + str(i) + '\n')
-        if args.f == '1':
-            h1_info[i] = args.y
-            sys.stderr.write('Swapped header name to ' + args.y + '\n')
-        break
-d1_out_fh.write('\t'.join(h1_info) + '\n')
-
-#find sample 2 to swap
-h2 = next(d2_in)
-h2_info = h2.rstrip('\n').split('\t')
-h2_i = 0
-for i in range(0, len(h2_info)):
-    if h2_info[i] == args.y:
-        h2_i = i
-        sys.stderr.write('Found sample ' + args.y + ' to swap in ' + args.d2 + ' at position ' + str(i) + '\n')
-        if args.f == '1':
-            h2_info[i] = args.x
-            sys.stderr.write('Swapped header name to ' + args.x + '\n')
-        break
-d2_out_fh.write('\t'.join(h2_info) + '\n')
-
-for line1 in d1_in:
-    line2 = next(d2_in)
-
-    info1 = line1.rstrip('\n').split('\t')
-    info2 = line2.rstrip('\n').split('\t')
-    t1 = info1[h1_i]
-    info1[h1_i] = info2[h2_i]
-    info2[h2_i] = t1
-    d1_out_fh.write('\t'.join(info1) + '\n')
-    d2_out_fh.write('\t'.join(info2) + '\n')
-d1_out_fh.close()
-d2_out_fh.close()
\ No newline at end of file
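
For reference, below is a minimal sketch (an editor's illustration, not part of the removed script or this patch) of the column swap that utilities/swap_samples.py performed on two tab-separated cBioPortal matrices. It assumes both sheets have the same number of body rows; the paths and sample names in the commented call are hypothetical.

#!/usr/bin/env python3
# Sketch of the swap_samples.py behavior: exchange one sample column between
# two tab-separated matrices. Header names are swapped only when
# swap_names=True, mirroring the removed script's -f 1 flag.
import csv

def swap_column(file_a, file_b, samp_a, samp_b, out_a, out_b, swap_names=False):
    with open(file_a) as fa, open(file_b) as fb:
        rows_a = list(csv.reader(fa, delimiter='\t'))
        rows_b = list(csv.reader(fb, delimiter='\t'))
    ia = rows_a[0].index(samp_a)  # position of sample A in sheet A's header
    ib = rows_b[0].index(samp_b)  # position of sample B in sheet B's header
    if swap_names:
        rows_a[0][ia], rows_b[0][ib] = rows_b[0][ib], rows_a[0][ia]
    # swap data values row by row; assumes equal row counts, as the original did
    for ra, rb in zip(rows_a[1:], rows_b[1:]):
        ra[ia], rb[ib] = rb[ib], ra[ia]
    for rows, out in ((rows_a, out_a), (rows_b, out_b)):
        with open(out, 'w', newline='') as fh:
            csv.writer(fh, delimiter='\t').writerows(rows)

# Hypothetical usage:
# swap_column('proj1/data_CNA.txt', 'proj2/data_CNA.txt', 'SAMPLE_X', 'SAMPLE_Y',
#             'out/proj1_data_CNA.txt', 'out/proj2_data_CNA.txt', swap_names=True)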