Skip to content

Commit

Permalink
Merge pull request #3 from calico/revision-upd
Browse files Browse the repository at this point in the history
Revision updates to analyses
  • Loading branch information
johli authored Sep 14, 2024
2 parents 807cf6f + 3ffcb47 commit 7f76005
Show file tree
Hide file tree
Showing 8 changed files with 2,120 additions and 40 deletions.
215 changes: 215 additions & 0 deletions analysis/eqtl/get_eqtl_train_test.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "bd382268-2054-4b77-bc6e-d8b4e8c2aa55",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"import pyranges as pr\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "3e02be85",
"metadata": {},
"outputs": [],
"source": [
"#Parse vcf and extract eQTL rows\n",
"\n",
"eqtl_file = '/home/drk/seqnn/data/gtex_fine/susie_pip90/pos_merge.vcf'\n"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "60078dd1",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"len(eqtl_df) = 17925\n",
"len(eqtl_set) = 17925\n"
]
}
],
"source": [
"#Read eQTLs\n",
"\n",
"eqtl_df = pd.read_csv(eqtl_file, sep='\\t', skiprows=1, names=['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'feat1', 'feat2', 'featNaN'])[['#CHROM', 'POS', 'ID', 'REF', 'ALT']]\n",
"eqtl_set = set(sorted(eqtl_df['ID'].values.tolist()))\n",
"\n",
"print(\"len(eqtl_df) = \" + str(len(eqtl_df)))\n",
"print(\"len(eqtl_set) = \" + str(len(eqtl_set)))\n"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "cefb9c25",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--- replicate = f3c0 (fi = 3, ci = 0) ---\n",
"len(vcf_df) = 17925\n",
"len(seq_df) = 55497\n",
"len(vcf_train_set) = 13750\n",
"len(snp_list_train) = 13750\n",
"len(snp_list_test) = 4175\n",
"--- replicate = f3c1 (fi = 3, ci = 1) ---\n",
"len(vcf_df) = 17925\n",
"len(seq_df) = 55497\n",
"len(vcf_train_set) = 13750\n",
"len(snp_list_train) = 13750\n",
"len(snp_list_test) = 4175\n",
"--- replicate = f3c2 (fi = 3, ci = 2) ---\n",
"len(vcf_df) = 17925\n",
"len(seq_df) = 55497\n",
"len(vcf_train_set) = 13750\n",
"len(snp_list_train) = 13750\n",
"len(snp_list_test) = 4175\n",
"--- replicate = f3c3 (fi = 3, ci = 3) ---\n",
"len(vcf_df) = 17925\n",
"len(seq_df) = 55497\n",
"len(vcf_train_set) = 13750\n",
"len(snp_list_train) = 13750\n",
"len(snp_list_test) = 4175\n"
]
}
],
"source": [
"#Load sequence bed; output trained-on and held-out snp lists respectively\n",
"\n",
"valid_shift = False\n",
"\n",
"eqtl_out_file = 'gtex_susie_pip90'\n",
"\n",
"seq_bed_file = '/scratch3/drk/seqnn/data/v9/hg38/sequences.bed'\n",
"\n",
"num_folds = 8\n",
"\n",
"repl_index = [\n",
" 'f3c0',\n",
" 'f3c1',\n",
" 'f3c2',\n",
" 'f3c3',\n",
"]\n",
"\n",
"#Loop over replicates\n",
"for repl_str in repl_index :\n",
" \n",
" fi, ci = int(repl_str.split(\"c\")[0][1:]), int(repl_str.split(\"c\")[1])\n",
" \n",
" print(\"--- replicate = \" + repl_str + \" (fi = \" + str(fi) + \", ci = \" + str(ci) + \") ---\")\n",
"\n",
" vcf_df = eqtl_df.copy().reset_index(drop=True)\n",
"\n",
" print(\"len(vcf_df) = \" + str(len(vcf_df)))\n",
"\n",
" #Load sequence bed\n",
" seq_df = pd.read_csv(seq_bed_file, sep='\\t', names=['chrom', 'start', 'end', 'label'])\n",
"\n",
" seq_df['start'] -= 163840\n",
" seq_df['end'] += 163840\n",
"\n",
" test_fold = fi\n",
" valid_fold = -1\n",
" if valid_shift :\n",
" valid_fold = (fi+1+ci) % num_folds\n",
" else :\n",
" valid_fold = (fi+1) % num_folds\n",
"\n",
" def _label_train(x) :\n",
" if x == 'fold' + str(test_fold) :\n",
" return 'test'\n",
" elif x == 'fold' + str(valid_fold) :\n",
" return 'valid'\n",
" else :\n",
" return 'train'\n",
"\n",
" seq_df['label'] = seq_df['label'].apply(_label_train)\n",
"\n",
" print(\"len(seq_df) = \" + str(len(seq_df)))\n",
"\n",
" #Intersect vcf against sequence bed\n",
" seq_pr = pr.PyRanges(seq_df.rename(columns={'chrom' : 'Chromosome', 'start' : 'Start', 'end' : 'End'}))\n",
"\n",
" vcf_df['End'] = vcf_df['POS'] + 1\n",
" vcf_pr = pr.PyRanges(vcf_df[['#CHROM', 'POS', 'End', 'ID', 'REF', 'ALT']].rename(columns={'#CHROM' : 'Chromosome', 'POS' : 'Start', 'REF' : 'ref', 'ALT' : 'alt'}))\n",
"\n",
" vcf_seq_df = vcf_pr.join(seq_pr, strandedness=False).df.copy().reset_index(drop=True)\n",
" vcf_train_set = sorted(list(set(vcf_seq_df.query(\"label == 'train'\")['ID'].values)))\n",
"\n",
" print(\"len(vcf_train_set) = \" + str(len(vcf_train_set)))\n",
"\n",
" #Mark loci in the vcf that had been seen during training\n",
" is_train_locus = []\n",
" for _, row in vcf_df.iterrows () :\n",
" if row['ID'] in vcf_train_set :\n",
" is_train_locus.append(True)\n",
" else :\n",
" is_train_locus.append(False)\n",
"\n",
" vcf_df['is_train_locus'] = is_train_locus\n",
"\n",
" #Store final list of trained-on and non-trained-on SNP positions for the given fold\n",
" vcf_df_train = vcf_df.query(\"is_train_locus == True\").copy().reset_index(drop=True)\n",
" snp_list_train = sorted(list(set(vcf_df_train['ID'].values.tolist())))\n",
"\n",
" vcf_df_test = vcf_df.query(\"is_train_locus == False\").copy().reset_index(drop=True)\n",
" snp_list_test = sorted(list(set(vcf_df_test['ID'].values.tolist())))\n",
"\n",
" print(\"len(snp_list_train) = \" + str(len(snp_list_train)))\n",
" print(\"len(snp_list_test) = \" + str(len(snp_list_test)))\n",
"\n",
" with open(eqtl_out_file + \"_\" + repl_str + (\"s\" if valid_shift else \"\") + \"_train.txt\", 'wt') as out_f :\n",
" for snp_id in snp_list_train :\n",
" out_f.write(snp_id + '\\n')\n",
"\n",
" with open(eqtl_out_file + \"_\" + repl_str + (\"s\" if valid_shift else \"\") + \"_test.txt\", 'wt') as out_f :\n",
" for snp_id in snp_list_test :\n",
" out_f.write(snp_id + '\\n')\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5cdb135d",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
4 changes: 2 additions & 2 deletions analysis/eqtl/plot_eqtl_coef.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
"source": [
"#Load westminster scripts (gtex coefficient helper functions)\n",
"\n",
"sys.path.append('/home/drk/code/westminster/scripts/')\n",
"sys.path.append('/home/drk/code/westminster/src/westminster/scripts/')\n",
"from westminster_gtex_coef import read_eqtl\n",
"from westminster_gtex_coef import read_scores as read_borzoi\n"
]
Expand Down Expand Up @@ -236,7 +236,7 @@
}
],
"source": [
"#Constrct dataframe with metrics\n",
"#Construct dataframe with metrics\n",
"\n",
"metrics_df = pd.DataFrame({\n",
" 'tissue': metrics_tissue,\n",
Expand Down
Loading

0 comments on commit 7f76005

Please sign in to comment.