diff --git a/analysis/gnomad_v3/gwas_analysis.ipynb b/analysis/gnomad_v3/gwas_catalog_analysis.ipynb similarity index 100% rename from analysis/gnomad_v3/gwas_analysis.ipynb rename to analysis/gnomad_v3/gwas_catalog_analysis.ipynb diff --git a/analysis/gnomad_v3/gwas_causaldb_comparison.ipynb b/analysis/gnomad_v3/gwas_causaldb_comparison.ipynb new file mode 100644 index 0000000..3d9ba12 --- /dev/null +++ b/analysis/gnomad_v3/gwas_causaldb_comparison.ipynb @@ -0,0 +1,1151 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using TensorFlow backend.\n" + ] + } + ], + "source": [ + "from __future__ import print_function\n", + "import keras\n", + "from keras.models import Sequential, Model, load_model\n", + "from keras import backend as K\n", + "\n", + "import tensorflow as tf\n", + "\n", + "import os\n", + "import pandas as pd\n", + "\n", + "import numpy as np\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import matplotlib.cm as cm\n", + "\n", + "#import aparent.visualization as vis\n", + "\n", + "#from aparent_predictor import *\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "#Load variant prediction dataframe\n", + "\n", + "variant_df = pd.read_csv('aparent_resnet_variant_predictions_polyadb_no_sequences_no_cutoff.csv', sep='\\t')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "len(polyadb_df) = 175451\n" + ] + } + ], + "source": [ + "#Load PolyADB data; keep sites with a gene annotation and pas != -1 (targeted-gene filter is commented out)\n", + "\n", + "#genes = ['RUNX1', 'CEBPA', 'GATA2', 'ANKRD26', 'DDX41', 'ETV6', 'PTEN', 'BRCA1', 'BRCA2', 'TP53', 'APC', 'ATM', 'PALB2', 'MSH2', 'MLH1', 'MSH6', 'PMS2', 'MUTYH']\n", + "\n", + "polyadb_df = pd.read_csv('polyadb_processed.csv', sep=',')\n", + "\n", + "#polyadb_df = 
polyadb_df.loc[polyadb_df['gene'].isin(genes)].reset_index(drop=True).copy()\n", + "polyadb_df = polyadb_df.loc[((~polyadb_df['gene'].isnull()) & (polyadb_df['gene'] != 'na')) & (polyadb_df['pas'] != -1)].reset_index(drop=True).copy()\n", + "\n", + "print('len(polyadb_df) = ' + str(len(polyadb_df)))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "#Process PolyaDB data\n", + "\n", + "polyadb_df_minus = polyadb_df.query(\"strand == '-'\").copy().reset_index(drop=True)\n", + "polyadb_df_plus = polyadb_df.query(\"strand == '+'\").copy().reset_index(drop=True)\n", + "\n", + "polyadb_df_minus = polyadb_df_minus.sort_values(by='pas_pos', ascending=False).copy().reset_index(drop=True)\n", + "polyadb_df_plus = polyadb_df_plus.sort_values(by='pas_pos', ascending=True).copy().reset_index(drop=True)\n", + "\n", + "new_gene_id_list_plus = []\n", + "sitenum_list_plus = []\n", + "gene_id_dict = {}\n", + "for _, row in polyadb_df_plus.iterrows() :\n", + "\n", + " gene = row['gene']\n", + "\n", + " if gene not in gene_id_dict :\n", + " gene_id_dict[gene] = 0\n", + "\n", + " gene_id_dict[gene] += 1\n", + "\n", + " new_gene_id_list_plus.append(gene + \".\" + str(gene_id_dict[gene]))\n", + " sitenum_list_plus.append(gene_id_dict[gene])\n", + "\n", + "polyadb_df_plus['gene_id'] = new_gene_id_list_plus\n", + "polyadb_df_plus['sitenum'] = sitenum_list_plus\n", + "\n", + "new_gene_id_list_minus = []\n", + "sitenum_list_minus = []\n", + "gene_id_dict = {}\n", + "for _, row in polyadb_df_minus.iterrows() :\n", + "\n", + " gene = row['gene']\n", + "\n", + " if gene not in gene_id_dict :\n", + " gene_id_dict[gene] = 0\n", + "\n", + " gene_id_dict[gene] += 1\n", + "\n", + " new_gene_id_list_minus.append(gene + \".\" + str(gene_id_dict[gene]))\n", + " sitenum_list_minus.append(gene_id_dict[gene])\n", + "\n", + "polyadb_df_minus['gene_id'] = new_gene_id_list_minus\n", + "polyadb_df_minus['sitenum'] = sitenum_list_minus\n", + 
"\n", + "polyadb_df = pd.concat([polyadb_df_plus, polyadb_df_minus])\n", + "\n", + "polyadb_df = polyadb_df.sort_values(by=['gene', 'sitenum'], ascending=True).reset_index(drop=True).copy()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "#Get variant positions in hg38 coordinates\n", + "\n", + "polyadb_bed_hg19 = pd.read_csv(\"polyadb_coordinates_hg19.bed\", sep='\\t', header=None, names=['chrom', 'pas_pos_hg19', 'end', 'gene', 'gene_id', 'strand'])\n", + "polyadb_bed_hg38 = pd.read_csv(\"polyadb_coordinates_hg38.bed\", sep='\\t', header=None, names=['chrom', 'pas_pos_hg38', 'end', 'gene', 'gene_id', 'strand'])\n", + "\n", + "polyadb_bed_hg38 = polyadb_bed_hg38.join(polyadb_bed_hg19[['gene_id', 'pas_pos_hg19']].set_index('gene_id'), on='gene_id', how='inner').copy().reset_index(drop=True)\n", + "\n", + "polyadb_bed_hg38['padb_join_id'] = polyadb_bed_hg38['chrom'] + \"_\" + polyadb_bed_hg38['pas_pos_hg19'].astype(str) + \"_\" + polyadb_bed_hg38['gene']\n", + "polyadb_df['padb_join_id'] = polyadb_df['chrom'] + \"_\" + polyadb_df['pas_pos'].astype(str) + \"_\" + polyadb_df['gene']\n", + "\n", + "polyadb_df = polyadb_df.join(polyadb_bed_hg38[['padb_join_id', 'pas_pos_hg38']].set_index(\"padb_join_id\"), on='padb_join_id', how='inner').copy().reset_index(drop=True)\n", + "\n", + "#polyadb_df = polyadb_df.query(\"site_type == '3_most_exon'\")\n", + "\n", + "polyadb_df = polyadb_df.drop_duplicates(subset=['gene_id'], keep='first').copy().reset_index(drop=True)\n", + "\n", + "variant_df = variant_df.join(polyadb_df[['gene_id', 'pas_pos', 'pas_pos_hg38']].set_index(\"gene_id\"), on='gene_id', how='inner').copy().reset_index(drop=True)\n", + "\n", + "variant_df['var_position_hg38'] = variant_df['var_position'] - variant_df['pas_pos'] + variant_df['pas_pos_hg38']\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'\\ncausaldb_dfs = 
[]\\nfor _, row in causaldb_meta_df.iterrows() :\\n causaldb_dfs.append(\\n pd.read_csv(\"causaldb_credible_set/\" + row[\"ID\"] + \"_total_credible_set.txt\", sep=\\'\\t\\')\\n )\\n\\ncausaldb_df = pd.concat(causaldb_dfs).copy().reset_index(drop=True)\\n\\nprint(\"len(causaldb_df) = \" + str(len(causaldb_df)))\\n'" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Get CausalDB credible sets\n", + "\n", + "causaldb_meta_df = pd.read_csv(\"causaldb_credible_set/causaldb_meta_info_v1.txt\", sep='\\t')\n", + "\n", + "causaldb_dfs = []\n", + "for _, row in causaldb_meta_df.iterrows() :\n", + " causaldb_dfs.append(\n", + " pd.read_csv(\"causaldb_credible_set/\" + row[\"ID\"] + \"_total_credible_set.txt\", sep='\\t')\n", + " )\n", + "\n", + "causaldb_df = pd.concat(causaldb_dfs).copy().reset_index(drop=True)\n", + "\n", + "print(\"len(causaldb_df) = \" + str(len(causaldb_df)))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "len(non_causaldb_df) = 705151\n" + ] + } + ], + "source": [ + "#Get CausalDB significant non-causal variants\n", + "\n", + "non_causaldb_df = pd.read_csv(\"causaldb_credible_set/causaldb_sig_noncausal_v1.txt\", sep='\\t')\n", + "\n", + "print(\"len(non_causaldb_df) = \" + str(len(non_causaldb_df)))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "#Map variants to predictions\n", + "\n", + "causaldb_df['causaldb_join_id'] = \"chr\" + causaldb_df['CHR'].astype(str) + \"_\" + causaldb_df['BP'].astype(str)\n", + "variant_df['causaldb_join_id'] = variant_df['chrom'] + \"_\" + variant_df['var_position'].astype(str)\n", + "\n", + "#causaldb_df = causaldb_df.drop_duplicates(subset=['causaldb_join_id'], keep='first').copy().reset_index(drop=True)\n", + "\n", + "variant_df_gwas = 
variant_df.join(causaldb_df[[\"causaldb_join_id\", 'rsID', 'MAF', 'EA', 'NEA', 'BETA', 'SE', 'P', 'Zscore', 'PAINTOR', 'CAVIARBF', 'FINEMAP', 'meta_id']].set_index(\"causaldb_join_id\"), on='causaldb_join_id', how='inner').copy().reset_index(drop=True)\n", + "variant_df_gwas = variant_df_gwas.query(\"EA == var_nucleotide\").copy().reset_index(drop=True)\n", + "\n", + "#variant_df_gwas.to_csv(\"aparent_resnet_variant_predictions_polyadb_no_sequences_causaldb_no_cutoff.csv\", sep='\\t', index=False)\n", + "\n", + "#Map non-causal variants to predictions\n", + "\n", + "non_causaldb_df['causaldb_join_id'] = \"chr\" + non_causaldb_df['CHR'].astype(str) + \"_\" + non_causaldb_df['BP'].astype(str)\n", + "variant_df['causaldb_join_id'] = variant_df['chrom'] + \"_\" + variant_df['var_position'].astype(str)\n", + "\n", + "#non_causaldb_df = non_causaldb_df.drop_duplicates(subset=['causaldb_join_id'], keep='first').copy().reset_index(drop=True)\n", + "\n", + "variant_df_gwas_noncausal = variant_df.join(non_causaldb_df[[\"causaldb_join_id\", 'rsID', 'MAF', 'EA', 'NEA', 'BETA', 'SE', 'P', 'Zscore', 'PAINTOR', 'CAVIARBF', 'FINEMAP', 'meta_id']].set_index(\"causaldb_join_id\"), on='causaldb_join_id', how='inner').copy().reset_index(drop=True)\n", + "variant_df_gwas_noncausal = variant_df_gwas_noncausal.query(\"EA == var_nucleotide\").copy().reset_index(drop=True)\n", + "\n", + "#variant_df_gwas_noncausal.to_csv(\"aparent_resnet_variant_predictions_polyadb_no_sequences_noncausaldb_no_cutoff.csv\", sep='\\t', index=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "variant_df_gwas = pd.read_csv(\"aparent_resnet_variant_predictions_polyadb_no_sequences_causaldb_no_cutoff.csv\", sep='\\t')\n", + "variant_df_gwas_noncausal = pd.read_csv(\"aparent_resnet_variant_predictions_polyadb_no_sequences_noncausaldb_no_cutoff.csv\", sep='\\t')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + 
"metadata": {}, + "outputs": [], + "source": [ + "#Drop duplicates for partially overlapping annotated PASs\n", + "\n", + "variant_df_gwas['rsID'] = \"rs\" + variant_df_gwas['rsID'].astype(int).astype(str)\n", + "\n", + "variant_df_gwas['delta_logodds_77_127_abs'] = np.abs(variant_df_gwas['delta_logodds_77_127'])\n", + "\n", + "variant_df_gwas['rel_var_position'] = -1\n", + "variant_df_gwas.loc[variant_df_gwas['strand'] == '+', 'rel_var_position'] = variant_df_gwas['var_position'] - (variant_df_gwas['pas_pos'] - 70 + 1)\n", + "variant_df_gwas.loc[variant_df_gwas['strand'] == '-', 'rel_var_position'] = ((variant_df_gwas['pas_pos'] - (205 - 70)) + 205) - variant_df_gwas['var_position']\n", + "\n", + "variant_df_gwas['target_rel_var_position'] = np.abs(90 - variant_df_gwas['rel_var_position'])\n", + "variant_df_gwas['delta_logodds_77_127_abs'] = -variant_df_gwas['delta_logodds_77_127_abs']\n", + "variant_df_gwas = variant_df_gwas.sort_values(by=['target_rel_var_position', 'delta_logodds_77_127_abs'], ascending=True).drop_duplicates(subset=['causaldb_join_id', 'meta_id'], keep='first').copy().reset_index(drop=True)\n", + "\n", + "#Drop duplicates for partially overlapping annotated PASs (non-causal)\n", + "\n", + "variant_df_gwas_noncausal['delta_logodds_77_127_abs'] = np.abs(variant_df_gwas_noncausal['delta_logodds_77_127'])\n", + "\n", + "variant_df_gwas_noncausal['rel_var_position'] = -1\n", + "variant_df_gwas_noncausal.loc[variant_df_gwas_noncausal['strand'] == '+', 'rel_var_position'] = variant_df_gwas_noncausal['var_position'] - (variant_df_gwas_noncausal['pas_pos'] - 70 + 1)\n", + "variant_df_gwas_noncausal.loc[variant_df_gwas_noncausal['strand'] == '-', 'rel_var_position'] = ((variant_df_gwas_noncausal['pas_pos'] - (205 - 70)) + 205) - variant_df_gwas_noncausal['var_position']\n", + "\n", + "variant_df_gwas_noncausal['target_rel_var_position'] = np.abs(90 - variant_df_gwas_noncausal['rel_var_position'])\n", + 
"variant_df_gwas_noncausal['delta_logodds_77_127_abs'] = -variant_df_gwas_noncausal['delta_logodds_77_127_abs']\n", + "variant_df_gwas_noncausal = variant_df_gwas_noncausal.sort_values(by=['target_rel_var_position', 'delta_logodds_77_127_abs'], ascending=True).drop_duplicates(subset=['causaldb_join_id', 'meta_id'], keep='first').copy().reset_index(drop=True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "len(variant_df_gwas_lead) = 8072\n" + ] + } + ], + "source": [ + "#Aggregate summary statistic for unique SNPs (by rsID)\n", + "\n", + "variant_df_gwas_lead = variant_df_gwas.copy().sort_values(by='FINEMAP', ascending=False).drop_duplicates(\"causaldb_join_id\").copy().reset_index(drop=True)\n", + "\n", + "print(\"len(variant_df_gwas_lead) = \" + str(len(variant_df_gwas_lead)))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "n = 8072\n", + "n (>= 1.4-fold) = 999\n", + "\n", + "n = 152 (PP >= 0.9)\n", + "n (>= 1.4-fold) = 26 (PP >= 0.9)\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "\n", + "f = plt.figure(figsize=(4, 4))\n", + "\n", + "variant_df_gwas_lead['delta_logodds_77_127_abs'] = np.abs(variant_df_gwas_lead['delta_logodds_77_127'])\n", + "\n", + "variant_df_gwas_lead_non_signi = variant_df_gwas_lead.query(\"FINEMAP < 0.9\")\n", + "variant_df_gwas_lead_signi = variant_df_gwas_lead.query(\"FINEMAP >= 0.9\")\n", + "\n", + "print(\"n = \" + str(len(variant_df_gwas_lead)))\n", + "print(\"n (>= 1.4-fold) = \" + str(len(variant_df_gwas_lead.query(\"delta_logodds_77_127_abs >= 0.336\"))))\n", + "\n", + "print(\"\")\n", + "print(\"n = \" + str(len(variant_df_gwas_lead_signi)) + \" (PP >= 0.9)\")\n", + "print(\"n (>= 1.4-fold) = \" + str(len(variant_df_gwas_lead_signi.query(\"delta_logodds_77_127_abs >= 0.336\"))) + \" (PP >= 0.9)\")\n", + "\n", + "plt.scatter(variant_df_gwas_lead_non_signi['delta_logodds_77_127_abs'] / np.log(2.), -np.log(np.clip(variant_df_gwas_lead_non_signi['P'], 1e-200, 1.)) / np.log(10), color='darkgreen', s=15, alpha=0.95)\n", + "\n", + "plt.scatter(variant_df_gwas_lead_signi.query(\"delta_logodds_77_127_abs < 0.336\")['delta_logodds_77_127_abs'] / np.log(2.), -np.log(np.clip(variant_df_gwas_lead_signi.query(\"delta_logodds_77_127_abs < 0.336\")['P'], 1e-200, 1.)) / np.log(10), color='red', s=15, alpha=0.95)\n", + "plt.scatter(variant_df_gwas_lead_signi.query(\"delta_logodds_77_127_abs >= 0.336\")['delta_logodds_77_127_abs'] / np.log(2.), -np.log(np.clip(variant_df_gwas_lead_signi.query(\"delta_logodds_77_127_abs >= 0.336\")['P'], 1e-200, 1.)) / np.log(10), color='red', s=35, marker=\"^\", alpha=0.95)\n", + "\n", + "plt.axvline(x=np.log2(1.4), linestyle='--', linewidth=2, color='black')\n", + "\n", + "plt.xlim(-0.1, 6.0)\n", + "plt.ylim(3, 80)\n", + "\n", + "plt.xticks([0, 1, 2, 3, 4, 5, 6], ['Ref', '2-fold', '4-fold', '8-fold', '16-fold', '32-fold', '64-fold'], fontsize=12, rotation=45)\n", + "plt.yticks([10, 20, 40, 60, 80], [\"1e-10\", 
\"1e-20\", \"1e-40\", \"1e-60\", \"1e-80\", \"1e-100\"], fontsize=12, rotation=45)\n", + "\n", + "plt.savefig(\"causaldb_aparent_resnet_scatter_pp_09_v2.png\", transparent=True, dpi=300)\n", + "plt.savefig(\"causaldb_aparent_resnet_scatter_pp_09_v2.eps\")\n", + "\n", + "plt.tight_layout()\n", + "\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "n = 8072\n", + "n (>= 1.4-fold) = 999\n", + "\n", + "n = 121 (PP >= 0.5)\n", + "n (>= 1.4-fold) = 13 (PP >= 0.5)\n", + "\n", + "n = 152 (PP >= 0.9)\n", + "n (>= 1.4-fold) = 26 (PP >= 0.9)\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "\n", + "f = plt.figure(figsize=(4, 4))\n", + "\n", + "variant_df_gwas_lead['delta_logodds_77_127_abs'] = np.abs(variant_df_gwas_lead['delta_logodds_77_127'])\n", + "\n", + "variant_df_gwas_lead_non_signi = variant_df_gwas_lead.query(\"FINEMAP < 0.5\")\n", + "variant_df_gwas_lead_semi_signi = variant_df_gwas_lead.query(\"FINEMAP >= 0.5 and FINEMAP < 0.9\")\n", + "variant_df_gwas_lead_signi = variant_df_gwas_lead.query(\"FINEMAP >= 0.9\")\n", + "\n", + "print(\"n = \" + str(len(variant_df_gwas_lead)))\n", + "print(\"n (>= 1.4-fold) = \" + str(len(variant_df_gwas_lead.query(\"delta_logodds_77_127_abs >= 0.336\"))))\n", + "\n", + "print(\"\")\n", + "print(\"n = \" + str(len(variant_df_gwas_lead_semi_signi)) + \" (PP >= 0.5)\")\n", + "print(\"n (>= 1.4-fold) = \" + str(len(variant_df_gwas_lead_semi_signi.query(\"delta_logodds_77_127_abs >= 0.336\"))) + \" (PP >= 0.5)\")\n", + "\n", + "print(\"\")\n", + "print(\"n = \" + str(len(variant_df_gwas_lead_signi)) + \" (PP >= 0.9)\")\n", + "print(\"n (>= 1.4-fold) = \" + str(len(variant_df_gwas_lead_signi.query(\"delta_logodds_77_127_abs >= 0.336\"))) + \" (PP >= 0.9)\")\n", + "\n", + "plt.scatter(variant_df_gwas_lead_non_signi['delta_logodds_77_127_abs'] / np.log(2.), -np.log(np.clip(variant_df_gwas_lead_non_signi['P'], 1e-200, 1.)) / np.log(10), color='darkgreen', s=15, alpha=0.95)\n", + "\n", + "plt.scatter(variant_df_gwas_lead_semi_signi.query(\"delta_logodds_77_127_abs < 0.336\")['delta_logodds_77_127_abs'] / np.log(2.), -np.log(np.clip(variant_df_gwas_lead_semi_signi.query(\"delta_logodds_77_127_abs < 0.336\")['P'], 1e-200, 1.)) / np.log(10), color='blue', s=15, alpha=0.95)\n", + "plt.scatter(variant_df_gwas_lead_semi_signi.query(\"delta_logodds_77_127_abs >= 0.336\")['delta_logodds_77_127_abs'] / np.log(2.), -np.log(np.clip(variant_df_gwas_lead_semi_signi.query(\"delta_logodds_77_127_abs >= 0.336\")['P'], 1e-200, 1.)) / np.log(10), 
color='blue', s=35, marker=\"^\", alpha=0.95)\n", + "\n", + "plt.scatter(variant_df_gwas_lead_signi.query(\"delta_logodds_77_127_abs < 0.336\")['delta_logodds_77_127_abs'] / np.log(2.), -np.log(np.clip(variant_df_gwas_lead_signi.query(\"delta_logodds_77_127_abs < 0.336\")['P'], 1e-200, 1.)) / np.log(10), color='red', s=15, alpha=0.95)\n", + "plt.scatter(variant_df_gwas_lead_signi.query(\"delta_logodds_77_127_abs >= 0.336\")['delta_logodds_77_127_abs'] / np.log(2.), -np.log(np.clip(variant_df_gwas_lead_signi.query(\"delta_logodds_77_127_abs >= 0.336\")['P'], 1e-200, 1.)) / np.log(10), color='red', s=35, marker=\"^\", alpha=0.95)\n", + "\n", + "plt.axvline(x=np.log2(1.4), linestyle='--', linewidth=2, color='black')\n", + "\n", + "plt.xlim(-0.1, 6.0)\n", + "plt.ylim(3, 80)\n", + "\n", + "plt.xticks([0, 1, 2, 3, 4, 5, 6], ['Ref', '2-fold', '4-fold', '8-fold', '16-fold', '32-fold', '64-fold'], fontsize=12, rotation=45)\n", + "plt.yticks([10, 20, 40, 60, 80], [\"1e-10\", \"1e-20\", \"1e-40\", \"1e-60\", \"1e-80\", \"1e-100\"], fontsize=12, rotation=45)\n", + "\n", + "plt.savefig(\"causaldb_aparent_resnet_scatter_pp_05_09_v2.png\", transparent=True, dpi=300)\n", + "plt.savefig(\"causaldb_aparent_resnet_scatter_pp_05_09_v2.eps\")\n", + "\n", + "plt.tight_layout()\n", + "\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "#Filter on 3' UTR SNPs only\n", + "\n", + "variant_df_gwas = variant_df_gwas.query(\"site_type == '3_most_exon'\").copy().reset_index(drop=True)\n", + "variant_df_gwas_lead = variant_df_gwas_lead.query(\"site_type == '3_most_exon'\").copy().reset_index(drop=True)\n", + "variant_df_gwas_noncausal = variant_df_gwas_noncausal.query(\"site_type == '3_most_exon'\").copy().reset_index(drop=True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "n = 4200\n", + "n (>= 
1.4-fold) = 554\n", + "\n", + "n = 70 (PP >= 0.5)\n", + "n (>= 1.4-fold) = 8 (PP >= 0.5)\n", + "\n", + "n = 96 (PP >= 0.9)\n", + "n (>= 1.4-fold) = 17 (PP >= 0.9)\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "\n", + "f = plt.figure(figsize=(4, 4))\n", + "\n", + "variant_df_gwas_lead['delta_logodds_77_127_abs'] = np.abs(variant_df_gwas_lead['delta_logodds_77_127'])\n", + "\n", + "variant_df_gwas_lead_non_signi = variant_df_gwas_lead.query(\"FINEMAP < 0.5\")\n", + "variant_df_gwas_lead_semi_signi = variant_df_gwas_lead.query(\"FINEMAP >= 0.5 and FINEMAP < 0.9\")\n", + "variant_df_gwas_lead_signi = variant_df_gwas_lead.query(\"FINEMAP >= 0.9\")\n", + "\n", + "print(\"n = \" + str(len(variant_df_gwas_lead)))\n", + "print(\"n (>= 1.4-fold) = \" + str(len(variant_df_gwas_lead.query(\"delta_logodds_77_127_abs >= 0.336\"))))\n", + "\n", + "print(\"\")\n", + "print(\"n = \" + str(len(variant_df_gwas_lead_semi_signi)) + \" (PP >= 0.5)\")\n", + "print(\"n (>= 1.4-fold) = \" + str(len(variant_df_gwas_lead_semi_signi.query(\"delta_logodds_77_127_abs >= 0.336\"))) + \" (PP >= 0.5)\")\n", + "\n", + "print(\"\")\n", + "print(\"n = \" + str(len(variant_df_gwas_lead_signi)) + \" (PP >= 0.9)\")\n", + "print(\"n (>= 1.4-fold) = \" + str(len(variant_df_gwas_lead_signi.query(\"delta_logodds_77_127_abs >= 0.336\"))) + \" (PP >= 0.9)\")\n", + "\n", + "plt.scatter(variant_df_gwas_lead_non_signi['delta_logodds_77_127_abs'] / np.log(2.), -np.log(np.clip(variant_df_gwas_lead_non_signi['P'], 1e-200, 1.)) / np.log(10), color='darkgreen', s=15, alpha=0.95)\n", + "\n", + "plt.scatter(variant_df_gwas_lead_semi_signi['delta_logodds_77_127_abs'] / np.log(2.), -np.log(np.clip(variant_df_gwas_lead_semi_signi['P'], 1e-200, 1.)) / np.log(10), color='blue', s=15, alpha=0.95)\n", + "\n", + "plt.scatter(variant_df_gwas_lead_signi['delta_logodds_77_127_abs'] / np.log(2.), -np.log(np.clip(variant_df_gwas_lead_signi['P'], 1e-200, 1.)) / np.log(10), color='red', s=15, alpha=0.95)\n", + "\n", + "#Re-plot rs6796 with other association to make it visible\n", + "rs6796_df = variant_df_gwas.query(\"rsID == 'rs6796' and meta_id == 
'AT690'\").copy()\n", + "rs6796_df['delta_logodds_77_127_abs'] = np.abs(rs6796_df['delta_logodds_77_127'])\n", + "plt.scatter(rs6796_df['delta_logodds_77_127_abs'] / np.log(2.), -np.log(np.clip(rs6796_df['P'], 1e-200, 1.)) / np.log(10), color='red', s=15, alpha=0.95)\n", + "\n", + "plt.axvline(x=np.log2(1.4), linestyle='--', linewidth=2, color='black')\n", + "\n", + "plt.xlim(-0.1, 5.0)\n", + "plt.ylim(3, 80)\n", + "\n", + "plt.xticks([0, 1, 2, 3, 4, 5], ['Ref', '2-fold', '4-fold', '8-fold', '16-fold', '32-fold'], fontsize=12, rotation=45)\n", + "plt.yticks([10, 20, 40, 60, 80], [\"1e-10\", \"1e-20\", \"1e-40\", \"1e-60\", \"1e-80\", \"1e-100\"], fontsize=12, rotation=45)\n", + "\n", + "plt.savefig(\"causaldb_aparent_resnet_scatter_utr3_only_pp_05_09_v2.png\", transparent=True, dpi=300)\n", + "plt.savefig(\"causaldb_aparent_resnet_scatter_utr3_only_pp_05_09_v2.eps\")\n", + "\n", + "plt.tight_layout()\n", + "\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "n = 4200\n", + "n (>= 1.4-fold) = 554\n", + "\n", + "n = 70 (PP >= 0.5)\n", + "n (>= 1.4-fold) = 8 (PP >= 0.5)\n", + "\n", + "n = 96 (PP >= 0.9)\n", + "n (>= 1.4-fold) = 17 (PP >= 0.9)\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "\n", + "f = plt.figure(figsize=(4, 4))\n", + "\n", + "variant_df_gwas_lead['delta_logodds_77_127_abs'] = np.abs(variant_df_gwas_lead['delta_logodds_77_127'])\n", + "\n", + "variant_df_gwas_lead_non_signi = variant_df_gwas_lead.query(\"FINEMAP < 0.5\")\n", + "variant_df_gwas_lead_semi_signi = variant_df_gwas_lead.query(\"FINEMAP >= 0.5 and FINEMAP < 0.9\")\n", + "variant_df_gwas_lead_signi = variant_df_gwas_lead.query(\"FINEMAP >= 0.9\")\n", + "\n", + "print(\"n = \" + str(len(variant_df_gwas_lead)))\n", + "print(\"n (>= 1.4-fold) = \" + str(len(variant_df_gwas_lead.query(\"delta_logodds_77_127_abs >= 0.336\"))))\n", + "\n", + "print(\"\")\n", + "print(\"n = \" + str(len(variant_df_gwas_lead_semi_signi)) + \" (PP >= 0.5)\")\n", + "print(\"n (>= 1.4-fold) = \" + str(len(variant_df_gwas_lead_semi_signi.query(\"delta_logodds_77_127_abs >= 0.336\"))) + \" (PP >= 0.5)\")\n", + "\n", + "print(\"\")\n", + "print(\"n = \" + str(len(variant_df_gwas_lead_signi)) + \" (PP >= 0.9)\")\n", + "print(\"n (>= 1.4-fold) = \" + str(len(variant_df_gwas_lead_signi.query(\"delta_logodds_77_127_abs >= 0.336\"))) + \" (PP >= 0.9)\")\n", + "\n", + "plt.scatter(variant_df_gwas_lead_non_signi['delta_logodds_77_127_abs'] / np.log(2.), -np.log(np.clip(variant_df_gwas_lead_non_signi['P'], 1e-200, 1.)) / np.log(10), color='darkgreen', s=15, alpha=0.95)\n", + "\n", + "plt.scatter(variant_df_gwas_lead_semi_signi['delta_logodds_77_127_abs'] / np.log(2.), -np.log(np.clip(variant_df_gwas_lead_semi_signi['P'], 1e-200, 1.)) / np.log(10), color='blue', s=35, marker=\"^\", alpha=0.95)\n", + "\n", + "plt.scatter(variant_df_gwas_lead_signi['delta_logodds_77_127_abs'] / np.log(2.), -np.log(np.clip(variant_df_gwas_lead_signi['P'], 1e-200, 1.)) / np.log(10), color='red', s=35, marker=\"^\", alpha=0.95)\n", + "\n", + "#Re-plot rs6796 with other association to make it visible\n", + "rs6796_df = variant_df_gwas.query(\"rsID == 
'rs6796' and meta_id == 'AT690'\").copy()\n", + "rs6796_df['delta_logodds_77_127_abs'] = np.abs(rs6796_df['delta_logodds_77_127'])\n", + "plt.scatter(rs6796_df['delta_logodds_77_127_abs'] / np.log(2.), -np.log(np.clip(rs6796_df['P'], 1e-200, 1.)) / np.log(10), color='red', s=35, marker=\"^\", alpha=0.95)\n", + "\n", + "plt.axvline(x=np.log2(1.4), linestyle='--', linewidth=2, color='black')\n", + "\n", + "plt.xlim(-0.1, 5.0)\n", + "plt.ylim(3, 80)\n", + "\n", + "plt.xticks([0, 1, 2, 3, 4, 5], ['Ref', '2-fold', '4-fold', '8-fold', '16-fold', '32-fold'], fontsize=12, rotation=45)\n", + "plt.yticks([10, 20, 40, 60, 80], [\"1e-10\", \"1e-20\", \"1e-40\", \"1e-60\", \"1e-80\", \"1e-100\"], fontsize=12, rotation=45)\n", + "\n", + "plt.savefig(\"causaldb_aparent_resnet_scatter_utr3_only_pp_05_09_v2_alt.png\", transparent=True, dpi=300)\n", + "plt.savefig(\"causaldb_aparent_resnet_scatter_utr3_only_pp_05_09_v2_alt.eps\")\n", + "\n", + "plt.tight_layout()\n", + "\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [], + "source": [ + "#Get SNPs with 0.5 <= PP < 0.9 and SNPs with PP >= 0.9\n", + "\n", + "variant_df_gwas_semi_signi = variant_df_gwas.query(\"FINEMAP >= 0.5 and FINEMAP < 0.9\").copy()\n", + "variant_df_gwas_lead_semi_signi = variant_df_gwas_lead_semi_signi.query(\"FINEMAP >= 0.5 and FINEMAP < 0.9\").copy()\n", + "\n", + "variant_df_gwas_semi_signi = variant_df_gwas_semi_signi.join(causaldb_meta_df[['ID', 'Trait', 'MeSH_term']].set_index(\"ID\"), on='meta_id', how='inner').copy().reset_index(drop=True)\n", + "variant_df_gwas_lead_semi_signi = variant_df_gwas_lead_semi_signi.join(causaldb_meta_df[['ID', 'Trait', 'MeSH_term']].set_index(\"ID\"), on='meta_id', how='inner').copy().reset_index(drop=True)\n", + "\n", + "variant_df_gwas_signi = variant_df_gwas.query(\"FINEMAP >= 0.9\").copy()\n", + "variant_df_gwas_lead_signi = variant_df_gwas_lead_signi.query(\"FINEMAP >= 0.9\").copy()\n", + "\n", + "variant_df_gwas_signi = 
variant_df_gwas_signi.join(causaldb_meta_df[['ID', 'Trait', 'MeSH_term']].set_index(\"ID\"), on='meta_id', how='inner').copy().reset_index(drop=True)\n", + "variant_df_gwas_lead_signi = variant_df_gwas_lead_signi.join(causaldb_meta_df[['ID', 'Trait', 'MeSH_term']].set_index(\"ID\"), on='meta_id', how='inner').copy().reset_index(drop=True)\n", + "\n", + "variant_df_gwas_lead_semi_signi['delta_logodds_77_127_abs_log_2'] = 2**(variant_df_gwas_lead_semi_signi['delta_logodds_77_127_abs'] / np.log(2.))\n", + "variant_df_gwas_lead_signi['delta_logodds_77_127_abs_log_2'] = 2**(variant_df_gwas_lead_signi['delta_logodds_77_127_abs'] / np.log(2.))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [], + "source": [ + "#Get variant ids\n", + "\n", + "rs_ids = set(variant_df_gwas['rsID'].unique().tolist())\n", + "\n", + "rs_ids_semi_signi = set(variant_df_gwas_semi_signi['rsID'].unique().tolist())\n", + "rs_ids_lead_semi_signi = set(variant_df_gwas_lead_semi_signi['rsID'].unique().tolist())\n", + "\n", + "rs_ids_signi = set(variant_df_gwas_signi['rsID'].unique().tolist())\n", + "rs_ids_lead_signi = set(variant_df_gwas_lead_signi['rsID'].unique().tolist())\n" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--- SNPs from credible set w. 
PP >= 0.9 ---\n", + "rs78378222 : True\n", + " - delta_logodds_77_127 = -2.3624\n", + " - delta_logodds_77_127_abs_log_2 = 10.6162\n", + " -- Mean corpuscular hemoglobin, p-value = 6.44e-09, FINEMAP = 0.997827\n", + " -- Red blood cell (erythrocyte) count, p-value = 7.6177e-25, FINEMAP = 0.921756\n", + " -- Mean corpuscular haemoglobin, p-value = 1.1518e-23, FINEMAP = 1.0\n", + " -- Mean corpuscular volume, p-value = 1.8429000000000003e-23, FINEMAP = 1.0\n", + " -- Impedance measures - Basal metabolic rate, p-value = 1.538e-34, FINEMAP = 1.0\n", + " -- Impedance measures - Whole body water mass, p-value = 1.2809999999999999e-40, FINEMAP = 1.0\n", + " -- Weight, p-value = 1.7840000000000003e-18, FINEMAP = 0.999278\n", + " -- Impedance measures - Weight, p-value = 4.428e-18, FINEMAP = 0.999086\n", + " -- Impedance measures - Leg fat-free mass (right), p-value = 7.078e-33, FINEMAP = 1.0\n", + " -- Impedance measures - Leg predicted mass (right), p-value = 6.511e-33, FINEMAP = 1.0\n", + " -- Impedance measures - Leg fat-free mass (left), p-value = 1.2279999999999999e-29, FINEMAP = 1.0\n", + " -- Impedance measures - Leg predicted mass (left), p-value = 1.62e-29, FINEMAP = 1.0\n", + " -- Impedance measures - Whole body fat-free mass, p-value = 6.189e-40, FINEMAP = 1.0\n", + " -- Impedance measures - Arm fat-free mass (right), p-value = 6.931000000000001e-35, FINEMAP = 1.0\n", + " -- Impedance measures - Arm predicted mass (right), p-value = 1.617e-34, FINEMAP = 1.0\n", + " -- Impedance measures - Arm fat-free mass (left), p-value = 1.3809999999999998e-40, FINEMAP = 1.0\n", + " -- Impedance measures - Arm predicted mass (left), p-value = 1.9379999999999997e-35, FINEMAP = 1.0\n", + " -- Impedance measures - Trunk fat-free mass, p-value = 1.18e-42, FINEMAP = 1.0\n", + " -- Impedance measures - Trunk predicted mass, p-value = 6.612000000000001e-43, FINEMAP = 1.0\n", + " -- Mean reticulocyte volume, p-value = 3.1284e-13, FINEMAP = 0.975953\n", + " -- Mean sphered cell 
volume, p-value = 3.8348e-24, FINEMAP = 1.0\n", + " -- Basal metabolic rate, p-value = 2.99675e-34, FINEMAP = 1.0\n", + " -- Whole body water mass, p-value = 4.67851e-39, FINEMAP = 1.0\n", + " -- Whole body fat-free mass, p-value = 2.4184e-38, FINEMAP = 1.0\n", + " -- Basal metabolic rate, p-value = 2.5649e-43, FINEMAP = 1.0\n", + " -- Whole body water mass, p-value = 2.733599999999998e-51, FINEMAP = 1.0\n", + " -- Weight, p-value = 9.692900000000001e-23, FINEMAP = 0.999992\n", + " -- Weight, p-value = 5.4847e-23, FINEMAP = 0.999998\n", + " -- C43-C44 Melanoma and other malignant neoplasms of skin, p-value = 8.7235e-17, FINEMAP = 1.0\n", + " -- Impedance of arm (right), p-value = 3.3375999999999994e-26, FINEMAP = 0.999524\n", + " -- Impedance of whole body, p-value = 6.6648e-36, FINEMAP = 1.0\n", + " -- Impedance of arm (left), p-value = 6.980800000000001e-27, FINEMAP = 1.0\n", + " -- Impedance of leg (left), p-value = 4.849599999999999e-30, FINEMAP = 1.0\n", + " -- Impedance of leg (right), p-value = 3.5038999999999998e-31, FINEMAP = 1.0\n", + " -- L80-L99 Other disorders of the skin and subcutaneous tissue, p-value = 1.5893e-08, FINEMAP = 0.998027\n", + " -- C44 Other malignant neoplasms of skin, p-value = 1.9368e-15, FINEMAP = 1.0\n", + " -- Trunk predicted mass, p-value = 1.8052000000000002e-52, FINEMAP = 1.0\n", + " -- Trunk fat-free mass, p-value = 1.1203e-52, FINEMAP = 1.0\n", + " -- Arm predicted mass (left), p-value = 1.3085999999999993e-43, FINEMAP = 1.0\n", + " -- Arm fat-free mass (left), p-value = 2.0442999999999999e-44, FINEMAP = 1.0\n", + " -- Arm predicted mass (right), p-value = 8.2865e-44, FINEMAP = 1.0\n", + " -- Arm fat-free mass (right), p-value = 4.6606e-44, FINEMAP = 1.0\n", + " -- Leg predicted mass (left), p-value = 2.7929999999999995e-37, FINEMAP = 1.0\n", + " -- Leg fat-free mass (left), p-value = 1.8524999999999998e-37, FINEMAP = 1.0\n", + " -- Leg predicted mass (right), p-value = 1.4567999999999998e-41, FINEMAP = 1.0\n", + " -- Leg 
fat-free mass (right), p-value = 1.8934e-41, FINEMAP = 1.0\n", + " -- Whole body fat-free mass, p-value = 8.64389999999999e-51, FINEMAP = 1.0\n", + " -- Number of operations, self-reported, p-value = 2.8876e-17, FINEMAP = 1.0\n", + " -- Cancer register - Histology of cancer tumour: Basal cell carcinoma, NOS, p-value = 1.049e-09, FINEMAP = 0.998733\n", + " -- Ever had hysterectomy (womb removed) (female), p-value = 6.419e-09, FINEMAP = 0.99889\n", + " -- Impedance measures - Impedance of whole body, p-value = 8.096e-26, FINEMAP = 1.0\n", + " -- Impedance measures - Impedance of leg (right), p-value = 2.112e-21, FINEMAP = 1.0\n", + " -- Impedance measures - Impedance of leg (left), p-value = 4.0769999999999997e-20, FINEMAP = 1.0\n", + " -- Impedance measures - Impedance of arm (right), p-value = 3.779e-18, FINEMAP = 0.999041\n", + " -- Impedance measures - Impedance of arm (left), p-value = 3.066e-19, FINEMAP = 0.999904\n", + " -- C44 Other and unspecified malignant neoplasm of skin, p-value = 1.1990000000000002e-22, FINEMAP = 1.0\n", + " -- Years since last cervical smear test (female), p-value = 4.092e-10, FINEMAP = 0.978371\n", + " -- Weight, p-value = 5.9555e-18, FINEMAP = 0.999365\n", + " -- Weight, p-value = 4.066819999999999e-18, FINEMAP = 0.999456\n", + " -- Forced expiratory volume in 1-second (FEV1), Best measure, p-value = 6.6753200000000005e-09, FINEMAP = 0.955059\n", + " -- Forced vital capacity (FVC), Best measure, p-value = 5.29275e-12, FINEMAP = 0.996491\n", + " -- D25 Leiomyoma of uterus, p-value = 8.048380000000004e-14, FINEMAP = 0.999996\n", + " -- Impedance of arm (right), p-value = 1.52703e-19, FINEMAP = 0.99984\n", + " -- Impedance of whole body, p-value = 2.82892e-25, FINEMAP = 1.0\n", + " -- Impedance of arm (left), p-value = 1.8908999999999998e-20, FINEMAP = 0.999983\n", + " -- Impedance of leg (left), p-value = 1.2351e-19, FINEMAP = 1.0\n", + " -- Impedance of leg (right), p-value = 2.81817e-21, FINEMAP = 1.0\n", + " -- C44 Other malignant 
neoplasms of skin, p-value = 3.7645499999999996e-24, FINEMAP = 1.0\n", + " -- Trunk predicted mass, p-value = 1.3318199999999998e-39, FINEMAP = 1.0\n", + " -- Trunk fat-free mass, p-value = 8.77218e-40, FINEMAP = 1.0\n", + " -- Arm predicted mass (left), p-value = 1.32153e-32, FINEMAP = 1.0\n", + " -- Arm fat-free mass (left), p-value = 5.3152e-34, FINEMAP = 1.0\n", + " -- Arm predicted mass (right), p-value = 1.60397e-33, FINEMAP = 1.0\n", + " -- Arm fat-free mass (right), p-value = 9.020689999999998e-34, FINEMAP = 1.0\n", + " -- Leg predicted mass (left), p-value = 6.1356399999999995e-31, FINEMAP = 1.0\n", + " -- Leg fat-free mass (left), p-value = 4.4373299999999995e-31, FINEMAP = 1.0\n", + " -- Leg fat-free mass (right), p-value = 5.57084e-34, FINEMAP = 1.0\n", + " -- Leg predicted mass (right), p-value = 4.09828e-34, FINEMAP = 1.0\n", + "\n", + "rs6796 : True\n", + " - delta_logodds_77_127 = -2.2975\n", + " - delta_logodds_77_127_abs_log_2 = 9.9491\n", + " -- Mean platelet (thrombocyte) volume, p-value = 8.899099999999991e-69, FINEMAP = 0.933805\n", + " -- Monocyte count, p-value = 8.468099999999995e-67, FINEMAP = 0.999974\n", + " -- Monocyte percentage, p-value = 1.2099999999999982e-87, FINEMAP = 0.99999\n", + " -- Neutrophill percentage, p-value = 1.7466999999999994e-60, FINEMAP = 0.996227\n", + "\n", + "rs76020419 : True\n", + " - delta_logodds_77_127 = 1.4474\n", + " - delta_logodds_77_127_abs_log_2 = 4.252\n", + " -- Diastolic blood pressure (automated reading), p-value = 7.082e-11, FINEMAP = 0.972447\n", + " -- Intra-ocular pressure, corneal-compensated (right), p-value = 5.49045e-12, FINEMAP = 0.998956\n", + " -- Intra-ocular pressure, Goldmann-correlated (right), p-value = 3.5142800000000013e-10, FINEMAP = 0.98917\n", + "\n", + "rs544433296 : True\n", + " - delta_logodds_77_127 = -1.3535\n", + " - delta_logodds_77_127_abs_log_2 = 3.8708\n", + " -- Education - Qualifications, p-value = 1.1289999999999999e-07, FINEMAP = 0.974245\n", + "\n", + "rs3208787 
: True\n", + " - delta_logodds_77_127 = -1.254\n", + " - delta_logodds_77_127_abs_log_2 = 3.5043\n", + " -- Red blood cell (erythrocyte) count, p-value = 9.0991e-19, FINEMAP = 0.999941\n", + " -- Mean corpuscular haemoglobin, p-value = 5.9438e-11, FINEMAP = 0.997529\n", + " -- Mean corpuscular volume, p-value = 7.326899999999999e-11, FINEMAP = 0.994247\n", + " -- Red blood cell (erythrocyte) distribution width, p-value = 1.0262e-09, FINEMAP = 0.998158\n", + " -- Mean reticulocyte volume, p-value = 4.0174e-09, FINEMAP = 0.984262\n", + " -- Red blood cell count, p-value = 1.49e-09, FINEMAP = 0.997299\n", + " -- Red cell distribution width, p-value = 9.69e-12, FINEMAP = 0.999368\n", + "\n", + "rs35979828 : True\n", + " - delta_logodds_77_127 = -1.0283\n", + " - delta_logodds_77_127_abs_log_2 = 2.7962\n", + " -- Immature reticulocyte fraction, p-value = 2.8606e-09, FINEMAP = 0.988875\n", + " -- Mean sphered cell volume, p-value = 2.6656999999999996e-52, FINEMAP = 1.0\n", + " -- Eosinophill percentage, p-value = 5.7338e-23, FINEMAP = 1.0\n", + " -- Monocyte count, p-value = 7.5808e-21, FINEMAP = 0.99999\n", + " -- Eosinophill count, p-value = 5.1232e-21, FINEMAP = 1.0\n", + "\n", + "rs884205 : True\n", + " - delta_logodds_77_127 = -0.798\n", + " - delta_logodds_77_127_abs_log_2 = 2.2211\n", + " -- Sitting height, p-value = 1.2315999999999998e-19, FINEMAP = 0.999446\n", + " -- Alkaline phosphatase, p-value = 5.457e-11, FINEMAP = 0.999985\n", + "\n", + "rs1419008 : True\n", + " - delta_logodds_77_127 = -0.7825\n", + " - delta_logodds_77_127_abs_log_2 = 2.187\n", + " -- Age at first live birth (female), p-value = 1.391e-09, FINEMAP = 0.968478\n", + "\n", + "rs10492321 : True\n", + " - delta_logodds_77_127 = -0.7597\n", + " - delta_logodds_77_127_abs_log_2 = 2.1376\n", + " -- Height tails, p-value = 9.4e-10, FINEMAP = 0.948431\n", + "\n", + "rs1055253 : True\n", + " - delta_logodds_77_127 = 0.4956\n", + " - delta_logodds_77_127_abs_log_2 = 1.6414\n", + " -- Red blood cell 
(erythrocyte) count, p-value = 1.3947000000000004e-15, FINEMAP = 0.98616\n", + "\n", + "rs185853558 : True\n", + " - delta_logodds_77_127 = 0.4924\n", + " - delta_logodds_77_127_abs_log_2 = 1.6363\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " -- Frequency of travelling from home to job workplace, p-value = 4.319e-10, FINEMAP = 0.999907\n", + "\n", + "rs148354990 : True\n", + " - delta_logodds_77_127 = 0.4284\n", + " - delta_logodds_77_127_abs_log_2 = 1.5349\n", + " -- Ease of skin tanning, p-value = 6.536e-10, FINEMAP = 0.974185\n", + "\n", + "rs7078 : True\n", + " - delta_logodds_77_127 = -0.4102\n", + " - delta_logodds_77_127_abs_log_2 = 1.5071\n", + " -- Heel bone mineral density (BMD) T-score, automated, p-value = 2.08993e-11, FINEMAP = 0.998068\n", + "\n", + "rs3741434 : True\n", + " - delta_logodds_77_127 = -0.4016\n", + " - delta_logodds_77_127_abs_log_2 = 1.4942\n", + " -- Cognitive performance, p-value = 4.68e-08, FINEMAP = 0.96687\n", + "\n", + "rs8045438 : True\n", + " - delta_logodds_77_127 = 0.3838\n", + " - delta_logodds_77_127_abs_log_2 = 1.4679\n", + " -- Dihomo-gamma-linolenic acid, p-value = 7.89e-14, FINEMAP = 1.0\n", + "\n", + "rs190092330 : True\n", + " - delta_logodds_77_127 = 0.3455\n", + " - delta_logodds_77_127_abs_log_2 = 1.4127\n", + " -- Nucleated red blood cell count, p-value = 1.0699e-14, FINEMAP = 0.947007\n", + "\n", + "rs1799963 : True\n", + " - delta_logodds_77_127 = 0.3436\n", + " - delta_logodds_77_127_abs_log_2 = 1.4101\n", + " -- I80 Phlebitis and thrombophlebitis, p-value = 1.29e-16, FINEMAP = 0.905818\n", + " -- deep venous thrombosis (dvt), p-value = 7.650299999999999e-32, FINEMAP = 0.982793\n", + " -- venous thromboembolic disease, p-value = 1.3731999999999999e-37, FINEMAP = 0.98843\n", + "\n" + ] + } + ], + "source": [ + "#Top candidates from CausalDB with PP >= 0.9 (cutoff immediately after known pathogenic variant rs1799963)\n", + "\n", + "cand_rs_ids = 
variant_df_gwas_lead_signi.sort_values(by='delta_logodds_77_127_abs_log_2', ascending=False).query(\"delta_logodds_77_127_abs_log_2 >= 1.4\")['rsID'].values.tolist()\n", + "\n", + "print(\"--- SNPs from credible set w. PP >= 0.9 ---\")\n", + "\n", + "for cand_rs_id in cand_rs_ids :\n", + " print((cand_rs_id + \" \" * 15)[:15] + \": \" + str((cand_rs_id in rs_ids_signi)))\n", + " \n", + " if cand_rs_id in rs_ids_signi :\n", + " print(\" - delta_logodds_77_127 = \" + str(round(variant_df_gwas_lead_signi.query(\"rsID == '\" + cand_rs_id + \"'\").iloc[0]['delta_logodds_77_127'], 4)))\n", + " print(\" - delta_logodds_77_127_abs_log_2 = \" + str(round(2**(np.abs(variant_df_gwas_lead_signi.query(\"rsID == '\" + cand_rs_id + \"'\").iloc[0]['delta_logodds_77_127']) / np.log(2.)), 4)))\n", + " \n", + " subset_df = variant_df_gwas_signi.query(\"rsID == '\" + cand_rs_id + \"'\")\n", + " for _, row in subset_df.iterrows() :\n", + " print(\" -- \" + row['Trait'] + \", p-value = \" + str(row['P']) + \", FINEMAP = \" + str(row['FINEMAP']))\n", + "\n", + " print(\"\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--- SNPs from credible set w. 
PP >= 0.5 ---\n", + "rs11556473 : True\n", + " - delta_logodds_77_127 = -1.1012\n", + " - delta_logodds_77_127_abs_log_2 = 3.0076\n", + " -- Salt added to food, p-value = 3.142e-08, FINEMAP = 0.523251\n", + "\n", + "rs2732480 : True\n", + " - delta_logodds_77_127 = 0.9093\n", + " - delta_logodds_77_127_abs_log_2 = 2.4826\n", + " -- Red blood cell count, p-value = 1.44e-17, FINEMAP = 0.529911\n", + " -- Hematocrit, p-value = 3.9599999999999996e-36, FINEMAP = 0.585425\n", + " -- Hemoglobin concentration, p-value = 7.65e-29, FINEMAP = 0.602596\n", + "\n", + "rs12142199 : True\n", + " - delta_logodds_77_127 = 0.7497\n", + " - delta_logodds_77_127_abs_log_2 = 2.1164\n", + " -- Ulcerative colitis, p-value = 6.72223986453e-10, FINEMAP = 0.5131319999999999\n", + "\n", + "rs1049868 : True\n", + " - delta_logodds_77_127 = -0.6088\n", + " - delta_logodds_77_127_abs_log_2 = 1.8382\n", + " -- Mean platelet volume, p-value = 4.7499999999999995e-09, FINEMAP = 0.531417\n", + " -- Mean platelet (thrombocyte) volume, p-value = 4.977600000000001e-21, FINEMAP = 0.74954\n", + "\n", + "rs703862 : True\n", + " - delta_logodds_77_127 = 0.5425\n", + " - delta_logodds_77_127_abs_log_2 = 1.7203\n", + " -- Comparative height size at age 10, p-value = 5.0648e-09, FINEMAP = 0.676253\n", + "\n", + "rs71516780 : True\n", + " - delta_logodds_77_127 = 0.4584\n", + " - delta_logodds_77_127_abs_log_2 = 1.5816\n", + " -- Impedance of arm (right), p-value = 1.2835e-10, FINEMAP = 0.7825300000000001\n", + " -- Impedance of whole body, p-value = 1.1593e-08, FINEMAP = 0.702916\n", + " -- Impedance of arm (left), p-value = 7.13e-12, FINEMAP = 0.843332\n", + " -- Immature reticulocyte fraction, p-value = 1.9891000000000003e-08, FINEMAP = 0.586407\n", + " -- Impedance of arm (right), p-value = 2.03412e-08, FINEMAP = 0.549821\n", + " -- Impedance of arm (left), p-value = 7.879350000000001e-09, FINEMAP = 0.599094\n", + "\n", + "rs8077889 : True\n", + " - delta_logodds_77_127 = -0.4509\n", + " - 
delta_logodds_77_127_abs_log_2 = 1.5697\n", + " -- Triglycerides, p-value = 9.88e-09, FINEMAP = 0.862151\n", + "\n", + "rs13675 : True\n", + " - delta_logodds_77_127 = 0.3826\n", + " - delta_logodds_77_127_abs_log_2 = 1.466\n", + " -- Comparative height size at age 10, p-value = 1.2506999999999999e-14, FINEMAP = 0.559778\n", + " -- Comparative height size at age 10, p-value = 1.3000000000000001e-12, FINEMAP = 0.601916\n", + "\n" + ] + } + ], + "source": [ + "#Top candidates from CausalDB with PP >= 0.5 (cutoff immediately after known pathogenic variant rs1799963)\n", + "\n", + "cand_rs_ids = variant_df_gwas_lead_semi_signi.sort_values(by='delta_logodds_77_127_abs_log_2', ascending=False).query(\"delta_logodds_77_127_abs_log_2 >= 1.4\")['rsID'].values.tolist()\n", + "\n", + "print(\"--- SNPs from credible set w. PP >= 0.5 ---\")\n", + "\n", + "for cand_rs_id in cand_rs_ids :\n", + " print((cand_rs_id + \" \" * 15)[:15] + \": \" + str((cand_rs_id in rs_ids_semi_signi)))\n", + " \n", + " if cand_rs_id in rs_ids_semi_signi :\n", + " print(\" - delta_logodds_77_127 = \" + str(round(variant_df_gwas_lead_semi_signi.query(\"rsID == '\" + cand_rs_id + \"'\").iloc[0]['delta_logodds_77_127'], 4)))\n", + " print(\" - delta_logodds_77_127_abs_log_2 = \" + str(round(2**(np.abs(variant_df_gwas_lead_semi_signi.query(\"rsID == '\" + cand_rs_id + \"'\").iloc[0]['delta_logodds_77_127']) / np.log(2.)), 4)))\n", + " \n", + " subset_df = variant_df_gwas_semi_signi.query(\"rsID == '\" + cand_rs_id + \"'\")\n", + " for _, row in subset_df.iterrows() :\n", + " print(\" -- \" + row['Trait'] + \", p-value = \" + str(row['P']) + \", FINEMAP = \" + str(row['FINEMAP']))\n", + "\n", + " print(\"\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--- SNPs from credible set w. 
PP > 0.9 ---\n", + "rs6796 : True\n", + " - delta_logodds_77_127 = -2.2975\n", + " - delta_logodds_77_127_abs_log_2 = 9.9491\n", + " -- Mean platelet (thrombocyte) volume, p-value = 8.899099999999991e-69, FINEMAP = 0.933805\n", + " -- Monocyte count, p-value = 8.468099999999995e-67, FINEMAP = 0.999974\n", + " -- Monocyte percentage, p-value = 1.2099999999999982e-87, FINEMAP = 0.99999\n", + " -- Neutrophill percentage, p-value = 1.7466999999999994e-60, FINEMAP = 0.996227\n", + "\n", + "rs35630683 : False\n", + "rs78378222 : True\n", + " - delta_logodds_77_127 = -2.3624\n", + " - delta_logodds_77_127_abs_log_2 = 10.6162\n", + " -- Mean corpuscular hemoglobin, p-value = 6.44e-09, FINEMAP = 0.997827\n", + " -- Red blood cell (erythrocyte) count, p-value = 7.6177e-25, FINEMAP = 0.921756\n", + " -- Mean corpuscular haemoglobin, p-value = 1.1518e-23, FINEMAP = 1.0\n", + " -- Mean corpuscular volume, p-value = 1.8429000000000003e-23, FINEMAP = 1.0\n", + " -- Impedance measures - Basal metabolic rate, p-value = 1.538e-34, FINEMAP = 1.0\n", + " -- Impedance measures - Whole body water mass, p-value = 1.2809999999999999e-40, FINEMAP = 1.0\n", + " -- Weight, p-value = 1.7840000000000003e-18, FINEMAP = 0.999278\n", + " -- Impedance measures - Weight, p-value = 4.428e-18, FINEMAP = 0.999086\n", + " -- Impedance measures - Leg fat-free mass (right), p-value = 7.078e-33, FINEMAP = 1.0\n", + " -- Impedance measures - Leg predicted mass (right), p-value = 6.511e-33, FINEMAP = 1.0\n", + " -- Impedance measures - Leg fat-free mass (left), p-value = 1.2279999999999999e-29, FINEMAP = 1.0\n", + " -- Impedance measures - Leg predicted mass (left), p-value = 1.62e-29, FINEMAP = 1.0\n", + " -- Impedance measures - Whole body fat-free mass, p-value = 6.189e-40, FINEMAP = 1.0\n", + " -- Impedance measures - Arm fat-free mass (right), p-value = 6.931000000000001e-35, FINEMAP = 1.0\n", + " -- Impedance measures - Arm predicted mass (right), p-value = 1.617e-34, FINEMAP = 1.0\n", + " -- 
Impedance measures - Arm fat-free mass (left), p-value = 1.3809999999999998e-40, FINEMAP = 1.0\n", + " -- Impedance measures - Arm predicted mass (left), p-value = 1.9379999999999997e-35, FINEMAP = 1.0\n", + " -- Impedance measures - Trunk fat-free mass, p-value = 1.18e-42, FINEMAP = 1.0\n", + " -- Impedance measures - Trunk predicted mass, p-value = 6.612000000000001e-43, FINEMAP = 1.0\n", + " -- Mean reticulocyte volume, p-value = 3.1284e-13, FINEMAP = 0.975953\n", + " -- Mean sphered cell volume, p-value = 3.8348e-24, FINEMAP = 1.0\n", + " -- Basal metabolic rate, p-value = 2.99675e-34, FINEMAP = 1.0\n", + " -- Whole body water mass, p-value = 4.67851e-39, FINEMAP = 1.0\n", + " -- Whole body fat-free mass, p-value = 2.4184e-38, FINEMAP = 1.0\n", + " -- Basal metabolic rate, p-value = 2.5649e-43, FINEMAP = 1.0\n", + " -- Whole body water mass, p-value = 2.733599999999998e-51, FINEMAP = 1.0\n", + " -- Weight, p-value = 9.692900000000001e-23, FINEMAP = 0.999992\n", + " -- Weight, p-value = 5.4847e-23, FINEMAP = 0.999998\n", + " -- C43-C44 Melanoma and other malignant neoplasms of skin, p-value = 8.7235e-17, FINEMAP = 1.0\n", + " -- Impedance of arm (right), p-value = 3.3375999999999994e-26, FINEMAP = 0.999524\n", + " -- Impedance of whole body, p-value = 6.6648e-36, FINEMAP = 1.0\n", + " -- Impedance of arm (left), p-value = 6.980800000000001e-27, FINEMAP = 1.0\n", + " -- Impedance of leg (left), p-value = 4.849599999999999e-30, FINEMAP = 1.0\n", + " -- Impedance of leg (right), p-value = 3.5038999999999998e-31, FINEMAP = 1.0\n", + " -- L80-L99 Other disorders of the skin and subcutaneous tissue, p-value = 1.5893e-08, FINEMAP = 0.998027\n", + " -- C44 Other malignant neoplasms of skin, p-value = 1.9368e-15, FINEMAP = 1.0\n", + " -- Trunk predicted mass, p-value = 1.8052000000000002e-52, FINEMAP = 1.0\n", + " -- Trunk fat-free mass, p-value = 1.1203e-52, FINEMAP = 1.0\n", + " -- Arm predicted mass (left), p-value = 1.3085999999999993e-43, FINEMAP = 1.0\n", + " -- 
Arm fat-free mass (left), p-value = 2.0442999999999999e-44, FINEMAP = 1.0\n", + " -- Arm predicted mass (right), p-value = 8.2865e-44, FINEMAP = 1.0\n", + " -- Arm fat-free mass (right), p-value = 4.6606e-44, FINEMAP = 1.0\n", + " -- Leg predicted mass (left), p-value = 2.7929999999999995e-37, FINEMAP = 1.0\n", + " -- Leg fat-free mass (left), p-value = 1.8524999999999998e-37, FINEMAP = 1.0\n", + " -- Leg predicted mass (right), p-value = 1.4567999999999998e-41, FINEMAP = 1.0\n", + " -- Leg fat-free mass (right), p-value = 1.8934e-41, FINEMAP = 1.0\n", + " -- Whole body fat-free mass, p-value = 8.64389999999999e-51, FINEMAP = 1.0\n", + " -- Number of operations, self-reported, p-value = 2.8876e-17, FINEMAP = 1.0\n", + " -- Cancer register - Histology of cancer tumour: Basal cell carcinoma, NOS, p-value = 1.049e-09, FINEMAP = 0.998733\n", + " -- Ever had hysterectomy (womb removed) (female), p-value = 6.419e-09, FINEMAP = 0.99889\n", + " -- Impedance measures - Impedance of whole body, p-value = 8.096e-26, FINEMAP = 1.0\n", + " -- Impedance measures - Impedance of leg (right), p-value = 2.112e-21, FINEMAP = 1.0\n", + " -- Impedance measures - Impedance of leg (left), p-value = 4.0769999999999997e-20, FINEMAP = 1.0\n", + " -- Impedance measures - Impedance of arm (right), p-value = 3.779e-18, FINEMAP = 0.999041\n", + " -- Impedance measures - Impedance of arm (left), p-value = 3.066e-19, FINEMAP = 0.999904\n", + " -- C44 Other and unspecified malignant neoplasm of skin, p-value = 1.1990000000000002e-22, FINEMAP = 1.0\n", + " -- Years since last cervical smear test (female), p-value = 4.092e-10, FINEMAP = 0.978371\n", + " -- Weight, p-value = 5.9555e-18, FINEMAP = 0.999365\n", + " -- Weight, p-value = 4.066819999999999e-18, FINEMAP = 0.999456\n", + " -- Forced expiratory volume in 1-second (FEV1), Best measure, p-value = 6.6753200000000005e-09, FINEMAP = 0.955059\n", + " -- Forced vital capacity (FVC), Best measure, p-value = 5.29275e-12, FINEMAP = 0.996491\n", + " -- 
D25 Leiomyoma of uterus, p-value = 8.048380000000004e-14, FINEMAP = 0.999996\n", + " -- Impedance of arm (right), p-value = 1.52703e-19, FINEMAP = 0.99984\n", + " -- Impedance of whole body, p-value = 2.82892e-25, FINEMAP = 1.0\n", + " -- Impedance of arm (left), p-value = 1.8908999999999998e-20, FINEMAP = 0.999983\n", + " -- Impedance of leg (left), p-value = 1.2351e-19, FINEMAP = 1.0\n", + " -- Impedance of leg (right), p-value = 2.81817e-21, FINEMAP = 1.0\n", + " -- C44 Other malignant neoplasms of skin, p-value = 3.7645499999999996e-24, FINEMAP = 1.0\n", + " -- Trunk predicted mass, p-value = 1.3318199999999998e-39, FINEMAP = 1.0\n", + " -- Trunk fat-free mass, p-value = 8.77218e-40, FINEMAP = 1.0\n", + " -- Arm predicted mass (left), p-value = 1.32153e-32, FINEMAP = 1.0\n", + " -- Arm fat-free mass (left), p-value = 5.3152e-34, FINEMAP = 1.0\n", + " -- Arm predicted mass (right), p-value = 1.60397e-33, FINEMAP = 1.0\n", + " -- Arm fat-free mass (right), p-value = 9.020689999999998e-34, FINEMAP = 1.0\n", + " -- Leg predicted mass (left), p-value = 6.1356399999999995e-31, FINEMAP = 1.0\n", + " -- Leg fat-free mass (left), p-value = 4.4373299999999995e-31, FINEMAP = 1.0\n", + " -- Leg fat-free mass (right), p-value = 5.57084e-34, FINEMAP = 1.0\n", + " -- Leg predicted mass (right), p-value = 4.09828e-34, FINEMAP = 1.0\n", + "\n", + "rs2066865 : False\n", + "rs16833132 : False\n", + "rs8753 : False\n", + "rs35979828 : True\n", + " - delta_logodds_77_127 = -1.0283\n", + " - delta_logodds_77_127_abs_log_2 = 2.7962\n", + " -- Immature reticulocyte fraction, p-value = 2.8606e-09, FINEMAP = 0.988875\n", + " -- Mean sphered cell volume, p-value = 2.6656999999999996e-52, FINEMAP = 1.0\n", + " -- Eosinophill percentage, p-value = 5.7338e-23, FINEMAP = 1.0\n", + " -- Monocyte count, p-value = 7.5808e-21, FINEMAP = 0.99999\n", + " -- Eosinophill count, p-value = 5.1232e-21, FINEMAP = 1.0\n", + "\n", + "rs2732480 : False\n", + "rs12459634 : False\n", + "rs555328608 : 
False\n", + "rs16833132 : False\n", + "\n", + "--- SNPs from credible set ---\n", + "rs6796 : True\n", + "rs35630683 : True\n", + "rs78378222 : True\n", + "rs2066865 : True\n", + "rs16833132 : False\n", + "rs8753 : True\n", + "rs35979828 : True\n", + "rs2732480 : True\n", + "rs12459634 : False\n", + "rs555328608 : False\n", + "rs16833132 : False\n" + ] + } + ], + "source": [ + "#Candidates from GWAS catalog\n", + "\n", + "cand_rs_ids = ['rs6796', 'rs35630683', 'rs78378222', 'rs2066865', 'rs16833132', 'rs8753', 'rs35979828', 'rs2732480', 'rs12459634', 'rs555328608', 'rs16833132']\n", + "\n", + "print(\"--- SNPs from credible set w. PP > 0.9 ---\")\n", + "\n", + "for cand_rs_id in cand_rs_ids :\n", + " print((cand_rs_id + \" \" * 15)[:15] + \": \" + str((cand_rs_id in rs_ids_signi)))\n", + " \n", + " if cand_rs_id in rs_ids_signi :\n", + " print(\" - delta_logodds_77_127 = \" + str(round(variant_df_gwas_lead_signi.query(\"rsID == '\" + cand_rs_id + \"'\").iloc[0]['delta_logodds_77_127'], 4)))\n", + " print(\" - delta_logodds_77_127_abs_log_2 = \" + str(round(2**(np.abs(variant_df_gwas_lead_signi.query(\"rsID == '\" + cand_rs_id + \"'\").iloc[0]['delta_logodds_77_127']) / np.log(2.)), 4)))\n", + " \n", + " subset_df = variant_df_gwas_signi.query(\"rsID == '\" + cand_rs_id + \"'\")\n", + " for _, row in subset_df.iterrows() :\n", + " print(\" -- \" + row['Trait'] + \", p-value = \" + str(row['P']) + \", FINEMAP = \" + str(row['FINEMAP']))\n", + "\n", + " print(\"\")\n", + "\n", + "print(\"\")\n", + "\n", + "print(\"--- SNPs from credible set ---\")\n", + "\n", + "for cand_rs_id in cand_rs_ids :\n", + " print((cand_rs_id + \" \" * 15)[:15] + \": \" + str((cand_rs_id in rs_ids)))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Environment (conda_tensorflow_p36)", + "language": "python", + "name": "conda_tensorflow_p36" + }, + "language_info": 
{ + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/analysis/gnomad_v3/gwas_causaldb_enrichment_analysis.ipynb b/analysis/gnomad_v3/gwas_causaldb_enrichment_analysis.ipynb new file mode 100644 index 0000000..a402d92 --- /dev/null +++ b/analysis/gnomad_v3/gwas_causaldb_enrichment_analysis.ipynb @@ -0,0 +1,466 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from __future__ import print_function\n", + "\n", + "import os\n", + "import pandas as pd\n", + "\n", + "import numpy as np\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import matplotlib.cm as cm\n", + "\n", + "#import aparent.visualization as vis\n", + "\n", + "#from aparent_predictor import *\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "len(polyadb_df) = 175451\n" + ] + } + ], + "source": [ + "#Load PolyaDB data and keep entries with a gene annotation and pas != -1\n", + "\n", + "#genes = ['RUNX1', 'CEBPA', 'GATA2', 'ANKRD26', 'DDX41', 'ETV6', 'PTEN', 'BRCA1', 'BRCA2', 'TP53', 'APC', 'ATM', 'PALB2', 'MSH2', 'MLH1', 'MSH6', 'PMS2', 'MUTYH']\n", + "\n", + "polyadb_df = pd.read_csv('polyadb_processed.csv', sep=',')\n", + "\n", + "#polyadb_df = polyadb_df.loc[polyadb_df['gene'].isin(genes)].reset_index(drop=True).copy()\n", + "polyadb_df = polyadb_df.loc[((~polyadb_df['gene'].isnull()) & (polyadb_df['gene'] != 'na')) & (polyadb_df['pas'] != -1)].reset_index(drop=True).copy()\n", + "\n", + "print('len(polyadb_df) = ' + str(len(polyadb_df)))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "#Process PolyaDB data\n", + "\n", + "polyadb_df_minus = 
polyadb_df.query(\"strand == '-'\").copy().reset_index(drop=True)\n", + "polyadb_df_plus = polyadb_df.query(\"strand == '+'\").copy().reset_index(drop=True)\n", + "\n", + "polyadb_df_minus = polyadb_df_minus.sort_values(by='pas_pos', ascending=False).copy().reset_index(drop=True)\n", + "polyadb_df_plus = polyadb_df_plus.sort_values(by='pas_pos', ascending=True).copy().reset_index(drop=True)\n", + "\n", + "new_gene_id_list_plus = []\n", + "sitenum_list_plus = []\n", + "gene_id_dict = {}\n", + "for _, row in polyadb_df_plus.iterrows() :\n", + "\n", + " gene = row['gene']\n", + "\n", + " if gene not in gene_id_dict :\n", + " gene_id_dict[gene] = 0\n", + "\n", + " gene_id_dict[gene] += 1\n", + "\n", + " new_gene_id_list_plus.append(gene + \".\" + str(gene_id_dict[gene]))\n", + " sitenum_list_plus.append(gene_id_dict[gene])\n", + "\n", + "polyadb_df_plus['gene_id'] = new_gene_id_list_plus\n", + "polyadb_df_plus['sitenum'] = sitenum_list_plus\n", + "\n", + "new_gene_id_list_minus = []\n", + "sitenum_list_minus = []\n", + "gene_id_dict = {}\n", + "for _, row in polyadb_df_minus.iterrows() :\n", + "\n", + " gene = row['gene']\n", + "\n", + " if gene not in gene_id_dict :\n", + " gene_id_dict[gene] = 0\n", + "\n", + " gene_id_dict[gene] += 1\n", + "\n", + " new_gene_id_list_minus.append(gene + \".\" + str(gene_id_dict[gene]))\n", + " sitenum_list_minus.append(gene_id_dict[gene])\n", + "\n", + "polyadb_df_minus['gene_id'] = new_gene_id_list_minus\n", + "polyadb_df_minus['sitenum'] = sitenum_list_minus\n", + "\n", + "polyadb_df = pd.concat([polyadb_df_plus, polyadb_df_minus])\n", + "\n", + "polyadb_df = polyadb_df.sort_values(by=['gene', 'sitenum'], ascending=True).reset_index(drop=True).copy()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "#Get CausalDB metadata\n", + "\n", + "causaldb_meta_df = pd.read_csv(\"causaldb_credible_set/causaldb_meta_info_v1.txt\", sep='\\t')\n" + ] + }, + { + "cell_type": "code", + 
"execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "#Load cached causaldb dataframes (already intersected against predictions)\n", + "\n", + "variant_df_gwas = pd.read_csv(\"aparent_resnet_variant_predictions_polyadb_no_sequences_causaldb_no_cutoff.csv\", sep='\\t')\n", + "variant_df_gwas_noncausal = pd.read_csv(\"aparent_resnet_variant_predictions_polyadb_no_sequences_noncausaldb_no_cutoff.csv\", sep='\\t')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "#Drop duplicates for partially overlapping annotated PASs and add additional fields\n", + "\n", + "variant_df_gwas['rsID'] = \"rs\" + variant_df_gwas['rsID'].astype(int).astype(str)\n", + "\n", + "variant_df_gwas['delta_logodds_77_127_abs'] = np.abs(variant_df_gwas['delta_logodds_77_127'])\n", + "\n", + "variant_df_gwas['rel_var_position'] = -1\n", + "variant_df_gwas.loc[variant_df_gwas['strand'] == '+', 'rel_var_position'] = variant_df_gwas['var_position'] - (variant_df_gwas['pas_pos'] - 70 + 1)\n", + "variant_df_gwas.loc[variant_df_gwas['strand'] == '-', 'rel_var_position'] = ((variant_df_gwas['pas_pos'] - (205 - 70)) + 205) - variant_df_gwas['var_position']\n", + "\n", + "variant_df_gwas['alt_usage'] = 1. / (1. + 1. / ((np.clip(variant_df_gwas['native_usage'], 1e-7, 1. - 1e-7) / (1. - np.clip(variant_df_gwas['native_usage'], 1e-7, 1. - 1e-7))) * np.exp(variant_df_gwas['delta_logodds_0_205'])))\n", + "variant_df_gwas['delta_usage'] = variant_df_gwas['alt_usage'] - variant_df_gwas['native_usage']\n", + "\n", + "variant_df_gwas['alt_usage_77_127'] = 1. / (1. + 1. / ((np.clip(variant_df_gwas['native_usage'], 1e-7, 1. - 1e-7) / (1. - np.clip(variant_df_gwas['native_usage'], 1e-7, 1. 
- 1e-7))) * np.exp(variant_df_gwas['delta_logodds_77_127'])))\n", + "variant_df_gwas['delta_usage_77_127'] = variant_df_gwas['alt_usage_77_127'] - variant_df_gwas['native_usage']\n", + "\n", + "polyadb_df_unique = polyadb_df.drop_duplicates(subset=[\"gene_id\"], keep='first').copy().reset_index(drop=True)\n", + "polyadb_df_unique['padb_join_id'] = polyadb_df_unique['chrom'] + \"_\" + polyadb_df_unique['pas_pos'].astype(str) + \"_\" + polyadb_df_unique['strand']\n", + "variant_df_gwas['padb_join_id'] = variant_df_gwas['chrom'] + \"_\" + variant_df_gwas['pas_pos'].astype(str) + \"_\" + variant_df_gwas['strand']\n", + "\n", + "variant_df_gwas = variant_df_gwas.join(polyadb_df_unique[['padb_join_id', 'pas']].set_index(\"padb_join_id\"), on='padb_join_id', how='inner').copy().reset_index(drop=True)\n", + "\n", + "variant_df_gwas['target_rel_var_position'] = np.abs(90 - variant_df_gwas['rel_var_position'])\n", + "variant_df_gwas['target_delta_logodds_77_127_abs'] = -variant_df_gwas['delta_logodds_77_127_abs']\n", + "variant_df_gwas = variant_df_gwas.sort_values(by=['target_rel_var_position', 'target_delta_logodds_77_127_abs'], ascending=True).drop_duplicates(subset=['causaldb_join_id', 'meta_id'], keep='first').copy().reset_index(drop=True)\n", + "#variant_df_gwas['pas'] = -1. 
* variant_df_gwas['pas']\n", + "#variant_df_gwas = variant_df_gwas.sort_values(by=[\"pas\", 'delta_logodds_77_127_abs'], ascending=False).drop_duplicates(subset=['causaldb_join_id', 'meta_id'], keep='first').copy().reset_index(drop=True)\n", + "\n", + "#Drop duplicates for partially overlapping annotated PASs and add additional fields (non-causal)\n", + "\n", + "variant_df_gwas_noncausal['delta_logodds_77_127_abs'] = np.abs(variant_df_gwas_noncausal['delta_logodds_77_127'])\n", + "\n", + "variant_df_gwas_noncausal['rel_var_position'] = -1\n", + "variant_df_gwas_noncausal.loc[variant_df_gwas_noncausal['strand'] == '+', 'rel_var_position'] = variant_df_gwas_noncausal['var_position'] - (variant_df_gwas_noncausal['pas_pos'] - 70 + 1)\n", + "variant_df_gwas_noncausal.loc[variant_df_gwas_noncausal['strand'] == '-', 'rel_var_position'] = ((variant_df_gwas_noncausal['pas_pos'] - (205 - 70)) + 205) - variant_df_gwas_noncausal['var_position']\n", + "\n", + "variant_df_gwas_noncausal['alt_usage'] = 1. / (1. + 1. / ((np.clip(variant_df_gwas_noncausal['native_usage'], 1e-7, 1. - 1e-7) / (1. - np.clip(variant_df_gwas_noncausal['native_usage'], 1e-7, 1. - 1e-7))) * np.exp(variant_df_gwas_noncausal['delta_logodds_0_205'])))\n", + "variant_df_gwas_noncausal['delta_usage'] = variant_df_gwas_noncausal['alt_usage'] - variant_df_gwas_noncausal['native_usage']\n", + "\n", + "variant_df_gwas_noncausal['alt_usage_77_127'] = 1. / (1. + 1. / ((np.clip(variant_df_gwas_noncausal['native_usage'], 1e-7, 1. - 1e-7) / (1. - np.clip(variant_df_gwas_noncausal['native_usage'], 1e-7, 1. 
- 1e-7))) * np.exp(variant_df_gwas_noncausal['delta_logodds_77_127'])))\n", + "variant_df_gwas_noncausal['delta_usage_77_127'] = variant_df_gwas_noncausal['alt_usage_77_127'] - variant_df_gwas_noncausal['native_usage']\n", + "\n", + "polyadb_df_unique = polyadb_df.drop_duplicates(subset=[\"gene_id\"], keep='first').copy().reset_index(drop=True)\n", + "polyadb_df_unique['padb_join_id'] = polyadb_df_unique['chrom'] + \"_\" + polyadb_df_unique['pas_pos'].astype(str) + \"_\" + polyadb_df_unique['strand']\n", + "variant_df_gwas_noncausal['padb_join_id'] = variant_df_gwas_noncausal['chrom'] + \"_\" + variant_df_gwas_noncausal['pas_pos'].astype(str) + \"_\" + variant_df_gwas_noncausal['strand']\n", + "\n", + "variant_df_gwas_noncausal = variant_df_gwas_noncausal.join(polyadb_df_unique[['padb_join_id', 'pas']].set_index(\"padb_join_id\"), on='padb_join_id', how='inner').copy().reset_index(drop=True)\n", + "\n", + "variant_df_gwas_noncausal['target_rel_var_position'] = np.abs(90 - variant_df_gwas_noncausal['rel_var_position'])\n", + "variant_df_gwas_noncausal['target_delta_logodds_77_127_abs'] = -variant_df_gwas_noncausal['delta_logodds_77_127_abs']\n", + "variant_df_gwas_noncausal = variant_df_gwas_noncausal.sort_values(by=['target_rel_var_position', 'target_delta_logodds_77_127_abs'], ascending=True).drop_duplicates(subset=['causaldb_join_id', 'meta_id'], keep='first').copy().reset_index(drop=True)\n", + "#variant_df_gwas_noncausal['pas'] = -1. 
* variant_df_gwas_noncausal['pas']\n", + "#variant_df_gwas_noncausal = variant_df_gwas_noncausal.sort_values(by=[\"pas\", 'delta_logodds_77_127_abs'], ascending=False).drop_duplicates(subset=['causaldb_join_id', 'meta_id'], keep='first').copy().reset_index(drop=True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "len(variant_df_gwas_lead) = 8072\n", + "len(variant_df_gwas_all_lead) = 9205\n" + ] + } + ], + "source": [ + "#Aggregate summary statistic for unique SNPs (by rsID)\n", + "\n", + "variant_df_gwas_all = pd.concat([variant_df_gwas, variant_df_gwas_noncausal]).drop_duplicates(subset=[\"causaldb_join_id\"], keep='first').copy().reset_index(drop=True)\n", + "\n", + "variant_df_gwas_lead = variant_df_gwas.copy().sort_values(by='FINEMAP', ascending=False).drop_duplicates(subset=[\"causaldb_join_id\"], keep='first').copy().reset_index(drop=True)\n", + "variant_df_gwas_all_lead = variant_df_gwas_all.copy().sort_values(by='FINEMAP', ascending=False).drop_duplicates(subset=[\"causaldb_join_id\"], keep='first').copy().reset_index(drop=True)\n", + "\n", + "print(\"len(variant_df_gwas_lead) = \" + str(len(variant_df_gwas_lead)))\n", + "print(\"len(variant_df_gwas_all_lead) = \" + str(len(variant_df_gwas_all_lead)))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "x_thresh_low = -0.3365\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--- Fisher's exact test ---\n", + "\n", + "x (low) = 0.001\n", + "x (high) = 0.2\n", + "\n", + "n (low) = 1026\n", + "n (high) = 202\n", + "\n", + "[[942. 84.]\n", + " [179. 23.]]\n", + "\n", + "(1.4409417398244213, 0.09330476285498879)\n", + "\n", + "RanksumsResult(statistic=1.7953519202835833, pvalue=0.03629882382397424)\n", + "\n", + "x_thresh_low = -0.4055\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--- Fisher's exact test ---\n", + "\n", + "x (low) = 0.001\n", + "x (high) = 0.2\n", + "\n", + "n (low) = 1026\n", + "n (high) = 202\n", + "\n", + "[[965. 61.]\n", + " [183. 19.]]\n", + "\n", + "(1.6424796201737883, 0.05238394717543503)\n", + "\n", + "RanksumsResult(statistic=1.7953519202835833, pvalue=0.03629882382397424)\n", + "\n", + "x_thresh_low = -0.6931\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--- Fisher's exact test ---\n", + "\n", + "x (low) = 0.001\n", + "x (high) = 0.2\n", + "\n", + "n (low) = 1026\n", + "n (high) = 202\n", + "\n", + "[[996. 30.]\n", + " [190. 12.]]\n", + "\n", + "(2.096842105263158, 0.03188941718410415)\n", + "\n", + "RanksumsResult(statistic=1.7953519202835833, pvalue=0.03629882382397424)\n", + "\n" + ] + } + ], + "source": [ + "#Calculate statistics in intervals of FINEMAP PP (log odds)\n", + "\n", + "from scipy.stats import ranksums, fisher_exact\n", + "\n", + "x = -np.abs(variant_df_gwas_all_lead.query(\"site_type == '3_most_exon' and native_usage > 0. and native_usage < 1.\")['delta_logodds_0_205'].values)\n", + "y = np.array(variant_df_gwas_all_lead.query(\"site_type == '3_most_exon' and native_usage > 0. and native_usage < 1.\")['FINEMAP'].values)\n", + "\n", + "x_thresh_lows = [-np.log(1.4), -np.log(1.5), -np.log(2.0)]\n", + "\n", + "fisher_test_low_y = 0.001\n", + "fisher_test_high_y = 0.2\n", + "\n", + "for x_thresh_low in x_thresh_lows :\n", + "\n", + " print(\"x_thresh_low = \" + str(round(x_thresh_low, 4)))\n", + " \n", + " x_0 = x[(y >= 0.)]\n", + " x_0_01 = x[(y >= 0.01)]\n", + " x_0_05 = x[(y >= 0.05)]\n", + " x_0_2 = x[(y >= 0.2)]\n", + " x_0_5 = x[(y >= 0.5)]\n", + " x_0_9 = x[(y >= 0.9)]\n", + " x_0_95 = x[(y >= 0.95)]\n", + "\n", + " f_0 = np.sum(x_0 < x_thresh_low) / x_0.shape[0]\n", + " f_0_01 = np.sum(x_0_01 < x_thresh_low) / x_0_01.shape[0]\n", + " f_0_05 = np.sum(x_0_05 < x_thresh_low) / x_0_05.shape[0]\n", + " f_0_2 = np.sum(x_0_2 < x_thresh_low) / x_0_2.shape[0]\n", + " f_0_5 = np.sum(x_0_5 < x_thresh_low) / x_0_5.shape[0]\n", + " f_0_9 = np.sum(x_0_9 < x_thresh_low) / x_0_9.shape[0]\n", + " f_0_95 = np.sum(x_0_95 < x_thresh_low) / x_0_95.shape[0]\n", + "\n", + " r_0_01 = f_0_01 / f_0\n", + " r_0_05 = f_0_05 / f_0\n", + " r_0_2 = f_0_2 / 
f_0\n", + " r_0_5 = f_0_5 / f_0\n", + " r_0_9 = f_0_9 / f_0\n", + " r_0_95 = f_0_95 / f_0\n", + "\n", + " fs = np.array([1., r_0_01, r_0_05, r_0_2, r_0_5, r_0_9, r_0_95])\n", + "\n", + " f = plt.figure(figsize=(6, 4))\n", + "\n", + " plt.bar([0], [fs[0]], color='darkgray', linewidth=2, edgecolor='black')\n", + " plt.bar((np.arange(fs.shape[0]-1)+1).tolist(), fs[1:].tolist(), color='deepskyblue', linewidth=2, edgecolor='black')\n", + "\n", + " plt.axhline(y=1.0, linewidth=2, linestyle='--', color='black')\n", + "\n", + " plt.xticks(\n", + " np.arange(fs.shape[0]), [\n", + " \">= 0%\",\n", + " \">= 1%\",\n", + " \">= 5%\",\n", + " \">= 20%\",\n", + " \">= 50%\",\n", + " \">= 90%\",\n", + " \">= 95%\",\n", + " ], fontsize=12, rotation=45\n", + " )\n", + "\n", + " plt.yticks(fontsize=12)\n", + "\n", + " plt.xlabel(\"FINEMAP Posterior Probability\", fontsize=12)\n", + " plt.ylabel(\"Frequency of Disruptive Variants\\n(Relative to FINEMAP >= 0)\", fontsize=12)\n", + "\n", + " plt.ylim(0.3)\n", + "\n", + " plt.tight_layout()\n", + "\n", + " plt.savefig(\"apa_causaldb_disruptive_frequency_logodds_\" + str(round(x_thresh_low, 4)).replace(\".\", \"_\").replace(\"-\", \"\") + \"_v2.eps\")\n", + " plt.savefig(\"apa_causaldb_disruptive_frequency_logodds_\" + str(round(x_thresh_low, 4)).replace(\".\", \"_\").replace(\"-\", \"\") + \"_v2.png\", transparent=True, dpi=300)\n", + "\n", + " plt.show()\n", + " \n", + " x_low = x[(y < fisher_test_low_y)]\n", + " x_high = x[(y >= fisher_test_high_y)]\n", + "\n", + " print(\"--- Fisher's exact test ---\")\n", + " print(\"\")\n", + " print(\"x (low) = \" + str(fisher_test_low_y))\n", + " print(\"x (high) = \" + str(fisher_test_high_y))\n", + " print(\"\")\n", + " print(\"n (low) = \" + str(x_low.shape[0]))\n", + " print(\"n (high) = \" + str(x_high.shape[0]))\n", + " \n", + " t1 = np.zeros((2, 2))\n", + " \n", + " t1[0, 0] = np.sum(x_low >= x_thresh_low)\n", + " t1[0, 1] = np.sum(x_low < x_thresh_low)\n", + " t1[1, 0] = np.sum(x_high >= 
x_thresh_low)\n", + " t1[1, 1] = np.sum(x_high < x_thresh_low)\n", + "\n", + " print(\"\")\n", + " print(t1)\n", + " print(\"\")\n", + "\n", + " print(fisher_exact(t1, alternative='greater'))\n", + " \n", + " print(\"\")\n", + " \n", + " print(ranksums(x_low, x_high, alternative='greater'))\n", + " \n", + " print(\"\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Environment (conda_modisco_p37)", + "language": "python", + "name": "conda_modisco_p37" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/analysis/gnomad_v3/gwas_ukbb_comparison.ipynb b/analysis/gnomad_v3/gwas_ukbb_comparison.ipynb new file mode 100644 index 0000000..6f4522a --- /dev/null +++ b/analysis/gnomad_v3/gwas_ukbb_comparison.ipynb @@ -0,0 +1,769 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using TensorFlow backend.\n" + ] + } + ], + "source": [ + "from __future__ import print_function\n", + "import keras\n", + "from keras.models import Sequential, Model, load_model\n", + "from keras import backend as K\n", + "\n", + "import tensorflow as tf\n", + "\n", + "import os\n", + "import pandas as pd\n", + "\n", + "import numpy as np\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import matplotlib.cm as cm\n", + "\n", + "#import aparent.visualization as vis\n", + "\n", + "#from aparent_predictor import *\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "#Store variant prediction dataframe\n", + "\n", + "variant_df = 
pd.read_csv('aparent_resnet_variant_predictions_polyadb_no_sequences_no_cutoff.csv', sep='\\t')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "len(polyadb_df) = 175451\n" + ] + } + ], + "source": [ + "#Load APADB Data and filter on targeted genes\n", + "\n", + "#genes = ['RUNX1', 'CEBPA', 'GATA2', 'ANKRD26', 'DDX41', 'ETV6', 'PTEN', 'BRCA1', 'BRCA2', 'TP53', 'APC', 'ATM', 'PALB2', 'MSH2', 'MLH1', 'MSH6', 'PMS2', 'MUTYH']\n", + "\n", + "polyadb_df = pd.read_csv('polyadb_processed.csv', sep=',')\n", + "\n", + "#polyadb_df = polyadb_df.loc[polyadb_df['gene'].isin(genes)].reset_index(drop=True).copy()\n", + "polyadb_df = polyadb_df.loc[((~polyadb_df['gene'].isnull()) & (polyadb_df['gene'] != 'na')) & (polyadb_df['pas'] != -1)].reset_index(drop=True).copy()\n", + "\n", + "print('len(polyadb_df) = ' + str(len(polyadb_df)))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "#Process PolyaDB data\n", + "\n", + "polyadb_df_minus = polyadb_df.query(\"strand == '-'\").copy().reset_index(drop=True)\n", + "polyadb_df_plus = polyadb_df.query(\"strand == '+'\").copy().reset_index(drop=True)\n", + "\n", + "polyadb_df_minus = polyadb_df_minus.sort_values(by='pas_pos', ascending=False).copy().reset_index(drop=True)\n", + "polyadb_df_plus = polyadb_df_plus.sort_values(by='pas_pos', ascending=True).copy().reset_index(drop=True)\n", + "\n", + "new_gene_id_list_plus = []\n", + "sitenum_list_plus = []\n", + "gene_id_dict = {}\n", + "for _, row in polyadb_df_plus.iterrows() :\n", + "\n", + " gene = row['gene']\n", + "\n", + " if gene not in gene_id_dict :\n", + " gene_id_dict[gene] = 0\n", + "\n", + " gene_id_dict[gene] += 1\n", + "\n", + " new_gene_id_list_plus.append(gene + \".\" + str(gene_id_dict[gene]))\n", + " sitenum_list_plus.append(gene_id_dict[gene])\n", + "\n", + "polyadb_df_plus['gene_id'] = 
new_gene_id_list_plus\n", + "polyadb_df_plus['sitenum'] = sitenum_list_plus\n", + "\n", + "new_gene_id_list_minus = []\n", + "sitenum_list_minus = []\n", + "gene_id_dict = {}\n", + "for _, row in polyadb_df_minus.iterrows() :\n", + "\n", + " gene = row['gene']\n", + "\n", + " if gene not in gene_id_dict :\n", + " gene_id_dict[gene] = 0\n", + "\n", + " gene_id_dict[gene] += 1\n", + "\n", + " new_gene_id_list_minus.append(gene + \".\" + str(gene_id_dict[gene]))\n", + " sitenum_list_minus.append(gene_id_dict[gene])\n", + "\n", + "polyadb_df_minus['gene_id'] = new_gene_id_list_minus\n", + "polyadb_df_minus['sitenum'] = sitenum_list_minus\n", + "\n", + "polyadb_df = pd.concat([polyadb_df_plus, polyadb_df_minus])\n", + "\n", + "polyadb_df = polyadb_df.sort_values(by=['gene', 'sitenum'], ascending=True).reset_index(drop=True).copy()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "#Get variant positions in hg38 coordinates\n", + "\n", + "polyadb_bed_hg19 = pd.read_csv(\"polyadb_coordinates_hg19.bed\", sep='\\t', header=None, names=['chrom', 'pas_pos_hg19', 'end', 'gene', 'gene_id', 'strand'])\n", + "polyadb_bed_hg38 = pd.read_csv(\"polyadb_coordinates_hg38.bed\", sep='\\t', header=None, names=['chrom', 'pas_pos_hg38', 'end', 'gene', 'gene_id', 'strand'])\n", + "\n", + "polyadb_bed_hg38 = polyadb_bed_hg38.join(polyadb_bed_hg19[['gene_id', 'pas_pos_hg19']].set_index('gene_id'), on='gene_id', how='inner').copy().reset_index(drop=True)\n", + "\n", + "polyadb_bed_hg38['padb_join_id'] = polyadb_bed_hg38['chrom'] + \"_\" + polyadb_bed_hg38['pas_pos_hg19'].astype(str) + \"_\" + polyadb_bed_hg38['gene']\n", + "polyadb_df['padb_join_id'] = polyadb_df['chrom'] + \"_\" + polyadb_df['pas_pos'].astype(str) + \"_\" + polyadb_df['gene']\n", + "\n", + "polyadb_df = polyadb_df.join(polyadb_bed_hg38[['padb_join_id', 'pas_pos_hg38']].set_index(\"padb_join_id\"), on='padb_join_id', how='inner').copy().reset_index(drop=True)\n", + 
"\n", + "#polyadb_df = polyadb_df.query(\"site_type == '3_most_exon'\")\n", + "\n", + "polyadb_df = polyadb_df.drop_duplicates(subset=['gene_id'], keep='first').copy().reset_index(drop=True)\n", + "\n", + "variant_df = variant_df.join(polyadb_df[['gene_id', 'pas_pos', 'pas_pos_hg38']].set_index(\"gene_id\"), on='gene_id', how='inner').copy().reset_index(drop=True)\n", + "\n", + "variant_df['var_position_hg38'] = variant_df['var_position'] - variant_df['pas_pos'] + variant_df['pas_pos_hg38']\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "#Store hg19 interval of PAS sites to intersect UKBB data against\n", + "\n", + "polyadb_df['start'] = polyadb_df['pas_pos'] - 150\n", + "polyadb_df['end'] = polyadb_df['pas_pos'] + 150\n", + "\n", + "polyadb_bed = polyadb_df[['chrom', 'start', 'end', 'gene', 'gene_id', 'strand']].query(\"chrom != 'chrX' and chrom != 'chrY'\").copy().reset_index(drop=True)\n", + "\n", + "polyadb_bed['sort_chrom'] = polyadb_bed['chrom'].apply(lambda x: x[3:]).astype(int)\n", + "polyadb_bed['sort_start'] = polyadb_bed['start'].astype(int)\n", + "\n", + "polyadb_bed = polyadb_bed.sort_values(by=['sort_chrom', 'sort_start'], ascending=True).copy().reset_index(drop=True)\n", + "\n", + "polyadb_bed = polyadb_bed[['chrom', 'start', 'end', 'gene', 'gene_id', 'strand']].copy().reset_index(drop=True)\n", + "\n", + "polyadb_bed.to_csv(\"polyadb_interval_coordinates_hg19.sorted.bed\", sep='\\t', header=False, index=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "#Intersect UKBB data against PolyADB\n", + "\n", + "!bedtools intersect -sorted -a polyadb_interval_coordinates_hg19.sorted.bed -b UKBB_94_traits/UKBB_94traits_release1_regions.bed.gz -wb > UKBB_94_traits/UKBB_94traits_release1_regions_intersect.bed\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": 
"stdout", + "output_type": "stream", + "text": [ + "len(ukbb_df) = 5377879\n" + ] + } + ], + "source": [ + "#Get UKBB fine-mapping data\n", + "\n", + "column_names = [\n", + " 'chromosome',\n", + " 'start',\n", + " 'end',\n", + " 'variant',\n", + " 'rsid',\n", + " 'allele1',\n", + " 'allele2',\n", + " 'minorallele',\n", + " 'cohort',\n", + " 'model_marginal',\n", + " 'method',\n", + " 'trait',\n", + " 'region',\n", + " 'maf',\n", + " 'beta_marginal',\n", + " 'se_marginal',\n", + " 'chisq_marginal',\n", + " 'pip',\n", + " 'cs_id',\n", + " 'beta_posterior',\n", + " 'sd_posterior',\n", + " 'LD_HWE',\n", + " 'LD_SV',\n", + "]\n", + "\n", + "ukbb_df = pd.read_csv(\"UKBB_94_traits/UKBB_94traits_release1.bed.gz\", sep='\\t', compression='gzip', names=column_names)\n", + "\n", + "print(\"len(ukbb_df) = \" + str(len(ukbb_df)))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "len(all_ukbb_df) = 3450965\n" + ] + } + ], + "source": [ + "#Get UKBB non-causal variants\n", + "\n", + "column_names = [\n", + " 'chromosome',\n", + " 'start',\n", + " 'end',\n", + " 'cohort',\n", + " 'trait',\n", + " 'region',\n", + " 'variant',\n", + " 'success_finemap',\n", + " 'success_susie',\n", + "]\n", + "\n", + "all_ukbb_df = pd.read_csv(\"UKBB_94_traits/UKBB_94traits_release1_regions_intersect.bed\", sep='\\t', names=column_names)\n", + "\n", + "#Apply filters\n", + "all_ukbb_df = all_ukbb_df.query(\"success_finemap == True and success_susie == True\").drop_duplicates(subset=['variant', 'trait', 'cohort'], keep='first').copy().reset_index(drop=True)\n", + "\n", + "print(\"len(all_ukbb_df) = \" + str(len(all_ukbb_df)))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "#Extract non-causal variants\n", + "\n", + "all_ukbb_df = all_ukbb_df[['chromosome', 'start', 'end', 'cohort', 'trait', 'region', 'variant']]\n", + "\n", 
+ "all_ukbb_df = all_ukbb_df.join(ukbb_df[['variant', 'pip']].set_index(\"variant\"), on='variant', how='left')\n", + "noncausal_ukbb_df = all_ukbb_df.loc[all_ukbb_df['pip'].isnull()].copy().reset_index(drop=True)\n", + "noncausal_ukbb_df['pip'] = 0.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'\\nvariant_df_gwas_noncausal = variant_df.join(noncausal_ukbb_df[[\"variant\", \\'trait\\', \\'pip\\']].set_index(\"variant\"), on=\\'ukbb_join_id\\', how=\\'inner\\').copy().reset_index(drop=True)\\n\\nvariant_df_gwas_noncausal.to_csv(\"aparent_resnet_variant_predictions_polyadb_no_sequences_ukbb_noncausal_no_cutoff.csv\", sep=\\'\\t\\', index=False)\\n'" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Map variants to predictions\n", + "\n", + "variant_df['ukbb_join_id'] = variant_df['chrom'] + \":\" + variant_df['var_position'].astype(str) + \":\" + variant_df['ref_nucleotide'] + \":\" + variant_df['var_nucleotide']\n", + "\n", + "variant_df_gwas = variant_df.join(ukbb_df[[\"variant\", 'rsid', 'trait', 'maf', 'pip', 'cs_id', 'beta_marginal', 'se_marginal', 'chisq_marginal', 'beta_posterior', 'sd_posterior']].set_index(\"variant\"), on='ukbb_join_id', how='inner').copy().reset_index(drop=True)\n", + "\n", + "variant_df_gwas.to_csv(\"aparent_resnet_variant_predictions_polyadb_no_sequences_ukbb_no_cutoff_w_chisq.csv\", sep='\\t', index=False)\n", + "\n", + "#Map non-causal variants to predictions\n", + "\n", + "variant_df_gwas_noncausal = variant_df.join(noncausal_ukbb_df[[\"variant\", 'trait', 'pip']].set_index(\"variant\"), on='ukbb_join_id', how='inner').copy().reset_index(drop=True)\n", + "\n", + "variant_df_gwas_noncausal.to_csv(\"aparent_resnet_variant_predictions_polyadb_no_sequences_ukbb_noncausal_no_cutoff.csv\", sep='\\t', index=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, 
+ "outputs": [], + "source": [ + "\n", + "variant_df_gwas = pd.read_csv(\"aparent_resnet_variant_predictions_polyadb_no_sequences_ukbb_no_cutoff_w_chisq.csv\", sep='\\t')\n", + "variant_df_gwas_noncausal = pd.read_csv(\"aparent_resnet_variant_predictions_polyadb_no_sequences_ukbb_noncausal_no_cutoff.csv\", sep='\\t')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "#Drop duplicates for partially overlapping annotated PASs\n", + "\n", + "variant_df_gwas['rsID'] = variant_df_gwas['rsid'].astype(str)\n", + "\n", + "variant_df_gwas['delta_logodds_77_127_abs'] = np.abs(variant_df_gwas['delta_logodds_77_127'])\n", + "\n", + "variant_df_gwas['rel_var_position'] = -1\n", + "variant_df_gwas.loc[variant_df_gwas['strand'] == '+', 'rel_var_position'] = variant_df_gwas['var_position'] - (variant_df_gwas['pas_pos'] - 70 + 1)\n", + "variant_df_gwas.loc[variant_df_gwas['strand'] == '-', 'rel_var_position'] = ((variant_df_gwas['pas_pos'] - (205 - 70)) + 205) - variant_df_gwas['var_position']\n", + "\n", + "variant_df_gwas['target_rel_var_position'] = np.abs(90 - variant_df_gwas['rel_var_position'])\n", + "variant_df_gwas['delta_logodds_77_127_abs'] = -variant_df_gwas['delta_logodds_77_127_abs']\n", + "variant_df_gwas = variant_df_gwas.sort_values(by=['target_rel_var_position', 'delta_logodds_77_127_abs'], ascending=True).drop_duplicates(subset=['ukbb_join_id', 'trait'], keep='first').copy().reset_index(drop=True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "len(variant_df_gwas_lead) = 17426\n" + ] + } + ], + "source": [ + "#Aggregate summary statistic for unique SNPs (by rsID)\n", + "\n", + "variant_df_gwas_lead = variant_df_gwas.copy().sort_values(by='pip', ascending=False).drop_duplicates(\"ukbb_join_id\").copy().reset_index(drop=True)\n", + "\n", + "print(\"len(variant_df_gwas_lead) = \" + 
str(len(variant_df_gwas_lead)))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "#Filter on 3' UTR SNPs only\n", + "\n", + "variant_df_gwas = variant_df_gwas.query(\"site_type == '3_most_exon'\").copy().reset_index(drop=True)\n", + "variant_df_gwas_lead = variant_df_gwas_lead.query(\"site_type == '3_most_exon'\").copy().reset_index(drop=True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "n = 9190\n", + "n (>= 1.4-fold) = 1153\n", + "\n", + "n = 41 (PP >= 0.9)\n", + "n (>= 1.4-fold) = 13 (PP >= 0.9)\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "\n", + "f = plt.figure(figsize=(4, 4))\n", + "\n", + "variant_df_gwas_lead['delta_logodds_77_127_abs'] = np.abs(variant_df_gwas_lead['delta_logodds_77_127'])\n", + "\n", + "variant_df_gwas_lead_non_signi = variant_df_gwas_lead.query(\"pip < 0.9\")\n", + "variant_df_gwas_lead_signi = variant_df_gwas_lead.query(\"pip >= 0.9\")\n", + "\n", + "print(\"n = \" + str(len(variant_df_gwas_lead)))\n", + "print(\"n (>= 1.4-fold) = \" + str(len(variant_df_gwas_lead.query(\"delta_logodds_77_127_abs >= 0.336\"))))\n", + "\n", + "print(\"\")\n", + "print(\"n = \" + str(len(variant_df_gwas_lead_signi)) + \" (PP >= 0.9)\")\n", + "print(\"n (>= 1.4-fold) = \" + str(len(variant_df_gwas_lead_signi.query(\"delta_logodds_77_127_abs >= 0.336\"))) + \" (PP >= 0.9)\")\n", + "\n", + "plt.scatter(variant_df_gwas_lead_non_signi['delta_logodds_77_127_abs'] / np.log(2.), variant_df_gwas_lead_non_signi['chisq_marginal'], color='darkgreen', s=15, alpha=0.95)\n", + "\n", + "plt.scatter(variant_df_gwas_lead_signi.query(\"delta_logodds_77_127_abs < 0.336\")['delta_logodds_77_127_abs'] / np.log(2.), variant_df_gwas_lead_signi.query(\"delta_logodds_77_127_abs < 0.336\")['chisq_marginal'], color='red', s=15, alpha=0.95)\n", + "plt.scatter(variant_df_gwas_lead_signi.query(\"delta_logodds_77_127_abs >= 0.336\")['delta_logodds_77_127_abs'] / np.log(2.), variant_df_gwas_lead_signi.query(\"delta_logodds_77_127_abs >= 0.336\")['chisq_marginal'], color='red', s=35, marker=\"^\", alpha=0.95)\n", + "\n", + "plt.axvline(x=np.log2(1.4), linestyle='--', linewidth=2, color='black')\n", + "\n", + "plt.xlim(-0.1, 6.0)\n", + "plt.ylim(-25, 1000)\n", + "\n", + "plt.xticks([0, 1, 2, 3, 4, 5, 6], ['Ref', '2-fold', '4-fold', '8-fold', '16-fold', '32-fold', '64-fold'], fontsize=12, rotation=45)\n", + "plt.yticks(fontsize=12, rotation=45)\n", + "\n", + "plt.savefig(\"ukbb_aparent_resnet_scatter_utr3_only_pp_09_v2.png\", transparent=True, 
dpi=300)\n", + "plt.savefig(\"ukbb_aparent_resnet_scatter_utr3_only_pp_09_v2.eps\")\n", + "\n", + "plt.tight_layout()\n", + "\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "n = 9190\n", + "n (>= 1.4-fold) = 1153\n", + "\n", + "n = 26 (PP >= 0.5)\n", + "n (>= 1.4-fold) = 2 (PP >= 0.5)\n", + "\n", + "n = 41 (PP >= 0.9)\n", + "n (>= 1.4-fold) = 13 (PP >= 0.9)\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "\n", + "f = plt.figure(figsize=(4, 4))\n", + "\n", + "variant_df_gwas_lead['delta_logodds_77_127_abs'] = np.abs(variant_df_gwas_lead['delta_logodds_77_127'])\n", + "\n", + "variant_df_gwas_lead_non_signi = variant_df_gwas_lead.query(\"pip < 0.5\")\n", + "variant_df_gwas_lead_semi_signi = variant_df_gwas_lead.query(\"pip >= 0.5 and pip < 0.9\")\n", + "variant_df_gwas_lead_signi = variant_df_gwas_lead.query(\"pip >= 0.9\")\n", + "\n", + "print(\"n = \" + str(len(variant_df_gwas_lead)))\n", + "print(\"n (>= 1.4-fold) = \" + str(len(variant_df_gwas_lead.query(\"delta_logodds_77_127_abs >= 0.336\"))))\n", + "\n", + "print(\"\")\n", + "print(\"n = \" + str(len(variant_df_gwas_lead_semi_signi)) + \" (PP >= 0.5)\")\n", + "print(\"n (>= 1.4-fold) = \" + str(len(variant_df_gwas_lead_semi_signi.query(\"delta_logodds_77_127_abs >= 0.336\"))) + \" (PP >= 0.5)\")\n", + "\n", + "print(\"\")\n", + "print(\"n = \" + str(len(variant_df_gwas_lead_signi)) + \" (PP >= 0.9)\")\n", + "print(\"n (>= 1.4-fold) = \" + str(len(variant_df_gwas_lead_signi.query(\"delta_logodds_77_127_abs >= 0.336\"))) + \" (PP >= 0.9)\")\n", + "\n", + "plt.scatter(variant_df_gwas_lead_non_signi['delta_logodds_77_127_abs'] / np.log(2.), variant_df_gwas_lead_non_signi['chisq_marginal'], color='darkgreen', s=15, alpha=0.95)\n", + "\n", + "plt.scatter(variant_df_gwas_lead_semi_signi.query(\"delta_logodds_77_127_abs < 0.336\")['delta_logodds_77_127_abs'] / np.log(2.), variant_df_gwas_lead_semi_signi.query(\"delta_logodds_77_127_abs < 0.336\")['chisq_marginal'], color='blue', s=15, alpha=0.95)\n", + "plt.scatter(variant_df_gwas_lead_semi_signi.query(\"delta_logodds_77_127_abs >= 0.336\")['delta_logodds_77_127_abs'] / np.log(2.), variant_df_gwas_lead_semi_signi.query(\"delta_logodds_77_127_abs >= 0.336\")['chisq_marginal'], color='blue', s=35, marker=\"^\", alpha=0.95)\n", + "\n", + 
"plt.scatter(variant_df_gwas_lead_signi.query(\"delta_logodds_77_127_abs < 0.336\")['delta_logodds_77_127_abs'] / np.log(2.), variant_df_gwas_lead_signi.query(\"delta_logodds_77_127_abs < 0.336\")['chisq_marginal'], color='red', s=15, alpha=0.95)\n", + "plt.scatter(variant_df_gwas_lead_signi.query(\"delta_logodds_77_127_abs >= 0.336\")['delta_logodds_77_127_abs'] / np.log(2.), variant_df_gwas_lead_signi.query(\"delta_logodds_77_127_abs >= 0.336\")['chisq_marginal'], color='red', s=35, marker=\"^\", alpha=0.95)\n", + "\n", + "plt.axvline(x=np.log2(1.4), linestyle='--', linewidth=2, color='black')\n", + "\n", + "plt.xlim(-0.1, 6.0)\n", + "plt.ylim(-25, 1000)\n", + "\n", + "plt.xticks([0, 1, 2, 3, 4, 5, 6], ['Ref', '2-fold', '4-fold', '8-fold', '16-fold', '32-fold', '64-fold'], fontsize=12, rotation=45)\n", + "plt.yticks(fontsize=12, rotation=45)\n", + "\n", + "plt.savefig(\"ukbb_aparent_resnet_scatter_utr3_only_pp_05_09_v2.png\", transparent=True, dpi=300)\n", + "plt.savefig(\"ukbb_aparent_resnet_scatter_utr3_only_pp_05_09_v2.eps\")\n", + "\n", + "plt.tight_layout()\n", + "\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "#Load trait metadata\n", + "\n", + "ukbb_meta_df = pd.read_csv(\"UKBB_94_traits/UKBB_94traits_release1.traits\", sep='\\t')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "#Get SNPs with PP >= 0.5 and PP >= 0.9\n", + "\n", + "variant_df_gwas_semi_signi = variant_df_gwas.query(\"pip >= 0.5 and pip < 0.9\").copy()\n", + "variant_df_gwas_lead_semi_signi = variant_df_gwas_lead_semi_signi.query(\"pip >= 0.5 and pip < 0.9\").copy()\n", + "\n", + "variant_df_gwas_semi_signi = variant_df_gwas_semi_signi.join(ukbb_meta_df[['trait', 'description']].set_index(\"trait\"), on='trait', how='inner').copy().reset_index(drop=True)\n", + "variant_df_gwas_lead_semi_signi = 
variant_df_gwas_lead_semi_signi.join(ukbb_meta_df[['trait', 'description']].set_index(\"trait\"), on='trait', how='inner').copy().reset_index(drop=True)\n", + "\n", + "variant_df_gwas_signi = variant_df_gwas.query(\"pip >= 0.9\").copy()\n", + "variant_df_gwas_lead_signi = variant_df_gwas_lead_signi.query(\"pip >= 0.9\").copy()\n", + "\n", + "variant_df_gwas_signi = variant_df_gwas_signi.join(ukbb_meta_df[['trait', 'description']].set_index(\"trait\"), on='trait', how='inner').copy().reset_index(drop=True)\n", + "variant_df_gwas_lead_signi = variant_df_gwas_lead_signi.join(ukbb_meta_df[['trait', 'description']].set_index(\"trait\"), on='trait', how='inner').copy().reset_index(drop=True)\n", + "\n", + "variant_df_gwas_lead_semi_signi['delta_logodds_77_127_abs_log_2'] = 2**(variant_df_gwas_lead_semi_signi['delta_logodds_77_127_abs'] / np.log(2.))\n", + "variant_df_gwas_lead_signi['delta_logodds_77_127_abs_log_2'] = 2**(variant_df_gwas_lead_signi['delta_logodds_77_127_abs'] / np.log(2.))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "#Get variant ids\n", + "\n", + "rs_ids = set(variant_df_gwas['rsID'].unique().tolist())\n", + "\n", + "rs_ids_semi_signi = set(variant_df_gwas_semi_signi['rsID'].unique().tolist())\n", + "rs_ids_lead_semi_signi = set(variant_df_gwas_lead_semi_signi['rsID'].unique().tolist())\n", + "\n", + "rs_ids_signi = set(variant_df_gwas_signi['rsID'].unique().tolist())\n", + "rs_ids_lead_signi = set(variant_df_gwas_lead_signi['rsID'].unique().tolist())\n" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--- SNPs from credible set w. 
PP >= 0.9 ---\n", + "rs3211995 : True\n", + " - delta_logodds_77_127 = 3.0457\n", + " - delta_logodds_77_127_abs_log_2 = 21.0241\n", + " -- Mean arterial pressure, statistic = 37.9865, PP = 0.996894\n", + " -- Systolic blood pressure, statistic = 43.1158, PP = 0.995952\n", + "\n", + "rs78378222 : True\n", + " - delta_logodds_77_127 = -2.3624\n", + " - delta_logodds_77_127_abs_log_2 = 10.6162\n", + " -- Adult height, statistic = 118.362, PP = 1.0\n", + " -- Mean corpuscular hemoglobin, statistic = 100.5, PP = 1.0\n", + " -- Mean corpuscular volume, statistic = 88.3365, PP = 1.0\n", + " -- Red blood cell count, statistic = 97.3022, PP = 1.0\n", + " -- Apolipoprotein A, statistic = 40.8586, PP = 0.926956\n", + " -- Diastolic blood pressure, statistic = 43.0585, PP = 1.0\n", + " -- Loss of Y, statistic = 296.598, PP = 1.0\n", + " -- Body weight, statistic = 77.2636, PP = 1.0\n", + " -- Hematocrit, statistic = 23.3951, PP = 0.902121\n", + " -- Pluse pressure, statistic = 90.6262, PP = 1.0\n", + "\n", + "rs6796 : True\n", + " - delta_logodds_77_127 = -2.2975\n", + " - delta_logodds_77_127_abs_log_2 = 9.9491\n", + " -- Monocyte count, statistic = 273.915, PP = 0.999998\n", + "\n", + "rs76020419 : True\n", + " - delta_logodds_77_127 = 1.4474\n", + " - delta_logodds_77_127_abs_log_2 = 4.252\n", + " -- Diastolic blood pressure, statistic = 29.4173, PP = 0.98914\n", + "\n", + "rs3208787 : True\n", + " - delta_logodds_77_127 = -1.254\n", + " - delta_logodds_77_127_abs_log_2 = 3.5043\n", + " -- Mean corpuscular hemoglobin, statistic = 33.9289, PP = 0.94508\n", + " -- Red blood cell count, statistic = 65.042, PP = 1.0\n", + "\n", + "rs35979828 : True\n", + " - delta_logodds_77_127 = -1.0283\n", + " - delta_logodds_77_127_abs_log_2 = 2.7962\n", + " -- Hemoglobin A1c, statistic = 40.9369, PP = 0.999966\n", + " -- Mean corpuscular hemoglobin, statistic = 76.1084, PP = 1.0\n", + " -- Mean corpuscular volume, statistic = 151.055, PP = 1.0\n", + " -- Red blood cell count, statistic = 
65.6807, PP = 1.0\n", + " -- Eosinophil count, statistic = 67.447, PP = 1.0\n", + " -- Loss of Y, statistic = 87.508, PP = 1.0\n", + " -- Monocyte count, statistic = 67.3078, PP = 1.0\n", + " -- Platelet count, statistic = 181.556, PP = 0.945909\n", + "\n", + "rs884205 : True\n", + " - delta_logodds_77_127 = -0.798\n", + " - delta_logodds_77_127_abs_log_2 = 2.2211\n", + " -- Alkaline phosphatase, statistic = 254.701, PP = 1.0\n", + " -- Estimated heel bone mineral density, statistic = 93.8172, PP = 1.0\n", + "\n", + "rs1815009 : True\n", + " - delta_logodds_77_127 = 0.5175\n", + " - delta_logodds_77_127_abs_log_2 = 1.6778\n", + " -- Insulin-like growth factor 1, statistic = 63.3597, PP = 0.995106\n", + "\n", + "rs11666245 : True\n", + " - delta_logodds_77_127 = 0.5158\n", + " - delta_logodds_77_127_abs_log_2 = 1.6749\n", + " -- C-reactive protein, statistic = 51.5876, PP = 0.977589\n", + "\n", + "rs1055253 : True\n", + " - delta_logodds_77_127 = 0.4956\n", + " - delta_logodds_77_127_abs_log_2 = 1.6414\n", + " -- Red blood cell count, statistic = 46.9531, PP = 0.925224\n", + "\n", + "rs141870697 : True\n", + " - delta_logodds_77_127 = 0.471\n", + " - delta_logodds_77_127_abs_log_2 = 1.6016\n", + " -- FEV1/FVC ratio, statistic = 47.2386, PP = 0.918507\n", + "\n", + "rs7078 : True\n", + " - delta_logodds_77_127 = -0.4102\n", + " - delta_logodds_77_127_abs_log_2 = 1.5071\n", + " -- Estimated heel bone mineral density, statistic = 71.9146, PP = 1.0\n", + "\n", + "rs17352041 : True\n", + " - delta_logodds_77_127 = -0.4011\n", + " - delta_logodds_77_127_abs_log_2 = 1.4934\n", + " -- Adult height, statistic = 35.0065, PP = 0.999975\n", + "\n" + ] + } + ], + "source": [ + "#Top candidates from UKBB fine-mapping with PP >= 0.9 (cutoff immediately after known pathogenic variant rs1799963)\n", + "\n", + "cand_rs_ids = variant_df_gwas_lead_signi.sort_values(by='delta_logodds_77_127_abs_log_2', ascending=False).query(\"delta_logodds_77_127_abs_log_2 >= 1.4\")['rsID'].values.tolist()\n", 
+ "\n", + "print(\"--- SNPs from credible set w. PP >= 0.9 ---\")\n", + "\n", + "for cand_rs_id in cand_rs_ids :\n", + " print((cand_rs_id + \" \" * 15)[:15] + \": \" + str((cand_rs_id in rs_ids_signi)))\n", + " \n", + " if cand_rs_id in rs_ids_signi :\n", + " print(\" - delta_logodds_77_127 = \" + str(round(variant_df_gwas_lead_signi.query(\"rsID == '\" + cand_rs_id + \"'\").iloc[0]['delta_logodds_77_127'], 4)))\n", + " print(\" - delta_logodds_77_127_abs_log_2 = \" + str(round(2**(np.abs(variant_df_gwas_lead_signi.query(\"rsID == '\" + cand_rs_id + \"'\").iloc[0]['delta_logodds_77_127']) / np.log(2.)), 4)))\n", + " \n", + " subset_df = variant_df_gwas_signi.query(\"rsID == '\" + cand_rs_id + \"'\")\n", + " for _, row in subset_df.iterrows() :\n", + " print(\" -- \" + row['description'] + \", statistic = \" + str(row['chisq_marginal']) + \", PP = \" + str(row['pip']))\n", + "\n", + " print(\"\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--- SNPs from credible set w. PP >= 0.5 ---\n", + "rs1064939 : True\n", + " - delta_logodds_77_127 = 0.9162\n", + " - delta_logodds_77_127_abs_log_2 = 2.4999\n", + " -- Body mass index, statistic = 41.7335, PP = 0.894186\n", + "\n", + "rs45463895 : True\n", + " - delta_logodds_77_127 = -0.3893\n", + " - delta_logodds_77_127_abs_log_2 = 1.4759\n", + " -- Monocyte count, statistic = 11.1444, PP = 0.679254\n", + "\n" + ] + } + ], + "source": [ + "#Top candidates from CausalDB with PP >= 0.5 (cutoff immediately after known pathogenic variant rs1799963)\n", + "\n", + "cand_rs_ids = variant_df_gwas_lead_semi_signi.sort_values(by='delta_logodds_77_127_abs_log_2', ascending=False).query(\"delta_logodds_77_127_abs_log_2 >= 1.4\")['rsID'].values.tolist()\n", + "\n", + "print(\"--- SNPs from credible set w.
PP >= 0.5 ---\")\n", + "\n", + "for cand_rs_id in cand_rs_ids :\n", + " print((cand_rs_id + \" \" * 15)[:15] + \": \" + str((cand_rs_id in rs_ids_semi_signi)))\n", + " \n", + " if cand_rs_id in rs_ids_semi_signi :\n", + " print(\" - delta_logodds_77_127 = \" + str(round(variant_df_gwas_lead_semi_signi.query(\"rsID == '\" + cand_rs_id + \"'\").iloc[0]['delta_logodds_77_127'], 4)))\n", + " print(\" - delta_logodds_77_127_abs_log_2 = \" + str(round(2**(np.abs(variant_df_gwas_lead_semi_signi.query(\"rsID == '\" + cand_rs_id + \"'\").iloc[0]['delta_logodds_77_127']) / np.log(2.)), 4)))\n", + " \n", + " subset_df = variant_df_gwas_semi_signi.query(\"rsID == '\" + cand_rs_id + \"'\")\n", + " for _, row in subset_df.iterrows() :\n", + " print(\" -- \" + row['description'] + \", statistic = \" + str(row['chisq_marginal']) + \", PP = \" + str(row['pip']))\n", + "\n", + " print(\"\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Environment (conda_tensorflow_p36)", + "language": "python", + "name": "conda_tensorflow_p36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/analysis/gnomad_v3/gwas_ukbb_enrichment_analysis.ipynb b/analysis/gnomad_v3/gwas_ukbb_enrichment_analysis.ipynb new file mode 100644 index 0000000..e5d10dd --- /dev/null +++ b/analysis/gnomad_v3/gwas_ukbb_enrichment_analysis.ipynb @@ -0,0 +1,455 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from __future__ import print_function\n", + "\n", + "import os\n", + "import pandas as pd\n", + "\n", + "import numpy as np\n", + "\n", + "import 
matplotlib.pyplot as plt\n", + "import matplotlib.cm as cm\n", + "\n", + "#import aparent.visualization as vis\n", + "\n", + "#from aparent_predictor import *\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "len(polyadb_df) = 175451\n" + ] + } + ], + "source": [ + "#Load PolyADB data and drop sites lacking a gene annotation or with pas == -1\n", + "\n", + "#genes = ['RUNX1', 'CEBPA', 'GATA2', 'ANKRD26', 'DDX41', 'ETV6', 'PTEN', 'BRCA1', 'BRCA2', 'TP53', 'APC', 'ATM', 'PALB2', 'MSH2', 'MLH1', 'MSH6', 'PMS2', 'MUTYH']\n", + "\n", + "polyadb_df = pd.read_csv('polyadb_processed.csv', sep=',')\n", + "\n", + "#polyadb_df = polyadb_df.loc[polyadb_df['gene'].isin(genes)].reset_index(drop=True).copy()\n", + "polyadb_df = polyadb_df.loc[((~polyadb_df['gene'].isnull()) & (polyadb_df['gene'] != 'na')) & (polyadb_df['pas'] != -1)].reset_index(drop=True).copy()\n", + "\n", + "print('len(polyadb_df) = ' + str(len(polyadb_df)))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "#Process PolyaDB data\n", + "\n", + "polyadb_df_minus = polyadb_df.query(\"strand == '-'\").copy().reset_index(drop=True)\n", + "polyadb_df_plus = polyadb_df.query(\"strand == '+'\").copy().reset_index(drop=True)\n", + "\n", + "polyadb_df_minus = polyadb_df_minus.sort_values(by='pas_pos', ascending=False).copy().reset_index(drop=True)\n", + "polyadb_df_plus = polyadb_df_plus.sort_values(by='pas_pos', ascending=True).copy().reset_index(drop=True)\n", + "\n", + "new_gene_id_list_plus = []\n", + "sitenum_list_plus = []\n", + "gene_id_dict = {}\n", + "for _, row in polyadb_df_plus.iterrows() :\n", + "\n", + " gene = row['gene']\n", + "\n", + " if gene not in gene_id_dict :\n", + " gene_id_dict[gene] = 0\n", + "\n", + " gene_id_dict[gene] += 1\n", + "\n", + " new_gene_id_list_plus.append(gene + \".\" + str(gene_id_dict[gene]))\n", + "
sitenum_list_plus.append(gene_id_dict[gene])\n", + "\n", + "polyadb_df_plus['gene_id'] = new_gene_id_list_plus\n", + "polyadb_df_plus['sitenum'] = sitenum_list_plus\n", + "\n", + "new_gene_id_list_minus = []\n", + "sitenum_list_minus = []\n", + "gene_id_dict = {}\n", + "for _, row in polyadb_df_minus.iterrows() :\n", + "\n", + " gene = row['gene']\n", + "\n", + " if gene not in gene_id_dict :\n", + " gene_id_dict[gene] = 0\n", + "\n", + " gene_id_dict[gene] += 1\n", + "\n", + " new_gene_id_list_minus.append(gene + \".\" + str(gene_id_dict[gene]))\n", + " sitenum_list_minus.append(gene_id_dict[gene])\n", + "\n", + "polyadb_df_minus['gene_id'] = new_gene_id_list_minus\n", + "polyadb_df_minus['sitenum'] = sitenum_list_minus\n", + "\n", + "polyadb_df = pd.concat([polyadb_df_plus, polyadb_df_minus])\n", + "\n", + "polyadb_df = polyadb_df.sort_values(by=['gene', 'sitenum'], ascending=True).reset_index(drop=True).copy()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "#Load cached UKBB dataframes (already intersected against predictions)\n", + "\n", + "variant_df_gwas = pd.read_csv(\"aparent_resnet_variant_predictions_polyadb_no_sequences_ukbb_no_cutoff.csv\", sep='\\t')\n", + "variant_df_gwas = variant_df_gwas.query(\"maf >= 0.001\").copy().reset_index(drop=True)\n", + "\n", + "variant_df_gwas_noncausal = pd.read_csv(\"aparent_resnet_variant_predictions_polyadb_no_sequences_ukbb_noncausal_no_cutoff.csv\", sep='\\t')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "#Drop duplicates for partially overlapping annotated PASs and add additional fields\n", + "\n", + "variant_df_gwas['delta_logodds_77_127_abs'] = np.abs(variant_df_gwas['delta_logodds_77_127'])\n", + "\n", + "variant_df_gwas['rel_var_position'] = -1\n", + "variant_df_gwas.loc[variant_df_gwas['strand'] == '+', 'rel_var_position'] = variant_df_gwas['var_position'] - 
(variant_df_gwas['pas_pos'] - 70 + 1)\n", + "variant_df_gwas.loc[variant_df_gwas['strand'] == '-', 'rel_var_position'] = ((variant_df_gwas['pas_pos'] - (205 - 70)) + 205) - variant_df_gwas['var_position']\n", + "\n", + "variant_df_gwas['alt_usage'] = 1. / (1. + 1. / ((np.clip(variant_df_gwas['native_usage'], 1e-7, 1. - 1e-7) / (1. - np.clip(variant_df_gwas['native_usage'], 1e-7, 1. - 1e-7))) * np.exp(variant_df_gwas['delta_logodds_0_205'])))\n", + "variant_df_gwas['delta_usage'] = variant_df_gwas['alt_usage'] - variant_df_gwas['native_usage']\n", + "\n", + "variant_df_gwas['alt_usage_77_127'] = 1. / (1. + 1. / ((np.clip(variant_df_gwas['native_usage'], 1e-7, 1. - 1e-7) / (1. - np.clip(variant_df_gwas['native_usage'], 1e-7, 1. - 1e-7))) * np.exp(variant_df_gwas['delta_logodds_77_127'])))\n", + "variant_df_gwas['delta_usage_77_127'] = variant_df_gwas['alt_usage_77_127'] - variant_df_gwas['native_usage']\n", + "\n", + "polyadb_df_unique = polyadb_df.drop_duplicates(subset=[\"gene_id\"], keep='first').copy().reset_index(drop=True)\n", + "polyadb_df_unique['padb_join_id'] = polyadb_df_unique['chrom'] + \"_\" + polyadb_df_unique['pas_pos'].astype(str) + \"_\" + polyadb_df_unique['strand']\n", + "variant_df_gwas['padb_join_id'] = variant_df_gwas['chrom'] + \"_\" + variant_df_gwas['pas_pos'].astype(str) + \"_\" + variant_df_gwas['strand']\n", + "\n", + "variant_df_gwas = variant_df_gwas.join(polyadb_df_unique[['padb_join_id', 'pas']].set_index(\"padb_join_id\"), on='padb_join_id', how='inner').copy().reset_index(drop=True)\n", + "\n", + "variant_df_gwas['target_rel_var_position'] = np.abs(90 - variant_df_gwas['rel_var_position'])\n", + "variant_df_gwas['target_delta_logodds_77_127_abs'] = -variant_df_gwas['delta_logodds_77_127_abs']\n", + "variant_df_gwas = variant_df_gwas.sort_values(by=['target_rel_var_position', 'target_delta_logodds_77_127_abs'], ascending=True).drop_duplicates(subset=['ukbb_join_id', 'trait'], keep='first').copy().reset_index(drop=True)\n", + 
"#variant_df_gwas['pas'] = -1. * variant_df_gwas['pas']\n", + "#variant_df_gwas = variant_df_gwas.sort_values(by=[\"pas\", 'delta_logodds_77_127_abs'], ascending=False).drop_duplicates(subset=['causaldb_join_id', 'meta_id'], keep='first').copy().reset_index(drop=True)\n", + "\n", + "#Drop duplicates for partially overlapping annotated PASs and add additional fields (non-causal)\n", + "\n", + "variant_df_gwas_noncausal['delta_logodds_77_127_abs'] = np.abs(variant_df_gwas_noncausal['delta_logodds_77_127'])\n", + "\n", + "variant_df_gwas_noncausal['rel_var_position'] = -1\n", + "variant_df_gwas_noncausal.loc[variant_df_gwas_noncausal['strand'] == '+', 'rel_var_position'] = variant_df_gwas_noncausal['var_position'] - (variant_df_gwas_noncausal['pas_pos'] - 70 + 1)\n", + "variant_df_gwas_noncausal.loc[variant_df_gwas_noncausal['strand'] == '-', 'rel_var_position'] = ((variant_df_gwas_noncausal['pas_pos'] - (205 - 70)) + 205) - variant_df_gwas_noncausal['var_position']\n", + "\n", + "variant_df_gwas_noncausal['alt_usage'] = 1. / (1. + 1. / ((np.clip(variant_df_gwas_noncausal['native_usage'], 1e-7, 1. - 1e-7) / (1. - np.clip(variant_df_gwas_noncausal['native_usage'], 1e-7, 1. - 1e-7))) * np.exp(variant_df_gwas_noncausal['delta_logodds_0_205'])))\n", + "variant_df_gwas_noncausal['delta_usage'] = variant_df_gwas_noncausal['alt_usage'] - variant_df_gwas_noncausal['native_usage']\n", + "\n", + "variant_df_gwas_noncausal['alt_usage_77_127'] = 1. / (1. + 1. / ((np.clip(variant_df_gwas_noncausal['native_usage'], 1e-7, 1. - 1e-7) / (1. - np.clip(variant_df_gwas_noncausal['native_usage'], 1e-7, 1. 
- 1e-7))) * np.exp(variant_df_gwas_noncausal['delta_logodds_77_127'])))\n", + "variant_df_gwas_noncausal['delta_usage_77_127'] = variant_df_gwas_noncausal['alt_usage_77_127'] - variant_df_gwas_noncausal['native_usage']\n", + "\n", + "polyadb_df_unique = polyadb_df.drop_duplicates(subset=[\"gene_id\"], keep='first').copy().reset_index(drop=True)\n", + "polyadb_df_unique['padb_join_id'] = polyadb_df_unique['chrom'] + \"_\" + polyadb_df_unique['pas_pos'].astype(str) + \"_\" + polyadb_df_unique['strand']\n", + "variant_df_gwas_noncausal['padb_join_id'] = variant_df_gwas_noncausal['chrom'] + \"_\" + variant_df_gwas_noncausal['pas_pos'].astype(str) + \"_\" + variant_df_gwas_noncausal['strand']\n", + "\n", + "variant_df_gwas_noncausal = variant_df_gwas_noncausal.join(polyadb_df_unique[['padb_join_id', 'pas']].set_index(\"padb_join_id\"), on='padb_join_id', how='inner').copy().reset_index(drop=True)\n", + "\n", + "variant_df_gwas_noncausal['target_rel_var_position'] = np.abs(90 - variant_df_gwas_noncausal['rel_var_position'])\n", + "variant_df_gwas_noncausal['target_delta_logodds_77_127_abs'] = -variant_df_gwas_noncausal['delta_logodds_77_127_abs']\n", + "variant_df_gwas_noncausal = variant_df_gwas_noncausal.sort_values(by=['target_rel_var_position', 'target_delta_logodds_77_127_abs'], ascending=True).drop_duplicates(subset=['ukbb_join_id', 'trait'], keep='first').copy().reset_index(drop=True)\n", + "#variant_df_gwas_noncausal['pas'] = -1. 
* variant_df_gwas_noncausal['pas']\n", + "#variant_df_gwas_noncausal = variant_df_gwas_noncausal.sort_values(by=[\"pas\", 'delta_logodds_77_127_abs'], ascending=False).drop_duplicates(subset=['causaldb_join_id', 'meta_id'], keep='first').copy().reset_index(drop=True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "len(variant_df_gwas_lead) = 17324\n", + "len(variant_df_gwas_all_lead) = 90356\n" + ] + } + ], + "source": [ + "#Aggregate summary statistic for unique SNPs (by rsID)\n", + "\n", + "variant_df_gwas_all = pd.concat([variant_df_gwas, variant_df_gwas_noncausal]).drop_duplicates(subset=[\"ukbb_join_id\"], keep='first').copy().reset_index(drop=True)\n", + "\n", + "variant_df_gwas_lead = variant_df_gwas.copy().sort_values(by='pip', ascending=False).drop_duplicates(subset=[\"ukbb_join_id\"], keep='first').copy().reset_index(drop=True)\n", + "variant_df_gwas_all_lead = variant_df_gwas_all.copy().sort_values(by='pip', ascending=False).drop_duplicates(subset=[\"ukbb_join_id\"], keep='first').copy().reset_index(drop=True)\n", + "\n", + "print(\"len(variant_df_gwas_lead) = \" + str(len(variant_df_gwas_lead)))\n", + "print(\"len(variant_df_gwas_all_lead) = \" + str(len(variant_df_gwas_all_lead)))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "x_thresh_low = -0.3365\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--- Fisher's exact test ---\n", + "\n", + "x (low) = 0.001\n", + "x (high) = 0.05\n", + "\n", + "n (low) = 36567\n", + "n (high) = 323\n", + "\n", + "[[34005. 2562.]\n", + " [ 281. 42.]]\n", + "\n", + "(1.983839915990899, 9.913732451733462e-05)\n", + "\n", + "RanksumsResult(statistic=2.5272381034178766, pvalue=0.005748176600230651)\n", + "\n", + "x_thresh_low = -0.4055\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--- Fisher's exact test ---\n", + "\n", + "x (low) = 0.001\n", + "x (high) = 0.05\n", + "\n", + "n (low) = 36567\n", + "n (high) = 323\n", + "\n", + "[[34657. 1910.]\n", + " [ 286. 37.]]\n", + "\n", + "(2.3474334565957604, 8.912377113043503e-06)\n", + "\n", + "RanksumsResult(statistic=2.5272381034178766, pvalue=0.005748176600230651)\n", + "\n", + "x_thresh_low = -0.6931\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--- Fisher's exact test ---\n", + "\n", + "x (low) = 0.001\n", + "x (high) = 0.05\n", + "\n", + "n (low) = 36567\n", + "n (high) = 323\n", + "\n", + "[[3.5733e+04 8.3400e+02]\n", + " [2.9900e+02 2.4000e+01]]\n", + "\n", + "(3.4390895310507448, 7.308479447356434e-07)\n", + "\n", + "RanksumsResult(statistic=2.5272381034178766, pvalue=0.005748176600230651)\n", + "\n" + ] + } + ], + "source": [ + "#Calculate statistics in intervals of PP (log odds)\n", + "\n", + "from scipy.stats import ranksums, fisher_exact\n", + "\n", + "x = -np.abs(variant_df_gwas_all_lead.query(\"site_type == '3_most_exon' and native_usage > 0. and native_usage < 1.\")['delta_logodds_0_205'].values)\n", + "y = np.array(variant_df_gwas_all_lead.query(\"site_type == '3_most_exon' and native_usage > 0. and native_usage < 1.\")['pip'].values)\n", + "\n", + "x_thresh_lows = [-np.log(1.4), -np.log(1.5), -np.log(2.0)]\n", + "\n", + "fisher_test_low_y = 0.001\n", + "fisher_test_high_y = 0.05\n", + "\n", + "for x_thresh_low in x_thresh_lows :\n", + "\n", + " print(\"x_thresh_low = \" + str(round(x_thresh_low, 4)))\n", + " \n", + " x_0 = x[(y >= 0.)]\n", + " x_0_01 = x[(y >= 0.01)]\n", + " x_0_05 = x[(y >= 0.05)]\n", + " x_0_2 = x[(y >= 0.2)]\n", + " x_0_5 = x[(y >= 0.5)]\n", + " x_0_9 = x[(y >= 0.9)]\n", + " x_0_95 = x[(y >= 0.95)]\n", + "\n", + " f_0 = np.sum(x_0 < x_thresh_low) / x_0.shape[0]\n", + " f_0_01 = np.sum(x_0_01 < x_thresh_low) / x_0_01.shape[0]\n", + " f_0_05 = np.sum(x_0_05 < x_thresh_low) / x_0_05.shape[0]\n", + " f_0_2 = np.sum(x_0_2 < x_thresh_low) / x_0_2.shape[0]\n", + " f_0_5 = np.sum(x_0_5 < x_thresh_low) / x_0_5.shape[0]\n", + " f_0_9 = np.sum(x_0_9 < x_thresh_low) / x_0_9.shape[0]\n", + " f_0_95 = np.sum(x_0_95 < x_thresh_low) / x_0_95.shape[0]\n", + "\n", + " r_0_01 = f_0_01 / f_0\n", + " r_0_05 = f_0_05 / f_0\n", + " 
r_0_2 = f_0_2 / f_0\n", + " r_0_5 = f_0_5 / f_0\n", + " r_0_9 = f_0_9 / f_0\n", + " r_0_95 = f_0_95 / f_0\n", + "\n", + " fs = np.array([1., r_0_01, r_0_05, r_0_2, r_0_5, r_0_9, r_0_95])\n", + "\n", + " f = plt.figure(figsize=(6, 4))\n", + "\n", + " plt.bar([0], [fs[0]], color='darkgray', linewidth=2, edgecolor='black')\n", + " plt.bar((np.arange(fs.shape[0]-1)+1).tolist(), fs[1:].tolist(), color='deepskyblue', linewidth=2, edgecolor='black')\n", + "\n", + " plt.axhline(y=1.0, linewidth=2, linestyle='--', color='black')\n", + "\n", + " plt.xticks(\n", + " np.arange(fs.shape[0]), [\n", + " \">= 0%\",\n", + " \">= 1%\",\n", + " \">= 5%\",\n", + " \">= 20%\",\n", + " \">= 50%\",\n", + " \">= 90%\",\n", + " \">= 95%\",\n", + " ], fontsize=12, rotation=45\n", + " )\n", + "\n", + " plt.yticks(fontsize=12)\n", + "\n", + " plt.xlabel(\"Posterior Probability\", fontsize=12)\n", + " plt.ylabel(\"Frequency of Disruptive Variants\\n(Relative to PP >= 0)\", fontsize=12)\n", + "\n", + " plt.ylim(0.3)\n", + "\n", + " plt.tight_layout()\n", + "\n", + " plt.savefig(\"apa_ukbb_disruptive_frequency_logodds_\" + str(round(x_thresh_low, 4)).replace(\".\", \"_\").replace(\"-\", \"\") + \"_v2.eps\")\n", + " plt.savefig(\"apa_ukbb_disruptive_frequency_logodds_\" + str(round(x_thresh_low, 4)).replace(\".\", \"_\").replace(\"-\", \"\") + \"_v2.png\", transparent=True, dpi=300)\n", + "\n", + " plt.show()\n", + " \n", + " x_low = x[(y < fisher_test_low_y)]\n", + " x_high = x[(y >= fisher_test_high_y)]\n", + "\n", + " print(\"--- Fisher's exact test ---\")\n", + " print(\"\")\n", + " print(\"x (low) = \" + str(fisher_test_low_y))\n", + " print(\"x (high) = \" + str(fisher_test_high_y))\n", + " print(\"\")\n", + " print(\"n (low) = \" + str(x_low.shape[0]))\n", + " print(\"n (high) = \" + str(x_high.shape[0]))\n", + " \n", + " t1 = np.zeros((2, 2))\n", + " \n", + " t1[0, 0] = np.sum(x_low >= x_thresh_low)\n", + " t1[0, 1] = np.sum(x_low < x_thresh_low)\n", + " t1[1, 0] = np.sum(x_high >= 
x_thresh_low)\n", + " t1[1, 1] = np.sum(x_high < x_thresh_low)\n", + "\n", + " print(\"\")\n", + " print(t1)\n", + " print(\"\")\n", + "\n", + " print(fisher_exact(t1, alternative='greater'))\n", + " \n", + " print(\"\")\n", + " \n", + " print(ranksums(x_low, x_high, alternative='greater'))\n", + " \n", + " print(\"\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Environment (conda_modisco_p37)", + "language": "python", + "name": "conda_modisco_p37" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/data/oligo_pool_2022/medium_library/analyze_library.ipynb b/data/oligo_pool_2022/medium_library/analyze_library.ipynb new file mode 100644 index 0000000..f6fd86f --- /dev/null +++ b/data/oligo_pool_2022/medium_library/analyze_library.ipynb @@ -0,0 +1,1727 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "\n", + "import scipy\n", + "import scipy.io as spio\n", + "import scipy.sparse as sp\n", + "\n", + "import regex as re\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "#Load cached measurements\n", + "\n", + "hek_df = pd.read_csv(\"apa_100_variants_rev2_20220621_hek293_v3_umi_mut_0.csv\", sep='\\t')\n", + "sknsh_df = pd.read_csv(\"apa_100_variants_rev2_20220621_sknsh_v3_umi_mut_0.csv\", sep='\\t')\n", + "hmc3_df = pd.read_csv(\"apa_100_variants_rev2_20220621_hmc3_v3_umi_mut_0.csv\", sep='\\t')\n" + ] + }, + { + "cell_type": "code", + 
"execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--- HEK293 ---\n", + "Wilcoxon statistic = -2.0682\n", + "Wilcoxon P-value = 0.038625305508977345\n", + "T-test statistic = -2.3702\n", + "T-test P-value = 0.02072423706876088\n", + "n1 / n2 = 34 / 36\n", + "--- SK-N-SH ---\n", + "Wilcoxon statistic = -2.1511\n", + "Wilcoxon P-value = 0.03146851890679132\n", + "T-test statistic = -2.187\n", + "T-test P-value = 0.032091653261210565\n", + "n1 / n2 = 36 / 36\n", + "--- HMC3 ---\n", + "Wilcoxon statistic = -2.016\n", + "Wilcoxon P-value = 0.04379985744145928\n", + "T-test statistic = -2.5153\n", + "T-test P-value = 0.014286754418166209\n", + "n1 / n2 = 32 / 38\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "#Analyze measured ASD variants (controls / cases)\n", + "\n", + "from scipy.stats import ranksums, ttest_ind\n", + "\n", + "import seaborn as sns\n", + "\n", + "save_figs = True\n", + "fig_name = \"asd_measured_lor\"\n", + "\n", + "#HEK293 measurements\n", + "hek_min_c = 5.\n", + "\n", + "hek_min_ref_prox_c = 5.\n", + "hek_min_var_prox_c = 1.\n", + "\n", + "hek_filtered_df = hek_df.query(\"ref_count_total_repl_pooled >= \" + str(hek_min_c) + \" and \" + \"var_count_total_repl_pooled >= \" + str(hek_min_c) + \" and \" + \"ref_count_0_205_repl_pooled >= \" + str(hek_min_ref_prox_c) + \" and \" + \"var_count_0_205_repl_pooled >= \" + str(hek_min_var_prox_c))\n", + "\n", + "hek_x1_true = np.array(hek_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_control'\")['delta_logodds_true_0_205_repl_pooled'].values)\n", + "hek_x2_true = np.array(hek_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_case'\")['delta_logodds_true_0_205_repl_pooled'].values)\n", + "\n", + "hek_x1_pred = np.array(hek_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_control'\")['delta_logodds_0_205'].values)\n", + "hek_x2_pred = np.array(hek_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_case'\")['delta_logodds_0_205'].values)\n", + "\n", + "wilcoxon_s_hek, wilcoxon_p_hek = ranksums(hek_x1_true, hek_x2_true)\n", + "ttest_s_hek, ttest_p_hek = ttest_ind(hek_x1_true, hek_x2_true, equal_var=False)\n", + "\n", + "print(\"--- HEK293 ---\")\n", + "print(\"Wilcoxon statistic = \" + str(round(wilcoxon_s_hek, 4)))\n", + "print(\"Wilcoxon P-value = \" + str(wilcoxon_p_hek))\n", + "print(\"T-test statistic = \" + str(round(ttest_s_hek, 4)))\n", + "print(\"T-test P-value = \" + str(ttest_p_hek))\n", + "print(\"n1 / n2 = \" + str(hek_x1_true.shape[0]) + \" / \" + str(hek_x2_true.shape[0]))\n", + "\n", + "#SK-N-SH measurements\n", + "sknsh_min_c 
= 5.\n", + "\n", + "sknsh_min_ref_prox_c = 5.\n", + "sknsh_min_var_prox_c = 1.\n", + "\n", + "sknsh_filtered_df = sknsh_df.query(\"ref_count_total_repl_pooled >= \" + str(sknsh_min_c) + \" and \" + \"var_count_total_repl_pooled >= \" + str(sknsh_min_c) + \" and \" + \"ref_count_0_205_repl_pooled >= \" + str(sknsh_min_ref_prox_c) + \" and \" + \"var_count_0_205_repl_pooled >= \" + str(sknsh_min_var_prox_c))\n", + "\n", + "sknsh_x1_true = np.array(sknsh_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_control'\")['delta_logodds_true_0_205_repl_pooled'].values)\n", + "sknsh_x2_true = np.array(sknsh_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_case'\")['delta_logodds_true_0_205_repl_pooled'].values)\n", + "\n", + "sknsh_x1_pred = np.array(sknsh_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_control'\")['delta_logodds_0_205'].values)\n", + "sknsh_x2_pred = np.array(sknsh_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_case'\")['delta_logodds_0_205'].values)\n", + "\n", + "wilcoxon_s_sknsh, wilcoxon_p_sknsh = ranksums(sknsh_x1_true, sknsh_x2_true)\n", + "ttest_s_sknsh, ttest_p_sknsh = ttest_ind(sknsh_x1_true, sknsh_x2_true, equal_var=False)\n", + "\n", + "print(\"--- SK-N-SH ---\")\n", + "print(\"Wilcoxon statistic = \" + str(round(wilcoxon_s_sknsh, 4)))\n", + "print(\"Wilcoxon P-value = \" + str(wilcoxon_p_sknsh))\n", + "print(\"T-test statistic = \" + str(round(ttest_s_sknsh, 4)))\n", + "print(\"T-test P-value = \" + str(ttest_p_sknsh))\n", + "print(\"n1 / n2 = \" + str(sknsh_x1_true.shape[0]) + \" / \" + str(sknsh_x2_true.shape[0]))\n", + "\n", + "#HMC3 measurements\n", + "hmc3_min_c = 5.\n", + "\n", + "hmc3_min_ref_prox_c = 5.\n", + "hmc3_min_var_prox_c = 1.\n", + "\n", + "hmc3_filtered_df = hmc3_df.query(\"ref_count_total_repl_pooled >= \" + str(hmc3_min_c) + \" and \" + \"var_count_total_repl_pooled >= \" + str(hmc3_min_c) + \" and \" + \"ref_count_0_205_repl_pooled >= \" + 
str(hmc3_min_ref_prox_c) + \" and \" + \"var_count_0_205_repl_pooled >= \" + str(hmc3_min_var_prox_c))\n", + "\n", + "hmc3_x1_true = np.array(hmc3_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_control'\")['delta_logodds_true_0_205_repl_pooled'].values)\n", + "hmc3_x2_true = np.array(hmc3_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_case'\")['delta_logodds_true_0_205_repl_pooled'].values)\n", + "\n", + "hmc3_x1_pred = np.array(hmc3_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_control'\")['delta_logodds_0_205'].values)\n", + "hmc3_x2_pred = np.array(hmc3_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_case'\")['delta_logodds_0_205'].values)\n", + "\n", + "wilcoxon_s_hmc3, wilcoxon_p_hmc3 = ranksums(hmc3_x1_true, hmc3_x2_true)\n", + "ttest_s_hmc3, ttest_p_hmc3 = ttest_ind(hmc3_x1_true, hmc3_x2_true, equal_var=False)\n", + "\n", + "print(\"--- HMC3 ---\")\n", + "print(\"Wilcoxon statistic = \" + str(round(wilcoxon_s_hmc3, 4)))\n", + "print(\"Wilcoxon P-value = \" + str(wilcoxon_p_hmc3))\n", + "print(\"T-test statistic = \" + str(round(ttest_s_hmc3, 4)))\n", + "print(\"T-test P-value = \" + str(ttest_p_hmc3))\n", + "print(\"n1 / n2 = \" + str(hmc3_x1_true.shape[0]) + \" / \" + str(hmc3_x2_true.shape[0]))\n", + "\n", + "#Visualize measurements (controls / cases)\n", + "\n", + "f = plt.figure(figsize=(6, 4))\n", + "\n", + "sns.swarmplot(data=[\n", + " hek_x1_true,\n", + " hek_x2_true,\n", + " sknsh_x1_true,\n", + " sknsh_x2_true,\n", + " hmc3_x1_true,\n", + " hmc3_x2_true,\n", + "], palette=['green', 'red', 'green', 'red', 'green', 'red'], size=4)\n", + "\n", + "plt.xticks([0, 1, 2, 3, 4, 5], [\"Control\\n(HEK293)\", \"Case\\n(HEK293)\", \"Control\\n(SK-N-SH)\", \"Case\\n(SK-N-SH)\", \"Control\\n(HMC3)\", \"Case\\n(HMC3)\"], rotation=45, fontsize=12)\n", + "plt.yticks(fontsize=12)\n", + "\n", + "plt.ylabel(\"Measured LOR\", fontsize=12)\n", + "\n", + "plt.tight_layout()\n", + 
"\n", + "if save_figs :\n", + " plt.savefig(fig_name + \".png\", dpi=600, transparent=True)\n", + " plt.savefig(fig_name + \".eps\")\n", + "\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--- HEK293 ---\n", + "Wilcoxon statistic = -2.0682\n", + "Wilcoxon P-value = 0.038625305508977345\n", + "T-test statistic = -2.3702\n", + "T-test P-value = 0.02072423706876088\n", + "n1 / n2 = 34 / 36\n", + "--- SK-N-SH ---\n", + "Wilcoxon statistic = -2.2092\n", + "Wilcoxon P-value = 0.027163012509820155\n", + "T-test statistic = -2.2266\n", + "T-test P-value = 0.02931270488852479\n", + "n1 / n2 = 34 / 36\n", + "--- HMC3 ---\n", + "Wilcoxon statistic = -2.1379\n", + "Wilcoxon P-value = 0.03252418126532747\n", + "T-test statistic = -2.6009\n", + "T-test P-value = 0.011530170287279982\n", + "n1 / n2 = 32 / 36\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "#Analyze measured ASD variants (controls / cases), more stringent filtering (except low-count HEK293 data)\n", + "\n", + "from scipy.stats import ranksums, ttest_ind\n", + "\n", + "import seaborn as sns\n", + "\n", + "save_figs = True\n", + "fig_name = \"asd_measured_lor_stringent_filtering\"\n", + "\n", + "#HEK293 measurements\n", + "hek_min_c = 5.\n", + "\n", + "hek_min_ref_prox_c = 5.\n", + "hek_min_var_prox_c = 1.\n", + "\n", + "hek_filtered_df = hek_df.query(\"ref_count_total_repl_pooled >= \" + str(hek_min_c) + \" and \" + \"var_count_total_repl_pooled >= \" + str(hek_min_c) + \" and \" + \"ref_count_0_205_repl_pooled >= \" + str(hek_min_ref_prox_c) + \" and \" + \"var_count_0_205_repl_pooled >= \" + str(hek_min_var_prox_c))\n", + "\n", + "hek_x1_true = np.array(hek_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_control'\")['delta_logodds_true_0_205_repl_pooled'].values)\n", + "hek_x2_true = np.array(hek_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_case'\")['delta_logodds_true_0_205_repl_pooled'].values)\n", + "\n", + "hek_x1_pred = np.array(hek_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_control'\")['delta_logodds_0_205'].values)\n", + "hek_x2_pred = np.array(hek_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_case'\")['delta_logodds_0_205'].values)\n", + "\n", + "wilcoxon_s_hek, wilcoxon_p_hek = ranksums(hek_x1_true, hek_x2_true)\n", + "ttest_s_hek, ttest_p_hek = ttest_ind(hek_x1_true, hek_x2_true, equal_var=False)\n", + "\n", + "print(\"--- HEK293 ---\")\n", + "print(\"Wilcoxon statistic = \" + str(round(wilcoxon_s_hek, 4)))\n", + "print(\"Wilcoxon P-value = \" + str(wilcoxon_p_hek))\n", + "print(\"T-test statistic = \" + str(round(ttest_s_hek, 4)))\n", + "print(\"T-test P-value = \" + str(ttest_p_hek))\n", + "print(\"n1 / n2 = \" + str(hek_x1_true.shape[0]) + \" / \" + 
str(hek_x2_true.shape[0]))\n", + "\n", + "#SK-N-SH measurements\n", + "sknsh_min_c = 10.\n", + "\n", + "sknsh_min_ref_prox_c = 5.\n", + "sknsh_min_var_prox_c = 5.\n", + "\n", + "sknsh_filtered_df = sknsh_df.query(\"ref_count_total_repl_pooled >= \" + str(sknsh_min_c) + \" and \" + \"var_count_total_repl_pooled >= \" + str(sknsh_min_c) + \" and \" + \"ref_count_0_205_repl_pooled >= \" + str(sknsh_min_ref_prox_c) + \" and \" + \"var_count_0_205_repl_pooled >= \" + str(sknsh_min_var_prox_c))\n", + "\n", + "sknsh_x1_true = np.array(sknsh_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_control'\")['delta_logodds_true_0_205_repl_pooled'].values)\n", + "sknsh_x2_true = np.array(sknsh_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_case'\")['delta_logodds_true_0_205_repl_pooled'].values)\n", + "\n", + "sknsh_x1_pred = np.array(sknsh_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_control'\")['delta_logodds_0_205'].values)\n", + "sknsh_x2_pred = np.array(sknsh_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_case'\")['delta_logodds_0_205'].values)\n", + "\n", + "wilcoxon_s_sknsh, wilcoxon_p_sknsh = ranksums(sknsh_x1_true, sknsh_x2_true)\n", + "ttest_s_sknsh, ttest_p_sknsh = ttest_ind(sknsh_x1_true, sknsh_x2_true, equal_var=False)\n", + "\n", + "print(\"--- SK-N-SH ---\")\n", + "print(\"Wilcoxon statistic = \" + str(round(wilcoxon_s_sknsh, 4)))\n", + "print(\"Wilcoxon P-value = \" + str(wilcoxon_p_sknsh))\n", + "print(\"T-test statistic = \" + str(round(ttest_s_sknsh, 4)))\n", + "print(\"T-test P-value = \" + str(ttest_p_sknsh))\n", + "print(\"n1 / n2 = \" + str(sknsh_x1_true.shape[0]) + \" / \" + str(sknsh_x2_true.shape[0]))\n", + "\n", + "#HMC3 measurements\n", + "hmc3_min_c = 10.\n", + "\n", + "hmc3_min_ref_prox_c = 5.\n", + "hmc3_min_var_prox_c = 5.\n", + "\n", + "hmc3_filtered_df = hmc3_df.query(\"ref_count_total_repl_pooled >= \" + str(hmc3_min_c) + \" and \" + 
\"var_count_total_repl_pooled >= \" + str(hmc3_min_c) + \" and \" + \"ref_count_0_205_repl_pooled >= \" + str(hmc3_min_ref_prox_c) + \" and \" + \"var_count_0_205_repl_pooled >= \" + str(hmc3_min_var_prox_c))\n", + "\n", + "hmc3_x1_true = np.array(hmc3_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_control'\")['delta_logodds_true_0_205_repl_pooled'].values)\n", + "hmc3_x2_true = np.array(hmc3_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_case'\")['delta_logodds_true_0_205_repl_pooled'].values)\n", + "\n", + "hmc3_x1_pred = np.array(hmc3_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_control'\")['delta_logodds_0_205'].values)\n", + "hmc3_x2_pred = np.array(hmc3_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_case'\")['delta_logodds_0_205'].values)\n", + "\n", + "wilcoxon_s_hmc3, wilcoxon_p_hmc3 = ranksums(hmc3_x1_true, hmc3_x2_true)\n", + "ttest_s_hmc3, ttest_p_hmc3 = ttest_ind(hmc3_x1_true, hmc3_x2_true, equal_var=False)\n", + "\n", + "print(\"--- HMC3 ---\")\n", + "print(\"Wilcoxon statistic = \" + str(round(wilcoxon_s_hmc3, 4)))\n", + "print(\"Wilcoxon P-value = \" + str(wilcoxon_p_hmc3))\n", + "print(\"T-test statistic = \" + str(round(ttest_s_hmc3, 4)))\n", + "print(\"T-test P-value = \" + str(ttest_p_hmc3))\n", + "print(\"n1 / n2 = \" + str(hmc3_x1_true.shape[0]) + \" / \" + str(hmc3_x2_true.shape[0]))\n", + "\n", + "#Visualize measurements (controls / cases)\n", + "\n", + "f = plt.figure(figsize=(6, 4))\n", + "\n", + "sns.swarmplot(data=[\n", + " hek_x1_true,\n", + " hek_x2_true,\n", + " sknsh_x1_true,\n", + " sknsh_x2_true,\n", + " hmc3_x1_true,\n", + " hmc3_x2_true,\n", + "], palette=['green', 'red', 'green', 'red', 'green', 'red'], size=4)\n", + "\n", + "plt.xticks([0, 1, 2, 3, 4, 5], [\"Control\\n(HEK293)\", \"Case\\n(HEK293)\", \"Control\\n(SK-N-SH)\", \"Case\\n(SK-N-SH)\", \"Control\\n(HMC3)\", \"Case\\n(HMC3)\"], rotation=45, fontsize=12)\n", + 
"plt.yticks(fontsize=12)\n", + "\n", + "plt.ylabel(\"Measured LOR\", fontsize=12)\n", + "\n", + "plt.tight_layout()\n", + "\n", + "if save_figs :\n", + " plt.savefig(fig_name + \".png\", dpi=600, transparent=True)\n", + " plt.savefig(fig_name + \".eps\")\n", + "\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--- HEK293 ---\n", + "Wilcoxon statistic = -2.0682\n", + "Wilcoxon P-value = 0.038625305508977345\n", + "T-test statistic = -2.3702\n", + "T-test P-value = 0.02072423706876088\n", + "n1 / n2 = 34 / 36\n", + "--- SK-N-SH ---\n", + "Wilcoxon statistic = -2.1511\n", + "Wilcoxon P-value = 0.03146851890679132\n", + "T-test statistic = -2.187\n", + "T-test P-value = 0.032091653261210565\n", + "n1 / n2 = 36 / 36\n", + "--- HMC3 ---\n", + "Wilcoxon statistic = -2.016\n", + "Wilcoxon P-value = 0.04379985744145928\n", + "T-test statistic = -2.5153\n", + "T-test P-value = 0.014286754418166209\n", + "n1 / n2 = 32 / 38\n", + "--- Average across cell lines ---\n", + "Wilcoxon statistic = -2.6675\n", + "Wilcoxon P-value = 0.007642882836288566\n", + "T-test statistic = -2.7049\n", + "T-test P-value = 0.00862842500296053\n", + "n1 / n2 = 34 / 36\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "#Analyze measured ASD variants (controls / cases), include average measurements across cell lines\n", + "\n", + "from scipy.stats import ranksums, ttest_ind\n", + "\n", + "import seaborn as sns\n", + "\n", + "save_figs = True\n", + "fig_name = \"asd_measured_lor_w_avg_cell_line\"\n", + "\n", + "#HEK293 measurements\n", + "hek_min_c = 5.\n", + "\n", + "hek_min_ref_prox_c = 5.\n", + "hek_min_var_prox_c = 1.\n", + "\n", + "hek_filtered_df = hek_df.query(\"ref_count_total_repl_pooled >= \" + str(hek_min_c) + \" and \" + \"var_count_total_repl_pooled >= \" + str(hek_min_c) + \" and \" + \"ref_count_0_205_repl_pooled >= \" + str(hek_min_ref_prox_c) + \" and \" + \"var_count_0_205_repl_pooled >= \" + str(hek_min_var_prox_c))\n", + "\n", + "hek_x1_true = np.array(hek_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_control'\")['delta_logodds_true_0_205_repl_pooled'].values)\n", + "hek_x2_true = np.array(hek_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_case'\")['delta_logodds_true_0_205_repl_pooled'].values)\n", + "\n", + "hek_x1_pred = np.array(hek_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_control'\")['delta_logodds_0_205'].values)\n", + "hek_x2_pred = np.array(hek_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_case'\")['delta_logodds_0_205'].values)\n", + "\n", + "wilcoxon_s_hek, wilcoxon_p_hek = ranksums(hek_x1_true, hek_x2_true)\n", + "ttest_s_hek, ttest_p_hek = ttest_ind(hek_x1_true, hek_x2_true, equal_var=False)\n", + "\n", + "print(\"--- HEK293 ---\")\n", + "print(\"Wilcoxon statistic = \" + str(round(wilcoxon_s_hek, 4)))\n", + "print(\"Wilcoxon P-value = \" + str(wilcoxon_p_hek))\n", + "print(\"T-test statistic = \" + str(round(ttest_s_hek, 4)))\n", + "print(\"T-test P-value = \" + str(ttest_p_hek))\n", + "print(\"n1 / n2 = \" + str(hek_x1_true.shape[0]) + \" / \" + 
str(hek_x2_true.shape[0]))\n", + "\n", + "#SK-N-SH measurements\n", + "sknsh_min_c = 5.\n", + "\n", + "sknsh_min_ref_prox_c = 5.\n", + "sknsh_min_var_prox_c = 1.\n", + "\n", + "sknsh_filtered_df = sknsh_df.query(\"ref_count_total_repl_pooled >= \" + str(sknsh_min_c) + \" and \" + \"var_count_total_repl_pooled >= \" + str(sknsh_min_c) + \" and \" + \"ref_count_0_205_repl_pooled >= \" + str(sknsh_min_ref_prox_c) + \" and \" + \"var_count_0_205_repl_pooled >= \" + str(sknsh_min_var_prox_c))\n", + "\n", + "sknsh_x1_true = np.array(sknsh_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_control'\")['delta_logodds_true_0_205_repl_pooled'].values)\n", + "sknsh_x2_true = np.array(sknsh_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_case'\")['delta_logodds_true_0_205_repl_pooled'].values)\n", + "\n", + "sknsh_x1_pred = np.array(sknsh_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_control'\")['delta_logodds_0_205'].values)\n", + "sknsh_x2_pred = np.array(sknsh_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_case'\")['delta_logodds_0_205'].values)\n", + "\n", + "wilcoxon_s_sknsh, wilcoxon_p_sknsh = ranksums(sknsh_x1_true, sknsh_x2_true)\n", + "ttest_s_sknsh, ttest_p_sknsh = ttest_ind(sknsh_x1_true, sknsh_x2_true, equal_var=False)\n", + "\n", + "print(\"--- SK-N-SH ---\")\n", + "print(\"Wilcoxon statistic = \" + str(round(wilcoxon_s_sknsh, 4)))\n", + "print(\"Wilcoxon P-value = \" + str(wilcoxon_p_sknsh))\n", + "print(\"T-test statistic = \" + str(round(ttest_s_sknsh, 4)))\n", + "print(\"T-test P-value = \" + str(ttest_p_sknsh))\n", + "print(\"n1 / n2 = \" + str(sknsh_x1_true.shape[0]) + \" / \" + str(sknsh_x2_true.shape[0]))\n", + "\n", + "#HMC3 measurements\n", + "hmc3_min_c = 5.\n", + "\n", + "hmc3_min_ref_prox_c = 5.\n", + "hmc3_min_var_prox_c = 1.\n", + "\n", + "hmc3_filtered_df = hmc3_df.query(\"ref_count_total_repl_pooled >= \" + str(hmc3_min_c) + \" and \" + 
\"var_count_total_repl_pooled >= \" + str(hmc3_min_c) + \" and \" + \"ref_count_0_205_repl_pooled >= \" + str(hmc3_min_ref_prox_c) + \" and \" + \"var_count_0_205_repl_pooled >= \" + str(hmc3_min_var_prox_c))\n", + "\n", + "hmc3_x1_true = np.array(hmc3_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_control'\")['delta_logodds_true_0_205_repl_pooled'].values)\n", + "hmc3_x2_true = np.array(hmc3_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_case'\")['delta_logodds_true_0_205_repl_pooled'].values)\n", + "\n", + "hmc3_x1_pred = np.array(hmc3_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_control'\")['delta_logodds_0_205'].values)\n", + "hmc3_x2_pred = np.array(hmc3_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_case'\")['delta_logodds_0_205'].values)\n", + "\n", + "wilcoxon_s_hmc3, wilcoxon_p_hmc3 = ranksums(hmc3_x1_true, hmc3_x2_true)\n", + "ttest_s_hmc3, ttest_p_hmc3 = ttest_ind(hmc3_x1_true, hmc3_x2_true, equal_var=False)\n", + "\n", + "print(\"--- HMC3 ---\")\n", + "print(\"Wilcoxon statistic = \" + str(round(wilcoxon_s_hmc3, 4)))\n", + "print(\"Wilcoxon P-value = \" + str(wilcoxon_p_hmc3))\n", + "print(\"T-test statistic = \" + str(round(ttest_s_hmc3, 4)))\n", + "print(\"T-test P-value = \" + str(ttest_p_hmc3))\n", + "print(\"n1 / n2 = \" + str(hmc3_x1_true.shape[0]) + \" / \" + str(hmc3_x2_true.shape[0]))\n", + "\n", + "#Join individual filtered dataframes (outer)\n", + "filtered_df = hek_filtered_df.join(sknsh_filtered_df, how='outer', rsuffix='_sknsh')\n", + "filtered_df = filtered_df.join(hmc3_filtered_df, how='outer', rsuffix='_hmc3').copy()\n", + "\n", + "#Compute average measurements\n", + "def _calc_avg_delta_logodds_true_0_205(row) :\n", + " \n", + " sum_delta_logodds_true_0_205_repl_pooled = 0.\n", + " n_delta_logodds_true_0_205_repl_pooled = 0.\n", + "\n", + " if not np.isnan(row['delta_logodds_true_0_205_repl_pooled']) :\n", + " 
sum_delta_logodds_true_0_205_repl_pooled += row['delta_logodds_true_0_205_repl_pooled']\n", + " n_delta_logodds_true_0_205_repl_pooled += 1.\n", + " \n", + " if not np.isnan(row['delta_logodds_true_0_205_repl_pooled_sknsh']) :\n", + " sum_delta_logodds_true_0_205_repl_pooled += row['delta_logodds_true_0_205_repl_pooled_sknsh']\n", + " n_delta_logodds_true_0_205_repl_pooled += 1.\n", + " \n", + " if not np.isnan(row['delta_logodds_true_0_205_repl_pooled_hmc3']) :\n", + " sum_delta_logodds_true_0_205_repl_pooled += row['delta_logodds_true_0_205_repl_pooled_hmc3']\n", + " n_delta_logodds_true_0_205_repl_pooled += 1.\n", + "\n", + " return sum_delta_logodds_true_0_205_repl_pooled / n_delta_logodds_true_0_205_repl_pooled\n", + "\n", + "filtered_df['mean_delta_logodds_true_0_205_repl_pooled'] = filtered_df.apply(_calc_avg_delta_logodds_true_0_205, axis=1)\n", + "\n", + "#Get mean measurements\n", + "x1_true = np.array(filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_control'\")['mean_delta_logodds_true_0_205_repl_pooled'].values)\n", + "x2_true = np.array(filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_case'\")['mean_delta_logodds_true_0_205_repl_pooled'].values)\n", + "\n", + "wilcoxon_s, wilcoxon_p = ranksums(x1_true, x2_true)\n", + "ttest_s, ttest_p = ttest_ind(x1_true, x2_true, equal_var=False)\n", + "\n", + "print(\"--- Average across cell lines ---\")\n", + "print(\"Wilcoxon statistic = \" + str(round(wilcoxon_s, 4)))\n", + "print(\"Wilcoxon P-value = \" + str(wilcoxon_p))\n", + "print(\"T-test statistic = \" + str(round(ttest_s, 4)))\n", + "print(\"T-test P-value = \" + str(ttest_p))\n", + "print(\"n1 / n2 = \" + str(x1_true.shape[0]) + \" / \" + str(x2_true.shape[0]))\n", + "\n", + "#Visualize measurements (controls / cases)\n", + "\n", + "f = plt.figure(figsize=(7, 4))\n", + "\n", + "sns.swarmplot(data=[\n", + " hek_x1_true,\n", + " hek_x2_true,\n", + " sknsh_x1_true,\n", + " sknsh_x2_true,\n", + " hmc3_x1_true,\n", + " 
hmc3_x2_true,\n", + " x1_true,\n", + " x2_true,\n", + "], palette=['green', 'red', 'green', 'red', 'green', 'red', 'darkgreen', 'darkred'], size=4)\n", + "\n", + "plt.xticks([0, 1, 2, 3, 4, 5, 6, 7], [\"Control\\n(HEK293)\", \"Case\\n(HEK293)\", \"Control\\n(SK-N-SH)\", \"Case\\n(SK-N-SH)\", \"Control\\n(HMC3)\", \"Case\\n(HMC3)\", \"Control\\n(Mean)\", \"Case\\n(Mean)\"], rotation=45, fontsize=12)\n", + "plt.yticks(fontsize=12)\n", + "\n", + "plt.ylabel(\"Measured LOR\", fontsize=12)\n", + "\n", + "plt.tight_layout()\n", + "\n", + "if save_figs :\n", + " plt.savefig(fig_name + \".png\", dpi=600, transparent=True)\n", + " plt.savefig(fig_name + \".eps\")\n", + "\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "#Compare predictions vs. measurements, ASD variants (controls / cases), 0 to 205\n", + "\n", + "from scipy.stats import spearmanr\n", + "\n", + "save_figs = True\n", + "fig_name = \"asd_predicted_vs_measured_lor_0_205\"\n", + "\n", + "#HEK293 measurements\n", + "hek_min_c = 50.\n", + "\n", + "hek_min_ref_prox_c = 5.\n", + "hek_min_var_prox_c = 1.\n", + "\n", + "hek_filtered_df = hek_df.query(\"ref_count_total_repl_pooled >= \" + str(hek_min_c) + \" and \" + \"var_count_total_repl_pooled >= \" + str(hek_min_c) + \" and \" + \"ref_count_0_205_repl_pooled >= \" + str(hek_min_ref_prox_c) + \" and \" + \"var_count_0_205_repl_pooled >= \" + str(hek_min_var_prox_c))\n", + "\n", + "hek_x1_true = np.array(hek_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_control'\")['delta_logodds_true_0_205_repl_pooled'].values)\n", + "hek_x2_true = np.array(hek_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_case'\")['delta_logodds_true_0_205_repl_pooled'].values)\n", + "\n", + "hek_x1_pred = np.array(hek_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_control'\")['delta_logodds_0_205'].values)\n", + "hek_x2_pred = np.array(hek_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_case'\")['delta_logodds_0_205'].values)\n", + "\n", + "#SK-N-SH measurements\n", + "sknsh_min_c = 50.\n", + "\n", + "sknsh_min_ref_prox_c = 5.\n", + "sknsh_min_var_prox_c = 1.\n", + "\n", + "sknsh_filtered_df = sknsh_df.query(\"ref_count_total_repl_pooled >= \" + str(sknsh_min_c) + \" and \" + \"var_count_total_repl_pooled >= \" + str(sknsh_min_c) + \" and \" + \"ref_count_0_205_repl_pooled >= \" + str(sknsh_min_ref_prox_c) + \" and \" + \"var_count_0_205_repl_pooled >= \" + str(sknsh_min_var_prox_c))\n", + "\n", + "sknsh_x1_true = np.array(sknsh_filtered_df.query(\"data_source == 'ASD_1' and experiment == 
'variant_control'\")['delta_logodds_true_0_205_repl_pooled'].values)\n", + "sknsh_x2_true = np.array(sknsh_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_case'\")['delta_logodds_true_0_205_repl_pooled'].values)\n", + "\n", + "sknsh_x1_pred = np.array(sknsh_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_control'\")['delta_logodds_0_205'].values)\n", + "sknsh_x2_pred = np.array(sknsh_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_case'\")['delta_logodds_0_205'].values)\n", + "\n", + "#HMC3 measurements\n", + "hmc3_min_c = 50.\n", + "\n", + "hmc3_min_ref_prox_c = 5.\n", + "hmc3_min_var_prox_c = 1.\n", + "\n", + "hmc3_filtered_df = hmc3_df.query(\"ref_count_total_repl_pooled >= \" + str(hmc3_min_c) + \" and \" + \"var_count_total_repl_pooled >= \" + str(hmc3_min_c) + \" and \" + \"ref_count_0_205_repl_pooled >= \" + str(hmc3_min_ref_prox_c) + \" and \" + \"var_count_0_205_repl_pooled >= \" + str(hmc3_min_var_prox_c))\n", + "\n", + "hmc3_x1_true = np.array(hmc3_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_control'\")['delta_logodds_true_0_205_repl_pooled'].values)\n", + "hmc3_x2_true = np.array(hmc3_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_case'\")['delta_logodds_true_0_205_repl_pooled'].values)\n", + "\n", + "hmc3_x1_pred = np.array(hmc3_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_control'\")['delta_logodds_0_205'].values)\n", + "hmc3_x2_pred = np.array(hmc3_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_case'\")['delta_logodds_0_205'].values)\n", + "\n", + "r_val_hek, _ = spearmanr(np.concatenate([hek_x1_pred, hek_x2_pred], axis=0), np.concatenate([hek_x1_true, hek_x2_true], axis=0))\n", + "r_val_sknsh, _ = spearmanr(np.concatenate([sknsh_x1_pred, sknsh_x2_pred], axis=0), np.concatenate([sknsh_x1_true, sknsh_x2_true], axis=0))\n", + "r_val_hmc3, _ = spearmanr(np.concatenate([hmc3_x1_pred, 
hmc3_x2_pred], axis=0), np.concatenate([hmc3_x1_true, hmc3_x2_true], axis=0))\n", + "\n", + "f, ax = plt.subplots(1, 3, figsize=(12, 4))\n", + "\n", + "#Plot HEK293 measurements\n", + "plt.sca(ax[0])\n", + "\n", + "plt.scatter(hek_x1_pred, hek_x1_true, color='lightgreen', s=45, edgecolor='black', linewidth=1)\n", + "plt.scatter(hek_x2_pred, hek_x2_true, color='lightcoral', s=45, edgecolor='black', linewidth=1)\n", + "\n", + "#plt.xlim(x_min, x_max)\n", + "#plt.ylim(x_min, x_max)\n", + "\n", + "plt.xticks(fontsize=12)\n", + "plt.yticks(fontsize=12)\n", + "\n", + "plt.xlabel(\"Predicted LOR\", fontsize=12)\n", + "plt.ylabel(\"Measured LOR\", fontsize=12)\n", + "\n", + "plt.title(\"HEK293, r = \" + str(round(r_val_hek, 3)) + \", n = \" + str(np.concatenate([hek_x1_pred, hek_x2_pred], axis=0).shape[0]), fontsize=12)\n", + "\n", + "#Plot SK-N-SH measurements\n", + "plt.sca(ax[1])\n", + "\n", + "plt.scatter(sknsh_x1_pred, sknsh_x1_true, color='lightgreen', s=45, edgecolor='black', linewidth=1)\n", + "plt.scatter(sknsh_x2_pred, sknsh_x2_true, color='lightcoral', s=45, edgecolor='black', linewidth=1)\n", + "\n", + "#plt.xlim(x_min, x_max)\n", + "#plt.ylim(x_min, x_max)\n", + "\n", + "plt.xticks(fontsize=12)\n", + "plt.yticks(fontsize=12)\n", + "\n", + "plt.xlabel(\"Predicted LOR\", fontsize=12)\n", + "plt.ylabel(\"Measured LOR\", fontsize=12)\n", + "\n", + "plt.title(\"SK-N-SH, r = \" + str(round(r_val_sknsh, 3)) + \", n = \" + str(np.concatenate([sknsh_x1_pred, sknsh_x2_pred], axis=0).shape[0]), fontsize=12)\n", + "\n", + "#Plot HMC3 measurements\n", + "plt.sca(ax[2])\n", + "\n", + "plt.scatter(hmc3_x1_pred, hmc3_x1_true, color='lightgreen', s=45, edgecolor='black', linewidth=1)\n", + "plt.scatter(hmc3_x2_pred, hmc3_x2_true, color='lightcoral', s=45, edgecolor='black', linewidth=1)\n", + "\n", + "#plt.xlim(x_min, x_max)\n", + "#plt.ylim(x_min, x_max)\n", + "\n", + "plt.xticks(fontsize=12)\n", + "plt.yticks(fontsize=12)\n", + "\n", + "plt.xlabel(\"Predicted LOR\", 
fontsize=12)\n", + "plt.ylabel(\"Measured LOR\", fontsize=12)\n", + "\n", + "plt.title(\"HMC3, r = \" + str(round(r_val_hmc3, 3)) + \", n = \" + str(np.concatenate([hmc3_x1_pred, hmc3_x2_pred], axis=0).shape[0]), fontsize=12)\n", + "\n", + "plt.tight_layout()\n", + "\n", + "if save_figs :\n", + " plt.savefig(fig_name + \".png\", dpi=600, transparent=True)\n", + " plt.savefig(fig_name + \".eps\")\n", + "\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "#Compare predictions vs. measurements, ASD variants (controls / cases), 77 to 127\n", + "\n", + "from scipy.stats import spearmanr\n", + "\n", + "save_figs = True\n", + "fig_name = \"asd_predicted_vs_measured_lor_77_127\"\n", + "\n", + "#HEK293 measurements\n", + "hek_min_c = 50.\n", + "\n", + "hek_min_ref_prox_c = 5.\n", + "hek_min_var_prox_c = 1.\n", + "\n", + "hek_filtered_df = hek_df.query(\"ref_count_total_repl_pooled >= \" + str(hek_min_c) + \" and \" + \"var_count_total_repl_pooled >= \" + str(hek_min_c) + \" and \" + \"ref_count_0_205_repl_pooled >= \" + str(hek_min_ref_prox_c) + \" and \" + \"var_count_0_205_repl_pooled >= \" + str(hek_min_var_prox_c))\n", + "\n", + "hek_x1_true = np.array(hek_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_control'\")['delta_logodds_true_77_127_repl_pooled'].values)\n", + "hek_x2_true = np.array(hek_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_case'\")['delta_logodds_true_77_127_repl_pooled'].values)\n", + "\n", + "hek_x1_pred = np.array(hek_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_control'\")['delta_logodds_77_127'].values)\n", + "hek_x2_pred = np.array(hek_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_case'\")['delta_logodds_77_127'].values)\n", + "\n", + "#SK-N-SH measurements\n", + "sknsh_min_c = 50.\n", + "\n", + "sknsh_min_ref_prox_c = 5.\n", + "sknsh_min_var_prox_c = 1.\n", + "\n", + "sknsh_filtered_df = sknsh_df.query(\"ref_count_total_repl_pooled >= \" + str(sknsh_min_c) + \" and \" + \"var_count_total_repl_pooled >= \" + str(sknsh_min_c) + \" and \" + \"ref_count_0_205_repl_pooled >= \" + str(sknsh_min_ref_prox_c) + \" and \" + \"var_count_0_205_repl_pooled >= \" + str(sknsh_min_var_prox_c))\n", + "\n", + "sknsh_x1_true = np.array(sknsh_filtered_df.query(\"data_source == 'ASD_1' and experiment == 
'variant_control'\")['delta_logodds_true_77_127_repl_pooled'].values)\n", + "sknsh_x2_true = np.array(sknsh_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_case'\")['delta_logodds_true_77_127_repl_pooled'].values)\n", + "\n", + "sknsh_x1_pred = np.array(sknsh_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_control'\")['delta_logodds_77_127'].values)\n", + "sknsh_x2_pred = np.array(sknsh_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_case'\")['delta_logodds_77_127'].values)\n", + "\n", + "#HMC3 measurements\n", + "hmc3_min_c = 50.\n", + "\n", + "hmc3_min_ref_prox_c = 5.\n", + "hmc3_min_var_prox_c = 1.\n", + "\n", + "hmc3_filtered_df = hmc3_df.query(\"ref_count_total_repl_pooled >= \" + str(hmc3_min_c) + \" and \" + \"var_count_total_repl_pooled >= \" + str(hmc3_min_c) + \" and \" + \"ref_count_0_205_repl_pooled >= \" + str(hmc3_min_ref_prox_c) + \" and \" + \"var_count_0_205_repl_pooled >= \" + str(hmc3_min_var_prox_c))\n", + "\n", + "hmc3_x1_true = np.array(hmc3_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_control'\")['delta_logodds_true_77_127_repl_pooled'].values)\n", + "hmc3_x2_true = np.array(hmc3_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_case'\")['delta_logodds_true_77_127_repl_pooled'].values)\n", + "\n", + "hmc3_x1_pred = np.array(hmc3_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_control'\")['delta_logodds_77_127'].values)\n", + "hmc3_x2_pred = np.array(hmc3_filtered_df.query(\"data_source == 'ASD_1' and experiment == 'variant_case'\")['delta_logodds_77_127'].values)\n", + "\n", + "r_val_hek, _ = spearmanr(np.concatenate([hek_x1_pred, hek_x2_pred], axis=0), np.concatenate([hek_x1_true, hek_x2_true], axis=0))\n", + "r_val_sknsh, _ = spearmanr(np.concatenate([sknsh_x1_pred, sknsh_x2_pred], axis=0), np.concatenate([sknsh_x1_true, sknsh_x2_true], axis=0))\n", + "r_val_hmc3, _ = spearmanr(np.concatenate([hmc3_x1_pred, 
hmc3_x2_pred], axis=0), np.concatenate([hmc3_x1_true, hmc3_x2_true], axis=0))\n", + "\n", + "f, ax = plt.subplots(1, 3, figsize=(12, 4))\n", + "\n", + "#Plot HEK293 measurements\n", + "plt.sca(ax[0])\n", + "\n", + "plt.scatter(hek_x1_pred, hek_x1_true, color='lightgreen', s=45, edgecolor='black', linewidth=1)\n", + "plt.scatter(hek_x2_pred, hek_x2_true, color='lightcoral', s=45, edgecolor='black', linewidth=1)\n", + "\n", + "#plt.xlim(x_min, x_max)\n", + "#plt.ylim(x_min, x_max)\n", + "\n", + "plt.xticks(fontsize=12)\n", + "plt.yticks(fontsize=12)\n", + "\n", + "plt.xlabel(\"Predicted LOR\", fontsize=12)\n", + "plt.ylabel(\"Measured LOR\", fontsize=12)\n", + "\n", + "plt.title(\"HEK293, r = \" + str(round(r_val_hek, 3)) + \", n = \" + str(np.concatenate([hek_x1_pred, hek_x2_pred], axis=0).shape[0]), fontsize=12)\n", + "\n", + "#Plot SK-N-SH measurements\n", + "plt.sca(ax[1])\n", + "\n", + "plt.scatter(sknsh_x1_pred, sknsh_x1_true, color='lightgreen', s=45, edgecolor='black', linewidth=1)\n", + "plt.scatter(sknsh_x2_pred, sknsh_x2_true, color='lightcoral', s=45, edgecolor='black', linewidth=1)\n", + "\n", + "#plt.xlim(x_min, x_max)\n", + "#plt.ylim(x_min, x_max)\n", + "\n", + "plt.xticks(fontsize=12)\n", + "plt.yticks(fontsize=12)\n", + "\n", + "plt.xlabel(\"Predicted LOR\", fontsize=12)\n", + "plt.ylabel(\"Measured LOR\", fontsize=12)\n", + "\n", + "plt.title(\"SK-N-SH, r = \" + str(round(r_val_sknsh, 3)) + \", n = \" + str(np.concatenate([sknsh_x1_pred, sknsh_x2_pred], axis=0).shape[0]), fontsize=12)\n", + "\n", + "#Plot HMC3 measurements\n", + "plt.sca(ax[2])\n", + "\n", + "plt.scatter(hmc3_x1_pred, hmc3_x1_true, color='lightgreen', s=45, edgecolor='black', linewidth=1)\n", + "plt.scatter(hmc3_x2_pred, hmc3_x2_true, color='lightcoral', s=45, edgecolor='black', linewidth=1)\n", + "\n", + "#plt.xlim(x_min, x_max)\n", + "#plt.ylim(x_min, x_max)\n", + "\n", + "plt.xticks(fontsize=12)\n", + "plt.yticks(fontsize=12)\n", + "\n", + "plt.xlabel(\"Predicted LOR\", 
fontsize=12)\n", + "plt.ylabel(\"Measured LOR\", fontsize=12)\n", + "\n", + "plt.title(\"HMC3, r = \" + str(round(r_val_hmc3, 3)) + \", n = \" + str(np.concatenate([hmc3_x1_pred, hmc3_x2_pred], axis=0).shape[0]), fontsize=12)\n", + "\n", + "plt.tight_layout()\n", + "\n", + "if save_figs :\n", + " plt.savefig(fig_name + \".png\", dpi=600, transparent=True)\n", + " plt.savefig(fig_name + \".eps\")\n", + "\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "#Compare predictions vs. measurements, all variants, 0 to 205\n", + "\n", + "from scipy.stats import spearmanr\n", + "\n", + "save_figs = True\n", + "fig_name = \"all_vars_predicted_vs_measured_lor_0_205\"\n", + "\n", + "#HEK293 measurements\n", + "hek_min_c = 50.\n", + "\n", + "hek_min_ref_prox_c = 5.\n", + "hek_min_var_prox_c = 1.\n", + "\n", + "hek_filtered_df = hek_df.query(\"ref_count_total_repl_pooled >= \" + str(hek_min_c) + \" and \" + \"var_count_total_repl_pooled >= \" + str(hek_min_c) + \" and \" + \"ref_count_0_205_repl_pooled >= \" + str(hek_min_ref_prox_c) + \" and \" + \"var_count_0_205_repl_pooled >= \" + str(hek_min_var_prox_c))\n", + "\n", + "hek_true = np.array(hek_filtered_df['delta_logodds_true_0_205_repl_pooled'].values)\n", + "hek_pred = np.array(hek_filtered_df['delta_logodds_0_205'].values)\n", + "\n", + "#SK-N-SH measurements\n", + "sknsh_min_c = 50.\n", + "\n", + "sknsh_min_ref_prox_c = 5.\n", + "sknsh_min_var_prox_c = 1.\n", + "\n", + "sknsh_filtered_df = sknsh_df.query(\"ref_count_total_repl_pooled >= \" + str(sknsh_min_c) + \" and \" + \"var_count_total_repl_pooled >= \" + str(sknsh_min_c) + \" and \" + \"ref_count_0_205_repl_pooled >= \" + str(sknsh_min_ref_prox_c) + \" and \" + \"var_count_0_205_repl_pooled >= \" + str(sknsh_min_var_prox_c))\n", + "\n", + "sknsh_true = np.array(sknsh_filtered_df['delta_logodds_true_0_205_repl_pooled'].values)\n", + "sknsh_pred = np.array(sknsh_filtered_df['delta_logodds_0_205'].values)\n", + "\n", + "#HMC3 measurements\n", + "hmc3_min_c = 50.\n", + "\n", + "hmc3_min_ref_prox_c = 5.\n", + "hmc3_min_var_prox_c = 1.\n", + "\n", + "hmc3_filtered_df = hmc3_df.query(\"ref_count_total_repl_pooled >= \" + str(hmc3_min_c) + \" and \" + \"var_count_total_repl_pooled >= \" + str(hmc3_min_c) + \" and \" + \"ref_count_0_205_repl_pooled >= \" + str(hmc3_min_ref_prox_c) + \" and \" + \"var_count_0_205_repl_pooled >= \" + 
str(hmc3_min_var_prox_c))\n", + "\n", + "hmc3_true = np.array(hmc3_filtered_df['delta_logodds_true_0_205_repl_pooled'].values)\n", + "hmc3_pred = np.array(hmc3_filtered_df['delta_logodds_0_205'].values)\n", + "\n", + "r_val_hek, _ = spearmanr(hek_pred, hek_true)\n", + "r_val_sknsh, _ = spearmanr(sknsh_pred, sknsh_true)\n", + "r_val_hmc3, _ = spearmanr(hmc3_pred, hmc3_true)\n", + "\n", + "f, ax = plt.subplots(1, 3, figsize=(12, 4))\n", + "\n", + "#Plot HEK293 measurements\n", + "plt.sca(ax[0])\n", + "\n", + "plt.scatter(hek_pred, hek_true, color='gray', s=45, edgecolor='black', linewidth=1)\n", + "\n", + "#plt.xlim(x_min, x_max)\n", + "#plt.ylim(x_min, x_max)\n", + "\n", + "plt.xticks(fontsize=12)\n", + "plt.yticks(fontsize=12)\n", + "\n", + "plt.xlabel(\"Predicted LOR\", fontsize=12)\n", + "plt.ylabel(\"Measured LOR\", fontsize=12)\n", + "\n", + "plt.title(\"HEK293, r = \" + str(round(r_val_hek, 3)) + \", n = \" + str(hek_pred.shape[0]), fontsize=12)\n", + "\n", + "#Plot SK-N-SH measurements\n", + "plt.sca(ax[1])\n", + "\n", + "plt.scatter(sknsh_pred, sknsh_true, color='gray', s=45, edgecolor='black', linewidth=1)\n", + "\n", + "#plt.xlim(x_min, x_max)\n", + "#plt.ylim(x_min, x_max)\n", + "\n", + "plt.xticks(fontsize=12)\n", + "plt.yticks(fontsize=12)\n", + "\n", + "plt.xlabel(\"Predicted LOR\", fontsize=12)\n", + "plt.ylabel(\"Measured LOR\", fontsize=12)\n", + "\n", + "plt.title(\"SK-N-SH, r = \" + str(round(r_val_sknsh, 3)) + \", n = \" + str(sknsh_pred.shape[0]), fontsize=12)\n", + "\n", + "#Plot HMC3 measurements\n", + "plt.sca(ax[2])\n", + "\n", + "plt.scatter(hmc3_pred, hmc3_true, color='gray', s=45, edgecolor='black', linewidth=1)\n", + "\n", + "#plt.xlim(x_min, x_max)\n", + "#plt.ylim(x_min, x_max)\n", + "\n", + "plt.xticks(fontsize=12)\n", + "plt.yticks(fontsize=12)\n", + "\n", + "plt.xlabel(\"Predicted LOR\", fontsize=12)\n", + "plt.ylabel(\"Measured LOR\", fontsize=12)\n", + "\n", + "plt.title(\"HMC3, r = \" + str(round(r_val_hmc3, 3)) + \", n = \" + 
str(hmc3_pred.shape[0]), fontsize=12)\n", + "\n", + "plt.tight_layout()\n", + "\n", + "if save_figs :\n", + " plt.savefig(fig_name + \".png\", dpi=600, transparent=True)\n", + " plt.savefig(fig_name + \".eps\")\n", + "\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "#Compare predictions vs. measurements, all variants, 77 to 127\n", + "\n", + "from scipy.stats import spearmanr\n", + "\n", + "save_figs = True\n", + "fig_name = \"all_vars_predicted_vs_measured_lor_77_127\"\n", + "\n", + "#HEK293 measurements\n", + "hek_min_c = 50.\n", + "\n", + "hek_min_ref_prox_c = 5.\n", + "hek_min_var_prox_c = 1.\n", + "\n", + "hek_filtered_df = hek_df.query(\"ref_count_total_repl_pooled >= \" + str(hek_min_c) + \" and \" + \"var_count_total_repl_pooled >= \" + str(hek_min_c) + \" and \" + \"ref_count_0_205_repl_pooled >= \" + str(hek_min_ref_prox_c) + \" and \" + \"var_count_0_205_repl_pooled >= \" + str(hek_min_var_prox_c))\n", + "\n", + "hek_true = np.array(hek_filtered_df['delta_logodds_true_77_127_repl_pooled'].values)\n", + "hek_pred = np.array(hek_filtered_df['delta_logodds_77_127'].values)\n", + "\n", + "#SK-N-SH measurements\n", + "sknsh_min_c = 50.\n", + "\n", + "sknsh_min_ref_prox_c = 5.\n", + "sknsh_min_var_prox_c = 1.\n", + "\n", + "sknsh_filtered_df = sknsh_df.query(\"ref_count_total_repl_pooled >= \" + str(sknsh_min_c) + \" and \" + \"var_count_total_repl_pooled >= \" + str(sknsh_min_c) + \" and \" + \"ref_count_0_205_repl_pooled >= \" + str(sknsh_min_ref_prox_c) + \" and \" + \"var_count_0_205_repl_pooled >= \" + str(sknsh_min_var_prox_c))\n", + "\n", + "sknsh_true = np.array(sknsh_filtered_df['delta_logodds_true_77_127_repl_pooled'].values)\n", + "sknsh_pred = np.array(sknsh_filtered_df['delta_logodds_77_127'].values)\n", + "\n", + "#HMC3 measurements\n", + "hmc3_min_c = 50.\n", + "\n", + "hmc3_min_ref_prox_c = 5.\n", + "hmc3_min_var_prox_c = 1.\n", + "\n", + "hmc3_filtered_df = hmc3_df.query(\"ref_count_total_repl_pooled >= \" + str(hmc3_min_c) + \" and \" + \"var_count_total_repl_pooled >= \" + str(hmc3_min_c) + \" and \" + \"ref_count_0_205_repl_pooled >= \" + str(hmc3_min_ref_prox_c) + \" and \" + \"var_count_0_205_repl_pooled >= \" + 
str(hmc3_min_var_prox_c))\n", + "\n", + "hmc3_true = np.array(hmc3_filtered_df['delta_logodds_true_77_127_repl_pooled'].values)\n", + "hmc3_pred = np.array(hmc3_filtered_df['delta_logodds_77_127'].values)\n", + "\n", + "r_val_hek, _ = spearmanr(hek_pred, hek_true)\n", + "r_val_sknsh, _ = spearmanr(sknsh_pred, sknsh_true)\n", + "r_val_hmc3, _ = spearmanr(hmc3_pred, hmc3_true)\n", + "\n", + "f, ax = plt.subplots(1, 3, figsize=(12, 4))\n", + "\n", + "#Plot HEK293 measurements\n", + "plt.sca(ax[0])\n", + "\n", + "plt.scatter(hek_pred, hek_true, color='gray', s=45, edgecolor='black', linewidth=1)\n", + "\n", + "#plt.xlim(x_min, x_max)\n", + "#plt.ylim(x_min, x_max)\n", + "\n", + "plt.xticks(fontsize=12)\n", + "plt.yticks(fontsize=12)\n", + "\n", + "plt.xlabel(\"Predicted LOR\", fontsize=12)\n", + "plt.ylabel(\"Measured LOR\", fontsize=12)\n", + "\n", + "plt.title(\"HEK293, r = \" + str(round(r_val_hek, 3)) + \", n = \" + str(hek_pred.shape[0]), fontsize=12)\n", + "\n", + "#Plot SK-N-SH measurements\n", + "plt.sca(ax[1])\n", + "\n", + "plt.scatter(sknsh_pred, sknsh_true, color='gray', s=45, edgecolor='black', linewidth=1)\n", + "\n", + "#plt.xlim(x_min, x_max)\n", + "#plt.ylim(x_min, x_max)\n", + "\n", + "plt.xticks(fontsize=12)\n", + "plt.yticks(fontsize=12)\n", + "\n", + "plt.xlabel(\"Predicted LOR\", fontsize=12)\n", + "plt.ylabel(\"Measured LOR\", fontsize=12)\n", + "\n", + "plt.title(\"SK-N-SH, r = \" + str(round(r_val_sknsh, 3)) + \", n = \" + str(sknsh_pred.shape[0]), fontsize=12)\n", + "\n", + "#Plot HMC3 measurements\n", + "plt.sca(ax[2])\n", + "\n", + "plt.scatter(hmc3_pred, hmc3_true, color='gray', s=45, edgecolor='black', linewidth=1)\n", + "\n", + "#plt.xlim(x_min, x_max)\n", + "#plt.ylim(x_min, x_max)\n", + "\n", + "plt.xticks(fontsize=12)\n", + "plt.yticks(fontsize=12)\n", + "\n", + "plt.xlabel(\"Predicted LOR\", fontsize=12)\n", + "plt.ylabel(\"Measured LOR\", fontsize=12)\n", + "\n", + "plt.title(\"HMC3, r = \" + str(round(r_val_hmc3, 3)) + \", n = \" + 
str(hmc3_pred.shape[0]), fontsize=12)\n", + "\n", + "plt.tight_layout()\n", + "\n", + "if save_figs :\n", + " plt.savefig(fig_name + \".png\", dpi=600, transparent=True)\n", + " plt.savefig(fig_name + \".eps\")\n", + "\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "#Compare predictions vs. measurements, all variants, 77 to 127\n", + "\n", + "from scipy.stats import spearmanr\n", + "\n", + "save_figs = True\n", + "fig_name = \"all_vars_predicted_vs_measured_lor_77_127_superimposed\"\n", + "\n", + "#HEK293 measurements\n", + "hek_min_c = 50.\n", + "\n", + "hek_min_ref_prox_c = 5.\n", + "hek_min_var_prox_c = 1.\n", + "\n", + "hek_filtered_df = hek_df.query(\"ref_count_total_repl_pooled >= \" + str(hek_min_c) + \" and \" + \"var_count_total_repl_pooled >= \" + str(hek_min_c) + \" and \" + \"ref_count_0_205_repl_pooled >= \" + str(hek_min_ref_prox_c) + \" and \" + \"var_count_0_205_repl_pooled >= \" + str(hek_min_var_prox_c))\n", + "\n", + "hek_true = np.array(hek_filtered_df['delta_logodds_true_77_127_repl_pooled'].values)\n", + "hek_pred = np.array(hek_filtered_df['delta_logodds_77_127'].values)\n", + "\n", + "#SK-N-SH measurements\n", + "sknsh_min_c = 50.\n", + "\n", + "sknsh_min_ref_prox_c = 5.\n", + "sknsh_min_var_prox_c = 1.\n", + "\n", + "sknsh_filtered_df = sknsh_df.query(\"ref_count_total_repl_pooled >= \" + str(sknsh_min_c) + \" and \" + \"var_count_total_repl_pooled >= \" + str(sknsh_min_c) + \" and \" + \"ref_count_0_205_repl_pooled >= \" + str(sknsh_min_ref_prox_c) + \" and \" + \"var_count_0_205_repl_pooled >= \" + str(sknsh_min_var_prox_c))\n", + "\n", + "sknsh_true = np.array(sknsh_filtered_df['delta_logodds_true_77_127_repl_pooled'].values)\n", + "sknsh_pred = np.array(sknsh_filtered_df['delta_logodds_77_127'].values)\n", + "\n", + "#HMC3 measurements\n", + "hmc3_min_c = 50.\n", + "\n", + "hmc3_min_ref_prox_c = 5.\n", + "hmc3_min_var_prox_c = 1.\n", + "\n", + "hmc3_filtered_df = hmc3_df.query(\"ref_count_total_repl_pooled >= \" + str(hmc3_min_c) + \" and \" + \"var_count_total_repl_pooled >= \" + str(hmc3_min_c) + \" and \" + \"ref_count_0_205_repl_pooled >= \" + str(hmc3_min_ref_prox_c) + \" and \" + 
\"var_count_0_205_repl_pooled >= \" + str(hmc3_min_var_prox_c))\n", + "\n", + "hmc3_true = np.array(hmc3_filtered_df['delta_logodds_true_77_127_repl_pooled'].values)\n", + "hmc3_pred = np.array(hmc3_filtered_df['delta_logodds_77_127'].values)\n", + "\n", + "r_val_hek, _ = spearmanr(hek_pred, hek_true)\n", + "r_val_sknsh, _ = spearmanr(sknsh_pred, sknsh_true)\n", + "r_val_hmc3, _ = spearmanr(hmc3_pred, hmc3_true)\n", + "\n", + "f = plt.subplots(figsize=(4, 4))\n", + "\n", + "#Plot HEK293 measurements\n", + "plt.scatter(hek_pred, hek_true, color='red', alpha=0.33, s=45, edgecolor='black', linewidth=1.5, label='HEK293')\n", + "\n", + "#Plot SK-N-SH measurements\n", + "plt.scatter(sknsh_pred, sknsh_true, color='blue', alpha=0.33, s=45, edgecolor='black', linewidth=1.5, label='SK-N-SH')\n", + "\n", + "#Plot HMC3 measurements\n", + "plt.scatter(hmc3_pred, hmc3_true, color='green', alpha=0.33, s=45, edgecolor='black', linewidth=1.5, label='HMC3')\n", + "\n", + "#plt.xlim(x_min, x_max)\n", + "#plt.ylim(x_min, x_max)\n", + "\n", + "plt.xticks(fontsize=12)\n", + "plt.yticks(fontsize=12)\n", + "\n", + "plt.xlabel(\"Predicted LOR\", fontsize=12)\n", + "plt.ylabel(\"Measured LOR\", fontsize=12)\n", + "\n", + "plt.legend(fontsize=12)\n", + "\n", + "plt.tight_layout()\n", + "\n", + "if save_figs :\n", + " plt.savefig(fig_name + \".png\", dpi=600, transparent=True)\n", + " plt.savefig(fig_name + \".eps\")\n", + "\n", + "plt.show()\n", + "\n", + "f = plt.figure(figsize=(2, 4))\n", + "\n", + "plt.bar([0], [r_val_hek], color='red', linewidth=1, edgecolor='black', alpha=0.5)\n", + "plt.bar([1], [r_val_sknsh], color='blue', linewidth=1, edgecolor='black', alpha=0.5)\n", + "plt.bar([2], [r_val_hmc3], color='green', linewidth=1, edgecolor='black', alpha=0.5)\n", + "\n", + "plt.ylim(0.5, 1.0)\n", + "\n", + "plt.xticks([0, 1, 2], ['HEK293', 'SK-N-SH', 'HMC3'], fontsize=12, rotation=45)\n", + "plt.yticks(fontsize=12)\n", + "\n", + "plt.ylabel(\"Spearman r\", fontsize=12)\n", + "\n", + 
"plt.tight_layout()\n", + "\n", + "if save_figs :\n", + " plt.savefig(fig_name + \"_r_vals.png\", dpi=600, transparent=True)\n", + " plt.savefig(fig_name + \"_r_vals.eps\")\n", + "\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " snp_id gene_id delta_logodds_0_205 \\\n", + "16 rs78378222 TP53.9 -2.1716 \n", + "22 rs12459634 IGFLR1.1 -1.5258 \n", + "14 rs6796 KDELR2.14 -1.0956 \n", + "20 rs35979828 NFE2.1 -0.9556 \n", + "15 rs35630683 ZNF592.7 -0.3138 \n", + "18 rs16833132 KPNA1.26 0.3026 \n", + "19 rs8753 POLR2A.3 0.4854 \n", + "21 rs2732480 ZNF641.5 0.5996 \n", + "\n", + " delta_logodds_true_0_205_repl_pooled \n", + "16 -2.596713 \n", + "22 -3.556974 \n", + "14 -1.395183 \n", + "20 -1.442611 \n", + "15 -0.154653 \n", + "18 0.055570 \n", + "19 0.205128 \n", + "21 0.740291 \n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "#Compare predictions vs. measurements, GWAS SNPs, 0 to 205\n", + "\n", + "from scipy.stats import spearmanr\n", + "\n", + "save_figs = True\n", + "fig_name = \"GWAS_predicted_vs_measured_lor_0_205\"\n", + "\n", + "#HEK293 measurements\n", + "hek_min_c = 50.\n", + "\n", + "hek_min_ref_prox_c = 5.\n", + "hek_min_var_prox_c = 1.\n", + "\n", + "hek_filtered_df = hek_df.query(\"data_source == 'GWAS' and ref_count_total_repl_pooled >= \" + str(hek_min_c) + \" and \" + \"var_count_total_repl_pooled >= \" + str(hek_min_c) + \" and \" + \"ref_count_0_205_repl_pooled >= \" + str(hek_min_ref_prox_c) + \" and \" + \"var_count_0_205_repl_pooled >= \" + str(hek_min_var_prox_c))\n", + "\n", + "print(hek_filtered_df.sort_values(by='delta_logodds_0_205')[['snp_id', 'gene_id', 'delta_logodds_0_205', 'delta_logodds_true_0_205_repl_pooled']])\n", + "\n", + "hek_true = np.array(hek_filtered_df['delta_logodds_true_0_205_repl_pooled'].values)\n", + "hek_pred = np.array(hek_filtered_df['delta_logodds_0_205'].values)\n", + "\n", + "#SK-N-SH measurements\n", + "sknsh_min_c = 50.\n", + "\n", + "sknsh_min_ref_prox_c = 5.\n", + "sknsh_min_var_prox_c = 1.\n", + "\n", + "sknsh_filtered_df = sknsh_df.query(\"data_source == 'GWAS' and ref_count_total_repl_pooled >= \" + str(sknsh_min_c) + \" and \" + \"var_count_total_repl_pooled >= \" + str(sknsh_min_c) + \" and \" + \"ref_count_0_205_repl_pooled >= \" + str(sknsh_min_ref_prox_c) + \" and \" + \"var_count_0_205_repl_pooled >= \" + str(sknsh_min_var_prox_c))\n", + "\n", + "sknsh_true = np.array(sknsh_filtered_df['delta_logodds_true_0_205_repl_pooled'].values)\n", + "sknsh_pred = np.array(sknsh_filtered_df['delta_logodds_0_205'].values)\n", + "\n", + "#HMC3 measurements\n", + "hmc3_min_c = 50.\n", + "\n", + "hmc3_min_ref_prox_c = 5.\n", + "hmc3_min_var_prox_c = 1.\n", + "\n", + "hmc3_filtered_df = hmc3_df.query(\"data_source == 'GWAS' and 
ref_count_total_repl_pooled >= \" + str(hmc3_min_c) + \" and \" + \"var_count_total_repl_pooled >= \" + str(hmc3_min_c) + \" and \" + \"ref_count_0_205_repl_pooled >= \" + str(hmc3_min_ref_prox_c) + \" and \" + \"var_count_0_205_repl_pooled >= \" + str(hmc3_min_var_prox_c))\n", + "\n", + "hmc3_true = np.array(hmc3_filtered_df['delta_logodds_true_0_205_repl_pooled'].values)\n", + "hmc3_pred = np.array(hmc3_filtered_df['delta_logodds_0_205'].values)\n", + "\n", + "r_val_hek, _ = spearmanr(hek_pred, hek_true)\n", + "r_val_sknsh, _ = spearmanr(sknsh_pred, sknsh_true)\n", + "r_val_hmc3, _ = spearmanr(hmc3_pred, hmc3_true)\n", + "\n", + "f, ax = plt.subplots(1, 3, figsize=(12, 4))\n", + "\n", + "#Plot HEK293 measurements\n", + "plt.sca(ax[0])\n", + "\n", + "plt.scatter(hek_pred, hek_true, color='red', marker='^', s=125, edgecolor='black', linewidth=1)\n", + "\n", + "#plt.xlim(x_min, x_max)\n", + "#plt.ylim(x_min, x_max)\n", + "\n", + "plt.xticks(fontsize=12)\n", + "plt.yticks(fontsize=12)\n", + "\n", + "plt.xlabel(\"Predicted LOR\", fontsize=12)\n", + "plt.ylabel(\"Measured LOR\", fontsize=12)\n", + "\n", + "plt.title(\"HEK293, r = \" + str(round(r_val_hek, 3)) + \", n = \" + str(hek_pred.shape[0]), fontsize=12)\n", + "\n", + "#Plot SK-N-SH measurements\n", + "plt.sca(ax[1])\n", + "\n", + "plt.scatter(sknsh_pred, sknsh_true, color='red', marker='^', s=125, edgecolor='black', linewidth=1)\n", + "\n", + "#plt.xlim(x_min, x_max)\n", + "#plt.ylim(x_min, x_max)\n", + "\n", + "plt.xticks(fontsize=12)\n", + "plt.yticks(fontsize=12)\n", + "\n", + "plt.xlabel(\"Predicted LOR\", fontsize=12)\n", + "plt.ylabel(\"Measured LOR\", fontsize=12)\n", + "\n", + "plt.title(\"SK-N-SH, r = \" + str(round(r_val_sknsh, 3)) + \", n = \" + str(sknsh_pred.shape[0]), fontsize=12)\n", + "\n", + "#Plot HMC3 measurements\n", + "plt.sca(ax[2])\n", + "\n", + "plt.scatter(hmc3_pred, hmc3_true, color='red', marker='^', s=125, edgecolor='black', linewidth=1)\n", + "\n", + "#plt.xlim(x_min, x_max)\n", + 
"#plt.ylim(x_min, x_max)\n", + "\n", + "plt.xticks(fontsize=12)\n", + "plt.yticks(fontsize=12)\n", + "\n", + "plt.xlabel(\"Predicted LOR\", fontsize=12)\n", + "plt.ylabel(\"Measured LOR\", fontsize=12)\n", + "\n", + "plt.title(\"HMC3, r = \" + str(round(r_val_hmc3, 3)) + \", n = \" + str(hmc3_pred.shape[0]), fontsize=12)\n", + "\n", + "plt.tight_layout()\n", + "\n", + "if save_figs :\n", + " plt.savefig(fig_name + \".png\", dpi=600, transparent=True)\n", + " plt.savefig(fig_name + \".eps\")\n", + "\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " snp_id gene_id delta_logodds_77_127 \\\n", + "16 rs78378222 TP53.9 -2.3632 \n", + "14 rs6796 KDELR2.14 -2.3504 \n", + "22 rs12459634 IGFLR1.1 -1.7076 \n", + "20 rs35979828 NFE2.1 -1.0283 \n", + "15 rs35630683 ZNF592.7 -0.4139 \n", + "18 rs16833132 KPNA1.26 0.7260 \n", + "19 rs8753 POLR2A.3 0.8946 \n", + "21 rs2732480 ZNF641.5 0.9084 \n", + "\n", + " delta_logodds_true_77_127_repl_pooled \n", + "16 -2.655698 \n", + "14 -2.888136 \n", + "22 -3.456609 \n", + "20 -1.478624 \n", + "15 -0.201321 \n", + "18 0.311756 \n", + "19 0.257417 \n", + "21 0.839856 \n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "#Compare predictions vs. measurements, GWAS SNPs, 77 to 127\n", + "\n", + "from scipy.stats import spearmanr\n", + "\n", + "save_figs = True\n", + "fig_name = \"GWAS_predicted_vs_measured_lor_77_127\"\n", + "\n", + "#HEK293 measurements\n", + "hek_min_c = 50.\n", + "\n", + "hek_min_ref_prox_c = 5.\n", + "hek_min_var_prox_c = 1.\n", + "\n", + "hek_filtered_df = hek_df.query(\"data_source == 'GWAS' and ref_count_total_repl_pooled >= \" + str(hek_min_c) + \" and \" + \"var_count_total_repl_pooled >= \" + str(hek_min_c) + \" and \" + \"ref_count_0_205_repl_pooled >= \" + str(hek_min_ref_prox_c) + \" and \" + \"var_count_0_205_repl_pooled >= \" + str(hek_min_var_prox_c))\n", + "\n", + "print(hek_filtered_df.sort_values(by='delta_logodds_77_127')[['snp_id', 'gene_id', 'delta_logodds_77_127', 'delta_logodds_true_77_127_repl_pooled']])\n", + "\n", + "hek_true = np.array(hek_filtered_df['delta_logodds_true_77_127_repl_pooled'].values)\n", + "hek_pred = np.array(hek_filtered_df['delta_logodds_77_127'].values)\n", + "\n", + "#SK-N-SH measurements\n", + "sknsh_min_c = 50.\n", + "\n", + "sknsh_min_ref_prox_c = 5.\n", + "sknsh_min_var_prox_c = 1.\n", + "\n", + "sknsh_filtered_df = sknsh_df.query(\"data_source == 'GWAS' and ref_count_total_repl_pooled >= \" + str(sknsh_min_c) + \" and \" + \"var_count_total_repl_pooled >= \" + str(sknsh_min_c) + \" and \" + \"ref_count_0_205_repl_pooled >= \" + str(sknsh_min_ref_prox_c) + \" and \" + \"var_count_0_205_repl_pooled >= \" + str(sknsh_min_var_prox_c))\n", + "\n", + "sknsh_true = np.array(sknsh_filtered_df['delta_logodds_true_77_127_repl_pooled'].values)\n", + "sknsh_pred = np.array(sknsh_filtered_df['delta_logodds_77_127'].values)\n", + "\n", + "#HMC3 measurements\n", + "hmc3_min_c = 50.\n", + "\n", + "hmc3_min_ref_prox_c = 5.\n", + "hmc3_min_var_prox_c = 1.\n", + "\n", + "hmc3_filtered_df = hmc3_df.query(\"data_source == 'GWAS' and 
ref_count_total_repl_pooled >= \" + str(hmc3_min_c) + \" and \" + \"var_count_total_repl_pooled >= \" + str(hmc3_min_c) + \" and \" + \"ref_count_0_205_repl_pooled >= \" + str(hmc3_min_ref_prox_c) + \" and \" + \"var_count_0_205_repl_pooled >= \" + str(hmc3_min_var_prox_c))\n", + "\n", + "hmc3_true = np.array(hmc3_filtered_df['delta_logodds_true_77_127_repl_pooled'].values)\n", + "hmc3_pred = np.array(hmc3_filtered_df['delta_logodds_77_127'].values)\n", + "\n", + "r_val_hek, _ = spearmanr(hek_pred, hek_true)\n", + "r_val_sknsh, _ = spearmanr(sknsh_pred, sknsh_true)\n", + "r_val_hmc3, _ = spearmanr(hmc3_pred, hmc3_true)\n", + "\n", + "f, ax = plt.subplots(1, 3, figsize=(12, 4))\n", + "\n", + "#Plot HEK293 measurements\n", + "plt.sca(ax[0])\n", + "\n", + "plt.scatter(hek_pred, hek_true, color='red', marker='^', s=125, edgecolor='black', linewidth=1)\n", + "\n", + "#plt.xlim(x_min, x_max)\n", + "#plt.ylim(x_min, x_max)\n", + "\n", + "plt.xticks(fontsize=12)\n", + "plt.yticks(fontsize=12)\n", + "\n", + "plt.xlabel(\"Predicted LOR\", fontsize=12)\n", + "plt.ylabel(\"Measured LOR\", fontsize=12)\n", + "\n", + "plt.title(\"HEK293, r = \" + str(round(r_val_hek, 3)) + \", n = \" + str(hek_pred.shape[0]), fontsize=12)\n", + "\n", + "#Plot SK-N-SH measurements\n", + "plt.sca(ax[1])\n", + "\n", + "plt.scatter(sknsh_pred, sknsh_true, color='red', marker='^', s=125, edgecolor='black', linewidth=1)\n", + "\n", + "#plt.xlim(x_min, x_max)\n", + "#plt.ylim(x_min, x_max)\n", + "\n", + "plt.xticks(fontsize=12)\n", + "plt.yticks(fontsize=12)\n", + "\n", + "plt.xlabel(\"Predicted LOR\", fontsize=12)\n", + "plt.ylabel(\"Measured LOR\", fontsize=12)\n", + "\n", + "plt.title(\"SK-N-SH, r = \" + str(round(r_val_sknsh, 3)) + \", n = \" + str(sknsh_pred.shape[0]), fontsize=12)\n", + "\n", + "#Plot HMC3 measurements\n", + "plt.sca(ax[2])\n", + "\n", + "plt.scatter(hmc3_pred, hmc3_true, color='red', marker='^', s=125, edgecolor='black', linewidth=1)\n", + "\n", + "#plt.xlim(x_min, x_max)\n", + 
"#plt.ylim(x_min, x_max)\n", + "\n", + "plt.xticks(fontsize=12)\n", + "plt.yticks(fontsize=12)\n", + "\n", + "plt.xlabel(\"Predicted LOR\", fontsize=12)\n", + "plt.ylabel(\"Measured LOR\", fontsize=12)\n", + "\n", + "plt.title(\"HMC3, r = \" + str(round(r_val_hmc3, 3)) + \", n = \" + str(hmc3_pred.shape[0]), fontsize=12)\n", + "\n", + "plt.tight_layout()\n", + "\n", + "if save_figs :\n", + " plt.savefig(fig_name + \".png\", dpi=600, transparent=True)\n", + " plt.savefig(fig_name + \".eps\")\n", + "\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "#Plot individual example: F2 variants (measurements, 77 to 127)\n", + "\n", + "save_figs = True\n", + "fig_name = \"F2_example_measured_lor_77_127\"\n", + "\n", + "plot_title = \"F2\"\n", + "\n", + "test_ixs = [9, 11]\n", + "test_colors = ['orange', 'deepskyblue']\n", + "\n", + "min_y = -1.6\n", + "max_y = 1.6\n", + "\n", + "f = plt.figure(figsize=(2, 4))\n", + "\n", + "for test_ix, test_color in zip(test_ixs, test_colors) :\n", + " \n", + " hek_true = hek_df.iloc[test_ix]['delta_logodds_true_77_127_repl_pooled']\n", + " sknsh_true = sknsh_df.iloc[test_ix]['delta_logodds_true_77_127_repl_pooled']\n", + " hmc3_true = hmc3_df.iloc[test_ix]['delta_logodds_true_77_127_repl_pooled']\n", + " \n", + " plt.scatter([0, 1, 2], [hek_true, sknsh_true, hmc3_true], s=55, color=test_color, edgecolor='black', linewidth=1)\n", + "\n", + "plt.axhline(y=0., linewidth=2, linestyle='--', color='black')\n", + "\n", + "plt.xlim(-1, 3)\n", + "plt.ylim(min_y, max_y)\n", + "\n", + "plt.xticks([0, 1, 2], [\"HEK293\", \"SK-N-SH\", \"HMC3\"], fontsize=12, rotation=60)\n", + "plt.yticks(fontsize=12)\n", + "\n", + "plt.ylabel(\"Measured LOR\", fontsize=12)\n", + "\n", + "plt.title(plot_title, fontsize=12)\n", + "\n", + "plt.tight_layout()\n", + "\n", + "if save_figs :\n", + " plt.savefig(fig_name + \".png\", dpi=600, transparent=True)\n", + " plt.savefig(fig_name + \".eps\")\n", + "\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "#Plot individual example: SCAF8 variants (measurements, 77 to 127)\n", + "\n", + "save_figs = True\n", + "fig_name = \"SCAF8_example_measured_lor_77_127\"\n", + "\n", + "plot_title = \"SCAF8\"\n", + "\n", + "test_ixs = [99, 13]\n", + "test_colors = ['orange', 'deepskyblue']\n", + "\n", + "min_y = -1.6\n", + "max_y = 1.6\n", + "\n", + "f = plt.figure(figsize=(2, 4))\n", + "\n", + "for test_ix, test_color in zip(test_ixs, test_colors) :\n", + " \n", + " hek_true = hek_df.iloc[test_ix]['delta_logodds_true_77_127_repl_pooled']\n", + " sknsh_true = sknsh_df.iloc[test_ix]['delta_logodds_true_77_127_repl_pooled']\n", + " hmc3_true = hmc3_df.iloc[test_ix]['delta_logodds_true_77_127_repl_pooled']\n", + " \n", + " plt.scatter([0, 1, 2], [hek_true, sknsh_true, hmc3_true], s=55, color=test_color, edgecolor='black', linewidth=1)\n", + "\n", + "plt.axhline(y=0., linewidth=2, linestyle='--', color='black')\n", + "\n", + "plt.xlim(-1, 3)\n", + "plt.ylim(min_y, max_y)\n", + "\n", + "plt.xticks([0, 1, 2], [\"HEK293\", \"SK-N-SH\", \"HMC3\"], fontsize=12, rotation=60)\n", + "plt.yticks(fontsize=12)\n", + "\n", + "plt.ylabel(\"Measured LOR\", fontsize=12)\n", + "\n", + "plt.title(plot_title, fontsize=12)\n", + "\n", + "plt.tight_layout()\n", + "\n", + "if save_figs :\n", + " plt.savefig(fig_name + \".png\", dpi=600, transparent=True)\n", + " plt.savefig(fig_name + \".eps\")\n", + "\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "#Plot individual example: Brain-specific aQTL (measurements, 77 to 127)\n", + "\n", + "save_figs = True\n", + "fig_name = \"brain_aQTL_example_measured_lor_77_127\"\n", + "\n", + "plot_title = \"Brain\"\n", + "\n", + "test_ixs = [6, 99]\n", + "test_colors = ['orange', 'deepskyblue']\n", + "\n", + "min_y = -1.6\n", + "max_y = 1.6\n", + "\n", + "f = plt.figure(figsize=(2, 4))\n", + "\n", + "for test_ix, test_color in zip(test_ixs, test_colors) :\n", + " \n", + " hek_true = hek_df.iloc[test_ix]['delta_logodds_true_77_127_repl_pooled']\n", + " sknsh_true = sknsh_df.iloc[test_ix]['delta_logodds_true_77_127_repl_pooled']\n", + " hmc3_true = hmc3_df.iloc[test_ix]['delta_logodds_true_77_127_repl_pooled']\n", + " \n", + " plt.scatter([0, 1, 2], [hek_true, sknsh_true, hmc3_true], s=55, color=test_color, edgecolor='black', linewidth=1)\n", + "\n", + "plt.axhline(y=0., linewidth=2, linestyle='--', color='black')\n", + "\n", + "plt.xlim(-1, 3)\n", + "plt.ylim(min_y, max_y)\n", + "\n", + "plt.xticks([0, 1, 2], [\"HEK293\", \"SK-N-SH\", \"HMC3\"], fontsize=12, rotation=60)\n", + "plt.yticks(fontsize=12)\n", + "\n", + "plt.ylabel(\"Measured LOR\", fontsize=12)\n", + "\n", + "plt.title(plot_title, fontsize=12)\n", + "\n", + "plt.tight_layout()\n", + "\n", + "if save_figs :\n", + " plt.savefig(fig_name + \".png\", dpi=600, transparent=True)\n", + " plt.savefig(fig_name + \".eps\")\n", + "\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "#Plot individual example: Brain-specific GWAS SNP (measurements, 77 to 127)\n", + "\n", + "save_figs = True\n", + "fig_name = \"gwas_example_measured_lor_77_127\"\n", + "\n", + "plot_title = \"GWAS\"\n", + "\n", + "test_ixs = [17, 99]\n", + "test_colors = ['orange', 'deepskyblue']\n", + "\n", + "min_y = -1.6\n", + "max_y = 1.6\n", + "\n", + "f = plt.figure(figsize=(2, 4))\n", + "\n", + "for test_ix, test_color in zip(test_ixs, test_colors) :\n", + " \n", + " hek_true = hek_df.iloc[test_ix]['delta_logodds_true_77_127_repl_pooled']\n", + " sknsh_true = sknsh_df.iloc[test_ix]['delta_logodds_true_77_127_repl_pooled']\n", + " hmc3_true = hmc3_df.iloc[test_ix]['delta_logodds_true_77_127_repl_pooled']\n", + " \n", + " plt.scatter([0, 1, 2], [hek_true, sknsh_true, hmc3_true], s=55, color=test_color, edgecolor='black', linewidth=1)\n", + "\n", + "plt.axhline(y=0., linewidth=2, linestyle='--', color='black')\n", + "\n", + "plt.xlim(-1, 3)\n", + "plt.ylim(min_y, max_y)\n", + "\n", + "plt.xticks([0, 1, 2], [\"HEK293\", \"SK-N-SH\", \"HMC3\"], fontsize=12, rotation=60)\n", + "plt.yticks(fontsize=12)\n", + "\n", + "plt.ylabel(\"Measured LOR\", fontsize=12)\n", + "\n", + "plt.title(plot_title, fontsize=12)\n", + "\n", + "plt.tight_layout()\n", + "\n", + "if save_figs :\n", + " plt.savefig(fig_name + \".png\", dpi=600, transparent=True)\n", + " plt.savefig(fig_name + \".eps\")\n", + "\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:tensorflow]", + "language": "python", + "name": "conda-env-tensorflow-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": 
"ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/data/oligo_pool_2022/medium_library/process_reads_hek293.ipynb b/data/oligo_pool_2022/medium_library/process_reads_hek293.ipynb new file mode 100644 index 0000000..8dfe2ea --- /dev/null +++ b/data/oligo_pool_2022/medium_library/process_reads_hek293.ipynb @@ -0,0 +1,1851 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "\n", + "import scipy\n", + "import scipy.io as spio\n", + "import scipy.sparse as sp\n", + "\n", + "import regex as re\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "len(barcode_dict) = 201500\n" + ] + } + ], + "source": [ + "#Load reference library dataframe and build barcode dictionary\n", + "\n", + "library_df = pd.read_csv(\"apa_100_variants_rev2_20220621_pred.csv\", sep='\\t')\n", + "\n", + "#Build dictionary (double-mutation support)\n", + "bases = ['A', 'C', 'G', 'T']\n", + "\n", + "barcode_dict = {}\n", + "sequences = []\n", + "for i, [_, row] in enumerate(library_df.iterrows()) :\n", + " bc = row['ref_barcode']\n", + " \n", + " sequences.append(row['ref_seq'])\n", + " \n", + " barcode_dict[bc] = i\n", + " for pos1 in range(len(bc)) :\n", + " for pos2 in range(pos1, len(bc)) :\n", + " for b1 in bases :\n", + " for b2 in bases :\n", + " bc_mut = bc[:pos1] + b1 + bc[pos1+1:pos2] + b2 + bc[pos2+1:]\n", + " \n", + " if bc_mut in barcode_dict and barcode_dict[bc_mut] != i :\n", + " print(\"[ERROR] Barcode dictionary collision.\")\n", + " else :\n", + " barcode_dict[bc_mut] = i\n", + "\n", + "print(\"len(barcode_dict) = \" + str(len(barcode_dict)))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + 
"text/plain": [ + "{'GATGCAGCTGGCTATCATGA': 0,\n", + " 'AAATGCAGCTGGCTATCATGA': 0,\n", + " 'ACATGCAGCTGGCTATCATGA': 0,\n", + " 'AGATGCAGCTGGCTATCATGA': 0,\n", + " 'ATATGCAGCTGGCTATCATGA': 0,\n", + " 'CAATGCAGCTGGCTATCATGA': 0,\n", + " 'CCATGCAGCTGGCTATCATGA': 0,\n", + " 'CGATGCAGCTGGCTATCATGA': 0,\n", + " 'CTATGCAGCTGGCTATCATGA': 0,\n", + " 'GAATGCAGCTGGCTATCATGA': 0,\n", + " 'GCATGCAGCTGGCTATCATGA': 0,\n", + " 'GGATGCAGCTGGCTATCATGA': 0,\n", + " 'GTATGCAGCTGGCTATCATGA': 0,\n", + " 'TAATGCAGCTGGCTATCATGA': 0,\n", + " 'TCATGCAGCTGGCTATCATGA': 0,\n", + " 'TGATGCAGCTGGCTATCATGA': 0,\n", + " 'TTATGCAGCTGGCTATCATGA': 0,\n", + " 'AATGCAGCTGGCTATCATGA': 0,\n", + " 'ACTGCAGCTGGCTATCATGA': 0,\n", + " 'AGTGCAGCTGGCTATCATGA': 0,\n", + " 'ATTGCAGCTGGCTATCATGA': 0,\n", + " 'CATGCAGCTGGCTATCATGA': 0,\n", + " 'CCTGCAGCTGGCTATCATGA': 0,\n", + " 'CGTGCAGCTGGCTATCATGA': 0,\n", + " 'CTTGCAGCTGGCTATCATGA': 0,\n", + " 'GCTGCAGCTGGCTATCATGA': 0,\n", + " 'GGTGCAGCTGGCTATCATGA': 0,\n", + " 'GTTGCAGCTGGCTATCATGA': 0,\n", + " 'TATGCAGCTGGCTATCATGA': 0,\n", + " 'TCTGCAGCTGGCTATCATGA': 0,\n", + " 'TGTGCAGCTGGCTATCATGA': 0,\n", + " 'TTTGCAGCTGGCTATCATGA': 0,\n", + " 'AAAGCAGCTGGCTATCATGA': 0,\n", + " 'AACGCAGCTGGCTATCATGA': 0,\n", + " 'AAGGCAGCTGGCTATCATGA': 0,\n", + " 'CAAGCAGCTGGCTATCATGA': 0,\n", + " 'CACGCAGCTGGCTATCATGA': 0,\n", + " 'CAGGCAGCTGGCTATCATGA': 0,\n", + " 'GAAGCAGCTGGCTATCATGA': 0,\n", + " 'GACGCAGCTGGCTATCATGA': 0,\n", + " 'GAGGCAGCTGGCTATCATGA': 0,\n", + " 'TAAGCAGCTGGCTATCATGA': 0,\n", + " 'TACGCAGCTGGCTATCATGA': 0,\n", + " 'TAGGCAGCTGGCTATCATGA': 0,\n", + " 'AATACAGCTGGCTATCATGA': 0,\n", + " 'AATCCAGCTGGCTATCATGA': 0,\n", + " 'AATTCAGCTGGCTATCATGA': 0,\n", + " 'CATACAGCTGGCTATCATGA': 0,\n", + " 'CATCCAGCTGGCTATCATGA': 0,\n", + " 'CATTCAGCTGGCTATCATGA': 0,\n", + " 'GATACAGCTGGCTATCATGA': 0,\n", + " 'GATCCAGCTGGCTATCATGA': 0,\n", + " 'GATTCAGCTGGCTATCATGA': 0,\n", + " 'TATACAGCTGGCTATCATGA': 0,\n", + " 'TATCCAGCTGGCTATCATGA': 0,\n", + " 'TATTCAGCTGGCTATCATGA': 0,\n", + " 
'AATGAAGCTGGCTATCATGA': 0,\n", + " 'AATGGAGCTGGCTATCATGA': 0,\n", + " 'AATGTAGCTGGCTATCATGA': 0,\n", + " 'CATGAAGCTGGCTATCATGA': 0,\n", + " 'CATGGAGCTGGCTATCATGA': 0,\n", + " 'CATGTAGCTGGCTATCATGA': 0,\n", + " 'GATGAAGCTGGCTATCATGA': 0,\n", + " 'GATGGAGCTGGCTATCATGA': 0,\n", + " 'GATGTAGCTGGCTATCATGA': 0,\n", + " 'TATGAAGCTGGCTATCATGA': 0,\n", + " 'TATGGAGCTGGCTATCATGA': 0,\n", + " 'TATGTAGCTGGCTATCATGA': 0,\n", + " 'AATGCCGCTGGCTATCATGA': 0,\n", + " 'AATGCGGCTGGCTATCATGA': 0,\n", + " 'AATGCTGCTGGCTATCATGA': 0,\n", + " 'CATGCCGCTGGCTATCATGA': 0,\n", + " 'CATGCGGCTGGCTATCATGA': 0,\n", + " 'CATGCTGCTGGCTATCATGA': 0,\n", + " 'GATGCCGCTGGCTATCATGA': 0,\n", + " 'GATGCGGCTGGCTATCATGA': 0,\n", + " 'GATGCTGCTGGCTATCATGA': 0,\n", + " 'TATGCCGCTGGCTATCATGA': 0,\n", + " 'TATGCGGCTGGCTATCATGA': 0,\n", + " 'TATGCTGCTGGCTATCATGA': 0,\n", + " 'AATGCAACTGGCTATCATGA': 0,\n", + " 'AATGCACCTGGCTATCATGA': 0,\n", + " 'AATGCATCTGGCTATCATGA': 0,\n", + " 'CATGCAACTGGCTATCATGA': 0,\n", + " 'CATGCACCTGGCTATCATGA': 0,\n", + " 'CATGCATCTGGCTATCATGA': 0,\n", + " 'GATGCAACTGGCTATCATGA': 0,\n", + " 'GATGCACCTGGCTATCATGA': 0,\n", + " 'GATGCATCTGGCTATCATGA': 0,\n", + " 'TATGCAACTGGCTATCATGA': 0,\n", + " 'TATGCACCTGGCTATCATGA': 0,\n", + " 'TATGCATCTGGCTATCATGA': 0,\n", + " 'AATGCAGATGGCTATCATGA': 0,\n", + " 'AATGCAGGTGGCTATCATGA': 0,\n", + " 'AATGCAGTTGGCTATCATGA': 0,\n", + " 'CATGCAGATGGCTATCATGA': 0,\n", + " 'CATGCAGGTGGCTATCATGA': 0,\n", + " 'CATGCAGTTGGCTATCATGA': 0,\n", + " 'GATGCAGATGGCTATCATGA': 0,\n", + " 'GATGCAGGTGGCTATCATGA': 0,\n", + " 'GATGCAGTTGGCTATCATGA': 0,\n", + " 'TATGCAGATGGCTATCATGA': 0,\n", + " 'TATGCAGGTGGCTATCATGA': 0,\n", + " 'TATGCAGTTGGCTATCATGA': 0,\n", + " 'AATGCAGCAGGCTATCATGA': 0,\n", + " 'AATGCAGCCGGCTATCATGA': 0,\n", + " 'AATGCAGCGGGCTATCATGA': 0,\n", + " 'CATGCAGCAGGCTATCATGA': 0,\n", + " 'CATGCAGCCGGCTATCATGA': 0,\n", + " 'CATGCAGCGGGCTATCATGA': 0,\n", + " 'GATGCAGCAGGCTATCATGA': 0,\n", + " 'GATGCAGCCGGCTATCATGA': 0,\n", + " 'GATGCAGCGGGCTATCATGA': 0,\n", + " 
'TATGCAGCAGGCTATCATGA': 0,\n", + " 'TATGCAGCCGGCTATCATGA': 0,\n", + " 'TATGCAGCGGGCTATCATGA': 0,\n", + " 'AATGCAGCTAGCTATCATGA': 0,\n", + " 'AATGCAGCTCGCTATCATGA': 0,\n", + " 'AATGCAGCTTGCTATCATGA': 0,\n", + " 'CATGCAGCTAGCTATCATGA': 0,\n", + " 'CATGCAGCTCGCTATCATGA': 0,\n", + " 'CATGCAGCTTGCTATCATGA': 0,\n", + " 'GATGCAGCTAGCTATCATGA': 0,\n", + " 'GATGCAGCTCGCTATCATGA': 0,\n", + " 'GATGCAGCTTGCTATCATGA': 0,\n", + " 'TATGCAGCTAGCTATCATGA': 0,\n", + " 'TATGCAGCTCGCTATCATGA': 0,\n", + " 'TATGCAGCTTGCTATCATGA': 0,\n", + " 'AATGCAGCTGACTATCATGA': 0,\n", + " 'AATGCAGCTGCCTATCATGA': 0,\n", + " 'AATGCAGCTGTCTATCATGA': 0,\n", + " 'CATGCAGCTGACTATCATGA': 0,\n", + " 'CATGCAGCTGCCTATCATGA': 0,\n", + " 'CATGCAGCTGTCTATCATGA': 0,\n", + " 'GATGCAGCTGACTATCATGA': 0,\n", + " 'GATGCAGCTGCCTATCATGA': 0,\n", + " 'GATGCAGCTGTCTATCATGA': 0,\n", + " 'TATGCAGCTGACTATCATGA': 0,\n", + " 'TATGCAGCTGCCTATCATGA': 0,\n", + " 'TATGCAGCTGTCTATCATGA': 0,\n", + " 'AATGCAGCTGGATATCATGA': 0,\n", + " 'AATGCAGCTGGGTATCATGA': 0,\n", + " 'AATGCAGCTGGTTATCATGA': 0,\n", + " 'CATGCAGCTGGATATCATGA': 0,\n", + " 'CATGCAGCTGGGTATCATGA': 0,\n", + " 'CATGCAGCTGGTTATCATGA': 0,\n", + " 'GATGCAGCTGGATATCATGA': 0,\n", + " 'GATGCAGCTGGGTATCATGA': 0,\n", + " 'GATGCAGCTGGTTATCATGA': 0,\n", + " 'TATGCAGCTGGATATCATGA': 0,\n", + " 'TATGCAGCTGGGTATCATGA': 0,\n", + " 'TATGCAGCTGGTTATCATGA': 0,\n", + " 'AATGCAGCTGGCAATCATGA': 0,\n", + " 'AATGCAGCTGGCCATCATGA': 0,\n", + " 'AATGCAGCTGGCGATCATGA': 0,\n", + " 'CATGCAGCTGGCAATCATGA': 0,\n", + " 'CATGCAGCTGGCCATCATGA': 0,\n", + " 'CATGCAGCTGGCGATCATGA': 0,\n", + " 'GATGCAGCTGGCAATCATGA': 0,\n", + " 'GATGCAGCTGGCCATCATGA': 0,\n", + " 'GATGCAGCTGGCGATCATGA': 0,\n", + " 'TATGCAGCTGGCAATCATGA': 0,\n", + " 'TATGCAGCTGGCCATCATGA': 0,\n", + " 'TATGCAGCTGGCGATCATGA': 0,\n", + " 'AATGCAGCTGGCTCTCATGA': 0,\n", + " 'AATGCAGCTGGCTGTCATGA': 0,\n", + " 'AATGCAGCTGGCTTTCATGA': 0,\n", + " 'CATGCAGCTGGCTCTCATGA': 0,\n", + " 'CATGCAGCTGGCTGTCATGA': 0,\n", + " 'CATGCAGCTGGCTTTCATGA': 0,\n", + " 
'GATGCAGCTGGCTCTCATGA': 0,\n", + " 'GATGCAGCTGGCTGTCATGA': 0,\n", + " 'GATGCAGCTGGCTTTCATGA': 0,\n", + " 'TATGCAGCTGGCTCTCATGA': 0,\n", + " 'TATGCAGCTGGCTGTCATGA': 0,\n", + " 'TATGCAGCTGGCTTTCATGA': 0,\n", + " 'AATGCAGCTGGCTAACATGA': 0,\n", + " 'AATGCAGCTGGCTACCATGA': 0,\n", + " 'AATGCAGCTGGCTAGCATGA': 0,\n", + " 'CATGCAGCTGGCTAACATGA': 0,\n", + " 'CATGCAGCTGGCTACCATGA': 0,\n", + " 'CATGCAGCTGGCTAGCATGA': 0,\n", + " 'GATGCAGCTGGCTAACATGA': 0,\n", + " 'GATGCAGCTGGCTACCATGA': 0,\n", + " 'GATGCAGCTGGCTAGCATGA': 0,\n", + " 'TATGCAGCTGGCTAACATGA': 0,\n", + " 'TATGCAGCTGGCTACCATGA': 0,\n", + " 'TATGCAGCTGGCTAGCATGA': 0,\n", + " 'AATGCAGCTGGCTATAATGA': 0,\n", + " 'AATGCAGCTGGCTATGATGA': 0,\n", + " 'AATGCAGCTGGCTATTATGA': 0,\n", + " 'CATGCAGCTGGCTATAATGA': 0,\n", + " 'CATGCAGCTGGCTATGATGA': 0,\n", + " 'CATGCAGCTGGCTATTATGA': 0,\n", + " 'GATGCAGCTGGCTATAATGA': 0,\n", + " 'GATGCAGCTGGCTATGATGA': 0,\n", + " 'GATGCAGCTGGCTATTATGA': 0,\n", + " 'TATGCAGCTGGCTATAATGA': 0,\n", + " 'TATGCAGCTGGCTATGATGA': 0,\n", + " 'TATGCAGCTGGCTATTATGA': 0,\n", + " 'AATGCAGCTGGCTATCCTGA': 0,\n", + " 'AATGCAGCTGGCTATCGTGA': 0,\n", + " 'AATGCAGCTGGCTATCTTGA': 0,\n", + " 'CATGCAGCTGGCTATCCTGA': 0,\n", + " 'CATGCAGCTGGCTATCGTGA': 0,\n", + " 'CATGCAGCTGGCTATCTTGA': 0,\n", + " 'GATGCAGCTGGCTATCCTGA': 0,\n", + " 'GATGCAGCTGGCTATCGTGA': 0,\n", + " 'GATGCAGCTGGCTATCTTGA': 0,\n", + " 'TATGCAGCTGGCTATCCTGA': 0,\n", + " 'TATGCAGCTGGCTATCGTGA': 0,\n", + " 'TATGCAGCTGGCTATCTTGA': 0,\n", + " 'AATGCAGCTGGCTATCAAGA': 0,\n", + " 'AATGCAGCTGGCTATCACGA': 0,\n", + " 'AATGCAGCTGGCTATCAGGA': 0,\n", + " 'CATGCAGCTGGCTATCAAGA': 0,\n", + " 'CATGCAGCTGGCTATCACGA': 0,\n", + " 'CATGCAGCTGGCTATCAGGA': 0,\n", + " 'GATGCAGCTGGCTATCAAGA': 0,\n", + " 'GATGCAGCTGGCTATCACGA': 0,\n", + " 'GATGCAGCTGGCTATCAGGA': 0,\n", + " 'TATGCAGCTGGCTATCAAGA': 0,\n", + " 'TATGCAGCTGGCTATCACGA': 0,\n", + " 'TATGCAGCTGGCTATCAGGA': 0,\n", + " 'AATGCAGCTGGCTATCATAA': 0,\n", + " 'AATGCAGCTGGCTATCATCA': 0,\n", + " 'AATGCAGCTGGCTATCATTA': 0,\n", + " 
'CATGCAGCTGGCTATCATAA': 0,\n", + " 'CATGCAGCTGGCTATCATCA': 0,\n", + " 'CATGCAGCTGGCTATCATTA': 0,\n", + " 'GATGCAGCTGGCTATCATAA': 0,\n", + " 'GATGCAGCTGGCTATCATCA': 0,\n", + " 'GATGCAGCTGGCTATCATTA': 0,\n", + " 'TATGCAGCTGGCTATCATAA': 0,\n", + " 'TATGCAGCTGGCTATCATCA': 0,\n", + " 'TATGCAGCTGGCTATCATTA': 0,\n", + " 'AATGCAGCTGGCTATCATGC': 0,\n", + " 'AATGCAGCTGGCTATCATGG': 0,\n", + " 'AATGCAGCTGGCTATCATGT': 0,\n", + " 'CATGCAGCTGGCTATCATGC': 0,\n", + " 'CATGCAGCTGGCTATCATGG': 0,\n", + " 'CATGCAGCTGGCTATCATGT': 0,\n", + " 'GATGCAGCTGGCTATCATGC': 0,\n", + " 'GATGCAGCTGGCTATCATGG': 0,\n", + " 'GATGCAGCTGGCTATCATGT': 0,\n", + " 'TATGCAGCTGGCTATCATGC': 0,\n", + " 'TATGCAGCTGGCTATCATGG': 0,\n", + " 'TATGCAGCTGGCTATCATGT': 0,\n", + " 'GACTGCAGCTGGCTATCATGA': 0,\n", + " 'GAGTGCAGCTGGCTATCATGA': 0,\n", + " 'GATTGCAGCTGGCTATCATGA': 0,\n", + " 'GCCTGCAGCTGGCTATCATGA': 0,\n", + " 'GCGTGCAGCTGGCTATCATGA': 0,\n", + " 'GCTTGCAGCTGGCTATCATGA': 0,\n", + " 'GGCTGCAGCTGGCTATCATGA': 0,\n", + " 'GGGTGCAGCTGGCTATCATGA': 0,\n", + " 'GGTTGCAGCTGGCTATCATGA': 0,\n", + " 'GTCTGCAGCTGGCTATCATGA': 0,\n", + " 'GTGTGCAGCTGGCTATCATGA': 0,\n", + " 'GTTTGCAGCTGGCTATCATGA': 0,\n", + " 'GCAGCAGCTGGCTATCATGA': 0,\n", + " 'GCCGCAGCTGGCTATCATGA': 0,\n", + " 'GCGGCAGCTGGCTATCATGA': 0,\n", + " 'GGAGCAGCTGGCTATCATGA': 0,\n", + " 'GGCGCAGCTGGCTATCATGA': 0,\n", + " 'GGGGCAGCTGGCTATCATGA': 0,\n", + " 'GTAGCAGCTGGCTATCATGA': 0,\n", + " 'GTCGCAGCTGGCTATCATGA': 0,\n", + " 'GTGGCAGCTGGCTATCATGA': 0,\n", + " 'GCTACAGCTGGCTATCATGA': 0,\n", + " 'GCTCCAGCTGGCTATCATGA': 0,\n", + " 'GCTTCAGCTGGCTATCATGA': 0,\n", + " 'GGTACAGCTGGCTATCATGA': 0,\n", + " 'GGTCCAGCTGGCTATCATGA': 0,\n", + " 'GGTTCAGCTGGCTATCATGA': 0,\n", + " 'GTTACAGCTGGCTATCATGA': 0,\n", + " 'GTTCCAGCTGGCTATCATGA': 0,\n", + " 'GTTTCAGCTGGCTATCATGA': 0,\n", + " 'GCTGAAGCTGGCTATCATGA': 0,\n", + " 'GCTGGAGCTGGCTATCATGA': 0,\n", + " 'GCTGTAGCTGGCTATCATGA': 0,\n", + " 'GGTGAAGCTGGCTATCATGA': 0,\n", + " 'GGTGGAGCTGGCTATCATGA': 0,\n", + " 'GGTGTAGCTGGCTATCATGA': 
0,\n", + " 'GTTGAAGCTGGCTATCATGA': 0,\n", + " 'GTTGGAGCTGGCTATCATGA': 0,\n", + " 'GTTGTAGCTGGCTATCATGA': 0,\n", + " 'GCTGCCGCTGGCTATCATGA': 0,\n", + " 'GCTGCGGCTGGCTATCATGA': 0,\n", + " 'GCTGCTGCTGGCTATCATGA': 0,\n", + " 'GGTGCCGCTGGCTATCATGA': 0,\n", + " 'GGTGCGGCTGGCTATCATGA': 0,\n", + " 'GGTGCTGCTGGCTATCATGA': 0,\n", + " 'GTTGCCGCTGGCTATCATGA': 0,\n", + " 'GTTGCGGCTGGCTATCATGA': 0,\n", + " 'GTTGCTGCTGGCTATCATGA': 0,\n", + " 'GCTGCAACTGGCTATCATGA': 0,\n", + " 'GCTGCACCTGGCTATCATGA': 0,\n", + " 'GCTGCATCTGGCTATCATGA': 0,\n", + " 'GGTGCAACTGGCTATCATGA': 0,\n", + " 'GGTGCACCTGGCTATCATGA': 0,\n", + " 'GGTGCATCTGGCTATCATGA': 0,\n", + " 'GTTGCAACTGGCTATCATGA': 0,\n", + " 'GTTGCACCTGGCTATCATGA': 0,\n", + " 'GTTGCATCTGGCTATCATGA': 0,\n", + " 'GCTGCAGATGGCTATCATGA': 0,\n", + " 'GCTGCAGGTGGCTATCATGA': 0,\n", + " 'GCTGCAGTTGGCTATCATGA': 0,\n", + " 'GGTGCAGATGGCTATCATGA': 0,\n", + " 'GGTGCAGGTGGCTATCATGA': 0,\n", + " 'GGTGCAGTTGGCTATCATGA': 0,\n", + " 'GTTGCAGATGGCTATCATGA': 0,\n", + " 'GTTGCAGGTGGCTATCATGA': 0,\n", + " 'GTTGCAGTTGGCTATCATGA': 0,\n", + " 'GCTGCAGCAGGCTATCATGA': 0,\n", + " 'GCTGCAGCCGGCTATCATGA': 0,\n", + " 'GCTGCAGCGGGCTATCATGA': 0,\n", + " 'GGTGCAGCAGGCTATCATGA': 0,\n", + " 'GGTGCAGCCGGCTATCATGA': 0,\n", + " 'GGTGCAGCGGGCTATCATGA': 0,\n", + " 'GTTGCAGCAGGCTATCATGA': 0,\n", + " 'GTTGCAGCCGGCTATCATGA': 0,\n", + " 'GTTGCAGCGGGCTATCATGA': 0,\n", + " 'GCTGCAGCTAGCTATCATGA': 0,\n", + " 'GCTGCAGCTCGCTATCATGA': 0,\n", + " 'GCTGCAGCTTGCTATCATGA': 0,\n", + " 'GGTGCAGCTAGCTATCATGA': 0,\n", + " 'GGTGCAGCTCGCTATCATGA': 0,\n", + " 'GGTGCAGCTTGCTATCATGA': 0,\n", + " 'GTTGCAGCTAGCTATCATGA': 0,\n", + " 'GTTGCAGCTCGCTATCATGA': 0,\n", + " 'GTTGCAGCTTGCTATCATGA': 0,\n", + " 'GCTGCAGCTGACTATCATGA': 0,\n", + " 'GCTGCAGCTGCCTATCATGA': 0,\n", + " 'GCTGCAGCTGTCTATCATGA': 0,\n", + " 'GGTGCAGCTGACTATCATGA': 0,\n", + " 'GGTGCAGCTGCCTATCATGA': 0,\n", + " 'GGTGCAGCTGTCTATCATGA': 0,\n", + " 'GTTGCAGCTGACTATCATGA': 0,\n", + " 'GTTGCAGCTGCCTATCATGA': 0,\n", + " 'GTTGCAGCTGTCTATCATGA': 
0,\n", + " 'GCTGCAGCTGGATATCATGA': 0,\n", + " 'GCTGCAGCTGGGTATCATGA': 0,\n", + " 'GCTGCAGCTGGTTATCATGA': 0,\n", + " 'GGTGCAGCTGGATATCATGA': 0,\n", + " 'GGTGCAGCTGGGTATCATGA': 0,\n", + " 'GGTGCAGCTGGTTATCATGA': 0,\n", + " 'GTTGCAGCTGGATATCATGA': 0,\n", + " 'GTTGCAGCTGGGTATCATGA': 0,\n", + " 'GTTGCAGCTGGTTATCATGA': 0,\n", + " 'GCTGCAGCTGGCAATCATGA': 0,\n", + " 'GCTGCAGCTGGCCATCATGA': 0,\n", + " 'GCTGCAGCTGGCGATCATGA': 0,\n", + " 'GGTGCAGCTGGCAATCATGA': 0,\n", + " 'GGTGCAGCTGGCCATCATGA': 0,\n", + " 'GGTGCAGCTGGCGATCATGA': 0,\n", + " 'GTTGCAGCTGGCAATCATGA': 0,\n", + " 'GTTGCAGCTGGCCATCATGA': 0,\n", + " 'GTTGCAGCTGGCGATCATGA': 0,\n", + " 'GCTGCAGCTGGCTCTCATGA': 0,\n", + " 'GCTGCAGCTGGCTGTCATGA': 0,\n", + " 'GCTGCAGCTGGCTTTCATGA': 0,\n", + " 'GGTGCAGCTGGCTCTCATGA': 0,\n", + " 'GGTGCAGCTGGCTGTCATGA': 0,\n", + " 'GGTGCAGCTGGCTTTCATGA': 0,\n", + " 'GTTGCAGCTGGCTCTCATGA': 0,\n", + " 'GTTGCAGCTGGCTGTCATGA': 0,\n", + " 'GTTGCAGCTGGCTTTCATGA': 0,\n", + " 'GCTGCAGCTGGCTAACATGA': 0,\n", + " 'GCTGCAGCTGGCTACCATGA': 0,\n", + " 'GCTGCAGCTGGCTAGCATGA': 0,\n", + " 'GGTGCAGCTGGCTAACATGA': 0,\n", + " 'GGTGCAGCTGGCTACCATGA': 0,\n", + " 'GGTGCAGCTGGCTAGCATGA': 0,\n", + " 'GTTGCAGCTGGCTAACATGA': 0,\n", + " 'GTTGCAGCTGGCTACCATGA': 0,\n", + " 'GTTGCAGCTGGCTAGCATGA': 0,\n", + " 'GCTGCAGCTGGCTATAATGA': 0,\n", + " 'GCTGCAGCTGGCTATGATGA': 0,\n", + " 'GCTGCAGCTGGCTATTATGA': 0,\n", + " 'GGTGCAGCTGGCTATAATGA': 0,\n", + " 'GGTGCAGCTGGCTATGATGA': 0,\n", + " 'GGTGCAGCTGGCTATTATGA': 0,\n", + " 'GTTGCAGCTGGCTATAATGA': 0,\n", + " 'GTTGCAGCTGGCTATGATGA': 0,\n", + " 'GTTGCAGCTGGCTATTATGA': 0,\n", + " 'GCTGCAGCTGGCTATCCTGA': 0,\n", + " 'GCTGCAGCTGGCTATCGTGA': 0,\n", + " 'GCTGCAGCTGGCTATCTTGA': 0,\n", + " 'GGTGCAGCTGGCTATCCTGA': 0,\n", + " 'GGTGCAGCTGGCTATCGTGA': 0,\n", + " 'GGTGCAGCTGGCTATCTTGA': 0,\n", + " 'GTTGCAGCTGGCTATCCTGA': 0,\n", + " 'GTTGCAGCTGGCTATCGTGA': 0,\n", + " 'GTTGCAGCTGGCTATCTTGA': 0,\n", + " 'GCTGCAGCTGGCTATCAAGA': 0,\n", + " 'GCTGCAGCTGGCTATCACGA': 0,\n", + " 'GCTGCAGCTGGCTATCAGGA': 
0,\n", + " 'GGTGCAGCTGGCTATCAAGA': 0,\n", + " 'GGTGCAGCTGGCTATCACGA': 0,\n", + " 'GGTGCAGCTGGCTATCAGGA': 0,\n", + " 'GTTGCAGCTGGCTATCAAGA': 0,\n", + " 'GTTGCAGCTGGCTATCACGA': 0,\n", + " 'GTTGCAGCTGGCTATCAGGA': 0,\n", + " 'GCTGCAGCTGGCTATCATAA': 0,\n", + " 'GCTGCAGCTGGCTATCATCA': 0,\n", + " 'GCTGCAGCTGGCTATCATTA': 0,\n", + " 'GGTGCAGCTGGCTATCATAA': 0,\n", + " 'GGTGCAGCTGGCTATCATCA': 0,\n", + " 'GGTGCAGCTGGCTATCATTA': 0,\n", + " 'GTTGCAGCTGGCTATCATAA': 0,\n", + " 'GTTGCAGCTGGCTATCATCA': 0,\n", + " 'GTTGCAGCTGGCTATCATTA': 0,\n", + " 'GCTGCAGCTGGCTATCATGC': 0,\n", + " 'GCTGCAGCTGGCTATCATGG': 0,\n", + " 'GCTGCAGCTGGCTATCATGT': 0,\n", + " 'GGTGCAGCTGGCTATCATGC': 0,\n", + " 'GGTGCAGCTGGCTATCATGG': 0,\n", + " 'GGTGCAGCTGGCTATCATGT': 0,\n", + " 'GTTGCAGCTGGCTATCATGC': 0,\n", + " 'GTTGCAGCTGGCTATCATGG': 0,\n", + " 'GTTGCAGCTGGCTATCATGT': 0,\n", + " 'GAAAGCAGCTGGCTATCATGA': 0,\n", + " 'GAACGCAGCTGGCTATCATGA': 0,\n", + " 'GAAGGCAGCTGGCTATCATGA': 0,\n", + " 'GACAGCAGCTGGCTATCATGA': 0,\n", + " 'GACCGCAGCTGGCTATCATGA': 0,\n", + " 'GACGGCAGCTGGCTATCATGA': 0,\n", + " 'GAGAGCAGCTGGCTATCATGA': 0,\n", + " 'GAGCGCAGCTGGCTATCATGA': 0,\n", + " 'GAGGGCAGCTGGCTATCATGA': 0,\n", + " 'GATAGCAGCTGGCTATCATGA': 0,\n", + " 'GATCGCAGCTGGCTATCATGA': 0,\n", + " 'GATGGCAGCTGGCTATCATGA': 0,\n", + " 'GAAACAGCTGGCTATCATGA': 0,\n", + " 'GAACCAGCTGGCTATCATGA': 0,\n", + " 'GAATCAGCTGGCTATCATGA': 0,\n", + " 'GACACAGCTGGCTATCATGA': 0,\n", + " 'GACCCAGCTGGCTATCATGA': 0,\n", + " 'GACTCAGCTGGCTATCATGA': 0,\n", + " 'GAGACAGCTGGCTATCATGA': 0,\n", + " 'GAGCCAGCTGGCTATCATGA': 0,\n", + " 'GAGTCAGCTGGCTATCATGA': 0,\n", + " 'GAAGAAGCTGGCTATCATGA': 0,\n", + " 'GAAGGAGCTGGCTATCATGA': 0,\n", + " 'GAAGTAGCTGGCTATCATGA': 0,\n", + " 'GACGAAGCTGGCTATCATGA': 0,\n", + " 'GACGGAGCTGGCTATCATGA': 0,\n", + " 'GACGTAGCTGGCTATCATGA': 0,\n", + " 'GAGGAAGCTGGCTATCATGA': 0,\n", + " 'GAGGGAGCTGGCTATCATGA': 0,\n", + " 'GAGGTAGCTGGCTATCATGA': 0,\n", + " 'GAAGCCGCTGGCTATCATGA': 0,\n", + " 'GAAGCGGCTGGCTATCATGA': 0,\n", + " 
'GAAGCTGCTGGCTATCATGA': 0,\n", + " 'GACGCCGCTGGCTATCATGA': 0,\n", + " 'GACGCGGCTGGCTATCATGA': 0,\n", + " 'GACGCTGCTGGCTATCATGA': 0,\n", + " 'GAGGCCGCTGGCTATCATGA': 0,\n", + " 'GAGGCGGCTGGCTATCATGA': 0,\n", + " 'GAGGCTGCTGGCTATCATGA': 0,\n", + " 'GAAGCAACTGGCTATCATGA': 0,\n", + " 'GAAGCACCTGGCTATCATGA': 0,\n", + " 'GAAGCATCTGGCTATCATGA': 0,\n", + " 'GACGCAACTGGCTATCATGA': 0,\n", + " 'GACGCACCTGGCTATCATGA': 0,\n", + " 'GACGCATCTGGCTATCATGA': 0,\n", + " 'GAGGCAACTGGCTATCATGA': 0,\n", + " 'GAGGCACCTGGCTATCATGA': 0,\n", + " 'GAGGCATCTGGCTATCATGA': 0,\n", + " 'GAAGCAGATGGCTATCATGA': 0,\n", + " 'GAAGCAGGTGGCTATCATGA': 0,\n", + " 'GAAGCAGTTGGCTATCATGA': 0,\n", + " 'GACGCAGATGGCTATCATGA': 0,\n", + " 'GACGCAGGTGGCTATCATGA': 0,\n", + " 'GACGCAGTTGGCTATCATGA': 0,\n", + " 'GAGGCAGATGGCTATCATGA': 0,\n", + " 'GAGGCAGGTGGCTATCATGA': 0,\n", + " 'GAGGCAGTTGGCTATCATGA': 0,\n", + " 'GAAGCAGCAGGCTATCATGA': 0,\n", + " 'GAAGCAGCCGGCTATCATGA': 0,\n", + " 'GAAGCAGCGGGCTATCATGA': 0,\n", + " 'GACGCAGCAGGCTATCATGA': 0,\n", + " 'GACGCAGCCGGCTATCATGA': 0,\n", + " 'GACGCAGCGGGCTATCATGA': 0,\n", + " 'GAGGCAGCAGGCTATCATGA': 0,\n", + " 'GAGGCAGCCGGCTATCATGA': 0,\n", + " 'GAGGCAGCGGGCTATCATGA': 0,\n", + " 'GAAGCAGCTAGCTATCATGA': 0,\n", + " 'GAAGCAGCTCGCTATCATGA': 0,\n", + " 'GAAGCAGCTTGCTATCATGA': 0,\n", + " 'GACGCAGCTAGCTATCATGA': 0,\n", + " 'GACGCAGCTCGCTATCATGA': 0,\n", + " 'GACGCAGCTTGCTATCATGA': 0,\n", + " 'GAGGCAGCTAGCTATCATGA': 0,\n", + " 'GAGGCAGCTCGCTATCATGA': 0,\n", + " 'GAGGCAGCTTGCTATCATGA': 0,\n", + " 'GAAGCAGCTGACTATCATGA': 0,\n", + " 'GAAGCAGCTGCCTATCATGA': 0,\n", + " 'GAAGCAGCTGTCTATCATGA': 0,\n", + " 'GACGCAGCTGACTATCATGA': 0,\n", + " 'GACGCAGCTGCCTATCATGA': 0,\n", + " 'GACGCAGCTGTCTATCATGA': 0,\n", + " 'GAGGCAGCTGACTATCATGA': 0,\n", + " 'GAGGCAGCTGCCTATCATGA': 0,\n", + " 'GAGGCAGCTGTCTATCATGA': 0,\n", + " 'GAAGCAGCTGGATATCATGA': 0,\n", + " 'GAAGCAGCTGGGTATCATGA': 0,\n", + " 'GAAGCAGCTGGTTATCATGA': 0,\n", + " 'GACGCAGCTGGATATCATGA': 0,\n", + " 'GACGCAGCTGGGTATCATGA': 0,\n", + " 
'GACGCAGCTGGTTATCATGA': 0,\n", + " 'GAGGCAGCTGGATATCATGA': 0,\n", + " 'GAGGCAGCTGGGTATCATGA': 0,\n", + " 'GAGGCAGCTGGTTATCATGA': 0,\n", + " 'GAAGCAGCTGGCAATCATGA': 0,\n", + " 'GAAGCAGCTGGCCATCATGA': 0,\n", + " 'GAAGCAGCTGGCGATCATGA': 0,\n", + " 'GACGCAGCTGGCAATCATGA': 0,\n", + " 'GACGCAGCTGGCCATCATGA': 0,\n", + " 'GACGCAGCTGGCGATCATGA': 0,\n", + " 'GAGGCAGCTGGCAATCATGA': 0,\n", + " 'GAGGCAGCTGGCCATCATGA': 0,\n", + " 'GAGGCAGCTGGCGATCATGA': 0,\n", + " 'GAAGCAGCTGGCTCTCATGA': 0,\n", + " 'GAAGCAGCTGGCTGTCATGA': 0,\n", + " 'GAAGCAGCTGGCTTTCATGA': 0,\n", + " 'GACGCAGCTGGCTCTCATGA': 0,\n", + " 'GACGCAGCTGGCTGTCATGA': 0,\n", + " 'GACGCAGCTGGCTTTCATGA': 0,\n", + " 'GAGGCAGCTGGCTCTCATGA': 0,\n", + " 'GAGGCAGCTGGCTGTCATGA': 0,\n", + " 'GAGGCAGCTGGCTTTCATGA': 0,\n", + " 'GAAGCAGCTGGCTAACATGA': 0,\n", + " 'GAAGCAGCTGGCTACCATGA': 0,\n", + " 'GAAGCAGCTGGCTAGCATGA': 0,\n", + " 'GACGCAGCTGGCTAACATGA': 0,\n", + " 'GACGCAGCTGGCTACCATGA': 0,\n", + " 'GACGCAGCTGGCTAGCATGA': 0,\n", + " 'GAGGCAGCTGGCTAACATGA': 0,\n", + " 'GAGGCAGCTGGCTACCATGA': 0,\n", + " 'GAGGCAGCTGGCTAGCATGA': 0,\n", + " 'GAAGCAGCTGGCTATAATGA': 0,\n", + " 'GAAGCAGCTGGCTATGATGA': 0,\n", + " 'GAAGCAGCTGGCTATTATGA': 0,\n", + " 'GACGCAGCTGGCTATAATGA': 0,\n", + " 'GACGCAGCTGGCTATGATGA': 0,\n", + " 'GACGCAGCTGGCTATTATGA': 0,\n", + " 'GAGGCAGCTGGCTATAATGA': 0,\n", + " 'GAGGCAGCTGGCTATGATGA': 0,\n", + " 'GAGGCAGCTGGCTATTATGA': 0,\n", + " 'GAAGCAGCTGGCTATCCTGA': 0,\n", + " 'GAAGCAGCTGGCTATCGTGA': 0,\n", + " 'GAAGCAGCTGGCTATCTTGA': 0,\n", + " 'GACGCAGCTGGCTATCCTGA': 0,\n", + " 'GACGCAGCTGGCTATCGTGA': 0,\n", + " 'GACGCAGCTGGCTATCTTGA': 0,\n", + " 'GAGGCAGCTGGCTATCCTGA': 0,\n", + " 'GAGGCAGCTGGCTATCGTGA': 0,\n", + " 'GAGGCAGCTGGCTATCTTGA': 0,\n", + " 'GAAGCAGCTGGCTATCAAGA': 0,\n", + " 'GAAGCAGCTGGCTATCACGA': 0,\n", + " 'GAAGCAGCTGGCTATCAGGA': 0,\n", + " 'GACGCAGCTGGCTATCAAGA': 0,\n", + " 'GACGCAGCTGGCTATCACGA': 0,\n", + " 'GACGCAGCTGGCTATCAGGA': 0,\n", + " 'GAGGCAGCTGGCTATCAAGA': 0,\n", + " 'GAGGCAGCTGGCTATCACGA': 0,\n", + " 
'GAGGCAGCTGGCTATCAGGA': 0,\n", + " 'GAAGCAGCTGGCTATCATAA': 0,\n", + " 'GAAGCAGCTGGCTATCATCA': 0,\n", + " 'GAAGCAGCTGGCTATCATTA': 0,\n", + " 'GACGCAGCTGGCTATCATAA': 0,\n", + " 'GACGCAGCTGGCTATCATCA': 0,\n", + " 'GACGCAGCTGGCTATCATTA': 0,\n", + " 'GAGGCAGCTGGCTATCATAA': 0,\n", + " 'GAGGCAGCTGGCTATCATCA': 0,\n", + " 'GAGGCAGCTGGCTATCATTA': 0,\n", + " 'GAAGCAGCTGGCTATCATGC': 0,\n", + " 'GAAGCAGCTGGCTATCATGG': 0,\n", + " 'GAAGCAGCTGGCTATCATGT': 0,\n", + " 'GACGCAGCTGGCTATCATGC': 0,\n", + " 'GACGCAGCTGGCTATCATGG': 0,\n", + " 'GACGCAGCTGGCTATCATGT': 0,\n", + " 'GAGGCAGCTGGCTATCATGC': 0,\n", + " 'GAGGCAGCTGGCTATCATGG': 0,\n", + " 'GAGGCAGCTGGCTATCATGT': 0,\n", + " 'GATAACAGCTGGCTATCATGA': 0,\n", + " 'GATACCAGCTGGCTATCATGA': 0,\n", + " 'GATATCAGCTGGCTATCATGA': 0,\n", + " 'GATCACAGCTGGCTATCATGA': 0,\n", + " 'GATCCCAGCTGGCTATCATGA': 0,\n", + " 'GATCTCAGCTGGCTATCATGA': 0,\n", + " 'GATGACAGCTGGCTATCATGA': 0,\n", + " 'GATGCCAGCTGGCTATCATGA': 0,\n", + " 'GATGTCAGCTGGCTATCATGA': 0,\n", + " 'GATTACAGCTGGCTATCATGA': 0,\n", + " 'GATTCCAGCTGGCTATCATGA': 0,\n", + " 'GATTTCAGCTGGCTATCATGA': 0,\n", + " 'GATAAAGCTGGCTATCATGA': 0,\n", + " 'GATAGAGCTGGCTATCATGA': 0,\n", + " 'GATATAGCTGGCTATCATGA': 0,\n", + " 'GATCAAGCTGGCTATCATGA': 0,\n", + " 'GATCGAGCTGGCTATCATGA': 0,\n", + " 'GATCTAGCTGGCTATCATGA': 0,\n", + " 'GATTAAGCTGGCTATCATGA': 0,\n", + " 'GATTGAGCTGGCTATCATGA': 0,\n", + " 'GATTTAGCTGGCTATCATGA': 0,\n", + " 'GATACCGCTGGCTATCATGA': 0,\n", + " 'GATACGGCTGGCTATCATGA': 0,\n", + " 'GATACTGCTGGCTATCATGA': 0,\n", + " 'GATCCCGCTGGCTATCATGA': 0,\n", + " 'GATCCGGCTGGCTATCATGA': 0,\n", + " 'GATCCTGCTGGCTATCATGA': 0,\n", + " 'GATTCCGCTGGCTATCATGA': 0,\n", + " 'GATTCGGCTGGCTATCATGA': 0,\n", + " 'GATTCTGCTGGCTATCATGA': 0,\n", + " 'GATACAACTGGCTATCATGA': 0,\n", + " 'GATACACCTGGCTATCATGA': 0,\n", + " 'GATACATCTGGCTATCATGA': 0,\n", + " 'GATCCAACTGGCTATCATGA': 0,\n", + " 'GATCCACCTGGCTATCATGA': 0,\n", + " 'GATCCATCTGGCTATCATGA': 0,\n", + " 'GATTCAACTGGCTATCATGA': 0,\n", + " 'GATTCACCTGGCTATCATGA': 
0,\n", + " 'GATTCATCTGGCTATCATGA': 0,\n", + " 'GATACAGATGGCTATCATGA': 0,\n", + " 'GATACAGGTGGCTATCATGA': 0,\n", + " 'GATACAGTTGGCTATCATGA': 0,\n", + " 'GATCCAGATGGCTATCATGA': 0,\n", + " 'GATCCAGGTGGCTATCATGA': 0,\n", + " 'GATCCAGTTGGCTATCATGA': 0,\n", + " 'GATTCAGATGGCTATCATGA': 0,\n", + " 'GATTCAGGTGGCTATCATGA': 0,\n", + " 'GATTCAGTTGGCTATCATGA': 0,\n", + " 'GATACAGCAGGCTATCATGA': 0,\n", + " 'GATACAGCCGGCTATCATGA': 0,\n", + " 'GATACAGCGGGCTATCATGA': 0,\n", + " 'GATCCAGCAGGCTATCATGA': 0,\n", + " 'GATCCAGCCGGCTATCATGA': 0,\n", + " 'GATCCAGCGGGCTATCATGA': 0,\n", + " 'GATTCAGCAGGCTATCATGA': 0,\n", + " 'GATTCAGCCGGCTATCATGA': 0,\n", + " 'GATTCAGCGGGCTATCATGA': 0,\n", + " 'GATACAGCTAGCTATCATGA': 0,\n", + " 'GATACAGCTCGCTATCATGA': 0,\n", + " 'GATACAGCTTGCTATCATGA': 0,\n", + " 'GATCCAGCTAGCTATCATGA': 0,\n", + " 'GATCCAGCTCGCTATCATGA': 0,\n", + " 'GATCCAGCTTGCTATCATGA': 0,\n", + " 'GATTCAGCTAGCTATCATGA': 0,\n", + " 'GATTCAGCTCGCTATCATGA': 0,\n", + " 'GATTCAGCTTGCTATCATGA': 0,\n", + " 'GATACAGCTGACTATCATGA': 0,\n", + " 'GATACAGCTGCCTATCATGA': 0,\n", + " 'GATACAGCTGTCTATCATGA': 0,\n", + " 'GATCCAGCTGACTATCATGA': 0,\n", + " 'GATCCAGCTGCCTATCATGA': 0,\n", + " 'GATCCAGCTGTCTATCATGA': 0,\n", + " 'GATTCAGCTGACTATCATGA': 0,\n", + " 'GATTCAGCTGCCTATCATGA': 0,\n", + " 'GATTCAGCTGTCTATCATGA': 0,\n", + " 'GATACAGCTGGATATCATGA': 0,\n", + " 'GATACAGCTGGGTATCATGA': 0,\n", + " 'GATACAGCTGGTTATCATGA': 0,\n", + " 'GATCCAGCTGGATATCATGA': 0,\n", + " 'GATCCAGCTGGGTATCATGA': 0,\n", + " 'GATCCAGCTGGTTATCATGA': 0,\n", + " 'GATTCAGCTGGATATCATGA': 0,\n", + " 'GATTCAGCTGGGTATCATGA': 0,\n", + " 'GATTCAGCTGGTTATCATGA': 0,\n", + " 'GATACAGCTGGCAATCATGA': 0,\n", + " 'GATACAGCTGGCCATCATGA': 0,\n", + " 'GATACAGCTGGCGATCATGA': 0,\n", + " 'GATCCAGCTGGCAATCATGA': 0,\n", + " 'GATCCAGCTGGCCATCATGA': 0,\n", + " 'GATCCAGCTGGCGATCATGA': 0,\n", + " 'GATTCAGCTGGCAATCATGA': 0,\n", + " 'GATTCAGCTGGCCATCATGA': 0,\n", + " 'GATTCAGCTGGCGATCATGA': 0,\n", + " 'GATACAGCTGGCTCTCATGA': 0,\n", + " 'GATACAGCTGGCTGTCATGA': 
0,\n", + " 'GATACAGCTGGCTTTCATGA': 0,\n", + " 'GATCCAGCTGGCTCTCATGA': 0,\n", + " 'GATCCAGCTGGCTGTCATGA': 0,\n", + " 'GATCCAGCTGGCTTTCATGA': 0,\n", + " 'GATTCAGCTGGCTCTCATGA': 0,\n", + " 'GATTCAGCTGGCTGTCATGA': 0,\n", + " 'GATTCAGCTGGCTTTCATGA': 0,\n", + " 'GATACAGCTGGCTAACATGA': 0,\n", + " 'GATACAGCTGGCTACCATGA': 0,\n", + " 'GATACAGCTGGCTAGCATGA': 0,\n", + " 'GATCCAGCTGGCTAACATGA': 0,\n", + " 'GATCCAGCTGGCTACCATGA': 0,\n", + " 'GATCCAGCTGGCTAGCATGA': 0,\n", + " 'GATTCAGCTGGCTAACATGA': 0,\n", + " 'GATTCAGCTGGCTACCATGA': 0,\n", + " 'GATTCAGCTGGCTAGCATGA': 0,\n", + " 'GATACAGCTGGCTATAATGA': 0,\n", + " 'GATACAGCTGGCTATGATGA': 0,\n", + " 'GATACAGCTGGCTATTATGA': 0,\n", + " 'GATCCAGCTGGCTATAATGA': 0,\n", + " 'GATCCAGCTGGCTATGATGA': 0,\n", + " 'GATCCAGCTGGCTATTATGA': 0,\n", + " 'GATTCAGCTGGCTATAATGA': 0,\n", + " 'GATTCAGCTGGCTATGATGA': 0,\n", + " 'GATTCAGCTGGCTATTATGA': 0,\n", + " 'GATACAGCTGGCTATCCTGA': 0,\n", + " 'GATACAGCTGGCTATCGTGA': 0,\n", + " 'GATACAGCTGGCTATCTTGA': 0,\n", + " 'GATCCAGCTGGCTATCCTGA': 0,\n", + " 'GATCCAGCTGGCTATCGTGA': 0,\n", + " 'GATCCAGCTGGCTATCTTGA': 0,\n", + " 'GATTCAGCTGGCTATCCTGA': 0,\n", + " 'GATTCAGCTGGCTATCGTGA': 0,\n", + " 'GATTCAGCTGGCTATCTTGA': 0,\n", + " 'GATACAGCTGGCTATCAAGA': 0,\n", + " 'GATACAGCTGGCTATCACGA': 0,\n", + " 'GATACAGCTGGCTATCAGGA': 0,\n", + " 'GATCCAGCTGGCTATCAAGA': 0,\n", + " 'GATCCAGCTGGCTATCACGA': 0,\n", + " 'GATCCAGCTGGCTATCAGGA': 0,\n", + " 'GATTCAGCTGGCTATCAAGA': 0,\n", + " 'GATTCAGCTGGCTATCACGA': 0,\n", + " 'GATTCAGCTGGCTATCAGGA': 0,\n", + " 'GATACAGCTGGCTATCATAA': 0,\n", + " 'GATACAGCTGGCTATCATCA': 0,\n", + " 'GATACAGCTGGCTATCATTA': 0,\n", + " 'GATCCAGCTGGCTATCATAA': 0,\n", + " 'GATCCAGCTGGCTATCATCA': 0,\n", + " 'GATCCAGCTGGCTATCATTA': 0,\n", + " 'GATTCAGCTGGCTATCATAA': 0,\n", + " 'GATTCAGCTGGCTATCATCA': 0,\n", + " 'GATTCAGCTGGCTATCATTA': 0,\n", + " 'GATACAGCTGGCTATCATGC': 0,\n", + " 'GATACAGCTGGCTATCATGG': 0,\n", + " 'GATACAGCTGGCTATCATGT': 0,\n", + " 'GATCCAGCTGGCTATCATGC': 0,\n", + " 'GATCCAGCTGGCTATCATGG': 
0,\n", + " 'GATCCAGCTGGCTATCATGT': 0,\n", + " 'GATTCAGCTGGCTATCATGC': 0,\n", + " 'GATTCAGCTGGCTATCATGG': 0,\n", + " 'GATTCAGCTGGCTATCATGT': 0,\n", + " 'GATGAAAGCTGGCTATCATGA': 0,\n", + " 'GATGAGAGCTGGCTATCATGA': 0,\n", + " 'GATGATAGCTGGCTATCATGA': 0,\n", + " 'GATGCAAGCTGGCTATCATGA': 0,\n", + " 'GATGCGAGCTGGCTATCATGA': 0,\n", + " 'GATGCTAGCTGGCTATCATGA': 0,\n", + " 'GATGGAAGCTGGCTATCATGA': 0,\n", + " 'GATGGGAGCTGGCTATCATGA': 0,\n", + " 'GATGGTAGCTGGCTATCATGA': 0,\n", + " 'GATGTAAGCTGGCTATCATGA': 0,\n", + " 'GATGTGAGCTGGCTATCATGA': 0,\n", + " 'GATGTTAGCTGGCTATCATGA': 0,\n", + " 'GATGACGCTGGCTATCATGA': 0,\n", + " 'GATGAGGCTGGCTATCATGA': 0,\n", + " 'GATGATGCTGGCTATCATGA': 0,\n", + " 'GATGGCGCTGGCTATCATGA': 0,\n", + " 'GATGGGGCTGGCTATCATGA': 0,\n", + " 'GATGGTGCTGGCTATCATGA': 0,\n", + " 'GATGTCGCTGGCTATCATGA': 0,\n", + " 'GATGTGGCTGGCTATCATGA': 0,\n", + " 'GATGTTGCTGGCTATCATGA': 0,\n", + " 'GATGAAACTGGCTATCATGA': 0,\n", + " 'GATGAACCTGGCTATCATGA': 0,\n", + " 'GATGAATCTGGCTATCATGA': 0,\n", + " 'GATGGAACTGGCTATCATGA': 0,\n", + " 'GATGGACCTGGCTATCATGA': 0,\n", + " 'GATGGATCTGGCTATCATGA': 0,\n", + " 'GATGTAACTGGCTATCATGA': 0,\n", + " 'GATGTACCTGGCTATCATGA': 0,\n", + " 'GATGTATCTGGCTATCATGA': 0,\n", + " 'GATGAAGATGGCTATCATGA': 0,\n", + " 'GATGAAGGTGGCTATCATGA': 0,\n", + " 'GATGAAGTTGGCTATCATGA': 0,\n", + " 'GATGGAGATGGCTATCATGA': 0,\n", + " 'GATGGAGGTGGCTATCATGA': 0,\n", + " 'GATGGAGTTGGCTATCATGA': 0,\n", + " 'GATGTAGATGGCTATCATGA': 0,\n", + " 'GATGTAGGTGGCTATCATGA': 0,\n", + " 'GATGTAGTTGGCTATCATGA': 0,\n", + " 'GATGAAGCAGGCTATCATGA': 0,\n", + " 'GATGAAGCCGGCTATCATGA': 0,\n", + " 'GATGAAGCGGGCTATCATGA': 0,\n", + " 'GATGGAGCAGGCTATCATGA': 0,\n", + " 'GATGGAGCCGGCTATCATGA': 0,\n", + " 'GATGGAGCGGGCTATCATGA': 0,\n", + " 'GATGTAGCAGGCTATCATGA': 0,\n", + " 'GATGTAGCCGGCTATCATGA': 0,\n", + " 'GATGTAGCGGGCTATCATGA': 0,\n", + " 'GATGAAGCTAGCTATCATGA': 0,\n", + " 'GATGAAGCTCGCTATCATGA': 0,\n", + " 'GATGAAGCTTGCTATCATGA': 0,\n", + " 'GATGGAGCTAGCTATCATGA': 0,\n", + " 
'GATGGAGCTCGCTATCATGA': 0,\n", + " 'GATGGAGCTTGCTATCATGA': 0,\n", + " 'GATGTAGCTAGCTATCATGA': 0,\n", + " 'GATGTAGCTCGCTATCATGA': 0,\n", + " 'GATGTAGCTTGCTATCATGA': 0,\n", + " 'GATGAAGCTGACTATCATGA': 0,\n", + " 'GATGAAGCTGCCTATCATGA': 0,\n", + " 'GATGAAGCTGTCTATCATGA': 0,\n", + " 'GATGGAGCTGACTATCATGA': 0,\n", + " 'GATGGAGCTGCCTATCATGA': 0,\n", + " 'GATGGAGCTGTCTATCATGA': 0,\n", + " 'GATGTAGCTGACTATCATGA': 0,\n", + " 'GATGTAGCTGCCTATCATGA': 0,\n", + " 'GATGTAGCTGTCTATCATGA': 0,\n", + " 'GATGAAGCTGGATATCATGA': 0,\n", + " 'GATGAAGCTGGGTATCATGA': 0,\n", + " 'GATGAAGCTGGTTATCATGA': 0,\n", + " 'GATGGAGCTGGATATCATGA': 0,\n", + " 'GATGGAGCTGGGTATCATGA': 0,\n", + " 'GATGGAGCTGGTTATCATGA': 0,\n", + " 'GATGTAGCTGGATATCATGA': 0,\n", + " 'GATGTAGCTGGGTATCATGA': 0,\n", + " 'GATGTAGCTGGTTATCATGA': 0,\n", + " 'GATGAAGCTGGCAATCATGA': 0,\n", + " 'GATGAAGCTGGCCATCATGA': 0,\n", + " 'GATGAAGCTGGCGATCATGA': 0,\n", + " 'GATGGAGCTGGCAATCATGA': 0,\n", + " 'GATGGAGCTGGCCATCATGA': 0,\n", + " 'GATGGAGCTGGCGATCATGA': 0,\n", + " 'GATGTAGCTGGCAATCATGA': 0,\n", + " 'GATGTAGCTGGCCATCATGA': 0,\n", + " 'GATGTAGCTGGCGATCATGA': 0,\n", + " 'GATGAAGCTGGCTCTCATGA': 0,\n", + " 'GATGAAGCTGGCTGTCATGA': 0,\n", + " 'GATGAAGCTGGCTTTCATGA': 0,\n", + " 'GATGGAGCTGGCTCTCATGA': 0,\n", + " 'GATGGAGCTGGCTGTCATGA': 0,\n", + " 'GATGGAGCTGGCTTTCATGA': 0,\n", + " 'GATGTAGCTGGCTCTCATGA': 0,\n", + " 'GATGTAGCTGGCTGTCATGA': 0,\n", + " 'GATGTAGCTGGCTTTCATGA': 0,\n", + " 'GATGAAGCTGGCTAACATGA': 0,\n", + " 'GATGAAGCTGGCTACCATGA': 0,\n", + " 'GATGAAGCTGGCTAGCATGA': 0,\n", + " 'GATGGAGCTGGCTAACATGA': 0,\n", + " 'GATGGAGCTGGCTACCATGA': 0,\n", + " 'GATGGAGCTGGCTAGCATGA': 0,\n", + " 'GATGTAGCTGGCTAACATGA': 0,\n", + " 'GATGTAGCTGGCTACCATGA': 0,\n", + " 'GATGTAGCTGGCTAGCATGA': 0,\n", + " 'GATGAAGCTGGCTATAATGA': 0,\n", + " 'GATGAAGCTGGCTATGATGA': 0,\n", + " 'GATGAAGCTGGCTATTATGA': 0,\n", + " 'GATGGAGCTGGCTATAATGA': 0,\n", + " 'GATGGAGCTGGCTATGATGA': 0,\n", + " 'GATGGAGCTGGCTATTATGA': 0,\n", + " 'GATGTAGCTGGCTATAATGA': 0,\n", + " 
'GATGTAGCTGGCTATGATGA': 0,\n", + " 'GATGTAGCTGGCTATTATGA': 0,\n", + " 'GATGAAGCTGGCTATCCTGA': 0,\n", + " 'GATGAAGCTGGCTATCGTGA': 0,\n", + " 'GATGAAGCTGGCTATCTTGA': 0,\n", + " 'GATGGAGCTGGCTATCCTGA': 0,\n", + " 'GATGGAGCTGGCTATCGTGA': 0,\n", + " 'GATGGAGCTGGCTATCTTGA': 0,\n", + " 'GATGTAGCTGGCTATCCTGA': 0,\n", + " 'GATGTAGCTGGCTATCGTGA': 0,\n", + " 'GATGTAGCTGGCTATCTTGA': 0,\n", + " 'GATGAAGCTGGCTATCAAGA': 0,\n", + " 'GATGAAGCTGGCTATCACGA': 0,\n", + " 'GATGAAGCTGGCTATCAGGA': 0,\n", + " 'GATGGAGCTGGCTATCAAGA': 0,\n", + " 'GATGGAGCTGGCTATCACGA': 0,\n", + " 'GATGGAGCTGGCTATCAGGA': 0,\n", + " 'GATGTAGCTGGCTATCAAGA': 0,\n", + " 'GATGTAGCTGGCTATCACGA': 0,\n", + " 'GATGTAGCTGGCTATCAGGA': 0,\n", + " 'GATGAAGCTGGCTATCATAA': 0,\n", + " 'GATGAAGCTGGCTATCATCA': 0,\n", + " 'GATGAAGCTGGCTATCATTA': 0,\n", + " 'GATGGAGCTGGCTATCATAA': 0,\n", + " 'GATGGAGCTGGCTATCATCA': 0,\n", + " 'GATGGAGCTGGCTATCATTA': 0,\n", + " 'GATGTAGCTGGCTATCATAA': 0,\n", + " 'GATGTAGCTGGCTATCATCA': 0,\n", + " 'GATGTAGCTGGCTATCATTA': 0,\n", + " 'GATGAAGCTGGCTATCATGC': 0,\n", + " 'GATGAAGCTGGCTATCATGG': 0,\n", + " 'GATGAAGCTGGCTATCATGT': 0,\n", + " 'GATGGAGCTGGCTATCATGC': 0,\n", + " 'GATGGAGCTGGCTATCATGG': 0,\n", + " 'GATGGAGCTGGCTATCATGT': 0,\n", + " 'GATGTAGCTGGCTATCATGC': 0,\n", + " 'GATGTAGCTGGCTATCATGG': 0,\n", + " 'GATGTAGCTGGCTATCATGT': 0,\n", + " 'GATGCACGCTGGCTATCATGA': 0,\n", + " 'GATGCAGGCTGGCTATCATGA': 0,\n", + " 'GATGCATGCTGGCTATCATGA': 0,\n", + " 'GATGCCCGCTGGCTATCATGA': 0,\n", + " 'GATGCCGGCTGGCTATCATGA': 0,\n", + " 'GATGCCTGCTGGCTATCATGA': 0,\n", + " 'GATGCGCGCTGGCTATCATGA': 0,\n", + " 'GATGCGGGCTGGCTATCATGA': 0,\n", + " 'GATGCGTGCTGGCTATCATGA': 0,\n", + " 'GATGCTCGCTGGCTATCATGA': 0,\n", + " 'GATGCTGGCTGGCTATCATGA': 0,\n", + " 'GATGCTTGCTGGCTATCATGA': 0,\n", + " 'GATGCCACTGGCTATCATGA': 0,\n", + " 'GATGCCCCTGGCTATCATGA': 0,\n", + " 'GATGCCTCTGGCTATCATGA': 0,\n", + " 'GATGCGACTGGCTATCATGA': 0,\n", + " 'GATGCGCCTGGCTATCATGA': 0,\n", + " 'GATGCGTCTGGCTATCATGA': 0,\n", + " 'GATGCTACTGGCTATCATGA': 
0,\n", + " 'GATGCTCCTGGCTATCATGA': 0,\n", + " 'GATGCTTCTGGCTATCATGA': 0,\n", + " 'GATGCCGATGGCTATCATGA': 0,\n", + " 'GATGCCGGTGGCTATCATGA': 0,\n", + " 'GATGCCGTTGGCTATCATGA': 0,\n", + " 'GATGCGGATGGCTATCATGA': 0,\n", + " 'GATGCGGGTGGCTATCATGA': 0,\n", + " 'GATGCGGTTGGCTATCATGA': 0,\n", + " 'GATGCTGATGGCTATCATGA': 0,\n", + " 'GATGCTGGTGGCTATCATGA': 0,\n", + " 'GATGCTGTTGGCTATCATGA': 0,\n", + " 'GATGCCGCAGGCTATCATGA': 0,\n", + " 'GATGCCGCCGGCTATCATGA': 0,\n", + " 'GATGCCGCGGGCTATCATGA': 0,\n", + " 'GATGCGGCAGGCTATCATGA': 0,\n", + " 'GATGCGGCCGGCTATCATGA': 0,\n", + " 'GATGCGGCGGGCTATCATGA': 0,\n", + " 'GATGCTGCAGGCTATCATGA': 0,\n", + " 'GATGCTGCCGGCTATCATGA': 0,\n", + " 'GATGCTGCGGGCTATCATGA': 0,\n", + " 'GATGCCGCTAGCTATCATGA': 0,\n", + " 'GATGCCGCTCGCTATCATGA': 0,\n", + " 'GATGCCGCTTGCTATCATGA': 0,\n", + " 'GATGCGGCTAGCTATCATGA': 0,\n", + " 'GATGCGGCTCGCTATCATGA': 0,\n", + " 'GATGCGGCTTGCTATCATGA': 0,\n", + " 'GATGCTGCTAGCTATCATGA': 0,\n", + " 'GATGCTGCTCGCTATCATGA': 0,\n", + " 'GATGCTGCTTGCTATCATGA': 0,\n", + " 'GATGCCGCTGACTATCATGA': 0,\n", + " 'GATGCCGCTGCCTATCATGA': 0,\n", + " 'GATGCCGCTGTCTATCATGA': 0,\n", + " 'GATGCGGCTGACTATCATGA': 0,\n", + " 'GATGCGGCTGCCTATCATGA': 0,\n", + " 'GATGCGGCTGTCTATCATGA': 0,\n", + " 'GATGCTGCTGACTATCATGA': 0,\n", + " 'GATGCTGCTGCCTATCATGA': 0,\n", + " 'GATGCTGCTGTCTATCATGA': 0,\n", + " 'GATGCCGCTGGATATCATGA': 0,\n", + " 'GATGCCGCTGGGTATCATGA': 0,\n", + " 'GATGCCGCTGGTTATCATGA': 0,\n", + " 'GATGCGGCTGGATATCATGA': 0,\n", + " 'GATGCGGCTGGGTATCATGA': 0,\n", + " 'GATGCGGCTGGTTATCATGA': 0,\n", + " 'GATGCTGCTGGATATCATGA': 0,\n", + " 'GATGCTGCTGGGTATCATGA': 0,\n", + " 'GATGCTGCTGGTTATCATGA': 0,\n", + " 'GATGCCGCTGGCAATCATGA': 0,\n", + " 'GATGCCGCTGGCCATCATGA': 0,\n", + " 'GATGCCGCTGGCGATCATGA': 0,\n", + " 'GATGCGGCTGGCAATCATGA': 0,\n", + " 'GATGCGGCTGGCCATCATGA': 0,\n", + " 'GATGCGGCTGGCGATCATGA': 0,\n", + " 'GATGCTGCTGGCAATCATGA': 0,\n", + " 'GATGCTGCTGGCCATCATGA': 0,\n", + " 'GATGCTGCTGGCGATCATGA': 0,\n", + " 'GATGCCGCTGGCTCTCATGA': 
0,\n", + " 'GATGCCGCTGGCTGTCATGA': 0,\n", + " 'GATGCCGCTGGCTTTCATGA': 0,\n", + " 'GATGCGGCTGGCTCTCATGA': 0,\n", + " 'GATGCGGCTGGCTGTCATGA': 0,\n", + " 'GATGCGGCTGGCTTTCATGA': 0,\n", + " 'GATGCTGCTGGCTCTCATGA': 0,\n", + " 'GATGCTGCTGGCTGTCATGA': 0,\n", + " 'GATGCTGCTGGCTTTCATGA': 0,\n", + " 'GATGCCGCTGGCTAACATGA': 0,\n", + " 'GATGCCGCTGGCTACCATGA': 0,\n", + " 'GATGCCGCTGGCTAGCATGA': 0,\n", + " 'GATGCGGCTGGCTAACATGA': 0,\n", + " 'GATGCGGCTGGCTACCATGA': 0,\n", + " 'GATGCGGCTGGCTAGCATGA': 0,\n", + " 'GATGCTGCTGGCTAACATGA': 0,\n", + " 'GATGCTGCTGGCTACCATGA': 0,\n", + " 'GATGCTGCTGGCTAGCATGA': 0,\n", + " 'GATGCCGCTGGCTATAATGA': 0,\n", + " 'GATGCCGCTGGCTATGATGA': 0,\n", + " 'GATGCCGCTGGCTATTATGA': 0,\n", + " 'GATGCGGCTGGCTATAATGA': 0,\n", + " 'GATGCGGCTGGCTATGATGA': 0,\n", + " 'GATGCGGCTGGCTATTATGA': 0,\n", + " 'GATGCTGCTGGCTATAATGA': 0,\n", + " 'GATGCTGCTGGCTATGATGA': 0,\n", + " 'GATGCTGCTGGCTATTATGA': 0,\n", + " 'GATGCCGCTGGCTATCCTGA': 0,\n", + " 'GATGCCGCTGGCTATCGTGA': 0,\n", + " 'GATGCCGCTGGCTATCTTGA': 0,\n", + " 'GATGCGGCTGGCTATCCTGA': 0,\n", + " 'GATGCGGCTGGCTATCGTGA': 0,\n", + " 'GATGCGGCTGGCTATCTTGA': 0,\n", + " 'GATGCTGCTGGCTATCCTGA': 0,\n", + " 'GATGCTGCTGGCTATCGTGA': 0,\n", + " ...}" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "barcode_dict" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing read 0...\n", + "Processing read 10000...\n", + "Processing read 20000...\n", + "Processing read 30000...\n", + "Processing read 40000...\n", + "Processing read 50000...\n", + "Processing read 60000...\n", + "Processing read 70000...\n", + "Processing read 80000...\n", + "Processing read 90000...\n", + "Processing read 100000...\n", + "Processing read 110000...\n", + "Processing read 120000...\n", + "Processing read 130000...\n", + "Done!\n", + "Total # of unique-UMI 
reads = 86492\n" + ] + } + ], + "source": [ + "#Parse and map RNA sequencing reads\n", + "\n", + "save_suffix = \"_var_repl_2_hek293_v3\"\n", + "\n", + "r1_name = \"unprocessed_data/HEK_V2_trimmed.fastq\"\n", + "r2_name = \"unprocessed_data/HEK-V2_umi_trimmed.fastq\"\n", + "\n", + "polya_regexp = re.compile(r\"AAAAA(AAAAAAAAAAAAAAA){s<=2}\")\n", + "distal_regexp = re.compile(r\"(GCCTCGACTGTGCCTTCTAG){s<=2}\")\n", + "\n", + "def _hamming(s1, s2) :\n", + " \n", + " d = 0.\n", + " for j in range(len(s1)) :\n", + " if s1[j] != s2[j] :\n", + " d += 1.\n", + " \n", + " return d\n", + "\n", + "max_pos = 176\n", + "\n", + "umi_dict = {}\n", + "umi_n_muts = 0\n", + "\n", + "bases = ['A', 'C', 'G', 'T']\n", + "\n", + "cuts = np.zeros((len(library_df), 206))\n", + "\n", + "f1 = open(r1_name, 'rt')\n", + "f2 = open(r2_name, 'rt')\n", + "\n", + "#Iterate through reads sequentially (r1 and r2)\n", + "\n", + "r1_counter = 0\n", + "while True :\n", + " \n", + " #Read 1\n", + " id1 = f1.readline().strip()\n", + " \n", + " #Check for end-of-file\n", + " if len(id1) == 0 :\n", + " break\n", + " \n", + " r1 = f1.readline().strip()\n", + " s1 = f1.readline().strip()\n", + " q1 = f1.readline().strip()\n", + " \n", + " #Read 2\n", + " id2 = f2.readline().strip()\n", + " r2 = f2.readline().strip()\n", + " s2 = f2.readline().strip()\n", + " q2 = f2.readline().strip()\n", + " \n", + " if r1_counter % 10000 == 0 :\n", + " print(\"Processing read \" + str(r1_counter) + \"...\")\n", + " \n", + " r1_counter += 1\n", + " \n", + " #Map read to library member\n", + " bc = r1[:20]\n", + " \n", + " lib_i = -1\n", + " if bc in barcode_dict :\n", + " lib_i = barcode_dict[bc]\n", + " \n", + " if lib_i == -1 :\n", + " continue\n", + " \n", + " if umi_n_muts == 0 :\n", + " bc = sequences[lib_i][:20]\n", + " \n", + " #Determine if we have seen this umi before, otherwise mark as visited\n", + " umi = r2[:8]\n", + " \n", + " if bc not in umi_dict :\n", + " umi_dict[bc] = {}\n", + " \n", + " umi_visited = 
False\n", + " if umi in umi_dict[bc] :\n", + " umi_visited = True\n", + " elif umi_n_muts == 1 :\n", + " for pos1 in range(len(umi)) :\n", + " for b1 in bases :\n", + " umi_mut = umi[:pos1] + b1 + umi[pos1+1:]\n", + " if umi_mut in umi_dict[bc] :\n", + " umi_visited = True\n", + " break\n", + " if umi_visited :\n", + " break\n", + " elif umi_n_muts == 2 :\n", + " for pos1 in range(len(umi)) :\n", + " for pos2 in range(pos1, len(umi)) :\n", + " for b1 in bases :\n", + " for b2 in bases :\n", + " umi_mut = umi[:pos1] + b1 + umi[pos1+1:pos2] + b2 + umi[pos2+1:]\n", + " if umi_mut in umi_dict[bc] :\n", + " umi_visited = True\n", + " break\n", + " if umi_visited :\n", + " break\n", + " if umi_visited :\n", + " break\n", + " if umi_visited :\n", + " break\n", + " \n", + " #Skip if umi already seen\n", + " if umi_visited :\n", + " continue\n", + " \n", + " #Determine polyA position (or alternative if the read is distally polyadenylated)\n", + " polya_match = re.search(polya_regexp, r1)\n", + " \n", + " polya_pos = -1\n", + " if polya_match is not None and polya_match.span()[0] < max_pos :\n", + " polya_pos = polya_match.span()[0]\n", + " \n", + " #Determine if distal read\n", + " is_distal = False\n", + " distal_match = re.search(distal_regexp, r1[209-5:209+20+5])\n", + " \n", + " if distal_match is not None :\n", + " is_distal = True\n", + " \n", + " #Aggregate read-position occurrence counts\n", + " if is_distal :\n", + " cuts[lib_i, -1] += 1.\n", + " \n", + " #Mark as seen and proceed\n", + " umi_dict[bc][umi] = True\n", + " \n", + " elif polya_pos != -1 and polya_pos >= 30 :\n", + " \n", + " #Perform hamming-based consistency check against reference of region upstream of cleavage\n", + " \n", + " hamming_dist = _hamming(sequences[lib_i][polya_pos-20:polya_pos], r1[polya_pos-20:polya_pos])\n", + " \n", + " if hamming_dist <= 3 :\n", + " cuts[lib_i, polya_pos] += 1.\n", + "\n", + " #Mark as seen and proceed\n", + " umi_dict[bc][umi] = True\n", + "\n", + "f1.close()\n", 
+ "f2.close()\n", + "\n", + "print(\"Done!\")\n", + "\n", + "print(\"Total # of unique-UMI reads = \" + str(int(np.sum(cuts))))\n", + "\n", + "#Store processed read-position count matrix\n", + "np.save('apa_oligo_2022' + save_suffix + '_umi_mut_' + str(umi_n_muts) + '_cuts', cuts)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "#Load processed read count data\n", + "\n", + "ref_cuts_repl_1 = np.load(\"apa_oligo_2022_ref_repl_1_hek293_v3_umi_mut_0_cuts.npy\")\n", + "var_cuts_repl_1 = np.load(\"apa_oligo_2022_var_repl_1_hek293_v3_umi_mut_0_cuts.npy\")\n", + "\n", + "ref_cuts_repl_2 = np.load(\"apa_oligo_2022_ref_repl_2_hek293_v3_umi_mut_0_cuts.npy\")\n", + "var_cuts_repl_2 = np.load(\"apa_oligo_2022_var_repl_2_hek293_v3_umi_mut_0_cuts.npy\")\n", + "\n", + "#Pooled counts\n", + "\n", + "ref_cuts_pooled = ref_cuts_repl_1 + ref_cuts_repl_2\n", + "var_cuts_pooled = var_cuts_repl_1 + var_cuts_repl_2\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "#Augment library file with measured isoform summary statistics\n", + "\n", + "pseudo_c = 1.\n", + "\n", + "#Replicate 1\n", + "library_df['ref_count_77_127_repl_1'] = np.sum(ref_cuts_repl_1[:, 77:127], axis=-1)\n", + "library_df['ref_count_0_205_repl_1'] = np.sum(ref_cuts_repl_1[:, 0:205], axis=-1)\n", + "library_df['ref_count_total_repl_1'] = np.sum(ref_cuts_repl_1, axis=-1)\n", + "library_df['ref_logit_77_127_repl_1'] = np.log(\n", + " ((library_df['ref_count_77_127_repl_1'] + pseudo_c) / (library_df['ref_count_total_repl_1'] + 2. * pseudo_c)) / (1. - ((library_df['ref_count_77_127_repl_1'] + pseudo_c) / (library_df['ref_count_total_repl_1'] + 2. * pseudo_c)))\n", + ")\n", + "library_df['ref_logit_0_205_repl_1'] = np.log(\n", + " ((library_df['ref_count_0_205_repl_1'] + pseudo_c) / (library_df['ref_count_total_repl_1'] + 2. * pseudo_c)) / (1. 
- ((library_df['ref_count_0_205_repl_1'] + pseudo_c) / (library_df['ref_count_total_repl_1'] + 2. * pseudo_c)))\n", + ")\n", + "\n", + "library_df['var_count_77_127_repl_1'] = np.sum(var_cuts_repl_1[:, 77:127], axis=-1)\n", + "library_df['var_count_0_205_repl_1'] = np.sum(var_cuts_repl_1[:, 0:205], axis=-1)\n", + "library_df['var_count_total_repl_1'] = np.sum(var_cuts_repl_1, axis=-1)\n", + "library_df['var_logit_77_127_repl_1'] = np.log(\n", + " ((library_df['var_count_77_127_repl_1'] + pseudo_c) / (library_df['var_count_total_repl_1'] + 2. * pseudo_c)) / (1. - ((library_df['var_count_77_127_repl_1'] + pseudo_c) / (library_df['var_count_total_repl_1'] + 2. * pseudo_c)))\n", + ")\n", + "library_df['var_logit_0_205_repl_1'] = np.log(\n", + " ((library_df['var_count_0_205_repl_1'] + pseudo_c) / (library_df['var_count_total_repl_1'] + 2. * pseudo_c)) / (1. - ((library_df['var_count_0_205_repl_1'] + pseudo_c) / (library_df['var_count_total_repl_1'] + 2. * pseudo_c)))\n", + ")\n", + "\n", + "library_df['delta_logodds_true_77_127_repl_1'] = library_df['var_logit_77_127_repl_1'] - library_df['ref_logit_77_127_repl_1']\n", + "library_df['delta_logodds_true_0_205_repl_1'] = library_df['var_logit_0_205_repl_1'] - library_df['ref_logit_0_205_repl_1']\n", + "\n", + "#Replicate 2\n", + "library_df['ref_count_77_127_repl_2'] = np.sum(ref_cuts_repl_2[:, 77:127], axis=-1)\n", + "library_df['ref_count_0_205_repl_2'] = np.sum(ref_cuts_repl_2[:, 0:205], axis=-1)\n", + "library_df['ref_count_total_repl_2'] = np.sum(ref_cuts_repl_2, axis=-1)\n", + "library_df['ref_logit_77_127_repl_2'] = np.log(\n", + " ((library_df['ref_count_77_127_repl_2'] + pseudo_c) / (library_df['ref_count_total_repl_2'] + 2. * pseudo_c)) / (1. - ((library_df['ref_count_77_127_repl_2'] + pseudo_c) / (library_df['ref_count_total_repl_2'] + 2. 
* pseudo_c)))\n", + ")\n", + "library_df['ref_logit_0_205_repl_2'] = np.log(\n", + " ((library_df['ref_count_0_205_repl_2'] + pseudo_c) / (library_df['ref_count_total_repl_2'] + 2. * pseudo_c)) / (1. - ((library_df['ref_count_0_205_repl_2'] + pseudo_c) / (library_df['ref_count_total_repl_2'] + 2. * pseudo_c)))\n", + ")\n", + "\n", + "library_df['var_count_77_127_repl_2'] = np.sum(var_cuts_repl_2[:, 77:127], axis=-1)\n", + "library_df['var_count_0_205_repl_2'] = np.sum(var_cuts_repl_2[:, 0:205], axis=-1)\n", + "library_df['var_count_total_repl_2'] = np.sum(var_cuts_repl_2, axis=-1)\n", + "library_df['var_logit_77_127_repl_2'] = np.log(\n", + " ((library_df['var_count_77_127_repl_2'] + pseudo_c) / (library_df['var_count_total_repl_2'] + 2. * pseudo_c)) / (1. - ((library_df['var_count_77_127_repl_2'] + pseudo_c) / (library_df['var_count_total_repl_2'] + 2. * pseudo_c)))\n", + ")\n", + "library_df['var_logit_0_205_repl_2'] = np.log(\n", + " ((library_df['var_count_0_205_repl_2'] + pseudo_c) / (library_df['var_count_total_repl_2'] + 2. * pseudo_c)) / (1. - ((library_df['var_count_0_205_repl_2'] + pseudo_c) / (library_df['var_count_total_repl_2'] + 2. * pseudo_c)))\n", + ")\n", + "\n", + "library_df['delta_logodds_true_77_127_repl_2'] = library_df['var_logit_77_127_repl_2'] - library_df['ref_logit_77_127_repl_2']\n", + "library_df['delta_logodds_true_0_205_repl_2'] = library_df['var_logit_0_205_repl_2'] - library_df['ref_logit_0_205_repl_2']\n", + "\n", + "#Pooled replicates\n", + "library_df['ref_count_77_127_repl_pooled'] = np.sum(ref_cuts_pooled[:, 77:127], axis=-1)\n", + "library_df['ref_count_0_205_repl_pooled'] = np.sum(ref_cuts_pooled[:, 0:205], axis=-1)\n", + "library_df['ref_count_total_repl_pooled'] = np.sum(ref_cuts_pooled, axis=-1)\n", + "library_df['ref_logit_77_127_repl_pooled'] = np.log(\n", + " ((library_df['ref_count_77_127_repl_pooled'] + pseudo_c) / (library_df['ref_count_total_repl_pooled'] + 2. * pseudo_c)) / (1. 
- ((library_df['ref_count_77_127_repl_pooled'] + pseudo_c) / (library_df['ref_count_total_repl_pooled'] + 2. * pseudo_c)))\n", + ")\n", + "library_df['ref_logit_0_205_repl_pooled'] = np.log(\n", + " ((library_df['ref_count_0_205_repl_pooled'] + pseudo_c) / (library_df['ref_count_total_repl_pooled'] + 2. * pseudo_c)) / (1. - ((library_df['ref_count_0_205_repl_pooled'] + pseudo_c) / (library_df['ref_count_total_repl_pooled'] + 2. * pseudo_c)))\n", + ")\n", + "\n", + "library_df['var_count_77_127_repl_pooled'] = np.sum(var_cuts_pooled[:, 77:127], axis=-1)\n", + "library_df['var_count_0_205_repl_pooled'] = np.sum(var_cuts_pooled[:, 0:205], axis=-1)\n", + "library_df['var_count_total_repl_pooled'] = np.sum(var_cuts_pooled, axis=-1)\n", + "library_df['var_logit_77_127_repl_pooled'] = np.log(\n", + " ((library_df['var_count_77_127_repl_pooled'] + pseudo_c) / (library_df['var_count_total_repl_pooled'] + 2. * pseudo_c)) / (1. - ((library_df['var_count_77_127_repl_pooled'] + pseudo_c) / (library_df['var_count_total_repl_pooled'] + 2. * pseudo_c)))\n", + ")\n", + "library_df['var_logit_0_205_repl_pooled'] = np.log(\n", + " ((library_df['var_count_0_205_repl_pooled'] + pseudo_c) / (library_df['var_count_total_repl_pooled'] + 2. * pseudo_c)) / (1. - ((library_df['var_count_0_205_repl_pooled'] + pseudo_c) / (library_df['var_count_total_repl_pooled'] + 2. 
* pseudo_c)))\n", + ")\n", + "\n", + "library_df['delta_logodds_true_77_127_repl_pooled'] = library_df['var_logit_77_127_repl_pooled'] - library_df['ref_logit_77_127_repl_pooled']\n", + "library_df['delta_logodds_true_0_205_repl_pooled'] = library_df['var_logit_0_205_repl_pooled'] - library_df['ref_logit_0_205_repl_pooled']\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "#Cache measurements in dataframe\n", + "\n", + "library_df.to_csv(\"apa_100_variants_rev2_20220621_hek293_v3_umi_mut_0.csv\", sep='\\t')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/jlinder2/anaconda3/envs/tensorflow/lib/python3.6/site-packages/ipykernel/__main__.py:4: RuntimeWarning: invalid value encountered in true_divide\n", + "/home/jlinder2/anaconda3/envs/tensorflow/lib/python3.6/site-packages/ipykernel/__main__.py:7: RuntimeWarning: invalid value encountered in true_divide\n", + "/home/jlinder2/anaconda3/envs/tensorflow/lib/python3.6/site-packages/ipykernel/__main__.py:10: RuntimeWarning: invalid value encountered in true_divide\n" + ] + } + ], + "source": [ + "#Compute cleavage probabilities\n", + "\n", + "ref_cut_prob_repl_1 = ref_cuts_repl_1 / np.sum(ref_cuts_repl_1, axis=1)[:, None]\n", + "var_cut_prob_repl_1 = var_cuts_repl_1 / np.sum(var_cuts_repl_1, axis=1)[:, None]\n", + "\n", + "ref_cut_prob_repl_2 = ref_cuts_repl_2 / np.sum(ref_cuts_repl_2, axis=1)[:, None]\n", + "var_cut_prob_repl_2 = var_cuts_repl_2 / np.sum(var_cuts_repl_2, axis=1)[:, None]\n", + "\n", + "ref_cut_prob_pooled = ref_cuts_pooled / np.sum(ref_cuts_pooled, axis=1)[:, None]\n", + "var_cut_prob_pooled = var_cuts_pooled / np.sum(var_cuts_pooled, axis=1)[:, None]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + 
"text": [ + "delta_logodds (repl 1) = 1.0482\n", + "delta_logodds (repl 2) = 0.7639\n", + "delta_logodds (pooled) = 0.9385\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "#Control: F2 variant (creation of new cutsite -1bp relative to reference)\n", + "\n", + "save_figs = True\n", + "fig_name = \"F2_control_profile_hek293\"\n", + "\n", + "test_ix = 9\n", + "\n", + "#Plot range\n", + "plot_start = 0\n", + "plot_end = 146\n", + "\n", + "#Isoform definition\n", + "cut_start = 77\n", + "cut_end = 127\n", + "\n", + "c_ref_1 = ref_cut_prob_repl_1[test_ix, :]\n", + "c_var_1 = var_cut_prob_repl_1[test_ix, :]\n", + "\n", + "c_ref_2 = ref_cut_prob_repl_2[test_ix, :]\n", + "c_var_2 = var_cut_prob_repl_2[test_ix, :]\n", + "\n", + "c_ref_pooled = ref_cut_prob_pooled[test_ix, :]\n", + "c_var_pooled = var_cut_prob_pooled[test_ix, :]\n", + "\n", + "delta_logodds_1 = np.log(np.sum(c_var_1[77:127]) / (1. - np.sum(c_var_1[77:127]))) - np.log(np.sum(c_ref_1[77:127]) / (1. - np.sum(c_ref_1[77:127])))\n", + "delta_logodds_2 = np.log(np.sum(c_var_2[77:127]) / (1. - np.sum(c_var_2[77:127]))) - np.log(np.sum(c_ref_2[77:127]) / (1. - np.sum(c_ref_2[77:127])))\n", + "delta_logodds_pooled = np.log(np.sum(c_var_pooled[77:127]) / (1. - np.sum(c_var_pooled[77:127]))) - np.log(np.sum(c_ref_pooled[77:127]) / (1. 
- np.sum(c_ref_pooled[77:127])))\n", + "\n", + "print(\"delta_logodds (repl 1) = \" + str(round(delta_logodds_1, 4)))\n", + "print(\"delta_logodds (repl 2) = \" + str(round(delta_logodds_2, 4)))\n", + "print(\"delta_logodds (pooled) = \" + str(round(delta_logodds_pooled, 4)))\n", + "\n", + "#Plot replicate 1 profile\n", + "f = plt.figure(figsize=(10, 3))\n", + "\n", + "plt.plot(c_ref_1[plot_start: plot_end], color='darkblue', linewidth=3)\n", + "plt.plot(c_var_1[plot_start: plot_end], color='darkorange', linewidth=3)\n", + "\n", + "plt.axvline(x=70, linewidth=2, linestyle='--', color='black', alpha=0.75)\n", + "plt.axvline(x=76, linewidth=2, linestyle='--', color='black', alpha=0.75)\n", + "\n", + "plt.xlim(plot_start, plot_end)\n", + "plt.ylim(0.)\n", + "\n", + "plt.title(\"Replicate 1. Delta Isoform Log Odds = \" + str(round(delta_logodds_1, 4)))\n", + "\n", + "plt.tight_layout()\n", + "\n", + "if save_figs :\n", + " plt.savefig(fig_name + \"_repl_1.png\", dpi=600, transparent=True)\n", + " plt.savefig(fig_name + \"_repl_1.eps\")\n", + "\n", + "plt.show()\n", + "\n", + "#Plot replicate 2 profile\n", + "f = plt.figure(figsize=(10, 3))\n", + "\n", + "plt.plot(c_ref_2[plot_start: plot_end], color='darkblue', linewidth=3)\n", + "plt.plot(c_var_2[plot_start: plot_end], color='darkorange', linewidth=3)\n", + "\n", + "plt.axvline(x=70, linewidth=2, linestyle='--', color='black', alpha=0.75)\n", + "plt.axvline(x=76, linewidth=2, linestyle='--', color='black', alpha=0.75)\n", + "\n", + "plt.xlim(plot_start, plot_end)\n", + "plt.ylim(0.)\n", + "\n", + "plt.title(\"Replicate 2. 
Delta Isoform Log Odds = \" + str(round(delta_logodds_2, 4)))\n", + "\n", + "plt.tight_layout()\n", + "\n", + "if save_figs :\n", + " plt.savefig(fig_name + \"_repl_2.png\", dpi=600, transparent=True)\n", + " plt.savefig(fig_name + \"_repl_2.eps\")\n", + "\n", + "plt.show()\n", + "\n", + "#Plot pooled replicate profile\n", + "f = plt.figure(figsize=(10, 3))\n", + "\n", + "plt.plot(c_ref_pooled[plot_start: plot_end], color='darkblue', linewidth=3)\n", + "plt.plot(c_var_pooled[plot_start: plot_end], color='darkorange', linewidth=3)\n", + "\n", + "plt.axvline(x=70, linewidth=2, linestyle='--', color='black', alpha=0.75)\n", + "plt.axvline(x=76, linewidth=2, linestyle='--', color='black', alpha=0.75)\n", + "\n", + "plt.xlim(plot_start, plot_end)\n", + "plt.ylim(0.)\n", + "\n", + "plt.title(\"Pooled replicates. Delta Isoform Log Odds = \" + str(round(delta_logodds_pooled, 4)))\n", + "\n", + "plt.tight_layout()\n", + "\n", + "if save_figs :\n", + " plt.savefig(fig_name + \"_pooled.png\", dpi=600, transparent=True)\n", + " plt.savefig(fig_name + \"_pooled.eps\")\n", + "\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "len(filtered_df) = 94\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from scipy.stats import spearmanr\n", + "\n", + "save_figs = True\n", + "fig_name = \"replicate_hek293\"\n", + "\n", + "min_c = 5.\n", + "\n", + "x_min = -6.\n", + "x_max = 4.\n", + "\n", + "filtered_df = library_df.query(\"ref_count_total_repl_1 >= \" + str(min_c) + \" and \" + \"ref_count_total_repl_2 >= \" + str(min_c) + \" and \" + \"var_count_total_repl_1 >= \" + str(min_c) + \" and \" + \"var_count_total_repl_2 >= \" + str(min_c))\n", + "\n", + "print(\"len(filtered_df) = \" + str(len(filtered_df)))\n", + "\n", + "#Reference library (replicate correlation)\n", + "r_val_ref, _ = spearmanr(filtered_df['ref_logit_77_127_repl_1'], filtered_df['ref_logit_77_127_repl_2'])\n", + "\n", + "f = plt.figure(figsize=(4, 4))\n", + "\n", + "plt.scatter(filtered_df['ref_logit_77_127_repl_1'], filtered_df['ref_logit_77_127_repl_2'], color='lightgreen', s=45, edgecolor='black', linewidth=1)\n", + "\n", + "plt.xlim(x_min, x_max)\n", + "plt.ylim(x_min, x_max)\n", + "\n", + "plt.xticks(fontsize=12)\n", + "plt.yticks(fontsize=12)\n", + "\n", + "plt.xlabel(\"Ref Logit (Repl. 1)\", fontsize=12)\n", + "plt.ylabel(\"Ref Logit (Repl. 
2)\", fontsize=12)\n", + "\n", + "plt.title(\"r = \" + str(round(r_val_ref, 3)) + \", n = \" + str(len(filtered_df)), fontsize=12)\n", + "\n", + "plt.tight_layout()\n", + "\n", + "if save_figs :\n", + " plt.savefig(fig_name + \"_ref_logits.png\", dpi=600, transparent=True)\n", + " plt.savefig(fig_name + \"_ref_logits.eps\")\n", + "\n", + "plt.show()\n", + "\n", + "#Variant library (replicate correlation)\n", + "r_val_var, _ = spearmanr(filtered_df['var_logit_77_127_repl_1'], filtered_df['var_logit_77_127_repl_2'])\n", + "\n", + "f = plt.figure(figsize=(4, 4))\n", + "\n", + "plt.scatter(filtered_df['var_logit_77_127_repl_1'], filtered_df['var_logit_77_127_repl_2'], color='lightcoral', s=45, edgecolor='black', linewidth=1)\n", + "\n", + "plt.xlim(x_min, x_max)\n", + "plt.ylim(x_min, x_max)\n", + "\n", + "plt.xticks(fontsize=12)\n", + "plt.yticks(fontsize=12)\n", + "\n", + "plt.xlabel(\"Var Logit (Repl. 1)\", fontsize=12)\n", + "plt.ylabel(\"Var Logit (Repl. 2)\", fontsize=12)\n", + "\n", + "plt.title(\"r = \" + str(round(r_val_var, 3)) + \", n = \" + str(len(filtered_df)), fontsize=12)\n", + "\n", + "plt.tight_layout()\n", + "\n", + "if save_figs :\n", + " plt.savefig(fig_name + \"_var_logits.png\", dpi=600, transparent=True)\n", + " plt.savefig(fig_name + \"_var_logits.eps\")\n", + "\n", + "plt.show()\n", + "\n", + "#Ref-Var library (delta replicate correlation)\n", + "r_val_var, _ = spearmanr(filtered_df['delta_logodds_true_77_127_repl_1'], filtered_df['delta_logodds_true_77_127_repl_2'])\n", + "\n", + "f = plt.figure(figsize=(4, 4))\n", + "\n", + "plt.scatter(filtered_df['delta_logodds_true_77_127_repl_1'], filtered_df['delta_logodds_true_77_127_repl_2'], color='lightcoral', s=45, edgecolor='black', linewidth=1)\n", + "\n", + "plt.xlim(x_min, x_max)\n", + "plt.ylim(x_min, x_max)\n", + "\n", + "plt.xticks(fontsize=12)\n", + "plt.yticks(fontsize=12)\n", + "\n", + "plt.xlabel(\"LOR (Repl. 1)\", fontsize=12)\n", + "plt.ylabel(\"LOR (Repl. 
2)\", fontsize=12)\n", + "\n", + "plt.title(\"r = \" + str(round(r_val_var, 3)) + \", n = \" + str(len(filtered_df)), fontsize=12)\n", + "\n", + "plt.tight_layout()\n", + "\n", + "if save_figs :\n", + " plt.savefig(fig_name + \"_delta_logodds.png\", dpi=600, transparent=True)\n", + " plt.savefig(fig_name + \"_delta_logodds.eps\")\n", + "\n", + "plt.show()\n", + "\n", + "#Ref-Var control correlation\n", + "control_df = filtered_df.loc[filtered_df['experiment'].str.contains(\"control_\") & filtered_df['data_source'].str.contains(\"Array_2019\")]\n", + "\n", + "r_val_control, _ = spearmanr(control_df['ref_logit_77_127_repl_pooled'], control_df['var_logit_77_127_repl_pooled'])\n", + "\n", + "f = plt.figure(figsize=(4, 4))\n", + "\n", + "plt.scatter(control_df['ref_logit_77_127_repl_pooled'], control_df['var_logit_77_127_repl_pooled'], color='black', s=175, marker='^')\n", + "\n", + "plt.plot([x_min, x_max], [x_min, x_max], color='darkgreen', linestyle='--', linewidth=2,)\n", + "\n", + "plt.xlim(x_min, x_max)\n", + "plt.ylim(x_min, x_max)\n", + "\n", + "plt.xticks(fontsize=12)\n", + "plt.yticks(fontsize=12)\n", + "\n", + "plt.xlabel(\"Ref Logit\", fontsize=12)\n", + "plt.ylabel(\"Var Logit\", fontsize=12)\n", + "\n", + "plt.title(\"Controls; Spearman r = \" + str(round(r_val_control, 3)) + \", n = \" + str(len(control_df)), fontsize=12)\n", + "\n", + "plt.tight_layout()\n", + "\n", + "if save_figs :\n", + " plt.savefig(fig_name + \"_control_logits.png\", dpi=600, transparent=True)\n", + " plt.savefig(fig_name + \"_control_logits.eps\")\n", + "\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "n (sequences) = 186066\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "#Compare controls from 2022 oligo array to the Cell paper 2019 array\n", + "\n", + "#Load oligo array from 2019\n", + "import isolearn.io as isoio\n", + "\n", + "save_figs = True\n", + "fig_name = \"replicate_hek293_to_2019\"\n", + "\n", + "isoform_pseudo_count = 1.\n", + "proximal_start = 77\n", + "proximal_end = 127\n", + "\n", + "file_prefix = str(proximal_start) + \"_\" + str(proximal_end)\n", + "\n", + "seq_dict = isoio.load('../../../../aparent/data/prepared_data/apa_array_data/apa_array_data_seq')\n", + "\n", + "seq_df = seq_dict['array_df']\n", + "seq_cuts = seq_dict['pooled_cuts']\n", + "\n", + "cut_true = np.concatenate([np.array(seq_cuts[:, 180: 180 + 205].todense()), np.array(seq_cuts[:, -1].todense()).reshape(-1, 1)], axis=-1)# - 1\n", + "\n", + "seq_df['proximal_count'] = [np.sum(cut_true[i, proximal_start:proximal_end]) for i in range(len(seq_df))]\n", + "seq_df['total_count'] = [np.sum(cut_true[i, :]) for i in range(len(seq_df))]\n", + "\n", + "seq_df['iso_true'] = (seq_df['proximal_count'] + isoform_pseudo_count) / (seq_df['total_count'] + 2. 
* isoform_pseudo_count)\n", + "seq_df['logodds_true'] = np.log(seq_df['iso_true'] / (1.0 - seq_df['iso_true']))\n", + "\n", + "seq_df['seq'] = seq_df['seq'].str.slice(0, 205)\n", + "\n", + "print(\"n (sequences) = \" + str(len(seq_df)))\n", + "\n", + "#Ref-Array 2019 control correlation\n", + "control_df_2019 = control_df.join(seq_df[['seq', 'logodds_true']].set_index(\"seq\"), on='ref_seq', how='inner').copy().reset_index(drop=True)\n", + "\n", + "r_val_control, _ = spearmanr(control_df_2019['ref_logit_77_127_repl_pooled'], control_df_2019['logodds_true'])\n", + "\n", + "f = plt.figure(figsize=(4, 4))\n", + "\n", + "plt.scatter(control_df_2019['ref_logit_77_127_repl_pooled'], control_df_2019['logodds_true'], color='deepskyblue', edgecolor='black', linewidth=1, s=175, marker='^')\n", + "\n", + "plt.plot([x_min, x_max], [x_min, x_max], color='darkgreen', linestyle='--', linewidth=2,)\n", + "\n", + "plt.xlim(x_min, x_max)\n", + "plt.ylim(x_min, x_max)\n", + "\n", + "plt.xticks(fontsize=12)\n", + "plt.yticks(fontsize=12)\n", + "\n", + "plt.xlabel(\"Logit (2022)\", fontsize=12)\n", + "plt.ylabel(\"Logit (2019)\", fontsize=12)\n", + "\n", + "plt.title(\"Controls; Spearman r = \" + str(round(r_val_control, 3)) + \", n = \" + str(len(control_df_2019)), fontsize=12)\n", + "\n", + "plt.tight_layout()\n", + "\n", + "if save_figs :\n", + " plt.savefig(fig_name + \".png\", dpi=600, transparent=True)\n", + " plt.savefig(fig_name + \".eps\")\n", + "\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:tensorflow]", + "language": "python", + "name": "conda-env-tensorflow-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": 
"3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/data/oligo_pool_2022/medium_library/process_reads_hmc3.ipynb b/data/oligo_pool_2022/medium_library/process_reads_hmc3.ipynb new file mode 100644 index 0000000..d12eefd --- /dev/null +++ b/data/oligo_pool_2022/medium_library/process_reads_hmc3.ipynb @@ -0,0 +1,1877 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "\n", + "import scipy\n", + "import scipy.io as spio\n", + "import scipy.sparse as sp\n", + "\n", + "import regex as re\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "len(barcode_dict) = 201500\n" + ] + } + ], + "source": [ + "#Load reference library dataframe and build barcode dictionary\n", + "\n", + "library_df = pd.read_csv(\"apa_100_variants_rev2_20220621_pred.csv\", sep='\\t')\n", + "\n", + "#Build dictionary (double-mutation support)\n", + "bases = ['A', 'C', 'G', 'T']\n", + "\n", + "barcode_dict = {}\n", + "sequences = []\n", + "for i, [_, row] in enumerate(library_df.iterrows()) :\n", + " bc = row['ref_barcode']\n", + " \n", + " sequences.append(row['ref_seq'])\n", + " \n", + " barcode_dict[bc] = i\n", + " for pos1 in range(len(bc)) :\n", + " for pos2 in range(pos1, len(bc)) :\n", + " for b1 in bases :\n", + " for b2 in bases :\n", + " bc_mut = bc[:pos1] + b1 + bc[pos1+1:pos2] + b2 + bc[pos2+1:]\n", + " \n", + " if bc_mut in barcode_dict and barcode_dict[bc_mut] != i :\n", + " print(\"[ERROR] Barcode dictionary collision.\")\n", + " else :\n", + " barcode_dict[bc_mut] = i\n", + "\n", + "print(\"len(barcode_dict) = \" + str(len(barcode_dict)))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + 
"{'GATGCAGCTGGCTATCATGA': 0,\n", + " 'AAATGCAGCTGGCTATCATGA': 0,\n", + " 'ACATGCAGCTGGCTATCATGA': 0,\n", + " 'AGATGCAGCTGGCTATCATGA': 0,\n", + " 'ATATGCAGCTGGCTATCATGA': 0,\n", + " 'CAATGCAGCTGGCTATCATGA': 0,\n", + " 'CCATGCAGCTGGCTATCATGA': 0,\n", + " 'CGATGCAGCTGGCTATCATGA': 0,\n", + " 'CTATGCAGCTGGCTATCATGA': 0,\n", + " 'GAATGCAGCTGGCTATCATGA': 0,\n", + " 'GCATGCAGCTGGCTATCATGA': 0,\n", + " 'GGATGCAGCTGGCTATCATGA': 0,\n", + " 'GTATGCAGCTGGCTATCATGA': 0,\n", + " 'TAATGCAGCTGGCTATCATGA': 0,\n", + " 'TCATGCAGCTGGCTATCATGA': 0,\n", + " 'TGATGCAGCTGGCTATCATGA': 0,\n", + " 'TTATGCAGCTGGCTATCATGA': 0,\n", + " 'AATGCAGCTGGCTATCATGA': 0,\n", + " 'ACTGCAGCTGGCTATCATGA': 0,\n", + " 'AGTGCAGCTGGCTATCATGA': 0,\n", + " 'ATTGCAGCTGGCTATCATGA': 0,\n", + " 'CATGCAGCTGGCTATCATGA': 0,\n", + " 'CCTGCAGCTGGCTATCATGA': 0,\n", + " 'CGTGCAGCTGGCTATCATGA': 0,\n", + " 'CTTGCAGCTGGCTATCATGA': 0,\n", + " 'GCTGCAGCTGGCTATCATGA': 0,\n", + " 'GGTGCAGCTGGCTATCATGA': 0,\n", + " 'GTTGCAGCTGGCTATCATGA': 0,\n", + " 'TATGCAGCTGGCTATCATGA': 0,\n", + " 'TCTGCAGCTGGCTATCATGA': 0,\n", + " 'TGTGCAGCTGGCTATCATGA': 0,\n", + " 'TTTGCAGCTGGCTATCATGA': 0,\n", + " 'AAAGCAGCTGGCTATCATGA': 0,\n", + " 'AACGCAGCTGGCTATCATGA': 0,\n", + " 'AAGGCAGCTGGCTATCATGA': 0,\n", + " 'CAAGCAGCTGGCTATCATGA': 0,\n", + " 'CACGCAGCTGGCTATCATGA': 0,\n", + " 'CAGGCAGCTGGCTATCATGA': 0,\n", + " 'GAAGCAGCTGGCTATCATGA': 0,\n", + " 'GACGCAGCTGGCTATCATGA': 0,\n", + " 'GAGGCAGCTGGCTATCATGA': 0,\n", + " 'TAAGCAGCTGGCTATCATGA': 0,\n", + " 'TACGCAGCTGGCTATCATGA': 0,\n", + " 'TAGGCAGCTGGCTATCATGA': 0,\n", + " 'AATACAGCTGGCTATCATGA': 0,\n", + " 'AATCCAGCTGGCTATCATGA': 0,\n", + " 'AATTCAGCTGGCTATCATGA': 0,\n", + " 'CATACAGCTGGCTATCATGA': 0,\n", + " 'CATCCAGCTGGCTATCATGA': 0,\n", + " 'CATTCAGCTGGCTATCATGA': 0,\n", + " 'GATACAGCTGGCTATCATGA': 0,\n", + " 'GATCCAGCTGGCTATCATGA': 0,\n", + " 'GATTCAGCTGGCTATCATGA': 0,\n", + " 'TATACAGCTGGCTATCATGA': 0,\n", + " 'TATCCAGCTGGCTATCATGA': 0,\n", + " 'TATTCAGCTGGCTATCATGA': 0,\n", + " 
'AATGAAGCTGGCTATCATGA': 0,\n", + " 'AATGGAGCTGGCTATCATGA': 0,\n", + " 'AATGTAGCTGGCTATCATGA': 0,\n", + " 'CATGAAGCTGGCTATCATGA': 0,\n", + " 'CATGGAGCTGGCTATCATGA': 0,\n", + " 'CATGTAGCTGGCTATCATGA': 0,\n", + " 'GATGAAGCTGGCTATCATGA': 0,\n", + " 'GATGGAGCTGGCTATCATGA': 0,\n", + " 'GATGTAGCTGGCTATCATGA': 0,\n", + " 'TATGAAGCTGGCTATCATGA': 0,\n", + " 'TATGGAGCTGGCTATCATGA': 0,\n", + " 'TATGTAGCTGGCTATCATGA': 0,\n", + " 'AATGCCGCTGGCTATCATGA': 0,\n", + " 'AATGCGGCTGGCTATCATGA': 0,\n", + " 'AATGCTGCTGGCTATCATGA': 0,\n", + " 'CATGCCGCTGGCTATCATGA': 0,\n", + " 'CATGCGGCTGGCTATCATGA': 0,\n", + " 'CATGCTGCTGGCTATCATGA': 0,\n", + " 'GATGCCGCTGGCTATCATGA': 0,\n", + " 'GATGCGGCTGGCTATCATGA': 0,\n", + " 'GATGCTGCTGGCTATCATGA': 0,\n", + " 'TATGCCGCTGGCTATCATGA': 0,\n", + " 'TATGCGGCTGGCTATCATGA': 0,\n", + " 'TATGCTGCTGGCTATCATGA': 0,\n", + " 'AATGCAACTGGCTATCATGA': 0,\n", + " 'AATGCACCTGGCTATCATGA': 0,\n", + " 'AATGCATCTGGCTATCATGA': 0,\n", + " 'CATGCAACTGGCTATCATGA': 0,\n", + " 'CATGCACCTGGCTATCATGA': 0,\n", + " 'CATGCATCTGGCTATCATGA': 0,\n", + " 'GATGCAACTGGCTATCATGA': 0,\n", + " 'GATGCACCTGGCTATCATGA': 0,\n", + " 'GATGCATCTGGCTATCATGA': 0,\n", + " 'TATGCAACTGGCTATCATGA': 0,\n", + " 'TATGCACCTGGCTATCATGA': 0,\n", + " 'TATGCATCTGGCTATCATGA': 0,\n", + " 'AATGCAGATGGCTATCATGA': 0,\n", + " 'AATGCAGGTGGCTATCATGA': 0,\n", + " 'AATGCAGTTGGCTATCATGA': 0,\n", + " 'CATGCAGATGGCTATCATGA': 0,\n", + " 'CATGCAGGTGGCTATCATGA': 0,\n", + " 'CATGCAGTTGGCTATCATGA': 0,\n", + " 'GATGCAGATGGCTATCATGA': 0,\n", + " 'GATGCAGGTGGCTATCATGA': 0,\n", + " 'GATGCAGTTGGCTATCATGA': 0,\n", + " 'TATGCAGATGGCTATCATGA': 0,\n", + " 'TATGCAGGTGGCTATCATGA': 0,\n", + " 'TATGCAGTTGGCTATCATGA': 0,\n", + " 'AATGCAGCAGGCTATCATGA': 0,\n", + " 'AATGCAGCCGGCTATCATGA': 0,\n", + " 'AATGCAGCGGGCTATCATGA': 0,\n", + " 'CATGCAGCAGGCTATCATGA': 0,\n", + " 'CATGCAGCCGGCTATCATGA': 0,\n", + " 'CATGCAGCGGGCTATCATGA': 0,\n", + " 'GATGCAGCAGGCTATCATGA': 0,\n", + " 'GATGCAGCCGGCTATCATGA': 0,\n", + " 'GATGCAGCGGGCTATCATGA': 0,\n", + " 
'TATGCAGCAGGCTATCATGA': 0,\n", + " 'TATGCAGCCGGCTATCATGA': 0,\n", + " 'TATGCAGCGGGCTATCATGA': 0,\n", + " 'AATGCAGCTAGCTATCATGA': 0,\n", + " 'AATGCAGCTCGCTATCATGA': 0,\n", + " 'AATGCAGCTTGCTATCATGA': 0,\n", + " 'CATGCAGCTAGCTATCATGA': 0,\n", + " 'CATGCAGCTCGCTATCATGA': 0,\n", + " 'CATGCAGCTTGCTATCATGA': 0,\n", + " 'GATGCAGCTAGCTATCATGA': 0,\n", + " 'GATGCAGCTCGCTATCATGA': 0,\n", + " 'GATGCAGCTTGCTATCATGA': 0,\n", + " 'TATGCAGCTAGCTATCATGA': 0,\n", + " 'TATGCAGCTCGCTATCATGA': 0,\n", + " 'TATGCAGCTTGCTATCATGA': 0,\n", + " 'AATGCAGCTGACTATCATGA': 0,\n", + " 'AATGCAGCTGCCTATCATGA': 0,\n", + " 'AATGCAGCTGTCTATCATGA': 0,\n", + " 'CATGCAGCTGACTATCATGA': 0,\n", + " 'CATGCAGCTGCCTATCATGA': 0,\n", + " 'CATGCAGCTGTCTATCATGA': 0,\n", + " 'GATGCAGCTGACTATCATGA': 0,\n", + " 'GATGCAGCTGCCTATCATGA': 0,\n", + " 'GATGCAGCTGTCTATCATGA': 0,\n", + " 'TATGCAGCTGACTATCATGA': 0,\n", + " 'TATGCAGCTGCCTATCATGA': 0,\n", + " 'TATGCAGCTGTCTATCATGA': 0,\n", + " 'AATGCAGCTGGATATCATGA': 0,\n", + " 'AATGCAGCTGGGTATCATGA': 0,\n", + " 'AATGCAGCTGGTTATCATGA': 0,\n", + " 'CATGCAGCTGGATATCATGA': 0,\n", + " 'CATGCAGCTGGGTATCATGA': 0,\n", + " 'CATGCAGCTGGTTATCATGA': 0,\n", + " 'GATGCAGCTGGATATCATGA': 0,\n", + " 'GATGCAGCTGGGTATCATGA': 0,\n", + " 'GATGCAGCTGGTTATCATGA': 0,\n", + " 'TATGCAGCTGGATATCATGA': 0,\n", + " 'TATGCAGCTGGGTATCATGA': 0,\n", + " 'TATGCAGCTGGTTATCATGA': 0,\n", + " 'AATGCAGCTGGCAATCATGA': 0,\n", + " 'AATGCAGCTGGCCATCATGA': 0,\n", + " 'AATGCAGCTGGCGATCATGA': 0,\n", + " 'CATGCAGCTGGCAATCATGA': 0,\n", + " 'CATGCAGCTGGCCATCATGA': 0,\n", + " 'CATGCAGCTGGCGATCATGA': 0,\n", + " 'GATGCAGCTGGCAATCATGA': 0,\n", + " 'GATGCAGCTGGCCATCATGA': 0,\n", + " 'GATGCAGCTGGCGATCATGA': 0,\n", + " 'TATGCAGCTGGCAATCATGA': 0,\n", + " 'TATGCAGCTGGCCATCATGA': 0,\n", + " 'TATGCAGCTGGCGATCATGA': 0,\n", + " 'AATGCAGCTGGCTCTCATGA': 0,\n", + " 'AATGCAGCTGGCTGTCATGA': 0,\n", + " 'AATGCAGCTGGCTTTCATGA': 0,\n", + " 'CATGCAGCTGGCTCTCATGA': 0,\n", + " 'CATGCAGCTGGCTGTCATGA': 0,\n", + " 'CATGCAGCTGGCTTTCATGA': 0,\n", + " 
'GATGCAGCTGGCTCTCATGA': 0,\n", + " 'GATGCAGCTGGCTGTCATGA': 0,\n", + " 'GATGCAGCTGGCTTTCATGA': 0,\n", + " 'TATGCAGCTGGCTCTCATGA': 0,\n", + " 'TATGCAGCTGGCTGTCATGA': 0,\n", + " 'TATGCAGCTGGCTTTCATGA': 0,\n", + " 'AATGCAGCTGGCTAACATGA': 0,\n", + " 'AATGCAGCTGGCTACCATGA': 0,\n", + " 'AATGCAGCTGGCTAGCATGA': 0,\n", + " 'CATGCAGCTGGCTAACATGA': 0,\n", + " 'CATGCAGCTGGCTACCATGA': 0,\n", + " 'CATGCAGCTGGCTAGCATGA': 0,\n", + " 'GATGCAGCTGGCTAACATGA': 0,\n", + " 'GATGCAGCTGGCTACCATGA': 0,\n", + " 'GATGCAGCTGGCTAGCATGA': 0,\n", + " 'TATGCAGCTGGCTAACATGA': 0,\n", + " 'TATGCAGCTGGCTACCATGA': 0,\n", + " 'TATGCAGCTGGCTAGCATGA': 0,\n", + " 'AATGCAGCTGGCTATAATGA': 0,\n", + " 'AATGCAGCTGGCTATGATGA': 0,\n", + " 'AATGCAGCTGGCTATTATGA': 0,\n", + " 'CATGCAGCTGGCTATAATGA': 0,\n", + " 'CATGCAGCTGGCTATGATGA': 0,\n", + " 'CATGCAGCTGGCTATTATGA': 0,\n", + " 'GATGCAGCTGGCTATAATGA': 0,\n", + " 'GATGCAGCTGGCTATGATGA': 0,\n", + " 'GATGCAGCTGGCTATTATGA': 0,\n", + " 'TATGCAGCTGGCTATAATGA': 0,\n", + " 'TATGCAGCTGGCTATGATGA': 0,\n", + " 'TATGCAGCTGGCTATTATGA': 0,\n", + " 'AATGCAGCTGGCTATCCTGA': 0,\n", + " 'AATGCAGCTGGCTATCGTGA': 0,\n", + " 'AATGCAGCTGGCTATCTTGA': 0,\n", + " 'CATGCAGCTGGCTATCCTGA': 0,\n", + " 'CATGCAGCTGGCTATCGTGA': 0,\n", + " 'CATGCAGCTGGCTATCTTGA': 0,\n", + " 'GATGCAGCTGGCTATCCTGA': 0,\n", + " 'GATGCAGCTGGCTATCGTGA': 0,\n", + " 'GATGCAGCTGGCTATCTTGA': 0,\n", + " 'TATGCAGCTGGCTATCCTGA': 0,\n", + " 'TATGCAGCTGGCTATCGTGA': 0,\n", + " 'TATGCAGCTGGCTATCTTGA': 0,\n", + " 'AATGCAGCTGGCTATCAAGA': 0,\n", + " 'AATGCAGCTGGCTATCACGA': 0,\n", + " 'AATGCAGCTGGCTATCAGGA': 0,\n", + " 'CATGCAGCTGGCTATCAAGA': 0,\n", + " 'CATGCAGCTGGCTATCACGA': 0,\n", + " 'CATGCAGCTGGCTATCAGGA': 0,\n", + " 'GATGCAGCTGGCTATCAAGA': 0,\n", + " 'GATGCAGCTGGCTATCACGA': 0,\n", + " 'GATGCAGCTGGCTATCAGGA': 0,\n", + " 'TATGCAGCTGGCTATCAAGA': 0,\n", + " 'TATGCAGCTGGCTATCACGA': 0,\n", + " 'TATGCAGCTGGCTATCAGGA': 0,\n", + " 'AATGCAGCTGGCTATCATAA': 0,\n", + " 'AATGCAGCTGGCTATCATCA': 0,\n", + " 'AATGCAGCTGGCTATCATTA': 0,\n", + " 
'CATGCAGCTGGCTATCATAA': 0,\n", + " 'CATGCAGCTGGCTATCATCA': 0,\n", + " 'CATGCAGCTGGCTATCATTA': 0,\n", + " 'GATGCAGCTGGCTATCATAA': 0,\n", + " 'GATGCAGCTGGCTATCATCA': 0,\n", + " 'GATGCAGCTGGCTATCATTA': 0,\n", + " 'TATGCAGCTGGCTATCATAA': 0,\n", + " 'TATGCAGCTGGCTATCATCA': 0,\n", + " 'TATGCAGCTGGCTATCATTA': 0,\n", + " 'AATGCAGCTGGCTATCATGC': 0,\n", + " 'AATGCAGCTGGCTATCATGG': 0,\n", + " 'AATGCAGCTGGCTATCATGT': 0,\n", + " 'CATGCAGCTGGCTATCATGC': 0,\n", + " 'CATGCAGCTGGCTATCATGG': 0,\n", + " 'CATGCAGCTGGCTATCATGT': 0,\n", + " 'GATGCAGCTGGCTATCATGC': 0,\n", + " 'GATGCAGCTGGCTATCATGG': 0,\n", + " 'GATGCAGCTGGCTATCATGT': 0,\n", + " 'TATGCAGCTGGCTATCATGC': 0,\n", + " 'TATGCAGCTGGCTATCATGG': 0,\n", + " 'TATGCAGCTGGCTATCATGT': 0,\n", + " 'GACTGCAGCTGGCTATCATGA': 0,\n", + " 'GAGTGCAGCTGGCTATCATGA': 0,\n", + " 'GATTGCAGCTGGCTATCATGA': 0,\n", + " 'GCCTGCAGCTGGCTATCATGA': 0,\n", + " 'GCGTGCAGCTGGCTATCATGA': 0,\n", + " 'GCTTGCAGCTGGCTATCATGA': 0,\n", + " 'GGCTGCAGCTGGCTATCATGA': 0,\n", + " 'GGGTGCAGCTGGCTATCATGA': 0,\n", + " 'GGTTGCAGCTGGCTATCATGA': 0,\n", + " 'GTCTGCAGCTGGCTATCATGA': 0,\n", + " 'GTGTGCAGCTGGCTATCATGA': 0,\n", + " 'GTTTGCAGCTGGCTATCATGA': 0,\n", + " 'GCAGCAGCTGGCTATCATGA': 0,\n", + " 'GCCGCAGCTGGCTATCATGA': 0,\n", + " 'GCGGCAGCTGGCTATCATGA': 0,\n", + " 'GGAGCAGCTGGCTATCATGA': 0,\n", + " 'GGCGCAGCTGGCTATCATGA': 0,\n", + " 'GGGGCAGCTGGCTATCATGA': 0,\n", + " 'GTAGCAGCTGGCTATCATGA': 0,\n", + " 'GTCGCAGCTGGCTATCATGA': 0,\n", + " 'GTGGCAGCTGGCTATCATGA': 0,\n", + " 'GCTACAGCTGGCTATCATGA': 0,\n", + " 'GCTCCAGCTGGCTATCATGA': 0,\n", + " 'GCTTCAGCTGGCTATCATGA': 0,\n", + " 'GGTACAGCTGGCTATCATGA': 0,\n", + " 'GGTCCAGCTGGCTATCATGA': 0,\n", + " 'GGTTCAGCTGGCTATCATGA': 0,\n", + " 'GTTACAGCTGGCTATCATGA': 0,\n", + " 'GTTCCAGCTGGCTATCATGA': 0,\n", + " 'GTTTCAGCTGGCTATCATGA': 0,\n", + " 'GCTGAAGCTGGCTATCATGA': 0,\n", + " 'GCTGGAGCTGGCTATCATGA': 0,\n", + " 'GCTGTAGCTGGCTATCATGA': 0,\n", + " 'GGTGAAGCTGGCTATCATGA': 0,\n", + " 'GGTGGAGCTGGCTATCATGA': 0,\n", + " 'GGTGTAGCTGGCTATCATGA': 
0,\n", + " 'GTTGAAGCTGGCTATCATGA': 0,\n", + " 'GTTGGAGCTGGCTATCATGA': 0,\n", + " 'GTTGTAGCTGGCTATCATGA': 0,\n", + " 'GCTGCCGCTGGCTATCATGA': 0,\n", + " 'GCTGCGGCTGGCTATCATGA': 0,\n", + " 'GCTGCTGCTGGCTATCATGA': 0,\n", + " 'GGTGCCGCTGGCTATCATGA': 0,\n", + " 'GGTGCGGCTGGCTATCATGA': 0,\n", + " 'GGTGCTGCTGGCTATCATGA': 0,\n", + " 'GTTGCCGCTGGCTATCATGA': 0,\n", + " 'GTTGCGGCTGGCTATCATGA': 0,\n", + " 'GTTGCTGCTGGCTATCATGA': 0,\n", + " 'GCTGCAACTGGCTATCATGA': 0,\n", + " 'GCTGCACCTGGCTATCATGA': 0,\n", + " 'GCTGCATCTGGCTATCATGA': 0,\n", + " 'GGTGCAACTGGCTATCATGA': 0,\n", + " 'GGTGCACCTGGCTATCATGA': 0,\n", + " 'GGTGCATCTGGCTATCATGA': 0,\n", + " 'GTTGCAACTGGCTATCATGA': 0,\n", + " 'GTTGCACCTGGCTATCATGA': 0,\n", + " 'GTTGCATCTGGCTATCATGA': 0,\n", + " 'GCTGCAGATGGCTATCATGA': 0,\n", + " 'GCTGCAGGTGGCTATCATGA': 0,\n", + " 'GCTGCAGTTGGCTATCATGA': 0,\n", + " 'GGTGCAGATGGCTATCATGA': 0,\n", + " 'GGTGCAGGTGGCTATCATGA': 0,\n", + " 'GGTGCAGTTGGCTATCATGA': 0,\n", + " 'GTTGCAGATGGCTATCATGA': 0,\n", + " 'GTTGCAGGTGGCTATCATGA': 0,\n", + " 'GTTGCAGTTGGCTATCATGA': 0,\n", + " 'GCTGCAGCAGGCTATCATGA': 0,\n", + " 'GCTGCAGCCGGCTATCATGA': 0,\n", + " 'GCTGCAGCGGGCTATCATGA': 0,\n", + " 'GGTGCAGCAGGCTATCATGA': 0,\n", + " 'GGTGCAGCCGGCTATCATGA': 0,\n", + " 'GGTGCAGCGGGCTATCATGA': 0,\n", + " 'GTTGCAGCAGGCTATCATGA': 0,\n", + " 'GTTGCAGCCGGCTATCATGA': 0,\n", + " 'GTTGCAGCGGGCTATCATGA': 0,\n", + " 'GCTGCAGCTAGCTATCATGA': 0,\n", + " 'GCTGCAGCTCGCTATCATGA': 0,\n", + " 'GCTGCAGCTTGCTATCATGA': 0,\n", + " 'GGTGCAGCTAGCTATCATGA': 0,\n", + " 'GGTGCAGCTCGCTATCATGA': 0,\n", + " 'GGTGCAGCTTGCTATCATGA': 0,\n", + " 'GTTGCAGCTAGCTATCATGA': 0,\n", + " 'GTTGCAGCTCGCTATCATGA': 0,\n", + " 'GTTGCAGCTTGCTATCATGA': 0,\n", + " 'GCTGCAGCTGACTATCATGA': 0,\n", + " 'GCTGCAGCTGCCTATCATGA': 0,\n", + " 'GCTGCAGCTGTCTATCATGA': 0,\n", + " 'GGTGCAGCTGACTATCATGA': 0,\n", + " 'GGTGCAGCTGCCTATCATGA': 0,\n", + " 'GGTGCAGCTGTCTATCATGA': 0,\n", + " 'GTTGCAGCTGACTATCATGA': 0,\n", + " 'GTTGCAGCTGCCTATCATGA': 0,\n", + " 'GTTGCAGCTGTCTATCATGA': 
0,\n", + " 'GCTGCAGCTGGATATCATGA': 0,\n", + " 'GCTGCAGCTGGGTATCATGA': 0,\n", + " 'GCTGCAGCTGGTTATCATGA': 0,\n", + " 'GGTGCAGCTGGATATCATGA': 0,\n", + " 'GGTGCAGCTGGGTATCATGA': 0,\n", + " 'GGTGCAGCTGGTTATCATGA': 0,\n", + " 'GTTGCAGCTGGATATCATGA': 0,\n", + " 'GTTGCAGCTGGGTATCATGA': 0,\n", + " 'GTTGCAGCTGGTTATCATGA': 0,\n", + " 'GCTGCAGCTGGCAATCATGA': 0,\n", + " 'GCTGCAGCTGGCCATCATGA': 0,\n", + " 'GCTGCAGCTGGCGATCATGA': 0,\n", + " 'GGTGCAGCTGGCAATCATGA': 0,\n", + " 'GGTGCAGCTGGCCATCATGA': 0,\n", + " 'GGTGCAGCTGGCGATCATGA': 0,\n", + " 'GTTGCAGCTGGCAATCATGA': 0,\n", + " 'GTTGCAGCTGGCCATCATGA': 0,\n", + " 'GTTGCAGCTGGCGATCATGA': 0,\n", + " 'GCTGCAGCTGGCTCTCATGA': 0,\n", + " 'GCTGCAGCTGGCTGTCATGA': 0,\n", + " 'GCTGCAGCTGGCTTTCATGA': 0,\n", + " 'GGTGCAGCTGGCTCTCATGA': 0,\n", + " 'GGTGCAGCTGGCTGTCATGA': 0,\n", + " 'GGTGCAGCTGGCTTTCATGA': 0,\n", + " 'GTTGCAGCTGGCTCTCATGA': 0,\n", + " 'GTTGCAGCTGGCTGTCATGA': 0,\n", + " 'GTTGCAGCTGGCTTTCATGA': 0,\n", + " 'GCTGCAGCTGGCTAACATGA': 0,\n", + " 'GCTGCAGCTGGCTACCATGA': 0,\n", + " 'GCTGCAGCTGGCTAGCATGA': 0,\n", + " 'GGTGCAGCTGGCTAACATGA': 0,\n", + " 'GGTGCAGCTGGCTACCATGA': 0,\n", + " 'GGTGCAGCTGGCTAGCATGA': 0,\n", + " 'GTTGCAGCTGGCTAACATGA': 0,\n", + " 'GTTGCAGCTGGCTACCATGA': 0,\n", + " 'GTTGCAGCTGGCTAGCATGA': 0,\n", + " 'GCTGCAGCTGGCTATAATGA': 0,\n", + " 'GCTGCAGCTGGCTATGATGA': 0,\n", + " 'GCTGCAGCTGGCTATTATGA': 0,\n", + " 'GGTGCAGCTGGCTATAATGA': 0,\n", + " 'GGTGCAGCTGGCTATGATGA': 0,\n", + " 'GGTGCAGCTGGCTATTATGA': 0,\n", + " 'GTTGCAGCTGGCTATAATGA': 0,\n", + " 'GTTGCAGCTGGCTATGATGA': 0,\n", + " 'GTTGCAGCTGGCTATTATGA': 0,\n", + " 'GCTGCAGCTGGCTATCCTGA': 0,\n", + " 'GCTGCAGCTGGCTATCGTGA': 0,\n", + " 'GCTGCAGCTGGCTATCTTGA': 0,\n", + " 'GGTGCAGCTGGCTATCCTGA': 0,\n", + " 'GGTGCAGCTGGCTATCGTGA': 0,\n", + " 'GGTGCAGCTGGCTATCTTGA': 0,\n", + " 'GTTGCAGCTGGCTATCCTGA': 0,\n", + " 'GTTGCAGCTGGCTATCGTGA': 0,\n", + " 'GTTGCAGCTGGCTATCTTGA': 0,\n", + " 'GCTGCAGCTGGCTATCAAGA': 0,\n", + " 'GCTGCAGCTGGCTATCACGA': 0,\n", + " 'GCTGCAGCTGGCTATCAGGA': 
0,\n", + " 'GGTGCAGCTGGCTATCAAGA': 0,\n", + " 'GGTGCAGCTGGCTATCACGA': 0,\n", + " 'GGTGCAGCTGGCTATCAGGA': 0,\n", + " 'GTTGCAGCTGGCTATCAAGA': 0,\n", + " 'GTTGCAGCTGGCTATCACGA': 0,\n", + " 'GTTGCAGCTGGCTATCAGGA': 0,\n", + " 'GCTGCAGCTGGCTATCATAA': 0,\n", + " 'GCTGCAGCTGGCTATCATCA': 0,\n", + " 'GCTGCAGCTGGCTATCATTA': 0,\n", + " 'GGTGCAGCTGGCTATCATAA': 0,\n", + " 'GGTGCAGCTGGCTATCATCA': 0,\n", + " 'GGTGCAGCTGGCTATCATTA': 0,\n", + " 'GTTGCAGCTGGCTATCATAA': 0,\n", + " 'GTTGCAGCTGGCTATCATCA': 0,\n", + " 'GTTGCAGCTGGCTATCATTA': 0,\n", + " 'GCTGCAGCTGGCTATCATGC': 0,\n", + " 'GCTGCAGCTGGCTATCATGG': 0,\n", + " 'GCTGCAGCTGGCTATCATGT': 0,\n", + " 'GGTGCAGCTGGCTATCATGC': 0,\n", + " 'GGTGCAGCTGGCTATCATGG': 0,\n", + " 'GGTGCAGCTGGCTATCATGT': 0,\n", + " 'GTTGCAGCTGGCTATCATGC': 0,\n", + " 'GTTGCAGCTGGCTATCATGG': 0,\n", + " 'GTTGCAGCTGGCTATCATGT': 0,\n", + " 'GAAAGCAGCTGGCTATCATGA': 0,\n", + " 'GAACGCAGCTGGCTATCATGA': 0,\n", + " 'GAAGGCAGCTGGCTATCATGA': 0,\n", + " 'GACAGCAGCTGGCTATCATGA': 0,\n", + " 'GACCGCAGCTGGCTATCATGA': 0,\n", + " 'GACGGCAGCTGGCTATCATGA': 0,\n", + " 'GAGAGCAGCTGGCTATCATGA': 0,\n", + " 'GAGCGCAGCTGGCTATCATGA': 0,\n", + " 'GAGGGCAGCTGGCTATCATGA': 0,\n", + " 'GATAGCAGCTGGCTATCATGA': 0,\n", + " 'GATCGCAGCTGGCTATCATGA': 0,\n", + " 'GATGGCAGCTGGCTATCATGA': 0,\n", + " 'GAAACAGCTGGCTATCATGA': 0,\n", + " 'GAACCAGCTGGCTATCATGA': 0,\n", + " 'GAATCAGCTGGCTATCATGA': 0,\n", + " 'GACACAGCTGGCTATCATGA': 0,\n", + " 'GACCCAGCTGGCTATCATGA': 0,\n", + " 'GACTCAGCTGGCTATCATGA': 0,\n", + " 'GAGACAGCTGGCTATCATGA': 0,\n", + " 'GAGCCAGCTGGCTATCATGA': 0,\n", + " 'GAGTCAGCTGGCTATCATGA': 0,\n", + " 'GAAGAAGCTGGCTATCATGA': 0,\n", + " 'GAAGGAGCTGGCTATCATGA': 0,\n", + " 'GAAGTAGCTGGCTATCATGA': 0,\n", + " 'GACGAAGCTGGCTATCATGA': 0,\n", + " 'GACGGAGCTGGCTATCATGA': 0,\n", + " 'GACGTAGCTGGCTATCATGA': 0,\n", + " 'GAGGAAGCTGGCTATCATGA': 0,\n", + " 'GAGGGAGCTGGCTATCATGA': 0,\n", + " 'GAGGTAGCTGGCTATCATGA': 0,\n", + " 'GAAGCCGCTGGCTATCATGA': 0,\n", + " 'GAAGCGGCTGGCTATCATGA': 0,\n", + " 
'GAAGCTGCTGGCTATCATGA': 0,\n", + " 'GACGCCGCTGGCTATCATGA': 0,\n", + " 'GACGCGGCTGGCTATCATGA': 0,\n", + " 'GACGCTGCTGGCTATCATGA': 0,\n", + " 'GAGGCCGCTGGCTATCATGA': 0,\n", + " 'GAGGCGGCTGGCTATCATGA': 0,\n", + " 'GAGGCTGCTGGCTATCATGA': 0,\n", + " 'GAAGCAACTGGCTATCATGA': 0,\n", + " 'GAAGCACCTGGCTATCATGA': 0,\n", + " 'GAAGCATCTGGCTATCATGA': 0,\n", + " 'GACGCAACTGGCTATCATGA': 0,\n", + " 'GACGCACCTGGCTATCATGA': 0,\n", + " 'GACGCATCTGGCTATCATGA': 0,\n", + " 'GAGGCAACTGGCTATCATGA': 0,\n", + " 'GAGGCACCTGGCTATCATGA': 0,\n", + " 'GAGGCATCTGGCTATCATGA': 0,\n", + " 'GAAGCAGATGGCTATCATGA': 0,\n", + " 'GAAGCAGGTGGCTATCATGA': 0,\n", + " 'GAAGCAGTTGGCTATCATGA': 0,\n", + " 'GACGCAGATGGCTATCATGA': 0,\n", + " 'GACGCAGGTGGCTATCATGA': 0,\n", + " 'GACGCAGTTGGCTATCATGA': 0,\n", + " 'GAGGCAGATGGCTATCATGA': 0,\n", + " 'GAGGCAGGTGGCTATCATGA': 0,\n", + " 'GAGGCAGTTGGCTATCATGA': 0,\n", + " 'GAAGCAGCAGGCTATCATGA': 0,\n", + " 'GAAGCAGCCGGCTATCATGA': 0,\n", + " 'GAAGCAGCGGGCTATCATGA': 0,\n", + " 'GACGCAGCAGGCTATCATGA': 0,\n", + " 'GACGCAGCCGGCTATCATGA': 0,\n", + " 'GACGCAGCGGGCTATCATGA': 0,\n", + " 'GAGGCAGCAGGCTATCATGA': 0,\n", + " 'GAGGCAGCCGGCTATCATGA': 0,\n", + " 'GAGGCAGCGGGCTATCATGA': 0,\n", + " 'GAAGCAGCTAGCTATCATGA': 0,\n", + " 'GAAGCAGCTCGCTATCATGA': 0,\n", + " 'GAAGCAGCTTGCTATCATGA': 0,\n", + " 'GACGCAGCTAGCTATCATGA': 0,\n", + " 'GACGCAGCTCGCTATCATGA': 0,\n", + " 'GACGCAGCTTGCTATCATGA': 0,\n", + " 'GAGGCAGCTAGCTATCATGA': 0,\n", + " 'GAGGCAGCTCGCTATCATGA': 0,\n", + " 'GAGGCAGCTTGCTATCATGA': 0,\n", + " 'GAAGCAGCTGACTATCATGA': 0,\n", + " 'GAAGCAGCTGCCTATCATGA': 0,\n", + " 'GAAGCAGCTGTCTATCATGA': 0,\n", + " 'GACGCAGCTGACTATCATGA': 0,\n", + " 'GACGCAGCTGCCTATCATGA': 0,\n", + " 'GACGCAGCTGTCTATCATGA': 0,\n", + " 'GAGGCAGCTGACTATCATGA': 0,\n", + " 'GAGGCAGCTGCCTATCATGA': 0,\n", + " 'GAGGCAGCTGTCTATCATGA': 0,\n", + " 'GAAGCAGCTGGATATCATGA': 0,\n", + " 'GAAGCAGCTGGGTATCATGA': 0,\n", + " 'GAAGCAGCTGGTTATCATGA': 0,\n", + " 'GACGCAGCTGGATATCATGA': 0,\n", + " 'GACGCAGCTGGGTATCATGA': 0,\n", + " 
'GACGCAGCTGGTTATCATGA': 0,\n", + " 'GAGGCAGCTGGATATCATGA': 0,\n", + " 'GAGGCAGCTGGGTATCATGA': 0,\n", + " 'GAGGCAGCTGGTTATCATGA': 0,\n", + " 'GAAGCAGCTGGCAATCATGA': 0,\n", + " 'GAAGCAGCTGGCCATCATGA': 0,\n", + " 'GAAGCAGCTGGCGATCATGA': 0,\n", + " 'GACGCAGCTGGCAATCATGA': 0,\n", + " 'GACGCAGCTGGCCATCATGA': 0,\n", + " 'GACGCAGCTGGCGATCATGA': 0,\n", + " 'GAGGCAGCTGGCAATCATGA': 0,\n", + " 'GAGGCAGCTGGCCATCATGA': 0,\n", + " 'GAGGCAGCTGGCGATCATGA': 0,\n", + " 'GAAGCAGCTGGCTCTCATGA': 0,\n", + " 'GAAGCAGCTGGCTGTCATGA': 0,\n", + " 'GAAGCAGCTGGCTTTCATGA': 0,\n", + " 'GACGCAGCTGGCTCTCATGA': 0,\n", + " 'GACGCAGCTGGCTGTCATGA': 0,\n", + " 'GACGCAGCTGGCTTTCATGA': 0,\n", + " 'GAGGCAGCTGGCTCTCATGA': 0,\n", + " 'GAGGCAGCTGGCTGTCATGA': 0,\n", + " 'GAGGCAGCTGGCTTTCATGA': 0,\n", + " 'GAAGCAGCTGGCTAACATGA': 0,\n", + " 'GAAGCAGCTGGCTACCATGA': 0,\n", + " 'GAAGCAGCTGGCTAGCATGA': 0,\n", + " 'GACGCAGCTGGCTAACATGA': 0,\n", + " 'GACGCAGCTGGCTACCATGA': 0,\n", + " 'GACGCAGCTGGCTAGCATGA': 0,\n", + " 'GAGGCAGCTGGCTAACATGA': 0,\n", + " 'GAGGCAGCTGGCTACCATGA': 0,\n", + " 'GAGGCAGCTGGCTAGCATGA': 0,\n", + " 'GAAGCAGCTGGCTATAATGA': 0,\n", + " 'GAAGCAGCTGGCTATGATGA': 0,\n", + " 'GAAGCAGCTGGCTATTATGA': 0,\n", + " 'GACGCAGCTGGCTATAATGA': 0,\n", + " 'GACGCAGCTGGCTATGATGA': 0,\n", + " 'GACGCAGCTGGCTATTATGA': 0,\n", + " 'GAGGCAGCTGGCTATAATGA': 0,\n", + " 'GAGGCAGCTGGCTATGATGA': 0,\n", + " 'GAGGCAGCTGGCTATTATGA': 0,\n", + " 'GAAGCAGCTGGCTATCCTGA': 0,\n", + " 'GAAGCAGCTGGCTATCGTGA': 0,\n", + " 'GAAGCAGCTGGCTATCTTGA': 0,\n", + " 'GACGCAGCTGGCTATCCTGA': 0,\n", + " 'GACGCAGCTGGCTATCGTGA': 0,\n", + " 'GACGCAGCTGGCTATCTTGA': 0,\n", + " 'GAGGCAGCTGGCTATCCTGA': 0,\n", + " 'GAGGCAGCTGGCTATCGTGA': 0,\n", + " 'GAGGCAGCTGGCTATCTTGA': 0,\n", + " 'GAAGCAGCTGGCTATCAAGA': 0,\n", + " 'GAAGCAGCTGGCTATCACGA': 0,\n", + " 'GAAGCAGCTGGCTATCAGGA': 0,\n", + " 'GACGCAGCTGGCTATCAAGA': 0,\n", + " 'GACGCAGCTGGCTATCACGA': 0,\n", + " 'GACGCAGCTGGCTATCAGGA': 0,\n", + " 'GAGGCAGCTGGCTATCAAGA': 0,\n", + " 'GAGGCAGCTGGCTATCACGA': 0,\n", + " 
'GAGGCAGCTGGCTATCAGGA': 0,\n", + " 'GAAGCAGCTGGCTATCATAA': 0,\n", + " 'GAAGCAGCTGGCTATCATCA': 0,\n", + " 'GAAGCAGCTGGCTATCATTA': 0,\n", + " 'GACGCAGCTGGCTATCATAA': 0,\n", + " 'GACGCAGCTGGCTATCATCA': 0,\n", + " 'GACGCAGCTGGCTATCATTA': 0,\n", + " 'GAGGCAGCTGGCTATCATAA': 0,\n", + " 'GAGGCAGCTGGCTATCATCA': 0,\n", + " 'GAGGCAGCTGGCTATCATTA': 0,\n", + " 'GAAGCAGCTGGCTATCATGC': 0,\n", + " 'GAAGCAGCTGGCTATCATGG': 0,\n", + " 'GAAGCAGCTGGCTATCATGT': 0,\n", + " 'GACGCAGCTGGCTATCATGC': 0,\n", + " 'GACGCAGCTGGCTATCATGG': 0,\n", + " 'GACGCAGCTGGCTATCATGT': 0,\n", + " 'GAGGCAGCTGGCTATCATGC': 0,\n", + " 'GAGGCAGCTGGCTATCATGG': 0,\n", + " 'GAGGCAGCTGGCTATCATGT': 0,\n", + " 'GATAACAGCTGGCTATCATGA': 0,\n", + " 'GATACCAGCTGGCTATCATGA': 0,\n", + " 'GATATCAGCTGGCTATCATGA': 0,\n", + " 'GATCACAGCTGGCTATCATGA': 0,\n", + " 'GATCCCAGCTGGCTATCATGA': 0,\n", + " 'GATCTCAGCTGGCTATCATGA': 0,\n", + " 'GATGACAGCTGGCTATCATGA': 0,\n", + " 'GATGCCAGCTGGCTATCATGA': 0,\n", + " 'GATGTCAGCTGGCTATCATGA': 0,\n", + " 'GATTACAGCTGGCTATCATGA': 0,\n", + " 'GATTCCAGCTGGCTATCATGA': 0,\n", + " 'GATTTCAGCTGGCTATCATGA': 0,\n", + " 'GATAAAGCTGGCTATCATGA': 0,\n", + " 'GATAGAGCTGGCTATCATGA': 0,\n", + " 'GATATAGCTGGCTATCATGA': 0,\n", + " 'GATCAAGCTGGCTATCATGA': 0,\n", + " 'GATCGAGCTGGCTATCATGA': 0,\n", + " 'GATCTAGCTGGCTATCATGA': 0,\n", + " 'GATTAAGCTGGCTATCATGA': 0,\n", + " 'GATTGAGCTGGCTATCATGA': 0,\n", + " 'GATTTAGCTGGCTATCATGA': 0,\n", + " 'GATACCGCTGGCTATCATGA': 0,\n", + " 'GATACGGCTGGCTATCATGA': 0,\n", + " 'GATACTGCTGGCTATCATGA': 0,\n", + " 'GATCCCGCTGGCTATCATGA': 0,\n", + " 'GATCCGGCTGGCTATCATGA': 0,\n", + " 'GATCCTGCTGGCTATCATGA': 0,\n", + " 'GATTCCGCTGGCTATCATGA': 0,\n", + " 'GATTCGGCTGGCTATCATGA': 0,\n", + " 'GATTCTGCTGGCTATCATGA': 0,\n", + " 'GATACAACTGGCTATCATGA': 0,\n", + " 'GATACACCTGGCTATCATGA': 0,\n", + " 'GATACATCTGGCTATCATGA': 0,\n", + " 'GATCCAACTGGCTATCATGA': 0,\n", + " 'GATCCACCTGGCTATCATGA': 0,\n", + " 'GATCCATCTGGCTATCATGA': 0,\n", + " 'GATTCAACTGGCTATCATGA': 0,\n", + " 'GATTCACCTGGCTATCATGA': 
0,\n", + " 'GATTCATCTGGCTATCATGA': 0,\n", + " 'GATACAGATGGCTATCATGA': 0,\n", + " 'GATACAGGTGGCTATCATGA': 0,\n", + " 'GATACAGTTGGCTATCATGA': 0,\n", + " 'GATCCAGATGGCTATCATGA': 0,\n", + " 'GATCCAGGTGGCTATCATGA': 0,\n", + " 'GATCCAGTTGGCTATCATGA': 0,\n", + " 'GATTCAGATGGCTATCATGA': 0,\n", + " 'GATTCAGGTGGCTATCATGA': 0,\n", + " 'GATTCAGTTGGCTATCATGA': 0,\n", + " 'GATACAGCAGGCTATCATGA': 0,\n", + " 'GATACAGCCGGCTATCATGA': 0,\n", + " 'GATACAGCGGGCTATCATGA': 0,\n", + " 'GATCCAGCAGGCTATCATGA': 0,\n", + " 'GATCCAGCCGGCTATCATGA': 0,\n", + " 'GATCCAGCGGGCTATCATGA': 0,\n", + " 'GATTCAGCAGGCTATCATGA': 0,\n", + " 'GATTCAGCCGGCTATCATGA': 0,\n", + " 'GATTCAGCGGGCTATCATGA': 0,\n", + " 'GATACAGCTAGCTATCATGA': 0,\n", + " 'GATACAGCTCGCTATCATGA': 0,\n", + " 'GATACAGCTTGCTATCATGA': 0,\n", + " 'GATCCAGCTAGCTATCATGA': 0,\n", + " 'GATCCAGCTCGCTATCATGA': 0,\n", + " 'GATCCAGCTTGCTATCATGA': 0,\n", + " 'GATTCAGCTAGCTATCATGA': 0,\n", + " 'GATTCAGCTCGCTATCATGA': 0,\n", + " 'GATTCAGCTTGCTATCATGA': 0,\n", + " 'GATACAGCTGACTATCATGA': 0,\n", + " 'GATACAGCTGCCTATCATGA': 0,\n", + " 'GATACAGCTGTCTATCATGA': 0,\n", + " 'GATCCAGCTGACTATCATGA': 0,\n", + " 'GATCCAGCTGCCTATCATGA': 0,\n", + " 'GATCCAGCTGTCTATCATGA': 0,\n", + " 'GATTCAGCTGACTATCATGA': 0,\n", + " 'GATTCAGCTGCCTATCATGA': 0,\n", + " 'GATTCAGCTGTCTATCATGA': 0,\n", + " 'GATACAGCTGGATATCATGA': 0,\n", + " 'GATACAGCTGGGTATCATGA': 0,\n", + " 'GATACAGCTGGTTATCATGA': 0,\n", + " 'GATCCAGCTGGATATCATGA': 0,\n", + " 'GATCCAGCTGGGTATCATGA': 0,\n", + " 'GATCCAGCTGGTTATCATGA': 0,\n", + " 'GATTCAGCTGGATATCATGA': 0,\n", + " 'GATTCAGCTGGGTATCATGA': 0,\n", + " 'GATTCAGCTGGTTATCATGA': 0,\n", + " 'GATACAGCTGGCAATCATGA': 0,\n", + " 'GATACAGCTGGCCATCATGA': 0,\n", + " 'GATACAGCTGGCGATCATGA': 0,\n", + " 'GATCCAGCTGGCAATCATGA': 0,\n", + " 'GATCCAGCTGGCCATCATGA': 0,\n", + " 'GATCCAGCTGGCGATCATGA': 0,\n", + " 'GATTCAGCTGGCAATCATGA': 0,\n", + " 'GATTCAGCTGGCCATCATGA': 0,\n", + " 'GATTCAGCTGGCGATCATGA': 0,\n", + " 'GATACAGCTGGCTCTCATGA': 0,\n", + " 'GATACAGCTGGCTGTCATGA': 
0,\n", + " 'GATACAGCTGGCTTTCATGA': 0,\n", + " 'GATCCAGCTGGCTCTCATGA': 0,\n", + " 'GATCCAGCTGGCTGTCATGA': 0,\n", + " 'GATCCAGCTGGCTTTCATGA': 0,\n", + " 'GATTCAGCTGGCTCTCATGA': 0,\n", + " 'GATTCAGCTGGCTGTCATGA': 0,\n", + " 'GATTCAGCTGGCTTTCATGA': 0,\n", + " 'GATACAGCTGGCTAACATGA': 0,\n", + " 'GATACAGCTGGCTACCATGA': 0,\n", + " 'GATACAGCTGGCTAGCATGA': 0,\n", + " 'GATCCAGCTGGCTAACATGA': 0,\n", + " 'GATCCAGCTGGCTACCATGA': 0,\n", + " 'GATCCAGCTGGCTAGCATGA': 0,\n", + " 'GATTCAGCTGGCTAACATGA': 0,\n", + " 'GATTCAGCTGGCTACCATGA': 0,\n", + " 'GATTCAGCTGGCTAGCATGA': 0,\n", + " 'GATACAGCTGGCTATAATGA': 0,\n", + " 'GATACAGCTGGCTATGATGA': 0,\n", + " 'GATACAGCTGGCTATTATGA': 0,\n", + " 'GATCCAGCTGGCTATAATGA': 0,\n", + " 'GATCCAGCTGGCTATGATGA': 0,\n", + " 'GATCCAGCTGGCTATTATGA': 0,\n", + " 'GATTCAGCTGGCTATAATGA': 0,\n", + " 'GATTCAGCTGGCTATGATGA': 0,\n", + " 'GATTCAGCTGGCTATTATGA': 0,\n", + " 'GATACAGCTGGCTATCCTGA': 0,\n", + " 'GATACAGCTGGCTATCGTGA': 0,\n", + " 'GATACAGCTGGCTATCTTGA': 0,\n", + " 'GATCCAGCTGGCTATCCTGA': 0,\n", + " 'GATCCAGCTGGCTATCGTGA': 0,\n", + " 'GATCCAGCTGGCTATCTTGA': 0,\n", + " 'GATTCAGCTGGCTATCCTGA': 0,\n", + " 'GATTCAGCTGGCTATCGTGA': 0,\n", + " 'GATTCAGCTGGCTATCTTGA': 0,\n", + " 'GATACAGCTGGCTATCAAGA': 0,\n", + " 'GATACAGCTGGCTATCACGA': 0,\n", + " 'GATACAGCTGGCTATCAGGA': 0,\n", + " 'GATCCAGCTGGCTATCAAGA': 0,\n", + " 'GATCCAGCTGGCTATCACGA': 0,\n", + " 'GATCCAGCTGGCTATCAGGA': 0,\n", + " 'GATTCAGCTGGCTATCAAGA': 0,\n", + " 'GATTCAGCTGGCTATCACGA': 0,\n", + " 'GATTCAGCTGGCTATCAGGA': 0,\n", + " 'GATACAGCTGGCTATCATAA': 0,\n", + " 'GATACAGCTGGCTATCATCA': 0,\n", + " 'GATACAGCTGGCTATCATTA': 0,\n", + " 'GATCCAGCTGGCTATCATAA': 0,\n", + " 'GATCCAGCTGGCTATCATCA': 0,\n", + " 'GATCCAGCTGGCTATCATTA': 0,\n", + " 'GATTCAGCTGGCTATCATAA': 0,\n", + " 'GATTCAGCTGGCTATCATCA': 0,\n", + " 'GATTCAGCTGGCTATCATTA': 0,\n", + " 'GATACAGCTGGCTATCATGC': 0,\n", + " 'GATACAGCTGGCTATCATGG': 0,\n", + " 'GATACAGCTGGCTATCATGT': 0,\n", + " 'GATCCAGCTGGCTATCATGC': 0,\n", + " 'GATCCAGCTGGCTATCATGG': 
0,\n", + " 'GATCCAGCTGGCTATCATGT': 0,\n", + " 'GATTCAGCTGGCTATCATGC': 0,\n", + " 'GATTCAGCTGGCTATCATGG': 0,\n", + " 'GATTCAGCTGGCTATCATGT': 0,\n", + " 'GATGAAAGCTGGCTATCATGA': 0,\n", + " 'GATGAGAGCTGGCTATCATGA': 0,\n", + " 'GATGATAGCTGGCTATCATGA': 0,\n", + " 'GATGCAAGCTGGCTATCATGA': 0,\n", + " 'GATGCGAGCTGGCTATCATGA': 0,\n", + " 'GATGCTAGCTGGCTATCATGA': 0,\n", + " 'GATGGAAGCTGGCTATCATGA': 0,\n", + " 'GATGGGAGCTGGCTATCATGA': 0,\n", + " 'GATGGTAGCTGGCTATCATGA': 0,\n", + " 'GATGTAAGCTGGCTATCATGA': 0,\n", + " 'GATGTGAGCTGGCTATCATGA': 0,\n", + " 'GATGTTAGCTGGCTATCATGA': 0,\n", + " 'GATGACGCTGGCTATCATGA': 0,\n", + " 'GATGAGGCTGGCTATCATGA': 0,\n", + " 'GATGATGCTGGCTATCATGA': 0,\n", + " 'GATGGCGCTGGCTATCATGA': 0,\n", + " 'GATGGGGCTGGCTATCATGA': 0,\n", + " 'GATGGTGCTGGCTATCATGA': 0,\n", + " 'GATGTCGCTGGCTATCATGA': 0,\n", + " 'GATGTGGCTGGCTATCATGA': 0,\n", + " 'GATGTTGCTGGCTATCATGA': 0,\n", + " 'GATGAAACTGGCTATCATGA': 0,\n", + " 'GATGAACCTGGCTATCATGA': 0,\n", + " 'GATGAATCTGGCTATCATGA': 0,\n", + " 'GATGGAACTGGCTATCATGA': 0,\n", + " 'GATGGACCTGGCTATCATGA': 0,\n", + " 'GATGGATCTGGCTATCATGA': 0,\n", + " 'GATGTAACTGGCTATCATGA': 0,\n", + " 'GATGTACCTGGCTATCATGA': 0,\n", + " 'GATGTATCTGGCTATCATGA': 0,\n", + " 'GATGAAGATGGCTATCATGA': 0,\n", + " 'GATGAAGGTGGCTATCATGA': 0,\n", + " 'GATGAAGTTGGCTATCATGA': 0,\n", + " 'GATGGAGATGGCTATCATGA': 0,\n", + " 'GATGGAGGTGGCTATCATGA': 0,\n", + " 'GATGGAGTTGGCTATCATGA': 0,\n", + " 'GATGTAGATGGCTATCATGA': 0,\n", + " 'GATGTAGGTGGCTATCATGA': 0,\n", + " 'GATGTAGTTGGCTATCATGA': 0,\n", + " 'GATGAAGCAGGCTATCATGA': 0,\n", + " 'GATGAAGCCGGCTATCATGA': 0,\n", + " 'GATGAAGCGGGCTATCATGA': 0,\n", + " 'GATGGAGCAGGCTATCATGA': 0,\n", + " 'GATGGAGCCGGCTATCATGA': 0,\n", + " 'GATGGAGCGGGCTATCATGA': 0,\n", + " 'GATGTAGCAGGCTATCATGA': 0,\n", + " 'GATGTAGCCGGCTATCATGA': 0,\n", + " 'GATGTAGCGGGCTATCATGA': 0,\n", + " 'GATGAAGCTAGCTATCATGA': 0,\n", + " 'GATGAAGCTCGCTATCATGA': 0,\n", + " 'GATGAAGCTTGCTATCATGA': 0,\n", + " 'GATGGAGCTAGCTATCATGA': 0,\n", + " 
'GATGGAGCTCGCTATCATGA': 0,\n", + " 'GATGGAGCTTGCTATCATGA': 0,\n", + " 'GATGTAGCTAGCTATCATGA': 0,\n", + " 'GATGTAGCTCGCTATCATGA': 0,\n", + " 'GATGTAGCTTGCTATCATGA': 0,\n", + " 'GATGAAGCTGACTATCATGA': 0,\n", + " 'GATGAAGCTGCCTATCATGA': 0,\n", + " 'GATGAAGCTGTCTATCATGA': 0,\n", + " 'GATGGAGCTGACTATCATGA': 0,\n", + " 'GATGGAGCTGCCTATCATGA': 0,\n", + " 'GATGGAGCTGTCTATCATGA': 0,\n", + " 'GATGTAGCTGACTATCATGA': 0,\n", + " 'GATGTAGCTGCCTATCATGA': 0,\n", + " 'GATGTAGCTGTCTATCATGA': 0,\n", + " 'GATGAAGCTGGATATCATGA': 0,\n", + " 'GATGAAGCTGGGTATCATGA': 0,\n", + " 'GATGAAGCTGGTTATCATGA': 0,\n", + " 'GATGGAGCTGGATATCATGA': 0,\n", + " 'GATGGAGCTGGGTATCATGA': 0,\n", + " 'GATGGAGCTGGTTATCATGA': 0,\n", + " 'GATGTAGCTGGATATCATGA': 0,\n", + " 'GATGTAGCTGGGTATCATGA': 0,\n", + " 'GATGTAGCTGGTTATCATGA': 0,\n", + " 'GATGAAGCTGGCAATCATGA': 0,\n", + " 'GATGAAGCTGGCCATCATGA': 0,\n", + " 'GATGAAGCTGGCGATCATGA': 0,\n", + " 'GATGGAGCTGGCAATCATGA': 0,\n", + " 'GATGGAGCTGGCCATCATGA': 0,\n", + " 'GATGGAGCTGGCGATCATGA': 0,\n", + " 'GATGTAGCTGGCAATCATGA': 0,\n", + " 'GATGTAGCTGGCCATCATGA': 0,\n", + " 'GATGTAGCTGGCGATCATGA': 0,\n", + " 'GATGAAGCTGGCTCTCATGA': 0,\n", + " 'GATGAAGCTGGCTGTCATGA': 0,\n", + " 'GATGAAGCTGGCTTTCATGA': 0,\n", + " 'GATGGAGCTGGCTCTCATGA': 0,\n", + " 'GATGGAGCTGGCTGTCATGA': 0,\n", + " 'GATGGAGCTGGCTTTCATGA': 0,\n", + " 'GATGTAGCTGGCTCTCATGA': 0,\n", + " 'GATGTAGCTGGCTGTCATGA': 0,\n", + " 'GATGTAGCTGGCTTTCATGA': 0,\n", + " 'GATGAAGCTGGCTAACATGA': 0,\n", + " 'GATGAAGCTGGCTACCATGA': 0,\n", + " 'GATGAAGCTGGCTAGCATGA': 0,\n", + " 'GATGGAGCTGGCTAACATGA': 0,\n", + " 'GATGGAGCTGGCTACCATGA': 0,\n", + " 'GATGGAGCTGGCTAGCATGA': 0,\n", + " 'GATGTAGCTGGCTAACATGA': 0,\n", + " 'GATGTAGCTGGCTACCATGA': 0,\n", + " 'GATGTAGCTGGCTAGCATGA': 0,\n", + " 'GATGAAGCTGGCTATAATGA': 0,\n", + " 'GATGAAGCTGGCTATGATGA': 0,\n", + " 'GATGAAGCTGGCTATTATGA': 0,\n", + " 'GATGGAGCTGGCTATAATGA': 0,\n", + " 'GATGGAGCTGGCTATGATGA': 0,\n", + " 'GATGGAGCTGGCTATTATGA': 0,\n", + " 'GATGTAGCTGGCTATAATGA': 0,\n", + " 
'GATGTAGCTGGCTATGATGA': 0,\n", + " 'GATGTAGCTGGCTATTATGA': 0,\n", + " 'GATGAAGCTGGCTATCCTGA': 0,\n", + " 'GATGAAGCTGGCTATCGTGA': 0,\n", + " 'GATGAAGCTGGCTATCTTGA': 0,\n", + " 'GATGGAGCTGGCTATCCTGA': 0,\n", + " 'GATGGAGCTGGCTATCGTGA': 0,\n", + " 'GATGGAGCTGGCTATCTTGA': 0,\n", + " 'GATGTAGCTGGCTATCCTGA': 0,\n", + " 'GATGTAGCTGGCTATCGTGA': 0,\n", + " 'GATGTAGCTGGCTATCTTGA': 0,\n", + " 'GATGAAGCTGGCTATCAAGA': 0,\n", + " 'GATGAAGCTGGCTATCACGA': 0,\n", + " 'GATGAAGCTGGCTATCAGGA': 0,\n", + " 'GATGGAGCTGGCTATCAAGA': 0,\n", + " 'GATGGAGCTGGCTATCACGA': 0,\n", + " 'GATGGAGCTGGCTATCAGGA': 0,\n", + " 'GATGTAGCTGGCTATCAAGA': 0,\n", + " 'GATGTAGCTGGCTATCACGA': 0,\n", + " 'GATGTAGCTGGCTATCAGGA': 0,\n", + " 'GATGAAGCTGGCTATCATAA': 0,\n", + " 'GATGAAGCTGGCTATCATCA': 0,\n", + " 'GATGAAGCTGGCTATCATTA': 0,\n", + " 'GATGGAGCTGGCTATCATAA': 0,\n", + " 'GATGGAGCTGGCTATCATCA': 0,\n", + " 'GATGGAGCTGGCTATCATTA': 0,\n", + " 'GATGTAGCTGGCTATCATAA': 0,\n", + " 'GATGTAGCTGGCTATCATCA': 0,\n", + " 'GATGTAGCTGGCTATCATTA': 0,\n", + " 'GATGAAGCTGGCTATCATGC': 0,\n", + " 'GATGAAGCTGGCTATCATGG': 0,\n", + " 'GATGAAGCTGGCTATCATGT': 0,\n", + " 'GATGGAGCTGGCTATCATGC': 0,\n", + " 'GATGGAGCTGGCTATCATGG': 0,\n", + " 'GATGGAGCTGGCTATCATGT': 0,\n", + " 'GATGTAGCTGGCTATCATGC': 0,\n", + " 'GATGTAGCTGGCTATCATGG': 0,\n", + " 'GATGTAGCTGGCTATCATGT': 0,\n", + " 'GATGCACGCTGGCTATCATGA': 0,\n", + " 'GATGCAGGCTGGCTATCATGA': 0,\n", + " 'GATGCATGCTGGCTATCATGA': 0,\n", + " 'GATGCCCGCTGGCTATCATGA': 0,\n", + " 'GATGCCGGCTGGCTATCATGA': 0,\n", + " 'GATGCCTGCTGGCTATCATGA': 0,\n", + " 'GATGCGCGCTGGCTATCATGA': 0,\n", + " 'GATGCGGGCTGGCTATCATGA': 0,\n", + " 'GATGCGTGCTGGCTATCATGA': 0,\n", + " 'GATGCTCGCTGGCTATCATGA': 0,\n", + " 'GATGCTGGCTGGCTATCATGA': 0,\n", + " 'GATGCTTGCTGGCTATCATGA': 0,\n", + " 'GATGCCACTGGCTATCATGA': 0,\n", + " 'GATGCCCCTGGCTATCATGA': 0,\n", + " 'GATGCCTCTGGCTATCATGA': 0,\n", + " 'GATGCGACTGGCTATCATGA': 0,\n", + " 'GATGCGCCTGGCTATCATGA': 0,\n", + " 'GATGCGTCTGGCTATCATGA': 0,\n", + " 'GATGCTACTGGCTATCATGA': 
0,\n", + " 'GATGCTCCTGGCTATCATGA': 0,\n", + " 'GATGCTTCTGGCTATCATGA': 0,\n", + " 'GATGCCGATGGCTATCATGA': 0,\n", + " 'GATGCCGGTGGCTATCATGA': 0,\n", + " 'GATGCCGTTGGCTATCATGA': 0,\n", + " 'GATGCGGATGGCTATCATGA': 0,\n", + " 'GATGCGGGTGGCTATCATGA': 0,\n", + " 'GATGCGGTTGGCTATCATGA': 0,\n", + " 'GATGCTGATGGCTATCATGA': 0,\n", + " 'GATGCTGGTGGCTATCATGA': 0,\n", + " 'GATGCTGTTGGCTATCATGA': 0,\n", + " 'GATGCCGCAGGCTATCATGA': 0,\n", + " 'GATGCCGCCGGCTATCATGA': 0,\n", + " 'GATGCCGCGGGCTATCATGA': 0,\n", + " 'GATGCGGCAGGCTATCATGA': 0,\n", + " 'GATGCGGCCGGCTATCATGA': 0,\n", + " 'GATGCGGCGGGCTATCATGA': 0,\n", + " 'GATGCTGCAGGCTATCATGA': 0,\n", + " 'GATGCTGCCGGCTATCATGA': 0,\n", + " 'GATGCTGCGGGCTATCATGA': 0,\n", + " 'GATGCCGCTAGCTATCATGA': 0,\n", + " 'GATGCCGCTCGCTATCATGA': 0,\n", + " 'GATGCCGCTTGCTATCATGA': 0,\n", + " 'GATGCGGCTAGCTATCATGA': 0,\n", + " 'GATGCGGCTCGCTATCATGA': 0,\n", + " 'GATGCGGCTTGCTATCATGA': 0,\n", + " 'GATGCTGCTAGCTATCATGA': 0,\n", + " 'GATGCTGCTCGCTATCATGA': 0,\n", + " 'GATGCTGCTTGCTATCATGA': 0,\n", + " 'GATGCCGCTGACTATCATGA': 0,\n", + " 'GATGCCGCTGCCTATCATGA': 0,\n", + " 'GATGCCGCTGTCTATCATGA': 0,\n", + " 'GATGCGGCTGACTATCATGA': 0,\n", + " 'GATGCGGCTGCCTATCATGA': 0,\n", + " 'GATGCGGCTGTCTATCATGA': 0,\n", + " 'GATGCTGCTGACTATCATGA': 0,\n", + " 'GATGCTGCTGCCTATCATGA': 0,\n", + " 'GATGCTGCTGTCTATCATGA': 0,\n", + " 'GATGCCGCTGGATATCATGA': 0,\n", + " 'GATGCCGCTGGGTATCATGA': 0,\n", + " 'GATGCCGCTGGTTATCATGA': 0,\n", + " 'GATGCGGCTGGATATCATGA': 0,\n", + " 'GATGCGGCTGGGTATCATGA': 0,\n", + " 'GATGCGGCTGGTTATCATGA': 0,\n", + " 'GATGCTGCTGGATATCATGA': 0,\n", + " 'GATGCTGCTGGGTATCATGA': 0,\n", + " 'GATGCTGCTGGTTATCATGA': 0,\n", + " 'GATGCCGCTGGCAATCATGA': 0,\n", + " 'GATGCCGCTGGCCATCATGA': 0,\n", + " 'GATGCCGCTGGCGATCATGA': 0,\n", + " 'GATGCGGCTGGCAATCATGA': 0,\n", + " 'GATGCGGCTGGCCATCATGA': 0,\n", + " 'GATGCGGCTGGCGATCATGA': 0,\n", + " 'GATGCTGCTGGCAATCATGA': 0,\n", + " 'GATGCTGCTGGCCATCATGA': 0,\n", + " 'GATGCTGCTGGCGATCATGA': 0,\n", + " 'GATGCCGCTGGCTCTCATGA': 
0,\n", + " 'GATGCCGCTGGCTGTCATGA': 0,\n", + " 'GATGCCGCTGGCTTTCATGA': 0,\n", + " 'GATGCGGCTGGCTCTCATGA': 0,\n", + " 'GATGCGGCTGGCTGTCATGA': 0,\n", + " 'GATGCGGCTGGCTTTCATGA': 0,\n", + " 'GATGCTGCTGGCTCTCATGA': 0,\n", + " 'GATGCTGCTGGCTGTCATGA': 0,\n", + " 'GATGCTGCTGGCTTTCATGA': 0,\n", + " 'GATGCCGCTGGCTAACATGA': 0,\n", + " 'GATGCCGCTGGCTACCATGA': 0,\n", + " 'GATGCCGCTGGCTAGCATGA': 0,\n", + " 'GATGCGGCTGGCTAACATGA': 0,\n", + " 'GATGCGGCTGGCTACCATGA': 0,\n", + " 'GATGCGGCTGGCTAGCATGA': 0,\n", + " 'GATGCTGCTGGCTAACATGA': 0,\n", + " 'GATGCTGCTGGCTACCATGA': 0,\n", + " 'GATGCTGCTGGCTAGCATGA': 0,\n", + " 'GATGCCGCTGGCTATAATGA': 0,\n", + " 'GATGCCGCTGGCTATGATGA': 0,\n", + " 'GATGCCGCTGGCTATTATGA': 0,\n", + " 'GATGCGGCTGGCTATAATGA': 0,\n", + " 'GATGCGGCTGGCTATGATGA': 0,\n", + " 'GATGCGGCTGGCTATTATGA': 0,\n", + " 'GATGCTGCTGGCTATAATGA': 0,\n", + " 'GATGCTGCTGGCTATGATGA': 0,\n", + " 'GATGCTGCTGGCTATTATGA': 0,\n", + " 'GATGCCGCTGGCTATCCTGA': 0,\n", + " 'GATGCCGCTGGCTATCGTGA': 0,\n", + " 'GATGCCGCTGGCTATCTTGA': 0,\n", + " 'GATGCGGCTGGCTATCCTGA': 0,\n", + " 'GATGCGGCTGGCTATCGTGA': 0,\n", + " 'GATGCGGCTGGCTATCTTGA': 0,\n", + " 'GATGCTGCTGGCTATCCTGA': 0,\n", + " 'GATGCTGCTGGCTATCGTGA': 0,\n", + " ...}" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "barcode_dict" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing read 0...\n", + "Processing read 10000...\n", + "Processing read 20000...\n", + "Processing read 30000...\n", + "Processing read 40000...\n", + "Processing read 50000...\n", + "Processing read 60000...\n", + "Processing read 70000...\n", + "Processing read 80000...\n", + "Processing read 90000...\n", + "Processing read 100000...\n", + "Processing read 110000...\n", + "Processing read 120000...\n", + "Processing read 130000...\n", + "Processing read 140000...\n", +
"Processing read 150000...\n", + "Processing read 160000...\n", + "Processing read 170000...\n", + "Processing read 180000...\n", + "Processing read 190000...\n", + "Processing read 200000...\n", + "Processing read 210000...\n", + "Processing read 220000...\n", + "Processing read 230000...\n", + "Done!\n", + "Total # of unique-UMI reads = 164844\n" + ] + } + ], + "source": [ + "#Parse and map RNA sequencing reads\n", + "\n", + "save_suffix = \"_var_repl_2_hmc3_v3\"\n", + "\n", + "r1_name = \"unprocessed_data/HMC3_Var_R2_trimmed.fastq\"\n", + "r2_name = \"unprocessed_data/HMC3_Var_R2_UMI_trimmed.fastq\"\n", + "\n", + "polya_regexp = re.compile(r\"AAAAA(AAAAAAAAAAAAAAA){s<=2}\")\n", + "distal_regexp = re.compile(r\"(GCCTCGACTGTGCCTTCTAG){s<=2}\")\n", + "\n", + "def _hamming(s1, s2) :\n", + " \n", + " d = 0.\n", + " for j in range(len(s1)) :\n", + " if s1[j] != s2[j] :\n", + " d += 1.\n", + " \n", + " return d\n", + "\n", + "max_pos = 176\n", + "\n", + "umi_dict = {}\n", + "umi_n_muts = 0\n", + "\n", + "bases = ['A', 'C', 'G', 'T']\n", + "\n", + "cuts = np.zeros((len(library_df), 206))\n", + "\n", + "f1 = open(r1_name, 'rt')\n", + "f2 = open(r2_name, 'rt')\n", + "\n", + "#Iterate through reads sequentially (r1 and r2)\n", + "\n", + "r1_counter = 0\n", + "while True :\n", + " \n", + " #Read 1\n", + " id1 = f1.readline().strip()\n", + " \n", + " #Check for end-of-file\n", + " if len(id1) == 0 :\n", + " break\n", + " \n", + " r1 = f1.readline().strip()\n", + " s1 = f1.readline().strip()\n", + " q1 = f1.readline().strip()\n", + " \n", + " #Read 2\n", + " id2 = f2.readline().strip()\n", + " r2 = f2.readline().strip()\n", + " s2 = f2.readline().strip()\n", + " q2 = f2.readline().strip()\n", + " \n", + " if r1_counter % 10000 == 0 :\n", + " print(\"Processing read \" + str(r1_counter) + \"...\")\n", + " \n", + " r1_counter += 1\n", + " \n", + " #Map read to library member\n", + " bc = r1[:20]\n", + " \n", + " lib_i = -1\n", + " if bc in barcode_dict :\n", + " lib_i =
barcode_dict[bc]\n", + " \n", + " if lib_i == -1 :\n", + " continue\n", + " \n", + " if umi_n_muts == 0 :\n", + " bc = sequences[lib_i][:20]\n", + " \n", + " #Determine if we have seen this umi before, otherwise mark as visited\n", + " umi = r2[:8]\n", + " \n", + " if bc not in umi_dict :\n", + " umi_dict[bc] = {}\n", + " \n", + " umi_visited = False\n", + " if umi in umi_dict[bc] :\n", + " umi_visited = True\n", + " elif umi_n_muts == 1 :\n", + " for pos1 in range(len(umi)) :\n", + " for b1 in bases :\n", + " umi_mut = umi[:pos1] + b1 + umi[pos1+1:]\n", + " if umi_mut in umi_dict[bc] :\n", + " umi_visited = True\n", + " break\n", + " if umi_visited :\n", + " break\n", + " elif umi_n_muts == 2 :\n", + " for pos1 in range(len(umi)) :\n", + " for pos2 in range(pos1, len(umi)) :\n", + " for b1 in bases :\n", + " for b2 in bases :\n", + " umi_mut = umi[:pos1] + b1 + umi[pos1+1:pos2] + b2 + umi[pos2+1:]\n", + " if umi_mut in umi_dict[bc] :\n", + " umi_visited = True\n", + " break\n", + " if umi_visited :\n", + " break\n", + " if umi_visited :\n", + " break\n", + " if umi_visited :\n", + " break\n", + " \n", + " #Skip if umi already seen\n", + " if umi_visited :\n", + " continue\n", + " \n", + " #Determine polyA position (or alternative if the read is distally polyadenylated)\n", + " polya_match = re.search(polya_regexp, r1)\n", + " \n", + " polya_pos = -1\n", + " if polya_match is not None and polya_match.span()[0] < max_pos :\n", + " polya_pos = polya_match.span()[0]\n", + " \n", + " #Determine if distal read\n", + " is_distal = False\n", + " distal_match = re.search(distal_regexp, r1[209-5:209+20+5])\n", + " \n", + " if distal_match is not None :\n", + " is_distal = True\n", + " \n", + " #Aggregate read-position occurrence counts\n", + " if is_distal :\n", + " cuts[lib_i, -1] += 1.\n", + " \n", + " #Mark as seen and proceed\n", + " umi_dict[bc][umi] = True\n", + " \n", + " elif polya_pos != -1 and polya_pos >= 30 :\n", + " \n", + " #Perform hamming-based consistency 
check against reference of region upstream of cleavage\n", + " \n", + " hamming_dist = _hamming(sequences[lib_i][polya_pos-20:polya_pos], r1[polya_pos-20:polya_pos])\n", + " \n", + " if hamming_dist <= 3 :\n", + " cuts[lib_i, polya_pos] += 1.\n", + "\n", + " #Mark as seen and proceed\n", + " umi_dict[bc][umi] = True\n", + "\n", + "f1.close()\n", + "f2.close()\n", + "\n", + "print(\"Done!\")\n", + "\n", + "print(\"Total # of unique-UMI reads = \" + str(int(np.sum(cuts))))\n", + "\n", + "#Store processed read-position count matrix\n", + "np.save('apa_oligo_2022' + save_suffix + '_umi_mut_' + str(umi_n_muts) + '_cuts', cuts)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "#Load processed read count data\n", + "\n", + "ref_cuts_repl_1 = np.zeros(np.load(\"apa_oligo_2022_ref_repl_1_hmc3_v3_umi_mut_0_cuts.npy\").shape)\n", + "var_cuts_repl_1 = np.load(\"apa_oligo_2022_var_repl_1_hmc3_v3_umi_mut_0_cuts.npy\")\n", + "\n", + "ref_cuts_repl_2 = np.load(\"apa_oligo_2022_ref_repl_2_hmc3_v3_umi_mut_0_cuts.npy\")\n", + "var_cuts_repl_2 = np.load(\"apa_oligo_2022_var_repl_2_hmc3_v3_umi_mut_0_cuts.npy\")\n", + "\n", + "#Pooled counts\n", + "\n", + "ref_cuts_pooled = ref_cuts_repl_1 + ref_cuts_repl_2\n", + "var_cuts_pooled = var_cuts_repl_1 + var_cuts_repl_2\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "#Augment library file with measured isoform summary statistics\n", + "\n", + "pseudo_c = 1.\n", + "\n", + "#Replicate 1\n", + "library_df['ref_count_77_127_repl_1'] = np.sum(ref_cuts_repl_1[:, 77:127], axis=-1)\n", + "library_df['ref_count_0_205_repl_1'] = np.sum(ref_cuts_repl_1[:, 0:205], axis=-1)\n", + "library_df['ref_count_total_repl_1'] = np.sum(ref_cuts_repl_1, axis=-1)\n", + "library_df['ref_logit_77_127_repl_1'] = np.log(\n", + " ((library_df['ref_count_77_127_repl_1'] + pseudo_c) / (library_df['ref_count_total_repl_1'] + 2. 
* pseudo_c)) / (1. - ((library_df['ref_count_77_127_repl_1'] + pseudo_c) / (library_df['ref_count_total_repl_1'] + 2. * pseudo_c)))\n", + ")\n", + "library_df['ref_logit_0_205_repl_1'] = np.log(\n", + " ((library_df['ref_count_0_205_repl_1'] + pseudo_c) / (library_df['ref_count_total_repl_1'] + 2. * pseudo_c)) / (1. - ((library_df['ref_count_0_205_repl_1'] + pseudo_c) / (library_df['ref_count_total_repl_1'] + 2. * pseudo_c)))\n", + ")\n", + "\n", + "library_df['var_count_77_127_repl_1'] = np.sum(var_cuts_repl_1[:, 77:127], axis=-1)\n", + "library_df['var_count_0_205_repl_1'] = np.sum(var_cuts_repl_1[:, 0:205], axis=-1)\n", + "library_df['var_count_total_repl_1'] = np.sum(var_cuts_repl_1, axis=-1)\n", + "library_df['var_logit_77_127_repl_1'] = np.log(\n", + " ((library_df['var_count_77_127_repl_1'] + pseudo_c) / (library_df['var_count_total_repl_1'] + 2. * pseudo_c)) / (1. - ((library_df['var_count_77_127_repl_1'] + pseudo_c) / (library_df['var_count_total_repl_1'] + 2. * pseudo_c)))\n", + ")\n", + "library_df['var_logit_0_205_repl_1'] = np.log(\n", + " ((library_df['var_count_0_205_repl_1'] + pseudo_c) / (library_df['var_count_total_repl_1'] + 2. * pseudo_c)) / (1. - ((library_df['var_count_0_205_repl_1'] + pseudo_c) / (library_df['var_count_total_repl_1'] + 2. 
* pseudo_c)))\n", + ")\n", + "\n", + "library_df['delta_logodds_true_77_127_repl_1'] = library_df['var_logit_77_127_repl_1'] - library_df['ref_logit_77_127_repl_1']\n", + "library_df['delta_logodds_true_0_205_repl_1'] = library_df['var_logit_0_205_repl_1'] - library_df['ref_logit_0_205_repl_1']\n", + "\n", + "#Replicate 2\n", + "library_df['ref_count_77_127_repl_2'] = np.sum(ref_cuts_repl_2[:, 77:127], axis=-1)\n", + "library_df['ref_count_0_205_repl_2'] = np.sum(ref_cuts_repl_2[:, 0:205], axis=-1)\n", + "library_df['ref_count_total_repl_2'] = np.sum(ref_cuts_repl_2, axis=-1)\n", + "library_df['ref_logit_77_127_repl_2'] = np.log(\n", + " ((library_df['ref_count_77_127_repl_2'] + pseudo_c) / (library_df['ref_count_total_repl_2'] + 2. * pseudo_c)) / (1. - ((library_df['ref_count_77_127_repl_2'] + pseudo_c) / (library_df['ref_count_total_repl_2'] + 2. * pseudo_c)))\n", + ")\n", + "library_df['ref_logit_0_205_repl_2'] = np.log(\n", + " ((library_df['ref_count_0_205_repl_2'] + pseudo_c) / (library_df['ref_count_total_repl_2'] + 2. * pseudo_c)) / (1. - ((library_df['ref_count_0_205_repl_2'] + pseudo_c) / (library_df['ref_count_total_repl_2'] + 2. * pseudo_c)))\n", + ")\n", + "\n", + "library_df['var_count_77_127_repl_2'] = np.sum(var_cuts_repl_2[:, 77:127], axis=-1)\n", + "library_df['var_count_0_205_repl_2'] = np.sum(var_cuts_repl_2[:, 0:205], axis=-1)\n", + "library_df['var_count_total_repl_2'] = np.sum(var_cuts_repl_2, axis=-1)\n", + "library_df['var_logit_77_127_repl_2'] = np.log(\n", + " ((library_df['var_count_77_127_repl_2'] + pseudo_c) / (library_df['var_count_total_repl_2'] + 2. * pseudo_c)) / (1. - ((library_df['var_count_77_127_repl_2'] + pseudo_c) / (library_df['var_count_total_repl_2'] + 2. * pseudo_c)))\n", + ")\n", + "library_df['var_logit_0_205_repl_2'] = np.log(\n", + " ((library_df['var_count_0_205_repl_2'] + pseudo_c) / (library_df['var_count_total_repl_2'] + 2. * pseudo_c)) / (1. 
- ((library_df['var_count_0_205_repl_2'] + pseudo_c) / (library_df['var_count_total_repl_2'] + 2. * pseudo_c)))\n", + ")\n", + "\n", + "library_df['delta_logodds_true_77_127_repl_2'] = library_df['var_logit_77_127_repl_2'] - library_df['ref_logit_77_127_repl_2']\n", + "library_df['delta_logodds_true_0_205_repl_2'] = library_df['var_logit_0_205_repl_2'] - library_df['ref_logit_0_205_repl_2']\n", + "\n", + "#Pooled replicates\n", + "library_df['ref_count_77_127_repl_pooled'] = np.sum(ref_cuts_pooled[:, 77:127], axis=-1)\n", + "library_df['ref_count_0_205_repl_pooled'] = np.sum(ref_cuts_pooled[:, 0:205], axis=-1)\n", + "library_df['ref_count_total_repl_pooled'] = np.sum(ref_cuts_pooled, axis=-1)\n", + "library_df['ref_logit_77_127_repl_pooled'] = np.log(\n", + " ((library_df['ref_count_77_127_repl_pooled'] + pseudo_c) / (library_df['ref_count_total_repl_pooled'] + 2. * pseudo_c)) / (1. - ((library_df['ref_count_77_127_repl_pooled'] + pseudo_c) / (library_df['ref_count_total_repl_pooled'] + 2. * pseudo_c)))\n", + ")\n", + "library_df['ref_logit_0_205_repl_pooled'] = np.log(\n", + " ((library_df['ref_count_0_205_repl_pooled'] + pseudo_c) / (library_df['ref_count_total_repl_pooled'] + 2. * pseudo_c)) / (1. - ((library_df['ref_count_0_205_repl_pooled'] + pseudo_c) / (library_df['ref_count_total_repl_pooled'] + 2. * pseudo_c)))\n", + ")\n", + "\n", + "library_df['var_count_77_127_repl_pooled'] = np.sum(var_cuts_pooled[:, 77:127], axis=-1)\n", + "library_df['var_count_0_205_repl_pooled'] = np.sum(var_cuts_pooled[:, 0:205], axis=-1)\n", + "library_df['var_count_total_repl_pooled'] = np.sum(var_cuts_pooled, axis=-1)\n", + "library_df['var_logit_77_127_repl_pooled'] = np.log(\n", + " ((library_df['var_count_77_127_repl_pooled'] + pseudo_c) / (library_df['var_count_total_repl_pooled'] + 2. * pseudo_c)) / (1. - ((library_df['var_count_77_127_repl_pooled'] + pseudo_c) / (library_df['var_count_total_repl_pooled'] + 2. 
* pseudo_c)))\n", + ")\n", + "library_df['var_logit_0_205_repl_pooled'] = np.log(\n", + " ((library_df['var_count_0_205_repl_pooled'] + pseudo_c) / (library_df['var_count_total_repl_pooled'] + 2. * pseudo_c)) / (1. - ((library_df['var_count_0_205_repl_pooled'] + pseudo_c) / (library_df['var_count_total_repl_pooled'] + 2. * pseudo_c)))\n", + ")\n", + "\n", + "library_df['delta_logodds_true_77_127_repl_pooled'] = library_df['var_logit_77_127_repl_pooled'] - library_df['ref_logit_77_127_repl_pooled']\n", + "library_df['delta_logodds_true_0_205_repl_pooled'] = library_df['var_logit_0_205_repl_pooled'] - library_df['ref_logit_0_205_repl_pooled']\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "#Cache measurements in dataframe\n", + "\n", + "library_df.to_csv(\"apa_100_variants_rev2_20220621_hmc3_v3_umi_mut_0.csv\", sep='\\t')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/jlinder2/anaconda3/envs/tensorflow/lib/python3.6/site-packages/ipykernel/__main__.py:3: RuntimeWarning: invalid value encountered in true_divide\n", + " app.launch_new_instance()\n", + "/home/jlinder2/anaconda3/envs/tensorflow/lib/python3.6/site-packages/ipykernel/__main__.py:7: RuntimeWarning: invalid value encountered in true_divide\n" + ] + } + ], + "source": [ + "#Compute cleavage probabilities\n", + "\n", + "ref_cut_prob_repl_1 = ref_cuts_repl_1 / np.sum(ref_cuts_repl_1, axis=1)[:, None]\n", + "var_cut_prob_repl_1 = var_cuts_repl_1 / np.sum(var_cuts_repl_1, axis=1)[:, None]\n", + "\n", + "ref_cut_prob_repl_2 = ref_cuts_repl_2 / np.sum(ref_cuts_repl_2, axis=1)[:, None]\n", + "var_cut_prob_repl_2 = var_cuts_repl_2 / np.sum(var_cuts_repl_2, axis=1)[:, None]\n", + "\n", + "ref_cut_prob_pooled = ref_cuts_pooled / np.sum(ref_cuts_pooled, axis=1)[:, None]\n", + "var_cut_prob_pooled = var_cuts_pooled / 
np.sum(var_cuts_pooled, axis=1)[:, None]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "delta_logodds (repl 1) = nan\n", + "delta_logodds (repl 2) = 0.851\n", + "delta_logodds (pooled) = 0.7094\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "#Control: F2 variant (creation of new cutsite -1bp relative to reference)\n", + "\n", + "save_figs = True\n", + "fig_name = \"F2_control_profile_hmc3\"\n", + "\n", + "test_ix = 9\n", + "\n", + "#Plot range\n", + "plot_start = 0\n", + "plot_end = 146\n", + "\n", + "#Isoform definition\n", + "cut_start = 77\n", + "cut_end = 127\n", + "\n", + "c_ref_1 = ref_cut_prob_repl_1[test_ix, :]\n", + "c_var_1 = var_cut_prob_repl_1[test_ix, :]\n", + "\n", + "c_ref_2 = ref_cut_prob_repl_2[test_ix, :]\n", + "c_var_2 = var_cut_prob_repl_2[test_ix, :]\n", + "\n", + "c_ref_pooled = ref_cut_prob_pooled[test_ix, :]\n", + "c_var_pooled = var_cut_prob_pooled[test_ix, :]\n", + "\n", + "delta_logodds_1 = np.log(np.sum(c_var_1[77:127]) / (1. - np.sum(c_var_1[77:127]))) - np.log(np.sum(c_ref_1[77:127]) / (1. - np.sum(c_ref_1[77:127])))\n", + "delta_logodds_2 = np.log(np.sum(c_var_2[77:127]) / (1. - np.sum(c_var_2[77:127]))) - np.log(np.sum(c_ref_2[77:127]) / (1. - np.sum(c_ref_2[77:127])))\n", + "delta_logodds_pooled = np.log(np.sum(c_var_pooled[77:127]) / (1. - np.sum(c_var_pooled[77:127]))) - np.log(np.sum(c_ref_pooled[77:127]) / (1. 
- np.sum(c_ref_pooled[77:127])))\n", + "\n", + "print(\"delta_logodds (repl 1) = \" + str(round(delta_logodds_1, 4)))\n", + "print(\"delta_logodds (repl 2) = \" + str(round(delta_logodds_2, 4)))\n", + "print(\"delta_logodds (pooled) = \" + str(round(delta_logodds_pooled, 4)))\n", + "\n", + "#Plot replicate 1 profile\n", + "f = plt.figure(figsize=(10, 3))\n", + "\n", + "plt.plot(c_ref_1[plot_start: plot_end], color='darkblue', linewidth=3)\n", + "plt.plot(c_var_1[plot_start: plot_end], color='darkorange', linewidth=3)\n", + "\n", + "plt.axvline(x=70, linewidth=2, linestyle='--', color='black', alpha=0.75)\n", + "plt.axvline(x=76, linewidth=2, linestyle='--', color='black', alpha=0.75)\n", + "\n", + "plt.xlim(plot_start, plot_end)\n", + "plt.ylim(0.)\n", + "\n", + "plt.title(\"Replicate 1. Delta Isoform Log Odds = \" + str(round(delta_logodds_1, 4)))\n", + "\n", + "plt.tight_layout()\n", + "\n", + "if save_figs :\n", + " plt.savefig(fig_name + \"_repl_1.png\", dpi=600, transparent=True)\n", + " plt.savefig(fig_name + \"_repl_1.eps\")\n", + "\n", + "plt.show()\n", + "\n", + "#Plot replicate 2 profile\n", + "f = plt.figure(figsize=(10, 3))\n", + "\n", + "plt.plot(c_ref_2[plot_start: plot_end], color='darkblue', linewidth=3)\n", + "plt.plot(c_var_2[plot_start: plot_end], color='darkorange', linewidth=3)\n", + "\n", + "plt.axvline(x=70, linewidth=2, linestyle='--', color='black', alpha=0.75)\n", + "plt.axvline(x=76, linewidth=2, linestyle='--', color='black', alpha=0.75)\n", + "\n", + "plt.xlim(plot_start, plot_end)\n", + "plt.ylim(0.)\n", + "\n", + "plt.title(\"Replicate 2. 
Delta Isoform Log Odds = \" + str(round(delta_logodds_2, 4)))\n", + "\n", + "plt.tight_layout()\n", + "\n", + "if save_figs :\n", + " plt.savefig(fig_name + \"_repl_2.png\", dpi=600, transparent=True)\n", + " plt.savefig(fig_name + \"_repl_2.eps\")\n", + "\n", + "plt.show()\n", + "\n", + "#Plot pooled replicate profile\n", + "f = plt.figure(figsize=(10, 3))\n", + "\n", + "plt.plot(c_ref_pooled[plot_start: plot_end], color='darkblue', linewidth=3)\n", + "plt.plot(c_var_pooled[plot_start: plot_end], color='darkorange', linewidth=3)\n", + "\n", + "plt.axvline(x=70, linewidth=2, linestyle='--', color='black', alpha=0.75)\n", + "plt.axvline(x=76, linewidth=2, linestyle='--', color='black', alpha=0.75)\n", + "\n", + "plt.xlim(plot_start, plot_end)\n", + "plt.ylim(0.)\n", + "\n", + "plt.title(\"Pooled replicates. Delta Isoform Log Odds = \" + str(round(delta_logodds_pooled, 4)))\n", + "\n", + "plt.tight_layout()\n", + "\n", + "if save_figs :\n", + " plt.savefig(fig_name + \"_pooled.png\", dpi=600, transparent=True)\n", + " plt.savefig(fig_name + \"_pooled.eps\")\n", + "\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "len(filtered_df) = 93\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/jlinder2/anaconda3/envs/tensorflow/lib/python3.6/site-packages/numpy/lib/function_base.py:2530: RuntimeWarning: invalid value encountered in true_divide\n", + " c /= stddev[:, None]\n", + "/home/jlinder2/anaconda3/envs/tensorflow/lib/python3.6/site-packages/numpy/lib/function_base.py:2531: RuntimeWarning: invalid value encountered in true_divide\n", + " c /= stddev[None, :]\n", + "/home/jlinder2/anaconda3/envs/tensorflow/lib/python3.6/site-packages/scipy/stats/_distn_infrastructure.py:901: RuntimeWarning: invalid value encountered in greater\n", + " return (a < x) & (x < b)\n", + 
"/home/jlinder2/anaconda3/envs/tensorflow/lib/python3.6/site-packages/scipy/stats/_distn_infrastructure.py:901: RuntimeWarning: invalid value encountered in less\n", + " return (a < x) & (x < b)\n", + "/home/jlinder2/anaconda3/envs/tensorflow/lib/python3.6/site-packages/scipy/stats/_distn_infrastructure.py:1892: RuntimeWarning: invalid value encountered in less_equal\n", + " cond2 = cond0 & (x <= _a)\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from scipy.stats import spearmanr\n", + "\n", + "save_figs = True\n", + "fig_name = \"replicate_hmc3\"\n", + "\n", + "min_c = 5.\n", + "\n", + "x_min = -6.\n", + "x_max = 4.\n", + "\n", + "filtered_df = library_df.query(\"ref_count_total_repl_1 >= 0 and \" + \"ref_count_total_repl_2 >= \" + str(min_c) + \" and \" + \"var_count_total_repl_1 >= \" + str(min_c) + \" and \" + \"var_count_total_repl_2 >= \" + str(min_c))\n", + "\n", + "print(\"len(filtered_df) = \" + str(len(filtered_df)))\n", + "\n", + "#Reference library (replicate correlation)\n", + "r_val_ref, _ = spearmanr(filtered_df['ref_logit_77_127_repl_1'], filtered_df['ref_logit_77_127_repl_2'])\n", + "\n", + "f = plt.figure(figsize=(4, 4))\n", + "\n", + "plt.scatter(filtered_df['ref_logit_77_127_repl_1'], filtered_df['ref_logit_77_127_repl_2'], color='lightgreen', s=45, edgecolor='black', linewidth=1)\n", + "\n", + "plt.xlim(x_min, x_max)\n", + "plt.ylim(x_min, x_max)\n", + "\n", + "plt.xticks(fontsize=12)\n", + "plt.yticks(fontsize=12)\n", + "\n", + "plt.xlabel(\"Ref Logit (Repl. 1)\", fontsize=12)\n", + "plt.ylabel(\"Ref Logit (Repl. 
2)\", fontsize=12)\n", + "\n", + "plt.title(\"r = \" + str(round(r_val_ref, 3)) + \", n = \" + str(len(filtered_df)), fontsize=12)\n", + "\n", + "plt.tight_layout()\n", + "\n", + "if save_figs :\n", + " plt.savefig(fig_name + \"_ref_logits.png\", dpi=600, transparent=True)\n", + " plt.savefig(fig_name + \"_ref_logits.eps\")\n", + "\n", + "plt.show()\n", + "\n", + "#Variant library (replicate correlation)\n", + "r_val_var, _ = spearmanr(filtered_df['var_logit_77_127_repl_1'], filtered_df['var_logit_77_127_repl_2'])\n", + "\n", + "f = plt.figure(figsize=(4, 4))\n", + "\n", + "plt.scatter(filtered_df['var_logit_77_127_repl_1'], filtered_df['var_logit_77_127_repl_2'], color='lightcoral', s=45, edgecolor='black', linewidth=1)\n", + "\n", + "plt.xlim(x_min, x_max)\n", + "plt.ylim(x_min, x_max)\n", + "\n", + "plt.xticks(fontsize=12)\n", + "plt.yticks(fontsize=12)\n", + "\n", + "plt.xlabel(\"Var Logit (Repl. 1)\", fontsize=12)\n", + "plt.ylabel(\"Var Logit (Repl. 2)\", fontsize=12)\n", + "\n", + "plt.title(\"r = \" + str(round(r_val_var, 3)) + \", n = \" + str(len(filtered_df)), fontsize=12)\n", + "\n", + "plt.tight_layout()\n", + "\n", + "if save_figs :\n", + " plt.savefig(fig_name + \"_var_logits.png\", dpi=600, transparent=True)\n", + " plt.savefig(fig_name + \"_var_logits.eps\")\n", + "\n", + "plt.show()\n", + "\n", + "#Ref-Var library (delta replicate correlation)\n", + "r_val_var, _ = spearmanr(filtered_df['delta_logodds_true_77_127_repl_1'], filtered_df['delta_logodds_true_77_127_repl_2'])\n", + "\n", + "f = plt.figure(figsize=(4, 4))\n", + "\n", + "plt.scatter(filtered_df['delta_logodds_true_77_127_repl_1'], filtered_df['delta_logodds_true_77_127_repl_2'], color='lightcoral', s=45, edgecolor='black', linewidth=1)\n", + "\n", + "plt.xlim(x_min, x_max)\n", + "plt.ylim(x_min, x_max)\n", + "\n", + "plt.xticks(fontsize=12)\n", + "plt.yticks(fontsize=12)\n", + "\n", + "plt.xlabel(\"LOR (Repl. 1)\", fontsize=12)\n", + "plt.ylabel(\"LOR (Repl. 
2)\", fontsize=12)\n", + "\n", + "plt.title(\"r = \" + str(round(r_val_var, 3)) + \", n = \" + str(len(filtered_df)), fontsize=12)\n", + "\n", + "plt.tight_layout()\n", + "\n", + "if save_figs :\n", + " plt.savefig(fig_name + \"_delta_logodds.png\", dpi=600, transparent=True)\n", + " plt.savefig(fig_name + \"_delta_logodds.eps\")\n", + "\n", + "plt.show()\n", + "\n", + "#Ref-Var control correlation\n", + "control_df = filtered_df.loc[filtered_df['experiment'].str.contains(\"control_\") & filtered_df['data_source'].str.contains(\"Array_2019\")]\n", + "\n", + "r_val_control, _ = spearmanr(control_df['ref_logit_77_127_repl_pooled'], control_df['var_logit_77_127_repl_pooled'])\n", + "\n", + "f = plt.figure(figsize=(4, 4))\n", + "\n", + "plt.scatter(control_df['ref_logit_77_127_repl_pooled'], control_df['var_logit_77_127_repl_pooled'], color='black', s=175, marker='^')\n", + "\n", + "plt.plot([x_min, x_max], [x_min, x_max], color='darkgreen', linestyle='--', linewidth=2,)\n", + "\n", + "plt.xlim(x_min, x_max)\n", + "plt.ylim(x_min, x_max)\n", + "\n", + "plt.xticks(fontsize=12)\n", + "plt.yticks(fontsize=12)\n", + "\n", + "plt.xlabel(\"Ref Logit\", fontsize=12)\n", + "plt.ylabel(\"Var Logit\", fontsize=12)\n", + "\n", + "plt.title(\"Controls; Spearman r = \" + str(round(r_val_control, 3)) + \", n = \" + str(len(control_df)), fontsize=12)\n", + "\n", + "plt.tight_layout()\n", + "\n", + "if save_figs :\n", + " plt.savefig(fig_name + \"_control_logits.png\", dpi=600, transparent=True)\n", + " plt.savefig(fig_name + \"_control_logits.eps\")\n", + "\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "n (sequences) = 186066\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "#Compare controls from 2022 oligo array to the Cell paper 2019 array\n", + "\n", + "#Load oligo array from 2019\n", + "import isolearn.io as isoio\n", + "\n", + "save_figs = True\n", + "fig_name = \"replicate_hmc3_to_2019\"\n", + "\n", + "isoform_pseudo_count = 1.\n", + "proximal_start = 77\n", + "proximal_end = 127\n", + "\n", + "file_prefix = str(proximal_start) + \"_\" + str(proximal_end)\n", + "\n", + "seq_dict = isoio.load('../../../../aparent/data/prepared_data/apa_array_data/apa_array_data_seq')\n", + "\n", + "seq_df = seq_dict['array_df']\n", + "seq_cuts = seq_dict['pooled_cuts']\n", + "\n", + "cut_true = np.concatenate([np.array(seq_cuts[:, 180: 180 + 205].todense()), np.array(seq_cuts[:, -1].todense()).reshape(-1, 1)], axis=-1)# - 1\n", + "\n", + "seq_df['proximal_count'] = [np.sum(cut_true[i, proximal_start:proximal_end]) for i in range(len(seq_df))]\n", + "seq_df['total_count'] = [np.sum(cut_true[i, :]) for i in range(len(seq_df))]\n", + "\n", + "seq_df['iso_true'] = (seq_df['proximal_count'] + isoform_pseudo_count) / (seq_df['total_count'] + 2. 
* isoform_pseudo_count)\n", + "seq_df['logodds_true'] = np.log(seq_df['iso_true'] / (1.0 - seq_df['iso_true']))\n", + "\n", + "seq_df['seq'] = seq_df['seq'].str.slice(0, 205)\n", + "\n", + "print(\"n (sequences) = \" + str(len(seq_df)))\n", + "\n", + "#Ref-Array 2019 control correlation\n", + "control_df_2019 = control_df.join(seq_df[['seq', 'logodds_true']].set_index(\"seq\"), on='ref_seq', how='inner').copy().reset_index(drop=True)\n", + "\n", + "r_val_control, _ = spearmanr(control_df_2019['ref_logit_77_127_repl_pooled'], control_df_2019['logodds_true'])\n", + "\n", + "f = plt.figure(figsize=(4, 4))\n", + "\n", + "plt.scatter(control_df_2019['ref_logit_77_127_repl_pooled'], control_df_2019['logodds_true'], color='deepskyblue', edgecolor='black', linewidth=1, s=175, marker='^')\n", + "\n", + "plt.plot([x_min, x_max], [x_min, x_max], color='darkgreen', linestyle='--', linewidth=2,)\n", + "\n", + "plt.xlim(x_min, x_max)\n", + "plt.ylim(x_min, x_max)\n", + "\n", + "plt.xticks(fontsize=12)\n", + "plt.yticks(fontsize=12)\n", + "\n", + "plt.xlabel(\"Logit (2022)\", fontsize=12)\n", + "plt.ylabel(\"Logit (2019)\", fontsize=12)\n", + "\n", + "plt.title(\"Controls; Spearman r = \" + str(round(r_val_control, 3)) + \", n = \" + str(len(control_df_2019)), fontsize=12)\n", + "\n", + "plt.tight_layout()\n", + "\n", + "if save_figs :\n", + " plt.savefig(fig_name + \".png\", dpi=600, transparent=True)\n", + " plt.savefig(fig_name + \".eps\")\n", + "\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:tensorflow]", + "language": "python", + "name": "conda-env-tensorflow-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": 
"3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/data/oligo_pool_2022/medium_library/process_reads_sknsh.ipynb b/data/oligo_pool_2022/medium_library/process_reads_sknsh.ipynb new file mode 100644 index 0000000..572a423 --- /dev/null +++ b/data/oligo_pool_2022/medium_library/process_reads_sknsh.ipynb @@ -0,0 +1,1865 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "\n", + "import scipy\n", + "import scipy.io as spio\n", + "import scipy.sparse as sp\n", + "\n", + "import regex as re\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "len(barcode_dict) = 201500\n" + ] + } + ], + "source": [ + "#Load reference library dataframe and build barcode dictionary\n", + "\n", + "library_df = pd.read_csv(\"apa_100_variants_rev2_20220621_pred.csv\", sep='\\t')\n", + "\n", + "#Build dictionary (double-mutation support)\n", + "bases = ['A', 'C', 'G', 'T']\n", + "\n", + "barcode_dict = {}\n", + "sequences = []\n", + "for i, [_, row] in enumerate(library_df.iterrows()) :\n", + " bc = row['ref_barcode']\n", + " \n", + " sequences.append(row['ref_seq'])\n", + " \n", + " barcode_dict[bc] = i\n", + " for pos1 in range(len(bc)) :\n", + " for pos2 in range(pos1, len(bc)) :\n", + " for b1 in bases :\n", + " for b2 in bases :\n", + " bc_mut = bc[:pos1] + b1 + bc[pos1+1:pos2] + b2 + bc[pos2+1:]\n", + " \n", + " if bc_mut in barcode_dict and barcode_dict[bc_mut] != i :\n", + " print(\"[ERROR] Barcode dictionary collision.\")\n", + " else :\n", + " barcode_dict[bc_mut] = i\n", + "\n", + "print(\"len(barcode_dict) = \" + str(len(barcode_dict)))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + 
"{'GATGCAGCTGGCTATCATGA': 0,\n", + " 'AAATGCAGCTGGCTATCATGA': 0,\n", + " 'ACATGCAGCTGGCTATCATGA': 0,\n", + " 'AGATGCAGCTGGCTATCATGA': 0,\n", + " 'ATATGCAGCTGGCTATCATGA': 0,\n", + " 'CAATGCAGCTGGCTATCATGA': 0,\n", + " 'CCATGCAGCTGGCTATCATGA': 0,\n", + " 'CGATGCAGCTGGCTATCATGA': 0,\n", + " 'CTATGCAGCTGGCTATCATGA': 0,\n", + " 'GAATGCAGCTGGCTATCATGA': 0,\n", + " 'GCATGCAGCTGGCTATCATGA': 0,\n", + " 'GGATGCAGCTGGCTATCATGA': 0,\n", + " 'GTATGCAGCTGGCTATCATGA': 0,\n", + " 'TAATGCAGCTGGCTATCATGA': 0,\n", + " 'TCATGCAGCTGGCTATCATGA': 0,\n", + " 'TGATGCAGCTGGCTATCATGA': 0,\n", + " 'TTATGCAGCTGGCTATCATGA': 0,\n", + " 'AATGCAGCTGGCTATCATGA': 0,\n", + " 'ACTGCAGCTGGCTATCATGA': 0,\n", + " 'AGTGCAGCTGGCTATCATGA': 0,\n", + " 'ATTGCAGCTGGCTATCATGA': 0,\n", + " 'CATGCAGCTGGCTATCATGA': 0,\n", + " 'CCTGCAGCTGGCTATCATGA': 0,\n", + " 'CGTGCAGCTGGCTATCATGA': 0,\n", + " 'CTTGCAGCTGGCTATCATGA': 0,\n", + " 'GCTGCAGCTGGCTATCATGA': 0,\n", + " 'GGTGCAGCTGGCTATCATGA': 0,\n", + " 'GTTGCAGCTGGCTATCATGA': 0,\n", + " 'TATGCAGCTGGCTATCATGA': 0,\n", + " 'TCTGCAGCTGGCTATCATGA': 0,\n", + " 'TGTGCAGCTGGCTATCATGA': 0,\n", + " 'TTTGCAGCTGGCTATCATGA': 0,\n", + " 'AAAGCAGCTGGCTATCATGA': 0,\n", + " 'AACGCAGCTGGCTATCATGA': 0,\n", + " 'AAGGCAGCTGGCTATCATGA': 0,\n", + " 'CAAGCAGCTGGCTATCATGA': 0,\n", + " 'CACGCAGCTGGCTATCATGA': 0,\n", + " 'CAGGCAGCTGGCTATCATGA': 0,\n", + " 'GAAGCAGCTGGCTATCATGA': 0,\n", + " 'GACGCAGCTGGCTATCATGA': 0,\n", + " 'GAGGCAGCTGGCTATCATGA': 0,\n", + " 'TAAGCAGCTGGCTATCATGA': 0,\n", + " 'TACGCAGCTGGCTATCATGA': 0,\n", + " 'TAGGCAGCTGGCTATCATGA': 0,\n", + " 'AATACAGCTGGCTATCATGA': 0,\n", + " 'AATCCAGCTGGCTATCATGA': 0,\n", + " 'AATTCAGCTGGCTATCATGA': 0,\n", + " 'CATACAGCTGGCTATCATGA': 0,\n", + " 'CATCCAGCTGGCTATCATGA': 0,\n", + " 'CATTCAGCTGGCTATCATGA': 0,\n", + " 'GATACAGCTGGCTATCATGA': 0,\n", + " 'GATCCAGCTGGCTATCATGA': 0,\n", + " 'GATTCAGCTGGCTATCATGA': 0,\n", + " 'TATACAGCTGGCTATCATGA': 0,\n", + " 'TATCCAGCTGGCTATCATGA': 0,\n", + " 'TATTCAGCTGGCTATCATGA': 0,\n", + " 
'AATGAAGCTGGCTATCATGA': 0,\n", + " 'AATGGAGCTGGCTATCATGA': 0,\n", + " 'AATGTAGCTGGCTATCATGA': 0,\n", + " 'CATGAAGCTGGCTATCATGA': 0,\n", + " 'CATGGAGCTGGCTATCATGA': 0,\n", + " 'CATGTAGCTGGCTATCATGA': 0,\n", + " 'GATGAAGCTGGCTATCATGA': 0,\n", + " 'GATGGAGCTGGCTATCATGA': 0,\n", + " 'GATGTAGCTGGCTATCATGA': 0,\n", + " 'TATGAAGCTGGCTATCATGA': 0,\n", + " 'TATGGAGCTGGCTATCATGA': 0,\n", + " 'TATGTAGCTGGCTATCATGA': 0,\n", + " 'AATGCCGCTGGCTATCATGA': 0,\n", + " 'AATGCGGCTGGCTATCATGA': 0,\n", + " 'AATGCTGCTGGCTATCATGA': 0,\n", + " 'CATGCCGCTGGCTATCATGA': 0,\n", + " 'CATGCGGCTGGCTATCATGA': 0,\n", + " 'CATGCTGCTGGCTATCATGA': 0,\n", + " 'GATGCCGCTGGCTATCATGA': 0,\n", + " 'GATGCGGCTGGCTATCATGA': 0,\n", + " 'GATGCTGCTGGCTATCATGA': 0,\n", + " 'TATGCCGCTGGCTATCATGA': 0,\n", + " 'TATGCGGCTGGCTATCATGA': 0,\n", + " 'TATGCTGCTGGCTATCATGA': 0,\n", + " 'AATGCAACTGGCTATCATGA': 0,\n", + " 'AATGCACCTGGCTATCATGA': 0,\n", + " 'AATGCATCTGGCTATCATGA': 0,\n", + " 'CATGCAACTGGCTATCATGA': 0,\n", + " 'CATGCACCTGGCTATCATGA': 0,\n", + " 'CATGCATCTGGCTATCATGA': 0,\n", + " 'GATGCAACTGGCTATCATGA': 0,\n", + " 'GATGCACCTGGCTATCATGA': 0,\n", + " 'GATGCATCTGGCTATCATGA': 0,\n", + " 'TATGCAACTGGCTATCATGA': 0,\n", + " 'TATGCACCTGGCTATCATGA': 0,\n", + " 'TATGCATCTGGCTATCATGA': 0,\n", + " 'AATGCAGATGGCTATCATGA': 0,\n", + " 'AATGCAGGTGGCTATCATGA': 0,\n", + " 'AATGCAGTTGGCTATCATGA': 0,\n", + " 'CATGCAGATGGCTATCATGA': 0,\n", + " 'CATGCAGGTGGCTATCATGA': 0,\n", + " 'CATGCAGTTGGCTATCATGA': 0,\n", + " 'GATGCAGATGGCTATCATGA': 0,\n", + " 'GATGCAGGTGGCTATCATGA': 0,\n", + " 'GATGCAGTTGGCTATCATGA': 0,\n", + " 'TATGCAGATGGCTATCATGA': 0,\n", + " 'TATGCAGGTGGCTATCATGA': 0,\n", + " 'TATGCAGTTGGCTATCATGA': 0,\n", + " 'AATGCAGCAGGCTATCATGA': 0,\n", + " 'AATGCAGCCGGCTATCATGA': 0,\n", + " 'AATGCAGCGGGCTATCATGA': 0,\n", + " 'CATGCAGCAGGCTATCATGA': 0,\n", + " 'CATGCAGCCGGCTATCATGA': 0,\n", + " 'CATGCAGCGGGCTATCATGA': 0,\n", + " 'GATGCAGCAGGCTATCATGA': 0,\n", + " 'GATGCAGCCGGCTATCATGA': 0,\n", + " 'GATGCAGCGGGCTATCATGA': 0,\n", + " 
'TATGCAGCAGGCTATCATGA': 0,\n", + " 'TATGCAGCCGGCTATCATGA': 0,\n", + " 'TATGCAGCGGGCTATCATGA': 0,\n", + " 'AATGCAGCTAGCTATCATGA': 0,\n", + " 'AATGCAGCTCGCTATCATGA': 0,\n", + " 'AATGCAGCTTGCTATCATGA': 0,\n", + " 'CATGCAGCTAGCTATCATGA': 0,\n", + " 'CATGCAGCTCGCTATCATGA': 0,\n", + " 'CATGCAGCTTGCTATCATGA': 0,\n", + " 'GATGCAGCTAGCTATCATGA': 0,\n", + " 'GATGCAGCTCGCTATCATGA': 0,\n", + " 'GATGCAGCTTGCTATCATGA': 0,\n", + " 'TATGCAGCTAGCTATCATGA': 0,\n", + " 'TATGCAGCTCGCTATCATGA': 0,\n", + " 'TATGCAGCTTGCTATCATGA': 0,\n", + " 'AATGCAGCTGACTATCATGA': 0,\n", + " 'AATGCAGCTGCCTATCATGA': 0,\n", + " 'AATGCAGCTGTCTATCATGA': 0,\n", + " 'CATGCAGCTGACTATCATGA': 0,\n", + " 'CATGCAGCTGCCTATCATGA': 0,\n", + " 'CATGCAGCTGTCTATCATGA': 0,\n", + " 'GATGCAGCTGACTATCATGA': 0,\n", + " 'GATGCAGCTGCCTATCATGA': 0,\n", + " 'GATGCAGCTGTCTATCATGA': 0,\n", + " 'TATGCAGCTGACTATCATGA': 0,\n", + " 'TATGCAGCTGCCTATCATGA': 0,\n", + " 'TATGCAGCTGTCTATCATGA': 0,\n", + " 'AATGCAGCTGGATATCATGA': 0,\n", + " 'AATGCAGCTGGGTATCATGA': 0,\n", + " 'AATGCAGCTGGTTATCATGA': 0,\n", + " 'CATGCAGCTGGATATCATGA': 0,\n", + " 'CATGCAGCTGGGTATCATGA': 0,\n", + " 'CATGCAGCTGGTTATCATGA': 0,\n", + " 'GATGCAGCTGGATATCATGA': 0,\n", + " 'GATGCAGCTGGGTATCATGA': 0,\n", + " 'GATGCAGCTGGTTATCATGA': 0,\n", + " 'TATGCAGCTGGATATCATGA': 0,\n", + " 'TATGCAGCTGGGTATCATGA': 0,\n", + " 'TATGCAGCTGGTTATCATGA': 0,\n", + " 'AATGCAGCTGGCAATCATGA': 0,\n", + " 'AATGCAGCTGGCCATCATGA': 0,\n", + " 'AATGCAGCTGGCGATCATGA': 0,\n", + " 'CATGCAGCTGGCAATCATGA': 0,\n", + " 'CATGCAGCTGGCCATCATGA': 0,\n", + " 'CATGCAGCTGGCGATCATGA': 0,\n", + " 'GATGCAGCTGGCAATCATGA': 0,\n", + " 'GATGCAGCTGGCCATCATGA': 0,\n", + " 'GATGCAGCTGGCGATCATGA': 0,\n", + " 'TATGCAGCTGGCAATCATGA': 0,\n", + " 'TATGCAGCTGGCCATCATGA': 0,\n", + " 'TATGCAGCTGGCGATCATGA': 0,\n", + " 'AATGCAGCTGGCTCTCATGA': 0,\n", + " 'AATGCAGCTGGCTGTCATGA': 0,\n", + " 'AATGCAGCTGGCTTTCATGA': 0,\n", + " 'CATGCAGCTGGCTCTCATGA': 0,\n", + " 'CATGCAGCTGGCTGTCATGA': 0,\n", + " 'CATGCAGCTGGCTTTCATGA': 0,\n", + " 
'GATGCAGCTGGCTCTCATGA': 0,\n", + " 'GATGCAGCTGGCTGTCATGA': 0,\n", + " 'GATGCAGCTGGCTTTCATGA': 0,\n", + " 'TATGCAGCTGGCTCTCATGA': 0,\n", + " 'TATGCAGCTGGCTGTCATGA': 0,\n", + " 'TATGCAGCTGGCTTTCATGA': 0,\n", + " 'AATGCAGCTGGCTAACATGA': 0,\n", + " 'AATGCAGCTGGCTACCATGA': 0,\n", + " 'AATGCAGCTGGCTAGCATGA': 0,\n", + " 'CATGCAGCTGGCTAACATGA': 0,\n", + " 'CATGCAGCTGGCTACCATGA': 0,\n", + " 'CATGCAGCTGGCTAGCATGA': 0,\n", + " 'GATGCAGCTGGCTAACATGA': 0,\n", + " 'GATGCAGCTGGCTACCATGA': 0,\n", + " 'GATGCAGCTGGCTAGCATGA': 0,\n", + " 'TATGCAGCTGGCTAACATGA': 0,\n", + " 'TATGCAGCTGGCTACCATGA': 0,\n", + " 'TATGCAGCTGGCTAGCATGA': 0,\n", + " 'AATGCAGCTGGCTATAATGA': 0,\n", + " 'AATGCAGCTGGCTATGATGA': 0,\n", + " 'AATGCAGCTGGCTATTATGA': 0,\n", + " 'CATGCAGCTGGCTATAATGA': 0,\n", + " 'CATGCAGCTGGCTATGATGA': 0,\n", + " 'CATGCAGCTGGCTATTATGA': 0,\n", + " 'GATGCAGCTGGCTATAATGA': 0,\n", + " 'GATGCAGCTGGCTATGATGA': 0,\n", + " 'GATGCAGCTGGCTATTATGA': 0,\n", + " 'TATGCAGCTGGCTATAATGA': 0,\n", + " 'TATGCAGCTGGCTATGATGA': 0,\n", + " 'TATGCAGCTGGCTATTATGA': 0,\n", + " 'AATGCAGCTGGCTATCCTGA': 0,\n", + " 'AATGCAGCTGGCTATCGTGA': 0,\n", + " 'AATGCAGCTGGCTATCTTGA': 0,\n", + " 'CATGCAGCTGGCTATCCTGA': 0,\n", + " 'CATGCAGCTGGCTATCGTGA': 0,\n", + " 'CATGCAGCTGGCTATCTTGA': 0,\n", + " 'GATGCAGCTGGCTATCCTGA': 0,\n", + " 'GATGCAGCTGGCTATCGTGA': 0,\n", + " 'GATGCAGCTGGCTATCTTGA': 0,\n", + " 'TATGCAGCTGGCTATCCTGA': 0,\n", + " 'TATGCAGCTGGCTATCGTGA': 0,\n", + " 'TATGCAGCTGGCTATCTTGA': 0,\n", + " 'AATGCAGCTGGCTATCAAGA': 0,\n", + " 'AATGCAGCTGGCTATCACGA': 0,\n", + " 'AATGCAGCTGGCTATCAGGA': 0,\n", + " 'CATGCAGCTGGCTATCAAGA': 0,\n", + " 'CATGCAGCTGGCTATCACGA': 0,\n", + " 'CATGCAGCTGGCTATCAGGA': 0,\n", + " 'GATGCAGCTGGCTATCAAGA': 0,\n", + " 'GATGCAGCTGGCTATCACGA': 0,\n", + " 'GATGCAGCTGGCTATCAGGA': 0,\n", + " 'TATGCAGCTGGCTATCAAGA': 0,\n", + " 'TATGCAGCTGGCTATCACGA': 0,\n", + " 'TATGCAGCTGGCTATCAGGA': 0,\n", + " 'AATGCAGCTGGCTATCATAA': 0,\n", + " 'AATGCAGCTGGCTATCATCA': 0,\n", + " 'AATGCAGCTGGCTATCATTA': 0,\n", + " 
'CATGCAGCTGGCTATCATAA': 0,\n", + " 'CATGCAGCTGGCTATCATCA': 0,\n", + " 'CATGCAGCTGGCTATCATTA': 0,\n", + " 'GATGCAGCTGGCTATCATAA': 0,\n", + " 'GATGCAGCTGGCTATCATCA': 0,\n", + " 'GATGCAGCTGGCTATCATTA': 0,\n", + " 'TATGCAGCTGGCTATCATAA': 0,\n", + " 'TATGCAGCTGGCTATCATCA': 0,\n", + " 'TATGCAGCTGGCTATCATTA': 0,\n", + " 'AATGCAGCTGGCTATCATGC': 0,\n", + " 'AATGCAGCTGGCTATCATGG': 0,\n", + " 'AATGCAGCTGGCTATCATGT': 0,\n", + " 'CATGCAGCTGGCTATCATGC': 0,\n", + " 'CATGCAGCTGGCTATCATGG': 0,\n", + " 'CATGCAGCTGGCTATCATGT': 0,\n", + " 'GATGCAGCTGGCTATCATGC': 0,\n", + " 'GATGCAGCTGGCTATCATGG': 0,\n", + " 'GATGCAGCTGGCTATCATGT': 0,\n", + " 'TATGCAGCTGGCTATCATGC': 0,\n", + " 'TATGCAGCTGGCTATCATGG': 0,\n", + " 'TATGCAGCTGGCTATCATGT': 0,\n", + " 'GACTGCAGCTGGCTATCATGA': 0,\n", + " 'GAGTGCAGCTGGCTATCATGA': 0,\n", + " 'GATTGCAGCTGGCTATCATGA': 0,\n", + " 'GCCTGCAGCTGGCTATCATGA': 0,\n", + " 'GCGTGCAGCTGGCTATCATGA': 0,\n", + " 'GCTTGCAGCTGGCTATCATGA': 0,\n", + " 'GGCTGCAGCTGGCTATCATGA': 0,\n", + " 'GGGTGCAGCTGGCTATCATGA': 0,\n", + " 'GGTTGCAGCTGGCTATCATGA': 0,\n", + " 'GTCTGCAGCTGGCTATCATGA': 0,\n", + " 'GTGTGCAGCTGGCTATCATGA': 0,\n", + " 'GTTTGCAGCTGGCTATCATGA': 0,\n", + " 'GCAGCAGCTGGCTATCATGA': 0,\n", + " 'GCCGCAGCTGGCTATCATGA': 0,\n", + " 'GCGGCAGCTGGCTATCATGA': 0,\n", + " 'GGAGCAGCTGGCTATCATGA': 0,\n", + " 'GGCGCAGCTGGCTATCATGA': 0,\n", + " 'GGGGCAGCTGGCTATCATGA': 0,\n", + " 'GTAGCAGCTGGCTATCATGA': 0,\n", + " 'GTCGCAGCTGGCTATCATGA': 0,\n", + " 'GTGGCAGCTGGCTATCATGA': 0,\n", + " 'GCTACAGCTGGCTATCATGA': 0,\n", + " 'GCTCCAGCTGGCTATCATGA': 0,\n", + " 'GCTTCAGCTGGCTATCATGA': 0,\n", + " 'GGTACAGCTGGCTATCATGA': 0,\n", + " 'GGTCCAGCTGGCTATCATGA': 0,\n", + " 'GGTTCAGCTGGCTATCATGA': 0,\n", + " 'GTTACAGCTGGCTATCATGA': 0,\n", + " 'GTTCCAGCTGGCTATCATGA': 0,\n", + " 'GTTTCAGCTGGCTATCATGA': 0,\n", + " 'GCTGAAGCTGGCTATCATGA': 0,\n", + " 'GCTGGAGCTGGCTATCATGA': 0,\n", + " 'GCTGTAGCTGGCTATCATGA': 0,\n", + " 'GGTGAAGCTGGCTATCATGA': 0,\n", + " 'GGTGGAGCTGGCTATCATGA': 0,\n", + " 'GGTGTAGCTGGCTATCATGA': 
0,\n", + " 'GTTGAAGCTGGCTATCATGA': 0,\n", + " 'GTTGGAGCTGGCTATCATGA': 0,\n", + " 'GTTGTAGCTGGCTATCATGA': 0,\n", + " 'GCTGCCGCTGGCTATCATGA': 0,\n", + " 'GCTGCGGCTGGCTATCATGA': 0,\n", + " 'GCTGCTGCTGGCTATCATGA': 0,\n", + " 'GGTGCCGCTGGCTATCATGA': 0,\n", + " 'GGTGCGGCTGGCTATCATGA': 0,\n", + " 'GGTGCTGCTGGCTATCATGA': 0,\n", + " 'GTTGCCGCTGGCTATCATGA': 0,\n", + " 'GTTGCGGCTGGCTATCATGA': 0,\n", + " 'GTTGCTGCTGGCTATCATGA': 0,\n", + " 'GCTGCAACTGGCTATCATGA': 0,\n", + " 'GCTGCACCTGGCTATCATGA': 0,\n", + " 'GCTGCATCTGGCTATCATGA': 0,\n", + " 'GGTGCAACTGGCTATCATGA': 0,\n", + " 'GGTGCACCTGGCTATCATGA': 0,\n", + " 'GGTGCATCTGGCTATCATGA': 0,\n", + " 'GTTGCAACTGGCTATCATGA': 0,\n", + " 'GTTGCACCTGGCTATCATGA': 0,\n", + " 'GTTGCATCTGGCTATCATGA': 0,\n", + " 'GCTGCAGATGGCTATCATGA': 0,\n", + " 'GCTGCAGGTGGCTATCATGA': 0,\n", + " 'GCTGCAGTTGGCTATCATGA': 0,\n", + " 'GGTGCAGATGGCTATCATGA': 0,\n", + " 'GGTGCAGGTGGCTATCATGA': 0,\n", + " 'GGTGCAGTTGGCTATCATGA': 0,\n", + " 'GTTGCAGATGGCTATCATGA': 0,\n", + " 'GTTGCAGGTGGCTATCATGA': 0,\n", + " 'GTTGCAGTTGGCTATCATGA': 0,\n", + " 'GCTGCAGCAGGCTATCATGA': 0,\n", + " 'GCTGCAGCCGGCTATCATGA': 0,\n", + " 'GCTGCAGCGGGCTATCATGA': 0,\n", + " 'GGTGCAGCAGGCTATCATGA': 0,\n", + " 'GGTGCAGCCGGCTATCATGA': 0,\n", + " 'GGTGCAGCGGGCTATCATGA': 0,\n", + " 'GTTGCAGCAGGCTATCATGA': 0,\n", + " 'GTTGCAGCCGGCTATCATGA': 0,\n", + " 'GTTGCAGCGGGCTATCATGA': 0,\n", + " 'GCTGCAGCTAGCTATCATGA': 0,\n", + " 'GCTGCAGCTCGCTATCATGA': 0,\n", + " 'GCTGCAGCTTGCTATCATGA': 0,\n", + " 'GGTGCAGCTAGCTATCATGA': 0,\n", + " 'GGTGCAGCTCGCTATCATGA': 0,\n", + " 'GGTGCAGCTTGCTATCATGA': 0,\n", + " 'GTTGCAGCTAGCTATCATGA': 0,\n", + " 'GTTGCAGCTCGCTATCATGA': 0,\n", + " 'GTTGCAGCTTGCTATCATGA': 0,\n", + " 'GCTGCAGCTGACTATCATGA': 0,\n", + " 'GCTGCAGCTGCCTATCATGA': 0,\n", + " 'GCTGCAGCTGTCTATCATGA': 0,\n", + " 'GGTGCAGCTGACTATCATGA': 0,\n", + " 'GGTGCAGCTGCCTATCATGA': 0,\n", + " 'GGTGCAGCTGTCTATCATGA': 0,\n", + " 'GTTGCAGCTGACTATCATGA': 0,\n", + " 'GTTGCAGCTGCCTATCATGA': 0,\n", + " 'GTTGCAGCTGTCTATCATGA': 
0,\n", + " 'GCTGCAGCTGGATATCATGA': 0,\n", + " 'GCTGCAGCTGGGTATCATGA': 0,\n", + " 'GCTGCAGCTGGTTATCATGA': 0,\n", + " 'GGTGCAGCTGGATATCATGA': 0,\n", + " 'GGTGCAGCTGGGTATCATGA': 0,\n", + " 'GGTGCAGCTGGTTATCATGA': 0,\n", + " 'GTTGCAGCTGGATATCATGA': 0,\n", + " 'GTTGCAGCTGGGTATCATGA': 0,\n", + " 'GTTGCAGCTGGTTATCATGA': 0,\n", + " 'GCTGCAGCTGGCAATCATGA': 0,\n", + " 'GCTGCAGCTGGCCATCATGA': 0,\n", + " 'GCTGCAGCTGGCGATCATGA': 0,\n", + " 'GGTGCAGCTGGCAATCATGA': 0,\n", + " 'GGTGCAGCTGGCCATCATGA': 0,\n", + " 'GGTGCAGCTGGCGATCATGA': 0,\n", + " 'GTTGCAGCTGGCAATCATGA': 0,\n", + " 'GTTGCAGCTGGCCATCATGA': 0,\n", + " 'GTTGCAGCTGGCGATCATGA': 0,\n", + " 'GCTGCAGCTGGCTCTCATGA': 0,\n", + " 'GCTGCAGCTGGCTGTCATGA': 0,\n", + " 'GCTGCAGCTGGCTTTCATGA': 0,\n", + " 'GGTGCAGCTGGCTCTCATGA': 0,\n", + " 'GGTGCAGCTGGCTGTCATGA': 0,\n", + " 'GGTGCAGCTGGCTTTCATGA': 0,\n", + " 'GTTGCAGCTGGCTCTCATGA': 0,\n", + " 'GTTGCAGCTGGCTGTCATGA': 0,\n", + " 'GTTGCAGCTGGCTTTCATGA': 0,\n", + " 'GCTGCAGCTGGCTAACATGA': 0,\n", + " 'GCTGCAGCTGGCTACCATGA': 0,\n", + " 'GCTGCAGCTGGCTAGCATGA': 0,\n", + " 'GGTGCAGCTGGCTAACATGA': 0,\n", + " 'GGTGCAGCTGGCTACCATGA': 0,\n", + " 'GGTGCAGCTGGCTAGCATGA': 0,\n", + " 'GTTGCAGCTGGCTAACATGA': 0,\n", + " 'GTTGCAGCTGGCTACCATGA': 0,\n", + " 'GTTGCAGCTGGCTAGCATGA': 0,\n", + " 'GCTGCAGCTGGCTATAATGA': 0,\n", + " 'GCTGCAGCTGGCTATGATGA': 0,\n", + " 'GCTGCAGCTGGCTATTATGA': 0,\n", + " 'GGTGCAGCTGGCTATAATGA': 0,\n", + " 'GGTGCAGCTGGCTATGATGA': 0,\n", + " 'GGTGCAGCTGGCTATTATGA': 0,\n", + " 'GTTGCAGCTGGCTATAATGA': 0,\n", + " 'GTTGCAGCTGGCTATGATGA': 0,\n", + " 'GTTGCAGCTGGCTATTATGA': 0,\n", + " 'GCTGCAGCTGGCTATCCTGA': 0,\n", + " 'GCTGCAGCTGGCTATCGTGA': 0,\n", + " 'GCTGCAGCTGGCTATCTTGA': 0,\n", + " 'GGTGCAGCTGGCTATCCTGA': 0,\n", + " 'GGTGCAGCTGGCTATCGTGA': 0,\n", + " 'GGTGCAGCTGGCTATCTTGA': 0,\n", + " 'GTTGCAGCTGGCTATCCTGA': 0,\n", + " 'GTTGCAGCTGGCTATCGTGA': 0,\n", + " 'GTTGCAGCTGGCTATCTTGA': 0,\n", + " 'GCTGCAGCTGGCTATCAAGA': 0,\n", + " 'GCTGCAGCTGGCTATCACGA': 0,\n", + " 'GCTGCAGCTGGCTATCAGGA': 
0,\n", + " 'GGTGCAGCTGGCTATCAAGA': 0,\n", + " 'GGTGCAGCTGGCTATCACGA': 0,\n", + " 'GGTGCAGCTGGCTATCAGGA': 0,\n", + " 'GTTGCAGCTGGCTATCAAGA': 0,\n", + " 'GTTGCAGCTGGCTATCACGA': 0,\n", + " 'GTTGCAGCTGGCTATCAGGA': 0,\n", + " 'GCTGCAGCTGGCTATCATAA': 0,\n", + " 'GCTGCAGCTGGCTATCATCA': 0,\n", + " 'GCTGCAGCTGGCTATCATTA': 0,\n", + " 'GGTGCAGCTGGCTATCATAA': 0,\n", + " 'GGTGCAGCTGGCTATCATCA': 0,\n", + " 'GGTGCAGCTGGCTATCATTA': 0,\n", + " 'GTTGCAGCTGGCTATCATAA': 0,\n", + " 'GTTGCAGCTGGCTATCATCA': 0,\n", + " 'GTTGCAGCTGGCTATCATTA': 0,\n", + " 'GCTGCAGCTGGCTATCATGC': 0,\n", + " 'GCTGCAGCTGGCTATCATGG': 0,\n", + " 'GCTGCAGCTGGCTATCATGT': 0,\n", + " 'GGTGCAGCTGGCTATCATGC': 0,\n", + " 'GGTGCAGCTGGCTATCATGG': 0,\n", + " 'GGTGCAGCTGGCTATCATGT': 0,\n", + " 'GTTGCAGCTGGCTATCATGC': 0,\n", + " 'GTTGCAGCTGGCTATCATGG': 0,\n", + " 'GTTGCAGCTGGCTATCATGT': 0,\n", + " 'GAAAGCAGCTGGCTATCATGA': 0,\n", + " 'GAACGCAGCTGGCTATCATGA': 0,\n", + " 'GAAGGCAGCTGGCTATCATGA': 0,\n", + " 'GACAGCAGCTGGCTATCATGA': 0,\n", + " 'GACCGCAGCTGGCTATCATGA': 0,\n", + " 'GACGGCAGCTGGCTATCATGA': 0,\n", + " 'GAGAGCAGCTGGCTATCATGA': 0,\n", + " 'GAGCGCAGCTGGCTATCATGA': 0,\n", + " 'GAGGGCAGCTGGCTATCATGA': 0,\n", + " 'GATAGCAGCTGGCTATCATGA': 0,\n", + " 'GATCGCAGCTGGCTATCATGA': 0,\n", + " 'GATGGCAGCTGGCTATCATGA': 0,\n", + " 'GAAACAGCTGGCTATCATGA': 0,\n", + " 'GAACCAGCTGGCTATCATGA': 0,\n", + " 'GAATCAGCTGGCTATCATGA': 0,\n", + " 'GACACAGCTGGCTATCATGA': 0,\n", + " 'GACCCAGCTGGCTATCATGA': 0,\n", + " 'GACTCAGCTGGCTATCATGA': 0,\n", + " 'GAGACAGCTGGCTATCATGA': 0,\n", + " 'GAGCCAGCTGGCTATCATGA': 0,\n", + " 'GAGTCAGCTGGCTATCATGA': 0,\n", + " 'GAAGAAGCTGGCTATCATGA': 0,\n", + " 'GAAGGAGCTGGCTATCATGA': 0,\n", + " 'GAAGTAGCTGGCTATCATGA': 0,\n", + " 'GACGAAGCTGGCTATCATGA': 0,\n", + " 'GACGGAGCTGGCTATCATGA': 0,\n", + " 'GACGTAGCTGGCTATCATGA': 0,\n", + " 'GAGGAAGCTGGCTATCATGA': 0,\n", + " 'GAGGGAGCTGGCTATCATGA': 0,\n", + " 'GAGGTAGCTGGCTATCATGA': 0,\n", + " 'GAAGCCGCTGGCTATCATGA': 0,\n", + " 'GAAGCGGCTGGCTATCATGA': 0,\n", + " 
'GAAGCTGCTGGCTATCATGA': 0,\n", + " 'GACGCCGCTGGCTATCATGA': 0,\n", + " 'GACGCGGCTGGCTATCATGA': 0,\n", + " 'GACGCTGCTGGCTATCATGA': 0,\n", + " 'GAGGCCGCTGGCTATCATGA': 0,\n", + " 'GAGGCGGCTGGCTATCATGA': 0,\n", + " 'GAGGCTGCTGGCTATCATGA': 0,\n", + " 'GAAGCAACTGGCTATCATGA': 0,\n", + " 'GAAGCACCTGGCTATCATGA': 0,\n", + " 'GAAGCATCTGGCTATCATGA': 0,\n", + " 'GACGCAACTGGCTATCATGA': 0,\n", + " 'GACGCACCTGGCTATCATGA': 0,\n", + " 'GACGCATCTGGCTATCATGA': 0,\n", + " 'GAGGCAACTGGCTATCATGA': 0,\n", + " 'GAGGCACCTGGCTATCATGA': 0,\n", + " 'GAGGCATCTGGCTATCATGA': 0,\n", + " 'GAAGCAGATGGCTATCATGA': 0,\n", + " 'GAAGCAGGTGGCTATCATGA': 0,\n", + " 'GAAGCAGTTGGCTATCATGA': 0,\n", + " 'GACGCAGATGGCTATCATGA': 0,\n", + " 'GACGCAGGTGGCTATCATGA': 0,\n", + " 'GACGCAGTTGGCTATCATGA': 0,\n", + " 'GAGGCAGATGGCTATCATGA': 0,\n", + " 'GAGGCAGGTGGCTATCATGA': 0,\n", + " 'GAGGCAGTTGGCTATCATGA': 0,\n", + " 'GAAGCAGCAGGCTATCATGA': 0,\n", + " 'GAAGCAGCCGGCTATCATGA': 0,\n", + " 'GAAGCAGCGGGCTATCATGA': 0,\n", + " 'GACGCAGCAGGCTATCATGA': 0,\n", + " 'GACGCAGCCGGCTATCATGA': 0,\n", + " 'GACGCAGCGGGCTATCATGA': 0,\n", + " 'GAGGCAGCAGGCTATCATGA': 0,\n", + " 'GAGGCAGCCGGCTATCATGA': 0,\n", + " 'GAGGCAGCGGGCTATCATGA': 0,\n", + " 'GAAGCAGCTAGCTATCATGA': 0,\n", + " 'GAAGCAGCTCGCTATCATGA': 0,\n", + " 'GAAGCAGCTTGCTATCATGA': 0,\n", + " 'GACGCAGCTAGCTATCATGA': 0,\n", + " 'GACGCAGCTCGCTATCATGA': 0,\n", + " 'GACGCAGCTTGCTATCATGA': 0,\n", + " 'GAGGCAGCTAGCTATCATGA': 0,\n", + " 'GAGGCAGCTCGCTATCATGA': 0,\n", + " 'GAGGCAGCTTGCTATCATGA': 0,\n", + " 'GAAGCAGCTGACTATCATGA': 0,\n", + " 'GAAGCAGCTGCCTATCATGA': 0,\n", + " 'GAAGCAGCTGTCTATCATGA': 0,\n", + " 'GACGCAGCTGACTATCATGA': 0,\n", + " 'GACGCAGCTGCCTATCATGA': 0,\n", + " 'GACGCAGCTGTCTATCATGA': 0,\n", + " 'GAGGCAGCTGACTATCATGA': 0,\n", + " 'GAGGCAGCTGCCTATCATGA': 0,\n", + " 'GAGGCAGCTGTCTATCATGA': 0,\n", + " 'GAAGCAGCTGGATATCATGA': 0,\n", + " 'GAAGCAGCTGGGTATCATGA': 0,\n", + " 'GAAGCAGCTGGTTATCATGA': 0,\n", + " 'GACGCAGCTGGATATCATGA': 0,\n", + " 'GACGCAGCTGGGTATCATGA': 0,\n", + " 
'GACGCAGCTGGTTATCATGA': 0,\n", + " 'GAGGCAGCTGGATATCATGA': 0,\n", + " 'GAGGCAGCTGGGTATCATGA': 0,\n", + " 'GAGGCAGCTGGTTATCATGA': 0,\n", + " 'GAAGCAGCTGGCAATCATGA': 0,\n", + " 'GAAGCAGCTGGCCATCATGA': 0,\n", + " 'GAAGCAGCTGGCGATCATGA': 0,\n", + " 'GACGCAGCTGGCAATCATGA': 0,\n", + " 'GACGCAGCTGGCCATCATGA': 0,\n", + " 'GACGCAGCTGGCGATCATGA': 0,\n", + " 'GAGGCAGCTGGCAATCATGA': 0,\n", + " 'GAGGCAGCTGGCCATCATGA': 0,\n", + " 'GAGGCAGCTGGCGATCATGA': 0,\n", + " 'GAAGCAGCTGGCTCTCATGA': 0,\n", + " 'GAAGCAGCTGGCTGTCATGA': 0,\n", + " 'GAAGCAGCTGGCTTTCATGA': 0,\n", + " 'GACGCAGCTGGCTCTCATGA': 0,\n", + " 'GACGCAGCTGGCTGTCATGA': 0,\n", + " 'GACGCAGCTGGCTTTCATGA': 0,\n", + " 'GAGGCAGCTGGCTCTCATGA': 0,\n", + " 'GAGGCAGCTGGCTGTCATGA': 0,\n", + " 'GAGGCAGCTGGCTTTCATGA': 0,\n", + " 'GAAGCAGCTGGCTAACATGA': 0,\n", + " 'GAAGCAGCTGGCTACCATGA': 0,\n", + " 'GAAGCAGCTGGCTAGCATGA': 0,\n", + " 'GACGCAGCTGGCTAACATGA': 0,\n", + " 'GACGCAGCTGGCTACCATGA': 0,\n", + " 'GACGCAGCTGGCTAGCATGA': 0,\n", + " 'GAGGCAGCTGGCTAACATGA': 0,\n", + " 'GAGGCAGCTGGCTACCATGA': 0,\n", + " 'GAGGCAGCTGGCTAGCATGA': 0,\n", + " 'GAAGCAGCTGGCTATAATGA': 0,\n", + " 'GAAGCAGCTGGCTATGATGA': 0,\n", + " 'GAAGCAGCTGGCTATTATGA': 0,\n", + " 'GACGCAGCTGGCTATAATGA': 0,\n", + " 'GACGCAGCTGGCTATGATGA': 0,\n", + " 'GACGCAGCTGGCTATTATGA': 0,\n", + " 'GAGGCAGCTGGCTATAATGA': 0,\n", + " 'GAGGCAGCTGGCTATGATGA': 0,\n", + " 'GAGGCAGCTGGCTATTATGA': 0,\n", + " 'GAAGCAGCTGGCTATCCTGA': 0,\n", + " 'GAAGCAGCTGGCTATCGTGA': 0,\n", + " 'GAAGCAGCTGGCTATCTTGA': 0,\n", + " 'GACGCAGCTGGCTATCCTGA': 0,\n", + " 'GACGCAGCTGGCTATCGTGA': 0,\n", + " 'GACGCAGCTGGCTATCTTGA': 0,\n", + " 'GAGGCAGCTGGCTATCCTGA': 0,\n", + " 'GAGGCAGCTGGCTATCGTGA': 0,\n", + " 'GAGGCAGCTGGCTATCTTGA': 0,\n", + " 'GAAGCAGCTGGCTATCAAGA': 0,\n", + " 'GAAGCAGCTGGCTATCACGA': 0,\n", + " 'GAAGCAGCTGGCTATCAGGA': 0,\n", + " 'GACGCAGCTGGCTATCAAGA': 0,\n", + " 'GACGCAGCTGGCTATCACGA': 0,\n", + " 'GACGCAGCTGGCTATCAGGA': 0,\n", + " 'GAGGCAGCTGGCTATCAAGA': 0,\n", + " 'GAGGCAGCTGGCTATCACGA': 0,\n", + " 
'GAGGCAGCTGGCTATCAGGA': 0,\n", + " 'GAAGCAGCTGGCTATCATAA': 0,\n", + " 'GAAGCAGCTGGCTATCATCA': 0,\n", + " 'GAAGCAGCTGGCTATCATTA': 0,\n", + " 'GACGCAGCTGGCTATCATAA': 0,\n", + " 'GACGCAGCTGGCTATCATCA': 0,\n", + " 'GACGCAGCTGGCTATCATTA': 0,\n", + " 'GAGGCAGCTGGCTATCATAA': 0,\n", + " 'GAGGCAGCTGGCTATCATCA': 0,\n", + " 'GAGGCAGCTGGCTATCATTA': 0,\n", + " 'GAAGCAGCTGGCTATCATGC': 0,\n", + " 'GAAGCAGCTGGCTATCATGG': 0,\n", + " 'GAAGCAGCTGGCTATCATGT': 0,\n", + " 'GACGCAGCTGGCTATCATGC': 0,\n", + " 'GACGCAGCTGGCTATCATGG': 0,\n", + " 'GACGCAGCTGGCTATCATGT': 0,\n", + " 'GAGGCAGCTGGCTATCATGC': 0,\n", + " 'GAGGCAGCTGGCTATCATGG': 0,\n", + " 'GAGGCAGCTGGCTATCATGT': 0,\n", + " 'GATAACAGCTGGCTATCATGA': 0,\n", + " 'GATACCAGCTGGCTATCATGA': 0,\n", + " 'GATATCAGCTGGCTATCATGA': 0,\n", + " 'GATCACAGCTGGCTATCATGA': 0,\n", + " 'GATCCCAGCTGGCTATCATGA': 0,\n", + " 'GATCTCAGCTGGCTATCATGA': 0,\n", + " 'GATGACAGCTGGCTATCATGA': 0,\n", + " 'GATGCCAGCTGGCTATCATGA': 0,\n", + " 'GATGTCAGCTGGCTATCATGA': 0,\n", + " 'GATTACAGCTGGCTATCATGA': 0,\n", + " 'GATTCCAGCTGGCTATCATGA': 0,\n", + " 'GATTTCAGCTGGCTATCATGA': 0,\n", + " 'GATAAAGCTGGCTATCATGA': 0,\n", + " 'GATAGAGCTGGCTATCATGA': 0,\n", + " 'GATATAGCTGGCTATCATGA': 0,\n", + " 'GATCAAGCTGGCTATCATGA': 0,\n", + " 'GATCGAGCTGGCTATCATGA': 0,\n", + " 'GATCTAGCTGGCTATCATGA': 0,\n", + " 'GATTAAGCTGGCTATCATGA': 0,\n", + " 'GATTGAGCTGGCTATCATGA': 0,\n", + " 'GATTTAGCTGGCTATCATGA': 0,\n", + " 'GATACCGCTGGCTATCATGA': 0,\n", + " 'GATACGGCTGGCTATCATGA': 0,\n", + " 'GATACTGCTGGCTATCATGA': 0,\n", + " 'GATCCCGCTGGCTATCATGA': 0,\n", + " 'GATCCGGCTGGCTATCATGA': 0,\n", + " 'GATCCTGCTGGCTATCATGA': 0,\n", + " 'GATTCCGCTGGCTATCATGA': 0,\n", + " 'GATTCGGCTGGCTATCATGA': 0,\n", + " 'GATTCTGCTGGCTATCATGA': 0,\n", + " 'GATACAACTGGCTATCATGA': 0,\n", + " 'GATACACCTGGCTATCATGA': 0,\n", + " 'GATACATCTGGCTATCATGA': 0,\n", + " 'GATCCAACTGGCTATCATGA': 0,\n", + " 'GATCCACCTGGCTATCATGA': 0,\n", + " 'GATCCATCTGGCTATCATGA': 0,\n", + " 'GATTCAACTGGCTATCATGA': 0,\n", + " 'GATTCACCTGGCTATCATGA': 
0,\n", + " 'GATTCATCTGGCTATCATGA': 0,\n", + " 'GATACAGATGGCTATCATGA': 0,\n", + " 'GATACAGGTGGCTATCATGA': 0,\n", + " 'GATACAGTTGGCTATCATGA': 0,\n", + " 'GATCCAGATGGCTATCATGA': 0,\n", + " 'GATCCAGGTGGCTATCATGA': 0,\n", + " 'GATCCAGTTGGCTATCATGA': 0,\n", + " 'GATTCAGATGGCTATCATGA': 0,\n", + " 'GATTCAGGTGGCTATCATGA': 0,\n", + " 'GATTCAGTTGGCTATCATGA': 0,\n", + " 'GATACAGCAGGCTATCATGA': 0,\n", + " 'GATACAGCCGGCTATCATGA': 0,\n", + " 'GATACAGCGGGCTATCATGA': 0,\n", + " 'GATCCAGCAGGCTATCATGA': 0,\n", + " 'GATCCAGCCGGCTATCATGA': 0,\n", + " 'GATCCAGCGGGCTATCATGA': 0,\n", + " 'GATTCAGCAGGCTATCATGA': 0,\n", + " 'GATTCAGCCGGCTATCATGA': 0,\n", + " 'GATTCAGCGGGCTATCATGA': 0,\n", + " 'GATACAGCTAGCTATCATGA': 0,\n", + " 'GATACAGCTCGCTATCATGA': 0,\n", + " 'GATACAGCTTGCTATCATGA': 0,\n", + " 'GATCCAGCTAGCTATCATGA': 0,\n", + " 'GATCCAGCTCGCTATCATGA': 0,\n", + " 'GATCCAGCTTGCTATCATGA': 0,\n", + " 'GATTCAGCTAGCTATCATGA': 0,\n", + " 'GATTCAGCTCGCTATCATGA': 0,\n", + " 'GATTCAGCTTGCTATCATGA': 0,\n", + " 'GATACAGCTGACTATCATGA': 0,\n", + " 'GATACAGCTGCCTATCATGA': 0,\n", + " 'GATACAGCTGTCTATCATGA': 0,\n", + " 'GATCCAGCTGACTATCATGA': 0,\n", + " 'GATCCAGCTGCCTATCATGA': 0,\n", + " 'GATCCAGCTGTCTATCATGA': 0,\n", + " 'GATTCAGCTGACTATCATGA': 0,\n", + " 'GATTCAGCTGCCTATCATGA': 0,\n", + " 'GATTCAGCTGTCTATCATGA': 0,\n", + " 'GATACAGCTGGATATCATGA': 0,\n", + " 'GATACAGCTGGGTATCATGA': 0,\n", + " 'GATACAGCTGGTTATCATGA': 0,\n", + " 'GATCCAGCTGGATATCATGA': 0,\n", + " 'GATCCAGCTGGGTATCATGA': 0,\n", + " 'GATCCAGCTGGTTATCATGA': 0,\n", + " 'GATTCAGCTGGATATCATGA': 0,\n", + " 'GATTCAGCTGGGTATCATGA': 0,\n", + " 'GATTCAGCTGGTTATCATGA': 0,\n", + " 'GATACAGCTGGCAATCATGA': 0,\n", + " 'GATACAGCTGGCCATCATGA': 0,\n", + " 'GATACAGCTGGCGATCATGA': 0,\n", + " 'GATCCAGCTGGCAATCATGA': 0,\n", + " 'GATCCAGCTGGCCATCATGA': 0,\n", + " 'GATCCAGCTGGCGATCATGA': 0,\n", + " 'GATTCAGCTGGCAATCATGA': 0,\n", + " 'GATTCAGCTGGCCATCATGA': 0,\n", + " 'GATTCAGCTGGCGATCATGA': 0,\n", + " 'GATACAGCTGGCTCTCATGA': 0,\n", + " 'GATACAGCTGGCTGTCATGA': 
0,\n", + " 'GATACAGCTGGCTTTCATGA': 0,\n", + " 'GATCCAGCTGGCTCTCATGA': 0,\n", + " 'GATCCAGCTGGCTGTCATGA': 0,\n", + " 'GATCCAGCTGGCTTTCATGA': 0,\n", + " 'GATTCAGCTGGCTCTCATGA': 0,\n", + " 'GATTCAGCTGGCTGTCATGA': 0,\n", + " 'GATTCAGCTGGCTTTCATGA': 0,\n", + " 'GATACAGCTGGCTAACATGA': 0,\n", + " 'GATACAGCTGGCTACCATGA': 0,\n", + " 'GATACAGCTGGCTAGCATGA': 0,\n", + " 'GATCCAGCTGGCTAACATGA': 0,\n", + " 'GATCCAGCTGGCTACCATGA': 0,\n", + " 'GATCCAGCTGGCTAGCATGA': 0,\n", + " 'GATTCAGCTGGCTAACATGA': 0,\n", + " 'GATTCAGCTGGCTACCATGA': 0,\n", + " 'GATTCAGCTGGCTAGCATGA': 0,\n", + " 'GATACAGCTGGCTATAATGA': 0,\n", + " 'GATACAGCTGGCTATGATGA': 0,\n", + " 'GATACAGCTGGCTATTATGA': 0,\n", + " 'GATCCAGCTGGCTATAATGA': 0,\n", + " 'GATCCAGCTGGCTATGATGA': 0,\n", + " 'GATCCAGCTGGCTATTATGA': 0,\n", + " 'GATTCAGCTGGCTATAATGA': 0,\n", + " 'GATTCAGCTGGCTATGATGA': 0,\n", + " 'GATTCAGCTGGCTATTATGA': 0,\n", + " 'GATACAGCTGGCTATCCTGA': 0,\n", + " 'GATACAGCTGGCTATCGTGA': 0,\n", + " 'GATACAGCTGGCTATCTTGA': 0,\n", + " 'GATCCAGCTGGCTATCCTGA': 0,\n", + " 'GATCCAGCTGGCTATCGTGA': 0,\n", + " 'GATCCAGCTGGCTATCTTGA': 0,\n", + " 'GATTCAGCTGGCTATCCTGA': 0,\n", + " 'GATTCAGCTGGCTATCGTGA': 0,\n", + " 'GATTCAGCTGGCTATCTTGA': 0,\n", + " 'GATACAGCTGGCTATCAAGA': 0,\n", + " 'GATACAGCTGGCTATCACGA': 0,\n", + " 'GATACAGCTGGCTATCAGGA': 0,\n", + " 'GATCCAGCTGGCTATCAAGA': 0,\n", + " 'GATCCAGCTGGCTATCACGA': 0,\n", + " 'GATCCAGCTGGCTATCAGGA': 0,\n", + " 'GATTCAGCTGGCTATCAAGA': 0,\n", + " 'GATTCAGCTGGCTATCACGA': 0,\n", + " 'GATTCAGCTGGCTATCAGGA': 0,\n", + " 'GATACAGCTGGCTATCATAA': 0,\n", + " 'GATACAGCTGGCTATCATCA': 0,\n", + " 'GATACAGCTGGCTATCATTA': 0,\n", + " 'GATCCAGCTGGCTATCATAA': 0,\n", + " 'GATCCAGCTGGCTATCATCA': 0,\n", + " 'GATCCAGCTGGCTATCATTA': 0,\n", + " 'GATTCAGCTGGCTATCATAA': 0,\n", + " 'GATTCAGCTGGCTATCATCA': 0,\n", + " 'GATTCAGCTGGCTATCATTA': 0,\n", + " 'GATACAGCTGGCTATCATGC': 0,\n", + " 'GATACAGCTGGCTATCATGG': 0,\n", + " 'GATACAGCTGGCTATCATGT': 0,\n", + " 'GATCCAGCTGGCTATCATGC': 0,\n", + " 'GATCCAGCTGGCTATCATGG': 
0,\n", + " 'GATCCAGCTGGCTATCATGT': 0,\n", + " 'GATTCAGCTGGCTATCATGC': 0,\n", + " 'GATTCAGCTGGCTATCATGG': 0,\n", + " 'GATTCAGCTGGCTATCATGT': 0,\n", + " 'GATGAAAGCTGGCTATCATGA': 0,\n", + " 'GATGAGAGCTGGCTATCATGA': 0,\n", + " 'GATGATAGCTGGCTATCATGA': 0,\n", + " 'GATGCAAGCTGGCTATCATGA': 0,\n", + " 'GATGCGAGCTGGCTATCATGA': 0,\n", + " 'GATGCTAGCTGGCTATCATGA': 0,\n", + " 'GATGGAAGCTGGCTATCATGA': 0,\n", + " 'GATGGGAGCTGGCTATCATGA': 0,\n", + " 'GATGGTAGCTGGCTATCATGA': 0,\n", + " 'GATGTAAGCTGGCTATCATGA': 0,\n", + " 'GATGTGAGCTGGCTATCATGA': 0,\n", + " 'GATGTTAGCTGGCTATCATGA': 0,\n", + " 'GATGACGCTGGCTATCATGA': 0,\n", + " 'GATGAGGCTGGCTATCATGA': 0,\n", + " 'GATGATGCTGGCTATCATGA': 0,\n", + " 'GATGGCGCTGGCTATCATGA': 0,\n", + " 'GATGGGGCTGGCTATCATGA': 0,\n", + " 'GATGGTGCTGGCTATCATGA': 0,\n", + " 'GATGTCGCTGGCTATCATGA': 0,\n", + " 'GATGTGGCTGGCTATCATGA': 0,\n", + " 'GATGTTGCTGGCTATCATGA': 0,\n", + " 'GATGAAACTGGCTATCATGA': 0,\n", + " 'GATGAACCTGGCTATCATGA': 0,\n", + " 'GATGAATCTGGCTATCATGA': 0,\n", + " 'GATGGAACTGGCTATCATGA': 0,\n", + " 'GATGGACCTGGCTATCATGA': 0,\n", + " 'GATGGATCTGGCTATCATGA': 0,\n", + " 'GATGTAACTGGCTATCATGA': 0,\n", + " 'GATGTACCTGGCTATCATGA': 0,\n", + " 'GATGTATCTGGCTATCATGA': 0,\n", + " 'GATGAAGATGGCTATCATGA': 0,\n", + " 'GATGAAGGTGGCTATCATGA': 0,\n", + " 'GATGAAGTTGGCTATCATGA': 0,\n", + " 'GATGGAGATGGCTATCATGA': 0,\n", + " 'GATGGAGGTGGCTATCATGA': 0,\n", + " 'GATGGAGTTGGCTATCATGA': 0,\n", + " 'GATGTAGATGGCTATCATGA': 0,\n", + " 'GATGTAGGTGGCTATCATGA': 0,\n", + " 'GATGTAGTTGGCTATCATGA': 0,\n", + " 'GATGAAGCAGGCTATCATGA': 0,\n", + " 'GATGAAGCCGGCTATCATGA': 0,\n", + " 'GATGAAGCGGGCTATCATGA': 0,\n", + " 'GATGGAGCAGGCTATCATGA': 0,\n", + " 'GATGGAGCCGGCTATCATGA': 0,\n", + " 'GATGGAGCGGGCTATCATGA': 0,\n", + " 'GATGTAGCAGGCTATCATGA': 0,\n", + " 'GATGTAGCCGGCTATCATGA': 0,\n", + " 'GATGTAGCGGGCTATCATGA': 0,\n", + " 'GATGAAGCTAGCTATCATGA': 0,\n", + " 'GATGAAGCTCGCTATCATGA': 0,\n", + " 'GATGAAGCTTGCTATCATGA': 0,\n", + " 'GATGGAGCTAGCTATCATGA': 0,\n", + " 
'GATGGAGCTCGCTATCATGA': 0,\n", + " 'GATGGAGCTTGCTATCATGA': 0,\n", + " 'GATGTAGCTAGCTATCATGA': 0,\n", + " 'GATGTAGCTCGCTATCATGA': 0,\n", + " 'GATGTAGCTTGCTATCATGA': 0,\n", + " 'GATGAAGCTGACTATCATGA': 0,\n", + " 'GATGAAGCTGCCTATCATGA': 0,\n", + " 'GATGAAGCTGTCTATCATGA': 0,\n", + " 'GATGGAGCTGACTATCATGA': 0,\n", + " 'GATGGAGCTGCCTATCATGA': 0,\n", + " 'GATGGAGCTGTCTATCATGA': 0,\n", + " 'GATGTAGCTGACTATCATGA': 0,\n", + " 'GATGTAGCTGCCTATCATGA': 0,\n", + " 'GATGTAGCTGTCTATCATGA': 0,\n", + " 'GATGAAGCTGGATATCATGA': 0,\n", + " 'GATGAAGCTGGGTATCATGA': 0,\n", + " 'GATGAAGCTGGTTATCATGA': 0,\n", + " 'GATGGAGCTGGATATCATGA': 0,\n", + " 'GATGGAGCTGGGTATCATGA': 0,\n", + " 'GATGGAGCTGGTTATCATGA': 0,\n", + " 'GATGTAGCTGGATATCATGA': 0,\n", + " 'GATGTAGCTGGGTATCATGA': 0,\n", + " 'GATGTAGCTGGTTATCATGA': 0,\n", + " 'GATGAAGCTGGCAATCATGA': 0,\n", + " 'GATGAAGCTGGCCATCATGA': 0,\n", + " 'GATGAAGCTGGCGATCATGA': 0,\n", + " 'GATGGAGCTGGCAATCATGA': 0,\n", + " 'GATGGAGCTGGCCATCATGA': 0,\n", + " 'GATGGAGCTGGCGATCATGA': 0,\n", + " 'GATGTAGCTGGCAATCATGA': 0,\n", + " 'GATGTAGCTGGCCATCATGA': 0,\n", + " 'GATGTAGCTGGCGATCATGA': 0,\n", + " 'GATGAAGCTGGCTCTCATGA': 0,\n", + " 'GATGAAGCTGGCTGTCATGA': 0,\n", + " 'GATGAAGCTGGCTTTCATGA': 0,\n", + " 'GATGGAGCTGGCTCTCATGA': 0,\n", + " 'GATGGAGCTGGCTGTCATGA': 0,\n", + " 'GATGGAGCTGGCTTTCATGA': 0,\n", + " 'GATGTAGCTGGCTCTCATGA': 0,\n", + " 'GATGTAGCTGGCTGTCATGA': 0,\n", + " 'GATGTAGCTGGCTTTCATGA': 0,\n", + " 'GATGAAGCTGGCTAACATGA': 0,\n", + " 'GATGAAGCTGGCTACCATGA': 0,\n", + " 'GATGAAGCTGGCTAGCATGA': 0,\n", + " 'GATGGAGCTGGCTAACATGA': 0,\n", + " 'GATGGAGCTGGCTACCATGA': 0,\n", + " 'GATGGAGCTGGCTAGCATGA': 0,\n", + " 'GATGTAGCTGGCTAACATGA': 0,\n", + " 'GATGTAGCTGGCTACCATGA': 0,\n", + " 'GATGTAGCTGGCTAGCATGA': 0,\n", + " 'GATGAAGCTGGCTATAATGA': 0,\n", + " 'GATGAAGCTGGCTATGATGA': 0,\n", + " 'GATGAAGCTGGCTATTATGA': 0,\n", + " 'GATGGAGCTGGCTATAATGA': 0,\n", + " 'GATGGAGCTGGCTATGATGA': 0,\n", + " 'GATGGAGCTGGCTATTATGA': 0,\n", + " 'GATGTAGCTGGCTATAATGA': 0,\n", + " 
'GATGTAGCTGGCTATGATGA': 0,\n", + " 'GATGTAGCTGGCTATTATGA': 0,\n", + " 'GATGAAGCTGGCTATCCTGA': 0,\n", + " 'GATGAAGCTGGCTATCGTGA': 0,\n", + " 'GATGAAGCTGGCTATCTTGA': 0,\n", + " 'GATGGAGCTGGCTATCCTGA': 0,\n", + " 'GATGGAGCTGGCTATCGTGA': 0,\n", + " 'GATGGAGCTGGCTATCTTGA': 0,\n", + " 'GATGTAGCTGGCTATCCTGA': 0,\n", + " 'GATGTAGCTGGCTATCGTGA': 0,\n", + " 'GATGTAGCTGGCTATCTTGA': 0,\n", + " 'GATGAAGCTGGCTATCAAGA': 0,\n", + " 'GATGAAGCTGGCTATCACGA': 0,\n", + " 'GATGAAGCTGGCTATCAGGA': 0,\n", + " 'GATGGAGCTGGCTATCAAGA': 0,\n", + " 'GATGGAGCTGGCTATCACGA': 0,\n", + " 'GATGGAGCTGGCTATCAGGA': 0,\n", + " 'GATGTAGCTGGCTATCAAGA': 0,\n", + " 'GATGTAGCTGGCTATCACGA': 0,\n", + " 'GATGTAGCTGGCTATCAGGA': 0,\n", + " 'GATGAAGCTGGCTATCATAA': 0,\n", + " 'GATGAAGCTGGCTATCATCA': 0,\n", + " 'GATGAAGCTGGCTATCATTA': 0,\n", + " 'GATGGAGCTGGCTATCATAA': 0,\n", + " 'GATGGAGCTGGCTATCATCA': 0,\n", + " 'GATGGAGCTGGCTATCATTA': 0,\n", + " 'GATGTAGCTGGCTATCATAA': 0,\n", + " 'GATGTAGCTGGCTATCATCA': 0,\n", + " 'GATGTAGCTGGCTATCATTA': 0,\n", + " 'GATGAAGCTGGCTATCATGC': 0,\n", + " 'GATGAAGCTGGCTATCATGG': 0,\n", + " 'GATGAAGCTGGCTATCATGT': 0,\n", + " 'GATGGAGCTGGCTATCATGC': 0,\n", + " 'GATGGAGCTGGCTATCATGG': 0,\n", + " 'GATGGAGCTGGCTATCATGT': 0,\n", + " 'GATGTAGCTGGCTATCATGC': 0,\n", + " 'GATGTAGCTGGCTATCATGG': 0,\n", + " 'GATGTAGCTGGCTATCATGT': 0,\n", + " 'GATGCACGCTGGCTATCATGA': 0,\n", + " 'GATGCAGGCTGGCTATCATGA': 0,\n", + " 'GATGCATGCTGGCTATCATGA': 0,\n", + " 'GATGCCCGCTGGCTATCATGA': 0,\n", + " 'GATGCCGGCTGGCTATCATGA': 0,\n", + " 'GATGCCTGCTGGCTATCATGA': 0,\n", + " 'GATGCGCGCTGGCTATCATGA': 0,\n", + " 'GATGCGGGCTGGCTATCATGA': 0,\n", + " 'GATGCGTGCTGGCTATCATGA': 0,\n", + " 'GATGCTCGCTGGCTATCATGA': 0,\n", + " 'GATGCTGGCTGGCTATCATGA': 0,\n", + " 'GATGCTTGCTGGCTATCATGA': 0,\n", + " 'GATGCCACTGGCTATCATGA': 0,\n", + " 'GATGCCCCTGGCTATCATGA': 0,\n", + " 'GATGCCTCTGGCTATCATGA': 0,\n", + " 'GATGCGACTGGCTATCATGA': 0,\n", + " 'GATGCGCCTGGCTATCATGA': 0,\n", + " 'GATGCGTCTGGCTATCATGA': 0,\n", + " 'GATGCTACTGGCTATCATGA': 
0,\n", + " 'GATGCTCCTGGCTATCATGA': 0,\n", + " 'GATGCTTCTGGCTATCATGA': 0,\n", + " 'GATGCCGATGGCTATCATGA': 0,\n", + " 'GATGCCGGTGGCTATCATGA': 0,\n", + " 'GATGCCGTTGGCTATCATGA': 0,\n", + " 'GATGCGGATGGCTATCATGA': 0,\n", + " 'GATGCGGGTGGCTATCATGA': 0,\n", + " 'GATGCGGTTGGCTATCATGA': 0,\n", + " 'GATGCTGATGGCTATCATGA': 0,\n", + " 'GATGCTGGTGGCTATCATGA': 0,\n", + " 'GATGCTGTTGGCTATCATGA': 0,\n", + " 'GATGCCGCAGGCTATCATGA': 0,\n", + " 'GATGCCGCCGGCTATCATGA': 0,\n", + " 'GATGCCGCGGGCTATCATGA': 0,\n", + " 'GATGCGGCAGGCTATCATGA': 0,\n", + " 'GATGCGGCCGGCTATCATGA': 0,\n", + " 'GATGCGGCGGGCTATCATGA': 0,\n", + " 'GATGCTGCAGGCTATCATGA': 0,\n", + " 'GATGCTGCCGGCTATCATGA': 0,\n", + " 'GATGCTGCGGGCTATCATGA': 0,\n", + " 'GATGCCGCTAGCTATCATGA': 0,\n", + " 'GATGCCGCTCGCTATCATGA': 0,\n", + " 'GATGCCGCTTGCTATCATGA': 0,\n", + " 'GATGCGGCTAGCTATCATGA': 0,\n", + " 'GATGCGGCTCGCTATCATGA': 0,\n", + " 'GATGCGGCTTGCTATCATGA': 0,\n", + " 'GATGCTGCTAGCTATCATGA': 0,\n", + " 'GATGCTGCTCGCTATCATGA': 0,\n", + " 'GATGCTGCTTGCTATCATGA': 0,\n", + " 'GATGCCGCTGACTATCATGA': 0,\n", + " 'GATGCCGCTGCCTATCATGA': 0,\n", + " 'GATGCCGCTGTCTATCATGA': 0,\n", + " 'GATGCGGCTGACTATCATGA': 0,\n", + " 'GATGCGGCTGCCTATCATGA': 0,\n", + " 'GATGCGGCTGTCTATCATGA': 0,\n", + " 'GATGCTGCTGACTATCATGA': 0,\n", + " 'GATGCTGCTGCCTATCATGA': 0,\n", + " 'GATGCTGCTGTCTATCATGA': 0,\n", + " 'GATGCCGCTGGATATCATGA': 0,\n", + " 'GATGCCGCTGGGTATCATGA': 0,\n", + " 'GATGCCGCTGGTTATCATGA': 0,\n", + " 'GATGCGGCTGGATATCATGA': 0,\n", + " 'GATGCGGCTGGGTATCATGA': 0,\n", + " 'GATGCGGCTGGTTATCATGA': 0,\n", + " 'GATGCTGCTGGATATCATGA': 0,\n", + " 'GATGCTGCTGGGTATCATGA': 0,\n", + " 'GATGCTGCTGGTTATCATGA': 0,\n", + " 'GATGCCGCTGGCAATCATGA': 0,\n", + " 'GATGCCGCTGGCCATCATGA': 0,\n", + " 'GATGCCGCTGGCGATCATGA': 0,\n", + " 'GATGCGGCTGGCAATCATGA': 0,\n", + " 'GATGCGGCTGGCCATCATGA': 0,\n", + " 'GATGCGGCTGGCGATCATGA': 0,\n", + " 'GATGCTGCTGGCAATCATGA': 0,\n", + " 'GATGCTGCTGGCCATCATGA': 0,\n", + " 'GATGCTGCTGGCGATCATGA': 0,\n", + " 'GATGCCGCTGGCTCTCATGA': 
0,\n", + " 'GATGCCGCTGGCTGTCATGA': 0,\n", + " 'GATGCCGCTGGCTTTCATGA': 0,\n", + " 'GATGCGGCTGGCTCTCATGA': 0,\n", + " 'GATGCGGCTGGCTGTCATGA': 0,\n", + " 'GATGCGGCTGGCTTTCATGA': 0,\n", + " 'GATGCTGCTGGCTCTCATGA': 0,\n", + " 'GATGCTGCTGGCTGTCATGA': 0,\n", + " 'GATGCTGCTGGCTTTCATGA': 0,\n", + " 'GATGCCGCTGGCTAACATGA': 0,\n", + " 'GATGCCGCTGGCTACCATGA': 0,\n", + " 'GATGCCGCTGGCTAGCATGA': 0,\n", + " 'GATGCGGCTGGCTAACATGA': 0,\n", + " 'GATGCGGCTGGCTACCATGA': 0,\n", + " 'GATGCGGCTGGCTAGCATGA': 0,\n", + " 'GATGCTGCTGGCTAACATGA': 0,\n", + " 'GATGCTGCTGGCTACCATGA': 0,\n", + " 'GATGCTGCTGGCTAGCATGA': 0,\n", + " 'GATGCCGCTGGCTATAATGA': 0,\n", + " 'GATGCCGCTGGCTATGATGA': 0,\n", + " 'GATGCCGCTGGCTATTATGA': 0,\n", + " 'GATGCGGCTGGCTATAATGA': 0,\n", + " 'GATGCGGCTGGCTATGATGA': 0,\n", + " 'GATGCGGCTGGCTATTATGA': 0,\n", + " 'GATGCTGCTGGCTATAATGA': 0,\n", + " 'GATGCTGCTGGCTATGATGA': 0,\n", + " 'GATGCTGCTGGCTATTATGA': 0,\n", + " 'GATGCCGCTGGCTATCCTGA': 0,\n", + " 'GATGCCGCTGGCTATCGTGA': 0,\n", + " 'GATGCCGCTGGCTATCTTGA': 0,\n", + " 'GATGCGGCTGGCTATCCTGA': 0,\n", + " 'GATGCGGCTGGCTATCGTGA': 0,\n", + " 'GATGCGGCTGGCTATCTTGA': 0,\n", + " 'GATGCTGCTGGCTATCCTGA': 0,\n", + " 'GATGCTGCTGGCTATCGTGA': 0,\n", + " ...}" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "barcode_dict" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing read 0...\n", + "Processing read 10000...\n", + "Processing read 20000...\n", + "Processing read 30000...\n", + "Processing read 40000...\n", + "Processing read 50000...\n", + "Processing read 60000...\n", + "Processing read 70000...\n", + "Processing read 80000...\n", + "Processing read 90000...\n", + "Processing read 100000...\n", + "Processing read 110000...\n", + "Processing read 120000...\n", + "Processing read 130000...\n", + "Processing read 140000...\n", + 
"Processing read 150000...\n", + "Processing read 160000...\n", + "Processing read 170000...\n", + "Processing read 180000...\n", + "Processing read 190000...\n", + "Processing read 200000...\n", + "Processing read 210000...\n", + "Processing read 220000...\n", + "Processing read 230000...\n", + "Processing read 240000...\n", + "Processing read 250000...\n", + "Processing read 260000...\n", + "Processing read 270000...\n", + "Done!\n", + "Total # of unique-UMI reads = 192543\n" + ] + } + ], + "source": [ + "#Parse and map RNA sequencing reads\n", + "\n", + "save_suffix = \"_var_repl_2_sknsh_v3\"\n", + "\n", + "r1_name = \"unprocessed_data/SKNSH_Var_R2_trimmed.fastq\"\n", + "r2_name = \"unprocessed_data/SKNSH_Var_R2_UMI_trimmed.fastq\"\n", + "\n", + "polya_regexp = re.compile(r\"AAAAA(AAAAAAAAAAAAAAA){s<=2}\")\n", + "distal_regexp = re.compile(r\"(GCCTCGACTGTGCCTTCTAG){s<=2}\")\n", + "\n", + "def _hamming(s1, s2) :\n", + " \n", + " d = 0.\n", + " for j in range(len(s1)) :\n", + " if s1[j] != s2[j] :\n", + " d += 1.\n", + " \n", + " return d\n", + "\n", + "max_pos = 176\n", + "\n", + "umi_dict = {}\n", + "umi_n_muts = 0\n", + "\n", + "bases = ['A', 'C', 'G', 'T']\n", + "\n", + "cuts = np.zeros((len(library_df), 206))\n", + "\n", + "f1 = open(r1_name, 'rt')\n", + "f2 = open(r2_name, 'rt')\n", + "\n", + "#Iterate through reads sequenctially (r1 and r2)\n", + "\n", + "r1_counter = 0\n", + "while True :\n", + " \n", + " #Read 1\n", + " id1 = f1.readline().strip()\n", + " \n", + " #Check for end-of-file\n", + " if len(id1) == 0 :\n", + " break\n", + " \n", + " r1 = f1.readline().strip()\n", + " s1 = f1.readline().strip()\n", + " q1 = f1.readline().strip()\n", + " \n", + " #Read 2\n", + " id2 = f2.readline().strip()\n", + " r2 = f2.readline().strip()\n", + " s2 = f2.readline().strip()\n", + " q2 = f2.readline().strip()\n", + " \n", + " if r1_counter % 10000 == 0 :\n", + " print(\"Processing read \" + str(r1_counter) + \"...\")\n", + " \n", + " r1_counter += 1\n", + " \n", 
+ " #Map read to library member\n", + " bc = r1[:20]\n", + " \n", + " lib_i = -1\n", + " if bc in barcode_dict :\n", + " lib_i = barcode_dict[bc]\n", + " \n", + " if lib_i == -1 :\n", + " continue\n", + " \n", + " if umi_n_muts == 0 :\n", + " bc = sequences[lib_i][:20]\n", + " \n", + " #Determine if we have seen this umi before, otherwise mark as visited\n", + " umi = r2[:8]\n", + " \n", + " if bc not in umi_dict :\n", + " umi_dict[bc] = {}\n", + " \n", + " umi_visited = False\n", + " if umi in umi_dict[bc] :\n", + " umi_visited = True\n", + " elif umi_n_muts == 1 :\n", + " for pos1 in range(len(umi)) :\n", + " for b1 in bases :\n", + " umi_mut = umi[:pos1] + b1 + umi[pos1+1:]\n", + " if umi_mut in umi_dict[bc] :\n", + " umi_visited = True\n", + " break\n", + " if umi_visited :\n", + " break\n", + " elif umi_n_muts == 2 :\n", + " for pos1 in range(len(umi)) :\n", + " for pos2 in range(pos1, len(umi)) :\n", + " for b1 in bases :\n", + " for b2 in bases :\n", + " umi_mut = umi[:pos1] + b1 + umi[pos1+1:pos2] + b2 + umi[pos2+1:]\n", + " if umi_mut in umi_dict[bc] :\n", + " umi_visited = True\n", + " break\n", + " if umi_visited :\n", + " break\n", + " if umi_visited :\n", + " break\n", + " if umi_visited :\n", + " break\n", + " \n", + " #Skip if umi already seen\n", + " if umi_visited :\n", + " continue\n", + " \n", + " #Determine polyA position (or alternative if the read is distally polyadenylated)\n", + " polya_match = re.search(polya_regexp, r1)\n", + " \n", + " polya_pos = -1\n", + " if polya_match is not None and polya_match.span()[0] < max_pos :\n", + " polya_pos = polya_match.span()[0]\n", + " \n", + " #Determine if distal read\n", + " is_distal = False\n", + " distal_match = re.search(distal_regexp, r1[209-5:209+20+5])\n", + " \n", + " if distal_match is not None :\n", + " is_distal = True\n", + " \n", + " #Aggregate read-position occurrence counts\n", + " if is_distal :\n", + " cuts[lib_i, -1] += 1.\n", + " \n", + " #Mark as seen and proceed\n", + " 
umi_dict[bc][umi] = True\n", + " \n", + " elif polya_pos != -1 and polya_pos >= 30 :\n", + " \n", + " #Perform hamming-based consistency check against reference of region upstream of cleavage\n", + " \n", + " hamming_dist = _hamming(sequences[lib_i][polya_pos-20:polya_pos], r1[polya_pos-20:polya_pos])\n", + " \n", + " if hamming_dist <= 3 :\n", + " cuts[lib_i, polya_pos] += 1.\n", + "\n", + " #Mark as seen and proceed\n", + " umi_dict[bc][umi] = True\n", + "\n", + "f1.close()\n", + "f2.close()\n", + "\n", + "print(\"Done!\")\n", + "\n", + "print(\"Total # of unique-UMI reads = \" + str(int(np.sum(cuts))))\n", + "\n", + "#Store processed read-position count matrix\n", + "np.save('apa_oligo_2022' + save_suffix + '_umi_mut_' + str(umi_n_muts) + '_cuts', cuts)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "#Load processed read count data\n", + "\n", + "ref_cuts_repl_1 = np.load(\"apa_oligo_2022_ref_repl_1_sknsh_v3_umi_mut_0_cuts.npy\")\n", + "var_cuts_repl_1 = np.load(\"apa_oligo_2022_var_repl_1_sknsh_v3_umi_mut_0_cuts.npy\")\n", + "\n", + "ref_cuts_repl_2 = np.load(\"apa_oligo_2022_ref_repl_2_sknsh_v3_umi_mut_0_cuts.npy\")\n", + "var_cuts_repl_2 = np.load(\"apa_oligo_2022_var_repl_2_sknsh_v3_umi_mut_0_cuts.npy\")\n", + "\n", + "#Pooled counts\n", + "\n", + "ref_cuts_pooled = ref_cuts_repl_1 + ref_cuts_repl_2\n", + "var_cuts_pooled = var_cuts_repl_1 + var_cuts_repl_2\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "#Augment library file with measured isoform summary statistics\n", + "\n", + "pseudo_c = 1.\n", + "\n", + "#Replicate 1\n", + "library_df['ref_count_77_127_repl_1'] = np.sum(ref_cuts_repl_1[:, 77:127], axis=-1)\n", + "library_df['ref_count_0_205_repl_1'] = np.sum(ref_cuts_repl_1[:, 0:205], axis=-1)\n", + "library_df['ref_count_total_repl_1'] = np.sum(ref_cuts_repl_1, axis=-1)\n", + "library_df['ref_logit_77_127_repl_1'] = 
np.log(\n", + " ((library_df['ref_count_77_127_repl_1'] + pseudo_c) / (library_df['ref_count_total_repl_1'] + 2. * pseudo_c)) / (1. - ((library_df['ref_count_77_127_repl_1'] + pseudo_c) / (library_df['ref_count_total_repl_1'] + 2. * pseudo_c)))\n", + ")\n", + "library_df['ref_logit_0_205_repl_1'] = np.log(\n", + " ((library_df['ref_count_0_205_repl_1'] + pseudo_c) / (library_df['ref_count_total_repl_1'] + 2. * pseudo_c)) / (1. - ((library_df['ref_count_0_205_repl_1'] + pseudo_c) / (library_df['ref_count_total_repl_1'] + 2. * pseudo_c)))\n", + ")\n", + "\n", + "library_df['var_count_77_127_repl_1'] = np.sum(var_cuts_repl_1[:, 77:127], axis=-1)\n", + "library_df['var_count_0_205_repl_1'] = np.sum(var_cuts_repl_1[:, 0:205], axis=-1)\n", + "library_df['var_count_total_repl_1'] = np.sum(var_cuts_repl_1, axis=-1)\n", + "library_df['var_logit_77_127_repl_1'] = np.log(\n", + " ((library_df['var_count_77_127_repl_1'] + pseudo_c) / (library_df['var_count_total_repl_1'] + 2. * pseudo_c)) / (1. - ((library_df['var_count_77_127_repl_1'] + pseudo_c) / (library_df['var_count_total_repl_1'] + 2. * pseudo_c)))\n", + ")\n", + "library_df['var_logit_0_205_repl_1'] = np.log(\n", + " ((library_df['var_count_0_205_repl_1'] + pseudo_c) / (library_df['var_count_total_repl_1'] + 2. * pseudo_c)) / (1. - ((library_df['var_count_0_205_repl_1'] + pseudo_c) / (library_df['var_count_total_repl_1'] + 2. 
* pseudo_c)))\n", + ")\n", + "\n", + "library_df['delta_logodds_true_77_127_repl_1'] = library_df['var_logit_77_127_repl_1'] - library_df['ref_logit_77_127_repl_1']\n", + "library_df['delta_logodds_true_0_205_repl_1'] = library_df['var_logit_0_205_repl_1'] - library_df['ref_logit_0_205_repl_1']\n", + "\n", + "#Replicate 2\n", + "library_df['ref_count_77_127_repl_2'] = np.sum(ref_cuts_repl_2[:, 77:127], axis=-1)\n", + "library_df['ref_count_0_205_repl_2'] = np.sum(ref_cuts_repl_2[:, 0:205], axis=-1)\n", + "library_df['ref_count_total_repl_2'] = np.sum(ref_cuts_repl_2, axis=-1)\n", + "library_df['ref_logit_77_127_repl_2'] = np.log(\n", + " ((library_df['ref_count_77_127_repl_2'] + pseudo_c) / (library_df['ref_count_total_repl_2'] + 2. * pseudo_c)) / (1. - ((library_df['ref_count_77_127_repl_2'] + pseudo_c) / (library_df['ref_count_total_repl_2'] + 2. * pseudo_c)))\n", + ")\n", + "library_df['ref_logit_0_205_repl_2'] = np.log(\n", + " ((library_df['ref_count_0_205_repl_2'] + pseudo_c) / (library_df['ref_count_total_repl_2'] + 2. * pseudo_c)) / (1. - ((library_df['ref_count_0_205_repl_2'] + pseudo_c) / (library_df['ref_count_total_repl_2'] + 2. * pseudo_c)))\n", + ")\n", + "\n", + "library_df['var_count_77_127_repl_2'] = np.sum(var_cuts_repl_2[:, 77:127], axis=-1)\n", + "library_df['var_count_0_205_repl_2'] = np.sum(var_cuts_repl_2[:, 0:205], axis=-1)\n", + "library_df['var_count_total_repl_2'] = np.sum(var_cuts_repl_2, axis=-1)\n", + "library_df['var_logit_77_127_repl_2'] = np.log(\n", + " ((library_df['var_count_77_127_repl_2'] + pseudo_c) / (library_df['var_count_total_repl_2'] + 2. * pseudo_c)) / (1. - ((library_df['var_count_77_127_repl_2'] + pseudo_c) / (library_df['var_count_total_repl_2'] + 2. * pseudo_c)))\n", + ")\n", + "library_df['var_logit_0_205_repl_2'] = np.log(\n", + " ((library_df['var_count_0_205_repl_2'] + pseudo_c) / (library_df['var_count_total_repl_2'] + 2. * pseudo_c)) / (1. 
- ((library_df['var_count_0_205_repl_2'] + pseudo_c) / (library_df['var_count_total_repl_2'] + 2. * pseudo_c)))\n", + ")\n", + "\n", + "library_df['delta_logodds_true_77_127_repl_2'] = library_df['var_logit_77_127_repl_2'] - library_df['ref_logit_77_127_repl_2']\n", + "library_df['delta_logodds_true_0_205_repl_2'] = library_df['var_logit_0_205_repl_2'] - library_df['ref_logit_0_205_repl_2']\n", + "\n", + "#Pooled replicates\n", + "library_df['ref_count_77_127_repl_pooled'] = np.sum(ref_cuts_pooled[:, 77:127], axis=-1)\n", + "library_df['ref_count_0_205_repl_pooled'] = np.sum(ref_cuts_pooled[:, 0:205], axis=-1)\n", + "library_df['ref_count_total_repl_pooled'] = np.sum(ref_cuts_pooled, axis=-1)\n", + "library_df['ref_logit_77_127_repl_pooled'] = np.log(\n", + " ((library_df['ref_count_77_127_repl_pooled'] + pseudo_c) / (library_df['ref_count_total_repl_pooled'] + 2. * pseudo_c)) / (1. - ((library_df['ref_count_77_127_repl_pooled'] + pseudo_c) / (library_df['ref_count_total_repl_pooled'] + 2. * pseudo_c)))\n", + ")\n", + "library_df['ref_logit_0_205_repl_pooled'] = np.log(\n", + " ((library_df['ref_count_0_205_repl_pooled'] + pseudo_c) / (library_df['ref_count_total_repl_pooled'] + 2. * pseudo_c)) / (1. - ((library_df['ref_count_0_205_repl_pooled'] + pseudo_c) / (library_df['ref_count_total_repl_pooled'] + 2. * pseudo_c)))\n", + ")\n", + "\n", + "library_df['var_count_77_127_repl_pooled'] = np.sum(var_cuts_pooled[:, 77:127], axis=-1)\n", + "library_df['var_count_0_205_repl_pooled'] = np.sum(var_cuts_pooled[:, 0:205], axis=-1)\n", + "library_df['var_count_total_repl_pooled'] = np.sum(var_cuts_pooled, axis=-1)\n", + "library_df['var_logit_77_127_repl_pooled'] = np.log(\n", + " ((library_df['var_count_77_127_repl_pooled'] + pseudo_c) / (library_df['var_count_total_repl_pooled'] + 2. * pseudo_c)) / (1. - ((library_df['var_count_77_127_repl_pooled'] + pseudo_c) / (library_df['var_count_total_repl_pooled'] + 2. 
* pseudo_c)))\n", + ")\n", + "library_df['var_logit_0_205_repl_pooled'] = np.log(\n", + " ((library_df['var_count_0_205_repl_pooled'] + pseudo_c) / (library_df['var_count_total_repl_pooled'] + 2. * pseudo_c)) / (1. - ((library_df['var_count_0_205_repl_pooled'] + pseudo_c) / (library_df['var_count_total_repl_pooled'] + 2. * pseudo_c)))\n", + ")\n", + "\n", + "library_df['delta_logodds_true_77_127_repl_pooled'] = library_df['var_logit_77_127_repl_pooled'] - library_df['ref_logit_77_127_repl_pooled']\n", + "library_df['delta_logodds_true_0_205_repl_pooled'] = library_df['var_logit_0_205_repl_pooled'] - library_df['ref_logit_0_205_repl_pooled']\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "#Cache measurements in dataframe\n", + "\n", + "library_df.to_csv(\"apa_100_variants_rev2_20220621_sknsh_v3_umi_mut_0.csv\", sep='\\t')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/jlinder2/anaconda3/envs/tensorflow/lib/python3.6/site-packages/ipykernel/__main__.py:4: RuntimeWarning: invalid value encountered in true_divide\n", + "/home/jlinder2/anaconda3/envs/tensorflow/lib/python3.6/site-packages/ipykernel/__main__.py:7: RuntimeWarning: invalid value encountered in true_divide\n", + "/home/jlinder2/anaconda3/envs/tensorflow/lib/python3.6/site-packages/ipykernel/__main__.py:10: RuntimeWarning: invalid value encountered in true_divide\n" + ] + } + ], + "source": [ + "#Compute cleavage probabilities\n", + "\n", + "ref_cut_prob_repl_1 = ref_cuts_repl_1 / np.sum(ref_cuts_repl_1, axis=1)[:, None]\n", + "var_cut_prob_repl_1 = var_cuts_repl_1 / np.sum(var_cuts_repl_1, axis=1)[:, None]\n", + "\n", + "ref_cut_prob_repl_2 = ref_cuts_repl_2 / np.sum(ref_cuts_repl_2, axis=1)[:, None]\n", + "var_cut_prob_repl_2 = var_cuts_repl_2 / np.sum(var_cuts_repl_2, axis=1)[:, None]\n", + "\n", + "ref_cut_prob_pooled = 
ref_cuts_pooled / np.sum(ref_cuts_pooled, axis=1)[:, None]\n", + "var_cut_prob_pooled = var_cuts_pooled / np.sum(var_cuts_pooled, axis=1)[:, None]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "delta_logodds (repl 1) = 1.1978\n", + "delta_logodds (repl 2) = 1.1273\n", + "delta_logodds (pooled) = 1.0435\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "#Control: F2 variant (creation of new cutsite -1bp relative to reference)\n", + "\n", + "save_figs = True\n", + "fig_name = \"F2_control_profile_sknsh\"\n", + "\n", + "test_ix = 9\n", + "\n", + "#Plot range\n", + "plot_start = 0\n", + "plot_end = 146\n", + "\n", + "#Isoform definition\n", + "cut_start = 77\n", + "cut_end = 127\n", + "\n", + "c_ref_1 = ref_cut_prob_repl_1[test_ix, :]\n", + "c_var_1 = var_cut_prob_repl_1[test_ix, :]\n", + "\n", + "c_ref_2 = ref_cut_prob_repl_2[test_ix, :]\n", + "c_var_2 = var_cut_prob_repl_2[test_ix, :]\n", + "\n", + "c_ref_pooled = ref_cut_prob_pooled[test_ix, :]\n", + "c_var_pooled = var_cut_prob_pooled[test_ix, :]\n", + "\n", + "delta_logodds_1 = np.log(np.sum(c_var_1[77:127]) / (1. - np.sum(c_var_1[77:127]))) - np.log(np.sum(c_ref_1[77:127]) / (1. - np.sum(c_ref_1[77:127])))\n", + "delta_logodds_2 = np.log(np.sum(c_var_2[77:127]) / (1. - np.sum(c_var_2[77:127]))) - np.log(np.sum(c_ref_2[77:127]) / (1. - np.sum(c_ref_2[77:127])))\n", + "delta_logodds_pooled = np.log(np.sum(c_var_pooled[77:127]) / (1. - np.sum(c_var_pooled[77:127]))) - np.log(np.sum(c_ref_pooled[77:127]) / (1. 
- np.sum(c_ref_pooled[77:127])))\n", + "\n", + "print(\"delta_logodds (repl 1) = \" + str(round(delta_logodds_1, 4)))\n", + "print(\"delta_logodds (repl 2) = \" + str(round(delta_logodds_2, 4)))\n", + "print(\"delta_logodds (pooled) = \" + str(round(delta_logodds_pooled, 4)))\n", + "\n", + "#Plot replicate 1 profile\n", + "f = plt.figure(figsize=(10, 3))\n", + "\n", + "plt.plot(c_ref_1[plot_start: plot_end], color='darkblue', linewidth=3)\n", + "plt.plot(c_var_1[plot_start: plot_end], color='darkorange', linewidth=3)\n", + "\n", + "plt.axvline(x=70, linewidth=2, linestyle='--', color='black', alpha=0.75)\n", + "plt.axvline(x=76, linewidth=2, linestyle='--', color='black', alpha=0.75)\n", + "\n", + "plt.xlim(plot_start, plot_end)\n", + "plt.ylim(0.)\n", + "\n", + "plt.title(\"Replicate 1. Delta Isoform Log Odds = \" + str(round(delta_logodds_1, 4)))\n", + "\n", + "plt.tight_layout()\n", + "\n", + "if save_figs :\n", + " plt.savefig(fig_name + \"_repl_1.png\", dpi=600, transparent=True)\n", + " plt.savefig(fig_name + \"_repl_1.eps\")\n", + "\n", + "plt.show()\n", + "\n", + "#Plot replicate 2 profile\n", + "f = plt.figure(figsize=(10, 3))\n", + "\n", + "plt.plot(c_ref_2[plot_start: plot_end], color='darkblue', linewidth=3)\n", + "plt.plot(c_var_2[plot_start: plot_end], color='darkorange', linewidth=3)\n", + "\n", + "plt.axvline(x=70, linewidth=2, linestyle='--', color='black', alpha=0.75)\n", + "plt.axvline(x=76, linewidth=2, linestyle='--', color='black', alpha=0.75)\n", + "\n", + "plt.xlim(plot_start, plot_end)\n", + "plt.ylim(0.)\n", + "\n", + "plt.title(\"Replicate 2. 
Delta Isoform Log Odds = \" + str(round(delta_logodds_2, 4)))\n", + "\n", + "plt.tight_layout()\n", + "\n", + "if save_figs :\n", + " plt.savefig(fig_name + \"_repl_2.png\", dpi=600, transparent=True)\n", + " plt.savefig(fig_name + \"_repl_2.eps\")\n", + "\n", + "plt.show()\n", + "\n", + "#Plot pooled replicate profile\n", + "f = plt.figure(figsize=(10, 3))\n", + "\n", + "plt.plot(c_ref_pooled[plot_start: plot_end], color='darkblue', linewidth=3)\n", + "plt.plot(c_var_pooled[plot_start: plot_end], color='darkorange', linewidth=3)\n", + "\n", + "plt.axvline(x=70, linewidth=2, linestyle='--', color='black', alpha=0.75)\n", + "plt.axvline(x=76, linewidth=2, linestyle='--', color='black', alpha=0.75)\n", + "\n", + "plt.xlim(plot_start, plot_end)\n", + "plt.ylim(0.)\n", + "\n", + "plt.title(\"Pooled replicates. Delta Isoform Log Odds = \" + str(round(delta_logodds_pooled, 4)))\n", + "\n", + "plt.tight_layout()\n", + "\n", + "if save_figs :\n", + " plt.savefig(fig_name + \"_pooled.png\", dpi=600, transparent=True)\n", + " plt.savefig(fig_name + \"_pooled.eps\")\n", + "\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "len(filtered_df) = 95\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from scipy.stats import spearmanr\n", + "\n", + "save_figs = True\n", + "fig_name = \"replicate_sknsh\"\n", + "\n", + "min_c = 5.\n", + "\n", + "x_min = -6.\n", + "x_max = 4.\n", + "\n", + "filtered_df = library_df.query(\"ref_count_total_repl_1 >= \" + str(min_c) + \" and \" + \"ref_count_total_repl_2 >= \" + str(min_c) + \" and \" + \"var_count_total_repl_1 >= \" + str(min_c) + \" and \" + \"var_count_total_repl_2 >= \" + str(min_c))\n", + "\n", + "print(\"len(filtered_df) = \" + str(len(filtered_df)))\n", + "\n", + "#Reference library (replicate correlation)\n", + "r_val_ref, _ = spearmanr(filtered_df['ref_logit_77_127_repl_1'], filtered_df['ref_logit_77_127_repl_2'])\n", + "\n", + "f = plt.figure(figsize=(4, 4))\n", + "\n", + "plt.scatter(filtered_df['ref_logit_77_127_repl_1'], filtered_df['ref_logit_77_127_repl_2'], color='lightgreen', s=45, edgecolor='black', linewidth=1)\n", + "\n", + "plt.xlim(x_min, x_max)\n", + "plt.ylim(x_min, x_max)\n", + "\n", + "plt.xticks(fontsize=12)\n", + "plt.yticks(fontsize=12)\n", + "\n", + "plt.xlabel(\"Ref Logit (Repl. 1)\", fontsize=12)\n", + "plt.ylabel(\"Ref Logit (Repl. 
2)\", fontsize=12)\n", + "\n", + "plt.title(\"r = \" + str(round(r_val_ref, 3)) + \", n = \" + str(len(filtered_df)), fontsize=12)\n", + "\n", + "plt.tight_layout()\n", + "\n", + "if save_figs :\n", + " plt.savefig(fig_name + \"_ref_logits.png\", dpi=600, transparent=True)\n", + " plt.savefig(fig_name + \"_ref_logits.eps\")\n", + "\n", + "plt.show()\n", + "\n", + "#Variant library (replicate correlation)\n", + "r_val_var, _ = spearmanr(filtered_df['var_logit_77_127_repl_1'], filtered_df['var_logit_77_127_repl_2'])\n", + "\n", + "f = plt.figure(figsize=(4, 4))\n", + "\n", + "plt.scatter(filtered_df['var_logit_77_127_repl_1'], filtered_df['var_logit_77_127_repl_2'], color='lightcoral', s=45, edgecolor='black', linewidth=1)\n", + "\n", + "plt.xlim(x_min, x_max)\n", + "plt.ylim(x_min, x_max)\n", + "\n", + "plt.xticks(fontsize=12)\n", + "plt.yticks(fontsize=12)\n", + "\n", + "plt.xlabel(\"Var Logit (Repl. 1)\", fontsize=12)\n", + "plt.ylabel(\"Var Logit (Repl. 2)\", fontsize=12)\n", + "\n", + "plt.title(\"r = \" + str(round(r_val_var, 3)) + \", n = \" + str(len(filtered_df)), fontsize=12)\n", + "\n", + "plt.tight_layout()\n", + "\n", + "if save_figs :\n", + " plt.savefig(fig_name + \"_var_logits.png\", dpi=600, transparent=True)\n", + " plt.savefig(fig_name + \"_var_logits.eps\")\n", + "\n", + "plt.show()\n", + "\n", + "#Ref-Var library (delta replicate correlation)\n", + "r_val_var, _ = spearmanr(filtered_df['delta_logodds_true_77_127_repl_1'], filtered_df['delta_logodds_true_77_127_repl_2'])\n", + "\n", + "f = plt.figure(figsize=(4, 4))\n", + "\n", + "plt.scatter(filtered_df['delta_logodds_true_77_127_repl_1'], filtered_df['delta_logodds_true_77_127_repl_2'], color='lightcoral', s=45, edgecolor='black', linewidth=1)\n", + "\n", + "plt.xlim(x_min, x_max)\n", + "plt.ylim(x_min, x_max)\n", + "\n", + "plt.xticks(fontsize=12)\n", + "plt.yticks(fontsize=12)\n", + "\n", + "plt.xlabel(\"LOR (Repl. 1)\", fontsize=12)\n", + "plt.ylabel(\"LOR (Repl. 
2)\", fontsize=12)\n", + "\n", + "plt.title(\"r = \" + str(round(r_val_var, 3)) + \", n = \" + str(len(filtered_df)), fontsize=12)\n", + "\n", + "plt.tight_layout()\n", + "\n", + "if save_figs :\n", + " plt.savefig(fig_name + \"_delta_logodds.png\", dpi=600, transparent=True)\n", + " plt.savefig(fig_name + \"_delta_logodds.eps\")\n", + "\n", + "plt.show()\n", + "\n", + "#Ref-Var control correlation\n", + "control_df = filtered_df.loc[filtered_df['experiment'].str.contains(\"control_\") & filtered_df['data_source'].str.contains(\"Array_2019\")]\n", + "\n", + "r_val_control, _ = spearmanr(control_df['ref_logit_77_127_repl_pooled'], control_df['var_logit_77_127_repl_pooled'])\n", + "\n", + "f = plt.figure(figsize=(4, 4))\n", + "\n", + "plt.scatter(control_df['ref_logit_77_127_repl_pooled'], control_df['var_logit_77_127_repl_pooled'], color='black', s=175, marker='^')\n", + "\n", + "plt.plot([x_min, x_max], [x_min, x_max], color='darkgreen', linestyle='--', linewidth=2,)\n", + "\n", + "plt.xlim(x_min, x_max)\n", + "plt.ylim(x_min, x_max)\n", + "\n", + "plt.xticks(fontsize=12)\n", + "plt.yticks(fontsize=12)\n", + "\n", + "plt.xlabel(\"Ref Logit\", fontsize=12)\n", + "plt.ylabel(\"Var Logit\", fontsize=12)\n", + "\n", + "plt.title(\"Controls; Spearman r = \" + str(round(r_val_control, 3)) + \", n = \" + str(len(control_df)), fontsize=12)\n", + "\n", + "plt.tight_layout()\n", + "\n", + "if save_figs :\n", + " plt.savefig(fig_name + \"_control_logits.png\", dpi=600, transparent=True)\n", + " plt.savefig(fig_name + \"_control_logits.eps\")\n", + "\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "n (sequences) = 186066\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "#Compare controls from 2022 oligo array to the Cell paper 2019 array\n", + "\n", + "#Load oligo array from 2019\n", + "import isolearn.io as isoio\n", + "\n", + "save_figs = True\n", + "fig_name = \"replicate_sknsh_to_2019\"\n", + "\n", + "isoform_pseudo_count = 1.\n", + "proximal_start = 77\n", + "proximal_end = 127\n", + "\n", + "file_prefix = str(proximal_start) + \"_\" + str(proximal_end)\n", + "\n", + "seq_dict = isoio.load('../../../../aparent/data/prepared_data/apa_array_data/apa_array_data_seq')\n", + "\n", + "seq_df = seq_dict['array_df']\n", + "seq_cuts = seq_dict['pooled_cuts']\n", + "\n", + "cut_true = np.concatenate([np.array(seq_cuts[:, 180: 180 + 205].todense()), np.array(seq_cuts[:, -1].todense()).reshape(-1, 1)], axis=-1)# - 1\n", + "\n", + "seq_df['proximal_count'] = [np.sum(cut_true[i, proximal_start:proximal_end]) for i in range(len(seq_df))]\n", + "seq_df['total_count'] = [np.sum(cut_true[i, :]) for i in range(len(seq_df))]\n", + "\n", + "seq_df['iso_true'] = (seq_df['proximal_count'] + isoform_pseudo_count) / (seq_df['total_count'] + 2. 
* isoform_pseudo_count)\n", + "seq_df['logodds_true'] = np.log(seq_df['iso_true'] / (1.0 - seq_df['iso_true']))\n", + "\n", + "seq_df['seq'] = seq_df['seq'].str.slice(0, 205)\n", + "\n", + "print(\"n (sequences) = \" + str(len(seq_df)))\n", + "\n", + "#Ref-Array 2019 control correlation\n", + "control_df_2019 = control_df.join(seq_df[['seq', 'logodds_true']].set_index(\"seq\"), on='ref_seq', how='inner').copy().reset_index(drop=True)\n", + "\n", + "r_val_control, _ = spearmanr(control_df_2019['ref_logit_77_127_repl_pooled'], control_df_2019['logodds_true'])\n", + "\n", + "f = plt.figure(figsize=(4, 4))\n", + "\n", + "plt.scatter(control_df_2019['ref_logit_77_127_repl_pooled'], control_df_2019['logodds_true'], color='deepskyblue', edgecolor='black', linewidth=1, s=175, marker='^')\n", + "\n", + "plt.plot([x_min, x_max], [x_min, x_max], color='darkgreen', linestyle='--', linewidth=2,)\n", + "\n", + "plt.xlim(x_min, x_max)\n", + "plt.ylim(x_min, x_max)\n", + "\n", + "plt.xticks(fontsize=12)\n", + "plt.yticks(fontsize=12)\n", + "\n", + "plt.xlabel(\"Logit (2022)\", fontsize=12)\n", + "plt.ylabel(\"Logit (2019)\", fontsize=12)\n", + "\n", + "plt.title(\"Controls; Spearman r = \" + str(round(r_val_control, 3)) + \", n = \" + str(len(control_df_2019)), fontsize=12)\n", + "\n", + "plt.tight_layout()\n", + "\n", + "if save_figs :\n", + " plt.savefig(fig_name + \".png\", dpi=600, transparent=True)\n", + " plt.savefig(fig_name + \".eps\")\n", + "\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:tensorflow]", + "language": "python", + "name": "conda-env-tensorflow-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": 
"3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}