From f2635f97d4bc0abfaaeb454c645e882ebaf71ee4 Mon Sep 17 00:00:00 2001 From: Karollus Alexander Date: Thu, 4 Mar 2021 16:51:53 +0100 Subject: [PATCH 1/2] prototyped new generic dataloader types + API --- .gitignore | 1 + Prototyping.ipynb | 1349 +++++++++++++++++++++++++ kipoiseq/dataloaders/__init__.py | 1 + kipoiseq/dataloaders/prototype.py | 291 ++++++ kipoiseq/extractors/multi_interval.py | 1 - 5 files changed, 1642 insertions(+), 1 deletion(-) create mode 100644 Prototyping.ipynb create mode 100644 kipoiseq/dataloaders/prototype.py diff --git a/.gitignore b/.gitignore index 3cc7ead..cc0657e 100644 --- a/.gitignore +++ b/.gitignore @@ -109,3 +109,4 @@ venv.bak/ # genomics /*.fai /**/*.fai +ExampleFiles/ diff --git a/Prototyping.ipynb b/Prototyping.ipynb new file mode 100644 index 0000000..dac815e --- /dev/null +++ b/Prototyping.ipynb @@ -0,0 +1,1349 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "premier-texture", + "metadata": {}, + "source": [ + "# Prototype for a more generic dataloader interface\n", + "\n", + "This notebook walks through the design of creating a generic single sequence dataloader, with some notes on my understanding of the kipoiseq codebase (I make no guarantees as to the correctness of the latter)." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "lined-exercise", + "metadata": {}, + "outputs": [], + "source": [ + "import kipoiseq\n", + "import kipoi\n", + "import os\n", + "import urllib\n", + "import pandas as pd\n", + "import pyranges\n", + "from collections import defaultdict\n", + "\n", + "from kipoiseq.dataloaders import *\n", + "from kipoiseq.extractors import GenericMultiIntervalSeqExtractor, BaseMultiIntervalFetcher, \\\n", + " GTFMultiIntervalFetcher, BaseExtractor, FastaStringExtractor, SingleVariantMatcher, GenericSingleVariantMultiIntervalVCFSeqExtractor, \\\n", + " MultiSampleVCF" + ] + }, + { + "cell_type": "markdown", + "id": "vocational-reserve", + "metadata": {}, + "source": [ + "Get some data to work with" + ] + }, + { + "cell_type": "code", + "execution_count": 116, + "id": "wrong-horizontal", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "('ExampleFiles/example_bed.fasta', )" + ] + }, + "execution_count": 116, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# make ExampleFile directory if it does not exist\n", + "if not os.path.exists(\"ExampleFiles\"):\n", + " os.makedirs(\"ExampleFiles\")\n", + " \n", + "# Download GTF\n", + "urllib.request.urlretrieve(\"https://zenodo.org/record/1466102/files/example_files-gencode.v24.annotation_chr22.gtf?download=1\", 'ExampleFiles/chrom22.gtf')\n", + "# Download fasta\n", + "urllib.request.urlretrieve(\"https://zenodo.org/record/1466102/files/example_files-hg38_chr22.fa?download=1\", 'ExampleFiles/chrom22.fa')\n", + "# Download bed\n", + "urllib.request.urlretrieve(\"https://raw.githubusercontent.com/kipoi/kipoiseq/master/tests/data/intervals_51bp.tsv\", \"ExampleFiles/example_bed.bed\")\n", + "# Download fasta that goes along with it\n", + "urllib.request.urlretrieve(\"https://raw.githubusercontent.com/kipoi/kipoiseq/master/tests/data/hg38_chr22_32000000_32300000.fa\", \"ExampleFiles/example_bed.fasta\")" + ] + }, + { + "cell_type": "markdown", + "id": "violent-chorus", + "metadata": {}, + "source": [ + "## Introducing the GenericSingleSeqDataloader in prototype.py\n", + "\n", + "The GenericSingleSeqDataloader extends SampleIterator from kipoi and is composed of three main components:\n", + "\n", + "* An interval fetcher (of type BaseMultiIntervalFetcher from kipoiseq.extractors.multi_interval). This is a generator object that provides intervals defining the location (in some reference) of the sequences of interest. The way BaseMultiIntervalFetcher is designed is that it provides, for a given key (e.g. a transcript_id), a list of intervals (kipoiseq.datatypes.Interval) that correspond to that key (e.g. all exons of that transcript). As generator, it simply iterates through the keys and returns both keys and corresponding intervals. \n", + " * Currently, the main working implementation of BaseMultiIntervalFetcher is GTFMultiIntervalFetcher (from kipoiseq.extractors.gtf). This, despite the name, is not tied to gtf but only requires a pyranges-like pandas dataframe (so dataframe with at least the columns Chromosome, Start, End, Strand) - which can be delivered from other sources easily. The key is the dataframe index. If the index is not unique, then it will return all intervals that have the same index (I am not sure this is an intended functionality in pandas, but it seems to work well). \n", + " * One could also probably easily design a fetcher that gets intervals just in time, from a database or similar. This might be better when doing analyses on whole genomes\n", + "* A reference sequence source, usually a FastaExtractor, which given an interval provides the corresponding reference sequence\n", + "* A sequence transformer (e.g. OneHot), which given a string sequence provides a transformed sequence as required by a model\n", + "\n", + "The operation of this class is quite straightforward: it gets keys and intervals from the fetcher, extracts the sequence using the reference sequence source, transforms it, and then returns it in a dict together with metadata." + ] + }, + { + "cell_type": "markdown", + "id": "little-burns", + "metadata": {}, + "source": [ + "With this, I believe it is very easy to build new dataloaders for most standard use-cases. All one needs to do is:\n", + "* Define a way to import and preprocess interval data and supply them to the Fetcher. In most cases, this will be as easy as reading in a dataframe with pyranges (from gtf, bed, tsv, ...), doing some pandas operations on it, and then calling init of the GTFMUltiIntervalFetcher (which could maybe be renamed to RangesDataFrameFetcher or something)\n", + "* Define a way to load reference sequence data (in 99% of cases this will be a fasta file supplied to a FastaSequenceExtractor)\n", + "* Define some transformations, if necessary.\n" + ] + }, + { + "cell_type": "markdown", + "id": "naughty-baghdad", + "metadata": {}, + "source": [ + "## Building a gtf based TSS dataloader for Xpresso" + ] + }, + { + "cell_type": "markdown", + "id": "disabled-nation", + "metadata": {}, + "source": [ + "We can use this template to design a TSS dataloader for Xpresso:\n", + "\n", + "The main thing we need to write is some way to extract TSS sites from a pyranges-like dataframe" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "based-riding", + "metadata": {}, + "outputs": [], + "source": [ + "class TSSFinder:\n", + " \"\"\"\n", + " Imputes approximate TSS location as 5' end of the gene annotation\n", + " \"\"\"\n", + " \n", + " def __init__(\n", + " self,\n", + " n_upstream,\n", + " n_downstream\n", + " ):\n", + " self.n_upstream = n_upstream\n", + " self.n_downstream = n_downstream\n", + " \n", + " def __call__(\n", + " self,\n", + " region_df : pd.DataFrame\n", + " ) -> pd.DataFrame:\n", + " region_df = region_df.query('Feature == \"gene\" and gene_type == \"protein_coding\"')\n", + " anchor = ((region_df.Start * (region_df.Strand == \"+\")) \n", + " + (region_df.End * (region_df.Strand == \"-\")))\n", + " region_df[\"Start\"] = (anchor + \n", + " (-self.n_upstream * (region_df.Strand == \"+\")) + \n", + " (-self.n_downstream * (region_df.Strand == \"-\")))\n", + " region_df[\"End\"] = (anchor + \n", + " (self.n_downstream * (region_df.Strand == \"+\")) + \n", + " (self.n_upstream * (region_df.Strand == \"-\")))\n", + " return region_df" + ] + }, + { + "cell_type": "markdown", + "id": "junior-cabin", + "metadata": {}, + "source": [ + "Once we have that, we can easily build a TSS Dataloader simply by extending our GenericSingleSeqDataloader and building the fetcher, extractor and transform:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "conventional-system", + "metadata": {}, + "outputs": [], + "source": [ + "from kipoiseq.dataloaders.prototype import GenericSingleSeqDataloader, IdentityTransform\n", + "class TSSDataloader(GenericSingleSeqDataloader):\n", + " \n", + " def __init__(\n", + " self,\n", + " gtf_file : str,\n", + " fasta_file : str,\n", + " n_upstream : int,\n", + " n_downstream : int,\n", + " interval_attrs = [\"gene_id\", \"gene_type\"]\n", + " ):\n", + " self.gtf_file = gtf_file\n", + " self.fasta_file = fasta_file\n", + " self.n_upstream = n_upstream\n", + " self.n_downstream = n_downstream\n", + " self.use_strand = True\n", + " \n", + " # Source interval data from gtf\n", + " df = pyranges.read_gtf(self.gtf_file).df\n", + " # Subset to areas of interest\n", + " df = TSSFinder(\n", + " self.n_upstream,\n", + " self.n_downstream\n", + " )(df)\n", + " # Build the interval fetcher\n", + " interval_source = GTFMultiIntervalFetcher(\n", + " df, \n", + " keep_attrs=interval_attrs\n", + " )\n", + " # Source reference sequence from fasta\n", + " reference_sequence_source = FastaStringExtractor(\n", + " fasta_file,\n", + " use_strand=self.use_strand\n", + " )\n", + " # Provide sequence transformer\n", + " sequence_transformer = IdentityTransform()\n", + " # Pass all to superclass\n", + " super().__init__(\n", + " interval_source,\n", + " reference_sequence_source,\n", + " sequence_transformer,\n", + " interval_attrs\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "essential-london", + "metadata": {}, + "source": [ + "Lets build it and test it" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "challenging-detection", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/modules/i12g/anaconda/envs/kipoi-Framepool2/lib/python3.6/site-packages/ipykernel_launcher.py:23: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + "/opt/modules/i12g/anaconda/envs/kipoi-Framepool2/lib/python3.6/site-packages/ipykernel_launcher.py:26: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n" + ] + } + ], + "source": [ + "tss = TSSDataloader(\"ExampleFiles/chrom22.gtf\",\n", + " \"ExampleFiles/chrom22.fa\",\n", + " 7000,\n", + " 3500)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "documentary-auckland", + "metadata": {}, + "outputs": [], + "source": [ + "gtf_df = pyranges.read_gtf(\"ExampleFiles/chrom22.gtf\").df\n", + "\n", + "# Check that we faithfully recover the original TSS\n", + "for sample in tss:\n", + " gene_id = sample[\"metadata\"][\"gene_id\"]\n", + " gene_strand = sample[\"metadata\"]['ranges'].strand\n", + " gtf_row = gtf_df.query('gene_id == @gene_id')\n", + " if len(gtf_row) == 0:\n", + " print(gene_id)\n", + " if gene_strand == \"+\":\n", + " implied_TSS = sample[\"metadata\"]['ranges'].start + 7000\n", + " assert(gtf_row.iloc[0].Start == implied_TSS)\n", + " if gene_strand == \"-\":\n", + " implied_TSS = sample[\"metadata\"]['ranges'].end - 7000\n", + " assert(gtf_row.iloc[0].End == implied_TSS)" + ] + }, + { + "cell_type": "markdown", + "id": "welcome-compound", + "metadata": {}, + "source": [ + "## Extracting PolyA sequences\n", + "\n", + "We could also extract PolyA sites:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "intellectual-devon", + "metadata": {}, + "outputs": [], + "source": [ + "class PolyAFinder:\n", + " \"\"\"\n", + " Imputes approximate polyA location as 3' end of the transcript annotation\n", + " \"\"\"\n", + "\n", + " def __init__(\n", + " self,\n", + " n_upstream,\n", + " n_downstream\n", + " ):\n", + " self.n_upstream = n_upstream\n", + " self.n_downstream = n_downstream\n", + " \n", + " def __call__(\n", + " self,\n", + " region_df : pd.DataFrame\n", + " ) -> pd.DataFrame:\n", + " region_df = region_df.query('Feature == \"transcript\" and transcript_type == \"protein_coding\"')\n", + " anchor = ((region_df.End * (region_df.Strand == \"+\")) \n", + " + (region_df.Start * (region_df.Strand == \"-\")))\n", + " region_df[\"Start\"] = (anchor + \n", + " (-self.n_upstream * (region_df.Strand == \"+\")) + \n", + " (-self.n_downstream * (region_df.Strand == \"-\")))\n", + " region_df[\"End\"] = (anchor + \n", + " (self.n_downstream * (region_df.Strand == \"+\")) + \n", + " (self.n_upstream * (region_df.Strand == \"-\")))\n", + " return region_df" + ] + }, + { + "cell_type": "markdown", + "id": "jewish-washer", + "metadata": {}, + "source": [ + "All we need to do to achieve this, is to replace the TSSFinder with a PolyAFinder:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "metric-ordering", + "metadata": {}, + "outputs": [], + "source": [ + "class PolyADataloader(GenericSingleSeqDataloader):\n", + " \n", + " def __init__(\n", + " self,\n", + " gtf_file : str,\n", + " fasta_file : str,\n", + " n_upstream : int,\n", + " n_downstream : int,\n", + " interval_attrs = [\"gene_id\", \"transcript_id\", \"transcript_type\"]\n", + " ):\n", + " self.gtf_file = gtf_file\n", + " self.fasta_file = fasta_file\n", + " self.n_upstream = n_upstream\n", + " self.n_downstream = n_downstream\n", + " self.use_strand = True\n", + " \n", + " # Source interval data from gtf\n", + " df = pyranges.read_gtf(self.gtf_file).df\n", + " # Subset to areas of interest\n", + " df = PolyAFinder(\n", + " self.n_upstream,\n", + " self.n_downstream\n", + " )(df)\n", + " # Build the interval fetcher\n", + " interval_source = GTFMultiIntervalFetcher(\n", + " df, \n", + " keep_attrs=interval_attrs\n", + " )\n", + " # Source reference sequence from fasta\n", + " reference_sequence_source = FastaStringExtractor(\n", + " fasta_file,\n", + " use_strand=self.use_strand\n", + " )\n", + " # Provide sequence transformer\n", + " sequence_transformer = IdentityTransform()\n", + " # Pass all to superclass\n", + " super().__init__(\n", + " interval_source,\n", + " reference_sequence_source,\n", + " sequence_transformer,\n", + " interval_attrs\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "affiliated-upset", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/modules/i12g/anaconda/envs/kipoi-Framepool2/lib/python3.6/site-packages/ipykernel_launcher.py:23: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + "/opt/modules/i12g/anaconda/envs/kipoi-Framepool2/lib/python3.6/site-packages/ipykernel_launcher.py:26: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n" + ] + } + ], + "source": [ + "polyA = PolyADataloader(\"ExampleFiles/chrom22.gtf\",\n", + " \"ExampleFiles/chrom22.fa\",\n", + " 102,\n", + " 103)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "sorted-complex", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'inputs': {'seq': 'TTTCTGGCATTGTCTGCCCAGCTGCTCCAAGCCAGACTGATGAAGGAGGAGTCCCCAGTGGTGAGCTGGAGGTTGGAGCCTGAAGATGGCACAGCTCTGTGATTCATCTTCTGCGGTTGTGGCAGCCACGGTGATGGAGACGGCAGCTCAACAGGAGCAATAGGAGGGTACCCATGGAGGCCAAGTGGTAGGATCCTTGGAGGGT'},\n", + " 'metadata': {'gene_id': 'ENSG00000279973.1',\n", + " 'transcript_id': 'ENST00000624155.1',\n", + " 'transcript_type': 'protein_coding',\n", + " 'ranges': GenomicRanges(chr='chr22', start=11067987, end=11068192, id=1, strand='+')}}" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "next(polyA)" + ] + }, + { + "cell_type": "markdown", + "id": "enormous-world", + "metadata": {}, + "source": [ + "## Sourcing data from a bed file" + ] + }, + { + "cell_type": "markdown", + "id": "ecological-diversity", + "metadata": {}, + "source": [ + "Say we already have a bed file defining the areas of interest. We can easily build a dataloader for this." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "practical-tunisia", + "metadata": {}, + "outputs": [], + "source": [ + "class BEDLoader:\n", + " \"\"\"\n", + " Class that loads a bed as pyranges-like pandas dataframe\n", + " \"\"\"\n", + " \n", + " def __init__(\n", + " self,\n", + " bed_path : str\n", + " ):\n", + " self.bed_path = bed_path\n", + " \n", + " def load_df(self) -> pd.DataFrame:\n", + " df = pyranges.read_bed(self.bed_path).df\n", + " if \"Strand\" not in df.keys():\n", + " df[\"Strand\"] = \"*\"\n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "silent-roman", + "metadata": {}, + "outputs": [], + "source": [ + "class ChrRename:\n", + " \n", + " def __call__(\n", + " self,\n", + " region_df\n", + " ):\n", + " region_df[\"Chromosome\"] = region_df[\"Chromosome\"].str.replace(\"^chr\", \"\")\n", + " return region_df" + ] + }, + { + "cell_type": "markdown", + "id": "tired-deficit", + "metadata": {}, + "source": [ + "All we need to do to achieve this is to use the BEDLoader rather than reading a GTF:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "complete-algorithm", + "metadata": {}, + "outputs": [], + "source": [ + "class BedDataloader(GenericSingleSeqDataloader):\n", + " \n", + " def __init__(\n", + " self,\n", + " bed_file : str,\n", + " fasta_file : str,\n", + " use_strand : bool,\n", + " interval_attrs = [\"Name\"]\n", + " ):\n", + " self.bed_file = bed_file\n", + " self.fasta_file = fasta_file\n", + " self.use_strand = use_strand\n", + " \n", + " # Source interval data from bed\n", + " df = BEDLoader(self.bed_file).load_df()\n", + " #df = ChrRename()(df)\n", + " # Build the interval fetcher iterator\n", + " interval_source = GTFMultiIntervalFetcher(\n", + " df, \n", + " keep_attrs=interval_attrs\n", + " )\n", + " # Source reference sequence from fasta\n", + " reference_sequence_source = FastaStringExtractor(\n", + " fasta_file,\n", + " use_strand=self.use_strand\n", + " )\n", + " # Provide sequence transformer\n", + " sequence_transformer = IdentityTransform()\n", + " # Pass all to superclass\n", + " super().__init__(\n", + " interval_source,\n", + " reference_sequence_source,\n", + " sequence_transformer,\n", + " interval_attrs\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "theoretical-course", + "metadata": {}, + "outputs": [], + "source": [ + "bed = BedDataloader(\"ExampleFiles/example_bed.bed\",\n", + " \"ExampleFiles/example_bed.fasta\",\n", + " use_strand = False)" + ] + }, + { + "cell_type": "markdown", + "id": "loose-thickness", + "metadata": {}, + "source": [ + "Test equivalence to old StringSeqDL dataloader" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "arctic-increase", + "metadata": {}, + "outputs": [], + "source": [ + "bed_old = StringSeqIntervalDl(\"ExampleFiles/example_bed.bed\",\n", + " \"ExampleFiles/example_bed.fasta\")\n", + "\n", + "seq_new = [x[\"inputs\"][\"seq\"] for x in bed]\n", + "seq_old = [x[\"inputs\"] for x in bed_old]\n", + "assert(x == y for x,y in zip(seq_new, seq_old))" + ] + }, + { + "cell_type": "markdown", + "id": "separated-threat", + "metadata": {}, + "source": [ + "## Extracting all CDS sequences" + ] + }, + { + "cell_type": "markdown", + "id": "disabled-frame", + "metadata": {}, + "source": [ + "Exploiting this design, we can very easily also make a dataloader that works on spliced sequences, e.g. coding sequences" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "willing-mercury", + "metadata": {}, + "outputs": [], + "source": [ + "class CDSFinder:\n", + " \"\"\"\n", + " Extracts CDS\n", + " \"\"\"\n", + " \n", + " def __call__(\n", + " self,\n", + " region_df : pd.DataFrame\n", + " ) -> pd.DataFrame:\n", + " region_df = region_df.query('Feature == \"CDS\" and transcript_type == \"protein_coding\"')\n", + " region_df.set_index(\"transcript_id\", inplace=True)\n", + " return region_df" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "considerable-grant", + "metadata": {}, + "outputs": [], + "source": [ + "class CDSDataloader(GenericSingleSeqDataloader):\n", + " \n", + " def __init__(\n", + " self,\n", + " gtf_file : str,\n", + " fasta_file : str,\n", + " incl_chr = [\"chr1\"],\n", + " interval_attrs = [\"gene_id\", \"transcript_type\"]\n", + " ):\n", + " self.gtf_file = gtf_file\n", + " self.fasta_file = fasta_file\n", + " self.use_strand = True\n", + " \n", + " # Source interval data from gtf\n", + " df = (pyranges.read_gtf(self.gtf_file)\n", + " .df\n", + " .query('Chromosome in @incl_chr')\n", + " )\n", + " # Subset to areas of interest\n", + " df = CDSFinder()(df)\n", + " # Build the interval fetcher iterator\n", + " interval_source = GTFMultiIntervalFetcher(\n", + " df, \n", + " keep_attrs=interval_attrs\n", + " )\n", + " # Source reference sequence from fasta\n", + " reference_sequence_source = FastaStringExtractor(\n", + " fasta_file,\n", + " use_strand=self.use_strand\n", + " )\n", + " # Provide sequence transformer\n", + " sequence_transformer = IdentityTransform()\n", + " # Pass all to superclass\n", + " super().__init__(\n", + " interval_source,\n", + " reference_sequence_source,\n", + " sequence_transformer,\n", + " interval_attrs\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "academic-quantity", + "metadata": {}, + "source": [ + "Get some test data (big files)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "sitting-testament", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "('ExampleFiles/mouse.fa.gz', )" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# ground truth\n", + "urllib.request.urlretrieve(\"ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M26/gencode.vM26.pc_transcripts.fa.gz\", \n", + " \"ExampleFiles/mouse_transcripts.fa.gz\")\n", + "# gtf\n", + "urllib.request.urlretrieve(\"ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M26/gencode.vM26.basic.annotation.gtf.gz\", \n", + " \"ExampleFiles/mouse.gtf.gz\")\n", + "# fasta\n", + "urllib.request.urlretrieve(\"ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M26/GRCm39.primary_assembly.genome.fa.gz\", \n", + " \"ExampleFiles/mouse.fa.gz\")" + ] + }, + { + "cell_type": "markdown", + "id": "beneficial-leisure", + "metadata": {}, + "source": [ + "We have to unzip the fasta, because the kipoiseq fasta extractor is **very** slow on gzipped files" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "detected-queens", + "metadata": {}, + "outputs": [], + "source": [ + "! gunzip ExampleFiles/mouse.fa.gz" + ] + }, + { + "cell_type": "markdown", + "id": "running-organic", + "metadata": {}, + "source": [ + "PyRanges loading of big gtf is also not exactly lightning fast, but it offers a lot of flexibility in return" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "foster-matthew", + "metadata": {}, + "outputs": [], + "source": [ + "cds = CDSDataloader(\"ExampleFiles/mouse.gtf.gz\",\n", + " \"ExampleFiles/mouse.fa\")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "blond-jonathan", + "metadata": {}, + "outputs": [], + "source": [ + "cds_seq = {x[\"metadata\"][\"ranges\"].id:x[\"inputs\"][\"seq\"] for x in cds}" + ] + }, + { + "cell_type": "markdown", + "id": "dated-sponsorship", + "metadata": {}, + "source": [ + "We compare to the groud truth to see if it works:" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "every-chemistry", + "metadata": {}, + "outputs": [], + "source": [ + "from Bio.Seq import Seq\n", + "from Bio.SeqRecord import SeqRecord\n", + "from Bio import SeqIO\n", + "import gzip\n", + "\n", + "with gzip.open(\"ExampleFiles/mouse_transcripts.fa.gz\", \"rt\") as handle:\n", + " for record in SeqIO.parse(handle, \"fasta\"):\n", + " record_id = record.id.split(\"|\")[0] \n", + " if record_id in cds_seq:\n", + " our_seq = cds_seq[record_id]\n", + " # We extract the ground truth CDS position\n", + " for x in record.id.split(\"|\"):\n", + " if x.startswith(\"CDS\"):\n", + " cds_loc = x.split(\":\")[1].split(\"-\")\n", + " # For 3' end incomplete transcripts, we get the whole cds\n", + " if str(record.seq)[int(cds_loc[1]) - 3:int(cds_loc[1])] not in [\"TAA\", \"TGA\", \"TAG\"]:\n", + " true_seq = str(record.seq)[int(cds_loc[0]) - 1:int(cds_loc[1])]\n", + " else: # For 3' end complete transcripts, we need to exclude the stop codon\n", + " true_seq = str(record.seq)[int(cds_loc[0]) - 1:int(cds_loc[1]) - 3]\n", + " try:\n", + " assert(our_seq == true_seq)\n", + " except Exception:\n", + " print(record_id)\n", + " print(our_seq)\n", + " print(str(record.seq))\n", + " print(true_seq)" + ] + }, + { + "cell_type": "markdown", + "id": "graphic-montreal", + "metadata": {}, + "source": [ + "# Variants" + ] + }, + { + "cell_type": "markdown", + "id": "constant-parent", + "metadata": {}, + "source": [ + "The biggest value of kipoiseq are the classes that handle extraction and insertion of variants. These are very useful and provide many useful functions, but difficult to use/maintain because:\n", + "* They are not easy to understand, at least for me, since there is a large number of objects (matchers, extractors, interval_queries etc...) and it is not super obvious which components are needed\n", + "* The documentation is quite limited\n", + "* They all are designed to work on VCF files. Especially for large applications (gnomAD etc.), other sources of variants may be preferable, such as hail, databases, etc" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "id": "suburban-morocco", + "metadata": {}, + "outputs": [], + "source": [ + "! cat ExampleFiles/chrom22.fa | sed 's/>chr/>/g' > ExampleFiles/chrom22_nochr.fa" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "proper-activity", + "metadata": {}, + "outputs": [], + "source": [ + "# Download vcf\n", + "urllib.request.urlretrieve(\"http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/pilot_data/paper_data_sets/a_map_of_human_variation/low_coverage/snps/CEU.low_coverage.2010_09.genotypes.vcf.gz\", 'ExampleFiles/CEU.low_coverage.2010_09.genotypes.vcf.gz')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aggregate-blind", + "metadata": {}, + "outputs": [], + "source": [ + "! tabix ExampleFiles/CEU.low_coverage.2010_09.genotypes.vcf.gz" + ] + }, + { + "cell_type": "markdown", + "id": "portable-cargo", + "metadata": {}, + "source": [ + "## An alternative solution\n", + "\n", + "Just as we can break down the single sequence dataloading task into the three main components of fetching intervals, getting the ref sequence and then transforming, we can also break down the variant dataloading into three main steps:\n", + "* Fetching variants belonging to a specific interval. For this we design a VariantFetcher class.\n", + "* Defining a strategy how to insert multiple variants (all at the same time, all individually, one alt-seq per chromosome for phased data etc...). This is the ugliest object in the current design, but I am not sure how to get around it. Its interface is: given a list of intervals and corresponding variants, yield a template (e.g. list of intervals with the desired variants) to build exactly one alternative sequence\n", + "* Inserting the variants into a reference sequence, respecting the strategy above. For this the existing VariantSeqExtractor works very well. For the time being, I have put a wrapper class around it, because the VariantSeqExtractor has two parameters (anchor and fixed_len) which it only gets at function call time rather than init time. If anything depends on setting these parameters dynamically, one could adapt the design to have an anchoring strategy as well, but for the time being I am not sure whether this is ever the case." + ] + }, + { + "cell_type": "markdown", + "id": "inside-beaver", + "metadata": {}, + "source": [ + "In the prototype.py, I have created a prototype for a GenericVariantDataloader, which takes these components to load variants. It is very similar in design to the GenericSingleSeqDataloader, but with the additional steps detailed above. The code is commented to explain the workflow" + ] + }, + { + "cell_type": "markdown", + "id": "thrown-claim", + "metadata": {}, + "source": [ + "I think this template is easier to use/extend than the currently existing dataloaders. The currently existing code has a large hierarchy of specialized coupled objects (for example, there is a SingleVariantUTRDataLoader, which has a GenericSingleVariantMultiIntervalVCFSeqExtractor, which is BaseMultiIntervalVCFSeqExtractor (which in turn is a GenericMultiIntervalSeqExtractor, which in turn is a BaseMultiIntervalSeqExtractor) and it has a interval_fetcher and a variant seq extractor + a mixin that determines the insertion strategy and a \"matcher\" that as far as I can see is never used (see appendix)). Its a bit of a matryoshka doll that for me at least, took a considerable time to understand, and I think most new users will be scared away by this. It also leads to a proliferation of kind of useless classes. For example if I want to get variants from a hail table, I have to create a BaseMultiIntervalHailSeqExtractor and then use mixins to create the GenericSingleVariantMultiIntervalHailSeqExtractor and the GenericMultiVariantMultiIntervalHailSeqExtractor and so on.\n", + "\n", + "I believe the design I propose, by decoupling all of these things, makes it in my opinion a lot easier to design new dataloaders in a modular fashion (i.e. if I want tyo load from hail, I jsut change the fetcher). It also makes it easier to write a documentation for new users to understand how to build a dataloader, changing just the particular components they need." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "wanted-mount", + "metadata": {}, + "outputs": [], + "source": [ + "class CDSFinder:\n", + " \"\"\"\n", + " Extracts CDS\n", + " \"\"\"\n", + " \n", + " def __call__(\n", + " self,\n", + " region_df : pd.DataFrame\n", + " ) -> pd.DataFrame:\n", + " region_df = region_df.query('Feature == \"CDS\" and transcript_type == \"protein_coding\"')\n", + " region_df.set_index(\"transcript_id\", inplace=True)\n", + " return region_df\n", + "\n", + "class ChrRename:\n", + " \n", + " def __call__(\n", + " self,\n", + " region_df\n", + " ):\n", + " region_df[\"Chromosome\"] = region_df[\"Chromosome\"].str.replace(\"^chr\", \"\")\n", + " return region_df" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "related-delight", + "metadata": {}, + "outputs": [], + "source": [ + "from kipoiseq.dataloaders.prototype import GenericVariantDataloader, VCFVariantFetcher, SingleVariantStrategy,\\\n", + " IdentityTransform, VariantSequenceExtractor\n", + "class CDSVariantDataloader(GenericVariantDataloader):\n", + " \n", + " def __init__(\n", + " self,\n", + " gtf_file : str,\n", + " fasta_file : str,\n", + " vcf_file : str,\n", + " interval_attrs = [\"gene_id\", \"transcript_type\"]\n", + " ):\n", + " self.gtf_file = gtf_file\n", + " self.fasta_file = fasta_file\n", + " self.vcf_file = vcf_file\n", + " self.use_strand = True\n", + " \n", + " # Source interval data from gtf\n", + " df = pyranges.read_gtf(self.gtf_file).df\n", + " # Subset to areas of interest\n", + " df = CDSFinder()(df)\n", + " df = ChrRename()(df)\n", + " # Build the interval fetcher\n", + " interval_source = GTFMultiIntervalFetcher(\n", + " df, \n", + " keep_attrs=interval_attrs\n", + " )\n", + " # Source reference sequence from fasta\n", + " reference_sequence_source = FastaStringExtractor(\n", + " fasta_file,\n", + " use_strand=self.use_strand\n", + " )\n", + " # Source variants from vcf\n", + " variant_source = VCFVariantFetcher(\n", + " self.vcf_file\n", + " )\n", + " # Build variant sequence extractor\n", + " # I am not super sure what changing the anchor achieves \n", + " # The GenericSingleVariantMultiIntervalVCFSeqExtractor hardcodes it to 0\n", + " # So I set it by default to 0 too.\n", + " variant_sequence_extractor = VariantSequenceExtractor(\n", + " reference_sequence_source,\n", + " anchor = 0,\n", + " fixed_len = False\n", + " )\n", + " # Provide variants individually\n", + " variant_insertion_strategy = SingleVariantStrategy()\n", + " # Provide sequence transformer\n", + " sequence_transformer = IdentityTransform()\n", + " # Pass all to superclass\n", + " super().__init__(\n", + " interval_source,\n", + " variant_source,\n", + " variant_insertion_strategy,\n", + " reference_sequence_source,\n", + " variant_sequence_extractor,\n", + " sequence_transformer,\n", + " interval_attrs\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "touched-receiver", + "metadata": {}, + "source": [ + "## Test" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "distributed-shipping", + "metadata": {}, + "outputs": [], + "source": [ + "cdsvar = CDSVariantDataloader(\n", + " \"ExampleFiles/chrom22.gtf\",\n", + " \"ExampleFiles/chrom22_nochr.fa\",\n", + " \"ExampleFiles/CEU.low_coverage.2010_09.genotypes.vcf.gz\"\n", + ")\n", + "\n", + "# Results when using the old extractor\n", + "var_dict_new = defaultdict(dict)\n", + "for item in cdsvar:\n", + " transcript_id = item[\"metadata\"][\"ranges\"].name\n", + " var_dict_new[transcript_id][\"ref\"] = item[\"inputs\"][\"ref_seq\"]\n", + " var_dict_new[transcript_id][\"alts\"] = var_dict_new[transcript_id].get(\"alts\", []) + [item[\"inputs\"][\"alt_seq\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "responsible-multimedia", + "metadata": {}, + "outputs": [], + "source": [ + "# get variants using old extractor as ground truth\n", + "df = pyranges.read_gtf('ExampleFiles/chrom22.gtf').df\n", + "df = CDSFinder()(df)\n", + "df = ChrRename()(df)\n", + "interval_source = GTFMultiIntervalFetcher(\n", + " df, \n", + " keep_attrs=[\"gene_id\"]\n", + ")\n", + "variant_matcher = SingleVariantMatcher(\n", + " \"ExampleFiles/CEU.low_coverage.2010_09.genotypes.vcf.gz\",\n", + " pranges=pyranges.PyRanges(\n", + " interval_source.df.reset_index()\n", + " )\n", + ")\n", + "reference_sequence_source = FastaStringExtractor(\n", + " \"ExampleFiles/chrom22_nochr.fa\",\n", + " use_strand=True\n", + ")\n", + "multi_sample_VCF = MultiSampleVCF(\"ExampleFiles/CEU.low_coverage.2010_09.genotypes.vcf.gz\")\n", + "extractor = GenericSingleVariantMultiIntervalVCFSeqExtractor(\n", + " interval_fetcher=interval_source,\n", + " reference_seq_extractor=reference_sequence_source,\n", + " variant_matcher=variant_matcher,\n", + " multi_sample_VCF=multi_sample_VCF,\n", + ")\n", + "\n", + "# Results when using the old extractor\n", + "var_dict = defaultdict(dict)\n", + "for transcript_id, (ref_seq, alt_seqs) in extractor.items():\n", + " alt_seq_list = [alt_seq[0] for alt_seq in alt_seqs]\n", + " if len(alt_seq_list) > 0:\n", + " var_dict[transcript_id][\"ref\"] = ref_seq\n", + " var_dict[transcript_id][\"alts\"] = alt_seq_list" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "floating-first", + "metadata": {}, + "outputs": [], + "source": [ + "assert([sorted(var_dict_new.keys()) == sorted(var_dict.keys())])" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "false-organ", + "metadata": {}, + "outputs": [], + "source": [ + "for transcript_id in var_dict_new.keys():\n", + " assert(var_dict_new[transcript_id][\"ref\"] == var_dict[transcript_id][\"ref\"])\n", + " assert(sorted(var_dict_new[transcript_id][\"alts\"]) == sorted(var_dict[transcript_id][\"alts\"]))" + ] + }, + { + "cell_type": "markdown", + "id": "tired-following", + "metadata": {}, + "source": [ + "# Outlook\n", + "\n", + "Following this design, one could create a reduced and maybe more intuitive structure of kipoiseq, structured along these 5 main components:\n", + "* interval_fetcher.py, which provide intervals from whatever source (base class: BaseMultiIntervalFetcher, maybe rename to BaseIntervalFetcher)\n", + "* variant_fetcher.py, which provide variants given intervals, from whatever source (base class: BaseVariantFetcher)\n", + "* sequence_extractor.py, which provides reference sequence given intervals, from whatever source (base class: BaseExtractor)\n", + "* variant_extractor.py, which provides a sequence with variants inserted, given intervals and variants (base class: VariantSeqExtractor)\n", + "* transforms.py, which provide transformations" + ] + }, + { + "cell_type": "markdown", + "id": "velvet-classroom", + "metadata": {}, + "source": [ + "# Appendix\n", + "\n", + "An example of the somewhat confusing nature of the current variant objects: The GenericSingleVariantMultiIntervalVCFSeqExtractor (I love OOP names) wants a VariantMatcher, but as far as I can see, never uses it in any capacity" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "daily-wallace", + "metadata": {}, + "outputs": [], + "source": [ + "df = pyranges.read_gtf('ExampleFiles/chrom22.gtf').df\n", + "df = CDSFinder()(df)\n", + "df = ChrRename()(df)\n", + "interval_source = GTFMultiIntervalFetcher(\n", + " df, \n", + " keep_attrs=[\"gene_id\"]\n", + ")\n", + "variant_matcher = SingleVariantMatcher(\n", + " \"ExampleFiles/CEU.low_coverage.2010_09.genotypes.vcf.gz\",\n", + " pranges=pyranges.PyRanges(\n", + " interval_source.df.reset_index()\n", + " )\n", + ")\n", + "reference_sequence_source = FastaStringExtractor(\n", + " \"ExampleFiles/chrom22_nochr.fa\",\n", + " use_strand=True\n", + ")\n", + "multi_sample_VCF = MultiSampleVCF(\"ExampleFiles/CEU.low_coverage.2010_09.genotypes.vcf.gz\")\n", + "extractor = GenericSingleVariantMultiIntervalVCFSeqExtractor(\n", + " interval_fetcher=interval_source,\n", + " reference_seq_extractor=reference_sequence_source,\n", + " variant_matcher=variant_matcher,\n", + " multi_sample_VCF=multi_sample_VCF,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "imported-biography", + "metadata": {}, + "outputs": [], + "source": [ + "# Results when using the matcher\n", + "var_dict = defaultdict(list)\n", + "for transcript_id, (ref_seq, alt_seqs) in extractor.items():\n", + " alt_seq_list = [alt_seq for alt_seq in alt_seqs]\n", + " for alt_seq in alt_seq_list: \n", + " alt_seq[1][\"source\"] = None # because cyvcf variant objects are incomparable\n", + " if len(alt_seq_list) > 0:\n", + " var_dict[transcript_id].append((ref_seq, alt_seq_list))" + ] + }, + { + "cell_type": "code", + "execution_count": 128, + "id": "deadly-latino", + "metadata": {}, + "outputs": [], + "source": [ + "extractor = GenericSingleVariantMultiIntervalVCFSeqExtractor(\n", + " interval_fetcher=interval_source,\n", + " reference_seq_extractor=reference_sequence_source,\n", + " variant_matcher=None,\n", + " multi_sample_VCF=multi_sample_VCF,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 129, + "id": "violent-telephone", + "metadata": {}, + "outputs": [], + "source": [ + "# Results when matcher is none\n", + "var_dict_nomatcher = defaultdict(list)\n", + "for transcript_id, (ref_seq, alt_seqs) in extractor.items():\n", + " alt_seq_list = [alt_seq for alt_seq in alt_seqs]\n", + " for alt_seq in alt_seq_list: \n", + " alt_seq[1][\"source\"] = None # because cyvcf variant objects are incomparable\n", + " if len(alt_seq_list) > 0:\n", + " var_dict_nomatcher[transcript_id].append((ref_seq, alt_seq_list))" + ] + }, + { + "cell_type": "code", + "execution_count": 130, + "id": "preceding-reflection", + "metadata": {}, + "outputs": [], + "source": [ + "assert(var_dict.keys() == var_dict_nomatcher.keys())" + ] + }, + { + "cell_type": "code", + "execution_count": 132, + "id": "caroline-boring", + "metadata": {}, + "outputs": [], + "source": [ + "for key in var_dict.keys():\n", + " try:\n", + " assert(var_dict[key] == var_dict_nomatcher[key])\n", + " except Exception:\n", + " a = var_dict[key]\n", + " b = var_dict_nomatcher[key]\n", + " break" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "prerequisite-facial", + "metadata": {}, + "outputs": [], + "source": [ + "x = var_dict[next(iter(var_dict))]" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "widespread-louisiana", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('ATGGTGGCTGAGGCTGGTTCAATGCCGGCTGCCTCCTCTGTGAAGAAGCCATTTGGTCTCAGAAGCAAGATGGGCAAGTGGTGCCGCCACTGCTTCGCCTGGTGCAGGGGGAGCGGCAAGAGCAACGTGGGCACTTCTGGAGACCACGACGATTCTGCTATGAAGACACTCAGGAGCAAGATGGGCAAGTGGTGCTGCCACTGCTTCCCCTGGTGCAGGGGGAGCGGCAAGAGCAACGTGGGCACTTCTGGAGACCACGACGATTCTGCTATGAAGACACTCAGGAGCAAGATGGGCAAGTGGTGCTGCCACTGCTTCCCCTGCTGCAGGGGGAGCGGCAAGAGCAACGTGGGCACTTCTGGAGACCACGACGACTCTGCTATGAAGACACTCAGGAGCAAGATGGGCAACTGGTGCTGCCACTGCTTCCCCTGCTGCAGGGGGAGCGGCAAGAACAAAGTGGGCCCTTGGGGAGACTACGACGACAGCGCTTTCATGGAGCCGAGGTACCACGTCCGTCGAGAAGATCTGGACAAGCTCCACAGAGCTGCCTGGTGGGGTAAAGTCCCCAGAAAGGATCTCATCGTCATGCTCAAGGACACTGACATGAACAAGAAGGACAAGCAAAAGAGGACTGCTCTACATCTGGCCTCTGCCAATGGAAATTCAGAAGTAGTAAAACTCCTGCTGGACAGACGATGTCAACTTAATATCCTTGACAACAAAAAGAGGACAGCTCTGACAAAGGCCGTACAATGCCAGGAAGATGAATGTGCGTTAATGTTGCTGGAACATGGCACTGATCCGAATATTCCAGATGAGTATGGAAATACCGCTCTACACTATGCTATCTACAATGAAGATAAATTAATGGCCAAAGCACTGCTCTTATACGGTGCTGATATCGAATCAAAAAACAAGCATGGCCTCACACCACTGTTACTTGGTGTACATGAGCAAAAACAGCAAGTGGTGAAATTTTTAATCAAGAAAAAAGCAAATTTAAATGCACTGGATAGATATGGAAGAACTGCTCTCATACTTGCTGTATGTTGTGGATCGGCAAGTATAGTCAGCCTTCTACTTGAGCAAAACATTGATGTATCTTCTCAAGATCTATCTGGACAGACGGCCAGAGAGTATGCTGTTTCTAGTCGTCATAATGTAATTTGCCAGTTACTTTCTGACTACAAAGAAAAACAGATACTAAAAGTCTCTTCTGAAAACAGCAATCCAGAACAAGACTTAAAGCTGACATCAGAGGAAGAGTCACAAAGGCTTAAAGGAAGTGAAAATAGCCAGCCAGAGGAAATGTCTCAAGAACCAGAAATAAATAAGGGTGGTGATAGAAAGGTTGAAGAAGAAATGAAGAAGCACGGAAGTACTCATATGGGATTCCCAGAAAACCTGACTAACGGTGCCACTGCTGACAATGGTGATGATGGATTAATTCCACCAAGGAAAAGCAGAACACCTGAAAGCCAGCAATTTCCTGACACTGAGAATGAACAGTATCACAGTGATGAACAAAATGATACTCAGAAGCAACTTTCTGAAGAACAGAACACTGGAATATTACAAGATGAGATTCTGATTCATGAAGAAAAGCAGATAGAAGTGGCTGAAAATGAATTC',\n", + " {'chrom': '22',\n", + " 'pos': 15690488,\n", + " 'ref': 'T',\n", + " 'alt': 'C',\n", + " 'id': 'rs175148',\n", + " 'qual': None,\n", + " 'filter': None,\n", + " 'info': {'AA': 'T',\n", + " 'AC': 72,\n", + " 'AN': 120,\n", + " 'DP': 285,\n", + " 'HM2': True,\n", + " 'HM3': True,\n", + " 'GP': '22:17310488',\n", + " 'BN': 79},\n", + " 'source': None}),\n", + " ('ATGGTGGCTGAGGCTGGTTCAATGCCGGCTGCCTCCTCTGTGAAGAAGCCATTTGGTCTCAGAAGCAAGATGGGCAAGTGGTGCCGCCACTGCTTCGCCTGGTGCAGGGGGAGCGGCAAGAGCAACGTGGGCACTTCTGGAGACCACGACGATTCTGCTATGAAGACACTCAGGAGCAAGATGGGCAAGTGGTGCTGCCACTGCTTCCCCTGGTGCAGGGGGAGCGGCAAGAGCAACGTGGGCACTTCTGGAGACCACGACGATTCTGCTATGAAGACACTCAGGAGCAAGATGGGCAAGTGGTGCTGCCACTGCTTCCCCTGCTGCAGGGGGAGCGGCAAGAGCAACGTGGGCACTTCTGGAGACCACGACGACTCTGCTATGAAGACACTCAGGAGCAAGATGGGCAAGTGGTGCTGCCACTGCTTCCCCTGCTGCAGGGGGAGCGGCAAGAACAAAGTGGGCCCTTGGGGAGACTACGACGACAGCGCTTTCATGGAGCCGAGGTACCACGTCCGTCGAGAAGATCTGGACAAGCTCCACAGAGCTGCCTGGTGGGGTAAAGTCCCCAGAAAGGATCTCATCGTCATGCTCAAGGACACTGACATGAACAAGAAGGACAAGCAAAAGAGGACTGCTCTACATCTGGCCTCTGCCAATGGAAATTCAGAAGTAGTAAAACTCCTGCTGGACAGACGATGTCAACTTAATATCCTTGAGAACAAAAAGAGGACAGCTCTGACAAAGGCCGTACAATGCCAGGAAGATGAATGTGCGTTAATGTTGCTGGAACATGGCACTGATCCGAATATTCCAGATGAGTATGGAAATACCGCTCTACACTATGCTATCTACAATGAAGATAAATTAATGGCCAAAGCACTGCTCTTATACGGTGCTGATATCGAATCAAAAAACAAGCATGGCCTCACACCACTGTTACTTGGTGTACATGAGCAAAAACAGCAAGTGGTGAAATTTTTAATCAAGAAAAAAGCAAATTTAAATGCACTGGATAGATATGGAAGAACTGCTCTCATACTTGCTGTATGTTGTGGATCGGCAAGTATAGTCAGCCTTCTACTTGAGCAAAACATTGATGTATCTTCTCAAGATCTATCTGGACAGACGGCCAGAGAGTATGCTGTTTCTAGTCGTCATAATGTAATTTGCCAGTTACTTTCTGACTACAAAGAAAAACAGATACTAAAAGTCTCTTCTGAAAACAGCAATCCAGAACAAGACTTAAAGCTGACATCAGAGGAAGAGTCACAAAGGCTTAAAGGAAGTGAAAATAGCCAGCCAGAGGAAATGTCTCAAGAACCAGAAATAAATAAGGGTGGTGATAGAAAGGTTGAAGAAGAAATGAAGAAGCACGGAAGTACTCATATGGGATTCCCAGAAAACCTGACTAACGGTGCCACTGCTGACAATGGTGATGATGGATTAATTCCACCAAGGAAAAGCAGAACACCTGAAAGCCAGCAATTTCCTGACACTGAGAATGAACAGTATCACAGTGATGAACAAAATGATACTCAGAAGCAACTTTCTGAAGAACAGAACACTGGAATATTACAAGATGAGATTCTGATTCATGAAGAAAAGCAGATAGAAGTGGCTGAAAATGAATTC',\n", + " {'chrom': '22',\n", + " 'pos': 15695458,\n", + " 'ref': 'T',\n", + " 'alt': 'G',\n", + " 'id': 'rs165652',\n", + " 'qual': None,\n", + " 'filter': None,\n", + " 'info': {'AA': 'T',\n", + " 'AC': 92,\n", + " 'AN': 120,\n", + " 'DP': 288,\n", + " 'HM2': True,\n", + " 'HM3': True,\n", + " 'GP': '22:17315458',\n", + " 'BN': 79},\n", + " 'source': None}),\n", + " ('ATGGTGGCTGAGGCTGGTTCAATGCCGGCTGCCTCCTCTGTGAAGAAGCCATTTGGTCTCAGAAGCAAGATGGGCAAGTGGTGCCGCCACTGCTTCGCCTGGTGCAGGGGGAGCGGCAAGAGCAACGTGGGCACTTCTGGAGACCACGACGATTCTGCTATGAAGACACTCAGGAGCAAGATGGGCAAGTGGTGCTGCCACTGCTTCCCCTGGTGCAGGGGGAGCGGCAAGAGCAACGTGGGCACTTCTGGAGACCACGACGATTCTGCTATGAAGACACTCAGGAGCAAGATGGGCAAGTGGTGCTGCCACTGCTTCCCCTGCTGCAGGGGGAGCGGCAAGAGCAACGTGGGCACTTCTGGAGACCACGACGACTCTGCTATGAAGACACTCAGGAGCAAGATGGGCAAGTGGTGCTGCCACTGCTTCCCCTGCTGCAGGGGGAGCGGCAAGAACAAAGTGGGCCCTTGGGGAGACTACGACGACAGCGCTTTCATGGAGCCGAGGTACCACGTCCGTCGAGAAGATCTGGACAAGCTCCACAGAGCTGCCTGGTGGGGTAAAGTCCCCAGAAAGGATCTCATCGTCATGCTCAAGGACACTGACATGAACAAGAAGGACAAGCAAAAGAGGACTGCTCTACATCTGGCCTCTGCCAATGGAAATTCAGAAGTAGTAAAACTCCTGCTGGACAGACGATGTCAACTTAATATCCTTGACAACAAAAAGAGGACAGCTCTGACAAAGGCCGTACAATGCCAGGAAGATGAATGTGCGTTAATGTTGCTGGAACATGGCACTGATCCGAATATTCCAGATGAGTATGGAAATACCGCTCTACACTATGCTATCTACAATGAAGATAAATTAATGGCCAAAGCACTGCTCTTATACGGTGCTGATATCGAATCAAAAAACAAGCATGGCCTCACACCACTGTTTCTTGGTGTACATGAGCAAAAACAGCAAGTGGTGAAATTTTTAATCAAGAAAAAAGCAAATTTAAATGCACTGGATAGATATGGAAGAACTGCTCTCATACTTGCTGTATGTTGTGGATCGGCAAGTATAGTCAGCCTTCTACTTGAGCAAAACATTGATGTATCTTCTCAAGATCTATCTGGACAGACGGCCAGAGAGTATGCTGTTTCTAGTCGTCATAATGTAATTTGCCAGTTACTTTCTGACTACAAAGAAAAACAGATACTAAAAGTCTCTTCTGAAAACAGCAATCCAGAACAAGACTTAAAGCTGACATCAGAGGAAGAGTCACAAAGGCTTAAAGGAAGTGAAAATAGCCAGCCAGAGGAAATGTCTCAAGAACCAGAAATAAATAAGGGTGGTGATAGAAAGGTTGAAGAAGAAATGAAGAAGCACGGAAGTACTCATATGGGATTCCCAGAAAACCTGACTAACGGTGCCACTGCTGACAATGGTGATGATGGATTAATTCCACCAAGGAAAAGCAGAACACCTGAAAGCCAGCAATTTCCTGACACTGAGAATGAACAGTATCACAGTGATGAACAAAATGATACTCAGAAGCAACTTTCTGAAGAACAGAACACTGGAATATTACAAGATGAGATTCTGATTCATGAAGAAAAGCAGATAGAAGTGGCTGAAAATGAATTC',\n", + " {'chrom': '22',\n", + " 'pos': 15698682,\n", + " 'ref': 'C',\n", + " 'alt': 'T',\n", + " 'id': 'rs165670',\n", + " 'qual': None,\n", + " 'filter': None,\n", + " 'info': {'AA': 'c',\n", + " 'AC': 67,\n", + " 'AN': 120,\n", + " 'DP': 293,\n", + " 'HM2': True,\n", + " 'HM3': True,\n", + " 'GP': '22:17318682',\n", + " 'BN': 79},\n", + " 'source': None}),\n", + " ('ATGGTGGCTGAGGCTGGTTCAATGCCGGCTGCCTCCTCTGTGAAGAAGCCATTTGGTCTCAGAAGCAAGATGGGCAAGTGGTGCCGCCACTGCTTCGCCTGGTGCAGGGGGAGCGGCAAGAGCAACGTGGGCACTTCTGGAGACCACGACGATTCTGCTATGAAGACACTCAGGAGCAAGATGGGCAAGTGGTGCTGCCACTGCTTCCCCTGGTGCAGGGGGAGCGGCAAGAGCAACGTGGGCACTTCTGGAGACCACGACGATTCTGCTATGAAGACACTCAGGAGCAAGATGGGCAAGTGGTGCTGCCACTGCTTCCCCTGCTGCAGGGGGAGCGGCAAGAGCAACGTGGGCACTTCTGGAGACCACGACGACTCTGCTATGAAGACACTCAGGAGCAAGATGGGCAAGTGGTGCTGCCACTGCTTCCCCTGCTGCAGGGGGAGCGGCAAGAACAAAGTGGGCCCTTGGGGAGACTACGACGACAGCGCTTTCATGGAGCCGAGGTACCACGTCCGTCGAGAAGATCTGGACAAGCTCCACAGAGCTGCCTGGTGGGGTAAAGTCCCCAGAAAGGATCTCATCGTCATGCTCAAGGACACTGACATGAACAAGAAGGACAAGCAAAAGAGGACTGCTCTACATCTGGCCTCTGCCAATGGAAATTCAGAAGTAGTAAAACTCCTGCTGGACAGACGATGTCAACTTAATATCCTTGACAACAAAAAGAGGACAGCTCTGACAAAGGCCGTACAATGCCAGGAAGATGAATGTGCGTTAATGTTGCTGGAACATGGCACTGATCCGAATATTCCAGATGAGTATGGAAATACCGCTCTACACTATGCTATCTACAATGAAGATAAATTAATGGCCAAAGCACTGCTCTTATACGGTGCTGATATCGAATCAAAAAACAAGCATGGCCTCACACCACTGTTACTTGGTGTACATGAGCAAAAACAGCAAGTGGTGAAATTTTTAATCAAGAAAAAAGCAAATTTAAATGCACTGGATAGATATGGAAGAACTGCTCTCATACTTGCTGTATGTTGTGGATCGGCAAGTATAGTCAGCCTTCTACTTGAGCAAAACATTGATGTATCTTCTCAAGATCTATCTGGACAGACGGCCAGAGAGTATGCTGTTTCTAGTCGTCATAATGTAATTTGCCAGTTACTTTCTGACTACAAAGAAAAACAGATACTAAAAGTCTCTTCTGAAAACAGCAATCCAGAACAAGACTTAAAGCTGACATCAGAGGAAGAGTCACAAAGGCTTAAAGGAAGTGAAAATAGCCAGCCAGAGGAAATGTCTCAAGAACCAGAAATAAATAAGGGTGGTGATAGAAAGGTTGAAGAAGAAATGAAGAAGCACGGAAGTACTCATATGGGATTCCCAGAAAACCTGACTAACGGTGCCACTGCTGACAATGGTGATGATGGATTAATTCCACCAAGGAAAAGCAGAACACCTGAAAGCCAGCAATTTCCTGACACTGAGAAGGAACAGTATCACAGTGATGAACAAAATGATACTCAGAAGCAACTTTCTGAAGAACAGAACACTGGAATATTACAAGATGAGATTCTGATTCATGAAGAAAAGCAGATAGAAGTGGCTGAAAATGAATTC',\n", + " {'chrom': '22',\n", + " 'pos': 15711020,\n", + " 'ref': 'A',\n", + " 'alt': 'G',\n", + " 'id': 'rs12483904',\n", + " 'qual': None,\n", + " 'filter': None,\n", + " 'info': {'AA': 'a',\n", + " 'AC': 28,\n", + " 'AN': 120,\n", + " 'DP': 268,\n", + " 'HM2': True,\n", + " 'GP': '22:17331020',\n", + " 'BN': 120},\n", + " 'source': None}),\n", + " ('ATGGTGGCTGAGGCTGGTTCAATGCCGGCTGCCTCCTCTGTGAAGAAGCCATTTGGTCTCAGAAGCAAGATGGGCAAGTGGTGCCGCCACTGCTTCGCCTGGTGCAGGGGGAGCGGCAAGAGCAACGTGGGCACTTCTGGAGACCACGACGATTCTGCTATGAAGACACTCAGGAGCAAGATGGGCAAGTGGTGCTGCCACTGCTTCCCCTGGTGCAGGGGGAGCGGCAAGAGCAACGTGGGCACTTCTGGAGACCACGACGATTCTGCTATGAAGACACTCAGGAGCAAGATGGGCAAGTGGTGCTGCCACTGCTTCCCCTGCTGCAGGGGGAGCGGCAAGAGCAACGTGGGCACTTCTGGAGACCACGACGACTCTGCTATGAAGACACTCAGGAGCAAGATGGGCAAGTGGTGCTGCCACTGCTTCCCCTGCTGCAGGGGGAGCGGCAAGAACAAAGTGGGCCCTTGGGGAGACTACGACGACAGCGCTTTCATGGAGCCGAGGTACCACGTCCGTCGAGAAGATCTGGACAAGCTCCACAGAGCTGCCTGGTGGGGTAAAGTCCCCAGAAAGGATCTCATCGTCATGCTCAAGGACACTGACATGAACAAGAAGGACAAGCAAAAGAGGACTGCTCTACATCTGGCCTCTGCCAATGGAAATTCAGAAGTAGTAAAACTCCTGCTGGACAGACGATGTCAACTTAATATCCTTGACAACAAAAAGAGGACAGCTCTGACAAAGGCCGTACAATGCCAGGAAGATGAATGTGCGTTAATGTTGCTGGAACATGGCACTGATCCGAATATTCCAGATGAGTATGGAAATACCGCTCTACACTATGCTATCTACAATGAAGATAAATTAATGGCCAAAGCACTGCTCTTATACGGTGCTGATATCGAATCAAAAAACAAGCATGGCCTCACACCACTGTTACTTGGTGTACATGAGCAAAAACAGCAAGTGGTGAAATTTTTAATCAAGAAAAAAGCAAATTTAAATGCACTGGATAGATATGGAAGAACTGCTCTCATACTTGCTGTATGTTGTGGATCGGCAAGTATAGTCAGCCTTCTACTTGAGCAAAACATTGATGTATCTTCTCAAGATCTATCTGGACAGACGGCCAGAGAGTATGCTGTTTCTAGTCGTCATAATGTAATTTGCCAGTTACTTTCTGACTACAAAGAAAAACAGATACTAAAAGTCTCTTCTGAAAACAGCAATCCAGAACAAGACTTAAAGCTGACATCAGAGGAAGAGTCACAAAGGCTTAAAGGAAGTGAAAATAGCCAGCCAGAGGAAATGTCTCAAGAACCAGAAATAAATAAGGGTGGTGATAGAAAGGTTGAAGAAGAAATGAAGAAGCACGGAAGTACTCATATGGGATTCCCAGAAAACCTGACTAACGGTGCCACTGCTGACAATGGTGATGATGGATTAATTCCACCAAGGAAAAGCAGAACACCTGAAAGCCAGCAATTTCCTGACACTGAGAATGAACAGTATCACAGTGATGAACAAAATGCTACTCAGAAGCAACTTTCTGAAGAACAGAACACTGGAATATTACAAGATGAGATTCTGATTCATGAAGAAAAGCAGATAGAAGTGGCTGAAAATGAATTC',\n", + " {'chrom': '22',\n", + " 'pos': 15719674,\n", + " 'ref': 'T',\n", + " 'alt': 'C',\n", + " 'id': 'rs165760',\n", + " 'qual': None,\n", + " 'filter': None,\n", + " 'info': {'AA': 'N',\n", + " 'AC': 94,\n", + " 'AN': 120,\n", + " 'DP': 292,\n", + " 'GP': '22:17339674',\n", + " 'BN': 79},\n", + " 'source': None}),\n", + " ('ATGGTGGCTGAGGCTGGTTCAATGCCGGCTGCCTCCTCTGTGAAGAAGCCATTTGGTCTCAGAAGCAAGATGGGCAAGTGGTGCCGCCACTGCTTCGCCTGGTGCAGGGGGAGCGGCAAGAGCAACGTGGGCACTTCTGGAGACCACGACGATTCTGCTATGAAGACACTCAGGAGCAAGATGGGCAAGTGGTGCTGCCACTGCTTCCCCTGGTGCAGGGGGAGCGGCAAGAGCAACGTGGGCACTTCTGGAGACCACGACGATTCTGCTATGAAGACACTCAGGAGCAAGATGGGCAAGTGGTGCTGCCACTGCTTCCCCTGCTGCAGGGGGAGCGGCAAGAGCAACGTGGGCACTTCTGGAGACCACGACGACTCTGCTATGAAGACACTCAGGAGCAAGATGGGCAAGTGGTGCTGCCACTGCTTCCCCTGCTGCAGGGGGAGCGGCAAGAACAAAGTGGGCCCTTGGGGAGACTACGACGACAGCGCTTTCATGGAGCCGAGGTACCACGTCCGTCGAGAAGATCTGGACAAGCTCCACAGAGCTGCCTGGTGGGGTAAAGTCCCCAGAAAGGATCTCATCGTCATGCTCAAGGACACTGACATGAACAAGAAGGACAAGCAAAAGAGGACTGCTCTACATCTGGCCTCTGCCAATGGAAATTCAGAAGTAGTAAAACTCCTGCTGGACAGACGATGTCAACTTAATATCCTTGACAACAAAAAGAGGACAGCTCTGACAAAGGCCGTACAATGCCAGGAAGATGAATGTGCGTTAATGTTGCTGGAACATGGCACTGATCCGAATATTCCAGATGAGTATGGAAATACCGCTCTACACTATGCTATCTACAATGAAGATAAATTAATGGCCAAAGCACTGCTCTTATACGGTGCTGATATCGAATCAAAAAACAAGCATGGCCTCACACCACTGTTACTTGGTGTACATGAGCAAAAACAGCAAGTGGTGAAATTTTTAATCAAGAAAAAAGCAAATTTAAATGCACTGGATAGATATGGAAGAACTGCTCTCATACTTGCTGTATGTTGTGGATCGGCAAGTATAGTCAGCCTTCTACTTGAGCAAAACATTGATGTATCTTCTCAAGATCTATCTGGACAGACGGCCAGAGAGTATGCTGTTTCTAGTCGTCATAATGTAATTTGCCAGTTACTTTCTGACTACAAAGAAAAACAGATACTAAAAGTCTCTTCTGAAAACAGCAATCCAGAACAAGACTTAAAGCTGACATCAGAGGAAGAGTCACAAAGGCTTAAAGGAAGTGAAAATAGCCAGCCAGAGGAAATGTCTCAAGAACCAGAAATAAATAAGGGTGGTGATAGAAAGGTTGAAGAAGAAATGAAGAAGCACGGAAGTACTCATATGGGATTCCCAGAAAACCTGACTAACGGTGCCACTGCTGACAATGGTGATGATGGATTAATTCCACCAAGGAAAAGCAGAACACCTGAAAGCCAGCAATTTCCTGACACTGAGAATGAACAGTATCACAGTGATGAACAAAATGATACTCAGAAGCAACTTTCTGAAGAACAGAACACTGGAATATTACAAGATGAGATTCTGATTCATGAAGAAAAGCAGAAAGAAGTGGCTGAAAATGAATTC',\n", + " {'chrom': '22',\n", + " 'pos': 15719752,\n", + " 'ref': 'G',\n", + " 'alt': 'A',\n", + " 'id': 'rs2110439',\n", + " 'qual': None,\n", + " 'filter': None,\n", + " 'info': {'AA': 'g',\n", + " 'AC': 27,\n", + " 'AN': 120,\n", + " 'DP': 298,\n", + " 'GP': '22:17339752',\n", + " 'BN': 96},\n", + " 'source': None})]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x[0][1]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:anaconda-kipoi-Framepool2]", + "language": "python", + "name": "conda-env-anaconda-kipoi-Framepool2-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/kipoiseq/dataloaders/__init__.py b/kipoiseq/dataloaders/__init__.py index cf45a9d..5c63194 100644 --- a/kipoiseq/dataloaders/__init__.py +++ b/kipoiseq/dataloaders/__init__.py @@ -1,3 +1,4 @@ from .sequence import * from .splicing import * from .protein import * +from .prototype import * diff --git a/kipoiseq/dataloaders/prototype.py b/kipoiseq/dataloaders/prototype.py new file mode 100644 index 0000000..2e3dc74 --- /dev/null +++ b/kipoiseq/dataloaders/prototype.py @@ -0,0 +1,291 @@ +import abc +from typing import List, Union, Iterable, Tuple, Dict, Optional, Type, Iterator + +from kipoi.data import SampleIterator +from kipoiseq.transforms import OneHot +from kipoiseq.extractors import GenericMultiIntervalSeqExtractor, BaseMultiIntervalFetcher, \ + GTFMultiIntervalFetcher, BaseExtractor, FastaStringExtractor, VariantSeqExtractor, \ + MultiSampleVCF +from kipoiseq.dataclasses import Interval, Variant +from kipoi.metadata import GenomicRanges + +import pandas as pd +import pyranges + +from kipoi_conda.dependencies import Dependencies +from kipoi.specs import Author + +__all__ = [ + 'GenericSingleSeqDataloader' +] +deps = Dependencies( + conda=[ + 'bioconda::pybedtools', + 'bioconda::pyfaidx', + 'bioconda::pyranges', + 'bioconda::biopython', + 'numpy', + 'pandas', + ], + pip=['kipoiseq'] +) +package_authors = [ + Author(name='Alexander Karollus', github='Karollus'), + Author(name='Florian R. Hölzlwimmer', github='hoeze') +] + +##### Utility Functions ##### + +def _get_interval_metadata( + key, + intervals : List[Interval], + interval_attrs +): + start = intervals[0].start # start of the encompassing genomic range + end = intervals[-1].end # end of the encompassing genomic range + # assemble the metadata + # as we get a list of intervals, we take metadata from first (maybe questionable) + # one could define a metadata aggregation function + metadata_dict = {k:intervals[0].attrs.get(k, '') for k in interval_attrs} + metadata_dict["ranges"] = GenomicRanges(intervals[0].chrom, + start, + end, + key, + intervals[0].strand) + return metadata_dict + +class IdentityTransform: + + def __call__(self, x): + return x + +##### Generic Sequence Dataloader ##### + +class GenericSingleSeqDataloader(SampleIterator): + + def __init__( + self, + interval_fetcher : BaseMultiIntervalFetcher, + reference_sequence_extractor : BaseExtractor, + sequence_transformer, # need a base class for this + interval_attrs = [] + ): + # Generic dataloader ingredients: + # A source of (lists of) intervals (called as iterator) + self.interval_fetcher = interval_fetcher.items() + # A sequence source (usually a fasta extractor) + self.reference_sequence_extractor = reference_sequence_extractor + # A sequence transformer object (could be identity transform) + self.sequence_transformer = sequence_transformer + # Metadata attributes + self.interval_attrs = interval_attrs + + def __next__(self): + # Generic single seq dataloader step: + # Fetch the next list of intervals from the interval source + key, intervals = next(self.interval_fetcher) + # extract and assemble the corresponding sequence + intervals.sort(key=lambda x: x.start) + seq_parts = [self.reference_sequence_extractor.extract(interval) for interval in intervals] + if self.reference_sequence_extractor.use_strand and intervals[0].strand == "-": + seq_parts.reverse() + seq = "".join(seq_parts) + # transform the sequence + transformed_seq = self.sequence_transformer(seq) + # get metadata + metadata_dict = _get_interval_metadata(key, intervals, self.interval_attrs) + # return all as dictionary + return { + "inputs": { + "seq": transformed_seq, + }, + "metadata": metadata_dict + } + + def __iter__(self): + return self + +##### Variant Dataloader ##### + +### Variant Fetcher ### + +class BaseVariantFetcher: + + def fetch_variants( + self, + interval : Interval + ) -> Iterator[Variant]: + raise NotImplementedError + +# Like many existing variant loading classes, this is not super efficient +# One could probably make it more efficient using a smart prefetching/batching strategy +class VCFVariantFetcher(BaseVariantFetcher): + + def __init__( + self, + vcf_file : str + ): + self.vcf = MultiSampleVCF(vcf_file) + + def fetch_variants( + self, + interval : Interval + ) -> Iterator[Variant]: + return self.vcf.fetch_variants(interval) + +### Insertion Strategy ### + +class VariantStrategy: + + def generate_alt_seq_templates( + self, + intervals_with_variants : List[Tuple[Interval, Iterator[Variant]]] + ) -> Iterator[List[Tuple[Interval, List[Variant]]]]: + raise NotImplementedError() + +class MultiVariantStrategy(VariantStrategy): + """ + This class will yield a template to build one alt-seq + which has all matching variants inserted + If no variants intersect the intervals, nothing is returned. + The class assumes that there are no "mutually exclusive" variants + """ + + def generate_alt_seq_templates( + self, + intervals_with_variants : List[Tuple[Interval, Iterator[Variant]]] + ) -> Iterator[List[Tuple[Interval, List[Variant]]]]: + alt_seq_intervals_with_variants = [] + has_variant = False + for interval, var_iter in intervals_with_variants: + variants = [var for var in var_iter] + if len(variants) > 0: + has_variant = True + alt_seq_intervals_with_variants.append((interval, variants)) + if has_variant: + yield alt_seq_intervals_with_variants + +class SingleVariantStrategy(VariantStrategy): + """ + This class will yield a template to build one alt-seq + for every variant that intersects the interval + """ + + def generate_alt_seq_templates( + self, + intervals_with_variants : List[Tuple[Interval, Iterator[Variant]]] + ) -> Iterator[List[Tuple[Interval, List[Variant]]]]: + for idx, (interval, var_iter) in enumerate(intervals_with_variants): + for variant in var_iter: + alt_seq_intervals_with_variant = [(interval, []) for + interval, _ in intervals_with_variants] + alt_seq_intervals_with_variant[idx] = (interval, [variant]) + yield alt_seq_intervals_with_variant + +### Variant Sequence Extractor ### + +class VariantSequenceExtractor: + + def __init__( + self, + reference_sequence_extractor : BaseExtractor, + anchor = 0, + fixed_len = True + ): + self.anchor = anchor + self.fixed_len = fixed_len + self.extractor = VariantSeqExtractor( + reference_sequence = reference_sequence_extractor + ) + + def extract( + self, + interval, + variants + ): + return self.extractor.extract(interval, variants, + anchor=self.anchor, fixed_len=self.fixed_len) + +### The Generic Dataloader ### + +class GenericVariantDataloader(SampleIterator): + + def __init__( + self, + interval_fetcher, + variant_source, + variant_insertion_strategy, + reference_sequence_extractor, + variant_sequence_extractor, + sequence_transformer, + interval_attrs, + anchor = 0, + fixed_len = False + ): + # Generic variant dataloader ingredients: + # A interval source (called as iterator) + self.interval_fetcher = interval_fetcher.items() + # A variant source + self.variant_source = variant_source + #A variant insertion strategy + self.variant_insertion_strategy = variant_insertion_strategy + # A sequence source + self.reference_sequence_extractor = reference_sequence_extractor + # A variant sequence extractor + self.variant_sequence_extractor = variant_sequence_extractor + # A sequence transformer object (could be identity transform) + self.sequence_transformer = sequence_transformer + # Build iterator object + self.generator = self._generator() + # interval attributes + self.interval_attrs = interval_attrs + + def _build_sequences(self, alt_seq_template): + ref_seq_parts = [self.reference_sequence_extractor.extract(interval) + for interval, variants in alt_seq_template] + alt_seq_parts = [] + for idx, (interval, variants) in enumerate(alt_seq_template): + if len(variants) > 0: + alt_seq_parts.append(self.variant_sequence_extractor.extract(interval, variants)) + else: + alt_seq_parts.append(ref_seq_parts[idx]) + if self.reference_sequence_extractor.use_strand and alt_seq_template[0][0].strand == "-": + ref_seq_parts.reverse() + alt_seq_parts.reverse() + ref_seq = "".join(ref_seq_parts) + alt_seq = "".join(alt_seq_parts) + return ref_seq, alt_seq + + def _generator(self): + # Iterate through the interval source + for key, intervals in self.interval_fetcher: + intervals.sort(key=lambda x: x.start) + # Fetch variants corresponding to the intervals + intervals_with_variants = [(interval, self.variant_source.fetch_variants(interval)) + for interval in intervals] + # Get an alternative sequence template from the variant insertion strategy + alt_seq_template_iterator = (self.variant_insertion_strategy + .generate_alt_seq_templates(intervals_with_variants)) + # Get templates and build ref_seq, alt_seq pairs to supply to model + for alt_seq_template in alt_seq_template_iterator: + # extract and assemble the corresponding ref and alt sequences + ref_seq, alt_seq = self._build_sequences(alt_seq_template) + # transform the sequences + transformed_ref_seq = self.sequence_transformer(ref_seq) + transformed_alt_seq = self.sequence_transformer(alt_seq) + # get metadata + metadata_dict = _get_interval_metadata(key, intervals, self.interval_attrs) + # yield all as dictionary + yield { + "inputs": { + "ref_seq": transformed_ref_seq, + "alt_seq": transformed_alt_seq, + }, + "metadata": metadata_dict + } + + def __next__(self): + return next(self.generator) + + def __iter__(self): + return self diff --git a/kipoiseq/extractors/multi_interval.py b/kipoiseq/extractors/multi_interval.py index 5a3bee5..7922416 100644 --- a/kipoiseq/extractors/multi_interval.py +++ b/kipoiseq/extractors/multi_interval.py @@ -125,7 +125,6 @@ def items(self): # for i in self.keys(): # yield self.get_intervals(i) - class GenericMultiIntervalSeqExtractor(BaseMultiIntervalSeqExtractor): def __init__(self, extractor: BaseExtractor, interval_fetcher: BaseMultiIntervalFetcher, use_strand=True): From e6f709c05feb9b75cdb8cdea23f4f1dc083182af Mon Sep 17 00:00:00 2001 From: Karollus Alexander Date: Thu, 4 Mar 2021 17:16:13 +0100 Subject: [PATCH 2/2] added drawbacks --- Prototyping.ipynb | 149 +++++++++++++++++++++++----------------------- 1 file changed, 75 insertions(+), 74 deletions(-) diff --git a/Prototyping.ipynb b/Prototyping.ipynb index dac815e..8e23c34 100644 --- a/Prototyping.ipynb +++ b/Prototyping.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "premier-texture", + "id": "above-craps", "metadata": {}, "source": [ "# Prototype for a more generic dataloader interface\n", @@ -13,7 +13,7 @@ { "cell_type": "code", "execution_count": 1, - "id": "lined-exercise", + "id": "gross-style", "metadata": {}, "outputs": [], "source": [ @@ -33,7 +33,7 @@ }, { "cell_type": "markdown", - "id": "vocational-reserve", + "id": "contrary-automation", "metadata": {}, "source": [ "Get some data to work with" @@ -42,7 +42,7 @@ { "cell_type": "code", "execution_count": 116, - "id": "wrong-horizontal", + "id": "unique-killer", "metadata": {}, "outputs": [ { @@ -73,7 +73,7 @@ }, { "cell_type": "markdown", - "id": "violent-chorus", + "id": "superb-station", "metadata": {}, "source": [ "## Introducing the GenericSingleSeqDataloader in prototype.py\n", @@ -91,7 +91,7 @@ }, { "cell_type": "markdown", - "id": "little-burns", + "id": "cooperative-pencil", "metadata": {}, "source": [ "With this, I believe it is very easy to build new dataloaders for most standard use-cases. All one needs to do is:\n", @@ -102,7 +102,7 @@ }, { "cell_type": "markdown", - "id": "naughty-baghdad", + "id": "phantom-schedule", "metadata": {}, "source": [ "## Building a gtf based TSS dataloader for Xpresso" @@ -110,7 +110,7 @@ }, { "cell_type": "markdown", - "id": "disabled-nation", + "id": "confidential-still", "metadata": {}, "source": [ "We can use this template to design a TSS dataloader for Xpresso:\n", @@ -121,7 +121,7 @@ { "cell_type": "code", "execution_count": 2, - "id": "based-riding", + "id": "jewish-sociology", "metadata": {}, "outputs": [], "source": [ @@ -156,7 +156,7 @@ }, { "cell_type": "markdown", - "id": "junior-cabin", + "id": "present-paragraph", "metadata": {}, "source": [ "Once we have that, we can easily build a TSS Dataloader simply by extending our GenericSingleSeqDataloader and building the fetcher, extractor and transform:" @@ -165,7 +165,7 @@ { "cell_type": "code", "execution_count": 3, - "id": "conventional-system", + "id": "developing-johns", "metadata": {}, "outputs": [], "source": [ @@ -216,7 +216,7 @@ }, { "cell_type": "markdown", - "id": "essential-london", + "id": "checked-suffering", "metadata": {}, "source": [ "Lets build it and test it" @@ -225,7 +225,7 @@ { "cell_type": "code", "execution_count": 4, - "id": "challenging-detection", + "id": "passive-count", "metadata": {}, "outputs": [ { @@ -255,7 +255,7 @@ { "cell_type": "code", "execution_count": 5, - "id": "documentary-auckland", + "id": "individual-growing", "metadata": {}, "outputs": [], "source": [ @@ -278,7 +278,7 @@ }, { "cell_type": "markdown", - "id": "welcome-compound", + "id": "secret-general", "metadata": {}, "source": [ "## Extracting PolyA sequences\n", @@ -289,7 +289,7 @@ { "cell_type": "code", "execution_count": 6, - "id": "intellectual-devon", + "id": "hispanic-founder", "metadata": {}, "outputs": [], "source": [ @@ -324,7 +324,7 @@ }, { "cell_type": "markdown", - "id": "jewish-washer", + "id": "bound-technology", "metadata": {}, "source": [ "All we need to do to achieve this, is to replace the TSSFinder with a PolyAFinder:" @@ -333,7 +333,7 @@ { "cell_type": "code", "execution_count": 7, - "id": "metric-ordering", + "id": "emotional-drama", "metadata": {}, "outputs": [], "source": [ @@ -384,7 +384,7 @@ { "cell_type": "code", "execution_count": 8, - "id": "affiliated-upset", + "id": "demanding-clear", "metadata": {}, "outputs": [ { @@ -414,7 +414,7 @@ { "cell_type": "code", "execution_count": 9, - "id": "sorted-complex", + "id": "purple-wholesale", "metadata": {}, "outputs": [ { @@ -438,7 +438,7 @@ }, { "cell_type": "markdown", - "id": "enormous-world", + "id": "pursuant-evolution", "metadata": {}, "source": [ "## Sourcing data from a bed file" @@ -446,7 +446,7 @@ }, { "cell_type": "markdown", - "id": "ecological-diversity", + "id": "classical-solid", "metadata": {}, "source": [ "Say we already have a bed file defining the areas of interest. We can easily build a dataloader for this." @@ -455,7 +455,7 @@ { "cell_type": "code", "execution_count": 10, - "id": "practical-tunisia", + "id": "mediterranean-kinase", "metadata": {}, "outputs": [], "source": [ @@ -480,7 +480,7 @@ { "cell_type": "code", "execution_count": 11, - "id": "silent-roman", + "id": "uniform-binary", "metadata": {}, "outputs": [], "source": [ @@ -496,7 +496,7 @@ }, { "cell_type": "markdown", - "id": "tired-deficit", + "id": "domestic-martin", "metadata": {}, "source": [ "All we need to do to achieve this is to use the BEDLoader rather than reading a GTF:" @@ -505,7 +505,7 @@ { "cell_type": "code", "execution_count": 12, - "id": "complete-algorithm", + "id": "greatest-cabin", "metadata": {}, "outputs": [], "source": [ @@ -549,7 +549,7 @@ { "cell_type": "code", "execution_count": 13, - "id": "theoretical-course", + "id": "opposed-synthetic", "metadata": {}, "outputs": [], "source": [ @@ -560,7 +560,7 @@ }, { "cell_type": "markdown", - "id": "loose-thickness", + "id": "editorial-sigma", "metadata": {}, "source": [ "Test equivalence to old StringSeqDL dataloader" @@ -569,7 +569,7 @@ { "cell_type": "code", "execution_count": 14, - "id": "arctic-increase", + "id": "apparent-making", "metadata": {}, "outputs": [], "source": [ @@ -583,7 +583,7 @@ }, { "cell_type": "markdown", - "id": "separated-threat", + "id": "descending-leisure", "metadata": {}, "source": [ "## Extracting all CDS sequences" @@ -591,7 +591,7 @@ }, { "cell_type": "markdown", - "id": "disabled-frame", + "id": "primary-tribune", "metadata": {}, "source": [ "Exploiting this design, we can very easily also make a dataloader that works on spliced sequences, e.g. coding sequences" @@ -600,7 +600,7 @@ { "cell_type": "code", "execution_count": 15, - "id": "willing-mercury", + "id": "defined-necklace", "metadata": {}, "outputs": [], "source": [ @@ -621,7 +621,7 @@ { "cell_type": "code", "execution_count": 16, - "id": "considerable-grant", + "id": "cooperative-consideration", "metadata": {}, "outputs": [], "source": [ @@ -668,7 +668,7 @@ }, { "cell_type": "markdown", - "id": "academic-quantity", + "id": "dietary-benchmark", "metadata": {}, "source": [ "Get some test data (big files)" @@ -677,7 +677,7 @@ { "cell_type": "code", "execution_count": 25, - "id": "sitting-testament", + "id": "automatic-mauritius", "metadata": {}, "outputs": [ { @@ -705,7 +705,7 @@ }, { "cell_type": "markdown", - "id": "beneficial-leisure", + "id": "united-preservation", "metadata": {}, "source": [ "We have to unzip the fasta, because the kipoiseq fasta extractor is **very** slow on gzipped files" @@ -714,7 +714,7 @@ { "cell_type": "code", "execution_count": 26, - "id": "detected-queens", + "id": "desirable-classics", "metadata": {}, "outputs": [], "source": [ @@ -723,7 +723,7 @@ }, { "cell_type": "markdown", - "id": "running-organic", + "id": "confident-place", "metadata": {}, "source": [ "PyRanges loading of big gtf is also not exactly lightning fast, but it offers a lot of flexibility in return" @@ -732,7 +732,7 @@ { "cell_type": "code", "execution_count": 17, - "id": "foster-matthew", + "id": "needed-france", "metadata": {}, "outputs": [], "source": [ @@ -743,7 +743,7 @@ { "cell_type": "code", "execution_count": 18, - "id": "blond-jonathan", + "id": "perfect-client", "metadata": {}, "outputs": [], "source": [ @@ -752,7 +752,7 @@ }, { "cell_type": "markdown", - "id": "dated-sponsorship", + "id": "matched-christian", "metadata": {}, "source": [ "We compare to the groud truth to see if it works:" @@ -761,7 +761,7 @@ { "cell_type": "code", "execution_count": 19, - "id": "every-chemistry", + "id": "stuffed-inspection", "metadata": {}, "outputs": [], "source": [ @@ -795,7 +795,7 @@ }, { "cell_type": "markdown", - "id": "graphic-montreal", + "id": "living-executive", "metadata": {}, "source": [ "# Variants" @@ -803,7 +803,7 @@ }, { "cell_type": "markdown", - "id": "constant-parent", + "id": "qualified-shelf", "metadata": {}, "source": [ "The biggest value of kipoiseq are the classes that handle extraction and insertion of variants. These are very useful and provide many useful functions, but difficult to use/maintain because:\n", @@ -815,7 +815,7 @@ { "cell_type": "code", "execution_count": 93, - "id": "suburban-morocco", + "id": "posted-drink", "metadata": {}, "outputs": [], "source": [ @@ -825,7 +825,7 @@ { "cell_type": "code", "execution_count": null, - "id": "proper-activity", + "id": "joint-procedure", "metadata": {}, "outputs": [], "source": [ @@ -836,7 +836,7 @@ { "cell_type": "code", "execution_count": null, - "id": "aggregate-blind", + "id": "stone-acquisition", "metadata": {}, "outputs": [], "source": [ @@ -845,7 +845,7 @@ }, { "cell_type": "markdown", - "id": "portable-cargo", + "id": "lesser-martial", "metadata": {}, "source": [ "## An alternative solution\n", @@ -858,7 +858,7 @@ }, { "cell_type": "markdown", - "id": "inside-beaver", + "id": "challenging-proxy", "metadata": {}, "source": [ "In the prototype.py, I have created a prototype for a GenericVariantDataloader, which takes these components to load variants. It is very similar in design to the GenericSingleSeqDataloader, but with the additional steps detailed above. The code is commented to explain the workflow" @@ -866,7 +866,7 @@ }, { "cell_type": "markdown", - "id": "thrown-claim", + "id": "recreational-potter", "metadata": {}, "source": [ "I think this template is easier to use/extend than the currently existing dataloaders. The currently existing code has a large hierarchy of specialized coupled objects (for example, there is a SingleVariantUTRDataLoader, which has a GenericSingleVariantMultiIntervalVCFSeqExtractor, which is BaseMultiIntervalVCFSeqExtractor (which in turn is a GenericMultiIntervalSeqExtractor, which in turn is a BaseMultiIntervalSeqExtractor) and it has a interval_fetcher and a variant seq extractor + a mixin that determines the insertion strategy and a \"matcher\" that as far as I can see is never used (see appendix)). Its a bit of a matryoshka doll that for me at least, took a considerable time to understand, and I think most new users will be scared away by this. It also leads to a proliferation of kind of useless classes. For example if I want to get variants from a hail table, I have to create a BaseMultiIntervalHailSeqExtractor and then use mixins to create the GenericSingleVariantMultiIntervalHailSeqExtractor and the GenericMultiVariantMultiIntervalHailSeqExtractor and so on.\n", @@ -877,7 +877,7 @@ { "cell_type": "code", "execution_count": 2, - "id": "wanted-mount", + "id": "annual-strengthening", "metadata": {}, "outputs": [], "source": [ @@ -907,7 +907,7 @@ { "cell_type": "code", "execution_count": 3, - "id": "related-delight", + "id": "joined-roman", "metadata": {}, "outputs": [], "source": [ @@ -973,7 +973,7 @@ }, { "cell_type": "markdown", - "id": "touched-receiver", + "id": "novel-lindsay", "metadata": {}, "source": [ "## Test" @@ -982,7 +982,7 @@ { "cell_type": "code", "execution_count": 4, - "id": "distributed-shipping", + "id": "adult-yeast", "metadata": {}, "outputs": [], "source": [ @@ -1003,7 +1003,7 @@ { "cell_type": "code", "execution_count": 5, - "id": "responsible-multimedia", + "id": "headed-hobby", "metadata": {}, "outputs": [], "source": [ @@ -1045,7 +1045,7 @@ { "cell_type": "code", "execution_count": 6, - "id": "floating-first", + "id": "contained-greene", "metadata": {}, "outputs": [], "source": [ @@ -1055,7 +1055,7 @@ { "cell_type": "code", "execution_count": 7, - "id": "false-organ", + "id": "earned-empty", "metadata": {}, "outputs": [], "source": [ @@ -1066,22 +1066,23 @@ }, { "cell_type": "markdown", - "id": "tired-following", + "id": "close-exhibition", "metadata": {}, "source": [ - "# Outlook\n", + "# Drawbacks and Remaining Issues\n", "\n", - "Following this design, one could create a reduced and maybe more intuitive structure of kipoiseq, structured along these 5 main components:\n", - "* interval_fetcher.py, which provide intervals from whatever source (base class: BaseMultiIntervalFetcher, maybe rename to BaseIntervalFetcher)\n", - "* variant_fetcher.py, which provide variants given intervals, from whatever source (base class: BaseVariantFetcher)\n", - "* sequence_extractor.py, which provides reference sequence given intervals, from whatever source (base class: BaseExtractor)\n", - "* variant_extractor.py, which provides a sequence with variants inserted, given intervals and variants (base class: VariantSeqExtractor)\n", - "* transforms.py, which provide transformations" + "Specific issues of this new design:\n", + "* The variant insertion strategy is a bit intimidating as it works with List[Tuple[Interval, List[Variant]]]. One could, of course, make a datatype for this. The main reason I didnt do it is (a) I couldnt figure out how to name it (except for ListOfIntervalVariantsTuples, which isn't really much better) and (b) because lists are nice to work with and I did not want to write all the boilerplate to make an object implement the list interface\n", + "* I dont really know what the anchor parameter of the VariantSeqExtractor really does, but if it actually makes a difference, this is a bit of a leaky abstraction since one needs to be quite familiar with class internals to set it correctly. \n", + "\n", + "More general issues:\n", + "* Generalizing to multi-input or, worse, multimodal models (e.g. ones that use sequence + experimental tracks) is not entirely straightforward\n", + "* There are complex variant events (e.g. a variant deleting a stop, causing a CDS to extend (on the spliced mRNA!) until the next stop is found), which are really hard to handle in a generic way" ] }, { "cell_type": "markdown", - "id": "velvet-classroom", + "id": "parental-retirement", "metadata": {}, "source": [ "# Appendix\n", @@ -1092,7 +1093,7 @@ { "cell_type": "code", "execution_count": 6, - "id": "daily-wallace", + "id": "descending-raise", "metadata": {}, "outputs": [], "source": [ @@ -1125,7 +1126,7 @@ { "cell_type": "code", "execution_count": 7, - "id": "imported-biography", + "id": "above-healing", "metadata": {}, "outputs": [], "source": [ @@ -1142,7 +1143,7 @@ { "cell_type": "code", "execution_count": 128, - "id": "deadly-latino", + "id": "fitted-pointer", "metadata": {}, "outputs": [], "source": [ @@ -1157,7 +1158,7 @@ { "cell_type": "code", "execution_count": 129, - "id": "violent-telephone", + "id": "dressed-plain", "metadata": {}, "outputs": [], "source": [ @@ -1174,7 +1175,7 @@ { "cell_type": "code", "execution_count": 130, - "id": "preceding-reflection", + "id": "stretch-boutique", "metadata": {}, "outputs": [], "source": [ @@ -1184,7 +1185,7 @@ { "cell_type": "code", "execution_count": 132, - "id": "caroline-boring", + "id": "regulated-meditation", "metadata": {}, "outputs": [], "source": [ @@ -1200,7 +1201,7 @@ { "cell_type": "code", "execution_count": 9, - "id": "prerequisite-facial", + "id": "african-slovenia", "metadata": {}, "outputs": [], "source": [ @@ -1210,7 +1211,7 @@ { "cell_type": "code", "execution_count": 14, - "id": "widespread-louisiana", + "id": "psychological-monday", "metadata": {}, "outputs": [ {