From 20d02ec88fd49add7e2e2dc8152eb836cabfee34 Mon Sep 17 00:00:00 2001 From: terrycojones Date: Sun, 31 Jan 2021 13:13:03 +0100 Subject: [PATCH 1/2] Removed FastaFaiIndex class. Fixed two skipped tests. --- CHANGELOG.md | 5 ++ dark/__init__.py | 2 +- dark/fasta.py | 51 ------------- setup.py | 1 - test/test_fasta.py | 152 +-------------------------------------- test/test_local_align.py | 12 ---- test/test_utils.py | 19 ++--- 7 files changed, 13 insertions(+), 229 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1ed4728b..7e7d2017 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +## 3.2.0 January 31, 2021 + +Dropped the `FastaFaiIndex` class. Use `SqliteIndex` instead. Fixed two +skipped Python 3 tests (for gzipped and bz2'd file reading). + ## 3.1.86 January 31, 2021 Fixed two failing tests due to changed DIAMOND (version 2.0.6) bitscore diff --git a/dark/__init__.py b/dark/__init__.py index d0a1f4f2..409d661c 100644 --- a/dark/__init__.py +++ b/dark/__init__.py @@ -7,4 +7,4 @@ # will not be found by the version() function in ../setup.py # # Remember to update ../CHANGELOG.md describing what's new in each version. -__version__ = '3.1.86' +__version__ = '3.1.87' diff --git a/dark/fasta.py b/dark/fasta.py index 4f48adf7..4e29ccfb 100644 --- a/dark/fasta.py +++ b/dark/fasta.py @@ -4,7 +4,6 @@ import os from Bio import SeqIO, bgzf -from pyfaidx import Fasta from dark.reads import Reads, DNARead from dark.utils import asHandle @@ -131,56 +130,6 @@ def iter(self): count += 1 -class FastaFaiReads(Reads): - """ - Subclass of L{dark.reads.Reads} that provides dictionary-like access to - FASTA reads using the pyfaidx module. - - @param filename: The C{str} name of a file containing FASTA. This may be an - uncompressed file or one that has been compressed with bgzip (from - the samtools suite). - # The pyfaidx module will create an identically named FASTA index file - # with an additional ".fai" extension. An .fai file can also be made - # (if desired) with 'samtools faidx' or the 'faidx' command installed - # by pyfaidx. If you use the latter, you can preserve full sequence - # titles via '-e "lambda x: x". - @param readClass: The class of read that should be yielded by iter. - @param upperCase: If C{True}, read sequences will be converted to upper - case. - """ - def __init__(self, filename, readClass=DNARead, upperCase=False): - self._filename = filename - self._fasta = Fasta(filename) - self._readClass = readClass - # TODO: It would be better if upperCase were an argument that could - # be passed to Reads.__init__ and that could do the uppercasing in - # its add method (as opposed to using it below in our iter method). - # In that case, in the iter of this class we'd call self.add on - # each of the sequences coming from self._file. Or, if we'd already - # read the file we'd return Reads.iter(self) to re-iterate over the - # sequences already added from the file. - self._upperCase = upperCase - if PY3: - super().__init__() - else: - Reads.__init__(self) - - def iter(self): - """ - Iterate over the sequences in the files in self.files_, yielding each - as an instance of the desired read class. - """ - if self._upperCase: - for id_ in self._fasta: - yield self._readClass(id_, str(self._fasta[id_]).upper()) - else: - for id_ in self._fasta: - yield self._readClass(id_, str(self._fasta[id_])) - - def __getitem__(self, id_): - return self._readClass(str(id_), str(self._fasta[id_])) - - def combineReads(filename, sequences, readClass=DNARead, upperCase=False, idPrefix='command-line-read-'): """ diff --git a/setup.py b/setup.py index 801eeb94..18499751 100644 --- a/setup.py +++ b/setup.py @@ -141,7 +141,6 @@ def version(): 'mysql-connector-python==8.0.11', 'numpy>=1.14.2', 'pysam>=0.15.2', - 'pyfaidx>=0.4.8.4', 'pyzmq>=14.3.1', 'requests>=2.18.4', 'cachetools>=3.1.0', diff --git a/test/test_fasta.py b/test/test_fasta.py index 69752f43..d0d83856 100644 --- a/test/test_fasta.py +++ b/test/test_fasta.py @@ -1,12 +1,10 @@ -import sys +import os from six.moves import builtins from six import assertRaisesRegex from io import BytesIO -import os +from unittest import TestCase -from unittest import TestCase, skipUnless from Bio import SeqIO, bgzf -from contextlib import contextmanager try: from unittest.mock import patch @@ -17,11 +15,9 @@ from dark.reads import Read, AARead, DNARead, RNARead, Reads from dark.fasta import (dedupFasta, dePrefixAndSuffixFasta, fastaSubtract, - FastaReads, FastaFaiReads, combineReads, SqliteIndex) + FastaReads, combineReads, SqliteIndex) from dark.utils import StringIO -canTestPyfaidx = sys.platform != 'linux' - class FastaDeDup(TestCase): """ @@ -453,148 +449,6 @@ def sideEffect(self, filename, **kwargs): list(reads)) -@skipUnless(canTestPyfaidx, 'pyfaidx tests are skipped on Linux') -class TestFastaFaiReads(TestCase): - """ - Tests for the L{dark.fasta.FastaFaiReads} class. - """ - def testMissingKey(self): - """ - If a non-existent sequence id is looked up, a KeyError must be raised. - """ - - pyfaidxIndex = StringIO() - - class Open(object): - def __init__(self, test, manager): - self.test = test - self.manager = manager - self.count = 0 - - def sideEffect(self, filename, *args, **kwargs): - if self.count == 0: - self.test.assertEqual('filename.fasta', filename) - self.count += 1 - return BytesIO(b'>id1\nACTG\n') - elif self.count == 1: - self.test.assertEqual('filename.fasta', filename) - self.count += 1 - return StringIO('>id1\nACTG\n') - elif self.count == 2: - self.count += 1 - return self.manager - elif self.count == 3: - self.count += 1 - return StringIO(pyfaidxIndex.getvalue()) - else: - self.test.fail( - 'Open called too many times. Filename: %r, Args: %r, ' - 'Keyword args: %r.' % (filename, args, kwargs)) - - @contextmanager - def manager(): - yield pyfaidxIndex - - sideEffect = Open(self, manager()).sideEffect - with patch.object(builtins, 'open') as mockMethod: - mockMethod.side_effect = sideEffect - reads = FastaFaiReads('filename.fasta') - error = "^'id2 not in filename\\.fasta\\.'" - assertRaisesRegex(self, KeyError, error, reads.__getitem__, 'id2') - - def testOneRead(self): - """ - It must be possible to access a FASTA file with one read like a dict. - """ - - pyfaidxIndex = StringIO() - - class Open(object): - def __init__(self, test, manager): - self.test = test - self.manager = manager - self.count = 0 - - def sideEffect(self, filename, *args, **kwargs): - if self.count == 0: - self.test.assertEqual('filename.fasta', filename) - self.count += 1 - return BytesIO(b'>id1\nACTG\n') - elif self.count == 1: - self.test.assertEqual('filename.fasta', filename) - self.count += 1 - return StringIO('>id1\nACTG\n') - elif self.count == 2: - self.count += 1 - return self.manager - elif self.count == 3: - self.count += 1 - return StringIO(pyfaidxIndex.getvalue()) - else: - self.test.fail( - 'Open called too many times. Filename: %r, Args: %r, ' - 'Keyword args: %r.' % (filename, args, kwargs)) - - @contextmanager - def manager(): - yield pyfaidxIndex - - sideEffect = Open(self, manager()).sideEffect - with patch.object(builtins, 'open') as mockMethod: - mockMethod.side_effect = sideEffect - reads = FastaFaiReads('filename.fasta') - self.assertEqual(DNARead('id1', 'ACTG'), reads['id1']) - # Check that the fai index was built correctly. - self.assertEqual(pyfaidxIndex.getvalue(), 'id1\t4\t5\t4\t5\n') - - def testTwoReads(self): - """ - It must be possible to access a FASTA file with two reads like a dict. - """ - - pyfaidxIndex = StringIO() - - class Open(object): - def __init__(self, test, manager): - self.test = test - self.manager = manager - self.count = 0 - - def sideEffect(self, filename, *args, **kwargs): - if self.count == 0: - self.test.assertEqual('filename.fasta', filename) - self.count += 1 - return BytesIO(b'>id1\nACTG\n>id2\nAACCTTGG\n') - elif self.count == 1: - self.test.assertEqual('filename.fasta', filename) - self.count += 1 - return StringIO('>id1\nACTG\n>id2\nAACCTTGG\n') - elif self.count == 2: - self.count += 1 - return self.manager - elif self.count == 3: - self.count += 1 - return StringIO(pyfaidxIndex.getvalue()) - else: - self.test.fail( - 'Open called too many times. Filename: %r, Args: %r, ' - 'Keyword args: %r.' % (filename, args, kwargs)) - - @contextmanager - def manager(): - yield pyfaidxIndex - - sideEffect = Open(self, manager()).sideEffect - with patch.object(builtins, 'open') as mockMethod: - mockMethod.side_effect = sideEffect - reads = FastaFaiReads('filename.fasta') - self.assertEqual(DNARead('id1', 'ACTG'), reads['id1']) - self.assertEqual(DNARead('id2', 'AACCTTGG'), reads['id2']) - # Check that the fai index was built correctly. - self.assertEqual(pyfaidxIndex.getvalue(), - 'id1\t4\t5\t4\t5\nid2\t8\t15\t8\t9\n') - - class TestCombineReads(TestCase): """ Tests for the L{dark.fasta.combineReads} function. diff --git a/test/test_local_align.py b/test/test_local_align.py index 736c2ffb..6896891b 100644 --- a/test/test_local_align.py +++ b/test/test_local_align.py @@ -11,18 +11,6 @@ class TestLocalAlign(TestCase): With match +1, mismatch -1, gap open -1, gap extend -1 and gap extend decay 0.0. """ - - def testNonDNAString(self): - """ - If the sequences do not consist of nucleotides, an exception - must be raised. - """ - self.skipTest('Non-DNA raising of ValueError is disabled.') - seq1 = Read('seq1', 'xxx') - seq2 = Read('seq2', 'yyy') - six.assertRaisesRegex(self, ValueError, 'Invalid DNA nucleotide: "X"', - LocalAlignment, seq1, seq2) - def testPositiveMismatch(self): """ If the mismatch value passed is positive, an exception diff --git a/test/test_utils.py b/test/test_utils.py index 35d687af..b7e2d374 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1,4 +1,3 @@ -import six import bz2 import gzip from six.moves import builtins @@ -13,6 +12,8 @@ from .mocking import mockOpen +from io import BytesIO + from dark.utils import ( numericallySortFilenames, median, asHandle, parseRangeString, parseRangeExpression, pct, StringIO, baseCountsToStr, nucleotidesToStr, @@ -150,13 +151,7 @@ def testBZ2(self): When a string '*.bz2' filename is passed to asHandle, it must be possible to read the correct data from the fp that is returned. """ - if six.PY3: - self.skipTest('Mocking bz2.BZ2File disabled under Python 3') - - # This test should be better. It should actually create some bz2 - # compressed data and make sure that it's decompressed - # properly. But Python mocking makes me so confused... - result = StringIO('xxx') + result = BytesIO(b'xxx') with patch.object(bz2, 'BZ2File') as mockMethod: mockMethod.return_value = result @@ -168,13 +163,7 @@ def testGzip(self): When a string '*.gz' filename is passed to asHandle, it must be possible to read the correct data from the fp that is returned. """ - if six.PY3: - self.skipTest('Mocking gzip.GzipFile disabled under Python 3') - - # This test should be better. It should actually create some gzip - # compressed data and make sure that it's decompressed - # properly. But Python mocking makes me so confused... - result = StringIO('xxx') + result = BytesIO(b'xxx') with patch.object(gzip, 'GzipFile') as mockMethod: mockMethod.return_value = result From 817b667c6b62a912ec93a0a1aea21a9268e7b1ca Mon Sep 17 00:00:00 2001 From: terrycojones Date: Sun, 31 Jan 2021 13:36:17 +0100 Subject: [PATCH 2/2] Fixed some deprecated BioPython warnings. --- bin/trim-primers.py | 4 +--- test/test_dna.py | 12 ++++++++---- test/test_sequence.py | 38 ++++++++++++++++++-------------------- 3 files changed, 27 insertions(+), 27 deletions(-) diff --git a/bin/trim-primers.py b/bin/trim-primers.py index d661d275..4408d767 100755 --- a/bin/trim-primers.py +++ b/bin/trim-primers.py @@ -5,7 +5,6 @@ import sys from Bio import SeqIO from Bio.Seq import Seq -from Bio.Alphabet import IUPAC from dark.sequence import findPrimerBidiLimits @@ -53,5 +52,4 @@ def trimPrimers(primer, verbose): args = parser.parse_args() - trimPrimers(Seq(args.primer.upper(), IUPAC.unambiguous_dna), - args.verbose) + trimPrimers(Seq(args.primer.upper()), args.verbose) diff --git a/test/test_dna.py b/test/test_dna.py index f7b16c9e..c689d90c 100644 --- a/test/test_dna.py +++ b/test/test_dna.py @@ -1,13 +1,17 @@ import six from unittest import TestCase -from Bio.Alphabet.IUPAC import IUPACAmbiguousDNA - from dark.dna import ( AMBIGUOUS, BASES_TO_AMBIGUOUS, compareDNAReads, matchToString, findKozakConsensus, FloatBaseCounts, sequenceToRegex) from dark.reads import Read, DNARead, DNAKozakRead +# The following are the letters that used to be on +# from Bio.Alphabet.IUPAC import IUPACAmbiguousDNA +# IUPACAmbiguousDNA.letters +# But Bio.Alphabet is now deprecated and will be removed. +AMBIGUOUS_DNA_LETTERS = 'GATCRYWSMKHBVDN' + class TestAmbiguousLetters(TestCase): """ @@ -17,7 +21,7 @@ def testExpectedAmbiguousLetters(self): """ The ambiguous DNA letters must match those given by IUPAC. """ - self.assertEqual(sorted(IUPACAmbiguousDNA.letters), sorted(AMBIGUOUS)) + self.assertEqual(sorted(AMBIGUOUS_DNA_LETTERS), sorted(AMBIGUOUS)) def testExpectedLengthOne(self): """ @@ -31,7 +35,7 @@ def testExpectedLengthsGreaterThanOne(self): The ambiguous DNA letters must be in sets of size greater than one and less than 5. """ - for base in set(IUPACAmbiguousDNA.letters) - set('ACGT'): + for base in set(AMBIGUOUS_DNA_LETTERS) - set('ACGT'): self.assertTrue(5 > len(AMBIGUOUS[base]) > 1) def testAmbiguousLettersAreAllACGT(self): diff --git a/test/test_sequence.py b/test/test_sequence.py index 508b3c0c..e06c58e2 100644 --- a/test/test_sequence.py +++ b/test/test_sequence.py @@ -1,7 +1,6 @@ from unittest import TestCase from dark.sequence import findPrimer, findPrimerBidi, findPrimerBidiLimits from Bio.Seq import Seq -from Bio.Alphabet import IUPAC class TestFindPrimer(TestCase): @@ -13,7 +12,7 @@ def testNotFound(self): """ If a primer is not found, the empty list must be returned. """ - seq = Seq('ACGT', IUPAC.unambiguous_dna) + seq = Seq('ACGT') self.assertEqual([], findPrimer('BLAH', seq)) def testFoundAtStart(self): @@ -21,7 +20,7 @@ def testFoundAtStart(self): If a primer is found at the start of a sequence, a list containing 0 must be returned. """ - seq = Seq('ACGT', IUPAC.unambiguous_dna) + seq = Seq('ACGT') self.assertEqual([0], findPrimer('AC', seq)) def testFoundAtEnd(self): @@ -29,7 +28,7 @@ def testFoundAtEnd(self): If a primer is found at the end of a sequence, the correct value must be returned. """ - seq = Seq('ACGT', IUPAC.unambiguous_dna) + seq = Seq('ACGT') self.assertEqual([2], findPrimer('GT', seq)) def testFoundMultiple(self): @@ -37,7 +36,7 @@ def testFoundMultiple(self): If a primer is found multiple times, the correct value must be returned. """ - seq = Seq('ACGTACGT', IUPAC.unambiguous_dna) + seq = Seq('ACGTACGT') self.assertEqual([0, 4], findPrimer('ACG', seq)) def testOverlapping(self): @@ -45,7 +44,7 @@ def testOverlapping(self): If a primer is present twice but is overlapping, only the first instance should be returned. """ - seq = Seq('GAAA', IUPAC.unambiguous_dna) + seq = Seq('GAAA') self.assertEqual([1], findPrimer('AA', seq)) @@ -58,7 +57,7 @@ def testNotFound(self): """ If a primer is not found, empty lists must be returned. """ - seq = Seq('ACGT', IUPAC.unambiguous_dna) + seq = Seq('ACGT') self.assertEqual(([], []), findPrimerBidi('BLAH', seq)) def testFoundStartEnd(self): @@ -67,7 +66,7 @@ def testFoundStartEnd(self): the forward sequence, end of the reverse complement), the correct value must be returned. """ - seq = Seq('ACGT', IUPAC.unambiguous_dna) + seq = Seq('ACGT') self.assertEqual(([0], [2]), findPrimerBidi('AC', seq)) def testFoundEndStart(self): @@ -76,7 +75,7 @@ def testFoundEndStart(self): the forward sequence, start of the reverse complement), the correct value must be returned. """ - seq = Seq('ACGT', IUPAC.unambiguous_dna) + seq = Seq('ACGT') self.assertEqual(([2], [0]), findPrimerBidi('GT', seq)) def testFoundMultiple(self): @@ -84,7 +83,7 @@ def testFoundMultiple(self): If a primer is found multiple times, the correct value must be returned. """ - seq = Seq('ACGTACGT', IUPAC.unambiguous_dna) + seq = Seq('ACGTACGT') self.assertEqual(([0, 4], [1, 5]), findPrimerBidi('ACG', seq)) def testOverlappingForwards(self): @@ -92,7 +91,7 @@ def testOverlappingForwards(self): If a primer is present twice forwards but is overlapping, only the first instance should be returned. """ - seq = Seq('GAAA', IUPAC.unambiguous_dna) + seq = Seq('GAAA') self.assertEqual(([1], []), findPrimerBidi('AA', seq)) def testOverlappingBackwards(self): @@ -100,7 +99,7 @@ def testOverlappingBackwards(self): If a primer is present twice backwards but is overlapping, only the first instance should be returned. """ - seq = Seq('GTTT', IUPAC.unambiguous_dna) + seq = Seq('GTTT') self.assertEqual(([], [1]), findPrimerBidi('AA', seq)) @@ -114,7 +113,7 @@ def testNotFound(self): If a primer is not found, the returned offsets must include the whole sequence. """ - seq = Seq('ACGT', IUPAC.unambiguous_dna) + seq = Seq('ACGT') self.assertEqual((0, 4), findPrimerBidiLimits('BLAH', seq)) def testFoundStartEnd(self): @@ -123,7 +122,7 @@ def testFoundStartEnd(self): the forward sequence, end of the reverse complement), the correct value must be returned. """ - seq = Seq('ACGT', IUPAC.unambiguous_dna) + seq = Seq('ACGT') self.assertEqual((2, 2), findPrimerBidiLimits('AC', seq)) def testFoundEndStart(self): @@ -132,7 +131,7 @@ def testFoundEndStart(self): the forward sequence, start of the reverse complement), the correct value must be returned. """ - seq = Seq('ACGT', IUPAC.unambiguous_dna) + seq = Seq('ACGT') self.assertEqual((4, 4), findPrimerBidiLimits('GT', seq)) def testFoundMultiple(self): @@ -140,7 +139,7 @@ def testFoundMultiple(self): If a primer is found multiple times, the correct value must be returned. """ - seq = Seq('ACGTACGT', IUPAC.unambiguous_dna) + seq = Seq('ACGTACGT') self.assertEqual((7, 8), findPrimerBidiLimits('ACG', seq)) def testOverlappingForwards(self): @@ -148,7 +147,7 @@ def testOverlappingForwards(self): If a primer is present twice forwards but is overlapping, only the first instance should be returned. """ - seq = Seq('GAAA', IUPAC.unambiguous_dna) + seq = Seq('GAAA') self.assertEqual((3, 4), findPrimerBidiLimits('AA', seq)) def testOverlappingBackwards(self): @@ -156,7 +155,7 @@ def testOverlappingBackwards(self): If a primer is present twice backwards but is overlapping, only the first instance should be returned. """ - seq = Seq('GTTT', IUPAC.unambiguous_dna) + seq = Seq('GTTT') self.assertEqual((0, 1), findPrimerBidiLimits('AA', seq)) def testLonger(self): @@ -166,6 +165,5 @@ def testLonger(self): seq = Seq('AAAAAAAAAA' 'GGGGGGGGGG' 'AAAAAAAAAA' - 'AAAAAAAAAA', - IUPAC.unambiguous_dna) + 'AAAAAAAAAA') self.assertEqual((20, 40), findPrimerBidiLimits('GGGGGGGGGG', seq))