From 20d02ec88fd49add7e2e2dc8152eb836cabfee34 Mon Sep 17 00:00:00 2001
From: terrycojones <terry@jon.es>
Date: Sun, 31 Jan 2021 13:13:03 +0100
Subject: [PATCH 1/2] Removed FastaFaiReads class. Fixed two skipped tests.

---
 CHANGELOG.md             |   5 ++
 dark/__init__.py         |   2 +-
 dark/fasta.py            |  51 -------------
 setup.py                 |   1 -
 test/test_fasta.py       | 152 +--------------------------------------
 test/test_local_align.py |  12 ----
 test/test_utils.py       |  19 ++---
 7 files changed, 13 insertions(+), 229 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1ed4728b..7e7d2017 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,8 @@
+## 3.1.87 January 31, 2021
+
+Dropped the `FastaFaiReads` class. Use `SqliteIndex` instead. Fixed two
+skipped Python 3 tests (for gzipped and bz2'd file reading).
+
 ## 3.1.86 January 31, 2021
 
 Fixed two failing tests due to changed DIAMOND (version 2.0.6) bitscore
diff --git a/dark/__init__.py b/dark/__init__.py
index d0a1f4f2..409d661c 100644
--- a/dark/__init__.py
+++ b/dark/__init__.py
@@ -7,4 +7,4 @@
 # will not be found by the version() function in ../setup.py
 #
 # Remember to update ../CHANGELOG.md describing what's new in each version.
-__version__ = '3.1.86'
+__version__ = '3.1.87'
diff --git a/dark/fasta.py b/dark/fasta.py
index 4f48adf7..4e29ccfb 100644
--- a/dark/fasta.py
+++ b/dark/fasta.py
@@ -4,7 +4,6 @@
 import os
 
 from Bio import SeqIO, bgzf
-from pyfaidx import Fasta
 
 from dark.reads import Reads, DNARead
 from dark.utils import asHandle
@@ -131,56 +130,6 @@ def iter(self):
                         count += 1
 
 
-class FastaFaiReads(Reads):
-    """
-    Subclass of L{dark.reads.Reads} that provides dictionary-like access to
-    FASTA reads using the pyfaidx module.
-
-    @param filename: The C{str} name of a file containing FASTA. This may be an
-        uncompressed file or one that has been compressed with bgzip (from
-        the samtools suite).
-        # The pyfaidx module will create an identically named FASTA index file
-        # with an additional ".fai" extension. An .fai file can also be made
-        # (if desired) with 'samtools faidx' or the 'faidx' command installed
-        # by pyfaidx. If you use the latter, you can preserve full sequence
-        # titles via '-e "lambda x: x".
-    @param readClass: The class of read that should be yielded by iter.
-    @param upperCase: If C{True}, read sequences will be converted to upper
-        case.
-    """
-    def __init__(self, filename, readClass=DNARead, upperCase=False):
-        self._filename = filename
-        self._fasta = Fasta(filename)
-        self._readClass = readClass
-        # TODO: It would be better if upperCase were an argument that could
-        # be passed to Reads.__init__ and that could do the uppercasing in
-        # its add method (as opposed to using it below in our iter method).
-        # In that case, in the iter of this class we'd call self.add on
-        # each of the sequences coming from self._file. Or, if we'd already
-        # read the file we'd return Reads.iter(self) to re-iterate over the
-        # sequences already added from the file.
-        self._upperCase = upperCase
-        if PY3:
-            super().__init__()
-        else:
-            Reads.__init__(self)
-
-    def iter(self):
-        """
-        Iterate over the sequences in the files in self.files_, yielding each
-        as an instance of the desired read class.
-        """
-        if self._upperCase:
-            for id_ in self._fasta:
-                yield self._readClass(id_, str(self._fasta[id_]).upper())
-        else:
-            for id_ in self._fasta:
-                yield self._readClass(id_, str(self._fasta[id_]))
-
-    def __getitem__(self, id_):
-        return self._readClass(str(id_), str(self._fasta[id_]))
-
-
 def combineReads(filename, sequences, readClass=DNARead,
                  upperCase=False, idPrefix='command-line-read-'):
     """
diff --git a/setup.py b/setup.py
index 801eeb94..18499751 100644
--- a/setup.py
+++ b/setup.py
@@ -141,7 +141,6 @@ def version():
           'mysql-connector-python==8.0.11',
           'numpy>=1.14.2',
           'pysam>=0.15.2',
-          'pyfaidx>=0.4.8.4',
           'pyzmq>=14.3.1',
           'requests>=2.18.4',
           'cachetools>=3.1.0',
diff --git a/test/test_fasta.py b/test/test_fasta.py
index 69752f43..d0d83856 100644
--- a/test/test_fasta.py
+++ b/test/test_fasta.py
@@ -1,12 +1,10 @@
-import sys
+import os
 from six.moves import builtins
 from six import assertRaisesRegex
 from io import BytesIO
-import os
+from unittest import TestCase
 
-from unittest import TestCase, skipUnless
 from Bio import SeqIO, bgzf
-from contextlib import contextmanager
 
 try:
     from unittest.mock import patch
@@ -17,11 +15,9 @@
 
 from dark.reads import Read, AARead, DNARead, RNARead, Reads
 from dark.fasta import (dedupFasta, dePrefixAndSuffixFasta, fastaSubtract,
-                        FastaReads, FastaFaiReads, combineReads, SqliteIndex)
+                        FastaReads, combineReads, SqliteIndex)
 from dark.utils import StringIO
 
-canTestPyfaidx = sys.platform != 'linux'
-
 
 class FastaDeDup(TestCase):
     """
@@ -453,148 +449,6 @@ def sideEffect(self, filename, **kwargs):
                 list(reads))
 
 
-@skipUnless(canTestPyfaidx, 'pyfaidx tests are skipped on Linux')
-class TestFastaFaiReads(TestCase):
-    """
-    Tests for the L{dark.fasta.FastaFaiReads} class.
-    """
-    def testMissingKey(self):
-        """
-        If a non-existent sequence id is looked up, a KeyError must be raised.
-        """
-
-        pyfaidxIndex = StringIO()
-
-        class Open(object):
-            def __init__(self, test, manager):
-                self.test = test
-                self.manager = manager
-                self.count = 0
-
-            def sideEffect(self, filename, *args, **kwargs):
-                if self.count == 0:
-                    self.test.assertEqual('filename.fasta', filename)
-                    self.count += 1
-                    return BytesIO(b'>id1\nACTG\n')
-                elif self.count == 1:
-                    self.test.assertEqual('filename.fasta', filename)
-                    self.count += 1
-                    return StringIO('>id1\nACTG\n')
-                elif self.count == 2:
-                    self.count += 1
-                    return self.manager
-                elif self.count == 3:
-                    self.count += 1
-                    return StringIO(pyfaidxIndex.getvalue())
-                else:
-                    self.test.fail(
-                        'Open called too many times. Filename: %r, Args: %r, '
-                        'Keyword args: %r.' % (filename, args, kwargs))
-
-        @contextmanager
-        def manager():
-            yield pyfaidxIndex
-
-        sideEffect = Open(self, manager()).sideEffect
-        with patch.object(builtins, 'open') as mockMethod:
-            mockMethod.side_effect = sideEffect
-            reads = FastaFaiReads('filename.fasta')
-            error = "^'id2 not in filename\\.fasta\\.'"
-            assertRaisesRegex(self, KeyError, error, reads.__getitem__, 'id2')
-
-    def testOneRead(self):
-        """
-        It must be possible to access a FASTA file with one read like a dict.
-        """
-
-        pyfaidxIndex = StringIO()
-
-        class Open(object):
-            def __init__(self, test, manager):
-                self.test = test
-                self.manager = manager
-                self.count = 0
-
-            def sideEffect(self, filename, *args, **kwargs):
-                if self.count == 0:
-                    self.test.assertEqual('filename.fasta', filename)
-                    self.count += 1
-                    return BytesIO(b'>id1\nACTG\n')
-                elif self.count == 1:
-                    self.test.assertEqual('filename.fasta', filename)
-                    self.count += 1
-                    return StringIO('>id1\nACTG\n')
-                elif self.count == 2:
-                    self.count += 1
-                    return self.manager
-                elif self.count == 3:
-                    self.count += 1
-                    return StringIO(pyfaidxIndex.getvalue())
-                else:
-                    self.test.fail(
-                        'Open called too many times. Filename: %r, Args: %r, '
-                        'Keyword args: %r.' % (filename, args, kwargs))
-
-        @contextmanager
-        def manager():
-            yield pyfaidxIndex
-
-        sideEffect = Open(self, manager()).sideEffect
-        with patch.object(builtins, 'open') as mockMethod:
-            mockMethod.side_effect = sideEffect
-            reads = FastaFaiReads('filename.fasta')
-            self.assertEqual(DNARead('id1', 'ACTG'), reads['id1'])
-            # Check that the fai index was built correctly.
-            self.assertEqual(pyfaidxIndex.getvalue(), 'id1\t4\t5\t4\t5\n')
-
-    def testTwoReads(self):
-        """
-        It must be possible to access a FASTA file with two reads like a dict.
-        """
-
-        pyfaidxIndex = StringIO()
-
-        class Open(object):
-            def __init__(self, test, manager):
-                self.test = test
-                self.manager = manager
-                self.count = 0
-
-            def sideEffect(self, filename, *args, **kwargs):
-                if self.count == 0:
-                    self.test.assertEqual('filename.fasta', filename)
-                    self.count += 1
-                    return BytesIO(b'>id1\nACTG\n>id2\nAACCTTGG\n')
-                elif self.count == 1:
-                    self.test.assertEqual('filename.fasta', filename)
-                    self.count += 1
-                    return StringIO('>id1\nACTG\n>id2\nAACCTTGG\n')
-                elif self.count == 2:
-                    self.count += 1
-                    return self.manager
-                elif self.count == 3:
-                    self.count += 1
-                    return StringIO(pyfaidxIndex.getvalue())
-                else:
-                    self.test.fail(
-                        'Open called too many times. Filename: %r, Args: %r, '
-                        'Keyword args: %r.' % (filename, args, kwargs))
-
-        @contextmanager
-        def manager():
-            yield pyfaidxIndex
-
-        sideEffect = Open(self, manager()).sideEffect
-        with patch.object(builtins, 'open') as mockMethod:
-            mockMethod.side_effect = sideEffect
-            reads = FastaFaiReads('filename.fasta')
-            self.assertEqual(DNARead('id1', 'ACTG'), reads['id1'])
-            self.assertEqual(DNARead('id2', 'AACCTTGG'), reads['id2'])
-            # Check that the fai index was built correctly.
-            self.assertEqual(pyfaidxIndex.getvalue(),
-                             'id1\t4\t5\t4\t5\nid2\t8\t15\t8\t9\n')
-
-
 class TestCombineReads(TestCase):
     """
     Tests for the L{dark.fasta.combineReads} function.
diff --git a/test/test_local_align.py b/test/test_local_align.py
index 736c2ffb..6896891b 100644
--- a/test/test_local_align.py
+++ b/test/test_local_align.py
@@ -11,18 +11,6 @@ class TestLocalAlign(TestCase):
     With match +1, mismatch -1, gap open -1, gap extend -1 and
         gap extend decay 0.0.
     """
-
-    def testNonDNAString(self):
-        """
-        If the sequences do not consist of nucleotides, an exception
-        must be raised.
-        """
-        self.skipTest('Non-DNA raising of ValueError is disabled.')
-        seq1 = Read('seq1', 'xxx')
-        seq2 = Read('seq2', 'yyy')
-        six.assertRaisesRegex(self, ValueError, 'Invalid DNA nucleotide: "X"',
-                              LocalAlignment, seq1, seq2)
-
     def testPositiveMismatch(self):
         """
         If the mismatch value passed is positive, an exception
diff --git a/test/test_utils.py b/test/test_utils.py
index 35d687af..b7e2d374 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -1,4 +1,3 @@
-import six
 import bz2
 import gzip
 from six.moves import builtins
@@ -13,6 +12,8 @@
 
 from .mocking import mockOpen
 
+from io import BytesIO
+
 from dark.utils import (
     numericallySortFilenames, median, asHandle, parseRangeString,
     parseRangeExpression, pct, StringIO, baseCountsToStr, nucleotidesToStr,
@@ -150,13 +151,7 @@ def testBZ2(self):
         When a string '*.bz2' filename is passed to asHandle, it must be
         possible to read the correct data from the fp that is returned.
         """
-        if six.PY3:
-            self.skipTest('Mocking bz2.BZ2File disabled under Python 3')
-
-        # This test should be better. It should actually create some bz2
-        # compressed data and make sure that it's decompressed
-        # properly. But Python mocking makes me so confused...
-        result = StringIO('xxx')
+        result = BytesIO(b'xxx')
 
         with patch.object(bz2, 'BZ2File') as mockMethod:
             mockMethod.return_value = result
@@ -168,13 +163,7 @@ def testGzip(self):
         When a string '*.gz' filename is passed to asHandle, it must be
         possible to read the correct data from the fp that is returned.
         """
-        if six.PY3:
-            self.skipTest('Mocking gzip.GzipFile disabled under Python 3')
-
-        # This test should be better. It should actually create some gzip
-        # compressed data and make sure that it's decompressed
-        # properly. But Python mocking makes me so confused...
-        result = StringIO('xxx')
+        result = BytesIO(b'xxx')
 
         with patch.object(gzip, 'GzipFile') as mockMethod:
             mockMethod.return_value = result

From 817b667c6b62a912ec93a0a1aea21a9268e7b1ca Mon Sep 17 00:00:00 2001
From: terrycojones <terry@jon.es>
Date: Sun, 31 Jan 2021 13:36:17 +0100
Subject: [PATCH 2/2] Fixed some deprecated BioPython warnings.

---
 bin/trim-primers.py   |  4 +---
 test/test_dna.py      | 12 ++++++++----
 test/test_sequence.py | 38 ++++++++++++++++++--------------------
 3 files changed, 27 insertions(+), 27 deletions(-)

diff --git a/bin/trim-primers.py b/bin/trim-primers.py
index d661d275..4408d767 100755
--- a/bin/trim-primers.py
+++ b/bin/trim-primers.py
@@ -5,7 +5,6 @@
 import sys
 from Bio import SeqIO
 from Bio.Seq import Seq
-from Bio.Alphabet import IUPAC
 from dark.sequence import findPrimerBidiLimits
 
 
@@ -53,5 +52,4 @@ def trimPrimers(primer, verbose):
 
     args = parser.parse_args()
 
-    trimPrimers(Seq(args.primer.upper(), IUPAC.unambiguous_dna),
-                args.verbose)
+    trimPrimers(Seq(args.primer.upper()), args.verbose)
diff --git a/test/test_dna.py b/test/test_dna.py
index f7b16c9e..c689d90c 100644
--- a/test/test_dna.py
+++ b/test/test_dna.py
@@ -1,13 +1,17 @@
 import six
 from unittest import TestCase
 
-from Bio.Alphabet.IUPAC import IUPACAmbiguousDNA
-
 from dark.dna import (
     AMBIGUOUS, BASES_TO_AMBIGUOUS, compareDNAReads, matchToString,
     findKozakConsensus, FloatBaseCounts, sequenceToRegex)
 from dark.reads import Read, DNARead, DNAKozakRead
 
+# The following letters used to be available as the 'letters' attribute
+# of IUPACAmbiguousDNA (from Bio.Alphabet.IUPAC). They are hard-coded
+# here because Bio.Alphabet is now deprecated and will be removed, so
+# it can no longer be relied on in these tests.
+AMBIGUOUS_DNA_LETTERS = 'GATCRYWSMKHBVDN'
+
 
 class TestAmbiguousLetters(TestCase):
     """
@@ -17,7 +21,7 @@ def testExpectedAmbiguousLetters(self):
         """
         The ambiguous DNA letters must match those given by IUPAC.
         """
-        self.assertEqual(sorted(IUPACAmbiguousDNA.letters), sorted(AMBIGUOUS))
+        self.assertEqual(sorted(AMBIGUOUS_DNA_LETTERS), sorted(AMBIGUOUS))
 
     def testExpectedLengthOne(self):
         """
@@ -31,7 +35,7 @@ def testExpectedLengthsGreaterThanOne(self):
         The ambiguous DNA letters must be in sets of size greater than one
         and less than 5.
         """
-        for base in set(IUPACAmbiguousDNA.letters) - set('ACGT'):
+        for base in set(AMBIGUOUS_DNA_LETTERS) - set('ACGT'):
             self.assertTrue(5 > len(AMBIGUOUS[base]) > 1)
 
     def testAmbiguousLettersAreAllACGT(self):
diff --git a/test/test_sequence.py b/test/test_sequence.py
index 508b3c0c..e06c58e2 100644
--- a/test/test_sequence.py
+++ b/test/test_sequence.py
@@ -1,7 +1,6 @@
 from unittest import TestCase
 from dark.sequence import findPrimer, findPrimerBidi, findPrimerBidiLimits
 from Bio.Seq import Seq
-from Bio.Alphabet import IUPAC
 
 
 class TestFindPrimer(TestCase):
@@ -13,7 +12,7 @@ def testNotFound(self):
         """
         If a primer is not found, the empty list must be returned.
         """
-        seq = Seq('ACGT', IUPAC.unambiguous_dna)
+        seq = Seq('ACGT')
         self.assertEqual([], findPrimer('BLAH', seq))
 
     def testFoundAtStart(self):
@@ -21,7 +20,7 @@ def testFoundAtStart(self):
         If a primer is found at the start of a sequence, a list containing 0
         must be returned.
         """
-        seq = Seq('ACGT', IUPAC.unambiguous_dna)
+        seq = Seq('ACGT')
         self.assertEqual([0], findPrimer('AC', seq))
 
     def testFoundAtEnd(self):
@@ -29,7 +28,7 @@ def testFoundAtEnd(self):
         If a primer is found at the end of a sequence, the correct value
         must be returned.
         """
-        seq = Seq('ACGT', IUPAC.unambiguous_dna)
+        seq = Seq('ACGT')
         self.assertEqual([2], findPrimer('GT', seq))
 
     def testFoundMultiple(self):
@@ -37,7 +36,7 @@ def testFoundMultiple(self):
         If a primer is found multiple times, the correct value
         must be returned.
         """
-        seq = Seq('ACGTACGT', IUPAC.unambiguous_dna)
+        seq = Seq('ACGTACGT')
         self.assertEqual([0, 4], findPrimer('ACG', seq))
 
     def testOverlapping(self):
@@ -45,7 +44,7 @@ def testOverlapping(self):
         If a primer is present twice but is overlapping, only the first
         instance should be returned.
         """
-        seq = Seq('GAAA', IUPAC.unambiguous_dna)
+        seq = Seq('GAAA')
         self.assertEqual([1], findPrimer('AA', seq))
 
 
@@ -58,7 +57,7 @@ def testNotFound(self):
         """
         If a primer is not found, empty lists must be returned.
         """
-        seq = Seq('ACGT', IUPAC.unambiguous_dna)
+        seq = Seq('ACGT')
         self.assertEqual(([], []), findPrimerBidi('BLAH', seq))
 
     def testFoundStartEnd(self):
@@ -67,7 +66,7 @@ def testFoundStartEnd(self):
         the forward sequence, end of the reverse complement), the
         correct value must be returned.
         """
-        seq = Seq('ACGT', IUPAC.unambiguous_dna)
+        seq = Seq('ACGT')
         self.assertEqual(([0], [2]), findPrimerBidi('AC', seq))
 
     def testFoundEndStart(self):
@@ -76,7 +75,7 @@ def testFoundEndStart(self):
         the forward sequence, start of the reverse complement), the
         correct value must be returned.
         """
-        seq = Seq('ACGT', IUPAC.unambiguous_dna)
+        seq = Seq('ACGT')
         self.assertEqual(([2], [0]), findPrimerBidi('GT', seq))
 
     def testFoundMultiple(self):
@@ -84,7 +83,7 @@ def testFoundMultiple(self):
         If a primer is found multiple times, the correct value
         must be returned.
         """
-        seq = Seq('ACGTACGT', IUPAC.unambiguous_dna)
+        seq = Seq('ACGTACGT')
         self.assertEqual(([0, 4], [1, 5]), findPrimerBidi('ACG', seq))
 
     def testOverlappingForwards(self):
@@ -92,7 +91,7 @@ def testOverlappingForwards(self):
         If a primer is present twice forwards but is overlapping, only
         the first instance should be returned.
         """
-        seq = Seq('GAAA', IUPAC.unambiguous_dna)
+        seq = Seq('GAAA')
         self.assertEqual(([1], []), findPrimerBidi('AA', seq))
 
     def testOverlappingBackwards(self):
@@ -100,7 +99,7 @@ def testOverlappingBackwards(self):
         If a primer is present twice backwards but is overlapping, only
         the first instance should be returned.
         """
-        seq = Seq('GTTT', IUPAC.unambiguous_dna)
+        seq = Seq('GTTT')
         self.assertEqual(([], [1]), findPrimerBidi('AA', seq))
 
 
@@ -114,7 +113,7 @@ def testNotFound(self):
         If a primer is not found, the returned offsets must include
         the whole sequence.
         """
-        seq = Seq('ACGT', IUPAC.unambiguous_dna)
+        seq = Seq('ACGT')
         self.assertEqual((0, 4), findPrimerBidiLimits('BLAH', seq))
 
     def testFoundStartEnd(self):
@@ -123,7 +122,7 @@ def testFoundStartEnd(self):
         the forward sequence, end of the reverse complement), the
         correct value must be returned.
         """
-        seq = Seq('ACGT', IUPAC.unambiguous_dna)
+        seq = Seq('ACGT')
         self.assertEqual((2, 2), findPrimerBidiLimits('AC', seq))
 
     def testFoundEndStart(self):
@@ -132,7 +131,7 @@ def testFoundEndStart(self):
         the forward sequence, start of the reverse complement), the
         correct value must be returned.
         """
-        seq = Seq('ACGT', IUPAC.unambiguous_dna)
+        seq = Seq('ACGT')
         self.assertEqual((4, 4), findPrimerBidiLimits('GT', seq))
 
     def testFoundMultiple(self):
@@ -140,7 +139,7 @@ def testFoundMultiple(self):
         If a primer is found multiple times, the correct value
         must be returned.
         """
-        seq = Seq('ACGTACGT', IUPAC.unambiguous_dna)
+        seq = Seq('ACGTACGT')
         self.assertEqual((7, 8), findPrimerBidiLimits('ACG', seq))
 
     def testOverlappingForwards(self):
@@ -148,7 +147,7 @@ def testOverlappingForwards(self):
         If a primer is present twice forwards but is overlapping, only
         the first instance should be returned.
         """
-        seq = Seq('GAAA', IUPAC.unambiguous_dna)
+        seq = Seq('GAAA')
         self.assertEqual((3, 4), findPrimerBidiLimits('AA', seq))
 
     def testOverlappingBackwards(self):
@@ -156,7 +155,7 @@ def testOverlappingBackwards(self):
         If a primer is present twice backwards but is overlapping, only
         the first instance should be returned.
         """
-        seq = Seq('GTTT', IUPAC.unambiguous_dna)
+        seq = Seq('GTTT')
         self.assertEqual((0, 1), findPrimerBidiLimits('AA', seq))
 
     def testLonger(self):
@@ -166,6 +165,5 @@ def testLonger(self):
         seq = Seq('AAAAAAAAAA'
                   'GGGGGGGGGG'
                   'AAAAAAAAAA'
-                  'AAAAAAAAAA',
-                  IUPAC.unambiguous_dna)
+                  'AAAAAAAAAA')
         self.assertEqual((20, 40), findPrimerBidiLimits('GGGGGGGGGG', seq))