Skip to content

Commit

Permalink
Merge pull request #162 from HadrienG/dev
Browse files Browse the repository at this point in the history
1.4.6
  • Loading branch information
HadrienG authored May 12, 2020
2 parents 6a9bca7 + 5c91fb8 commit e50ca2d
Show file tree
Hide file tree
Showing 7 changed files with 211 additions and 166 deletions.
2 changes: 1 addition & 1 deletion Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ numpy = "*"
scipy = "*"
biopython = "*"
joblib = "*"
pysam = "==0.15.3"
pysam = "==0.15.4"
requests = "*"

[dev-packages]
Expand Down
296 changes: 157 additions & 139 deletions Pipfile.lock

Large diffs are not rendered by default.

31 changes: 25 additions & 6 deletions iss/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,23 @@
from Bio import Entrez

import io
import sys
import time
import zlib
import random
import logging


import requests


class BadRequestError(Exception):
"""Exception to raise when a http request does not return 200
"""

def __init__(self, url, status_code):
super().__init__('%s returned %d' % (url, status_code))


def ncbi(kingdom, n_genomes, output):
"""download random genomes sequences from ncbi genomes with entrez eutils
and requests.
Expand All @@ -32,7 +41,6 @@ def ncbi(kingdom, n_genomes, output):
'assembly',
term='%s[Organism] AND "latest refseq"[filter] AND "complete genome"[filter]'
% kingdom, retmax=100000))['IdList']
genomes = []
n = 0
logger.info('Searching for %s to download' % kingdom)
while n < n_genomes:
Expand All @@ -48,8 +56,16 @@ def ncbi(kingdom, n_genomes, output):
genome_info['AssemblyAccession'],
genome_info['AssemblyName'])
logger.info('Downloading %s' % genome_info['AssemblyAccession'])
assembly_to_fasta(url, output)
n += 1
try:
assembly_to_fasta(url, output)
except BadRequestError as e:
logger.debug('Could not download %s' %
genome_info['AssemblyAccession'])
logger.debug('Skipping and waiting two seconds')
time.sleep(2)
else:
n += 1
full_id_list.remove(ident)
return output


Expand All @@ -71,8 +87,11 @@ def assembly_to_fasta(url, output, chunk_size=1024):
url = url.replace("ftp://", "https://")
if url:
request = requests.get(url)
request = zlib.decompress(
request.content, zlib.MAX_WBITS | 32).decode()
if request.status_code == 200:
request = zlib.decompress(
request.content, zlib.MAX_WBITS | 32).decode()
else:
raise BadRequestError(url, request.status_code)

with io.StringIO(request) as fasta_io:
seq_handle = SeqIO.parse(fasta_io, 'fasta')
Expand Down
4 changes: 3 additions & 1 deletion iss/error_models/kde.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ class KDErrorModel(ErrorModel):
- the substitution for each nucleotide at each position (for R1 and R2)
- the insertion and deletion rates for each position (for R1 and R2)
"""

def __init__(self, npz_path):
super().__init__()
self.npz_path = npz_path
Expand Down Expand Up @@ -74,7 +75,8 @@ def gen_phred_scores(self, cdfs, orientation):
mean = self.mean_reverse

norm_mean = mean / sum(mean)
quality_bin = np.searchsorted(norm_mean, np.random.rand())
# quality_bin = np.searchsorted(norm_mean, np.random.rand())
quality_bin = np.random.choice(range(len(norm_mean)), p=norm_mean)
# downgrades index out of bound (ex rand is 1, last bin in searchsorted
# is 0.95) to best quality bin
if quality_bin == 4:
Expand Down
40 changes: 23 additions & 17 deletions iss/generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,21 +62,30 @@ def reads(record, ErrorModel, n_pairs, cpu_number, output, seed,
# except ValueError as e:
# logger.error('Skipping this record: %s' % record.id)
# return
forward, reverse = simulate_read(record, ErrorModel, i, cpu_number)
if gc_bias:
stiched_seq = forward.seq + reverse.seq
gc_content = GC(stiched_seq)
if 40 < gc_content < 60:
read_tuple_list.append((forward, reverse))
i += 1
elif np.random.rand() < 0.90:
try:
forward, reverse = simulate_read(record, ErrorModel, i, cpu_number)
except AssertionError as e:
logger.warning(
'%s shorter than read length for this ErrorModel' % record.id)
logger.warning(
'Skipping %s. You will have less reads than specified'
% record.id)
break
else:
if gc_bias:
stiched_seq = forward.seq + reverse.seq
gc_content = GC(stiched_seq)
if 40 < gc_content < 60:
read_tuple_list.append((forward, reverse))
i += 1
elif np.random.rand() < 0.90:
read_tuple_list.append((forward, reverse))
i += 1
else:
continue
else:
read_tuple_list.append((forward, reverse))
i += 1
else:
continue
else:
read_tuple_list.append((forward, reverse))
i += 1

temp_file_name = output + '.iss.tmp.%s.%s' % (record.id, cpu_number)
to_fastq(read_tuple_list, temp_file_name)
Expand Down Expand Up @@ -112,10 +121,7 @@ def simulate_read(record, ErrorModel, i, cpu_number):
forward_start = random.randrange(
0, len(record.seq) - (2 * read_length + insert_size))
except AssertionError as e:
logger.error(
'%s shorter than read length for this ErrorModel:%s'
% (e, record.id))
sys.exit(1)
raise
except ValueError as e:
logger.debug(
'%s shorter than template length for this ErrorModel:%s'
Expand Down
2 changes: 1 addition & 1 deletion iss/test/test_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def test_simulate_and_save_short():
generator.reads(ref_genome, err_mod, 1000, 0, 'data/.test', 0, True)


@raises(SystemExit)
@raises(AssertionError)
def test_small_input():
err_mod = kde.KDErrorModel('data/ecoli.npz')
ref_genome = SeqRecord(
Expand Down
2 changes: 1 addition & 1 deletion iss/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '1.4.5'
__version__ = '1.4.6'

0 comments on commit e50ca2d

Please sign in to comment.