Add indexing program
cjw85 committed Dec 4, 2019
1 parent 47af968 · commit 63f3a46
Showing 4 changed files with 40 additions and 1 deletion.
CHANGELOG.md: 8 additions & 0 deletions
@@ -1,3 +1,11 @@
+v1.2.20
+-------
+* Add `index_reads` program to build a read_id -> file index TSV.
+
+v1.2.19
+-------
+* Add program to produce a read summary text file from a bulk .fast5.
+
 v1.2.18
 -------
 * Allow `extract_reads` to extract only reads present in a given read_summary.txt
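For reference, the index that the new `index_reads` program writes to stdout is a plain two-column TSV, one line per read, mapping a read ID to the absolute path of the file that contains it (see the `print` call in `build_read_index` below). An illustrative excerpt, with made-up read IDs and paths:

    00213403-4297-4f03-8412-3cc8b9cb845a	/data/run1/batch_0.fast5
    0030a048-29a4-4a8b-9c21-f1f2c8e1f9f0	/data/run1/batch_0.fast5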
fast5_research/__init__.py: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-__version__ = '1.2.19'
+__version__ = '1.2.20'
 
 from fast5_research.fast5 import Fast5, iterate_fast5
 from fast5_research.fast5_bulk import BulkFast5
fast5_research/extract.py: 30 additions & 0 deletions

@@ -253,6 +253,35 @@ def extract_channel_reads(source, output, prefix, flat, by_id, max_files, multi,
     return counter, channel
 
 
+def build_read_index():
+    logging.basicConfig(
+        format='[%(asctime)s - %(name)s] %(message)s',
+        datefmt='%H:%M:%S', level=logging.INFO
+    )
+    logger = logging.getLogger('Index Reads')
+
+    parser = argparse.ArgumentParser(description='Build index of reads within .fast5s. Output to stdout.')
+    parser.add_argument('input', help='.fast5 directory')
+    parser.add_argument('--recursive', action='store_true',
+                        help='Search recursively under `input` for source files.')
+    parser.add_argument('--workers', type=int, default=8,
+                        help='Number of worker processes.')
+    args = parser.parse_args()
+
+    src_files = list(iterate_fast5(args.input, paths=True, recursive=args.recursive))
+    logger.info("Found {} files.".format(len(src_files)))
+
+    with ProcessPoolExecutor(args.workers) as executor:
+        n_reads = 0
+        for i, (src, read_ids) in enumerate(
+                zip(src_files, executor.map(reads_in_multi, src_files, chunksize=10))):
+            n_reads += len(read_ids)
+            for read in read_ids:
+                print('\t'.join((read, os.path.abspath(src))))
+            if (i + 1) % 10 == 0:
+                logger.info("Indexed {}/{} files. {} reads".format(i + 1, len(src_files), n_reads))
+
+
 def filter_multi_reads():
     logging.basicConfig(
         format='[%(asctime)s - %(name)s] %(message)s',
@@ -402,6 +431,7 @@ def _subset_reads_to_file(read_index, output, prefix, worker_id=0):
             writer.write_read(read_grp)
     return reads_written, prefix
 
+
 def reads_in_multi(src, filt=None):
     """Get list of read IDs contained within a multi-read file.
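To show the round trip, here is a minimal sketch (not part of the commit) that loads an `index_reads` TSV back into a dict; the file name `read_index.tsv` and the helper name are hypothetical, but the two-column, tab-separated format follows from the `print` call in `build_read_index` above:

    import csv

    def load_read_index(path):
        # Map read_id -> absolute .fast5 path from an index_reads TSV;
        # each line is 'read_id<TAB>/abs/path/to/file.fast5'.
        index = {}
        with open(path) as fh:
            for read_id, src in csv.reader(fh, delimiter='\t'):
                index[read_id] = src
        return index

    # Produced with, e.g.: index_reads /data/run1 --recursive > read_index.tsv
    index = load_read_index('read_index.tsv')
    print(len(index), 'reads indexed')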
setup.py: 1 addition & 0 deletions

@@ -55,6 +55,7 @@
     long_description_content_type=__long_description_content_type__,
     entry_points={
         'console_scripts': [
+            'index_reads = {}.extract:build_read_index'.format(__pkg_name__),
             'extract_reads = {}.extract:extract_reads'.format(__pkg_name__),
             'read_summary = {}.extract:extract_read_summary'.format(__pkg_name__),
             'filter_reads = {}.extract:filter_multi_reads'.format(__pkg_name__),
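The new entry point wires the command to the function added above: on installation, setuptools generates an `index_reads` executable roughly equivalent to the following sketch (the wrapper is illustrative, not part of the commit):

    import sys
    from fast5_research.extract import build_read_index

    if __name__ == '__main__':
        sys.exit(build_read_index())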
