Add indexing program
cjw85 committed Dec 4, 2019
1 parent 47af968 · commit 63f3a46
Showing 4 changed files with 40 additions and 1 deletion.
CHANGELOG.md: 8 additions & 0 deletions
@@ -1,3 +1,11 @@
+v1.2.20
+-------
+* Add `index_reads` program to build a read_id -> file index TSV.
+
+v1.2.19
+-------
+* Add program to produce a read summary text file from a bulk .fast5.
+
 v1.2.18
 -------
 * Allow `extract_reads` to extract only reads present in a given read_summary.txt
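For reference, the index that the new `index_reads` program writes to stdout is a plain two-column TSV, one line per read, mapping a read ID to the absolute path of the file that contains it (see the `print` call in `build_read_index` below). An illustrative excerpt, with made-up read IDs and paths:

    00213403-4297-4f03-8412-3cc8b9cb845a	/data/run1/batch_0.fast5
    0030a048-29a4-4a8b-9c21-f1f2c8e1f9f0	/data/run1/batch_0.fast5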
fast5_research/__init__.py: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-__version__ = '1.2.19'
+__version__ = '1.2.20'
 
 from fast5_research.fast5 import Fast5, iterate_fast5
 from fast5_research.fast5_bulk import BulkFast5
fast5_research/extract.py: 30 additions & 0 deletions

@@ -253,6 +253,35 @@ def extract_channel_reads(source, output, prefix, flat, by_id, max_files, multi,
     return counter, channel
 
 
+def build_read_index():
+    logging.basicConfig(
+        format='[%(asctime)s - %(name)s] %(message)s',
+        datefmt='%H:%M:%S', level=logging.INFO
+    )
+    logger = logging.getLogger('Index Reads')
+
+    parser = argparse.ArgumentParser(description='Build index of reads within .fast5s. Output to stdout.')
+    parser.add_argument('input', help='.fast5 directory')
+    parser.add_argument('--recursive', action='store_true',
+                        help='Search recursively under `input` for source files.')
+    parser.add_argument('--workers', type=int, default=8,
+                        help='Number of worker processes.')
+    args = parser.parse_args()
+
+    src_files = list(iterate_fast5(args.input, paths=True, recursive=args.recursive))
+    logger.info("Found {} files.".format(len(src_files)))
+
+    with ProcessPoolExecutor(args.workers) as executor:
+        n_reads = 0
+        for i, (src, read_ids) in enumerate(
+                zip(src_files, executor.map(reads_in_multi, src_files, chunksize=10))):
+            n_reads += len(read_ids)
+            for read in read_ids:
+                print('\t'.join((read, os.path.abspath(src))))
+            if (i + 1) % 10 == 0:
+                logger.info("Indexed {}/{} files. {} reads".format(i + 1, len(src_files), n_reads))
+
+
 def filter_multi_reads():
     logging.basicConfig(
         format='[%(asctime)s - %(name)s] %(message)s',
@@ -402,6 +431,7 @@ def _subset_reads_to_file(read_index, output, prefix, worker_id=0):
             writer.write_read(read_grp)
     return reads_written, prefix
 
+
 def reads_in_multi(src, filt=None):
     """Get list of read IDs contained within a multi-read file.
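To show the round trip, here is a minimal sketch (not part of the commit) that loads an `index_reads` TSV back into a dict; the file name `read_index.tsv` and the helper name are hypothetical, but the two-column, tab-separated format follows from the `print` call in `build_read_index` above:

    import csv

    def load_read_index(path):
        # Map read_id -> absolute .fast5 path from an index_reads TSV;
        # each line is 'read_id<TAB>/abs/path/to/file.fast5'.
        index = {}
        with open(path) as fh:
            for read_id, src in csv.reader(fh, delimiter='\t'):
                index[read_id] = src
        return index

    # Produced with, e.g.: index_reads /data/run1 --recursive > read_index.tsv
    index = load_read_index('read_index.tsv')
    print(len(index), 'reads indexed')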
setup.py: 1 addition & 0 deletions

@@ -55,6 +55,7 @@
     long_description_content_type=__long_description_content_type__,
     entry_points={
         'console_scripts': [
+            'index_reads = {}.extract:build_read_index'.format(__pkg_name__),
             'extract_reads = {}.extract:extract_reads'.format(__pkg_name__),
             'read_summary = {}.extract:extract_read_summary'.format(__pkg_name__),
             'filter_reads = {}.extract:filter_multi_reads'.format(__pkg_name__),
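The new entry point wires the command to the function added above: on installation, setuptools generates an `index_reads` executable roughly equivalent to the following sketch (the wrapper is illustrative, not part of the commit):

    import sys
    from fast5_research.extract import build_read_index

    if __name__ == '__main__':
        sys.exit(build_read_index())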
