full information to filter_reads
cjw85 committed May 8, 2019
1 parent 54cfc6d commit 63bf289
Showing 3 changed files with 38 additions and 14 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,7 @@
+v1.2.17
+-------
+* Allow `filter_reads` to be given full filename/read_id information
+
 v1.2.16
 -------
 * Fix bug in `filter_reads` resulting in the last worker's reads not being written.
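In practice, the new behaviour means the filter file can state up front which source file holds each read. A minimal sketch of writing such a filter (the read IDs and filenames below are made up for illustration):

```python
# Write a filter TSV with both `read_id` and `filename` columns, so
# filter_reads can locate each read without scanning every source file.
import csv

rows = [
    {'read_id': 'r0a1b2c3', 'filename': 'batch_0.fast5'},  # illustrative values
    {'read_id': 'r4d5e6f7', 'filename': 'batch_1.fast5'},
]
with open('filter.tsv', 'w', newline='') as fh:
    writer = csv.DictWriter(fh, fieldnames=['read_id', 'filename'], delimiter='\t')
    writer.writeheader()
    writer.writerows(rows)
```

When the `filename` column is absent, the index build from the filter fails and the tool builds its read index by other means, as the `extract.py` changes below show.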
2 changes: 1 addition & 1 deletion fast5_research/__init__.py
@@ -1,4 +1,4 @@
-__version__ = '1.2.16'
+__version__ = '1.2.17'
 
 from fast5_research.fast5 import Fast5, iterate_fast5
 from fast5_research.fast5_bulk import BulkFast5
46 changes: 33 additions & 13 deletions fast5_research/extract.py
@@ -137,17 +137,31 @@ def filter_multi_reads():
         datefmt='%H:%M:%S', level=logging.INFO
     )
     logger = logging.getLogger('Filter')
-    parser = argparse.ArgumentParser(description='Extract reads from multi-read .fast5 files.')
-    parser.add_argument('input', help='Path to input multi-read .fast5 files.')
-    parser.add_argument('output', help='Output folder.')
-    parser.add_argument('filter', help='A .tsv file with column `read_id` defining required reads.')
-    parser.add_argument('--tsv_field', default='read_id', help='Field name from `filter` file to obtain read IDs.')
+    parser = argparse.ArgumentParser(
+        description='Extract reads from multi-read .fast5 files.')
+    parser.add_argument('input',
+        help='Path to input multi-read .fast5 files (or list of files).')
+    parser.add_argument('output',
+        help='Output folder.')
+    parser.add_argument('filter',
+        help='A .tsv file with column `read_id` defining required reads. '
+             'If a `filename` column is present, this will be used as the '
+             'location of the read.')
+    parser.add_argument('--tsv_field', default='read_id',
+        help='Field name from `filter` file to obtain read IDs.')
+    parser.add_argument('--prefix', default="",
+        help='Read file prefix.')
+    parser.add_argument('--recursive', action='store_true',
+        help='Search recursively under `input` for source files.')
+    parser.add_argument('--workers', type=int, default=4,
+        help='Number of worker processes.')
 
     out_format = parser.add_mutually_exclusive_group()
-    out_format.add_argument('--multi', action='store_true', default=True, help='Output multi-read files.')
-    out_format.add_argument('--single', action='store_false', dest='multi', help='Output single-read files.')
-    parser.add_argument('--prefix', default="", help='Read file prefix.')
-    parser.add_argument('--recursive', action='store_true', help='Search recursively under `input` for source files.')
-    parser.add_argument('--workers', type=int, default=4, help='Number of worker processes.')
+    out_format.add_argument('--multi', action='store_true', default=True,
+        help='Output multi-read files.')
+    out_format.add_argument('--single', action='store_false', dest='multi',
+        help='Output single-read files.')
 
     #parser.add_argument('--limit', type=int, default=None, help='Limit reads per channel.')
     args = parser.parse_args()
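The `--multi`/`--single` options above write to the same destination, so the mutually exclusive group collapses to one boolean. A standalone sketch of the same argparse idiom (not part of the commit itself):

```python
import argparse

parser = argparse.ArgumentParser()
out_format = parser.add_mutually_exclusive_group()
# both options share dest='multi'; --single flips the default True to False
out_format.add_argument('--multi', action='store_true', default=True)
out_format.add_argument('--single', action='store_false', dest='multi')

print(parser.parse_args([]).multi)            # True  (default: multi-read output)
print(parser.parse_args(['--single']).multi)  # False (single-read output)
# passing both --multi and --single raises an argparse error
```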

@@ -161,14 +175,20 @@ def filter_multi_reads():

     # grab list of source files
     logger.info("Searching for input files.")
-    src_files = list(iterate_fast5(args.input, paths=True, recursive=args.recursive))
+    try:
+        src_files = list(set(readtsv(args.input)['filename']))
+    except Exception as e:
+        logger.info('Failed to read `input` as filelist, assuming path to search. {}'.format(e))
+        src_files = list(iterate_fast5(args.input, paths=True, recursive=args.recursive))
     n_files = len(src_files)
     logger.info("Found {} source files.".format(n_files))
 
     logger.info("Reading filter file.")
     read_table = readtsv(args.filter, fields=[args.tsv_field])
     logger.info("Found {} reads in filter.".format(len(read_table)))
 
     try:
-        # try to build index from a file with 'filename' column
+        # try to build index from the filter file with 'filename' column
         if 'filename' not in read_table.dtype.names:
             raise ValueError("'filename' column not present in filter.")
         logger.info("Attempting to build read index from input filter.")
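The reworked discovery step first tries to read `input` as a TSV file list and only falls back to a filesystem search if that fails. A sketch of the same pattern under stated assumptions: `load_tsv` and `walk_fast5` below are hypothetical stand-ins for the module's `readtsv` and `iterate_fast5` helpers, not its real API:

```python
import csv
import glob
import os

def load_tsv(path):
    # hypothetical stand-in for readtsv: return the 'filename' column of a TSV
    with open(path, newline='') as fh:
        return {'filename': [row['filename'] for row in csv.DictReader(fh, delimiter='\t')]}

def walk_fast5(path, recursive=False):
    # hypothetical stand-in for iterate_fast5: yield .fast5 paths under path
    pattern = os.path.join(path, '**' if recursive else '', '*.fast5')
    return glob.iglob(pattern, recursive=recursive)

def resolve_sources(input_arg, recursive=False):
    """Treat input_arg as a file list if possible, else as a path to search."""
    try:
        # a usable file list must parse as TSV and carry a 'filename' column
        return sorted(set(load_tsv(input_arg)['filename']))
    except Exception:
        # anything else (a directory, a malformed file) falls back to a search
        return list(walk_fast5(input_arg, recursive=recursive))
```

Logging the exception before falling back, as the commit does, makes a mistyped file list visible instead of silently degrading into a path search.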
@@ -179,7 +199,7 @@
             raise ValueError('Found non-uniquely named source files')
         read_index = dict()
         for fname, indices in group_vector(read_table['filename']).items():
-            fpath = src_path_files[fname]
+            fpath = src_path_files[os.path.basename(fname)]
             read_index[fpath] = read_table[args.tsv_field][indices]
         logger.info("Successfully build read index from input filter.")
     except Exception as e:
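The one-line change in this final hunk keys the lookup by basename, so `filename` entries in the filter may carry directory components while the index of discovered source files stays keyed by bare filename. A toy illustration of that step (function and argument names invented for the sketch):

```python
import os

def build_read_index(sources_by_basename, filter_rows):
    # sources_by_basename: basename -> full path of each discovered file
    # filter_rows: (read_id, filename) pairs taken from the filter TSV
    read_index = {}
    for read_id, fname in filter_rows:
        # the commit's fix: normalise to basename before the lookup, so a
        # pathed entry like 'runs/batch_0.fast5' still resolves
        fpath = sources_by_basename[os.path.basename(fname)]
        read_index.setdefault(fpath, []).append(read_id)
    return read_index

print(build_read_index({'batch_0.fast5': '/data/batch_0.fast5'},
                       [('r0a1b2c3', 'runs/batch_0.fast5')]))
# {'/data/batch_0.fast5': ['r0a1b2c3']}
```

This also explains the earlier uniqueness check: if two discovered files shared a basename, the basename-keyed lookup would be ambiguous, hence `ValueError('Found non-uniquely named source files')`.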
