From 2ed7792719a12d69961f433ed2fb864cc5690674 Mon Sep 17 00:00:00 2001 From: Yash Patel <86321070+yashpatel6@users.noreply.github.com> Date: Tue, 21 Nov 2023 13:04:53 -0800 Subject: [PATCH] Parallelize validation (#86) * Add CPU parallelization option to arguments * Parallelize validation function over CPUs * Add CPUs to ValidateArgs * Update tests for validation * Fix encoding test * Bump version for official v4.0.0 release * Remove unused import * Update tests with Pool as context manager * Update to use with * Add CPUs option in the README and update installation instructions * Fix link text * Fix positive integer check * Add tests for positive integer type * Add basic libmagic installation steps * Prepend PID to success and error printers * Update README with updated parameter name * Update argument in parser * Update argument in validation * Update tests --- README.md | 35 +++++++++++-- test/unit/test_generate_checksum.py | 2 +- test/unit/test_validate.py | 76 ++++++++++++++++++++++++----- validate/__init__.py | 2 +- validate/__main__.py | 14 ++++++ validate/validate.py | 40 +++++++++------ validate/validate_types.py | 2 +- 7 files changed, 137 insertions(+), 34 deletions(-) diff --git a/README.md b/README.md index 58963ac..198944c 100644 --- a/README.md +++ b/README.md @@ -31,13 +31,40 @@ The tool can be installed as a standalone command line tool. The following depen |Python|3.10| |VCFtools|0.1.16| -### Install directly from GitHub +Additionally, the `libmagic` C library must also be installed on the system. + +### Installing `libmagic` + +On Debian/Ubuntu, install through: +```Bash +sudo apt-get install libmagic-dev +``` + +On Mac, install through homebrew (https://brew.sh/): +```Bash +brew install libmagic +``` + +`libmagic` can also be installed through the `conda` package manager: +```Bash +conda install -c conda-forge libmagic +``` + +With the dependencies (and the proper versions) installed, install `pipeval` through one of the options below: + +### Install directly from GitHub through SSH ```Bash pip install git+ssh://git@github.com/uclahs-cds/package-PipeVal.git ``` +### Install directly from GitHub through HTTPS +```Bash +pip install git+https://git@github.com/uclahs-cds/package-PipeVal.git +``` + ### Install from cloned repository ```Bash + cd pip install . ``` @@ -55,6 +82,8 @@ options: -v, --version show program's version number and exit -r CRAM_REFERENCE, --cram-reference CRAM_REFERENCE Path to reference file for CRAM + -p PROCESSES, --processes PROCESSES + Number of processes to run in parallel when validating multiple files ``` The tool will attempt to automatically detect the file type based on extension and perform the approriate validations. The tool will also perform an existence check along with a checksum check if an MD5 or SHA512 checksum exists regardless of file type. @@ -130,8 +159,8 @@ pytest ## Discussions - [Issue tracker](https://github.com/uclahs-cds/package-PipeVal/issues) to report errors and enhancement ideas. -- Discussions can take place in [tool-NF-test Discussions](https://github.com/uclahs-cds/package-PipeVal/discussions) -- [tool-NF-test pull requests](https://github.com/uclahs-cds/package-PipeVal/pulls) are also open for discussion +- Discussions can take place in [package-PipeVal Discussions](https://github.com/uclahs-cds/package-PipeVal/discussions) +- [package-PipeVal pull requests](https://github.com/uclahs-cds/package-PipeVal/pulls) are also open for discussion ## Contributors diff --git a/test/unit/test_generate_checksum.py b/test/unit/test_generate_checksum.py index 0c36a54..e360602 100644 --- a/test/unit/test_generate_checksum.py +++ b/test/unit/test_generate_checksum.py @@ -110,7 +110,7 @@ def test__write_checksum_file__writes_proper_checksum(mock_path, mock_write_open _write_checksum_file(mock_path, hash_type, computed_hash) - mock_write_open.assert_called_once_with(f'{file_path}.{hash_type}', 'w') + mock_write_open.assert_called_once_with(f'{file_path}.{hash_type}', 'w', encoding='utf-8') handle = mock_write_open() handle.write.assert_called_once_with(f'{computed_hash} {file_path}\n') diff --git a/test/unit/test_validate.py b/test/unit/test_validate.py index 5ddcbb4..88449de 100644 --- a/test/unit/test_validate.py +++ b/test/unit/test_validate.py @@ -1,6 +1,7 @@ # pylint: disable=C0116 # pylint: disable=C0114 from pathlib import Path +from argparse import Namespace, ArgumentTypeError from unittest.mock import Mock import warnings import mock @@ -28,10 +29,31 @@ _detect_file_type_and_extension, _check_extension, run_validate, - _validate_file + _validate_file, + _validation_worker ) +from validate.__main__ import positive_integer from validate.validate_types import ValidateArgs +def test__positive_integer__returns_correct_integer(): + expected_number = 2 + number_str = '2' + assert expected_number == positive_integer(number_str) + +@pytest.mark.parametrize( + 'number_str', + [ + ('-2'), + ('0'), + ('1.2'), + ('number'), + ('') + ] +) +def test__positive_integer__fails_non_positive_integers(number_str): + with pytest.raises(ArgumentTypeError): + positive_integer(number_str) + @pytest.mark.parametrize( 'expected_extension, expected_file_type', [ @@ -206,10 +228,8 @@ def test__validate_vcf_file__passes_vcf_validation(mock_call): _validate_vcf_file('some/file') -@mock.patch('validate.validate._print_success') -def test__run_validate__passes_validation_no_files(mock_print_success): - test_args = ValidateArgs(path=[], cram_reference=None) - mock_print_success.return_value = '' +def test__run_validate__passes_validation_no_files(): + test_args = ValidateArgs(path=[], cram_reference=None, processes=1) run_validate(test_args) @pytest.mark.parametrize( @@ -225,19 +245,47 @@ def test__run_validate__passes_validation_no_files(mock_print_success): @mock.patch('validate.validate._validate_file') @mock.patch('validate.validate._print_error') @mock.patch('validate.validate.Path.resolve') -def test__run_validate__fails_with_failing_checks( +def test___validation_worker__fails_with_failing_checks( mock_path_resolve, mock_print_error, mock_validate_file, mock_detect_file_type_and_extension, test_exception): - test_args = ValidateArgs(path=['some/path'], cram_reference=None) - mock_path_resolve.return_value = 'some/path' + test_path = 'some/path' + test_args = ValidateArgs(path=[test_path], cram_reference=None, processes=1) + mock_path_resolve.return_value = test_path mock_validate_file.side_effect = test_exception mock_detect_file_type_and_extension.return_value = ('', '') mock_print_error.return_value = '' + + assert not _validation_worker(test_path, test_args) + +@mock.patch('validate.validate.Path.resolve', autospec=True) +@mock.patch('validate.validate.multiprocessing.Pool') +def test__run_validate__passes_on_all_valid_files( + mock_pool, + mock_path_resolve + ): + test_path = 'some/path' + test_args = ValidateArgs(path=[test_path], cram_reference=None, processes=1) + + mock_path_resolve.return_value = None + mock_pool.return_value.__enter__.return_value = Namespace(starmap=lambda y, z: [True]) + + run_validate(test_args) + +@mock.patch('validate.validate.Path.resolve', autospec=True) +@mock.patch('validate.validate.multiprocessing.Pool') +def test__run_validate__fails_with_failing_file( + mock_pool, + mock_path_resolve): + test_path = 'some/path' + test_args = ValidateArgs(path=[test_path], cram_reference=None, processes=1) expected_code = 1 + mock_path_resolve.return_value = None + mock_pool.return_value.__enter__.return_value = Namespace(starmap=lambda y, z: [False]) + with pytest.raises(SystemExit) as pytest_exit: run_validate(test_args) assert pytest_exit.value.code == expected_code @@ -280,7 +328,9 @@ def test__run_validate__fails_on_unresolvable_symlink(mock_path_resolve): expected_error = FileNotFoundError mock_path_resolve.side_effect = expected_error - test_args = ValidateArgs(path=['some/path'], cram_reference=None) + test_path = 'some/path' + + test_args = ValidateArgs(path=[test_path], cram_reference=None, processes=1) with pytest.raises(expected_error): run_validate(test_args) @@ -289,7 +339,7 @@ def test__run_validate__fails_on_unresolvable_symlink(mock_path_resolve): @mock.patch('validate.validate._detect_file_type_and_extension') @mock.patch('validate.validate._validate_file') @mock.patch('validate.validate._print_success') -def test__run_validate__passes_proper_validation( +def test___validation_worker__passes_proper_validation( mock_print_success, mock_validate_file, mock_detect_file_type_and_extension, @@ -299,6 +349,8 @@ def test__run_validate__passes_proper_validation( mock_validate_file.return_value = None mock_path_resolve.return_value = None - test_args = ValidateArgs(path=['some/path'], cram_reference=None) + test_path = 'some/path' - run_validate(test_args) + test_args = ValidateArgs(path=[test_path], cram_reference=None, processes=1) + + _validation_worker(test_path, test_args) diff --git a/validate/__init__.py b/validate/__init__.py index c8c5da9..a7f1cbc 100644 --- a/validate/__init__.py +++ b/validate/__init__.py @@ -1,3 +1,3 @@ '''Inits validate module''' -__version__ = '4.0.0-rc.2' +__version__ = '4.0.0' diff --git a/validate/__main__.py b/validate/__main__.py index 735fded..37a1013 100644 --- a/validate/__main__.py +++ b/validate/__main__.py @@ -3,6 +3,18 @@ from validate import __version__ from validate.validate import run_validate +def positive_integer(arg): + """ Type and value check for positive integers """ + try: + i = int(arg) + except ValueError as value_exception: + raise argparse.ArgumentTypeError("Must be an integer.") from value_exception + + if i < 1: + raise argparse.ArgumentTypeError("Must be an integer greater than 0.") + + return i + def _parse_args(): """ Parse arguments """ parser = argparse.ArgumentParser() @@ -10,6 +22,8 @@ def _parse_args(): parser.add_argument('-v', '--version', action='version', version=f'%(prog)s {__version__}') parser.add_argument('-r', '--cram-reference', default=None, \ help='Path to reference file for CRAM') + parser.add_argument('-p', '--processes', type=positive_integer, default=1, \ + help='Number of processes to run in parallel when validating multiple files') parser.set_defaults(func=run_validate) diff --git a/validate/validate.py b/validate/validate.py index f5b528a..b2af3e9 100644 --- a/validate/validate.py +++ b/validate/validate.py @@ -1,7 +1,10 @@ ''' File validation functions ''' from pathlib import Path import sys +import os from typing import Dict, Union +import multiprocessing +from itertools import repeat from validate.validators.bam import _check_bam from validate.validators.sam import _check_sam @@ -57,11 +60,11 @@ def _validate_file( def _print_error(path:Path, err:BaseException): ''' Prints error message ''' - print(f'Error: {str(path)} {str(err)}') + print(f'PID:{os.getpid()} - Error: {str(path)} {str(err)}') def _print_success(path:Path, file_type:str): ''' Prints success message ''' - print(f'Input: {path} is valid {file_type}') + print(f'PID:{os.getpid()} - Input: {path} is valid {file_type}') def _detect_file_type_and_extension(path:Path): ''' File type and extension detection ''' @@ -86,6 +89,20 @@ def _check_extension(extension:str): return UNKNOWN_FILE_TYPE +def _validation_worker(path: Path, args:Union[ValidateArgs,Dict[str, Union[str,list]]]): + ''' Worker function to validate a single file ''' + try: + file_type, file_extension = _detect_file_type_and_extension(path) + _validate_file(path, file_type, file_extension, args) + except FileNotFoundError as file_not_found_err: + print(f"Warning: {str(path)} {str(file_not_found_err)}") + except (TypeError, ValueError, IOError, OSError) as err: + _print_error(path, err) + return False + + _print_success(path, file_type) + return True + def run_validate(args:Union[ValidateArgs,Dict[str, Union[str,list]]]): ''' Function to validate file(s) `args` must contain the following: @@ -93,20 +110,11 @@ def run_validate(args:Union[ValidateArgs,Dict[str, Union[str,list]]]): `cram_reference` is a required argument with either a string value or None ''' - all_files_pass = True - - for path in [Path(pathname).resolve(strict=True) for pathname in args.path]: - try: - file_type, file_extension = _detect_file_type_and_extension(path) - _validate_file(path, file_type, file_extension, args) - except FileNotFoundError as file_not_found_err: - print(f"Warning: {str(path)} {str(file_not_found_err)}") - except (TypeError, ValueError, IOError, OSError) as err: - all_files_pass = False - _print_error(path, err) - continue + num_parallel = min(args.processes, multiprocessing.cpu_count()) - _print_success(path, file_type) + with multiprocessing.Pool(num_parallel) as parallel_pool: + validation_results = parallel_pool.starmap(_validation_worker, \ + zip([Path(pathname).resolve(strict=True) for pathname in args.path], repeat(args))) - if not all_files_pass: + if not all(validation_results): sys.exit(1) diff --git a/validate/validate_types.py b/validate/validate_types.py index 5822689..ea52cbf 100644 --- a/validate/validate_types.py +++ b/validate/validate_types.py @@ -3,5 +3,5 @@ ValidateArgs = namedtuple( 'args', - 'path, cram_reference' + 'path, cram_reference, processes' )