diff --git a/oktoberfest/__init__.py b/oktoberfest/__init__.py
index d5682e9a..5ae5b24b 100644
--- a/oktoberfest/__init__.py
+++ b/oktoberfest/__init__.py
@@ -9,22 +9,18 @@
 import logging.handlers
 import sys
-import time
 
 from oktoberfest import plotting as pl
 from oktoberfest import predict as pr
 from oktoberfest import preprocessing as pp
 from oktoberfest import rescore as re
 
-from . import runner
-
 CONSOLE_LOG_LEVEL = logging.INFO
 
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.DEBUG)
 if len(logger.handlers) == 0:
     formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(name)s::%(funcName)s %(message)s")
-    converter = time.gmtime
 
     # add console handler
     console_handler = logging.StreamHandler(sys.stdout)
     console_handler.setLevel(CONSOLE_LOG_LEVEL)
@@ -37,6 +33,6 @@
     error_handler.setFormatter(formatter)
     logger.addHandler(error_handler)
 else:
-    logger.info("Logger already initizalized. Resuming normal operation.")
+    logger.info("Logger already initialized. Resuming normal operation.")
 
 sys.modules.update({f"{__name__}.{m}": globals()[m] for m in ["pl", "pp", "pr", "re"]})
diff --git a/oktoberfest/__main__.py b/oktoberfest/__main__.py
index c194c50b..1a3cbb4f 100644
--- a/oktoberfest/__main__.py
+++ b/oktoberfest/__main__.py
@@ -2,7 +2,7 @@
 
 from rich import traceback
 
-from oktoberfest import __copyright__, __version__, logger, runner
+from oktoberfest import runner
 
 """triqler.__main__: executed when bootstrap directory is called as script."""
 
@@ -28,8 +28,6 @@ def _parse_args():
 
 def main():
     """Execution of oktoberfest from terminal."""
-    logger.info(f"Oktoberfest version {__version__}\n{__copyright__}")
-
     args = _parse_args()
     runner.run_job(args.config_path)
diff --git a/oktoberfest/preprocessing/preprocessing.py b/oktoberfest/preprocessing/preprocessing.py
index b5289d6d..1a9f62cc 100644
--- a/oktoberfest/preprocessing/preprocessing.py
+++ b/oktoberfest/preprocessing/preprocessing.py
@@ -281,23 +281,28 @@ def split_search(
     search_results: pd.DataFrame,
     output_dir: Union[str, Path],
     filenames: Optional[List[str]] = None,
-):
+) -> List[str]:
     """
     Split search results by spectrum file.
 
-    Given a list of spectrum filenames from which search results originate the provided search results are split
-    and filename specific csv files are written to the provided output directory. The provided filenames need to
+    Given a list of spectrum file names from which search results originate, the provided search results are split
+    and filename-specific csv files are written to the provided output directory. The provided file names need to
     correspond to the spectrum file identifier in the "RAW_FILE" column of the provided search results.
     The search results need to be provided in internal format (see :doc:`../../internal_format`).
-    If the list of filenames is not provided, all spectrum file identifiers are considered, otherwise only the
+    If the list of file names is not provided, all spectrum file identifiers are considered, otherwise only the
     identifiers found in the list are taken into account for writing the individual csv files.
-    The output filenames follow the convention <filename>.rescore.
+    The output file names follow the convention <filename>.rescore.
+    If a file name is not found in the search results, it is ignored and a warning is logged.
+    The function returns a list of file names for which search results are available, removing the ones that were
+    ignored if a list of file names was provided.
 
     :param search_results: search results in internal format
     :param output_dir: directory in which to store individual csv files containing the search results
         for individual filenames
     :param filenames: optional list of spectrum filenames that should be considered. If not provided, all
         spectrum file identifiers in the search results are considered.
+
+    :return: list of file names for which search results could be found
     """
     if isinstance(output_dir, str):
         output_dir = Path(output_dir)
@@ -308,10 +313,20 @@ def split_search(
 
     grouped_search_results = search_results.groupby("RAW_FILE")
 
+    filenames_found = []
     for filename in filenames:
         output_file = (output_dir / filename).with_suffix(".rescore")
         logger.info(f"Creating split msms.txt file {output_file}")
-        grouped_search_results.get_group(filename).to_csv(output_file)
+        try:
+            grouped_search_results.get_group(filename).to_csv(output_file)
+            filenames_found.append(filename)
+        except KeyError:
+            logger.warning(
+                f"The search results do not contain any results for the provided file name {filename}. "
+                "If this is not intended, please verify that the file names are written correctly in the "
+                f"search results. {filename} is ignored."
+            )
+    return filenames_found
 
 
 def merge_spectra_and_peptides(spectra: pd.DataFrame, search: pd.DataFrame) -> Spectra:
diff --git a/oktoberfest/runner.py b/oktoberfest/runner.py
index 6ed5be0e..bc710eff 100644
--- a/oktoberfest/runner.py
+++ b/oktoberfest/runner.py
@@ -1,9 +1,12 @@
+import datetime
+import json
 import logging
 from pathlib import Path
 from typing import List, Type, Union
 
 from spectrum_io.spectral_library import MSP, DLib, SpectralLibrary, Spectronaut
 
+from oktoberfest import __copyright__, __version__
 from oktoberfest import plotting as pl
 from oktoberfest import predict as pr
 from oktoberfest import preprocessing as pp
@@ -15,7 +18,7 @@
 logger = logging.getLogger(__name__)
 
 
-def _preprocess(spectra_files: List[Path], config: Config):
+def _preprocess(spectra_files: List[Path], config: Config) -> List[Path]:
     preprocess_search_step = ProcessStep(config.output, "preprocessing_search")
     if not preprocess_search_step.is_done():
         # load search results
@@ -42,12 +45,21 @@
         search_results = pp.filter_peptides_for_model(peptides=search_results, model=config.models["intensity"])
 
         # split search results
-        pp.split_search(
+        filenames_found = pp.split_search(
             search_results=search_results,
             output_dir=config.output / "msms",
             filenames=[spectra_file.stem for spectra_file in spectra_files],
         )
         preprocess_search_step.mark_done()
+    else:
+        filenames_found = [msms_file.stem for msms_file in (config.output / "msms").glob("*rescore")]
+
+    spectra_files_to_return = []
+    for spectra_file in spectra_files:
+        if spectra_file.stem in filenames_found:
+            spectra_files_to_return.append(spectra_file)
+
+    return spectra_files_to_return
 
 
 def _annotate_and_get_library(spectra_file: Path, config: Config) -> Spectra:
@@ -226,7 +238,7 @@
     proc_dir = config.output / "proc"
     proc_dir.mkdir(parents=True, exist_ok=True)
 
-    _preprocess(spectra_files, config)
+    spectra_files = _preprocess(spectra_files, config)
 
     processing_pool = JobPool(processes=config.num_threads)
 
@@ -292,7 +304,7 @@ def run_rescoring(config_path: Union[str, Path]):
     proc_dir = config.output / "proc"
     proc_dir.mkdir(parents=True, exist_ok=True)
 
-    _preprocess(spectra_files, config)
+    spectra_files = _preprocess(spectra_files, config)
 
     processing_pool = JobPool(processes=config.num_threads)
@@ -360,11 +372,29 @@ def run_job(config_path: Union[str, Path]):
     conf.check()
     job_type = conf.job_type
 
-    if job_type == "SpectralLibraryGeneration":
-        generate_spectral_lib(config_path)
-    elif job_type == "CollisionEnergyCalibration":
-        run_ce_calibration(config_path)
-    elif job_type == "Rescoring":
-        run_rescoring(config_path)
-    else:
-        raise ValueError(f"Unknown job_type in config: {job_type}")
+    # add file handler to root logger
+    base_logger = logging.getLogger()
+    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(name)s::%(funcName)s %(message)s")
+    suffix = datetime.datetime.now().strftime("%Y_%m_%d-%H_%M_%S")
+    logging_output = conf.output / f"{job_type}_{suffix}.log"
+    file_handler = logging.FileHandler(filename=logging_output)
+    file_handler.setLevel(logging.DEBUG)
+    file_handler.setFormatter(formatter)
+    base_logger.addHandler(file_handler)
+
+    logger.info(f"Oktoberfest version {__version__}\n{__copyright__}")
+    logger.info("Job executed with the following config:")
+    logger.info(json.dumps(conf.data, indent=4))
+
+    try:
+        if job_type == "SpectralLibraryGeneration":
+            generate_spectral_lib(config_path)
+        elif job_type == "CollisionEnergyCalibration":
+            run_ce_calibration(config_path)
+        elif job_type == "Rescoring":
+            run_rescoring(config_path)
+        else:
+            raise ValueError(f"Unknown job_type in config: {job_type}")
+    finally:
+        file_handler.close()
+        base_logger.removeHandler(file_handler)
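
Note on the run_job hunk above: the new file handler is attached to the root logger, so records from every module involved in the job land in one timestamped, job-specific log file, and it is detached again in a finally block so consecutive jobs in the same process do not write into each other's files. A minimal, self-contained sketch of that pattern; run_with_job_log, output_dir, job_type and job are illustrative stand-ins, not Oktoberfest API:

import datetime
import logging
from pathlib import Path


def run_with_job_log(output_dir: Path, job_type: str, job) -> None:
    # attach a timestamped file handler to the root logger for the
    # duration of one job (stand-in for the run_job change above)
    base_logger = logging.getLogger()
    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(name)s::%(funcName)s %(message)s")
    suffix = datetime.datetime.now().strftime("%Y_%m_%d-%H_%M_%S")
    file_handler = logging.FileHandler(filename=output_dir / f"{job_type}_{suffix}.log")
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(formatter)
    base_logger.addHandler(file_handler)
    try:
        job()
    finally:
        # always detach, even if the job raised, so the next job in this
        # process does not keep writing into this job's log file
        file_handler.close()
        base_logger.removeHandler(file_handler)


if __name__ == "__main__":
    logging.getLogger().setLevel(logging.DEBUG)
    run_with_job_log(Path("."), "Rescoring", lambda: logging.getLogger("demo").info("job body"))

Closing the handler before removing it releases the log file's descriptor immediately rather than at interpreter shutdown.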