From b59bc2ef70bc971937f23b683c838a6a4773cc1f Mon Sep 17 00:00:00 2001 From: stxue1 Date: Wed, 16 Oct 2024 00:16:27 -0700 Subject: [PATCH 01/20] Basic WDL implementation --- src/toil/options/runner.py | 10 +- src/toil/wdl/wdltoil.py | 298 +++++++++++++++++++++++++++++++++++-- 2 files changed, 290 insertions(+), 18 deletions(-) diff --git a/src/toil/options/runner.py b/src/toil/options/runner.py index bb82cdab02..5fe3f26008 100644 --- a/src/toil/options/runner.py +++ b/src/toil/options/runner.py @@ -17,9 +17,9 @@ def add_runner_options(parser: ArgumentParser, cwl: bool = False, wdl: bool = Fa parser.add_argument(*run_imports_on_workers_arguments, action="store_true", default=False, dest="run_imports_on_workers", help="Run the file imports on a worker instead of the leader. This is useful if the leader is not optimized for high network performance. " "If set to true, the argument --importWorkersDisk must also be set.") - import_workers_disk_arguments = ["--importWorkersDisk"] + import_workers_threshold_argument = ["--importWorkersThreshold"] if cwl: - import_workers_disk_arguments.append("--import-workers-disk") - parser.add_argument(*import_workers_disk_arguments, dest="import_workers_disk", type=lambda x: human2bytes(str(x)), default=None, - help="Specify the amount of disk space an import worker will use. If file streaming for input files is not available, " - "this should be set to the size of the largest input file. This must be set in conjunction with the argument runImportsOnWorkers.") + import_workers_threshold_argument.append("--import-workers-threshold") + parser.add_argument(*import_workers_threshold_argument, dest="import_workers_threshold", type=lambda x: human2bytes(str(x)), default="1 GiB", + help="Specify the file size threshold that determines if the file import will happen in its own job. All files below the threshold " + "will go into a batch import job. This should be set in conjunction with the argument --runImportsOnWorkers.") diff --git a/src/toil/wdl/wdltoil.py b/src/toil/wdl/wdltoil.py index 4e1367602b..afd8620e38 100755 --- a/src/toil/wdl/wdltoil.py +++ b/src/toil/wdl/wdltoil.py @@ -803,6 +803,173 @@ def is_url(filename: str, schemes: List[str]=['http:', 'https:', 's3:', 'gs:', ' return True return False + +def get_file_sizes(environment: WDLBindings, file_source: AbstractJobStore, task_path: str, search_paths: Optional[List[str]] = None, import_remote_files: bool = True, + execution_dir: Optional[str] = None) -> Dict[str, Tuple[str, uuid.UUID, Optional[int]]]: + """ + Resolve relative-URI files in the given environment and them then into absolute normalized URIs. Returns a dictionary of WDL file values to a tuple of the normalized URI, + parent directory ID, and size of the file. The size of the file may be None, which means unknown size. + + Will set the value of the File to the relative-URI. + + :param environment: Bindings to evaluate on + :param file_source: Context to search for files with + :param task_path: Dotted WDL name of the user-level code doing the + importing (probably the workflow name). + :param search_paths: If set, try resolving input location relative to the URLs or + directories in this list. + :param import_remote_files: If set, import files from remote locations. Else leave them as URI references. 
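For illustration, a minimal sketch (hypothetical paths and sizes, plain tuples standing in for the real metadata, and a string shown where the patch uses a per-directory UUID) of the mapping shape this helper is documented to return: each file value maps to its normalized URI, a parent-directory key, and a size that may be None when unknown.

    from typing import Dict, Optional, Tuple

    FileData = Tuple[str, str, Optional[int]]  # (normalized URI, parent-directory key, size in bytes or None)

    file_to_data: Dict[str, FileData] = {
        "reads.fastq": ("file:///data/reads.fastq", "file:///data", 12_345_678),
        "s3://bucket/ref.fa": ("s3://bucket/ref.fa", "s3://bucket/", None),  # remote file, size unknown
    }

    for wdl_value, (uri, parent, size) in file_to_data.items():
        print(f"{wdl_value} -> {uri} (parent {parent}, size {size})")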
+ """ + # path_to_id: Dict[str, uuid.UUID] = {} + path_to_id: Dict[str, uuid.UUID] = {} + file_to_data = {} + + @memoize + def get_filename_size(filename: str) -> None: + tried = [] + for candidate_uri in potential_absolute_uris(filename, search_paths if search_paths is not None else [], execution_dir=execution_dir): + tried.append(candidate_uri) + try: + if not import_remote_files and is_url(candidate_uri): + # Use remote URIs in place. But we need to find the one that exists. + if not file_source.url_exists(candidate_uri): + # Wasn't found there + continue + + # Now we know this exists, so pass it through + # Get filesizes + filesize = file_source.get_size(candidate_uri) + + except UnimplementedURLException as e: + # We can't find anything that can even support this URL scheme. + # Report to the user, they are probably missing an extra. + logger.critical('Error: ' + str(e)) + raise + except HTTPError as e: + # Something went wrong looking for it there. + logger.warning("Checked URL %s but got HTTP status %s", candidate_uri, e.code) + # Try the next location. + continue + except FileNotFoundError: + # Wasn't found there + continue + except Exception: + # Something went wrong besides the file not being found. Maybe + # we have no auth. + logger.error("Something went wrong when testing for existence of %s", candidate_uri) + raise + + # Work out what the basename for the file was + file_basename = os.path.basename(urlsplit(candidate_uri).path) + + if file_basename == "": + # We can't have files with no basename because we need to + # download them at that basename later. + raise RuntimeError(f"File {candidate_uri} has no basename and so cannot be a WDL File") + + # Was actually found + if is_url(candidate_uri): + # Might be a file URI or other URI. + # We need to make sure file URIs and local paths that point to + # the same place are treated the same. + parsed = urlsplit(candidate_uri) + if parsed.scheme == "file:": + # This is a local file URI. Convert to a path for source directory tracking. + parent_dir = os.path.dirname(unquote(parsed.path)) + else: + # This is some other URL. Get the URL to the parent directory and use that. + parent_dir = urljoin(candidate_uri, ".") + else: + # Must be a local path + parent_dir = os.path.dirname(candidate_uri) + + # Pack a UUID of the parent directory + dir_id = path_to_id.setdefault(parent_dir, uuid.uuid4()) + + file_to_data.setdefault(filename, (candidate_uri, dir_id, filesize)) + + return + # Not found + raise RuntimeError(f"Could not find {filename} at any of: {list(potential_absolute_uris(filename, search_paths if search_paths is not None else []))}") + + def get_file_size(file: WDL.Value.File) -> WDL.Value.File: + """ + Detect if any potential URI exists. Will convert a file's value to a URI and import it. + + Separated out from convert_file_to_url in order to properly memoize and avoid importing the same file twice + :param filename: Filename to import + :return: Tuple of the uri the file was found at and the virtualized import + """ + # Search through any input search paths passed in and download it if found + get_filename_size(file.value) + return file + map_over_files_in_bindings(environment, get_file_size) + return file_to_data + +def import_files(files: List[str], file_source: AbstractJobStore) -> Dict[str, FileID]: + """ + Import all files from the list into the jobstore. 
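A minimal sketch of the candidate-resolution loop above: each potential location is probed in order, not-found conditions fall through to the next candidate, and the first location that exists wins. The local os.path.exists check is a hypothetical stand-in for the jobstore's url_exists/get_size probing.

    import os
    from typing import List

    def resolve(filename: str, search_dirs: List[str]) -> str:
        tried = []
        for base in search_dirs:
            candidate = os.path.join(base, filename)
            tried.append(candidate)
            if os.path.exists(candidate):  # stand-in for url_exists()/get_size()
                return candidate           # first existing candidate wins
        raise RuntimeError(f"Could not find {filename} at any of: {tried}")

    # resolve("inputs.json", ["/data", "."])  # would return the first existing candidate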
Returns a dictionary mapping the filename to it's FileID + :param files: list of filenames + :param file_source: jobstore object + :return: + """ + + @memoize + def import_filename(filename: str) -> Optional[FileID]: + """ + Detect if any potential URI exists. Will convert a file's value to a URI and import it. + + Separated out from convert_file_to_url in order to properly memoize and avoid importing the same file twice + :param filename: Filename to import + :return: Tuple of the uri the file was found at and the virtualized import + """ + # Search through any input search paths passed in and download it if found + # Actually import + # Try to import the file. If we can't find it, continue + return file_source.import_file(filename) + + path_to_fileid = {} + for file in files: + imported = import_filename(file) + if imported is not None: + path_to_fileid[file] = imported + return path_to_fileid + + +def convert_files(environment: WDLBindings, file_to_id: Dict[str, FileID], file_to_data: Dict[str, Tuple[str, uuid.UUID, int]], task_path: str) -> None: + """ + Resolve relative-URI files in the given environment and import all files. + + Will set the value of the File to the relative-URI. + + :param environment: Bindings to evaluate on + """ + + def convert_file_to_uri(file: WDL.Value.File) -> WDL.Value.File: + """ + Calls import_filename to detect if a potential URI exists and imports it. Will modify the File object value to the new URI and tack on the virtualized file. + """ + candidate_uri = file_to_data[file.value][0] + file_id = file_to_id.get(candidate_uri) + dir_id = file_to_data[file.value][1] + + # Work out what the basename for the file was + file_basename = os.path.basename(urlsplit(candidate_uri).path) + + if file_basename == "": + # We can't have files with no basename because we need to + # download them at that basename later. + raise RuntimeError(f"File {candidate_uri} has no basename and so cannot be a WDL File") + + toil_uri = pack_toil_uri(file_id, task_path, dir_id, file_basename) + + setattr(file, "virtualized_value", toil_uri) + file.value = candidate_uri + return file + + map_over_files_in_bindings(environment, convert_file_to_uri) + + def convert_remote_files(environment: WDLBindings, file_source: AbstractJobStore, task_path: str, search_paths: Optional[List[str]] = None, import_remote_files: bool = True, execution_dir: Optional[str] = None) -> None: """ @@ -3669,35 +3836,146 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: return job.rv() class WDLImportJob(WDLSectionJob): - def __init__(self, target: Union[WDL.Tree.Workflow, WDL.Tree.Task], inputs: WDLBindings, wdl_options: WDLContext, path: Optional[List[str]] = None, skip_remote: bool = False, - disk_size: Optional[ParseableIndivisibleResource] = None, **kwargs: Any): + def __init__(self, target: Union[WDL.Tree.Workflow, WDL.Tree.Task], inputs: List[str], wdl_options: WDLContext, path: Optional[List[str]] = None, skip_remote: bool = False, + disk_size: Optional[ParseableIndivisibleResource] = None, stream: bool = True, **kwargs: Any): """ Job to take the inputs from the WDL workflow and import them on a worker instead of a leader. Assumes all local and cloud files are accessible. This class is only used when runImportsOnWorkers is enabled. 
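A minimal sketch of the memoization idea behind import_filename: keying on the filename means a file referenced several times in the inputs is only imported once. functools.lru_cache stands in for Toil's @memoize here, and the returned string stands in for a FileID.

    from functools import lru_cache

    imports_performed = []

    @lru_cache(maxsize=None)
    def import_filename(filename: str) -> str:
        imports_performed.append(filename)  # pretend this is the expensive jobstore import
        return f"fileid-for-{filename}"

    for name in ["a.txt", "b.txt", "a.txt"]:
        import_filename(name)

    assert imports_performed == ["a.txt", "b.txt"]  # "a.txt" was only imported once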
""" - super().__init__(wdl_options=wdl_options, local=False, - disk=disk_size, **kwargs) + if stream: + super().__init__(wdl_options=wdl_options, local=False, **kwargs) + else: + super().__init__(wdl_options=wdl_options, local=False, disk=disk_size, **kwargs) + self._target = target self._inputs = inputs self._path = path self._skip_remote = skip_remote + self._disk_size = disk_size + + def run(self, file_store: AbstractFileStore) -> Promised[Dict[str, FileID]]: + """ + Import the workflow inputs and then create and run the workflow. + :return: Promise of workflow outputs + """ + try: + return import_files(self._inputs, file_store.jobStore) + except InsufficientSystemResources as e: + # If importing via streaming, then the job's disk size will not be set. Detect this to create a nonstreaming import job with sufficient disk space + # else, this is the nonstreaming import job and the exception is valid, so raise + if e.resource == "disk" and e.requested < self._disk_size: + non_streaming_import = WDLImportJob(self._target, self._inputs, wdl_options=self._wdl_options, disk=self._disk_size, stream=False, + path=self._path, skip_remote=self._skip_remote) + self.addChild(non_streaming_import) + return non_streaming_import.rv() + else: + raise + + +class WDLCombineImportsJob(WDLSectionJob): + def __init__(self, task_path: str, target: Union[WDL.Tree.Workflow, WDL.Tree.Task], inputs: WDLBindings, file_to_fileid: Sequence[Promised[Dict[str, FileID]]], + file_to_data: Dict[str, Tuple[str, uuid.UUID, int]], wdl_options: WDLContext, **kwargs: Any) -> None: + """ + Job to take the inputs from the WDL workflow and import them on a worker instead of a leader. Assumes all local and cloud files are accessible. + + This class is only used when runImportsOnWorkers is enabled. + """ + super().__init__(local=True, wdl_options=wdl_options, **kwargs) + self._file_to_fileid = file_to_fileid + self._inputs = inputs + self._target = target + self._file_to_data = file_to_data + self._task_path = task_path def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: """ Import the workflow inputs and then create and run the workflow. :return: Promise of workflow outputs """ - convert_remote_files(self._inputs, file_store.jobStore, self._target.name, self._path, self._skip_remote, self._wdl_options.get("execution_dir")) + + file_to_fileid = {k: v for d in unwrap_all(self._file_to_fileid) for k, v in d.items()} + convert_files(self._inputs, file_to_fileid, self._file_to_data, self._task_path) root_job = WDLStartJob(self._target, self._inputs, wdl_options=self._wdl_options) self.addChild(root_job) return root_job.rv() +class WDLImportWrapper(WDLSectionJob): + def __init__(self, target: Union[WDL.Tree.Workflow, WDL.Tree.Task], inputs: WDLBindings, wdl_options: WDLContext, + inputs_search_path, import_remote_files, import_workers_threshold, **kwargs: Any): + """ + Job to take the inputs from the WDL workflow and import them on a worker instead of a leader. Assumes all local and cloud files are accessible. + + This class is only used when runImportsOnWorkers is enabled. + """ + super().__init__(local=True, wdl_options=wdl_options, **kwargs) + self._inputs = inputs + self._target = target + self._inputs_search_path = inputs_search_path + self._import_remote_files = import_remote_files + self._import_workers_threshold = import_workers_threshold + + def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: + """ + Import the workflow inputs and then create and run the workflow. 
+ :return: Promise of workflow outputs + """ + inputs = self._inputs + target = self._target + inputs_search_path = self._inputs_search_path + wdl_options = self._wdl_options + import_remote_files = self._import_remote_files + import_workers_threshold = self._import_workers_threshold + # Run WDL imports on a worker instead + file_to_data = get_file_sizes(inputs, file_store.jobStore, target.name, inputs_search_path, import_remote_files=import_remote_files) + + individual_import_files = [] + batch_import_files = [] + for filename, (candidate_uri, parent_id, size) in file_to_data.items(): + if size < import_workers_threshold: + batch_import_files.append(filename) + else: + individual_import_files.append(filename) + + import_jobs = [] + for file in individual_import_files: + filesize = file_to_data[file][2] + filename = file_to_data[file][0] + import_jobs.append(WDLImportJob(target, [filename], wdl_options=wdl_options, path=inputs_search_path, skip_remote=import_remote_files, disk_size=filesize)) + + # create multiple batch jobs if necessary to not exceed the threshold + # todo: improve this logic + per_batch_files = [] + per_batch_size = 0 + while len(batch_import_files) > 0: + peek_filename = batch_import_files[0] + # See if adding this to the queue will make the batch job too big + filesize = file_to_data[peek_filename][2] + if per_batch_size + filesize >= import_workers_threshold: + # If so, it's too big, schedule the queue + import_jobs.append(WDLImportJob(target, per_batch_files, wdl_options=wdl_options, path=inputs_search_path, skip_remote=import_remote_files, disk_size=per_batch_size)) + # reset + per_batch_size = 0 + per_batch_files = [] + else: + per_batch_size += filesize + candidate_uri = file_to_data[peek_filename][0] + per_batch_files.append(candidate_uri) + batch_import_files.pop(0) + if len(per_batch_files) > 0: + import_jobs.append( + WDLImportJob(target, per_batch_files, wdl_options=wdl_options, path=inputs_search_path, skip_remote=import_remote_files, disk_size=per_batch_size)) + combine_imports_job = WDLCombineImportsJob(target.name, target, inputs, [job.rv() for job in import_jobs], file_to_data, wdl_options=wdl_options) + + for job in import_jobs: + self.addChild(job) + self.addFollowOn(combine_imports_job) + + return combine_imports_job.rv() def make_root_job(target: Union[WDL.Tree.Workflow, WDL.Tree.Task], inputs: WDLBindings, inputs_search_path: List[str], toil: Toil, wdl_options: WDLContext, options: Namespace) -> WDLSectionJob: if options.run_imports_on_workers: - # Run WDL imports on a worker instead - root_job: WDLSectionJob = WDLImportJob(target, inputs, wdl_options=wdl_options, path=inputs_search_path, skip_remote=options.reference_inputs, disk_size=options.import_workers_disk) + root_job = WDLImportWrapper(target, inputs, wdl_options=wdl_options, inputs_search_path=inputs_search_path, import_remote_files=options.reference_inputs, import_workers_threshold=options.import_workers_threshold) else: # Run WDL imports on leader # Import any files in the bindings @@ -3724,10 +4002,6 @@ def main() -> None: # TODO: Move cwltoil's generate_default_job_store where we can use it options.jobStore = os.path.join(mkdtemp(), 'tree') - # Take care of incompatible arguments related to file imports - if options.run_imports_on_workers is True and options.import_workers_disk is None: - raise RuntimeError("Commandline arguments --runImportsOnWorkers and --importWorkersDisk must both be set to run file imports on workers.") - # Make sure we have an output directory (or URL prefix) and 
we don't need # to ever worry about a None, and MyPy knows it. # If we don't have a directory assigned, make one in the current directory. @@ -3823,8 +4097,6 @@ def main() -> None: # Get the execution directory execution_dir = os.getcwd() - convert_remote_files(input_bindings, toil._jobStore, task_path=target.name, search_paths=inputs_search_path, import_remote_files=options.reference_inputs) - # Configure workflow interpreter options wdl_options: WDLContext = {"execution_dir": execution_dir, "container": options.container, "task_path": target.name, "namespace": target.name, "all_call_outputs": options.all_call_outputs} From ccea6b0570a2c6694eac7bcd21d01502d0a3fe7b Mon Sep 17 00:00:00 2001 From: stxue1 Date: Thu, 24 Oct 2024 10:45:43 -0700 Subject: [PATCH 02/20] Fix bug with importworker options --- src/toil/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/toil/common.py b/src/toil/common.py index 130f541e40..b0062466c8 100644 --- a/src/toil/common.py +++ b/src/toil/common.py @@ -702,7 +702,7 @@ def addOptions(parser: ArgumentParser, jobstore_as_flag: bool = False, cwl: bool add_cwl_options(parser, suppress=not cwl) add_wdl_options(parser, suppress=not wdl) # Add shared runner options - add_runner_options(parser) + add_runner_options(parser, cwl=cwl, wdl=wdl) def check_arguments(typ: str) -> None: """ @@ -716,7 +716,7 @@ def check_arguments(typ: str) -> None: add_cwl_options(check_parser) if typ == "cwl": add_wdl_options(check_parser) - add_runner_options(check_parser) + for action in check_parser._actions: action.default = SUPPRESS other_options, _ = check_parser.parse_known_args(sys.argv[1:], ignore_help_args=True) From 3cd4c61853ee813820ff2695eb71ae40034723a3 Mon Sep 17 00:00:00 2001 From: stxue1 Date: Thu, 24 Oct 2024 10:46:17 -0700 Subject: [PATCH 03/20] Asbtract WDL and CWL imports and enforce a better method --- src/toil/cwl/cwltoil.py | 330 +++++++++++++++++++++++++++++++--------- src/toil/job.py | 159 ++++++++++++++++++- src/toil/wdl/wdltoil.py | 200 +++++++----------------- 3 files changed, 471 insertions(+), 218 deletions(-) diff --git a/src/toil/cwl/cwltoil.py b/src/toil/cwl/cwltoil.py index 473fc374e7..40babe7d5a 100644 --- a/src/toil/cwl/cwltoil.py +++ b/src/toil/cwl/cwltoil.py @@ -50,7 +50,8 @@ Type, TypeVar, Union, - cast) + cast, Sequence) +from urllib.error import HTTPError from urllib.parse import quote, unquote, urlparse, urlsplit import cwl_utils.errors @@ -102,6 +103,7 @@ from toil.cwl import check_cwltool_version from toil.lib.misc import call_command from toil.provisioners.clusterScaler import JobTooBigError +from toil.wdl.wdltoil import is_url check_cwltool_version() from toil.cwl.utils import (CWL_UNSUPPORTED_REQUIREMENT_EXCEPTION, @@ -112,7 +114,7 @@ from toil.exceptions import FailedJobsException from toil.fileStores import FileID from toil.fileStores.abstractFileStore import AbstractFileStore -from toil.job import AcceleratorRequirement, Job, Promise, Promised, unwrap +from toil.job import AcceleratorRequirement, Job, Promise, Promised, unwrap, ParseableIndivisibleResource, unwrap_all, ImportsJob, FileMetadata from toil.jobStores.abstractJobStore import (AbstractJobStore, NoSuchFileException, LocatorException, InvalidImportExportUrlException, UnimplementedURLException) from toil.jobStores.fileJobStore import FileJobStore @@ -1783,17 +1785,64 @@ def path_to_loc(obj: CWLObjectType) -> None: del obj["path"] -def import_files( - import_function: Callable[[str], FileID], +def extract_files(fileindex: Dict[str, str], + existing: 
Dict[str, str], + file_metadata: CWLObjectType, + mark_broken: bool = False, + skip_remote: bool = False) -> Optional[str]: + """ + Extract the filename from a CWL file record + :param fileindex: Forward mapping of filename + :param existing: Reverse mapping of filename. This function does not use this + :param file_metadata: CWL file record + :param mark_broken: Whether files should be marked as missing + :param skip_remote: Whether to skip remote files + :return: + """ + location = cast(str, file_metadata["location"]) + + if location in fileindex: + file_metadata["location"] = fileindex[location] + return + if not location and file_metadata["path"]: + file_metadata["location"] = location = schema_salad.ref_resolver.file_uri( + cast(str, file_metadata["path"]) + ) + if location.startswith("file://") and not os.path.isfile( + schema_salad.ref_resolver.uri_file_path(location) + ): + if mark_broken: + logger.debug("File %s is missing", file_metadata) + file_metadata["location"] = location = MISSING_FILE + else: + raise cwl_utils.errors.WorkflowException("File is missing: %s" % file_metadata) + if location.startswith("file://") or not skip_remote: + # This is a local file, or we also need to download and re-upload remote files + if location not in fileindex: + # don't download twice + # These dictionaries are meant to keep track of what we're going to import + # In the actual import, this is used as a bidirectional mapping from unvirtualized to virtualized + # see write_file + if not urlparse(location).scheme: + rp = os.path.realpath(location) + else: + rp = location + fileindex[rp] = rp + existing[rp] = rp + return rp + return None + + +def visit_files( + func: Callable[..., Any], fs_access: StdFsAccess, fileindex: Dict[str, str], existing: Dict[str, str], cwl_object: Optional[CWLObjectType], mark_broken: bool = False, skip_remote: bool = False, - bypass_file_store: bool = False, - log_level: int = logging.DEBUG -) -> None: + bypass_file_store: bool = False +) -> List[Any]: """ Prepare all files and directories. @@ -1839,18 +1888,12 @@ def import_files( :param log_level: Log imported files at the given level. """ + func_return = list() tool_id = cwl_object.get("id", str(cwl_object)) if cwl_object else "" logger.debug("Importing files for %s", tool_id) logger.debug("Importing files in %s", cwl_object) - def import_and_log(url: str) -> FileID: - """ - Upload a file and log that we are doing so. - """ - logger.log(log_level, "Loading %s...", url) - return import_function(url) - # We need to upload all files to the Toil filestore, and encode structure # recursively into all Directories' locations. But we cannot safely alter # the listing fields of Directory objects, because the handling required by @@ -1868,7 +1911,7 @@ def import_and_log(url: str) -> FileID: if bypass_file_store: # Don't go on to actually import files or encode contents for # directories. - return + return func_return # Otherwise we actually want to put the things in the file store. @@ -1946,11 +1989,9 @@ def visit_file_or_directory_up( # This is a CWL File result: DirectoryContents = {} - - # Upload the file itself, which will adjust its location. 
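A minimal sketch of the location normalization extract_files performs before recording a file for import: plain local paths are resolved with realpath so duplicate references collapse to one entry, while URIs with a scheme are left untouched. The paths below are hypothetical.

    import os
    from urllib.parse import urlparse

    def normalize(location: str) -> str:
        return location if urlparse(location).scheme else os.path.realpath(location)

    print(normalize("data/../data/reads.fastq"))  # absolute, symlink-free local path
    print(normalize("s3://bucket/ref.fa"))        # remote URI, unchanged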
- upload_file( - import_and_log, fileindex, existing, rec, mark_broken=mark_broken, skip_remote=skip_remote - ) + # Run a function on the file and store the return + func_return.append(func(fileindex, existing, rec, + mark_broken=mark_broken, skip_remote=skip_remote)) # Make a record for this file under its name result[cast(str, rec["basename"])] = cast(str, rec["location"]) @@ -1993,6 +2034,7 @@ def visit_file_or_directory_up( visit_file_or_directory_down, visit_file_or_directory_up, ) + return func_return def upload_directory( @@ -2754,12 +2796,15 @@ def run(self, file_store: AbstractFileStore) -> Any: fs_access = runtime_context.make_fs_access(runtime_context.basedir) # And a file importer that can go from a file:// URI to a Toil FileID - file_import_function = functools.partial(writeGlobalFileWrapper, file_store) + def file_import_function(url: str, log_level: int = logging.DEBUG) -> FileID: + logger.log(log_level, "Loading %s...", url) + return writeGlobalFileWrapper(file_store, url) + file_upload_function = functools.partial(upload_file, file_import_function) # Upload all the Files and set their and the Directories' locations, if # needed. - import_files( - file_import_function, + visit_files( + file_upload_function, fs_access, index, existing, @@ -2804,15 +2849,14 @@ def makeRootJob( :return: """ - if options.run_imports_on_workers: - import_job = CWLImportJob(initialized_job_order, tool, runtime_context, options) + import_job = CWLImportWrapper(initialized_job_order, tool, runtime_context, options) return import_job else: import_workflow_inputs(toil._jobStore, options, initialized_job_order=initialized_job_order, tool=tool) - rootJob, followOn = makeJob(tool, jobobj, runtime_context, None, None) # toplevel, no name needed - rootJob.cwljob = initialized_job_order - return rootJob + root_job, followOn = makeJob(tool, jobobj, runtime_context, None, None) # toplevel, no name needed + root_job.cwljob = initialized_job_order + return root_job def makeJob( @@ -3362,55 +3406,201 @@ def run( return UnresolvedDict(outobj) -class CWLSetupJob(CWLNamedJob): - """ - Job to take a CWL tool and job order with all files imported and makes a CWLWorkflow as a child to run it. - """ - def __init__(self, initialized_job_order: Promised[CWLObjectType], tool: Promised[Process], runtime_context: cwltool.context.RuntimeContext): - super().__init__() +class CWLInstallImportsJob(Job): + def __init__(self, initialized_job_order: Promised[CWLObjectType], tool: Promised[Process], basedir: str, + skip_remote: bool, bypass_file_store: bool, import_data: Promised[Tuple[Dict[str, FileID], Dict[str, FileMetadata]]], + **kwargs: Any) -> None: + """ + Job to take the entire CWL object and a mapping of filenames to the imported URIs + to convert all file locations to URIs. + + This class is only used when runImportsOnWorkers is enabled. + """ + super().__init__(local=True, **kwargs) self.initialized_job_order = initialized_job_order self.tool = tool - self.runtime_context = runtime_context + self.basedir = basedir + self.skip_remote = skip_remote + self.bypass_file_store = bypass_file_store + self.import_data = import_data - def run(self, file_store: AbstractFileStore) -> Any: + def run(self, file_store: AbstractFileStore) -> Tuple[CWLObjectType, Process]: """ - :return: Returns a CWL object that represents the output of the workflow. + Convert the filenames in the workflow inputs into the URIs + :return: Promise of transformed workflow inputs. 
A tuple of the job order and process """ + candidate_to_fileid = unwrap(self.import_data)[0] + initialized_job_order = unwrap(self.initialized_job_order) tool = unwrap(self.tool) - root_job, _ = makeJob(tool, initialized_job_order, self.runtime_context, None, None) - self.addChild(root_job) - root_job.cwljob = initialized_job_order + def convert_file(filename: str) -> FileID: + fileid = candidate_to_fileid[filename] + return fileid - return root_job.rv() + file_convert_function = functools.partial(upload_file, convert_file) + fs_access = ToilFsAccess(self.basedir) + fileindex: Dict[str, str] = {} + existing: Dict[str, str] = {} + visit_files( + file_convert_function, + fs_access, + fileindex, + existing, + initialized_job_order, + mark_broken=True, + skip_remote=self.skip_remote, + bypass_file_store=self.bypass_file_store + ) + # logger.info("Importing tool-associated files...") + visitSteps( + tool, + functools.partial( + visit_files, + file_convert_function, + fs_access, + fileindex, + existing, + mark_broken=True, + skip_remote=self.skip_remote, + bypass_file_store=self.bypass_file_store + ), + ) + # We always expect to have processed all files that exist + for param_name, param_value in initialized_job_order.items(): + # Loop through all the parameters for the workflow overall. + # Drop any files that aren't either imported (for when we use + # the file store) or available on disk (for when we don't). + # This will properly make them cause an error later if they + # were required. + rm_unprocessed_secondary_files(param_value) -class CWLImportJob(CWLNamedJob): + return initialized_job_order, tool + + +class CWLImportWrapper(CWLNamedJob): """ - Job to do file imports on a worker instead of a leader. Assumes all local and cloud files are accessible. + Job to organize importing files on workers instead of the leader. Responsible for extracting filenames and metadata, + calling ImportsJob, applying imports to the job objects, and scheduling the start workflow job This class is only used when runImportsOnWorkers is enabled. """ def __init__(self, initialized_job_order: CWLObjectType, tool: Process, runtime_context: cwltool.context.RuntimeContext, options: Namespace): - super().__init__(local=False, disk=options.import_workers_disk) + super().__init__(local=False, disk=options.import_workers_threshold) self.initialized_job_order = initialized_job_order self.tool = tool self.options = options self.runtime_context = runtime_context def run(self, file_store: AbstractFileStore) -> Any: - """ - Import the workflow inputs and then create and run the workflow. 
- :return: Promise of workflow outputs - """ - import_workflow_inputs(file_store.jobStore, self.options, self.initialized_job_order, self.tool) - setup_job = CWLSetupJob(self.initialized_job_order, self.tool, self.runtime_context) - self.addChild(setup_job) - return setup_job.rv() + filenames = extract_workflow_inputs(self.options, self.initialized_job_order, self.tool) + file_to_data = get_file_sizes(filenames, file_store.jobStore, self.options.reference_inputs) + imports_job = ImportsJob(file_to_data, self.options.import_workers_threshold) + self.addChild(imports_job) + install_imports_job = CWLInstallImportsJob(initialized_job_order=self.initialized_job_order, tool=self.tool, basedir=self.options.basedir, + skip_remote=self.options.reference_inputs, bypass_file_store=self.options.bypass_file_store, + import_data=imports_job.rv()) + self.addChild(install_imports_job) + imports_job.addFollowOn(install_imports_job) + + start_job = CWLStartJob(install_imports_job.rv(), runtime_context=self.runtime_context) + self.addChild(start_job) + install_imports_job.addFollowOn(start_job) + + return start_job.rv() -def import_workflow_inputs(jobstore: AbstractJobStore, options: Namespace, initialized_job_order: CWLObjectType, tool: Process) -> None: +class CWLStartJob(CWLNamedJob): + """ + Job responsible for starting the CWL workflow + """ + def __init__(self, inputs: Promised[Tuple[CWLObjectType, CWLObjectType]], runtime_context: cwltool.context.RuntimeContext, **kwargs: Any) -> None: + super().__init__(**kwargs) + self.inputs = inputs + self.runtime_context = runtime_context + + def run(self, file_store: AbstractFileStore) -> Any: + initialized_job_order = unwrap(self.inputs)[0] + tool = unwrap(self.inputs)[1] + root_job, _ = makeJob(tool, initialized_job_order, self.runtime_context, None, None) # toplevel, no name needed + root_job.cwljob = initialized_job_order + self.addChild(root_job) + return root_job.rv() + + +def get_file_sizes(filenames: List[str], file_source: AbstractJobStore, import_remote_files: bool) -> Dict[str, FileMetadata]: + file_to_data = {} + for filename in filenames: + try: + if not import_remote_files and is_url(filename): + if not file_source.url_exists(filename): + continue + filesize = file_source.get_size(filename) + parent_dir = os.path.dirname(filename) + except UnimplementedURLException as e: + # We can't find anything that can even support this URL scheme. + # Report to the user, they are probably missing an extra. + logger.critical('Error: ' + str(e)) + raise + except HTTPError as e: + # Something went wrong looking for it there. + logger.warning("Checked URL %s but got HTTP status %s", filename, e.code) + # Try the next location. + continue + except FileNotFoundError: + # Wasn't found there + continue + except Exception: + # Something went wrong besides the file not being found. Maybe + # we have no auth. 
+ logger.error("Something went wrong when testing for existence of %s", filename) + raise + + # filename and parent dir in the tuple are not used in CWL + # but this allows generalization with the WDL runner's importOnWorkers functionality + file_to_data[filename] = cast(FileMetadata, (filename, parent_dir, filesize)) + return file_to_data + + +def extract_workflow_inputs(options: Namespace, initialized_job_order: CWLObjectType, tool: Process) -> List[str]: + fileindex: Dict[str, str] = {} + existing: Dict[str, str] = {} + + # Grab all the input files + logger.info("Grabbing input files...") + fs_access = ToilFsAccess(options.basedir) + filenames = visit_files( + extract_files, + fs_access, + fileindex, + existing, + initialized_job_order, + mark_broken=True, + skip_remote=options.reference_inputs, + bypass_file_store=options.bypass_file_store + ) + # Grab all the files associated with tools (binaries, etc.). + logger.info("Grabbing tool-associated files...") + tool_filenames = visitSteps( + tool, + functools.partial( + visit_files, + extract_files, + fs_access, + fileindex, + existing, + mark_broken=True, + skip_remote=options.reference_inputs, + bypass_file_store=options.bypass_file_store + ), + ) + filenames.extend(tool_filenames) + return filenames + + +def import_workflow_inputs(jobstore: AbstractJobStore, options: Namespace, initialized_job_order: CWLObjectType, + tool: Process, log_level: int = logging.DEBUG) -> None: fileindex: Dict[str, str] = {} existing: Dict[str, str] = {} # Define something we can call to import a file and get its file @@ -3418,16 +3608,15 @@ def import_workflow_inputs(jobstore: AbstractJobStore, options: Namespace, initi # We cast this because import_file is overloaded depending on if we # pass a shared file name or not, and we know the way we call it we # always get a FileID out. - file_import_function = cast( - Callable[[str], FileID], - functools.partial(jobstore.import_file, symlink=True), - ) + def file_import_function(url: str) -> FileID: + logger.log(log_level, "Loading %s...", url) + return cast(FileID, jobstore.import_file(url, symlink=True)) # Import all the input files, some of which may be missing optional # files. logger.info("Importing input files...") fs_access = ToilFsAccess(options.basedir) - import_files( + visit_files( file_import_function, fs_access, fileindex, @@ -3435,8 +3624,7 @@ def import_workflow_inputs(jobstore: AbstractJobStore, options: Namespace, initi initialized_job_order, mark_broken=True, skip_remote=options.reference_inputs, - bypass_file_store=options.bypass_file_store, - log_level=logging.INFO, + bypass_file_store=options.bypass_file_store ) # Import all the files associated with tools (binaries, etc.). # Not sure why you would have an optional secondary file here, but @@ -3445,15 +3633,14 @@ def import_workflow_inputs(jobstore: AbstractJobStore, options: Namespace, initi visitSteps( tool, functools.partial( - import_files, + visit_files, file_import_function, fs_access, fileindex, existing, mark_broken=True, skip_remote=options.reference_inputs, - bypass_file_store=options.bypass_file_store, - log_level=logging.INFO, + bypass_file_store=options.bypass_file_store ), ) @@ -3469,8 +3656,8 @@ def import_workflow_inputs(jobstore: AbstractJobStore, options: Namespace, initi def visitSteps( cmdline_tool: Process, - op: Callable[[CommentedMap], None], -) -> None: + op: Callable[[CommentedMap], List[Any]], +) -> List[Any]: """ Iterate over a CWL Process object, running the op on each tool description CWL object. 
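A minimal sketch of the accumulation pattern visitSteps is being changed to follow: apply the op to each step's tool, recurse into embedded sub-workflows, and extend one flat result list. The nested dictionaries are hypothetical stand-ins for cwltool Process objects.

    from typing import Any, Callable, Dict, List

    def visit_steps(process: Dict[str, Any], op: Callable[[Dict[str, Any]], List[str]]) -> List[str]:
        if "steps" in process:  # workflow: dispatch on its steps
            ret: List[str] = []
            for step in process["steps"]:
                ret.extend(op(step["tool"]))
                ret.extend(visit_steps(step["embedded_tool"], op))  # may itself be a workflow
            return ret
        return op(process.get("tool", process))  # plain tool: apply op directly

    wf = {"steps": [{"tool": {"id": "step1"},
                     "embedded_tool": {"tool": {"id": "inner"}}}]}
    assert visit_steps(wf, lambda tool: [tool["id"]]) == ["step1", "inner"]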
@@ -3479,13 +3666,15 @@ def visitSteps( # For workflows we need to dispatch on steps for step in cmdline_tool.steps: # Handle the step's tool - op(step.tool) + ret = op(step.tool) # Recures on the embedded tool; maybe it's a workflow. - visitSteps(step.embedded_tool, op) + recurse_ret = visitSteps(step.embedded_tool, op) + ret.extend(recurse_ret) + return ret elif isinstance(cmdline_tool, cwltool.process.Process): # All CWL Process objects (including CommandLineTool) will have tools # if they bothered to run the Process __init__. - op(cmdline_tool.tool) + return op(cmdline_tool.tool) else: raise RuntimeError( f"Unsupported type encountered in workflow " @@ -3759,11 +3948,6 @@ def main(args: Optional[List[str]] = None, stdout: TextIO = sys.stdout) -> int: options = get_options(args) - # Take care of incompatible arguments related to file imports - if options.run_imports_on_workers is True and options.import_workers_disk is None: - logger.error("Commandline arguments --runImportsOnWorkers and --importWorkersDisk must both be set to run file imports on workers.") - return 1 - # Do cwltool setup cwltool.main.setup_schema(args=options, custom_schema_callback=None) tmpdir_prefix = options.tmpdir_prefix = options.tmpdir_prefix or DEFAULT_TMPDIR_PREFIX diff --git a/src/toil/job.py b/src/toil/job.py index 4517911864..70ed1ebd37 100644 --- a/src/toil/job.py +++ b/src/toil/job.py @@ -11,6 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import annotations + import collections import copy import importlib @@ -46,6 +48,7 @@ from configargparse import ArgParser +from toil import memoize from toil.bus import Names from toil.lib.compatibility import deprecated @@ -76,7 +79,7 @@ if TYPE_CHECKING: from optparse import OptionParser - from toil.batchSystems.abstractBatchSystem import BatchJobExitReason + from toil.batchSystems.abstractBatchSystem import BatchJobExitReason, InsufficientSystemResources from toil.fileStores.abstractFileStore import AbstractFileStore from toil.jobStores.abstractJobStore import AbstractJobStore @@ -3484,6 +3487,160 @@ def getUserScript(self): return self.serviceModule +class FileMetadata(NamedTuple): + """ + Metadata for a file. + source is the URL to grab the file from + parent_dir is parent directory of the source + size is the size of the file. Is none if the filesize cannot be retrieved. + """ + source: str + parent_dir: str + size: Optional[int] + + +class CombineImportsJob(Job): + """ + Combine the outputs of multiple WorkerImportsJob into one promise + """ + def __init__(self, d: Sequence[Promised[Dict[str, FileID]]], **kwargs): + """ + Merge the dicts + :param d: Sequence of dictionaries to merge + """ + self._d = d + super().__init__(**kwargs) + + def run(self, file_store: AbstractFileStore) -> Promised[Dict[str, FileID]]: + d = unwrap_all(self._d) + return {k: v for item in d for k, v in item.items()} + + +class WorkerImportJob(Job): + """ + Job to do file imports on a worker instead of a leader. Assumes all local and cloud files are accessible. + + For the CWL/WDL runners, this class is only used when runImportsOnWorkers is enabled. 
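A minimal sketch of the dictionary merge CombineImportsJob performs: several per-worker {filename: FileID} mappings are flattened into a single mapping. Plain strings stand in for FileID objects.

    partial_results = [
        {"file:///data/a.txt": "fileid-a"},
        {"file:///data/b.txt": "fileid-b", "s3://bucket/c.txt": "fileid-c"},
    ]
    merged = {k: v for item in partial_results for k, v in item.items()}
    assert len(merged) == 3 and merged["s3://bucket/c.txt"] == "fileid-c"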
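WorkerImportJob (below) first tries a streaming import without reserving disk and, if that is rejected for lack of disk, reschedules a non-streaming attempt with an explicit disk request. A minimal sketch of that retry idea; the exception class here is a hypothetical stand-in for Toil's InsufficientSystemResources.

    class NotEnoughDisk(Exception):
        """Hypothetical stand-in for InsufficientSystemResources."""
        def __init__(self, requested: int):
            self.resource, self.requested = "disk", requested

    def attempt_import(stream: bool, disk: int, file_size: int) -> str:
        if stream and file_size > disk:  # streaming turned out not to be possible
            raise NotEnoughDisk(requested=file_size)
        return "imported"

    def import_with_fallback(file_size: int) -> str:
        try:
            return attempt_import(stream=True, disk=0, file_size=file_size)
        except NotEnoughDisk as e:
            if e.resource == "disk":
                # retry non-streaming with enough disk reserved for the file
                return attempt_import(stream=False, disk=e.requested, file_size=file_size)
            raise

    assert import_with_fallback(file_size=5_000_000_000) == "imported"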
+ """ + def __init__(self, filenames: List[str], disk_size: Optional[ParseableIndivisibleResource] = None, stream: bool = True, **kwargs: Any): + if stream: + super().__init__(local=False, **kwargs) + else: + super().__init__(local=False, disk=disk_size, **kwargs) + self.filenames = filenames + self.disk_size = disk_size + + @staticmethod + def import_files(files: List[str], file_source: "AbstractJobStore") -> Dict[str, FileID]: + path_to_fileid = {} + + @memoize + def import_filename(filename: str) -> Optional[FileID]: + return file_source.import_file(filename, symlink=True) + + for file in files: + imported = import_filename(file) + if imported is not None: + path_to_fileid[file] = imported + return path_to_fileid + + def run(self, file_store: AbstractFileStore) -> Promised[Dict[str, FileID]]: + """ + Import the workflow inputs and then create and run the workflow. + :return: Promise of workflow outputs + """ + try: + return self.import_files(self.filenames, file_store.jobStore) + except InsufficientSystemResources as e: + if e.resource == "disk" and e.requested < self.disk_size: + non_streaming_import = WorkerImportJob(self.filenames, self.disk_size, streaming=False) + self.addChild(non_streaming_import) + return non_streaming_import.rv() + else: + raise + + +class ImportsJob(Job): + """ + Job to organize and delegate files to individual WorkerImportJobs. + + For the CWL/WDL runners, this is only used when runImportsOnWorkers is enabled + """ + def __init__(self, file_to_data: Dict[str, FileMetadata], import_workers_threshold, **kwargs: Any): + """ + Job to take the inputs from the WDL workflow and import them on a worker instead of a leader. Assumes all local and cloud files are accessible. + + This class is only used when runImportsOnWorkers is enabled. + """ + super().__init__(local=True, **kwargs) + self._file_to_data = file_to_data + self._import_workers_threshold = import_workers_threshold + + def run(self, file_store: AbstractFileStore) -> Tuple[Promised[Dict[str, FileID]], Dict[str, FileMetadata]]: + """ + Import the workflow inputs and then create and run the workflow. + :return: Tuple of a mapping from the candidate uri to the file id and a mapping of the source filenames to its metadata. 
The candidate uri is a field in the file metadata + """ + import_workers_threshold = self._import_workers_threshold + file_to_data = self._file_to_data + # Run WDL imports on a worker instead + + individual_files = [] + batch_files = [] + # Separate out files that will be imported individually and in a batch + for filename, (candidate_uri, parent_id, size) in file_to_data.items(): + if size < import_workers_threshold: + batch_files.append(filename) + else: + individual_files.append(filename) + + # Create individual import jobs + import_jobs = [] + for file in individual_files: + filesize = file_to_data[file][2] + candidate_uri = file_to_data[file][0] + import_jobs.append(WorkerImportJob([candidate_uri], disk_size=filesize)) + + # create multiple batch jobs if necessary to not exceed the threshold + per_batch_files = [] + per_batch_size = 0 + # This list will hold lists of batched filenames + to_batch = [] + while len(batch_files) > 0 or len(per_batch_files) > 0: + if len(batch_files) == 0 and len(per_batch_files) > 0: + to_batch.append(per_batch_files) + break + + filename = batch_files[0] + # See if adding this to the queue will make the batch job too big + filesize = file_to_data[filename][2] + if per_batch_size + filesize >= import_workers_threshold: + # batch is too big now, store to schedule the batch + to_batch.append(per_batch_files) + # reset batching calculation + per_batch_size = 0 + else: + per_batch_size += filesize + per_batch_files.append(filename) + batch_files.pop(0) + + # Create batch import jobs for each group of files + for batch in to_batch: + candidate_uris = [file_to_data[filename][0] for filename in batch] + batch_size = sum(file_to_data[filename][2] for filename in batch) + import_jobs.append(WorkerImportJob(candidate_uris, disk_size=batch_size)) + + for job in import_jobs: + self.addChild(job) + + combine_imports_job = CombineImportsJob([job.rv() for job in import_jobs]) + for job in import_jobs: + job.addFollowOn(combine_imports_job) + self.addChild(combine_imports_job) + + return combine_imports_job.rv(), file_to_data + + class Promise: """ References a return value from a method as a *promise* before the method itself is run. 
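A minimal sketch of the size-based split and greedy batching that ImportsJob.run aims for: files at or above the threshold each get their own import job, while smaller files are packed into batches whose combined size stays under the threshold. The cut-off handling here is simplified relative to the patch, and the sizes and threshold are hypothetical.

    from typing import Dict, List, Tuple

    def plan_imports(sizes: Dict[str, int], threshold: int) -> Tuple[List[str], List[List[str]]]:
        individual = [f for f, s in sizes.items() if s >= threshold]
        small = [f for f, s in sizes.items() if s < threshold]
        batches: List[List[str]] = []
        current: List[str] = []
        current_size = 0
        for f in small:
            if current and current_size + sizes[f] >= threshold:
                batches.append(current)  # close the batch before it grows past the threshold
                current, current_size = [], 0
            current.append(f)
            current_size += sizes[f]
        if current:
            batches.append(current)
        return individual, batches

    sizes = {"big.bam": 2_000_000_000, "a.txt": 100, "b.txt": 200, "c.txt": 300}
    individual, batches = plan_imports(sizes, threshold=1_000_000_000)
    assert individual == ["big.bam"] and batches == [["a.txt", "b.txt", "c.txt"]]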
diff --git a/src/toil/wdl/wdltoil.py b/src/toil/wdl/wdltoil.py index afd8620e38..34a760ec98 100755 --- a/src/toil/wdl/wdltoil.py +++ b/src/toil/wdl/wdltoil.py @@ -48,7 +48,7 @@ TypeVar, Union, cast, - TypedDict, IO) + TypedDict, IO, NamedTuple) if sys.version_info < (3, 11): from typing_extensions import NotRequired @@ -84,7 +84,7 @@ parse_accelerator, unwrap, unwrap_all, - ParseableIndivisibleResource) + ParseableIndivisibleResource, WorkerImportJob, ImportsJob, FileMetadata) from toil.jobStores.abstractJobStore import (AbstractJobStore, UnimplementedURLException, InvalidImportExportUrlException, LocatorException) from toil.lib.accelerators import get_individual_local_accelerators @@ -803,16 +803,23 @@ def is_url(filename: str, schemes: List[str]=['http:', 'https:', 's3:', 'gs:', ' return True return False +def extract_workflow_inputs(environment: WDLBindings) -> List[str]: + filenames = list() + def add_filename(file: WDL.Value.File) -> WDL.Value.File: + filenames.append(file.value) + return file + map_over_files_in_bindings(environment, add_filename) + return filenames -def get_file_sizes(environment: WDLBindings, file_source: AbstractJobStore, task_path: str, search_paths: Optional[List[str]] = None, import_remote_files: bool = True, - execution_dir: Optional[str] = None) -> Dict[str, Tuple[str, uuid.UUID, Optional[int]]]: +def get_file_sizes(filenames: List[str], file_source: AbstractJobStore, search_paths: Optional[List[str]] = None, import_remote_files: bool = True, + execution_dir: Optional[str] = None) -> Dict[str, FileMetadata]: """ Resolve relative-URI files in the given environment and them then into absolute normalized URIs. Returns a dictionary of WDL file values to a tuple of the normalized URI, parent directory ID, and size of the file. The size of the file may be None, which means unknown size. Will set the value of the File to the relative-URI. - :param environment: Bindings to evaluate on + :param filenames: list of filenames to evaluate on :param file_source: Context to search for files with :param task_path: Dotted WDL name of the user-level code doing the importing (probably the workflow name). @@ -820,12 +827,8 @@ def get_file_sizes(environment: WDLBindings, file_source: AbstractJobStore, task directories in this list. :param import_remote_files: If set, import files from remote locations. Else leave them as URI references. """ - # path_to_id: Dict[str, uuid.UUID] = {} - path_to_id: Dict[str, uuid.UUID] = {} - file_to_data = {} - @memoize - def get_filename_size(filename: str) -> None: + def get_filename_size(filename: str) -> FileMetadata: tried = [] for candidate_uri in potential_absolute_uris(filename, search_paths if search_paths is not None else [], execution_dir=execution_dir): tried.append(candidate_uri) @@ -883,28 +886,11 @@ def get_filename_size(filename: str) -> None: # Must be a local path parent_dir = os.path.dirname(candidate_uri) - # Pack a UUID of the parent directory - dir_id = path_to_id.setdefault(parent_dir, uuid.uuid4()) - - file_to_data.setdefault(filename, (candidate_uri, dir_id, filesize)) - - return + return cast(FileMetadata, (candidate_uri, parent_dir, filesize)) # Not found raise RuntimeError(f"Could not find {filename} at any of: {list(potential_absolute_uris(filename, search_paths if search_paths is not None else []))}") - def get_file_size(file: WDL.Value.File) -> WDL.Value.File: - """ - Detect if any potential URI exists. Will convert a file's value to a URI and import it. 
- - Separated out from convert_file_to_url in order to properly memoize and avoid importing the same file twice - :param filename: Filename to import - :return: Tuple of the uri the file was found at and the virtualized import - """ - # Search through any input search paths passed in and download it if found - get_filename_size(file.value) - return file - map_over_files_in_bindings(environment, get_file_size) - return file_to_data + return {k: get_filename_size(k) for k in filenames} def import_files(files: List[str], file_source: AbstractJobStore) -> Dict[str, FileID]: """ @@ -936,7 +922,7 @@ def import_filename(filename: str) -> Optional[FileID]: return path_to_fileid -def convert_files(environment: WDLBindings, file_to_id: Dict[str, FileID], file_to_data: Dict[str, Tuple[str, uuid.UUID, int]], task_path: str) -> None: +def convert_files(environment: WDLBindings, file_to_id: Dict[str, FileID], file_to_data: Dict[str, FileMetadata], task_path: str) -> None: """ Resolve relative-URI files in the given environment and import all files. @@ -944,6 +930,8 @@ def convert_files(environment: WDLBindings, file_to_id: Dict[str, FileID], file_ :param environment: Bindings to evaluate on """ + dir_ids = {t[1] for t in file_to_data.values()} + dir_to_id = {k: uuid.uuid4() for k in dir_ids} def convert_file_to_uri(file: WDL.Value.File) -> WDL.Value.File: """ @@ -951,7 +939,6 @@ def convert_file_to_uri(file: WDL.Value.File) -> WDL.Value.File: """ candidate_uri = file_to_data[file.value][0] file_id = file_to_id.get(candidate_uri) - dir_id = file_to_data[file.value][1] # Work out what the basename for the file was file_basename = os.path.basename(urlsplit(candidate_uri).path) @@ -961,7 +948,7 @@ def convert_file_to_uri(file: WDL.Value.File) -> WDL.Value.File: # download them at that basename later. raise RuntimeError(f"File {candidate_uri} has no basename and so cannot be a WDL File") - toil_uri = pack_toil_uri(file_id, task_path, dir_id, file_basename) + toil_uri = pack_toil_uri(file_id, task_path, dir_to_id[file_to_data[file.value][1]], file_basename) setattr(file, "virtualized_value", toil_uri) file.value = candidate_uri @@ -3806,7 +3793,7 @@ class WDLStartJob(WDLSectionJob): the workflow name; both forms are accepted. """ - def __init__(self, target: Union[WDL.Tree.Workflow, WDL.Tree.Task], inputs: WDLBindings, wdl_options: WDLContext, **kwargs: Any) -> None: + def __init__(self, target: Union[WDL.Tree.Workflow, WDL.Tree.Task], inputs: Promised[WDLBindings], wdl_options: WDLContext, **kwargs: Any) -> None: """ Create a subtree to run the workflow and namespace the outputs. """ @@ -3821,86 +3808,53 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: """ Actually build the subgraph. """ + inputs = unwrap(self._inputs) super().run(file_store) if isinstance(self._target, WDL.Tree.Workflow): # Create a workflow job. We rely in this to handle entering the input # namespace if needed, or handling free-floating inputs. - job: WDLBaseJob = WDLWorkflowJob(self._target, [self._inputs], [self._target.name], wdl_options=self._wdl_options, local=True) + job: WDLBaseJob = WDLWorkflowJob(self._target, [inputs], [self._target.name], wdl_options=self._wdl_options, local=True) else: # There is no workflow. Create a task job. 
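A minimal sketch of the directory grouping convert_files sets up: each distinct parent directory gets one UUID, so files that came from the same source directory share a directory ID when packed into Toil URIs. The paths are hypothetical and pack() is only a placeholder for pack_toil_uri, not its real format.

    import os
    import uuid

    files = ["/data/run1/reads_1.fastq", "/data/run1/reads_2.fastq", "/data/ref/ref.fa"]
    dir_to_id = {parent: uuid.uuid4() for parent in {os.path.dirname(f) for f in files}}

    def pack(file_id: str, dir_id: uuid.UUID, basename: str) -> str:
        return f"toilfile:{file_id}:{dir_id}:{basename}"  # placeholder format

    for f in files:
        print(pack("some-file-id", dir_to_id[os.path.dirname(f)], os.path.basename(f)))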
- job = WDLTaskWrapperJob(self._target, [self._inputs], [self._target.name], wdl_options=self._wdl_options, local=True) + job = WDLTaskWrapperJob(self._target, [inputs], [self._target.name], wdl_options=self._wdl_options, local=True) # Run the task or workflow job.then_namespace(self._wdl_options["namespace"]) self.addChild(job) self.defer_postprocessing(job) return job.rv() -class WDLImportJob(WDLSectionJob): - def __init__(self, target: Union[WDL.Tree.Workflow, WDL.Tree.Task], inputs: List[str], wdl_options: WDLContext, path: Optional[List[str]] = None, skip_remote: bool = False, - disk_size: Optional[ParseableIndivisibleResource] = None, stream: bool = True, **kwargs: Any): - """ - Job to take the inputs from the WDL workflow and import them on a worker instead of a leader. Assumes all local and cloud files are accessible. - - This class is only used when runImportsOnWorkers is enabled. - """ - if stream: - super().__init__(wdl_options=wdl_options, local=False, **kwargs) - else: - super().__init__(wdl_options=wdl_options, local=False, disk=disk_size, **kwargs) - - self._target = target - self._inputs = inputs - self._path = path - self._skip_remote = skip_remote - self._disk_size = disk_size - - def run(self, file_store: AbstractFileStore) -> Promised[Dict[str, FileID]]: - """ - Import the workflow inputs and then create and run the workflow. - :return: Promise of workflow outputs - """ - try: - return import_files(self._inputs, file_store.jobStore) - except InsufficientSystemResources as e: - # If importing via streaming, then the job's disk size will not be set. Detect this to create a nonstreaming import job with sufficient disk space - # else, this is the nonstreaming import job and the exception is valid, so raise - if e.resource == "disk" and e.requested < self._disk_size: - non_streaming_import = WDLImportJob(self._target, self._inputs, wdl_options=self._wdl_options, disk=self._disk_size, stream=False, - path=self._path, skip_remote=self._skip_remote) - self.addChild(non_streaming_import) - return non_streaming_import.rv() - else: - raise - -class WDLCombineImportsJob(WDLSectionJob): - def __init__(self, task_path: str, target: Union[WDL.Tree.Workflow, WDL.Tree.Task], inputs: WDLBindings, file_to_fileid: Sequence[Promised[Dict[str, FileID]]], - file_to_data: Dict[str, Tuple[str, uuid.UUID, int]], wdl_options: WDLContext, **kwargs: Any) -> None: +class WDLInstallImportsJob(Job): + def __init__(self, task_path: str, inputs: WDLBindings, + import_data: Promised[Tuple[Dict[str, FileID]], Dict[str, FileMetadata]], **kwargs: Any) -> None: """ - Job to take the inputs from the WDL workflow and import them on a worker instead of a leader. Assumes all local and cloud files are accessible. + Job to take the inputs from the WDL workflow and a mapping of filenames to imported URIs + to convert all file locations to URIs in each binding. This class is only used when runImportsOnWorkers is enabled. """ - super().__init__(local=True, wdl_options=wdl_options, **kwargs) - self._file_to_fileid = file_to_fileid + super().__init__(local=True, **kwargs) + self._import_data = import_data self._inputs = inputs - self._target = target - self._file_to_data = file_to_data self._task_path = task_path def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: """ - Import the workflow inputs and then create and run the workflow. 
- :return: Promise of workflow outputs + Convert the filenames in the workflow inputs ito the URIs + :return: Promise of transformed workflow inputs """ - - file_to_fileid = {k: v for d in unwrap_all(self._file_to_fileid) for k, v in d.items()} - convert_files(self._inputs, file_to_fileid, self._file_to_data, self._task_path) - root_job = WDLStartJob(self._target, self._inputs, wdl_options=self._wdl_options) - self.addChild(root_job) - return root_job.rv() + candidate_to_fileid = unwrap(self._import_data)[0] + file_to_data = unwrap(self._import_data)[1] + convert_files(self._inputs, candidate_to_fileid, file_to_data, self._task_path) + return self._inputs class WDLImportWrapper(WDLSectionJob): + """ + Job to organize importing files on workers instead of the leader. Responsible for extracting filenames and metadata, + calling ImportsJob, applying imports to input bindings, and scheduling the start workflow job + + This class is only used when runImportsOnWorkers is enabled. + """ def __init__(self, target: Union[WDL.Tree.Workflow, WDL.Tree.Task], inputs: WDLBindings, wdl_options: WDLContext, inputs_search_path, import_remote_files, import_workers_threshold, **kwargs: Any): """ @@ -3916,62 +3870,20 @@ def __init__(self, target: Union[WDL.Tree.Workflow, WDL.Tree.Task], inputs: WDLB self._import_workers_threshold = import_workers_threshold def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: - """ - Import the workflow inputs and then create and run the workflow. - :return: Promise of workflow outputs - """ - inputs = self._inputs - target = self._target - inputs_search_path = self._inputs_search_path - wdl_options = self._wdl_options - import_remote_files = self._import_remote_files - import_workers_threshold = self._import_workers_threshold - # Run WDL imports on a worker instead - file_to_data = get_file_sizes(inputs, file_store.jobStore, target.name, inputs_search_path, import_remote_files=import_remote_files) - - individual_import_files = [] - batch_import_files = [] - for filename, (candidate_uri, parent_id, size) in file_to_data.items(): - if size < import_workers_threshold: - batch_import_files.append(filename) - else: - individual_import_files.append(filename) - - import_jobs = [] - for file in individual_import_files: - filesize = file_to_data[file][2] - filename = file_to_data[file][0] - import_jobs.append(WDLImportJob(target, [filename], wdl_options=wdl_options, path=inputs_search_path, skip_remote=import_remote_files, disk_size=filesize)) - - # create multiple batch jobs if necessary to not exceed the threshold - # todo: improve this logic - per_batch_files = [] - per_batch_size = 0 - while len(batch_import_files) > 0: - peek_filename = batch_import_files[0] - # See if adding this to the queue will make the batch job too big - filesize = file_to_data[peek_filename][2] - if per_batch_size + filesize >= import_workers_threshold: - # If so, it's too big, schedule the queue - import_jobs.append(WDLImportJob(target, per_batch_files, wdl_options=wdl_options, path=inputs_search_path, skip_remote=import_remote_files, disk_size=per_batch_size)) - # reset - per_batch_size = 0 - per_batch_files = [] - else: - per_batch_size += filesize - candidate_uri = file_to_data[peek_filename][0] - per_batch_files.append(candidate_uri) - batch_import_files.pop(0) - if len(per_batch_files) > 0: - import_jobs.append( - WDLImportJob(target, per_batch_files, wdl_options=wdl_options, path=inputs_search_path, skip_remote=import_remote_files, disk_size=per_batch_size)) - combine_imports_job = 
WDLCombineImportsJob(target.name, target, inputs, [job.rv() for job in import_jobs], file_to_data, wdl_options=wdl_options) - - for job in import_jobs: - self.addChild(job) - self.addFollowOn(combine_imports_job) - - return combine_imports_job.rv() + filenames = extract_workflow_inputs(self._inputs) + file_to_data = get_file_sizes(filenames, file_store.jobStore, self._inputs_search_path, import_remote_files=self._import_remote_files) + imports_job = ImportsJob(file_to_data, self._import_workers_threshold) + self.addChild(imports_job) + install_imports_job = WDLInstallImportsJob(self._target.name, self._inputs, imports_job.rv()) + self.addChild(install_imports_job) + imports_job.addFollowOn(install_imports_job) + + start_job = WDLStartJob(self._target, install_imports_job.rv(), wdl_options=self._wdl_options) + self.addChild(start_job) + install_imports_job.addFollowOn(start_job) + + return start_job.rv() + def make_root_job(target: Union[WDL.Tree.Workflow, WDL.Tree.Task], inputs: WDLBindings, inputs_search_path: List[str], toil: Toil, wdl_options: WDLContext, options: Namespace) -> WDLSectionJob: if options.run_imports_on_workers: From 642efd02ad890c7ee1a0522183ece5a396fa3a7a Mon Sep 17 00:00:00 2001 From: stxue1 Date: Thu, 24 Oct 2024 10:59:35 -0700 Subject: [PATCH 04/20] Remove unused import --- src/toil/wdl/wdltoil.py | 30 ------------------------------ 1 file changed, 30 deletions(-) diff --git a/src/toil/wdl/wdltoil.py b/src/toil/wdl/wdltoil.py index d5c2b624ff..5811bc209c 100755 --- a/src/toil/wdl/wdltoil.py +++ b/src/toil/wdl/wdltoil.py @@ -1125,36 +1125,6 @@ def get_filename_size(filename: str) -> FileMetadata: return {k: get_filename_size(k) for k in filenames} -def import_files(files: List[str], file_source: AbstractJobStore) -> Dict[str, FileID]: - """ - Import all files from the list into the jobstore. Returns a dictionary mapping the filename to it's FileID - :param files: list of filenames - :param file_source: jobstore object - :return: - """ - - @memoize - def import_filename(filename: str) -> Optional[FileID]: - """ - Detect if any potential URI exists. Will convert a file's value to a URI and import it. - - Separated out from convert_file_to_url in order to properly memoize and avoid importing the same file twice - :param filename: Filename to import - :return: Tuple of the uri the file was found at and the virtualized import - """ - # Search through any input search paths passed in and download it if found - # Actually import - # Try to import the file. 
If we can't find it, continue - return file_source.import_file(filename) - - path_to_fileid = {} - for file in files: - imported = import_filename(file) - if imported is not None: - path_to_fileid[file] = imported - return path_to_fileid - - def convert_files( environment: WDLBindings, file_to_id: Dict[str, FileID], From e712d887223d3b479e0712d6dfe91f59f4ab13e1 Mon Sep 17 00:00:00 2001 From: stxue1 Date: Thu, 24 Oct 2024 11:55:13 -0700 Subject: [PATCH 05/20] satisfy mypy --- src/toil/cwl/cwltoil.py | 12 +++++++----- src/toil/job.py | 4 ++-- src/toil/wdl/wdltoil.py | 14 +++++++------- 3 files changed, 16 insertions(+), 14 deletions(-) diff --git a/src/toil/cwl/cwltoil.py b/src/toil/cwl/cwltoil.py index 51ad20d8b9..ef38a105f6 100644 --- a/src/toil/cwl/cwltoil.py +++ b/src/toil/cwl/cwltoil.py @@ -54,7 +54,7 @@ TypeVar, Union, cast, - Sequence, + Sequence, Literal, ) from urllib.error import HTTPError from urllib.parse import quote, unquote, urlparse, urlsplit @@ -1857,7 +1857,7 @@ def extract_files( if location in fileindex: file_metadata["location"] = fileindex[location] - return + return None if not location and file_metadata["path"]: file_metadata["location"] = location = schema_salad.ref_resolver.file_uri( cast(str, file_metadata["path"]) @@ -1944,7 +1944,7 @@ def visit_files( :param log_level: Log imported files at the given level. """ - func_return = list() + func_return: List[Any] = list() tool_id = cwl_object.get("id", str(cwl_object)) if cwl_object else "" logger.debug("Importing files for %s", tool_id) @@ -3653,7 +3653,7 @@ class CWLStartJob(CWLNamedJob): def __init__( self, - inputs: Promised[Tuple[CWLObjectType, CWLObjectType]], + inputs: Promised[Tuple[CWLObjectType, Process]], runtime_context: cwltool.context.RuntimeContext, **kwargs: Any, ) -> None: @@ -3765,7 +3765,7 @@ def import_workflow_inputs( # always get a FileID out. def file_import_function(url: str) -> FileID: logger.log(log_level, "Loading %s...", url) - return cast(FileID, jobstore.import_file(url, symlink=True)) + return jobstore.import_file(url, symlink=True) # Import all the input files, some of which may be missing optional # files. @@ -3835,6 +3835,8 @@ def visitSteps( f"Unsupported type encountered in workflow " f"traversal: {type(cmdline_tool)}" ) + # Satisfy mypy, but this branch should never be reached in practice + return [] def rm_unprocessed_secondary_files(job_params: Any) -> None: diff --git a/src/toil/job.py b/src/toil/job.py index 2789488b46..4a1bbc15e7 100644 --- a/src/toil/job.py +++ b/src/toil/job.py @@ -3874,8 +3874,8 @@ class ImportsJob(Job): def __init__( self, file_to_data: Dict[str, FileMetadata], - import_workers_threshold, - **kwargs: Any, + import_workers_threshold: ParseableIndivisibleResource, + **kwargs: Any ): """ Job to take the inputs from the WDL workflow and import them on a worker instead of a leader. Assumes all local and cloud files are accessible. diff --git a/src/toil/wdl/wdltoil.py b/src/toil/wdl/wdltoil.py index 5811bc209c..bbb1de2045 100755 --- a/src/toil/wdl/wdltoil.py +++ b/src/toil/wdl/wdltoil.py @@ -1146,7 +1146,7 @@ def convert_file_to_uri(file: WDL.Value.File) -> WDL.Value.File: Calls import_filename to detect if a potential URI exists and imports it. Will modify the File object value to the new URI and tack on the virtualized file. 
""" candidate_uri = file_to_data[file.value][0] - file_id = file_to_id.get(candidate_uri) + file_id = file_to_id[candidate_uri] # Work out what the basename for the file was file_basename = os.path.basename(urlsplit(candidate_uri).path) @@ -4877,7 +4877,7 @@ def __init__( self, task_path: str, inputs: WDLBindings, - import_data: Promised[Tuple[Dict[str, FileID]], Dict[str, FileMetadata]], + import_data: Promised[Tuple[Dict[str, FileID], Dict[str, FileMetadata]]], **kwargs: Any, ) -> None: """ @@ -4915,9 +4915,9 @@ def __init__( target: Union[WDL.Tree.Workflow, WDL.Tree.Task], inputs: WDLBindings, wdl_options: WDLContext, - inputs_search_path, - import_remote_files, - import_workers_threshold, + inputs_search_path: List[str], + import_remote_files: bool, + import_workers_threshold: ParseableIndivisibleResource, **kwargs: Any, ): """ @@ -4960,13 +4960,13 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: def make_root_job( target: WDL.Tree.Workflow | WDL.Tree.Task, inputs: WDLBindings, - inputs_search_path: list[str], + inputs_search_path: List[str], toil: Toil, wdl_options: WDLContext, options: Namespace, ) -> WDLSectionJob: if options.run_imports_on_workers: - root_job = WDLImportWrapper( + root_job: WDLSectionJob = WDLImportWrapper( target, inputs, wdl_options=wdl_options, From e6c13fc35e7e2425f82c52e1f9dca8ad52d1642d Mon Sep 17 00:00:00 2001 From: stxue1 Date: Fri, 25 Oct 2024 10:26:36 -0700 Subject: [PATCH 06/20] Fix WDL import in CWL --- src/toil/cwl/cwltoil.py | 5 +++-- src/toil/job.py | 33 ++++++++++++++++++++++++++++++++- src/toil/wdl/wdltoil.py | 36 ++++-------------------------------- 3 files changed, 39 insertions(+), 35 deletions(-) diff --git a/src/toil/cwl/cwltoil.py b/src/toil/cwl/cwltoil.py index ef38a105f6..bbc5a364aa 100644 --- a/src/toil/cwl/cwltoil.py +++ b/src/toil/cwl/cwltoil.py @@ -54,7 +54,8 @@ TypeVar, Union, cast, - Sequence, Literal, + Sequence, + Literal, ) from urllib.error import HTTPError from urllib.parse import quote, unquote, urlparse, urlsplit @@ -113,7 +114,6 @@ from toil.cwl import check_cwltool_version from toil.lib.misc import call_command from toil.provisioners.clusterScaler import JobTooBigError -from toil.wdl.wdltoil import is_url check_cwltool_version() from toil.cwl.utils import ( @@ -136,6 +136,7 @@ unwrap_all, ImportsJob, FileMetadata, + is_url, ) from toil.jobStores.abstractJobStore import ( AbstractJobStore, diff --git a/src/toil/job.py b/src/toil/job.py index 4a1bbc15e7..6cbfc99fd5 100644 --- a/src/toil/job.py +++ b/src/toil/job.py @@ -91,6 +91,37 @@ logger = logging.getLogger(__name__) +# We define a URI scheme kind of like but not actually compatible with the one +# we use for CWL. CWL brings along the file basename in its file type, but +# WDL.Value.File doesn't. So we need to make sure we stash that somewhere in +# the URI. +# TODO: We need to also make sure files from the same source directory end up +# in the same destination directory, when dealing with basename conflicts. 
+ +TOIL_URI_SCHEME = "toilfile:" + + +def is_toil_url(filename: str) -> bool: + return is_url(filename, schemes=[TOIL_URI_SCHEME]) + + +def is_standard_url(filename: str) -> bool: + return is_url(filename, ["http:", "https:", "s3:", "gs:", "ftp:"]) + + +def is_url( + filename: str, + schemes: list[str] = ["http:", "https:", "s3:", "gs:", "ftp:", TOIL_URI_SCHEME], +) -> bool: + """ + Decide if a filename is a known kind of URL + """ + for scheme in schemes: + if filename.startswith(scheme): + return True + return False + + class JobPromiseConstraintError(RuntimeError): """ Error for job being asked to promise its return value, but it not available. @@ -3875,7 +3906,7 @@ def __init__( self, file_to_data: Dict[str, FileMetadata], import_workers_threshold: ParseableIndivisibleResource, - **kwargs: Any + **kwargs: Any, ): """ Job to take the inputs from the WDL workflow and import them on a worker instead of a leader. Assumes all local and cloud files are accessible. diff --git a/src/toil/wdl/wdltoil.py b/src/toil/wdl/wdltoil.py index bbb1de2045..6509fe3475 100755 --- a/src/toil/wdl/wdltoil.py +++ b/src/toil/wdl/wdltoil.py @@ -91,9 +91,12 @@ unwrap, unwrap_all, ParseableIndivisibleResource, - WorkerImportJob, ImportsJob, FileMetadata, + TOIL_URI_SCHEME, + is_url, + is_standard_url, + is_toil_url, ) from toil.jobStores.abstractJobStore import ( AbstractJobStore, @@ -758,16 +761,6 @@ def parse_disks( return specified_mount_point, part_size, part_suffix -# We define a URI scheme kind of like but not actually compatible with the one -# we use for CWL. CWL brings along the file basename in its file type, but -# WDL.Value.File doesn't. So we need to make sure we stash that somewhere in -# the URI. -# TODO: We need to also make sure files from the same source directory end up -# in the same destination directory, when dealing with basename conflicts. 
- -TOIL_URI_SCHEME = "toilfile:" - - def pack_toil_uri( file_id: FileID, task_path: str, dir_id: uuid.UUID, file_basename: str ) -> str: @@ -992,27 +985,6 @@ def _call_eager( return WDL.Value.Float(total_size) -def is_toil_url(filename: str) -> bool: - return is_url(filename, schemes=[TOIL_URI_SCHEME]) - - -def is_standard_url(filename: str) -> bool: - return is_url(filename, ["http:", "https:", "s3:", "gs:", "ftp:"]) - - -def is_url( - filename: str, - schemes: list[str] = ["http:", "https:", "s3:", "gs:", "ftp:", TOIL_URI_SCHEME], -) -> bool: - """ - Decide if a filename is a known kind of URL - """ - for scheme in schemes: - if filename.startswith(scheme): - return True - return False - - def extract_workflow_inputs(environment: WDLBindings) -> List[str]: filenames = list() From ae1e279b13c18a01483222ffeb37fd597e1229d1 Mon Sep 17 00:00:00 2001 From: stxue1 Date: Fri, 25 Oct 2024 10:33:53 -0700 Subject: [PATCH 07/20] Fix WDL comments --- src/toil/cwl/cwltoil.py | 1 - src/toil/wdl/wdltoil.py | 4 +--- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/src/toil/cwl/cwltoil.py b/src/toil/cwl/cwltoil.py index bbc5a364aa..f0dc3c0f52 100644 --- a/src/toil/cwl/cwltoil.py +++ b/src/toil/cwl/cwltoil.py @@ -3570,7 +3570,6 @@ def convert_file(filename: str) -> FileID: skip_remote=self.skip_remote, bypass_file_store=self.bypass_file_store, ) - # logger.info("Importing tool-associated files...") visitSteps( tool, functools.partial( diff --git a/src/toil/wdl/wdltoil.py b/src/toil/wdl/wdltoil.py index 6509fe3475..76f0f7a816 100755 --- a/src/toil/wdl/wdltoil.py +++ b/src/toil/wdl/wdltoil.py @@ -1007,8 +1007,6 @@ def get_file_sizes( Resolve relative-URI files in the given environment and them then into absolute normalized URIs. Returns a dictionary of WDL file values to a tuple of the normalized URI, parent directory ID, and size of the file. The size of the file may be None, which means unknown size. - Will set the value of the File to the relative-URI. - :param filenames: list of filenames to evaluate on :param file_source: Context to search for files with :param task_path: Dotted WDL name of the user-level code doing the @@ -1104,7 +1102,7 @@ def convert_files( task_path: str, ) -> None: """ - Resolve relative-URI files in the given environment and import all files. + Resolve relative-URI files in the given environment convert the file values to a new value made from a given mapping. Will set the value of the File to the relative-URI. 
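The patches up to this point settle on a two-mapping handoff: the import jobs return a
mapping from each resolved URI to its FileID, while file_to_data keeps each original
filename's resolved URI, parent directory ID, and size. convert_files then walks the WDL
bindings and rewrites every File from those two tables. Below is a minimal sketch of that
walk, assuming pack_toil_uri and map_over_files_in_bindings from toil.wdl.wdltoil; the
real code in these patches also records the packed toilfile: URI as the File's
virtualized value rather than only computing it:

    import os
    from urllib.parse import urlsplit

    from toil.wdl.wdltoil import map_over_files_in_bindings, pack_toil_uri

    def convert_files_sketch(environment, file_to_id, file_to_data, task_path):
        def convert_file_to_uri(file):
            # file_to_data: filename -> (resolved URI, parent directory ID, size)
            candidate_uri, parent_id, _size = file_to_data[file.value]
            # file_to_id: resolved URI -> imported FileID
            file_id = file_to_id[candidate_uri]
            file_basename = os.path.basename(urlsplit(candidate_uri).path)
            # Pack a toilfile: URI carrying the basename and source-directory ID.
            toil_uri = pack_toil_uri(file_id, task_path, parent_id, file_basename)
            file.value = toil_uri  # sketch only; the patch keeps this as File metadata
            return file

        map_over_files_in_bindings(environment, convert_file_to_uri)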
From 1d89969b9565d0dc2b729129bd150d622beae4ab Mon Sep 17 00:00:00 2001 From: stxue1 Date: Wed, 30 Oct 2024 08:31:31 -0700 Subject: [PATCH 08/20] move url functions to job, remove dead imports, format with black --- src/toil/cwl/cwltoil.py | 8 +- src/toil/job.py | 35 +++++-- src/toil/wdl/wdltoil.py | 208 +++++++++++++++++++++++++--------------- 3 files changed, 159 insertions(+), 92 deletions(-) diff --git a/src/toil/cwl/cwltoil.py b/src/toil/cwl/cwltoil.py index f0dc3c0f52..7d271013fc 100644 --- a/src/toil/cwl/cwltoil.py +++ b/src/toil/cwl/cwltoil.py @@ -50,11 +50,9 @@ Optional, TextIO, Tuple, - Type, TypeVar, Union, cast, - Sequence, Literal, ) from urllib.error import HTTPError @@ -132,11 +130,9 @@ Promise, Promised, unwrap, - ParseableIndivisibleResource, - unwrap_all, ImportsJob, FileMetadata, - is_url, + is_remote_url, ) from toil.jobStores.abstractJobStore import ( AbstractJobStore, @@ -3678,7 +3674,7 @@ def get_file_sizes( file_to_data = {} for filename in filenames: try: - if not import_remote_files and is_url(filename): + if not import_remote_files and is_remote_url(filename): if not file_source.url_exists(filename): continue filesize = file_source.get_size(filename) diff --git a/src/toil/job.py b/src/toil/job.py index d5621006be..39dcfd3200 100644 --- a/src/toil/job.py +++ b/src/toil/job.py @@ -101,26 +101,43 @@ TOIL_URI_SCHEME = "toilfile:" -def is_toil_url(filename: str) -> bool: - return is_url(filename, schemes=[TOIL_URI_SCHEME]) - +STANDARD_SCHEMES = ["http:", "https:", "s3:", "gs:", "ftp:"] +REMOTE_SCHEMES = STANDARD_SCHEMES + [TOIL_URI_SCHEME] +ALL_SCHEMES = REMOTE_SCHEMES + ["file:"] def is_standard_url(filename: str) -> bool: - return is_url(filename, ["http:", "https:", "s3:", "gs:", "ftp:"]) + return is_url_with_scheme(filename, STANDARD_SCHEMES) +def is_remote_url(filename: str) -> bool: + """ + Decide if a filename is a known, non-file kind of URL + """ + return is_url_with_scheme(filename, REMOTE_SCHEMES) -def is_url( - filename: str, - schemes: list[str] = ["http:", "https:", "s3:", "gs:", "ftp:", TOIL_URI_SCHEME], -) -> bool: +def is_any_url(filename: str) -> bool: + """ + Decide if a string is a URI like http:// or file://. + + Otherwise it might be a bare path. + """ + return is_url_with_scheme(filename, ALL_SCHEMES) + +def is_url_with_scheme(filename: str, schemes: list[str]) -> bool: """ - Decide if a filename is a known kind of URL + Return True if filename is a URL with any of the given schemes and False otherwise. 
""" + # TODO: "http:myfile.dat" is a valid filename and *not* a valid URL for scheme in schemes: if filename.startswith(scheme): return True return False +def is_toil_url(filename: str) -> bool: + return is_url_with_scheme(filename, [TOIL_URI_SCHEME]) + +def is_file_url(filename: str) -> bool: + return is_url_with_scheme(filename, ["file:"]) + class JobPromiseConstraintError(RuntimeError): """ diff --git a/src/toil/wdl/wdltoil.py b/src/toil/wdl/wdltoil.py index fc1529b3de..ad34f3ed97 100755 --- a/src/toil/wdl/wdltoil.py +++ b/src/toil/wdl/wdltoil.py @@ -34,7 +34,7 @@ from collections.abc import Generator, Iterable, Iterator, Sequence from contextlib import ExitStack, contextmanager from graphlib import TopologicalSorter -from tempfile import mkstemp, gettempdir +from tempfile import mkstemp from typing import ( Any, Callable, @@ -45,16 +45,13 @@ List, Optional, Sequence, - Set, Tuple, - Type, TypeVar, Union, cast, TypedDict, IO, - NamedTuple, - Protocol + Protocol, ) if sys.version_info < (3, 11): @@ -96,9 +93,11 @@ ImportsJob, FileMetadata, TOIL_URI_SCHEME, - is_url, is_standard_url, is_toil_url, + is_any_url, + is_remote_url, + is_file_url, ) from toil.jobStores.abstractJobStore import ( AbstractJobStore, @@ -117,6 +116,7 @@ logger = logging.getLogger(__name__) + # We want to use hashlib.file_digest to avoid a 3-line hashing loop like # MiniWDL has. But it is only in 3.11+ # @@ -140,17 +140,21 @@ class ReadableFileObj(Protocol): Would extend the protocol from Typeshed for hashlib but those are only declared for 3.11+. """ + def readinto(self, buf: bytearray, /) -> int: ... def readable(self) -> bool: ... def read(self, number: int) -> bytes: ... + class FileDigester(Protocol): """ Protocol for the features we need from hashlib.file_digest. """ + # We need __ prefixes here or the name of the argument becomes part of the required interface. def __call__(self, __f: ReadableFileObj, __alg_name: str) -> hashlib._Hash: ... + try: # Don't do a direct conditional import to the final name here because then # the polyfill needs *exactly* the signature of file_digest, and not just @@ -162,7 +166,8 @@ def __call__(self, __f: ReadableFileObj, __alg_name: str) -> hashlib._Hash: ... # # TODO: Change to checking sys.version_info because MyPy understands that # better? - from hashlib import file_digest as file_digest_impl # type: ignore[attr-defined,unused-ignore] + from hashlib import file_digest as file_digest_impl # type: ignore[attr-defined,unused-ignore] + file_digest: FileDigester = file_digest_impl except ImportError: # Polyfill file_digest from 3.11+ @@ -174,6 +179,7 @@ def file_digest_fallback_impl(f: ReadableFileObj, alg_name: str) -> hashlib._Has hasher.update(buffer) buffer = f.read(BUFFER_SIZE) return hasher + file_digest = file_digest_fallback_impl # WDL options to pass into the WDL jobs and standard libraries @@ -878,6 +884,7 @@ def unpack_toil_uri(toil_uri: str) -> tuple[FileID, str, str, str]: return file_id, task_path, parent_id, file_basename + ### # Caching machinery and file accessors ### @@ -887,6 +894,7 @@ def unpack_toil_uri(toil_uri: str) -> tuple[FileID, str, str, str]: # We store the shared FS path in an attribute on the WDL File. SHARED_PATH_ATTR = "_shared_fs_path" + def clone_metadata(old_file: WDL.Value.File, new_file: WDL.Value.File) -> None: """ Copy all Toil metadata from one WDL File to another. 
@@ -895,6 +903,7 @@ def clone_metadata(old_file: WDL.Value.File, new_file: WDL.Value.File) -> None: if hasattr(old_file, attribute): setattr(new_file, attribute, getattr(old_file, attribute)) + def set_file_value(file: WDL.Value.File, new_value: str) -> WDL.Value.File: """ Return a copy of a WDL File with all metadata intact but the value changed. @@ -904,6 +913,7 @@ def set_file_value(file: WDL.Value.File, new_value: str) -> WDL.Value.File: clone_metadata(file, new_file) return new_file + def set_file_nonexistent(file: WDL.Value.File, nonexistent: bool) -> WDL.Value.File: """ Return a copy of a WDL File with all metadata intact but the nonexistent flag set to the given value. @@ -913,13 +923,17 @@ def set_file_nonexistent(file: WDL.Value.File, nonexistent: bool) -> WDL.Value.F setattr(new_file, "nonexistent", nonexistent) return new_file + def get_file_nonexistent(file: WDL.Value.File) -> bool: """ Return the nonexistent flag for a file. """ return cast(bool, getattr(file, "nonexistent", False)) -def set_file_virtualized_value(file: WDL.Value.File, virtualized_value: str) -> WDL.Value.File: + +def set_file_virtualized_value( + file: WDL.Value.File, virtualized_value: str +) -> WDL.Value.File: """ Return a copy of a WDL File with all metadata intact but the virtualized_value attribute set to the given value. """ @@ -928,12 +942,14 @@ def set_file_virtualized_value(file: WDL.Value.File, virtualized_value: str) -> setattr(new_file, "virtualized_value", virtualized_value) return new_file + def get_file_virtualized_value(file: WDL.Value.File) -> Optional[str]: """ Get the virtualized storage location for a file. """ return cast(Optional[str], getattr(file, "virtualized_value", None)) + def get_shared_fs_path(file: WDL.Value.File) -> Optional[str]: """ If a File has a shared filesystem path, get that path. @@ -942,10 +958,13 @@ def get_shared_fs_path(file: WDL.Value.File) -> Optional[str]: """ if hasattr(file, SHARED_PATH_ATTR): result = cast(str, getattr(file, SHARED_PATH_ATTR)) - assert not result.startswith("file://"), f"Found URI shared FS path of {result} on {file}" + assert not result.startswith( + "file://" + ), f"Found URI shared FS path of {result} on {file}" return result return None + def set_shared_fs_path(file: WDL.Value.File, path: str) -> WDL.Value.File: """ Return a copy of the given File associated with the given shared filesystem path. @@ -953,28 +972,39 @@ def set_shared_fs_path(file: WDL.Value.File, path: str) -> WDL.Value.File: This should be the path it was initially imported from, or the path that it has in the call cache. """ # We should not have URLs here, only real paths. - assert not path.startswith("file://"), f"Cannot assign URI shared FS path of {path} to {file}" + assert not path.startswith( + "file://" + ), f"Cannot assign URI shared FS path of {path} to {file}" new_file = WDL.Value.File(file.value, file.expr) clone_metadata(file, new_file) setattr(new_file, SHARED_PATH_ATTR, path) return new_file -def view_shared_fs_paths(bindings: WDL.Env.Bindings[WDL.Value.Base]) -> WDL.Env.Bindings[WDL.Value.Base]: + +def view_shared_fs_paths( + bindings: WDL.Env.Bindings[WDL.Value.Base], +) -> WDL.Env.Bindings[WDL.Value.Base]: """ Given WDL bindings, return a copy where all files have their shared filesystem paths as their values. """ + def file_path_to_use(file: WDL.Value.File) -> WDL.Value.File: """ Return a File at the shared FS path if we have one, or the original File otherwise. 
""" shared_path = get_shared_fs_path(file) result_path = shared_path or file.value - assert not result_path.startswith("file://"), f"Found file URI {result_path} instead of a path for file {file}" + assert not result_path.startswith( + "file://" + ), f"Found file URI {result_path} instead of a path for file {file}" return set_file_value(file, result_path) return map_over_files_in_bindings(bindings, file_path_to_use) -def poll_execution_cache(node: Union[WDL.Tree.Workflow, WDL.Tree.Task], bindings: WDLBindings) -> tuple[WDLBindings | None, str]: + +def poll_execution_cache( + node: Union[WDL.Tree.Workflow, WDL.Tree.Task], bindings: WDLBindings +) -> tuple[WDLBindings | None, str]: """ Return the cached result of calling this workflow or task, and its key. @@ -986,12 +1016,14 @@ def poll_execution_cache(node: Union[WDL.Tree.Workflow, WDL.Tree.Task], bindings transformed_bindings = view_shared_fs_paths(bindings) log_bindings(logger.debug, "Digesting input bindings:", [transformed_bindings]) input_digest = WDL.Value.digest_env(transformed_bindings) - cache_key=f"{node.name}/{node.digest}/{input_digest}" + cache_key = f"{node.name}/{node.digest}/{input_digest}" miniwdl_logger = logging.getLogger("MiniWDL") # TODO: Ship config from leader? It might not see the right environment. miniwdl_config = WDL.runtime.config.Loader(miniwdl_logger) miniwdl_cache = WDL.runtime.cache.new(miniwdl_config, miniwdl_logger) - cached_result: Optional[WDLBindings] = miniwdl_cache.get(cache_key, transformed_bindings, node.effective_outputs) + cached_result: Optional[WDLBindings] = miniwdl_cache.get( + cache_key, transformed_bindings, node.effective_outputs + ) if cached_result is not None: logger.info("Found call in cache") return cached_result, cache_key @@ -999,7 +1031,15 @@ def poll_execution_cache(node: Union[WDL.Tree.Workflow, WDL.Tree.Task], bindings logger.debug("No cache hit for %s", cache_key) return None, cache_key -def fill_execution_cache(cache_key: str, output_bindings: WDLBindings, file_store: AbstractFileStore, wdl_options: WDLContext, miniwdl_logger: Optional[logging.Logger] = None, miniwdl_config: Optional[WDL.runtime.config.Loader] = None) -> WDLBindings: + +def fill_execution_cache( + cache_key: str, + output_bindings: WDLBindings, + file_store: AbstractFileStore, + wdl_options: WDLContext, + miniwdl_logger: Optional[logging.Logger] = None, + miniwdl_config: Optional[WDL.runtime.config.Loader] = None, +) -> WDLBindings: """ Cache the result of calling a workflow or task. @@ -1034,7 +1074,9 @@ def fill_execution_cache(cache_key: str, output_bindings: WDLBindings, file_stor # # In that case we just pout up with useless/unreferenced files in the # cache. - output_directory = os.path.join(miniwdl_cache._call_cache_dir, cache_key, str(uuid.uuid4())) + output_directory = os.path.join( + miniwdl_cache._call_cache_dir, cache_key, str(uuid.uuid4()) + ) # Adjust all files in the output bindings to have shared FS paths outside the job store. def assign_shared_fs_path(file: WDL.Value.File) -> WDL.Value.File: @@ -1055,7 +1097,9 @@ def assign_shared_fs_path(file: WDL.Value.File) -> WDL.Value.File: if virtualized is None: # TODO: If we're passing things around by URL reference and # some of them are file: is this actually allowed? - raise RuntimeError(f"File {file} caught escaping from task unvirtualized") + raise RuntimeError( + f"File {file} caught escaping from task unvirtualized" + ) # We need to save this file somewhere. # This needs to exist before we can export to it. 
And now we know @@ -1071,13 +1115,14 @@ def assign_shared_fs_path(file: WDL.Value.File) -> WDL.Value.File: wdl_options, devirtualized_to_virtualized, virtualized_to_devirtualized, - export=True + export=True, ) # Remember where it went file = set_shared_fs_path(file, exported_path) return file + output_bindings = map_over_files_in_bindings(output_bindings, assign_shared_fs_path) # Save the bindings to the cache, representing all files with their shared filesystem paths. @@ -1264,9 +1309,6 @@ def _call_eager( # Return the result as a WDL float value return WDL.Value.Float(total_size) -STANDARD_SCHEMES = ["http:", "https:", "s3:", "gs:", "ftp:"] -REMOTE_SCHEMES = STANDARD_SCHEMES + [TOIL_URI_SCHEME] -ALL_SCHEMES = REMOTE_SCHEMES + ["file:"] def extract_workflow_inputs(environment: WDLBindings) -> List[str]: filenames = list() @@ -1277,11 +1319,7 @@ def add_filename(file: WDL.Value.File) -> WDL.Value.File: map_over_files_in_bindings(environment, add_filename) return filenames -def is_toil_url(filename: str) -> bool: - return is_url_with_scheme(filename, [TOIL_URI_SCHEME]) -def is_file_url(filename: str) -> bool: - return is_url_with_scheme(filename, ["file:"]) def get_file_sizes( filenames: List[str], @@ -1313,7 +1351,7 @@ def get_filename_size(filename: str) -> FileMetadata: ): tried.append(candidate_uri) try: - if not import_remote_files and is_url(candidate_uri): + if not import_remote_files and is_remote_url(candidate_uri): # Use remote URIs in place. But we need to find the one that exists. if not file_source.url_exists(candidate_uri): # Wasn't found there @@ -1358,7 +1396,7 @@ def get_filename_size(filename: str) -> FileMetadata: ) # Was actually found - if is_url(candidate_uri): + if is_remote_url(candidate_uri): # Might be a file URI or other URI. # We need to make sure file URIs and local paths that point to # the same place are treated the same. @@ -1425,33 +1463,6 @@ def convert_file_to_uri(file: WDL.Value.File) -> WDL.Value.File: map_over_files_in_bindings(environment, convert_file_to_uri) -def is_standard_url(filename: str) -> bool: - return is_url_with_scheme(filename, STANDARD_SCHEMES) - - -def is_remote_url(filename: str) -> bool: - """ - Decide if a filename is a known, non-file kind of URL - """ - return is_url_with_scheme(filename, REMOTE_SCHEMES) - -def is_any_url(filename: str) -> bool: - """ - Decide if a string is a URI like http:// or file://. - - Otherwise it might be a bare path. - """ - return is_url_with_scheme(filename, ALL_SCHEMES) - -def is_url_with_scheme(filename: str, schemes: list[str]) -> bool: - """ - Return True if filename is a URL with any of the given schemes and False otherwise. 
- """ - # TODO: "http:myfile.dat" is a valid filename and *not* a valid URL - for scheme in schemes: - if filename.startswith(scheme): - return True - return False def convert_remote_files( environment: WDLBindings, @@ -1610,8 +1621,12 @@ def convert_file_to_uri(file: WDL.Value.File) -> WDL.Value.File: # Was actually found and imported assert candidate_uri is not None assert toil_uri is not None - new_file = set_file_virtualized_value(set_file_value(file, candidate_uri), toil_uri) - if candidate_uri is not None and (is_file_url(candidate_uri) or not is_any_url(candidate_uri)): + new_file = set_file_virtualized_value( + set_file_value(file, candidate_uri), toil_uri + ) + if candidate_uri is not None and ( + is_file_url(candidate_uri) or not is_any_url(candidate_uri) + ): # We imported a file so we have a local path assert candidate_uri is not None if is_file_url(candidate_uri): @@ -1737,8 +1752,12 @@ def _read( # but I need to virtualize as well, so I can't remove one or the other. def _f(file: WDL.Value.File) -> WDL.Value.Base: if get_file_virtualized_value(file) is None: - file = set_file_virtualized_value(file, self._virtualize_filename(file.value)) - with open(self._devirtualize_filename(get_file_virtualized_value(file)), "r") as infile: + file = set_file_virtualized_value( + file, self._virtualize_filename(file.value) + ) + with open( + self._devirtualize_filename(get_file_virtualized_value(file)), "r" + ) as infile: return parse(infile.read()) return _f @@ -1772,7 +1791,11 @@ def _devirtualize_file(self, file: WDL.Value.File) -> WDL.Value.File: if virtualized_filename is not None: devirtualized_path = self._devirtualize_filename(virtualized_filename) file = set_file_value(file, devirtualized_path) - logger.debug("For virtualized filename %s got devirtualized file %s", virtualized_filename, file) + logger.debug( + "For virtualized filename %s got devirtualized file %s", + virtualized_filename, + file, + ) else: logger.debug("File has no virtualized value so not changing value") return file @@ -1803,7 +1826,9 @@ def _virtualize_file( logger.debug("File appears nonexistent so marking it nonexistent") return set_file_nonexistent(file, True) virtualized_filename = self._virtualize_filename(file.value) - logger.debug('For file %s got virtualized filename %s', file, virtualized_filename) + logger.debug( + "For file %s got virtualized filename %s", file, virtualized_filename + ) marked_file = set_file_virtualized_value(file, virtualized_filename) return marked_file @@ -1878,9 +1903,7 @@ def _devirtualize_uri( file_id, dest_path, mutable=False, symlink=True ) else: - raise RuntimeError( - f"Unsupported file source: {file_source}" - ) + raise RuntimeError(f"Unsupported file source: {file_source}") else: # Download to a local file with the right name and execute bit. # Open it exclusively @@ -2070,6 +2093,7 @@ def _virtualize_filename(self, filename: str) -> str: self._virtualized_to_devirtualized[result] = abs_filename return result + class ToilWDLStdLibWorkflow(ToilWDLStdLibBase): """ Standard library implementation for workflow scope. @@ -2118,21 +2142,29 @@ def wrapper(v: WDL.Value.Base) -> WDL.Value.File: miniwdl_logger = logging.getLogger("MiniWDL") # TODO: Ship config from leader? It might not see the right environment. 
miniwdl_config = WDL.runtime.config.Loader(miniwdl_logger) - self._miniwdl_cache = WDL.runtime.cache.new(miniwdl_config, miniwdl_logger) + self._miniwdl_cache = WDL.runtime.cache.new( + miniwdl_config, miniwdl_logger + ) # TODO: If we did this before the _virtualize_filename call in the # base _write we wouldn't need to immediately devirtualize. But we # have internal caches to lean on. devirtualized_filename = self._devirtualize_filename(virtualized_file.value) # Hash the file to hex - hex_digest = file_digest(open(devirtualized_filename, "rb"), "sha256").hexdigest() + hex_digest = file_digest( + open(devirtualized_filename, "rb"), "sha256" + ).hexdigest() file_input_bindings = WDL.Env.Bindings( - WDL.Env.Binding("file_sha256", cast(WDL.Value.Base, WDL.Value.String(hex_digest))) + WDL.Env.Binding( + "file_sha256", cast(WDL.Value.Base, WDL.Value.String(hex_digest)) + ) ) # Make an environment of "file_sha256" to that as a WDL string, and # digest that, and make a write_ cache key. No need to transform to # shared FS paths sonce no paths are in it. - log_bindings(logger.debug, "Digesting file bindings:", [file_input_bindings]) + log_bindings( + logger.debug, "Digesting file bindings:", [file_input_bindings] + ) input_digest = WDL.Value.digest_env(file_input_bindings) file_cache_key = "write_/" + input_digest # Construct a description of the types we expect to get from the @@ -2173,7 +2205,7 @@ def wrapper(v: WDL.Value.Base) -> WDL.Value.File: self._wdl_options, {}, {}, - export=True + export=True, ) # Save the cache entry pointing to it @@ -2181,7 +2213,7 @@ def wrapper(v: WDL.Value.Base) -> WDL.Value.File: file_cache_key, WDL.Env.Bindings( WDL.Env.Binding("file", WDL.Value.File(exported_path)) - ) + ), ) # Apply the shared filesystem path to the virtualized file @@ -2725,6 +2757,7 @@ def add_paths(task_container: TaskContainer, host_paths: Iterable[str]) -> None: task_container.input_path_map[host_path] = container_path task_container.input_path_map_rev[container_path] = host_path + def drop_if_missing( file: WDL.Value.File, standard_library: ToilWDLStdLibBase ) -> WDL.Value.File | None: @@ -2887,7 +2920,9 @@ def map_over_typed_files_in_value( # This is a file so we need to process it orig_file_value = value.value new_file = transform(value) - assert value.value == orig_file_value, "Transformation mutated the original File" + assert ( + value.value == orig_file_value + ), "Transformation mutated the original File" if new_file is None: # Assume the transform checked types if we actually care about the # result. @@ -3218,7 +3253,11 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: # TODO: What if the same file is passed through several tasks, and # we get cache hits on those tasks? Won't we upload it several # times? 
- return self.postprocess(virtualize_files(cached_result, standard_library, enforce_existence=False)) + return self.postprocess( + virtualize_files( + cached_result, standard_library, enforce_existence=False + ) + ) if self._task.inputs: logger.debug("Evaluating task code") @@ -3729,7 +3768,9 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: # A current limitation with the singularity/miniwdl cache is it cannot check for image updates if the # filename is the same singularity_cache = os.path.join(os.path.expanduser("~"), ".singularity") - miniwdl_singularity_cache = os.path.join(os.path.expanduser("~"), ".cache/miniwdl") + miniwdl_singularity_cache = os.path.join( + os.path.expanduser("~"), ".cache/miniwdl" + ) # Cache Singularity's layers somewhere known to have space os.environ["SINGULARITY_CACHEDIR"] = os.environ.get( @@ -4168,7 +4209,14 @@ def get_path_in_container(file: WDL.Value.File) -> WDL.Value.File | None: if self._cache_key is not None: # We might need to save to the execution cache - output_bindings = fill_execution_cache(self._cache_key, output_bindings, file_store, self._wdl_options, miniwdl_logger=miniwdl_logger, miniwdl_config=miniwdl_config) + output_bindings = fill_execution_cache( + self._cache_key, + output_bindings, + file_store, + self._wdl_options, + miniwdl_logger=miniwdl_logger, + miniwdl_config=miniwdl_config, + ) # Do postprocessing steps to e.g. apply namespaces. output_bindings = self.postprocess(output_bindings) @@ -5185,7 +5233,11 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: # or calculated inputs filled in). cached_result, cache_key = poll_execution_cache(self._workflow, bindings) if cached_result is not None: - return self.postprocess(virtualize_files(cached_result, standard_library, enforce_existence=False)) + return self.postprocess( + virtualize_files( + cached_result, standard_library, enforce_existence=False + ) + ) if self._workflow.inputs: try: @@ -5212,7 +5264,7 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: sink.rv(), wdl_options=self._wdl_options, cache_key=cache_key, - local=True + local=True, ) sink.addFollowOn(outputs_job) # Caller is responsible for making sure namespaces are applied @@ -5333,7 +5385,9 @@ def run(self, file_store: AbstractFileStore) -> WDLBindings: ) if self._cache_key is not None: - output_bindings = fill_execution_cache(self._cache_key, output_bindings, file_store, self._wdl_options) + output_bindings = fill_execution_cache( + self._cache_key, output_bindings, file_store, self._wdl_options + ) return self.postprocess(output_bindings) From ff5152644f25fd8bb1f4ba797d46fbaae5519f2e Mon Sep 17 00:00:00 2001 From: stxue1 Date: Wed, 6 Nov 2024 14:50:40 -0800 Subject: [PATCH 09/20] Address comments --- src/toil/cwl/cwltoil.py | 228 ++++++++++++-------------- src/toil/job.py | 319 +++++++++++++++++++++++++++---------- src/toil/lib/io.py | 41 +++++ src/toil/options/runner.py | 4 +- src/toil/wdl/wdltoil.py | 214 +++---------------------- 5 files changed, 399 insertions(+), 407 deletions(-) diff --git a/src/toil/cwl/cwltoil.py b/src/toil/cwl/cwltoil.py index 9df8f2221b..7c79e35719 100644 --- a/src/toil/cwl/cwltoil.py +++ b/src/toil/cwl/cwltoil.py @@ -53,7 +53,7 @@ TypeVar, Union, cast, - Literal, + Literal, Protocol, ) from urllib.error import HTTPError from urllib.parse import quote, unquote, urlparse, urlsplit @@ -132,13 +132,13 @@ unwrap, ImportsJob, FileMetadata, - is_remote_url, + is_remote_url, get_file_sizes, ) from toil.jobStores.abstractJobStore 
import ( AbstractJobStore, NoSuchFileException, - LocatorException, InvalidImportExportUrlException, + LocatorException, UnimplementedURLException, ) from toil.jobStores.fileJobStore import FileJobStore @@ -1786,14 +1786,16 @@ def write_to_pipe( return schema_salad.ref_resolver.file_uri(src_path) -def write_file( - writeFunc: Callable[[str], FileID], +def convert_file_uri_to_toil_uri( + applyFunc: Callable[[str], FileID], index: dict[str, str], existing: dict[str, str], file_uri: str, ) -> str: """ - Write a file into the Toil jobstore. + Given a file URI, convert it to a toil file URI. Uses applyFunc to handle the conversion. + + Runs once on every unique file URI. 'existing' is a set of files retrieved as inputs from toil_get_file. This ensures they are mapped back as the same name if passed through. @@ -1810,12 +1812,8 @@ def write_file( else: file_uri = existing.get(file_uri, file_uri) if file_uri not in index: - if not urlparse(file_uri).scheme: - rp = os.path.realpath(file_uri) - else: - rp = file_uri try: - index[file_uri] = "toilfile:" + writeFunc(rp).pack() + index[file_uri] = "toilfile:" + applyFunc(file_uri).pack() existing[index[file_uri]] = file_uri except Exception as e: logger.error("Got exception '%s' while copying '%s'", e, file_uri) @@ -1834,7 +1832,7 @@ def path_to_loc(obj: CWLObjectType) -> None: del obj["path"] -def extract_files( +def extract_file_uri_once( fileindex: Dict[str, str], existing: Dict[str, str], file_metadata: CWLObjectType, @@ -1842,7 +1840,13 @@ def extract_files( skip_remote: bool = False, ) -> Optional[str]: """ - Extract the filename from a CWL file record + Extract the filename from a CWL file record. + + This function matches the predefined function signature in visit_files, which ensures + that this function is called on all files inside a CWL object. + + Ensures no duplicate files are returned according to fileindex. If a file has not been resolved already (and had file:// prepended) + then resolve symlinks. :param fileindex: Forward mapping of filename :param existing: Reverse mapping of filename. This function does not use this :param file_metadata: CWL file record @@ -1851,7 +1855,12 @@ def extract_files( :return: """ location = cast(str, file_metadata["location"]) - + if ( + location.startswith("toilfile:") + or location.startswith("toildir:") + or location.startswith("_:") + ): + return None if location in fileindex: file_metadata["location"] = fileindex[location] return None @@ -1870,12 +1879,14 @@ def extract_files( "File is missing: %s" % file_metadata ) if location.startswith("file://") or not skip_remote: - # This is a local file, or we also need to download and re-upload remote files + # This is a local file or a remote file if location not in fileindex: - # don't download twice # These dictionaries are meant to keep track of what we're going to import # In the actual import, this is used as a bidirectional mapping from unvirtualized to virtualized + # For this case, keep track of the files to prevent returning duplicate files # see write_file + + # If there is not a scheme, this file has not been resolved yet or is a URL. if not urlparse(location).scheme: rp = os.path.realpath(location) else: @@ -1885,9 +1896,15 @@ def extract_files( return rp return None +V = TypeVar("V", covariant=True) + +class VisitFunc(Protocol[V]): + def __call__(self, fileindex: dict[str, str], existing: dict[str, str], + file_metadata: CWLObjectType, mark_broken: bool, + skip_remote: bool) -> V: ... 
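# The VisitFunc protocol above makes visit_files generic over whatever its callback
# returns: extract_file_uri_once yields Optional[str] (so visit_files collects the
# filenames), while the toil-URI conversion callback returns None. Any callable with
# the same parameter names satisfies the protocol. A purely illustrative visitor
# (not used anywhere in this module), assuming the imports of cast, CWLObjectType,
# and is_remote_url already present here:
def flag_remote_input(
    fileindex: dict[str, str],
    existing: dict[str, str],
    file_metadata: CWLObjectType,
    mark_broken: bool = False,
    skip_remote: bool = False,
) -> bool:
    # True when the file record points at a non-file:// URL.
    return is_remote_url(cast(str, file_metadata["location"]))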
def visit_files( - func: Callable[..., Any], + func: VisitFunc[V], fs_access: StdFsAccess, fileindex: dict[str, str], existing: dict[str, str], @@ -1895,7 +1912,7 @@ def visit_files( mark_broken: bool = False, skip_remote: bool = False, bypass_file_store: bool = False, -) -> List[Any]: +) -> List[V]: """ Prepare all files and directories. @@ -2153,8 +2170,8 @@ def upload_directory( directory_metadata["location"] = encode_directory(directory_contents) -def upload_file( - uploadfunc: Callable[[str], FileID], +def extract_and_convert_file_to_toil_uri( + convertfunc: Callable[[str], FileID], fileindex: dict[str, str], existing: dict[str, str], file_metadata: CWLObjectType, @@ -2162,46 +2179,22 @@ def upload_file( skip_remote: bool = False, ) -> None: """ - Update a file object so that the file will be accessible from another machine. + Extract the file URI out of a file object and convert it to a Toil URI. + + Runs convertfunc on the file URI to handle conversion. - Uploads local files to the Toil file store, and sets their location to a - reference to the toil file store. + Is used to handle importing files into the jobstore. If a file doesn't exist, fails with an error, unless mark_broken is set, in which case the missing file is given a special sentinel location. - Unless skip_remote is set, downloads remote files into the file store and - sets their locations to references into the file store as well. + Unless skip_remote is set, also run on remote files and sets their locations + to toil URIs as well. """ - location = cast(str, file_metadata["location"]) - if ( - location.startswith("toilfile:") - or location.startswith("toildir:") - or location.startswith("_:") - ): - return - if location in fileindex: - file_metadata["location"] = fileindex[location] - return - if not location and file_metadata["path"]: - file_metadata["location"] = location = schema_salad.ref_resolver.file_uri( - cast(str, file_metadata["path"]) - ) - if location.startswith("file://") and not os.path.isfile( - schema_salad.ref_resolver.uri_file_path(location) - ): - if mark_broken: - logger.debug("File %s is missing", file_metadata) - file_metadata["location"] = location = MISSING_FILE - else: - raise cwl_utils.errors.WorkflowException( - "File is missing: %s" % file_metadata - ) - - if location.startswith("file://") or not skip_remote: - # This is a local file, or we also need to download and re-upload remote files - file_metadata["location"] = write_file( - uploadfunc, fileindex, existing, location + location = extract_file_uri_once(fileindex, existing, file_metadata, mark_broken, skip_remote) + if location is not None: + file_metadata["location"] = convert_file_uri_to_toil_uri( + convertfunc, fileindex, existing, location ) logger.debug("Sending file at: %s", file_metadata["location"]) @@ -2906,7 +2899,7 @@ def file_import_function(url: str, log_level: int = logging.DEBUG) -> FileID: logger.log(log_level, "Loading %s...", url) return writeGlobalFileWrapper(file_store, url) - file_upload_function = functools.partial(upload_file, file_import_function) + file_upload_function = functools.partial(extract_and_convert_file_to_toil_uri, file_import_function) # Upload all the Files and set their and the Directories' locations, if # needed. 
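# The hunk above relies on partial application: extract_and_convert_file_to_toil_uri
# takes the URI-to-FileID conversion callable as its first argument, and callers bind
# that argument with functools.partial so the result matches the VisitFunc signature
# that visit_files expects. A hedged sketch of the wiring; import_to_jobstore is a
# hypothetical stand-in for jobstore.import_file or the worker-side file store wrapper:
import functools

def import_to_jobstore(url: str) -> FileID:
    # Placeholder conversion; the leader-side caller uses
    # jobstore.import_file(url, symlink=True).
    raise NotImplementedError

file_upload_function = functools.partial(
    extract_and_convert_file_to_toil_uri, import_to_jobstore
)
# file_upload_function(fileindex, existing, file_metadata, ...) now rewrites
# file_metadata["location"] to a toilfile: URI in place.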
@@ -3531,7 +3524,7 @@ def __init__( basedir: str, skip_remote: bool, bypass_file_store: bool, - import_data: Promised[Tuple[Dict[str, FileID], Dict[str, FileMetadata]]], + import_data: Promised[Dict[str, FileID]], **kwargs: Any, ) -> None: """ @@ -3548,12 +3541,12 @@ def __init__( self.bypass_file_store = bypass_file_store self.import_data = import_data - def run(self, file_store: AbstractFileStore) -> Tuple[CWLObjectType, Process]: + def run(self, file_store: AbstractFileStore) -> Tuple[Process, CWLObjectType]: """ Convert the filenames in the workflow inputs into the URIs :return: Promise of transformed workflow inputs. A tuple of the job order and process """ - candidate_to_fileid = unwrap(self.import_data)[0] + candidate_to_fileid = unwrap(self.import_data) initialized_job_order = unwrap(self.initialized_job_order) tool = unwrap(self.tool) @@ -3562,7 +3555,7 @@ def convert_file(filename: str) -> FileID: fileid = candidate_to_fileid[filename] return fileid - file_convert_function = functools.partial(upload_file, convert_file) + file_convert_function = functools.partial(extract_and_convert_file_to_toil_uri, convert_file) fs_access = ToilFsAccess(self.basedir) fileindex: Dict[str, str] = {} existing: Dict[str, str] = {} @@ -3599,7 +3592,7 @@ def convert_file(filename: str) -> FileID: # were required. rm_unprocessed_secondary_files(param_value) - return initialized_job_order, tool + return tool, initialized_job_order class CWLImportWrapper(CWLNamedJob): @@ -3628,7 +3621,7 @@ def run(self, file_store: AbstractFileStore) -> Any: self.options, self.initialized_job_order, self.tool ) file_to_data = get_file_sizes( - filenames, file_store.jobStore, self.options.reference_inputs + filenames, file_store.jobStore, include_remote_files=self.options.reference_inputs ) imports_job = ImportsJob(file_to_data, self.options.import_workers_threshold) self.addChild(imports_job) @@ -3638,13 +3631,13 @@ def run(self, file_store: AbstractFileStore) -> Any: basedir=self.options.basedir, skip_remote=self.options.reference_inputs, bypass_file_store=self.options.bypass_file_store, - import_data=imports_job.rv(), + import_data=imports_job.rv(0), ) self.addChild(install_imports_job) imports_job.addFollowOn(install_imports_job) start_job = CWLStartJob( - install_imports_job.rv(), runtime_context=self.runtime_context + install_imports_job.rv(0), install_imports_job.rv(1), runtime_context=self.runtime_context ) self.addChild(start_job) install_imports_job.addFollowOn(start_job) @@ -3654,79 +3647,53 @@ def run(self, file_store: AbstractFileStore) -> Any: class CWLStartJob(CWLNamedJob): """ - Job responsible for starting the CWL workflow + Job responsible for starting the CWL workflow. + + Takes in the workflow/tool and inputs after all files are imported + and creates jobs to run those workflows. 
""" def __init__( self, - inputs: Promised[Tuple[CWLObjectType, Process]], + tool: Promised[Process], + initialized_job_order: Promised[CWLObjectType], runtime_context: cwltool.context.RuntimeContext, **kwargs: Any, ) -> None: super().__init__(**kwargs) - self.inputs = inputs + self.tool = tool + self.initialized_job_order = initialized_job_order self.runtime_context = runtime_context def run(self, file_store: AbstractFileStore) -> Any: - initialized_job_order = unwrap(self.inputs)[0] - tool = unwrap(self.inputs)[1] - root_job, _ = makeJob( + initialized_job_order = unwrap(self.initialized_job_order) + tool = unwrap(self.tool) + cwljob, _ = makeJob( tool, initialized_job_order, self.runtime_context, None, None ) # toplevel, no name needed - root_job.cwljob = initialized_job_order - self.addChild(root_job) - return root_job.rv() - - -def get_file_sizes( - filenames: List[str], file_source: AbstractJobStore, import_remote_files: bool -) -> Dict[str, FileMetadata]: - file_to_data = {} - for filename in filenames: - try: - if not import_remote_files and is_remote_url(filename): - if not file_source.url_exists(filename): - continue - filesize = file_source.get_size(filename) - parent_dir = os.path.dirname(filename) - except UnimplementedURLException as e: - # We can't find anything that can even support this URL scheme. - # Report to the user, they are probably missing an extra. - logger.critical("Error: " + str(e)) - raise - except HTTPError as e: - # Something went wrong looking for it there. - logger.warning("Checked URL %s but got HTTP status %s", filename, e.code) - # Try the next location. - continue - except FileNotFoundError: - # Wasn't found there - continue - except Exception: - # Something went wrong besides the file not being found. Maybe - # we have no auth. - logger.error( - "Something went wrong when testing for existence of %s", filename - ) - raise - - # filename and parent dir in the tuple are not used in CWL - # but this allows generalization with the WDL runner's importOnWorkers functionality - file_to_data[filename] = cast(FileMetadata, (filename, parent_dir, filesize)) - return file_to_data + cwljob.cwljob = initialized_job_order + self.addChild(cwljob) + return cwljob.rv() def extract_workflow_inputs( options: Namespace, initialized_job_order: CWLObjectType, tool: Process -) -> List[str]: +) -> list[str]: + """ + Collect all the workflow input files to import later. + :param options: namespace + :param initialized_job_order: cwl object + :param tool: tool object + :return: + """ fileindex: Dict[str, str] = {} existing: Dict[str, str] = {} - # Grab all the input files - logger.info("Grabbing input files...") + # Extract out all the input files' filenames + logger.info("Collecting input files...") fs_access = ToilFsAccess(options.basedir) filenames = visit_files( - extract_files, + extract_file_uri_once, fs_access, fileindex, existing, @@ -3735,13 +3702,13 @@ def extract_workflow_inputs( skip_remote=options.reference_inputs, bypass_file_store=options.bypass_file_store, ) - # Grab all the files associated with tools (binaries, etc.). - logger.info("Grabbing tool-associated files...") + # Extract filenames of all the files associated with tools (binaries, etc.). 
+ logger.info("Collecting tool-associated files...") tool_filenames = visitSteps( tool, functools.partial( visit_files, - extract_files, + extract_file_uri_once, fs_access, fileindex, existing, @@ -3751,7 +3718,7 @@ def extract_workflow_inputs( ), ) filenames.extend(tool_filenames) - return filenames + return [file for file in filenames if file is not None] def import_workflow_inputs( @@ -3761,24 +3728,32 @@ def import_workflow_inputs( tool: Process, log_level: int = logging.DEBUG, ) -> None: + """ + Import all workflow inputs on the leader. + + Ran when not importing on workers. + :param jobstore: Toil jobstore + :param options: Namespace + :param initialized_job_order: CWL object + :param tool: CWL tool + :param log_level: log level + :return: + """ fileindex: Dict[str, str] = {} existing: Dict[str, str] = {} # Define something we can call to import a file and get its file # ID. - # We cast this because import_file is overloaded depending on if we - # pass a shared file name or not, and we know the way we call it we - # always get a FileID out. def file_import_function(url: str) -> FileID: logger.log(log_level, "Loading %s...", url) return jobstore.import_file(url, symlink=True) - + import_function = functools.partial(extract_and_convert_file_to_toil_uri, file_import_function) # Import all the input files, some of which may be missing optional # files. logger.info("Importing input files...") fs_access = ToilFsAccess(options.basedir) visit_files( - file_import_function, + import_function, fs_access, fileindex, existing, @@ -3795,7 +3770,7 @@ def file_import_function(url: str) -> FileID: tool, functools.partial( visit_files, - file_import_function, + import_function, fs_access, fileindex, existing, @@ -3815,10 +3790,11 @@ def file_import_function(url: str) -> FileID: rm_unprocessed_secondary_files(param_value) +T = TypeVar("T") def visitSteps( cmdline_tool: Process, - op: Callable[[CommentedMap], List[Any]], -) -> List[Any]: + op: Callable[[CommentedMap], List[T]], +) -> List[T]: """ Iterate over a CWL Process object, running the op on each tool description CWL object. diff --git a/src/toil/job.py b/src/toil/job.py index 39dcfd3200..f5478bc080 100644 --- a/src/toil/job.py +++ b/src/toil/job.py @@ -47,6 +47,8 @@ cast, overload, ) +from urllib.error import HTTPError +from urllib.parse import urlsplit, unquote, urljoin from configargparse import ArgParser @@ -54,14 +56,11 @@ from toil.bus import Names from toil.lib.compatibility import deprecated -if sys.version_info >= (3, 8): - from typing import TypedDict -else: - from typing_extensions import TypedDict - import dill from configargparse import ArgParser +from toil.lib.io import is_remote_url + if sys.version_info < (3, 11): from typing_extensions import NotRequired else: @@ -86,59 +85,11 @@ InsufficientSystemResources, ) from toil.fileStores.abstractFileStore import AbstractFileStore - from toil.jobStores.abstractJobStore import AbstractJobStore + from toil.jobStores.abstractJobStore import AbstractJobStore, UnimplementedURLException logger = logging.getLogger(__name__) -# We define a URI scheme kind of like but not actually compatible with the one -# we use for CWL. CWL brings along the file basename in its file type, but -# WDL.Value.File doesn't. So we need to make sure we stash that somewhere in -# the URI. -# TODO: We need to also make sure files from the same source directory end up -# in the same destination directory, when dealing with basename conflicts. 
- -TOIL_URI_SCHEME = "toilfile:" - - -STANDARD_SCHEMES = ["http:", "https:", "s3:", "gs:", "ftp:"] -REMOTE_SCHEMES = STANDARD_SCHEMES + [TOIL_URI_SCHEME] -ALL_SCHEMES = REMOTE_SCHEMES + ["file:"] - -def is_standard_url(filename: str) -> bool: - return is_url_with_scheme(filename, STANDARD_SCHEMES) - -def is_remote_url(filename: str) -> bool: - """ - Decide if a filename is a known, non-file kind of URL - """ - return is_url_with_scheme(filename, REMOTE_SCHEMES) - -def is_any_url(filename: str) -> bool: - """ - Decide if a string is a URI like http:// or file://. - - Otherwise it might be a bare path. - """ - return is_url_with_scheme(filename, ALL_SCHEMES) - -def is_url_with_scheme(filename: str, schemes: list[str]) -> bool: - """ - Return True if filename is a URL with any of the given schemes and False otherwise. - """ - # TODO: "http:myfile.dat" is a valid filename and *not* a valid URL - for scheme in schemes: - if filename.startswith(scheme): - return True - return False - -def is_toil_url(filename: str) -> bool: - return is_url_with_scheme(filename, [TOIL_URI_SCHEME]) - -def is_file_url(filename: str) -> bool: - return is_url_with_scheme(filename, ["file:"]) - - class JobPromiseConstraintError(RuntimeError): """ Error for job being asked to promise its return value, but it not available. @@ -3839,6 +3790,180 @@ class FileMetadata(NamedTuple): size: Optional[int] +def potential_absolute_uris( + uri: str, + path: list[str], + importer: WDL.Tree.Document | None = None, + execution_dir: str | None = None, +) -> Iterator[str]: + """ + Get potential absolute URIs to check for an imported file. + + Given a URI or bare path, yield in turn all the URIs, with schemes, where we + should actually try to find it, given that we want to search under/against + the given paths or URIs, the current directory, and the given importing WDL + document if any. + """ + + if uri == "": + # Empty URIs can't come from anywhere. + return + + # We need to brute-force find this URI relative to: + # + # 1. Itself if a full URI. + # + # 2. Importer's URL, if importer is a URL and this is a + # host-root-relative URL starting with / or scheme-relative + # starting with //, or just plain relative. + # + # 3. Current directory, if a relative path. + # + # 4. All the prefixes in "path". + # + # If it can't be found anywhere, we ought to (probably) throw + # FileNotFoundError like the MiniWDL implementation does, with a + # correct errno. + # + # To do this, we have AbstractFileStore.read_from_url, which can read a + # URL into a binary-mode writable, or throw some kind of unspecified + # exception if the source doesn't exist or can't be fetched. + + # This holds scheme-applied full URIs for all the places to search. + full_path_list = [] + + if importer is not None: + # Add the place the imported file came form, to search first. + full_path_list.append(Toil.normalize_uri(importer.pos.abspath)) + + # Then the current directory. We need to make sure to include a filename component here or it will treat the current directory with no trailing / as a document and relative paths will look 1 level up. + # When importing on a worker, the cwd will be a tmpdir and will result in FileNotFoundError after os.path.abspath, so override with the execution dir + full_path_list.append(Toil.normalize_uri(execution_dir or ".") + "/.") + + # Then the specified paths. 
+ # TODO: + # https://github.com/chanzuckerberg/miniwdl/blob/e3e8ef74e80fbe59f137b0ad40b354957915c345/WDL/Tree.py#L1479-L1482 + # seems backward actually and might do these first! + full_path_list += [Toil.normalize_uri(p) for p in path] + + # This holds all the URIs we tried and failed with. + failures: set[str] = set() + + for candidate_base in full_path_list: + # Try fetching based off each base URI + candidate_uri = urljoin(candidate_base, uri) + if candidate_uri in failures: + # Already tried this one, maybe we have an absolute uri input. + continue + logger.debug( + "Consider %s which is %s off of %s", candidate_uri, uri, candidate_base + ) + + # Try it + yield candidate_uri + # If we come back it didn't work + failures.add(candidate_uri) + + +def get_file_sizes( + filenames: List[str], + file_source: AbstractJobStore, + search_paths: Optional[List[str]] = None, + include_remote_files: bool = True, + execution_dir: Optional[str] = None, +) -> Dict[str, FileMetadata]: + """ + Resolve relative-URI files in the given environment and turn them into absolute normalized URIs. Returns a dictionary of the *string values* from the WDL file values + to a tuple of the normalized URI, parent directory ID, and size of the file. The size of the file may be None, which means unknown size. + + :param filenames: list of filenames to evaluate on + :param file_source: Context to search for files with + :param task_path: Dotted WDL name of the user-level code doing the + importing (probably the workflow name). + :param search_paths: If set, try resolving input location relative to the URLs or + directories in this list. + :param include_remote_files: If set, import files from remote locations. Else leave them as URI references. + """ + + @memoize + def get_filename_size(filename: str) -> FileMetadata: + tried = [] + for candidate_uri in potential_absolute_uris( + filename, + search_paths if search_paths is not None else [], + execution_dir=execution_dir, + ): + tried.append(candidate_uri) + try: + if not include_remote_files and is_remote_url(candidate_uri): + # Use remote URIs in place. But we need to find the one that exists. + if not file_source.url_exists(candidate_uri): + # Wasn't found there + continue + + # Now we know this exists, so pass it through + # Get filesizes + filesize = file_source.get_size(candidate_uri) + + except UnimplementedURLException as e: + # We can't find anything that can even support this URL scheme. + # Report to the user, they are probably missing an extra. + logger.critical("Error: " + str(e)) + raise + except HTTPError as e: + # Something went wrong looking for it there. + logger.warning( + "Checked URL %s but got HTTP status %s", candidate_uri, e.code + ) + # Try the next location. + continue + except FileNotFoundError: + # Wasn't found there + continue + except Exception: + # Something went wrong besides the file not being found. Maybe + # we have no auth. + logger.error( + "Something went wrong when testing for existence of %s", + candidate_uri, + ) + raise + + # Work out what the basename for the file was + file_basename = os.path.basename(urlsplit(candidate_uri).path) + + if file_basename == "": + # We can't have files with no basename because we need to + # download them at that basename later. + raise RuntimeError( + f"File {candidate_uri} has no basename and so cannot be a WDL File" + ) + + # Was actually found + if is_remote_url(candidate_uri): + # Might be a file URI or other URI. 
+ # We need to make sure file URIs and local paths that point to + # the same place are treated the same. + parsed = urlsplit(candidate_uri) + if parsed.scheme == "file:": + # This is a local file URI. Convert to a path for source directory tracking. + parent_dir = os.path.dirname(unquote(parsed.path)) + else: + # This is some other URL. Get the URL to the parent directory and use that. + parent_dir = urljoin(candidate_uri, ".") + else: + # Must be a local path + parent_dir = os.path.dirname(candidate_uri) + + return cast(FileMetadata, (candidate_uri, parent_dir, filesize)) + # Not found + raise RuntimeError( + f"Could not find {filename} at any of: {list(potential_absolute_uris(filename, search_paths if search_paths is not None else []))}" + ) + + return {k: get_filename_size(k) for k in filenames} + + class CombineImportsJob(Job): """ Combine the outputs of multiple WorkerImportsJob into one promise @@ -3846,13 +3971,15 @@ class CombineImportsJob(Job): def __init__(self, d: Sequence[Promised[Dict[str, FileID]]], **kwargs): """ - Merge the dicts :param d: Sequence of dictionaries to merge """ self._d = d super().__init__(**kwargs) def run(self, file_store: AbstractFileStore) -> Promised[Dict[str, FileID]]: + """ + Merge the dicts + """ d = unwrap_all(self._d) return {k: v for item in d for k, v in item.items()} @@ -3871,17 +3998,37 @@ def __init__( stream: bool = True, **kwargs: Any, ): + """ + Setup importing files on a worker. + :param filenames: List of file URIs to import + :param disk_size: Designated disk space the worker can use when importing. Disregarded if stream is enabled. + :param stream: Whether to stream a file import or not. We don't have machinery to ensure + streaming, so if true, assume streaming works and don't give the worker a lot of disk space to work with. + If streaming fails, the worker will run out of resources and allocate a child job to handle the import with enough disk space. + :param kwargs: args for the superclass + """ if stream: super().__init__(local=False, **kwargs) else: super().__init__(local=False, disk=disk_size, **kwargs) self.filenames = filenames self.disk_size = disk_size + self.stream = stream @staticmethod def import_files( files: List[str], file_source: "AbstractJobStore" ) -> Dict[str, FileID]: + """ + Import a list of files into the jobstore. Returns a mapping of the filename to the associated FileIDs + + When stream is true but the import is not streamable, the worker will run out of + disk space and run a new import job with enough disk space instead. 
+ :param files: list of files to import + :param file_source: AbstractJobStore + :return: Dictionary mapping filenames to associated jobstore FileID + """ + # todo: make the import ensure streaming is done instead of relying on running out of disk space path_to_fileid = {} @memoize @@ -3902,9 +4049,12 @@ def run(self, file_store: AbstractFileStore) -> Promised[Dict[str, FileID]]: try: return self.import_files(self.filenames, file_store.jobStore) except InsufficientSystemResources as e: - if e.resource == "disk" and e.requested < self.disk_size: + # If the worker crashes due to running out of disk space and was not trying to + # stream the file import, then try a new import job without streaming by actually giving + # the worker enough disk space + if e.resource == "disk" and e.requested < self.disk_size and self.stream is True: non_streaming_import = WorkerImportJob( - self.filenames, self.disk_size, streaming=False + self.filenames, self.disk_size, stream=False ) self.addChild(non_streaming_import) return non_streaming_import.rv() @@ -3922,17 +4072,20 @@ class ImportsJob(Job): def __init__( self, file_to_data: Dict[str, FileMetadata], - import_workers_threshold: ParseableIndivisibleResource, + max_batch_size: ParseableIndivisibleResource, **kwargs: Any, ): """ - Job to take the inputs from the WDL workflow and import them on a worker instead of a leader. Assumes all local and cloud files are accessible. + Job to take the inputs for a workflow and import them on a worker instead of a leader. Assumes all local and cloud files are accessible. This class is only used when runImportsOnWorkers is enabled. + + :param file_to_data: mapping of file source name to file metadata + :param max_batch_size: maximum cumulative file size of a batched import """ super().__init__(local=True, **kwargs) self._file_to_data = file_to_data - self._import_workers_threshold = import_workers_threshold + self._max_batch_size = max_batch_size def run( self, file_store: AbstractFileStore @@ -3941,51 +4094,41 @@ def run( Import the workflow inputs and then create and run the workflow. :return: Tuple of a mapping from the candidate uri to the file id and a mapping of the source filenames to its metadata. 
The candidate uri is a field in the file metadata """ - import_workers_threshold = self._import_workers_threshold + max_batch_size = self._max_batch_size file_to_data = self._file_to_data # Run WDL imports on a worker instead - individual_files = [] - batch_files = [] - # Separate out files that will be imported individually and in a batch - for filename, (candidate_uri, parent_id, size) in file_to_data.items(): - if size < import_workers_threshold: - batch_files.append(filename) - else: - individual_files.append(filename) + filenames = list(file_to_data.keys()) - # Create individual import jobs import_jobs = [] - for file in individual_files: - filesize = file_to_data[file][2] - candidate_uri = file_to_data[file][0] - import_jobs.append(WorkerImportJob([candidate_uri], disk_size=filesize)) - # create multiple batch jobs if necessary to not exceed the threshold - per_batch_files = [] - per_batch_size = 0 # This list will hold lists of batched filenames - to_batch = [] - while len(batch_files) > 0 or len(per_batch_files) > 0: - if len(batch_files) == 0 and len(per_batch_files) > 0: - to_batch.append(per_batch_files) - break + file_batches = [] - filename = batch_files[0] + # List of filenames for each batch + per_batch_files = [] + per_batch_size = 0 + while len(filenames) > 0 or len(per_batch_files) > 0: + filename = filenames.pop(0) # See if adding this to the queue will make the batch job too big filesize = file_to_data[filename][2] - if per_batch_size + filesize >= import_workers_threshold: + if per_batch_size + filesize >= max_batch_size: # batch is too big now, store to schedule the batch - to_batch.append(per_batch_files) + if len(per_batch_files) == 0: + # schedule the individual file + per_batch_files.append(filename) + file_batches.append(per_batch_files) # reset batching calculation per_batch_size = 0 else: per_batch_size += filesize - per_batch_files.append(filename) - batch_files.pop(0) + per_batch_files.append(filename) + + if per_batch_files: + file_batches.append(per_batch_files) # Create batch import jobs for each group of files - for batch in to_batch: + for batch in file_batches: candidate_uris = [file_to_data[filename][0] for filename in batch] batch_size = sum(file_to_data[filename][2] for filename in batch) import_jobs.append(WorkerImportJob(candidate_uris, disk_size=batch_size)) diff --git a/src/toil/lib/io.py b/src/toil/lib/io.py index fbf54668db..55729d20e4 100644 --- a/src/toil/lib/io.py +++ b/src/toil/lib/io.py @@ -12,6 +12,47 @@ logger = logging.getLogger(__name__) +TOIL_URI_SCHEME = "toilfile:" + + +STANDARD_SCHEMES = ["http:", "https:", "s3:", "gs:", "ftp:"] +REMOTE_SCHEMES = STANDARD_SCHEMES + [TOIL_URI_SCHEME] +ALL_SCHEMES = REMOTE_SCHEMES + ["file:"] + +def is_standard_url(filename: str) -> bool: + return is_url_with_scheme(filename, STANDARD_SCHEMES) + +def is_remote_url(filename: str) -> bool: + """ + Decide if a filename is a known, non-file kind of URL + """ + return is_url_with_scheme(filename, REMOTE_SCHEMES) + +def is_any_url(filename: str) -> bool: + """ + Decide if a string is a URI like http:// or file://. + + Otherwise it might be a bare path. + """ + return is_url_with_scheme(filename, ALL_SCHEMES) + +def is_url_with_scheme(filename: str, schemes: list[str]) -> bool: + """ + Return True if filename is a URL with any of the given schemes and False otherwise. 
+ """ + # TODO: "http:myfile.dat" is a valid filename and *not* a valid URL + for scheme in schemes: + if filename.startswith(scheme): + return True + return False + +def is_toil_url(filename: str) -> bool: + return is_url_with_scheme(filename, [TOIL_URI_SCHEME]) + +def is_file_url(filename: str) -> bool: + return is_url_with_scheme(filename, ["file:"]) + + def mkdtemp( suffix: Optional[str] = None, prefix: Optional[str] = None, diff --git a/src/toil/options/runner.py b/src/toil/options/runner.py index 24e463c45a..90d1536d2a 100644 --- a/src/toil/options/runner.py +++ b/src/toil/options/runner.py @@ -33,6 +33,6 @@ def add_runner_options( dest="import_workers_threshold", type=lambda x: human2bytes(str(x)), default="1 GiB", - help="Specify the file size threshold that determines if the file import will happen in its own job. All files below the threshold " - "will go into a batch import job. This should be set in conjunction with the argument --runImportsOnWorkers." + help="Specify the file size threshold that determines how many files go into a batched import. As many files will go into a batch import job until this threshold" + "is reached. This should be set in conjunction with the argument --runImportsOnWorkers." ) diff --git a/src/toil/wdl/wdltoil.py b/src/toil/wdl/wdltoil.py index ad34f3ed97..3ea204c4be 100755 --- a/src/toil/wdl/wdltoil.py +++ b/src/toil/wdl/wdltoil.py @@ -92,12 +92,9 @@ ParseableIndivisibleResource, ImportsJob, FileMetadata, - TOIL_URI_SCHEME, - is_standard_url, - is_toil_url, - is_any_url, is_remote_url, - is_file_url, + potential_absolute_uris, + get_file_sizes ) from toil.jobStores.abstractJobStore import ( AbstractJobStore, @@ -107,7 +104,7 @@ ) from toil.lib.accelerators import get_individual_local_accelerators from toil.lib.conversions import VALID_PREFIXES, convert_units, human2bytes -from toil.lib.io import mkdtemp +from toil.lib.io import mkdtemp, is_any_url, is_file_url, TOIL_URI_SCHEME, is_standard_url, is_toil_url from toil.lib.memoize import memoize from toil.lib.misc import get_user_name from toil.lib.resources import ResourceMonitor @@ -117,6 +114,15 @@ logger = logging.getLogger(__name__) +# In regards to "toilfile:" URIs: +# We define a URI scheme kind of like but not actually compatible with the one +# we use for CWL. CWL brings along the file basename in its file type, but +# WDL.Value.File doesn't. So we need to make sure we stash that somewhere in +# the URI. +# TODO: We need to also make sure files from the same source directory end up +# in the same destination directory, when dealing with basename conflicts. + + # We want to use hashlib.file_digest to avoid a 3-line hashing loop like # MiniWDL has. But it is only in 3.11+ # @@ -487,81 +493,6 @@ def first_mismatch(prefix: str, value: str) -> int: return modified -def potential_absolute_uris( - uri: str, - path: list[str], - importer: WDL.Tree.Document | None = None, - execution_dir: str | None = None, -) -> Iterator[str]: - """ - Get potential absolute URIs to check for an imported file. - - Given a URI or bare path, yield in turn all the URIs, with schemes, where we - should actually try to find it, given that we want to search under/against - the given paths or URIs, the current directory, and the given importing WDL - document if any. - """ - - if uri == "": - # Empty URIs can't come from anywhere. - return - - # We need to brute-force find this URI relative to: - # - # 1. Itself if a full URI. - # - # 2. 
Importer's URL, if importer is a URL and this is a - # host-root-relative URL starting with / or scheme-relative - # starting with //, or just plain relative. - # - # 3. Current directory, if a relative path. - # - # 4. All the prefixes in "path". - # - # If it can't be found anywhere, we ought to (probably) throw - # FileNotFoundError like the MiniWDL implementation does, with a - # correct errno. - # - # To do this, we have AbstractFileStore.read_from_url, which can read a - # URL into a binary-mode writable, or throw some kind of unspecified - # exception if the source doesn't exist or can't be fetched. - - # This holds scheme-applied full URIs for all the places to search. - full_path_list = [] - - if importer is not None: - # Add the place the imported file came form, to search first. - full_path_list.append(Toil.normalize_uri(importer.pos.abspath)) - - # Then the current directory. We need to make sure to include a filename component here or it will treat the current directory with no trailing / as a document and relative paths will look 1 level up. - # When importing on a worker, the cwd will be a tmpdir and will result in FileNotFoundError after os.path.abspath, so override with the execution dir - full_path_list.append(Toil.normalize_uri(execution_dir or ".") + "/.") - - # Then the specified paths. - # TODO: - # https://github.com/chanzuckerberg/miniwdl/blob/e3e8ef74e80fbe59f137b0ad40b354957915c345/WDL/Tree.py#L1479-L1482 - # seems backward actually and might do these first! - full_path_list += [Toil.normalize_uri(p) for p in path] - - # This holds all the URIs we tried and failed with. - failures: set[str] = set() - - for candidate_base in full_path_list: - # Try fetching based off each base URI - candidate_uri = urljoin(candidate_base, uri) - if candidate_uri in failures: - # Already tried this one, maybe we have an absolute uri input. - continue - logger.debug( - "Consider %s which is %s off of %s", candidate_uri, uri, candidate_base - ) - - # Try it - yield candidate_uri - # If we come back it didn't work - failures.add(candidate_uri) - - async def toil_read_source( uri: str, path: list[str], importer: WDL.Tree.Document | None ) -> ReadSourceResult: @@ -1310,7 +1241,7 @@ def _call_eager( return WDL.Value.Float(total_size) -def extract_workflow_inputs(environment: WDLBindings) -> List[str]: +def extract_workflow_inputs(environment: WDLBindings) -> list[str]: filenames = list() def add_filename(file: WDL.Value.File) -> WDL.Value.File: @@ -1321,117 +1252,19 @@ def add_filename(file: WDL.Value.File) -> WDL.Value.File: return filenames -def get_file_sizes( - filenames: List[str], - file_source: AbstractJobStore, - search_paths: Optional[List[str]] = None, - import_remote_files: bool = True, - execution_dir: Optional[str] = None, -) -> Dict[str, FileMetadata]: - """ - Resolve relative-URI files in the given environment and them then into absolute normalized URIs. Returns a dictionary of WDL file values to a tuple of the normalized URI, - parent directory ID, and size of the file. The size of the file may be None, which means unknown size. - - :param filenames: list of filenames to evaluate on - :param file_source: Context to search for files with - :param task_path: Dotted WDL name of the user-level code doing the - importing (probably the workflow name). - :param search_paths: If set, try resolving input location relative to the URLs or - directories in this list. - :param import_remote_files: If set, import files from remote locations. Else leave them as URI references. 
- """ - - @memoize - def get_filename_size(filename: str) -> FileMetadata: - tried = [] - for candidate_uri in potential_absolute_uris( - filename, - search_paths if search_paths is not None else [], - execution_dir=execution_dir, - ): - tried.append(candidate_uri) - try: - if not import_remote_files and is_remote_url(candidate_uri): - # Use remote URIs in place. But we need to find the one that exists. - if not file_source.url_exists(candidate_uri): - # Wasn't found there - continue - - # Now we know this exists, so pass it through - # Get filesizes - filesize = file_source.get_size(candidate_uri) - - except UnimplementedURLException as e: - # We can't find anything that can even support this URL scheme. - # Report to the user, they are probably missing an extra. - logger.critical("Error: " + str(e)) - raise - except HTTPError as e: - # Something went wrong looking for it there. - logger.warning( - "Checked URL %s but got HTTP status %s", candidate_uri, e.code - ) - # Try the next location. - continue - except FileNotFoundError: - # Wasn't found there - continue - except Exception: - # Something went wrong besides the file not being found. Maybe - # we have no auth. - logger.error( - "Something went wrong when testing for existence of %s", - candidate_uri, - ) - raise - - # Work out what the basename for the file was - file_basename = os.path.basename(urlsplit(candidate_uri).path) - - if file_basename == "": - # We can't have files with no basename because we need to - # download them at that basename later. - raise RuntimeError( - f"File {candidate_uri} has no basename and so cannot be a WDL File" - ) - - # Was actually found - if is_remote_url(candidate_uri): - # Might be a file URI or other URI. - # We need to make sure file URIs and local paths that point to - # the same place are treated the same. - parsed = urlsplit(candidate_uri) - if parsed.scheme == "file:": - # This is a local file URI. Convert to a path for source directory tracking. - parent_dir = os.path.dirname(unquote(parsed.path)) - else: - # This is some other URL. Get the URL to the parent directory and use that. - parent_dir = urljoin(candidate_uri, ".") - else: - # Must be a local path - parent_dir = os.path.dirname(candidate_uri) - - return cast(FileMetadata, (candidate_uri, parent_dir, filesize)) - # Not found - raise RuntimeError( - f"Could not find {filename} at any of: {list(potential_absolute_uris(filename, search_paths if search_paths is not None else []))}" - ) - - return {k: get_filename_size(k) for k in filenames} - - def convert_files( environment: WDLBindings, file_to_id: Dict[str, FileID], file_to_data: Dict[str, FileMetadata], task_path: str, -) -> None: +) -> WDLBindings: """ Resolve relative-URI files in the given environment convert the file values to a new value made from a given mapping. - Will set the value of the File to the relative-URI. + Will return bindings with file values set to their corresponding relative-URI. 
:param environment: Bindings to evaluate on + :return: new bindings object """ dir_ids = {t[1] for t in file_to_data.values()} dir_to_id = {k: uuid.uuid4() for k in dir_ids} @@ -1461,14 +1294,14 @@ def convert_file_to_uri(file: WDL.Value.File) -> WDL.Value.File: file.value = candidate_uri return file - map_over_files_in_bindings(environment, convert_file_to_uri) + return map_over_files_in_bindings(environment, convert_file_to_uri) def convert_remote_files( environment: WDLBindings, file_source: AbstractJobStore, task_path: str, - search_paths: Optional[List[str]] = None, + search_paths: Optional[list[str]] = None, import_remote_files: bool = True, execution_dir: Optional[str] = None, ) -> WDLBindings: @@ -5401,7 +5234,7 @@ class WDLStartJob(WDLSectionJob): def __init__( self, - target: Union[WDL.Tree.Workflow, WDL.Tree.Task], + target: WDL.Tree.Workflow | WDL.Tree.Task, inputs: Promised[WDLBindings], wdl_options: WDLContext, **kwargs: Any, @@ -5474,8 +5307,7 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: """ candidate_to_fileid = unwrap(self._import_data)[0] file_to_data = unwrap(self._import_data)[1] - convert_files(self._inputs, candidate_to_fileid, file_to_data, self._task_path) - return self._inputs + return convert_files(self._inputs, candidate_to_fileid, file_to_data, self._task_path) class WDLImportWrapper(WDLSectionJob): @@ -5491,7 +5323,7 @@ def __init__( target: Union[WDL.Tree.Workflow, WDL.Tree.Task], inputs: WDLBindings, wdl_options: WDLContext, - inputs_search_path: List[str], + inputs_search_path: list[str], import_remote_files: bool, import_workers_threshold: ParseableIndivisibleResource, **kwargs: Any, @@ -5514,7 +5346,7 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: filenames, file_store.jobStore, self._inputs_search_path, - import_remote_files=self._import_remote_files, + include_remote_files=self._import_remote_files, ) imports_job = ImportsJob(file_to_data, self._import_workers_threshold) self.addChild(imports_job) @@ -5536,7 +5368,7 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: def make_root_job( target: WDL.Tree.Workflow | WDL.Tree.Task, inputs: WDLBindings, - inputs_search_path: List[str], + inputs_search_path: list[str], toil: Toil, wdl_options: WDLContext, options: Namespace, From b7ca42cf0d10375175872b773c291ff676f40604 Mon Sep 17 00:00:00 2001 From: stxue1 Date: Wed, 6 Nov 2024 14:57:28 -0800 Subject: [PATCH 10/20] mypy --- src/toil/cwl/cwltoil.py | 4 +--- src/toil/job.py | 2 +- src/toil/wdl/wdltoil.py | 3 +-- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/src/toil/cwl/cwltoil.py b/src/toil/cwl/cwltoil.py index 7c79e35719..62d647355c 100644 --- a/src/toil/cwl/cwltoil.py +++ b/src/toil/cwl/cwltoil.py @@ -55,7 +55,6 @@ cast, Literal, Protocol, ) -from urllib.error import HTTPError from urllib.parse import quote, unquote, urlparse, urlsplit import cwl_utils.errors @@ -131,8 +130,7 @@ Promised, unwrap, ImportsJob, - FileMetadata, - is_remote_url, get_file_sizes, + get_file_sizes, ) from toil.jobStores.abstractJobStore import ( AbstractJobStore, diff --git a/src/toil/job.py b/src/toil/job.py index f5478bc080..9ca8dd0f37 100644 --- a/src/toil/job.py +++ b/src/toil/job.py @@ -45,7 +45,7 @@ TypeVar, Union, cast, - overload, + overload, TypedDict, Literal, ) from urllib.error import HTTPError from urllib.parse import urlsplit, unquote, urljoin diff --git a/src/toil/wdl/wdltoil.py b/src/toil/wdl/wdltoil.py index 3ea204c4be..87ea000728 100755 --- a/src/toil/wdl/wdltoil.py +++ 
b/src/toil/wdl/wdltoil.py @@ -92,7 +92,6 @@ ParseableIndivisibleResource, ImportsJob, FileMetadata, - is_remote_url, potential_absolute_uris, get_file_sizes ) @@ -104,7 +103,7 @@ ) from toil.lib.accelerators import get_individual_local_accelerators from toil.lib.conversions import VALID_PREFIXES, convert_units, human2bytes -from toil.lib.io import mkdtemp, is_any_url, is_file_url, TOIL_URI_SCHEME, is_standard_url, is_toil_url +from toil.lib.io import mkdtemp, is_any_url, is_file_url, TOIL_URI_SCHEME, is_standard_url, is_toil_url, is_remote_url from toil.lib.memoize import memoize from toil.lib.misc import get_user_name from toil.lib.resources import ResourceMonitor From 04ede19e28aa8a22914f103b8830945ed79dea7f Mon Sep 17 00:00:00 2001 From: stxue1 Date: Fri, 8 Nov 2024 11:11:56 -0800 Subject: [PATCH 11/20] Detect running out of disk space with OSError instead --- src/toil/job.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/toil/job.py b/src/toil/job.py index 9ca8dd0f37..adddd1774a 100644 --- a/src/toil/job.py +++ b/src/toil/job.py @@ -4048,11 +4048,12 @@ def run(self, file_store: AbstractFileStore) -> Promised[Dict[str, FileID]]: """ try: return self.import_files(self.filenames, file_store.jobStore) - except InsufficientSystemResources as e: + except OSError as e: # If the worker crashes due to running out of disk space and was not trying to # stream the file import, then try a new import job without streaming by actually giving # the worker enough disk space - if e.resource == "disk" and e.requested < self.disk_size and self.stream is True: + # OSError 28 is no space left on device + if e.errno == 28 and self.stream is True: non_streaming_import = WorkerImportJob( self.filenames, self.disk_size, stream=False ) From 05a0898f57b984bd33cd09b030733e2a9e2ad58c Mon Sep 17 00:00:00 2001 From: stxue1 Date: Wed, 13 Nov 2024 11:59:26 -0800 Subject: [PATCH 12/20] Fix logic in worker import loop --- src/toil/job.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/toil/job.py b/src/toil/job.py index adddd1774a..91ca5757d5 100644 --- a/src/toil/job.py +++ b/src/toil/job.py @@ -4109,7 +4109,7 @@ def run( # List of filenames for each batch per_batch_files = [] per_batch_size = 0 - while len(filenames) > 0 or len(per_batch_files) > 0: + while len(filenames) > 0: filename = filenames.pop(0) # See if adding this to the queue will make the batch job too big filesize = file_to_data[filename][2] From 9f4fe361e551087040d478acfb749b15d295716b Mon Sep 17 00:00:00 2001 From: stxue1 Date: Wed, 13 Nov 2024 15:26:10 -0800 Subject: [PATCH 13/20] Fix bad caching --- src/toil/cwl/cwltoil.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/toil/cwl/cwltoil.py b/src/toil/cwl/cwltoil.py index 01d8ee9470..53305f0a44 100644 --- a/src/toil/cwl/cwltoil.py +++ b/src/toil/cwl/cwltoil.py @@ -1892,8 +1892,6 @@ def extract_file_uri_once( rp = os.path.realpath(location) else: rp = location - fileindex[rp] = rp - existing[rp] = rp return rp return None From 645fd25f362d43884ede3e14161d12e806b9a7eb Mon Sep 17 00:00:00 2001 From: stxue1 Date: Thu, 14 Nov 2024 12:50:46 -0800 Subject: [PATCH 14/20] Add wdl-conformance-tests to gitignore and sphinx config + remove unsued imports in job + get rid of wdl dependency in job.py --- .gitignore | 1 + docs/conf.py | 3 ++- src/toil/job.py | 21 ++++++++++----------- src/toil/wdl/wdltoil.py | 2 +- 4 files changed, 14 insertions(+), 13 deletions(-) diff --git a/.gitignore b/.gitignore index b11afb7abb..5e6be3f819 100644 --- a/.gitignore +++ 
b/.gitignore @@ -14,6 +14,7 @@ venv* v3nv/ tmp/ /src/toil/test/cwl/spec* +/src/toil/test/wdl/wdl-conformance-tests /cwltool_deps/ /docs/generated_rst/ /docker/Dockerfile diff --git a/docs/conf.py b/docs/conf.py index a0b402ebf7..a26d25a07a 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -160,7 +160,8 @@ def setup(app): autoapi_dirs = ["../src/toil"] autodoc_typehints = "description" autoapi_keep_files = True -autoapi_ignore = ["*.pyi", "*/test/cwl/spec*/*.py", "*/fake_mpi_run.py", "*/tutorial_*.py", "*/example_*.py", "*/mkFile.py", "*/debugWorkflow.py"] +autoapi_ignore = ["*.pyi", "*/test/cwl/spec*/*.py", "*/fake_mpi_run.py", "*/tutorial_*.py", "*/example_*.py", "*/mkFile.py", "*/debugWorkflow.py", + "*/test/wdl/wdl-conformance-tests/*"] autoapi_options = [ "members", "undoc-members", diff --git a/src/toil/job.py b/src/toil/job.py index 91ca5757d5..efa173db2a 100644 --- a/src/toil/job.py +++ b/src/toil/job.py @@ -40,21 +40,18 @@ NamedTuple, Optional, Sequence, - Set, Tuple, TypeVar, Union, cast, - overload, TypedDict, Literal, + overload, + TypedDict, + Literal, ) from urllib.error import HTTPError from urllib.parse import urlsplit, unquote, urljoin -from configargparse import ArgParser - from toil import memoize -from toil.bus import Names -from toil.lib.compatibility import deprecated import dill from configargparse import ArgParser @@ -81,11 +78,13 @@ from optparse import OptionParser from toil.batchSystems.abstractBatchSystem import ( - BatchJobExitReason, - InsufficientSystemResources, + BatchJobExitReason ) from toil.fileStores.abstractFileStore import AbstractFileStore - from toil.jobStores.abstractJobStore import AbstractJobStore, UnimplementedURLException + from toil.jobStores.abstractJobStore import ( + AbstractJobStore, + UnimplementedURLException, + ) logger = logging.getLogger(__name__) @@ -3793,7 +3792,7 @@ class FileMetadata(NamedTuple): def potential_absolute_uris( uri: str, path: list[str], - importer: WDL.Tree.Document | None = None, + importer: str | None = None, execution_dir: str | None = None, ) -> Iterator[str]: """ @@ -3834,7 +3833,7 @@ def potential_absolute_uris( if importer is not None: # Add the place the imported file came form, to search first. - full_path_list.append(Toil.normalize_uri(importer.pos.abspath)) + full_path_list.append(Toil.normalize_uri(importer)) # Then the current directory. We need to make sure to include a filename component here or it will treat the current directory with no trailing / as a document and relative paths will look 1 level up. # When importing on a worker, the cwd will be a tmpdir and will result in FileNotFoundError after os.path.abspath, so override with the execution dir diff --git a/src/toil/wdl/wdltoil.py b/src/toil/wdl/wdltoil.py index fcf2d4dacb..abd1c0ce45 100755 --- a/src/toil/wdl/wdltoil.py +++ b/src/toil/wdl/wdltoil.py @@ -506,7 +506,7 @@ async def toil_read_source( # We track our own failures for debugging tried = [] - for candidate_uri in potential_absolute_uris(uri, path, importer): + for candidate_uri in potential_absolute_uris(uri, path, importer=importer.pos.abspath if importer else None): # For each place to try in order destination_buffer = io.BytesIO() logger.debug("Fetching %s", candidate_uri) From 9dac9889212fe4e583fa92c295d7742ba9acf5df Mon Sep 17 00:00:00 2001 From: stxue1 <122345910+stxue1@users.noreply.github.com> Date: Thu, 14 Nov 2024 16:50:28 -0800 Subject: [PATCH 15/20] Update src/toil/cwl/cwltoil.py Co-authored-by: Michael R. 
Crusoe <1330696+mr-c@users.noreply.github.com> --- src/toil/cwl/cwltoil.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/toil/cwl/cwltoil.py b/src/toil/cwl/cwltoil.py index 53305f0a44..85a0ccb95d 100644 --- a/src/toil/cwl/cwltoil.py +++ b/src/toil/cwl/cwltoil.py @@ -3820,13 +3820,11 @@ def visitSteps( # All CWL Process objects (including CommandLineTool) will have tools # if they bothered to run the Process __init__. return op(cmdline_tool.tool) - else: - raise RuntimeError( - f"Unsupported type encountered in workflow " - f"traversal: {type(cmdline_tool)}" - ) - # Satisfy mypy, but this branch should never be reached in practice - return [] + raise RuntimeError( + f"Unsupported type encountered in workflow " + f"traversal: {type(cmdline_tool)}" + ) + def rm_unprocessed_secondary_files(job_params: Any) -> None: From 82fd09353bf4301a8a0939fc0e2e6dbd6082316d Mon Sep 17 00:00:00 2001 From: stxue1 Date: Mon, 18 Nov 2024 11:57:15 -0800 Subject: [PATCH 16/20] Move URLNotImplemented exception out to exceptions.py to prevent circular import and add new argument to make user deal with import worker disk size when streaming is not available. Fix a bug with file mutation as well. --- src/toil/cwl/cwltoil.py | 4 +-- src/toil/job.py | 41 ++++++-------------------- src/toil/jobStores/abstractJobStore.py | 18 +---------- src/toil/lib/exceptions.py | 18 +++++++++++ src/toil/options/runner.py | 14 ++++++++- src/toil/wdl/wdltoil.py | 15 ++++++---- 6 files changed, 53 insertions(+), 57 deletions(-) diff --git a/src/toil/cwl/cwltoil.py b/src/toil/cwl/cwltoil.py index 53305f0a44..5ff4d0af12 100644 --- a/src/toil/cwl/cwltoil.py +++ b/src/toil/cwl/cwltoil.py @@ -140,8 +140,8 @@ NoSuchFileException, InvalidImportExportUrlException, LocatorException, - UnimplementedURLException, ) +from toil.lib.exceptions import UnimplementedURLException from toil.jobStores.fileJobStore import FileJobStore from toil.jobStores.utils import JobStoreUnavailableException, generate_locator from toil.lib.io import mkdtemp @@ -3622,7 +3622,7 @@ def run(self, file_store: AbstractFileStore) -> Any: file_to_data = get_file_sizes( filenames, file_store.jobStore, include_remote_files=self.options.reference_inputs ) - imports_job = ImportsJob(file_to_data, self.options.import_workers_threshold) + imports_job = ImportsJob(file_to_data, self.options.import_workers_threshold, self.options.import_workers_disk) self.addChild(imports_job) install_imports_job = CWLInstallImportsJob( initialized_job_order=self.initialized_job_order, diff --git a/src/toil/job.py b/src/toil/job.py index efa173db2a..d05543f2e9 100644 --- a/src/toil/job.py +++ b/src/toil/job.py @@ -74,6 +74,8 @@ from toil.resource import ModuleDescriptor from toil.statsAndLogging import set_logging_from_options +from toil.lib.exceptions import UnimplementedURLException + if TYPE_CHECKING: from optparse import OptionParser @@ -81,10 +83,7 @@ BatchJobExitReason ) from toil.fileStores.abstractFileStore import AbstractFileStore - from toil.jobStores.abstractJobStore import ( - AbstractJobStore, - UnimplementedURLException, - ) + from toil.jobStores.abstractJobStore import AbstractJobStore logger = logging.getLogger(__name__) @@ -3994,25 +3993,16 @@ def __init__( self, filenames: List[str], disk_size: Optional[ParseableIndivisibleResource] = None, - stream: bool = True, - **kwargs: Any, + **kwargs: Any ): """ Setup importing files on a worker. 
:param filenames: List of file URIs to import :param disk_size: Designated disk space the worker can use when importing. Disregarded if stream is enabled. - :param stream: Whether to stream a file import or not. We don't have machinery to ensure - streaming, so if true, assume streaming works and don't give the worker a lot of disk space to work with. - If streaming fails, the worker will run out of resources and allocate a child job to handle the import with enough disk space. :param kwargs: args for the superclass """ - if stream: - super().__init__(local=False, **kwargs) - else: - super().__init__(local=False, disk=disk_size, **kwargs) self.filenames = filenames - self.disk_size = disk_size - self.stream = stream + super().__init__(local=False, disk=disk_size, **kwargs) @staticmethod def import_files( @@ -4045,21 +4035,7 @@ def run(self, file_store: AbstractFileStore) -> Promised[Dict[str, FileID]]: Import the workflow inputs and then create and run the workflow. :return: Promise of workflow outputs """ - try: - return self.import_files(self.filenames, file_store.jobStore) - except OSError as e: - # If the worker crashes due to running out of disk space and was not trying to - # stream the file import, then try a new import job without streaming by actually giving - # the worker enough disk space - # OSError 28 is no space left on device - if e.errno == 28 and self.stream is True: - non_streaming_import = WorkerImportJob( - self.filenames, self.disk_size, stream=False - ) - self.addChild(non_streaming_import) - return non_streaming_import.rv() - else: - raise + return self.import_files(self.filenames, file_store.jobStore) class ImportsJob(Job): @@ -4073,6 +4049,7 @@ def __init__( self, file_to_data: Dict[str, FileMetadata], max_batch_size: ParseableIndivisibleResource, + import_worker_disk: ParseableIndivisibleResource, **kwargs: Any, ): """ @@ -4086,6 +4063,7 @@ def __init__( super().__init__(local=True, **kwargs) self._file_to_data = file_to_data self._max_batch_size = max_batch_size + self._import_worker_disk = import_worker_disk def run( self, file_store: AbstractFileStore @@ -4130,8 +4108,7 @@ def run( # Create batch import jobs for each group of files for batch in file_batches: candidate_uris = [file_to_data[filename][0] for filename in batch] - batch_size = sum(file_to_data[filename][2] for filename in batch) - import_jobs.append(WorkerImportJob(candidate_uris, disk_size=batch_size)) + import_jobs.append(WorkerImportJob(candidate_uris, disk_size=self._import_worker_disk)) for job in import_jobs: self.addChild(job) diff --git a/src/toil/jobStores/abstractJobStore.py b/src/toil/jobStores/abstractJobStore.py index d0ecca100f..e66d00a81f 100644 --- a/src/toil/jobStores/abstractJobStore.py +++ b/src/toil/jobStores/abstractJobStore.py @@ -47,6 +47,7 @@ ServiceJobDescription, ) from toil.lib.compatibility import deprecated +from toil.lib.exceptions import UnimplementedURLException from toil.lib.io import WriteWatchingStream from toil.lib.memoize import memoize from toil.lib.retry import ErrorCondition, retry @@ -83,23 +84,6 @@ def __init__(self, url: ParseResult) -> None: super().__init__("The URL '%s' is invalid." % url.geturl()) -class UnimplementedURLException(RuntimeError): - def __init__(self, url: ParseResult, operation: str) -> None: - """ - Make a new exception to report that a URL scheme is not implemented, or - that the implementation can't be loaded because its dependencies are - not installed. 
- - :param url: The given URL - :param operation: Whether we are trying to 'import' or 'export' - """ - super().__init__( - f"No available job store implementation can {operation} the URL " - f"'{url.geturl()}'. Ensure Toil has been installed " - f"with the appropriate extras." - ) - - class NoSuchJobException(Exception): """Indicates that the specified job does not exist.""" diff --git a/src/toil/lib/exceptions.py b/src/toil/lib/exceptions.py index 8140838939..0083d3bf5a 100644 --- a/src/toil/lib/exceptions.py +++ b/src/toil/lib/exceptions.py @@ -15,6 +15,7 @@ # 5.14.2018: copied into Toil from https://github.com/BD2KGenomics/bd2k-python-lib import sys +from urllib.parse import ParseResult # TODO: isn't this built in to Python 3 now? @@ -61,3 +62,20 @@ def raise_(exc_type, exc_value, traceback) -> None: if exc.__traceback__ is not traceback: raise exc.with_traceback(traceback) raise exc + + +class UnimplementedURLException(RuntimeError): + def __init__(self, url: ParseResult, operation: str) -> None: + """ + Make a new exception to report that a URL scheme is not implemented, or + that the implementation can't be loaded because its dependencies are + not installed. + + :param url: The given URL + :param operation: Whether we are trying to 'import' or 'export' + """ + super().__init__( + f"No available job store implementation can {operation} the URL " + f"'{url.geturl()}'. Ensure Toil has been installed " + f"with the appropriate extras." + ) diff --git a/src/toil/options/runner.py b/src/toil/options/runner.py index 90d1536d2a..858daca1be 100644 --- a/src/toil/options/runner.py +++ b/src/toil/options/runner.py @@ -33,6 +33,18 @@ def add_runner_options( dest="import_workers_threshold", type=lambda x: human2bytes(str(x)), default="1 GiB", - help="Specify the file size threshold that determines how many files go into a batched import. As many files will go into a batch import job until this threshold" + help="Specify the file size threshold that determines how many files go into a batched import. As many files will go into a batch import job until this threshold " "is reached. This should be set in conjunction with the argument --runImportsOnWorkers." ) + import_workers_disk_argument = ["--importWorkersDisk"] + if cwl: + import_workers_disk_argument.append("--import-workers-disk") + parser.add_argument( + *import_workers_disk_argument, + dest="import_workers_disk", + type=lambda x: human2bytes(str(x)), + default="1 MiB", + help="Specify the disk size each import worker will get. This may be necessary when file streaming is not possible. For example, downloading from AWS to a GCE " + "job store. If specified, this should be set to the largest file size of all files to import. This should be set in conjunction with the arguments " + "--runImportsOnWorkers and --importWorkersThreshold." 
+ ) diff --git a/src/toil/wdl/wdltoil.py b/src/toil/wdl/wdltoil.py index abd1c0ce45..5f34031b7d 100755 --- a/src/toil/wdl/wdltoil.py +++ b/src/toil/wdl/wdltoil.py @@ -97,10 +97,10 @@ ) from toil.jobStores.abstractJobStore import ( AbstractJobStore, - UnimplementedURLException, InvalidImportExportUrlException, LocatorException, ) +from toil.lib.exceptions import UnimplementedURLException from toil.lib.accelerators import get_individual_local_accelerators from toil.lib.conversions import VALID_PREFIXES, convert_units, human2bytes from toil.lib.io import mkdtemp, is_any_url, is_file_url, TOIL_URI_SCHEME, is_standard_url, is_toil_url, is_remote_url @@ -1290,9 +1290,10 @@ def convert_file_to_uri(file: WDL.Value.File) -> WDL.Value.File: file_id, task_path, dir_to_id[file_to_data[file.value][1]], file_basename ) - setattr(file, "virtualized_value", toil_uri) - file.value = candidate_uri - return file + # Don't mutate the original file object + new_file = WDL.Value.File(file.value) + setattr(new_file, "virtualized_value", toil_uri) + return new_file return map_over_files_in_bindings(environment, convert_file_to_uri) @@ -5326,6 +5327,7 @@ def __init__( inputs_search_path: list[str], import_remote_files: bool, import_workers_threshold: ParseableIndivisibleResource, + import_workers_disk: ParseableIndivisibleResource, **kwargs: Any, ): """ @@ -5339,6 +5341,7 @@ def __init__( self._inputs_search_path = inputs_search_path self._import_remote_files = import_remote_files self._import_workers_threshold = import_workers_threshold + self._import_workers_disk = import_workers_disk def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: filenames = extract_workflow_inputs(self._inputs) @@ -5347,8 +5350,9 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: file_store.jobStore, self._inputs_search_path, include_remote_files=self._import_remote_files, + execution_dir=self._wdl_options.get("execution_dir") ) - imports_job = ImportsJob(file_to_data, self._import_workers_threshold) + imports_job = ImportsJob(file_to_data, self._import_workers_threshold, self._import_workers_disk) self.addChild(imports_job) install_imports_job = WDLInstallImportsJob( self._target.name, self._inputs, imports_job.rv() @@ -5381,6 +5385,7 @@ def make_root_job( inputs_search_path=inputs_search_path, import_remote_files=options.reference_inputs, import_workers_threshold=options.import_workers_threshold, + import_workers_disk=options.import_workers_disk ) else: # Run WDL imports on leader From 714d8ada2d03b678d274841ffd1c08b682e44326 Mon Sep 17 00:00:00 2001 From: stxue1 <122345910+stxue1@users.noreply.github.com> Date: Mon, 18 Nov 2024 11:59:07 -0800 Subject: [PATCH 17/20] Update src/toil/job.py Co-authored-by: Adam Novak --- src/toil/job.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/toil/job.py b/src/toil/job.py index d05543f2e9..1cd3ae8817 100644 --- a/src/toil/job.py +++ b/src/toil/job.py @@ -3932,9 +3932,9 @@ def get_filename_size(filename: str) -> FileMetadata: if file_basename == "": # We can't have files with no basename because we need to - # download them at that basename later. + # download them at that basename later in WDL. 
raise RuntimeError( - f"File {candidate_uri} has no basename and so cannot be a WDL File" + f"File {candidate_uri} has no basename" ) # Was actually found From db7edaf47c7d0cce2435b7caaf9372996ea21a01 Mon Sep 17 00:00:00 2001 From: stxue1 Date: Mon, 18 Nov 2024 12:00:18 -0800 Subject: [PATCH 18/20] Some type hinting improvements --- src/toil/cwl/cwltoil.py | 2 -- src/toil/job.py | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/toil/cwl/cwltoil.py b/src/toil/cwl/cwltoil.py index 661e78c8e0..9f317d1790 100644 --- a/src/toil/cwl/cwltoil.py +++ b/src/toil/cwl/cwltoil.py @@ -41,9 +41,7 @@ IO, Any, Callable, - Dict, Iterator, - List, Mapping, MutableMapping, MutableSequence, diff --git a/src/toil/job.py b/src/toil/job.py index 1cd3ae8817..ec895ebc21 100644 --- a/src/toil/job.py +++ b/src/toil/job.py @@ -3791,8 +3791,8 @@ class FileMetadata(NamedTuple): def potential_absolute_uris( uri: str, path: list[str], - importer: str | None = None, - execution_dir: str | None = None, + importer: Optional[str] = None, + execution_dir: Optional[str] = None, ) -> Iterator[str]: """ Get potential absolute URIs to check for an imported file. From d3aa7d5b295e93ac84b044ef2935fbc9fab02c23 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Wed, 20 Nov 2024 13:26:27 -0500 Subject: [PATCH 19/20] Use base disk instead of disk_size Without the stream switch there's no reason to use disk_size instead of disk. --- src/toil/job.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/toil/job.py b/src/toil/job.py index ec895ebc21..3f0226899b 100644 --- a/src/toil/job.py +++ b/src/toil/job.py @@ -3992,17 +3992,15 @@ class WorkerImportJob(Job): def __init__( self, filenames: List[str], - disk_size: Optional[ParseableIndivisibleResource] = None, **kwargs: Any ): """ Setup importing files on a worker. :param filenames: List of file URIs to import - :param disk_size: Designated disk space the worker can use when importing. Disregarded if stream is enabled. :param kwargs: args for the superclass """ self.filenames = filenames - super().__init__(local=False, disk=disk_size, **kwargs) + super().__init__(local=False, **kwargs) @staticmethod def import_files( @@ -4108,7 +4106,7 @@ def run( # Create batch import jobs for each group of files for batch in file_batches: candidate_uris = [file_to_data[filename][0] for filename in batch] - import_jobs.append(WorkerImportJob(candidate_uris, disk_size=self._import_worker_disk)) + import_jobs.append(WorkerImportJob(candidate_uris, disk=self._import_worker_disk)) for job in import_jobs: self.addChild(job) From 03365ddf2041c47eba006c9fdd4b56eceebcbfc1 Mon Sep 17 00:00:00 2001 From: stxue1 Date: Wed, 20 Nov 2024 14:06:19 -0800 Subject: [PATCH 20/20] Fix mypy types --- src/toil/cwl/cwltoil.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/toil/cwl/cwltoil.py b/src/toil/cwl/cwltoil.py index 9f317d1790..233d06e15b 100644 --- a/src/toil/cwl/cwltoil.py +++ b/src/toil/cwl/cwltoil.py @@ -1832,8 +1832,8 @@ def path_to_loc(obj: CWLObjectType) -> None: def extract_file_uri_once( - fileindex: Dict[str, str], - existing: Dict[str, str], + fileindex: dict[str, str], + existing: dict[str, str], file_metadata: CWLObjectType, mark_broken: bool = False, skip_remote: bool = False, @@ -1909,7 +1909,7 @@ def visit_files( mark_broken: bool = False, skip_remote: bool = False, bypass_file_store: bool = False, -) -> List[V]: +) -> list[V]: """ Prepare all files and directories. 
@@ -1955,7 +1955,7 @@ def visit_files( :param log_level: Log imported files at the given level. """ - func_return: List[Any] = list() + func_return: list[Any] = list() tool_id = cwl_object.get("id", str(cwl_object)) if cwl_object else "" logger.debug("Importing files for %s", tool_id) @@ -3521,7 +3521,7 @@ def __init__( basedir: str, skip_remote: bool, bypass_file_store: bool, - import_data: Promised[Dict[str, FileID]], + import_data: Promised[dict[str, FileID]], **kwargs: Any, ) -> None: """ @@ -3543,7 +3543,7 @@ def run(self, file_store: AbstractFileStore) -> Tuple[Process, CWLObjectType]: Convert the filenames in the workflow inputs into the URIs :return: Promise of transformed workflow inputs. A tuple of the job order and process """ - candidate_to_fileid = unwrap(self.import_data) + candidate_to_fileid: dict[str, FileID] = unwrap(self.import_data) initialized_job_order = unwrap(self.initialized_job_order) tool = unwrap(self.tool) @@ -3554,8 +3554,8 @@ def convert_file(filename: str) -> FileID: file_convert_function = functools.partial(extract_and_convert_file_to_toil_uri, convert_file) fs_access = ToilFsAccess(self.basedir) - fileindex: Dict[str, str] = {} - existing: Dict[str, str] = {} + fileindex: dict[str, str] = {} + existing: dict[str, str] = {} visit_files( file_convert_function, fs_access, @@ -3683,8 +3683,8 @@ def extract_workflow_inputs( :param tool: tool object :return: """ - fileindex: Dict[str, str] = {} - existing: Dict[str, str] = {} + fileindex: dict[str, str] = {} + existing: dict[str, str] = {} # Extract out all the input files' filenames logger.info("Collecting input files...") @@ -3736,8 +3736,8 @@ def import_workflow_inputs( :param log_level: log level :return: """ - fileindex: Dict[str, str] = {} - existing: Dict[str, str] = {} + fileindex: dict[str, str] = {} + existing: dict[str, str] = {} # Define something we can call to import a file and get its file # ID. @@ -3799,8 +3799,8 @@ def file_import_function(url: str) -> FileID: T = TypeVar("T") def visitSteps( cmdline_tool: Process, - op: Callable[[CommentedMap], List[T]], -) -> List[T]: + op: Callable[[CommentedMap], list[T]], +) -> list[T]: """ Iterate over a CWL Process object, running the op on each tool description CWL object.