diff --git a/README.md b/README.md index 7978f8f6..c1e0c2ad 100644 --- a/README.md +++ b/README.md @@ -50,6 +50,7 @@ Given a readme file (or a GitHub/Gitlab repository) SOMEF will extract the follo - **Logo**: Main logo used to represent the target software component. - **Ontologies**: URL and path to the ontology files present in the repository. - **Application domain**: The application domain of the repository. Current supported domains include: Astrophisics, Audio, Computer vision, Graphs, Natural language processing, Reinforcement learning, Semantc web, Sequential. Domains are not mutually exclusive. These domains have been extracted from [awesome lists](https://github.com/topics/awesome-list) and [Papers with code](https://paperswithcode.com/). Find more information in our [documentation](https://somef.readthedocs.io/en/latest/). +- **Workflows**: URL and path to the workflow files present in the repository. We use different supervised classifiers, header analysis, regular expressions and the GitHub/Gitlab API to retrieve all these fields (more than one technique may be used for each field). Each extraction records its provenance, with the confidence and technique used on each step. For more information check the [output format description](https://somef.readthedocs.io/en/latest/output/) diff --git a/docs/output.md b/docs/output.md index bf3767ef..581fff8e 100644 --- a/docs/output.md +++ b/docs/output.md @@ -111,6 +111,7 @@ SOMEF aims to recognize the following categories (in alphabetical order): - `support`: Guidelines and links of where to obtain support for a software component. - `support_channels`: Help channels one can use to get support about the target software component. - `usage`: Usage examples and considerations of a code repository. +- `workflows`: URL and path to the workflow files present in the repository. 
"""Heuristics to decide whether a file contains a scientific workflow."""
import logging
import re

# Raw pattern strings are kept as public module constants (unchanged).
Galaxy_pattern = r"(?i)a[_\s-]?galaxy[_\s-]?workflow"
CWL_pattern = r"\bclass:\s*[Ww]orkflow\b"
Workflow_content_pattern = r"in:\s*[^}]*\s*out:\s*(?:\[.*?\]|.*?(?=\n\s*\S+:|$))"
workflow_pattern = r'\bworkflow\b'
Nextflow_pattern = r"(?i)nextflow[\s\S]*?(workflow\s*\{[\s\S]*?\})"

_logger = logging.getLogger(__name__)

# Compiled once at import time. The plain "workflow" keyword check comes first
# because it is the cheapest pattern; any() stops at the first hit.
_WORKFLOW_REGEXES = (
    re.compile(workflow_pattern, re.IGNORECASE),
    re.compile(Galaxy_pattern),
    re.compile(CWL_pattern),
    re.compile(Nextflow_pattern),
    re.compile(Workflow_content_pattern),
)


def is_file_workflow(file_path):
    """Return True when the content of *file_path* looks like a workflow.

    The file is read as text and searched with the module's Galaxy, CWL,
    Nextflow, generic "in:/out:" and "workflow" keyword patterns; one match
    is enough.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        bool: True if any pattern matches, False otherwise. False is also
        returned (and a warning is logged) when the file cannot be read,
        so the result is always a real boolean, never None.
    """
    try:
        # errors="replace" keeps undecodable bytes (e.g. binary workflow
        # archives) from raising UnicodeDecodeError and aborting the scan.
        with open(file_path, 'r', encoding="utf-8", errors="replace") as file:
            content = file.read()
    except OSError as err:
        _logger.warning("Could not read %s while looking for workflows: %s",
                        file_path, err)
        return False

    return any(regex.search(content) for regex in _WORKFLOW_REGEXES)
import extract_ontologies,extract_workflows from .process_results import Result from chardet import detect @@ -150,7 +150,15 @@ def process_repository_files(repo_dir, metadata_result: Result, repo_type, owner constants.PROP_TYPE: constants.URL }, 1, constants.TECHNIQUE_FILE_EXPLORATION ) - + if filename.endswith(".ga") or filename.endswith(".cwl") or filename.endswith(".nf") or (filename.endswith(".snake") or filename.endswith(".smk") or "Snakefile"==filename_no_ext) or filename.endswith(".knwf") or filename.endswith(".t2flow") or filename.endswith(".dag") or filename.endswith(".kar") or filename.endswith(".wdl"): + analysis = extract_workflows.is_file_workflow(os.path.join(repo_dir, file_path)) + if analysis == True: + Workflow_url=get_file_link(repo_type,file_path,owner,repo_name,repo_default_branch,repo_dir,repo_relative_path,filename) + metadata_result.add_result(constants.CAT_WORKFLOWS, + { + constants.PROP_VALUE: Workflow_url, + constants.PROP_TYPE: constants.URL + }, 1, constants.TECHNIQUE_FILE_EXPLORATION) # TO DO: Improve this a bit, as just returning the docs folder is not that informative for dir_name in dir_names: if dir_name.lower() == "docs": diff --git a/src/somef/test/test_data/SimulatedReads2Map.wdl b/src/somef/test/test_data/SimulatedReads2Map.wdl new file mode 100644 index 00000000..71f55881 --- /dev/null +++ b/src/somef/test/test_data/SimulatedReads2Map.wdl @@ -0,0 +1,80 @@ +version 1.0 + + +import "../../structs/dna_seq_structs.wdl" +import "../../structs/read_simulation_structs.wdl" + +import "../../tasks/pedigree_simulator_utils.wdl" +import "../../tasks/JointReports.wdl" as reports + +import "../../subworkflows/SimulatedSingleFamily.wdl" as sub + +workflow SimulatedReads { + + input { + ReferenceFasta references + Family family + Sequencing sequencing + Int number_of_families + Int global_seed + Int max_cores + Int n_chrom + String? 
filters + + Int chunk_size = 5 + Boolean gatk_mchap = false + Boolean hardfilters = true + Boolean replaceAD = true + } + + # ProduceFamiliesSeeds just generates random seeds. It returns an + # array of integers + call pedigree_simulator_utils.ProduceFamiliesSeeds { + input: + global_seed= global_seed, + number_of_families=number_of_families + } + + # Here we generate Family objects on the fly, based on the values + # from the family and the random seed of the previous task. + scatter (seed in ProduceFamiliesSeeds.seeds) { + # Calling reads_simu for each seed + call sub.SimulatedSingleFamily { + input: + references=references, + family=family, + sequencing = sequencing, + max_cores = max_cores, + filters = filters, + ploidy = family.ploidy, + chunk_size = chunk_size, + gatk_mchap=gatk_mchap, + hardfilters = hardfilters, + replaceAD = replaceAD, + n_chrom = n_chrom + } + } + + call reports.JointTablesSimu { + input: + data1_depths_geno_prob = SimulatedSingleFamily.data1_depths_geno_prob, + data2_maps = SimulatedSingleFamily.data2_maps, + data3_filters = SimulatedSingleFamily.data3_filters, + data5_SNPCall_efficiency = SimulatedSingleFamily.data5_SNPCall_efficiency, + data4_times = SimulatedSingleFamily.data4_times, + data6_RDatas = SimulatedSingleFamily.data6_RDatas, + data7_gusmap = SimulatedSingleFamily.data7_gusmap, + data8_names = SimulatedSingleFamily.data8_names, + data9_simu_haplo = SimulatedSingleFamily.simu_haplo, + data10_counts = SimulatedSingleFamily.data10_counts, + depth = sequencing.depth, + plots = SimulatedSingleFamily.Plots, + positions = SimulatedSingleFamily.positions + } + + # Here you can reference outputs from the sub workflow. Remember that + # it will be an array of the same type of the original. 
import unittest
import os
from pathlib import Path

from .. import extract_workflows

test_data_repositories = str(Path(__file__).parent / "test_data") + os.path.sep


class TestWorkflows(unittest.TestCase):
    """Checks for extract_workflows.is_file_workflow on the bundled test data."""

    def test_is_workflow(self):
        """A WDL file from the test data is reported as a workflow."""
        workflow = extract_workflows.is_file_workflow(
            test_data_repositories + "SimulatedReads2Map.wdl")
        # unittest assertions are used instead of bare `assert`, which is
        # stripped when Python runs with -O.
        self.assertTrue(
            workflow, "SimulatedReads2Map.wdl should be detected as a workflow.")

    def test_is_workflow_fake(self):
        """A Python script from the test data is not reported as a workflow."""
        workflow = extract_workflows.is_file_workflow(
            test_data_repositories
            + "repositories/wav2letter/scripts/arrayfire_parser.py")
        # The result must be exactly False, not merely falsy (e.g. None).
        self.assertIs(workflow, False)