From 7a396638a05cf54ed535a27c861b1965b0f5fabb Mon Sep 17 00:00:00 2001 From: rafaeel9 <115452767+rafaeel9@users.noreply.github.com> Date: Thu, 8 Jun 2023 13:26:27 +0000 Subject: [PATCH 1/3] Added files for testing and extracting workflows and made changes to process_files.py and constants to add the new category --- src/somef/extract_workflows.py | 20 +++++ src/somef/process_files.py | 12 ++- .../test/test_data/SimulatedReads2Map.wdl | 80 +++++++++++++++++++ src/somef/test/test_extract_workflows.py | 17 ++++ src/somef/utils/constants.py | 1 + 5 files changed, 128 insertions(+), 2 deletions(-) create mode 100644 src/somef/extract_workflows.py create mode 100644 src/somef/test/test_data/SimulatedReads2Map.wdl create mode 100644 src/somef/test/test_extract_workflows.py diff --git a/src/somef/extract_workflows.py b/src/somef/extract_workflows.py new file mode 100644 index 00000000..ffd638ed --- /dev/null +++ b/src/somef/extract_workflows.py @@ -0,0 +1,20 @@ +import re +Galaxy_pattern = r"(?i)a[_\s-]?galaxy[_\s-]?workflow" +CWL_pattern = r"\bclass:\s*[Ww]orkflow\b" +Workflow_content_pattern = r"in:\s*[^}]*\s*out:\s*(?:\[.*?\]|.*?(?=\n\s*\S+:|$))" +Nextflow_pattern= r"(?i)nextflow[\s\S]*?(workflow\s*\{[\s\S]*?\})" + +def is_file_workflow(file_path): + with open(file_path, 'r') as file: + content = file.read() + try: + Galaxy_match=re.search(Galaxy_pattern,content) + CWL_match=re.search(CWL_pattern,content) + Workflow_match=re.search(Workflow_content_pattern,content) + Nextflow_match=re.search(Nextflow_pattern,content) + if Galaxy_match or CWL_match or Workflow_match or Nextflow_match: + return True + else: + return False + except Exception: + pass \ No newline at end of file diff --git a/src/somef/process_files.py b/src/somef/process_files.py index f67fc79f..08cf77ed 100644 --- a/src/somef/process_files.py +++ b/src/somef/process_files.py @@ -3,7 +3,7 @@ import re import urllib from .utils import constants, markdown_utils -from . import extract_ontologies +from . 
import extract_ontologies,extract_workflows from .process_results import Result from chardet import detect @@ -150,7 +150,15 @@ def process_repository_files(repo_dir, metadata_result: Result, repo_type, owner constants.PROP_TYPE: constants.URL }, 1, constants.TECHNIQUE_FILE_EXPLORATION ) - + if filename.endswith(".ga") or filename.endswith(".cwl") or filename.endswith(".nf") or (filename.endswith(".snake") or filename.endswith(".smk") or "Snakefile"==filename_no_ext) or filename.endswith(".knwf") or filename.endswith(".t2flow") or filename.endswith(".dag") or filename.endswith(".kar") or filename.endswith(".wdl"): + analysis = extract_workflows.is_file_workflow(os.path.join(repo_dir, file_path)) + if analysis == True: + Workflow_url=get_file_link(repo_type,file_path,owner,repo_name,repo_default_branch,repo_dir,repo_relative_path,filename) + metadata_result.add_result(constants.CAT_WORKFLOWS, + { + constants.PROP_VALUE: Workflow_url, + constants.PROP_TYPE: constants.URL + }, 1, constants.TECHNIQUE_FILE_EXPLORATION) # TO DO: Improve this a bit, as just returning the docs folder is not that informative for dir_name in dir_names: if dir_name.lower() == "docs": diff --git a/src/somef/test/test_data/SimulatedReads2Map.wdl b/src/somef/test/test_data/SimulatedReads2Map.wdl new file mode 100644 index 00000000..71f55881 --- /dev/null +++ b/src/somef/test/test_data/SimulatedReads2Map.wdl @@ -0,0 +1,80 @@ +version 1.0 + + +import "../../structs/dna_seq_structs.wdl" +import "../../structs/read_simulation_structs.wdl" + +import "../../tasks/pedigree_simulator_utils.wdl" +import "../../tasks/JointReports.wdl" as reports + +import "../../subworkflows/SimulatedSingleFamily.wdl" as sub + +workflow SimulatedReads { + + input { + ReferenceFasta references + Family family + Sequencing sequencing + Int number_of_families + Int global_seed + Int max_cores + Int n_chrom + String? 
filters + + Int chunk_size = 5 + Boolean gatk_mchap = false + Boolean hardfilters = true + Boolean replaceAD = true + } + + # ProduceFamiliesSeeds just generates random seeds. It returns an + # array of integers + call pedigree_simulator_utils.ProduceFamiliesSeeds { + input: + global_seed= global_seed, + number_of_families=number_of_families + } + + # Here we generate Family objects on the fly, based on the values + # from the family and the random seed of the previous task. + scatter (seed in ProduceFamiliesSeeds.seeds) { + # Calling reads_simu for each seed + call sub.SimulatedSingleFamily { + input: + references=references, + family=family, + sequencing = sequencing, + max_cores = max_cores, + filters = filters, + ploidy = family.ploidy, + chunk_size = chunk_size, + gatk_mchap=gatk_mchap, + hardfilters = hardfilters, + replaceAD = replaceAD, + n_chrom = n_chrom + } + } + + call reports.JointTablesSimu { + input: + data1_depths_geno_prob = SimulatedSingleFamily.data1_depths_geno_prob, + data2_maps = SimulatedSingleFamily.data2_maps, + data3_filters = SimulatedSingleFamily.data3_filters, + data5_SNPCall_efficiency = SimulatedSingleFamily.data5_SNPCall_efficiency, + data4_times = SimulatedSingleFamily.data4_times, + data6_RDatas = SimulatedSingleFamily.data6_RDatas, + data7_gusmap = SimulatedSingleFamily.data7_gusmap, + data8_names = SimulatedSingleFamily.data8_names, + data9_simu_haplo = SimulatedSingleFamily.simu_haplo, + data10_counts = SimulatedSingleFamily.data10_counts, + depth = sequencing.depth, + plots = SimulatedSingleFamily.Plots, + positions = SimulatedSingleFamily.positions + } + + # Here you can reference outputs from the sub workflow. Remember that + # it will be an array of the same type of the original. 
+ output { + File results = JointTablesSimu.results + } +} diff --git a/src/somef/test/test_extract_workflows.py b/src/somef/test/test_extract_workflows.py new file mode 100644 index 00000000..295f257f --- /dev/null +++ b/src/somef/test/test_extract_workflows.py @@ -0,0 +1,17 @@ +import unittest +import os +from pathlib import Path + +from .. import extract_workflows + +test_data_repositories = str(Path(__file__).parent / "test_data" ) + os.path.sep + +class TestWorkflows(unittest.TestCase): + def test_is_workflow(self): + + workflow = extract_workflows.is_file_workflow(test_data_repositories + "SimulatedReads2Map.wdl") + assert workflow, "The file does not contain a workflow." + + def test_is_workflow_fake(self): + workflow = extract_workflows.is_file_workflow(test_data_repositories + "repositories/wav2letter/scripts/arrayfire_parser.py") + assert(workflow is False) \ No newline at end of file diff --git a/src/somef/utils/constants.py b/src/somef/utils/constants.py index d9e33fbb..40a209b2 100644 --- a/src/somef/utils/constants.py +++ b/src/somef/utils/constants.py @@ -77,6 +77,7 @@ CAT_SUPPORT = "support" CAT_SUPPORT_CHANNELS = "support_channels" CAT_USAGE = "usage" +CAT_WORKFLOWS = "workflows" # Special category: missing categories CAT_MISSING = "somef_missing_categories" From d5fd353e6e5b74edefaed12ede6ef08cab9e9a67 Mon Sep 17 00:00:00 2001 From: rafaeel9 <115452767+rafaeel9@users.noreply.github.com> Date: Fri, 9 Jun 2023 08:11:01 +0000 Subject: [PATCH 2/3] Updated function to pass tests. 
--- src/somef/extract_workflows.py | 4 +++- src/somef/test/test_extract_workflows.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/somef/extract_workflows.py b/src/somef/extract_workflows.py index ffd638ed..4ae5a6f6 100644 --- a/src/somef/extract_workflows.py +++ b/src/somef/extract_workflows.py @@ -2,6 +2,7 @@ Galaxy_pattern = r"(?i)a[_\s-]?galaxy[_\s-]?workflow" CWL_pattern = r"\bclass:\s*[Ww]orkflow\b" Workflow_content_pattern = r"in:\s*[^}]*\s*out:\s*(?:\[.*?\]|.*?(?=\n\s*\S+:|$))" +workflow_pattern=r'\bworkflow\b' Nextflow_pattern= r"(?i)nextflow[\s\S]*?(workflow\s*\{[\s\S]*?\})" def is_file_workflow(file_path): @@ -11,8 +12,9 @@ def is_file_workflow(file_path): Galaxy_match=re.search(Galaxy_pattern,content) CWL_match=re.search(CWL_pattern,content) Workflow_match=re.search(Workflow_content_pattern,content) + Workflow_match_2=re.search(workflow_pattern,content,re.IGNORECASE) Nextflow_match=re.search(Nextflow_pattern,content) - if Galaxy_match or CWL_match or Workflow_match or Nextflow_match: + if Galaxy_match or CWL_match or Workflow_match or Workflow_match_2 or Nextflow_match: return True else: return False diff --git a/src/somef/test/test_extract_workflows.py b/src/somef/test/test_extract_workflows.py index 295f257f..34114e59 100644 --- a/src/somef/test/test_extract_workflows.py +++ b/src/somef/test/test_extract_workflows.py @@ -10,7 +10,7 @@ class TestWorkflows(unittest.TestCase): def test_is_workflow(self): workflow = extract_workflows.is_file_workflow(test_data_repositories + "SimulatedReads2Map.wdl") - assert workflow, "The file does not contain a workflow." + assert workflow, "The file does contain a workflow." 
def test_is_workflow_fake(self): workflow = extract_workflows.is_file_workflow(test_data_repositories + "repositories/wav2letter/scripts/arrayfire_parser.py") From e4ca25209a4b703682298a00fa3fd9539fea1ae0 Mon Sep 17 00:00:00 2001 From: rafaeel9 <115452767+rafaeel9@users.noreply.github.com> Date: Fri, 9 Jun 2023 09:04:54 +0000 Subject: [PATCH 3/3] added documentation related to new addition --- README.md | 1 + docs/output.md | 1 + 2 files changed, 2 insertions(+) diff --git a/README.md b/README.md index 7978f8f6..c1e0c2ad 100644 --- a/README.md +++ b/README.md @@ -50,6 +50,7 @@ Given a readme file (or a GitHub/Gitlab repository) SOMEF will extract the follo - **Logo**: Main logo used to represent the target software component. - **Ontologies**: URL and path to the ontology files present in the repository. - **Application domain**: The application domain of the repository. Current supported domains include: Astrophisics, Audio, Computer vision, Graphs, Natural language processing, Reinforcement learning, Semantc web, Sequential. Domains are not mutually exclusive. These domains have been extracted from [awesome lists](https://github.com/topics/awesome-list) and [Papers with code](https://paperswithcode.com/). Find more information in our [documentation](https://somef.readthedocs.io/en/latest/). +- **Workflows**: URL and path to the workflow files present in the repository. We use different supervised classifiers, header analysis, regular expressions and the GitHub/Gitlab API to retrieve all these fields (more than one technique may be used for each field). Each extraction records its provenance, with the confidence and technique used on each step. 
For more information check the [output format description](https://somef.readthedocs.io/en/latest/output/) diff --git a/docs/output.md b/docs/output.md index bf3767ef..581fff8e 100644 --- a/docs/output.md +++ b/docs/output.md @@ -111,6 +111,7 @@ SOMEF aims to recognize the following categories (in alphabetical order): - `support`: Guidelines and links of where to obtain support for a software component. - `support_channels`: Help channels one can use to get support about the target software component. - `usage`: Usage examples and considerations of a code repository. +- `workflows`: URL and path to the workflow files present in the repository. The following table summarized the properties used to describe a `category`: