Skip to content

Commit

Permalink
Merge pull request #569 from KnowledgeCaptureAndDiscovery/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
dgarijo authored Jun 9, 2023
2 parents 77fdccf + 402e0e7 commit 9a976e7
Show file tree
Hide file tree
Showing 7 changed files with 132 additions and 2 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ Given a readme file (or a GitHub/Gitlab repository) SOMEF will extract the follo
- **Logo**: Main logo used to represent the target software component.
- **Ontologies**: URL and path to the ontology files present in the repository.
- **Application domain**: The application domain of the repository. Current supported domains include: Astrophysics, Audio, Computer vision, Graphs, Natural language processing, Reinforcement learning, Semantic web, Sequential. Domains are not mutually exclusive. These domains have been extracted from [awesome lists](https://github.com/topics/awesome-list) and [Papers with code](https://paperswithcode.com/). Find more information in our [documentation](https://somef.readthedocs.io/en/latest/).
- **Workflows**: URL and path to the workflow files present in the repository.


We use different supervised classifiers, header analysis, regular expressions and the GitHub/Gitlab API to retrieve all these fields (more than one technique may be used for each field). Each extraction records its provenance, with the confidence and technique used on each step. For more information check the [output format description](https://somef.readthedocs.io/en/latest/output/)
Expand Down
1 change: 1 addition & 0 deletions docs/output.md
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ SOMEF aims to recognize the following categories (in alphabetical order):
- `support`: Guidelines and links of where to obtain support for a software component.
- `support_channels`: Help channels one can use to get support about the target software component.
- `usage`: Usage examples and considerations of a code repository.
- `workflows`: URL and path to the workflow files present in the repository.

The following table summarizes the properties used to describe a `category`:

Expand Down
22 changes: 22 additions & 0 deletions src/somef/extract_workflows.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import re

# Regular expressions used to detect workflow definitions inside candidate
# files. Module-level names are kept as-is for backward compatibility with
# any code importing them directly.
Galaxy_pattern = r"(?i)a[_\s-]?galaxy[_\s-]?workflow"  # Galaxy .ga files declare "a galaxy workflow"
CWL_pattern = r"\bclass:\s*[Ww]orkflow\b"  # CWL documents carry a "class: Workflow" field
Workflow_content_pattern = r"in:\s*[^}]*\s*out:\s*(?:\[.*?\]|.*?(?=\n\s*\S+:|$))"  # generic in:/out: step blocks
workflow_pattern = r'\bworkflow\b'  # bare "workflow" keyword (e.g. WDL, Nextflow)
Nextflow_pattern = r"(?i)nextflow[\s\S]*?(workflow\s*\{[\s\S]*?\})"  # Nextflow scripts with a workflow { } block


def is_file_workflow(file_path):
    """Return True if the file at ``file_path`` appears to contain a workflow.

    The file is read as text; undecodable bytes are ignored because some
    candidate extensions checked by the caller (e.g. .knwf, .kar) may be
    binary archives. The content is then matched against the workflow
    patterns defined above.

    Parameters
    ----------
    file_path:
        Path to the candidate file.

    Returns
    -------
    bool
        True when any workflow pattern matches, False otherwise —
        including when the file cannot be read at all.
    """
    try:
        # errors='ignore' prevents UnicodeDecodeError on binary candidates;
        # previously open() sat outside the try block and could crash.
        with open(file_path, 'r', errors='ignore') as file:
            content = file.read()
    except OSError:
        # Unreadable file: report "not a workflow" instead of raising
        # (the original implicitly returned None via `except: pass`).
        return False
    return bool(
        re.search(Galaxy_pattern, content)
        or re.search(CWL_pattern, content)
        or re.search(Workflow_content_pattern, content)
        or re.search(workflow_pattern, content, re.IGNORECASE)
        or re.search(Nextflow_pattern, content)
    )
12 changes: 10 additions & 2 deletions src/somef/process_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import re
import urllib
from .utils import constants, markdown_utils
from . import extract_ontologies
from . import extract_ontologies,extract_workflows
from .process_results import Result
from chardet import detect

Expand Down Expand Up @@ -150,7 +150,15 @@ def process_repository_files(repo_dir, metadata_result: Result, repo_type, owner
constants.PROP_TYPE: constants.URL
}, 1, constants.TECHNIQUE_FILE_EXPLORATION
)

if filename.endswith(".ga") or filename.endswith(".cwl") or filename.endswith(".nf") or (filename.endswith(".snake") or filename.endswith(".smk") or "Snakefile"==filename_no_ext) or filename.endswith(".knwf") or filename.endswith(".t2flow") or filename.endswith(".dag") or filename.endswith(".kar") or filename.endswith(".wdl"):
analysis = extract_workflows.is_file_workflow(os.path.join(repo_dir, file_path))
if analysis == True:
Workflow_url=get_file_link(repo_type,file_path,owner,repo_name,repo_default_branch,repo_dir,repo_relative_path,filename)
metadata_result.add_result(constants.CAT_WORKFLOWS,
{
constants.PROP_VALUE: Workflow_url,
constants.PROP_TYPE: constants.URL
}, 1, constants.TECHNIQUE_FILE_EXPLORATION)
# TO DO: Improve this a bit, as just returning the docs folder is not that informative
for dir_name in dir_names:
if dir_name.lower() == "docs":
Expand Down
80 changes: 80 additions & 0 deletions src/somef/test/test_data/SimulatedReads2Map.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
version 1.0


import "../../structs/dna_seq_structs.wdl"
import "../../structs/read_simulation_structs.wdl"

import "../../tasks/pedigree_simulator_utils.wdl"
import "../../tasks/JointReports.wdl" as reports

import "../../subworkflows/SimulatedSingleFamily.wdl" as sub

# Test fixture used by SOMEF's test_extract_workflows to verify that WDL
# files are detected as workflows. Simulates sequencing reads for several
# families and aggregates the per-family results into joint report tables.
workflow SimulatedReads {

  # Inputs: reference data, family/sequencing descriptions, and tuning
  # parameters. The last four have defaults and may be omitted by callers.
  input {
    ReferenceFasta references
    Family family
    Sequencing sequencing
    Int number_of_families
    Int global_seed
    Int max_cores
    Int n_chrom
    String? filters

    Int chunk_size = 5
    Boolean gatk_mchap = false
    Boolean hardfilters = true
    Boolean replaceAD = true
  }

  # ProduceFamiliesSeeds just generates random seeds. It returns an
  # array of integers
  call pedigree_simulator_utils.ProduceFamiliesSeeds {
    input:
      global_seed= global_seed,
      number_of_families=number_of_families
  }

  # Here we generate Family objects on the fly, based on the values
  # from the family and the random seed of the previous task.
  scatter (seed in ProduceFamiliesSeeds.seeds) {
    # Calling reads_simu for each seed
    call sub.SimulatedSingleFamily {
      input:
        references=references,
        family=family,
        sequencing = sequencing,
        max_cores = max_cores,
        filters = filters,
        ploidy = family.ploidy,
        chunk_size = chunk_size,
        gatk_mchap=gatk_mchap,
        hardfilters = hardfilters,
        replaceAD = replaceAD,
        n_chrom = n_chrom
    }
  }

  # Join the scattered per-family outputs into a single set of tables.
  call reports.JointTablesSimu {
    input:
      data1_depths_geno_prob = SimulatedSingleFamily.data1_depths_geno_prob,
      data2_maps = SimulatedSingleFamily.data2_maps,
      data3_filters = SimulatedSingleFamily.data3_filters,
      data5_SNPCall_efficiency = SimulatedSingleFamily.data5_SNPCall_efficiency,
      data4_times = SimulatedSingleFamily.data4_times,
      data6_RDatas = SimulatedSingleFamily.data6_RDatas,
      data7_gusmap = SimulatedSingleFamily.data7_gusmap,
      data8_names = SimulatedSingleFamily.data8_names,
      data9_simu_haplo = SimulatedSingleFamily.simu_haplo,
      data10_counts = SimulatedSingleFamily.data10_counts,
      depth = sequencing.depth,
      plots = SimulatedSingleFamily.Plots,
      positions = SimulatedSingleFamily.positions
  }

  # Here you can reference outputs from the sub workflow. Remember that
  # it will be an array of the same type of the original.
  output {
    File results = JointTablesSimu.results
  }
}
17 changes: 17 additions & 0 deletions src/somef/test/test_extract_workflows.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import unittest
import os
from pathlib import Path

from .. import extract_workflows

test_data_repositories = str(Path(__file__).parent / "test_data" ) + os.path.sep

class TestWorkflows(unittest.TestCase):
    """Tests for extract_workflows.is_file_workflow."""

    def test_is_workflow(self):
        """A WDL file with a workflow block is detected as a workflow."""
        workflow = extract_workflows.is_file_workflow(test_data_repositories + "SimulatedReads2Map.wdl")
        # The message is shown on FAILURE, i.e. when the file was NOT
        # detected — the original message stated the opposite.
        assert workflow, "The file should have been detected as a workflow."

    def test_is_workflow_fake(self):
        """A plain Python script is not detected as a workflow."""
        workflow = extract_workflows.is_file_workflow(
            test_data_repositories + "repositories/wav2letter/scripts/arrayfire_parser.py")
        assert workflow is False, "The file should not have been detected as a workflow."
1 change: 1 addition & 0 deletions src/somef/utils/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@
# Category identifiers (descriptions per docs/output.md).
CAT_SUPPORT = "support"  # guidelines and links for obtaining support
CAT_SUPPORT_CHANNELS = "support_channels"  # help channels for the target software component
CAT_USAGE = "usage"  # usage examples and considerations of a code repository
CAT_WORKFLOWS = "workflows"  # URL and path to workflow files present in the repository

# Special category: missing categories
CAT_MISSING = "somef_missing_categories"
Expand Down

0 comments on commit 9a976e7

Please sign in to comment.