Skip to content

Commit

Permalink
Merge pull request #569 from KnowledgeCaptureAndDiscovery/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
dgarijo authored Jun 9, 2023
2 parents 77fdccf + 402e0e7 commit 9a976e7
Show file tree
Hide file tree
Showing 7 changed files with 132 additions and 2 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ Given a readme file (or a GitHub/Gitlab repository) SOMEF will extract the follo
- **Logo**: Main logo used to represent the target software component.
- **Ontologies**: URL and path to the ontology files present in the repository.
- **Application domain**: The application domain of the repository. Current supported domains include: Astrophysics, Audio, Computer vision, Graphs, Natural language processing, Reinforcement learning, Semantic web, Sequential. Domains are not mutually exclusive. These domains have been extracted from [awesome lists](https://github.com/topics/awesome-list) and [Papers with code](https://paperswithcode.com/). Find more information in our [documentation](https://somef.readthedocs.io/en/latest/).
- **Workflows**: URL and path to the workflow files present in the repository.


We use different supervised classifiers, header analysis, regular expressions and the GitHub/Gitlab API to retrieve all these fields (more than one technique may be used for each field). Each extraction records its provenance, with the confidence and technique used on each step. For more information check the [output format description](https://somef.readthedocs.io/en/latest/output/)
Expand Down
1 change: 1 addition & 0 deletions docs/output.md
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ SOMEF aims to recognize the following categories (in alphabetical order):
- `support`: Guidelines and links of where to obtain support for a software component.
- `support_channels`: Help channels one can use to get support about the target software component.
- `usage`: Usage examples and considerations of a code repository.
- `workflows`: URL and path to the workflow files present in the repository.

The following table summarizes the properties used to describe a `category`:

Expand Down
22 changes: 22 additions & 0 deletions src/somef/extract_workflows.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import re

# Regular expressions used to detect workflow definitions inside candidate
# files. Module-level names are kept as-is for backward compatibility with
# any code importing them directly.
Galaxy_pattern = r"(?i)a[_\s-]?galaxy[_\s-]?workflow"  # Galaxy .ga files declare "a galaxy workflow"
CWL_pattern = r"\bclass:\s*[Ww]orkflow\b"  # CWL documents carry a "class: Workflow" field
Workflow_content_pattern = r"in:\s*[^}]*\s*out:\s*(?:\[.*?\]|.*?(?=\n\s*\S+:|$))"  # generic in:/out: step blocks
workflow_pattern = r'\bworkflow\b'  # bare "workflow" keyword (e.g. WDL, Nextflow)
Nextflow_pattern = r"(?i)nextflow[\s\S]*?(workflow\s*\{[\s\S]*?\})"  # Nextflow scripts with a workflow { } block


def is_file_workflow(file_path):
    """Return True if the file at ``file_path`` appears to contain a workflow.

    The file is read as text; undecodable bytes are ignored because some
    candidate extensions checked by the caller (e.g. .knwf, .kar) may be
    binary archives. The content is then matched against the workflow
    patterns defined above.

    Parameters
    ----------
    file_path:
        Path to the candidate file.

    Returns
    -------
    bool
        True when any workflow pattern matches, False otherwise —
        including when the file cannot be read at all.
    """
    try:
        # errors='ignore' prevents UnicodeDecodeError on binary candidates;
        # previously open() sat outside the try block and could crash.
        with open(file_path, 'r', errors='ignore') as file:
            content = file.read()
    except OSError:
        # Unreadable file: report "not a workflow" instead of raising
        # (the original implicitly returned None via `except: pass`).
        return False
    return bool(
        re.search(Galaxy_pattern, content)
        or re.search(CWL_pattern, content)
        or re.search(Workflow_content_pattern, content)
        or re.search(workflow_pattern, content, re.IGNORECASE)
        or re.search(Nextflow_pattern, content)
    )
12 changes: 10 additions & 2 deletions src/somef/process_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import re
import urllib
from .utils import constants, markdown_utils
from . import extract_ontologies
from . import extract_ontologies,extract_workflows
from .process_results import Result
from chardet import detect

Expand Down Expand Up @@ -150,7 +150,15 @@ def process_repository_files(repo_dir, metadata_result: Result, repo_type, owner
constants.PROP_TYPE: constants.URL
}, 1, constants.TECHNIQUE_FILE_EXPLORATION
)

if filename.endswith(".ga") or filename.endswith(".cwl") or filename.endswith(".nf") or (filename.endswith(".snake") or filename.endswith(".smk") or "Snakefile"==filename_no_ext) or filename.endswith(".knwf") or filename.endswith(".t2flow") or filename.endswith(".dag") or filename.endswith(".kar") or filename.endswith(".wdl"):
analysis = extract_workflows.is_file_workflow(os.path.join(repo_dir, file_path))
if analysis == True:
Workflow_url=get_file_link(repo_type,file_path,owner,repo_name,repo_default_branch,repo_dir,repo_relative_path,filename)
metadata_result.add_result(constants.CAT_WORKFLOWS,
{
constants.PROP_VALUE: Workflow_url,
constants.PROP_TYPE: constants.URL
}, 1, constants.TECHNIQUE_FILE_EXPLORATION)
# TO DO: Improve this a bit, as just returning the docs folder is not that informative
for dir_name in dir_names:
if dir_name.lower() == "docs":
Expand Down
80 changes: 80 additions & 0 deletions src/somef/test/test_data/SimulatedReads2Map.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
version 1.0


import "../../structs/dna_seq_structs.wdl"
import "../../structs/read_simulation_structs.wdl"

import "../../tasks/pedigree_simulator_utils.wdl"
import "../../tasks/JointReports.wdl" as reports

import "../../subworkflows/SimulatedSingleFamily.wdl" as sub

# Test fixture used by SOMEF's test_extract_workflows to verify that WDL
# files are detected as workflows. Simulates sequencing reads for several
# families and aggregates the per-family results into joint report tables.
workflow SimulatedReads {

  # Inputs: reference data, family/sequencing descriptions, and tuning
  # parameters. The last four have defaults and may be omitted by callers.
  input {
    ReferenceFasta references
    Family family
    Sequencing sequencing
    Int number_of_families
    Int global_seed
    Int max_cores
    Int n_chrom
    String? filters

    Int chunk_size = 5
    Boolean gatk_mchap = false
    Boolean hardfilters = true
    Boolean replaceAD = true
  }

  # ProduceFamiliesSeeds just generates random seeds. It returns an
  # array of integers
  call pedigree_simulator_utils.ProduceFamiliesSeeds {
    input:
      global_seed= global_seed,
      number_of_families=number_of_families
  }

  # Here we generate Family objects on the fly, based on the values
  # from the family and the random seed of the previous task.
  scatter (seed in ProduceFamiliesSeeds.seeds) {
    # Calling reads_simu for each seed
    call sub.SimulatedSingleFamily {
      input:
        references=references,
        family=family,
        sequencing = sequencing,
        max_cores = max_cores,
        filters = filters,
        ploidy = family.ploidy,
        chunk_size = chunk_size,
        gatk_mchap=gatk_mchap,
        hardfilters = hardfilters,
        replaceAD = replaceAD,
        n_chrom = n_chrom
    }
  }

  # Join the scattered per-family outputs into a single set of tables.
  call reports.JointTablesSimu {
    input:
      data1_depths_geno_prob = SimulatedSingleFamily.data1_depths_geno_prob,
      data2_maps = SimulatedSingleFamily.data2_maps,
      data3_filters = SimulatedSingleFamily.data3_filters,
      data5_SNPCall_efficiency = SimulatedSingleFamily.data5_SNPCall_efficiency,
      data4_times = SimulatedSingleFamily.data4_times,
      data6_RDatas = SimulatedSingleFamily.data6_RDatas,
      data7_gusmap = SimulatedSingleFamily.data7_gusmap,
      data8_names = SimulatedSingleFamily.data8_names,
      data9_simu_haplo = SimulatedSingleFamily.simu_haplo,
      data10_counts = SimulatedSingleFamily.data10_counts,
      depth = sequencing.depth,
      plots = SimulatedSingleFamily.Plots,
      positions = SimulatedSingleFamily.positions
  }

  # Here you can reference outputs from the sub workflow. Remember that
  # it will be an array of the same type of the original.
  output {
    File results = JointTablesSimu.results
  }
}
17 changes: 17 additions & 0 deletions src/somef/test/test_extract_workflows.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import unittest
import os
from pathlib import Path

from .. import extract_workflows

test_data_repositories = str(Path(__file__).parent / "test_data" ) + os.path.sep

class TestWorkflows(unittest.TestCase):
    """Tests for extract_workflows.is_file_workflow."""

    def test_is_workflow(self):
        """A WDL file with a workflow block is detected as a workflow."""
        workflow = extract_workflows.is_file_workflow(test_data_repositories + "SimulatedReads2Map.wdl")
        # The message is shown on FAILURE, i.e. when the file was NOT
        # detected — the original message stated the opposite.
        assert workflow, "The file should have been detected as a workflow."

    def test_is_workflow_fake(self):
        """A plain Python script is not detected as a workflow."""
        workflow = extract_workflows.is_file_workflow(
            test_data_repositories + "repositories/wav2letter/scripts/arrayfire_parser.py")
        assert workflow is False, "The file should not have been detected as a workflow."
1 change: 1 addition & 0 deletions src/somef/utils/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@
# Category identifiers (descriptions per docs/output.md).
CAT_SUPPORT = "support"  # guidelines and links for obtaining support
CAT_SUPPORT_CHANNELS = "support_channels"  # help channels for the target software component
CAT_USAGE = "usage"  # usage examples and considerations of a code repository
CAT_WORKFLOWS = "workflows"  # URL and path to workflow files present in the repository

# Special category: missing categories
CAT_MISSING = "somef_missing_categories"
Expand Down

0 comments on commit 9a976e7

Please sign in to comment.