Merge pull request #173 from KnowledgeCaptureAndDiscovery/dev
Fix #153 Fix #163
dgarijo authored Mar 10, 2021
2 parents 70ddbe9 + 984b767 commit 5160cf3
Showing 1 changed file with 26 additions and 20 deletions.
46 changes: 26 additions & 20 deletions src/somef/cli.py
@@ -34,6 +34,7 @@

import urllib


## Markdown to plain text conversion: begin ##
# code snippet from https://stackoverflow.com/a/54923798
def unmark_element(element, stream=None):
@@ -226,7 +227,7 @@ def get_path(obj, path):
license_info = {}
if 'license' in filtered_resp:
for k in ('name', 'url'):
if k in filtered_resp['license']:
if k in filtered_resp['license']:
license_info[k] = filtered_resp['license'][k]

## If we didn't find it, look for the license
@@ -240,7 +241,8 @@ def get_path(obj, path):
# license_text = license_text_resp.text
license_info['url'] = possible_license_url

filtered_resp['license'] = license_info
if license_info != '':
filtered_resp['license'] = license_info

# get keywords / topics
topics_headers = header
@@ -310,7 +312,7 @@ def get_path(obj, path):
zip_ref.extractall(repo_extract_dir)

repo_folders = os.listdir(repo_extract_dir)
assert(len(repo_folders) == 1)
assert (len(repo_folders) == 1)

repo_dir = os.path.join(repo_extract_dir, repo_folders[0])

@@ -334,7 +336,8 @@ def get_path(obj, path):
else:
docs_path = repo_relative_path + "/" + dirname

docs.append(f"https://github.com/{owner}/{repo_name}/tree/{urllib.parse.quote(repo_ref)}/{docs_path}")
docs.append(
f"https://github.com/{owner}/{repo_name}/tree/{urllib.parse.quote(repo_ref)}/{docs_path}")
print(docs)

print("NOTEBOOKS:")
@@ -343,14 +346,12 @@
print("DOCKERFILES:")
print(dockerfiles)

def convert_to_raw_usercontent(partial):

return f"https://raw.githubusercontent.com/{owner}/{repo_name}/{repo_ref}/{urllib.parse.quote(partial)}"


filtered_resp["hasExecutableNotebook"] = [convert_to_raw_usercontent(x) for x in notebooks]
filtered_resp["hasBuildFile"] = [convert_to_raw_usercontent(x) for x in dockerfiles]
filtered_resp["hasDocumentation"] = docs
if len(notebooks) > 0:
filtered_resp["hasExecutableNotebook"] = [convert_to_raw_usercontent(x, owner, repo_name, repo_ref) for x in notebooks]
if len(dockerfiles) > 0:
filtered_resp["hasBuildFile"] = [convert_to_raw_usercontent(x, owner, repo_name, repo_ref) for x in dockerfiles]
if len(docs) > 0:
filtered_resp["hasDocumentation"] = docs

## get releases
releases_list, date = rate_limit_get(repo_api_base_url + "/releases",
@@ -365,6 +366,9 @@ def convert_to_raw_usercontent(partial):
return text, filtered_resp


def convert_to_raw_usercontent(partial, owner, repo_name, repo_ref):
return f"https://raw.githubusercontent.com/{owner}/{repo_name}/{repo_ref}/{urllib.parse.quote(partial)}"

## Function takes readme text as input and divides it into excerpts
## Returns the extracted excerpts
def create_excerpts(string_list):
@@ -558,7 +562,7 @@ def merge(header_predictions, predictions, citations, dois, binder_links, long_t
print("Merge prediction using header information, classifier and bibtex and doi parsers")
if long_title:
predictions['long_title'] = {'excerpt': long_title, 'confidence': [1.0],
'technique': 'Regular expression'}
'technique': 'Regular expression'}
for i in range(len(citations)):
if 'citation' not in predictions.keys():
predictions['citation'] = []
@@ -575,7 +579,7 @@
for notebook in binder_links:
# The identifier is in position 1. Position 0 is the badge id, which we don't want to export
predictions['executable_example'].insert(0, {'excerpt': notebook[1], 'confidence': [1.0],
'technique': 'Regular expression'})
'technique': 'Regular expression'})
for headers in header_predictions:
if headers not in predictions.keys():
predictions[headers] = header_predictions[headers]
@@ -596,7 +600,10 @@ def format_output(git_data, repo_data):
repo_data['description'] = []
repo_data['description'].append({'excerpt': git_data[i], 'confidence': [1.0], 'technique': 'GitHub API'})
else:
repo_data[i] = {'excerpt': git_data[i], 'confidence': [1.0], 'technique': 'GitHub API'}
if i == 'hasExecutableNotebook' or i == 'hasBuildFile' or i == 'hasDocumentation':
repo_data[i] = {'excerpt': git_data[i], 'confidence': [1.0], 'technique': 'File Exploration'}
else:
repo_data[i] = {'excerpt': git_data[i], 'confidence': [1.0], 'technique': 'GitHub API'}

return repo_data

@@ -618,16 +625,15 @@ def save_json(git_data, repo_data, outfile):
repo_data = format_output(git_data, repo_data)
save_json_output(repo_data, outfile)

def save_codemeta_output(repo_data, outfile, pretty=False):

def save_codemeta_output(repo_data, outfile, pretty=False):
def data_path(path):
return DataGraph.resolve_path(repo_data, path)

def format_date(date_string):
date_object = date_parser.parse(date_string)
return date_object.strftime("%Y-%m-%d")


latest_release = None
releases = data_path(["releases", "excerpt"])

@@ -659,9 +665,9 @@ def average_confidence(x):
else:
return 0


descriptions = data_path(["description"])
descriptions.sort(key=lambda x: (average_confidence(x) + (1 if x["technique"] == "GitHub API" else 0)), reverse=True)
descriptions.sort(key=lambda x: (average_confidence(x) + (1 if x["technique"] == "GitHub API" else 0)),
reverse=True)
descriptions_text = [x["excerpt"] for x in descriptions]

codemeta_output = {
@@ -792,4 +798,4 @@ def run_cli(*,
out_file.write(data_graph.g.serialize(format=graph_format))

if codemeta_out is not None:
save_codemeta_output(repo_data, codemeta_out, pretty=pretty)
save_codemeta_output(repo_data, codemeta_out, pretty=pretty)
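
The two substantive changes above are easiest to see together: convert_to_raw_usercontent moves to module level and now takes owner, repo_name, and repo_ref as explicit parameters instead of closing over variables from the enclosing scope, and the hasExecutableNotebook, hasBuildFile, and hasDocumentation fields are now written only when the corresponding file lists are non-empty. A minimal standalone sketch of the new behavior (the owner/repo/ref values and file paths below are illustrative, not taken from the commit):

import urllib.parse

def convert_to_raw_usercontent(partial, owner, repo_name, repo_ref):
    # Build a raw.githubusercontent.com URL for a file in the repository;
    # quoting the relative path keeps spaces and special characters URL-safe.
    return f"https://raw.githubusercontent.com/{owner}/{repo_name}/{repo_ref}/{urllib.parse.quote(partial)}"

# Illustrative inputs only.
owner, repo_name, repo_ref = "example-org", "example-repo", "main"
notebooks = ["notebooks/demo notebook.ipynb"]
dockerfiles = []  # empty on purpose, to exercise the new guard

filtered_resp = {}
# As in the patched code: emit a field only when there is something to report,
# so empty lists no longer appear in the output.
if len(notebooks) > 0:
    filtered_resp["hasExecutableNotebook"] = [
        convert_to_raw_usercontent(x, owner, repo_name, repo_ref) for x in notebooks]
if len(dockerfiles) > 0:
    filtered_resp["hasBuildFile"] = [
        convert_to_raw_usercontent(x, owner, repo_name, repo_ref) for x in dockerfiles]

print(filtered_resp)
# {'hasExecutableNotebook': ['https://raw.githubusercontent.com/example-org/example-repo/main/notebooks/demo%20notebook.ipynb']}

The matching change in format_output then labels these three fields with the 'File Exploration' technique instead of 'GitHub API', so the provenance recorded in the output reflects that they come from scanning the repository contents rather than from the API response.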
