Skip to content

Commit

Permalink
Merge pull request #435 from KnowledgeCaptureAndDiscovery/dev Fix #431
Browse files Browse the repository at this point in the history
…Fix #427

Dev
  • Loading branch information
dgarijo authored Apr 20, 2022
2 parents b02f6c5 + 2083a88 commit 450d427
Show file tree
Hide file tree
Showing 5 changed files with 59 additions and 38 deletions.
2 changes: 1 addition & 1 deletion src/somef/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -698,7 +698,7 @@ def cli_get_data(threshold, ignore_classifiers, repo_url=None, doc_src=None, loc
arxiv_links = regular_expressions.extract_arxiv_links(unfiltered_text)
wiki_links = regular_expressions.extract_wiki_links(unfiltered_text, repo_url)
# logo = extract_logo(unfiltered_text, repo_url)
logo, images = regular_expressions.extract_images(unfiltered_text, repo_url)
logo, images = regular_expressions.extract_images(unfiltered_text, repo_url, local_repo)
support_channels = regular_expressions.extract_support_channels(unfiltered_text)
package_distribution = regular_expressions.extract_package_distributions(unfiltered_text)
else:
Expand Down
15 changes: 7 additions & 8 deletions src/somef/parser_somef.py
Original file line number Diff line number Diff line change
Expand Up @@ -438,20 +438,19 @@ def minor_than(second, first):


def is_header(header):
    """Tell whether a string holds a complete HTML heading element.

    Args:
        header: Text to inspect.

    Returns:
        True only when the text starts with an opening ``<h1`` .. ``<h6``
        tag and also contains a closing ``</h`` marker; False otherwise.
    """
    # An opening tag on its own (e.g. '<h1 align="center">' followed by a
    # line break) is rejected: without the closing marker there is no
    # heading text that could be extracted from the string.
    opening_tags = ('<h1', '<h2', '<h3', '<h4', '<h5', '<h6')
    return header.startswith(opening_tags) and header.find('</h') > 0


def get_tag_content(header):
    """Extract the text enclosed by an HTML heading tag.

    The slice between the first ``>`` and the first ``</h`` of *header* is
    passed through ``replace_html_tags`` and returned.

    Args:
        header: String expected to contain an opening tag and a ``</h``
            closing marker.

    Returns:
        The processed heading text, or an empty string when either marker
        is missing from *header*.
    """
    try:
        init = header.index(">")
        end = header.index("</h")
    except ValueError:
        # str.index raises ValueError when a marker is absent; treat that as
        # "no content" instead of hiding every possible error with a bare
        # except or letting malformed input abort the caller.
        return ""
    return replace_html_tags(header[init + 1:end])



def replace_html_tags(text):
Expand Down
44 changes: 24 additions & 20 deletions src/somef/regular_expressions.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
import re
import markdown
import requests
Expand Down Expand Up @@ -203,7 +204,7 @@ def extract_logo(unfiltered_text, repo_url):
return logo


def extract_images(unfiltered_text, repo_url):
def extract_images(unfiltered_text, repo_url, local_repo):
"""Extracts logos from a given text"""
logo = ""
has_logo = False
Expand All @@ -223,20 +224,20 @@ def extract_images(unfiltered_text, repo_url):
if not has_logo and repo:
start = img.rindex("/")
if img.find(repo_name, start) > 0:
logo = rename_github_image(img, repo_url)
logo = rename_github_image(img, repo_url, local_repo)
has_logo = True
elif get_alt_text_md(html_text, img) == repo_name or get_alt_text_md(html_text, img).upper() == "LOGO":
logo = rename_github_image(img, repo_url)
logo = rename_github_image(img, repo_url, local_repo)
has_logo = True
else:
start = img.rindex("/")
if img.upper().find("LOGO", start) > 0:
logo = rename_github_image(img, repo_url)
logo = rename_github_image(img, repo_url, local_repo)
has_logo = True
else:
images.append(rename_github_image(img, repo_url))
images.append(rename_github_image(img, repo_url, local_repo))
else:
images.append(rename_github_image(img, repo_url))
images.append(rename_github_image(img, repo_url, local_repo))
for index_img in result:
init = html_text.find("src=\"", index_img)
end = html_text.find("\"", init + 5)
Expand All @@ -245,21 +246,21 @@ def extract_images(unfiltered_text, repo_url):
start = img.rindex("/")
image_name = img[start:]
if image_name.find(repo_name) > 0 or image_name.upper().find("LOGO") > 0:
logo = rename_github_image(img, repo_url)
logo = rename_github_image(img, repo_url, local_repo)
has_logo = True
elif get_alt_text_img(html_text, index_img) == repo_name or get_alt_text_img(html_text,
index_img).upper() == "LOGO":
logo = rename_github_image(img, repo_url)
logo = rename_github_image(img, repo_url, local_repo)
has_logo = True
else:
images.append(rename_github_image(img, repo_url))
images.append(rename_github_image(img, repo_url, local_repo))
else:
start = img.rindex("/")
if img.upper().find("LOGO", start) > 0:
logo = rename_github_image(img, repo_url)
logo = rename_github_image(img, repo_url, local_repo)
has_logo = True
else:
images.append(rename_github_image(img, repo_url))
images.append(rename_github_image(img, repo_url, local_repo))

return logo, images

Expand Down Expand Up @@ -445,17 +446,20 @@ def extract_binder_links(readme_text) -> object:
return list(dict.fromkeys(binder_links))


def rename_github_image(img, repo_url, local_repo=None):
    """Rewrite a relative image link so that it points at a reachable file.

    Args:
        img: Image reference found in the README. References that already
            start with ``http`` are returned untouched.
        repo_url: URL of the GitHub repository, or None/"" when unknown.
        local_repo: Path of a local copy of the repository, used only when
            no repository URL is given. Defaults to None so that callers
            passing just ``(img, repo_url)`` keep working.

    Returns:
        The raw.githubusercontent.com URL of the image when *repo_url* is
        set, otherwise *img* prefixed with *local_repo* and the OS path
        separator, otherwise *img* unchanged.
    """
    if img.startswith("http"):
        return img

    if repo_url:
        if "/tree/" in repo_url:
            # The URL already names a branch: drop the "/tree/" segment.
            base = repo_url.replace("/tree/", "/")
        else:
            # No branch in the URL: fall back to "master". Trailing slashes
            # are stripped first so "…/repo/" does not become "…/repo//master/".
            base = repo_url.rstrip("/") + "/master/"
        base = base.replace("github.com", "raw.githubusercontent.com")
        if not base.endswith("/"):
            base += "/"
        return base + img

    if local_repo:
        return local_repo + os.path.sep + img

    return img


Expand Down
12 changes: 11 additions & 1 deletion src/somef/test/test_parser_somef.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from pathlib import Path

from ..parser_somef import extract_headers, extract_headers_with_tags, extract_content_per_header, \
extract_bash, extract_blocks_excerpts, extract_text_excerpts_header, extract_headers_parents
extract_bash, extract_blocks_excerpts, extract_text_excerpts_header, extract_headers_parents, is_header

# Test data for tests
test_data_path = str(Path(__file__).parent / "test_data") + os.path.sep
Expand Down Expand Up @@ -62,3 +62,13 @@ def test_extract_headers_parents(self):
text = data_file.read()
parents = extract_headers_parents(text)
assert len(parents) == 15

def test_issue_431(self):
    """Check that is_header requires both an opening and a closing tag.

    Issue 431: a lone opening heading tag used to be reported as a header.
    """
    # Opening tag only, followed by line breaks: must not count as a header.
    first_header = '<h1 align="center">\n\n'
    # Opening and closing tag in the same string: a real header.
    second_header = '<h1>WIzard for DOCumenting Ontologies (WIDOCO)</h1>'
    # Separate assertions so a failure points at the offending case.
    assert not is_header(first_header)
    assert is_header(second_header)
24 changes: 16 additions & 8 deletions src/somef/test/test_regular_expressions.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from .. import regular_expressions

test_data_path = str(Path(__file__).parent / "test_data") + os.path.sep
test_data_repositories = str(Path(__file__).parent / "test_data" / "repositories") + os.path.sep


class TestCli(unittest.TestCase):
Expand Down Expand Up @@ -104,31 +105,31 @@ def test_issue_291(self):
repo_url = "https://github.com/dgarijo/Widoco"
with open(test_data_path + "README-widoco.md", "r") as data_file:
test_text = data_file.read()
logo, images = regular_expressions.extract_images(test_text, repo_url)
logo, images = regular_expressions.extract_images(test_text, repo_url, None)
assert (not logo == "")

def test_issue_291_2(self):
"""Test designed to check if logos are detected"""
repo_url = "https://github.com/usc-isi-i2/kgtk/"
with open(test_data_path + "test_logo_uscisii2.txt", "r") as data_file:
test_text = data_file.read()
logo, images = regular_expressions.extract_images(test_text, repo_url)
logo, images = regular_expressions.extract_images(test_text, repo_url, None)
assert (not logo == "")

def test_issue_291_3(self):
"""Test designed to check if logos are detected"""
repo_url = "https://github.com/tensorflow/tensorflow/"
with open(test_data_path + "test_logo_tensorflow.txt", "r") as data_file:
test_text = data_file.read()
logo, images = regular_expressions.extract_images(test_text, repo_url)
logo, images = regular_expressions.extract_images(test_text, repo_url, None)
assert (not logo == "")

def test_issue_images(self):
"""Test designed to check if images are detected"""
repo_url = "https://github.com/usc-isi-i2/kgtk/"
with open(test_data_path + "test_issue_images.txt", "r") as data_file:
test_text = data_file.read()
logo, images = regular_expressions.extract_images(test_text, repo_url)
logo, images = regular_expressions.extract_images(test_text, repo_url, None)
assert len(images) > 0

def test_issue_181(self):
Expand All @@ -149,28 +150,28 @@ def test_logo(self):
"""Test designed to check if logos are detected"""
with open(test_data_path + "test_logo.txt", "r") as data_file:
test_text = data_file.read()
logo, images = regular_expressions.extract_images(test_text, "https://github.com/oeg-upm/Chowlk")
logo, images = regular_expressions.extract_images(test_text, "https://github.com/oeg-upm/Chowlk", None)
assert (not logo == "")

def test_logo2(self):
"""Test designed to check if logos are detected"""
with open(test_data_path + "test_logo2.txt", "r") as data_file:
test_text = data_file.read()
logo, images = regular_expressions.extract_images(test_text, "https://github.com/pytorch/pytorch")
logo, images = regular_expressions.extract_images(test_text, "https://github.com/pytorch/pytorch", None)
assert (not logo == "")

def test_images(self):
"""Test designed to check if images are detected"""
with open(test_data_path + "test_images.txt", "r") as data_file:
test_text = data_file.read()
logo, images = regular_expressions.extract_images(test_text, "https://github.com/pytorch/pytorch")
logo, images = regular_expressions.extract_images(test_text, "https://github.com/pytorch/pytorch", None)
assert (len(images) > 0 and not logo == "")

def test_issue_320(self):
"""Test designed to check if logos are detected"""
with open(test_data_path + "README-urllib3.md", "r") as data_file:
test_text = data_file.read()
logo, images = regular_expressions.extract_images(test_text, "https://github.com/urllib3/urllib3")
logo, images = regular_expressions.extract_images(test_text, "https://github.com/urllib3/urllib3", None)
assert (not logo == "")

def test_issue_337(self):
Expand All @@ -186,3 +187,10 @@ def test_issue_337(self):
"""
text = regular_expressions.remove_links_images(text)
assert text.find("[www.mapshaper.org](http://www.mapshaper.org)") == -1

def test_issue_427(self):
    """Test designed to check that a logo found without a repository URL
    is reported relative to the local repository folder (issue 427)"""
    local_repo = test_data_repositories + "Widoco"
    # Explicit encoding so the README is decoded the same way on every platform.
    with open(local_repo + os.path.sep + "README.md", "r", encoding="utf-8") as data_file:
        test_text = data_file.read()
    logo, images = regular_expressions.extract_images(test_text, None, local_repo)
    # The local test-data folder must be part of the returned logo path.
    assert logo.find('test_data') > 0

0 comments on commit 450d427

Please sign in to comment.