Merge pull request #435 from KnowledgeCaptureAndDiscovery/dev Fix #431 Fix #427 Dev
KnowledgeCaptureAndDiscovery · Apr 20, 2022 · 450d427 · 450d427
2 parents b02f6c5 + 2083a88
commit 450d427
Show file tree

Hide file tree

Showing 5 changed files with 59 additions and 38 deletions.
diff --git a/src/somef/cli.py b/src/somef/cli.py
@@ -698,7 +698,7 @@ def cli_get_data(threshold, ignore_classifiers, repo_url=None, doc_src=None, loc
         arxiv_links = regular_expressions.extract_arxiv_links(unfiltered_text)
         wiki_links = regular_expressions.extract_wiki_links(unfiltered_text, repo_url)
         # logo = extract_logo(unfiltered_text, repo_url)
-        logo, images = regular_expressions.extract_images(unfiltered_text, repo_url)
+        logo, images = regular_expressions.extract_images(unfiltered_text, repo_url, local_repo)
         support_channels = regular_expressions.extract_support_channels(unfiltered_text)
         package_distribution = regular_expressions.extract_package_distributions(unfiltered_text)
     else:

diff --git a/src/somef/parser_somef.py b/src/somef/parser_somef.py
@@ -438,20 +438,19 @@ def minor_than(second, first):
 
 
 def is_header(header):
-    if header.startswith('<h1') or header.startswith('<h2') or header.startswith('<h3') \
-            or header.startswith('<h4') or header.startswith('<h5') or header.startswith('<h6'):
+    if (header.startswith('<h1') or header.startswith('<h2') or header.startswith('<h3') \
+            or header.startswith('<h4') or header.startswith('<h5') or header.startswith('<h6')) \
+            and header.find('</h') > 0:
         return True
     else:
         return False
 
 
 def get_tag_content(header):
-    try:
-        init = header.index(">")
-        end = header.index("</h")
-        return replace_html_tags(header[init+1:end])
-    except:
-        return ""
+    init = header.index(">")
+    end = header.index("</h")
+    return replace_html_tags(header[init+1:end])
+
 
 
 def replace_html_tags(text):

diff --git a/src/somef/regular_expressions.py b/src/somef/regular_expressions.py
@@ -1,3 +1,4 @@
+import os
 import re
 import markdown
 import requests
@@ -203,7 +204,7 @@ def extract_logo(unfiltered_text, repo_url):
     return logo
 
 
-def extract_images(unfiltered_text, repo_url):
+def extract_images(unfiltered_text, repo_url, local_repo):
     """Extracts logos from a given text"""
     logo = ""
     has_logo = False
@@ -223,20 +224,20 @@ def extract_images(unfiltered_text, repo_url):
         if not has_logo and repo:
             start = img.rindex("/")
             if img.find(repo_name, start) > 0:
-                logo = rename_github_image(img, repo_url)
+                logo = rename_github_image(img, repo_url, local_repo)
                 has_logo = True
             elif get_alt_text_md(html_text, img) == repo_name or get_alt_text_md(html_text, img).upper() == "LOGO":
-                logo = rename_github_image(img, repo_url)
+                logo = rename_github_image(img, repo_url, local_repo)
                 has_logo = True
             else:
                 start = img.rindex("/")
                 if img.upper().find("LOGO", start) > 0:
-                    logo = rename_github_image(img, repo_url)
+                    logo = rename_github_image(img, repo_url, local_repo)
                     has_logo = True
                 else:
-                    images.append(rename_github_image(img, repo_url))
+                    images.append(rename_github_image(img, repo_url, local_repo))
         else:
-            images.append(rename_github_image(img, repo_url))
+            images.append(rename_github_image(img, repo_url, local_repo))
     for index_img in result:
         init = html_text.find("src=\"", index_img)
         end = html_text.find("\"", init + 5)
@@ -245,21 +246,21 @@ def extract_images(unfiltered_text, repo_url):
             start = img.rindex("/")
             image_name = img[start:]
             if image_name.find(repo_name) > 0 or image_name.upper().find("LOGO") > 0:
-                logo = rename_github_image(img, repo_url)
+                logo = rename_github_image(img, repo_url, local_repo)
                 has_logo = True
             elif get_alt_text_img(html_text, index_img) == repo_name or get_alt_text_img(html_text,
                                                                                          index_img).upper() == "LOGO":
-                logo = rename_github_image(img, repo_url)
+                logo = rename_github_image(img, repo_url, local_repo)
                 has_logo = True
             else:
-                images.append(rename_github_image(img, repo_url))
+                images.append(rename_github_image(img, repo_url, local_repo))
         else:
             start = img.rindex("/")
             if img.upper().find("LOGO", start) > 0:
-                logo = rename_github_image(img, repo_url)
+                logo = rename_github_image(img, repo_url, local_repo)
                 has_logo = True
             else:
-                images.append(rename_github_image(img, repo_url))
+                images.append(rename_github_image(img, repo_url, local_repo))
 
     return logo, images
 
@@ -445,17 +446,20 @@ def extract_binder_links(readme_text) -> object:
     return list(dict.fromkeys(binder_links))
 
 
-def rename_github_image(img, repo_url):
+def rename_github_image(img, repo_url, local_repo):
     """Renames GitHub image links so they can be accessed raw"""
-    if not img.startswith("http") and repo_url is not None and repo_url != "":
-        if repo_url.find("/tree/") > 0:
-            repo_url = repo_url.replace("/tree/", "/")
+    if not img.startswith("http") and ((repo_url is not None and repo_url != "") or (local_repo is not None and local_repo != "")):
+        if repo_url is not None and repo_url != "":
+            if repo_url.find("/tree/") > 0:
+                repo_url = repo_url.replace("/tree/", "/")
+            else:
+                repo_url = repo_url + "/master/"
+            repo_url = repo_url.replace("github.com", "raw.githubusercontent.com")
+            if not repo_url.endswith("/"):
+                repo_url = repo_url + "/"
+            img = repo_url + img
         else:
-            repo_url = repo_url + "/master/"
-        repo_url = repo_url.replace("github.com", "raw.githubusercontent.com")
-        if not repo_url.endswith("/"):
-            repo_url = repo_url + "/"
-        img = repo_url + img
+            img = local_repo + os.path.sep + img
     return img
 
 

diff --git a/src/somef/test/test_parser_somef.py b/src/somef/test/test_parser_somef.py
@@ -3,7 +3,7 @@
 from pathlib import Path
 
 from ..parser_somef import extract_headers, extract_headers_with_tags, extract_content_per_header, \
-    extract_bash, extract_blocks_excerpts, extract_text_excerpts_header, extract_headers_parents
+    extract_bash, extract_blocks_excerpts, extract_text_excerpts_header, extract_headers_parents, is_header
 
 # Test data for tests
 test_data_path = str(Path(__file__).parent / "test_data") + os.path.sep
@@ -62,3 +62,13 @@ def test_extract_headers_parents(self):
             text = data_file.read()
             parents = extract_headers_parents(text)
             assert len(parents) == 15
+
+    def test_issue_431(self):
+        # Changed method is_header to avoid false positive
+        # It will return true only when there is an opening and closing header tag in string input
+        first_header = '''<h1 align="center">\n
+        '''
+        second_header = '''<h1>WIzard for DOCumenting Ontologies (WIDOCO)</h1>'''
+        print(is_header(first_header))
+        print(is_header(second_header))
+        assert (not is_header(first_header) and is_header(second_header))
diff --git a/src/somef/test/test_regular_expressions.py b/src/somef/test/test_regular_expressions.py
@@ -5,6 +5,7 @@
 from .. import regular_expressions
 
 test_data_path = str(Path(__file__).parent / "test_data") + os.path.sep
+test_data_repositories = str(Path(__file__).parent / "test_data" / "repositories") + os.path.sep
 
 
 class TestCli(unittest.TestCase):
@@ -104,31 +105,31 @@ def test_issue_291(self):
         repo_url = "https://github.com/dgarijo/Widoco"
         with open(test_data_path + "README-widoco.md", "r") as data_file:
             test_text = data_file.read()
-            logo, images = regular_expressions.extract_images(test_text, repo_url)
+            logo, images = regular_expressions.extract_images(test_text, repo_url, None)
             assert (not logo == "")
 
     def test_issue_291_2(self):
         """Test designed to check if logos are detected"""
         repo_url = "https://github.com/usc-isi-i2/kgtk/"
         with open(test_data_path + "test_logo_uscisii2.txt", "r") as data_file:
             test_text = data_file.read()
-            logo, images = regular_expressions.extract_images(test_text, repo_url)
+            logo, images = regular_expressions.extract_images(test_text, repo_url, None)
             assert (not logo == "")
 
     def test_issue_291_3(self):
         """Test designed to check if logos are detected"""
         repo_url = "https://github.com/tensorflow/tensorflow/"
         with open(test_data_path + "test_logo_tensorflow.txt", "r") as data_file:
             test_text = data_file.read()
-            logo, images = regular_expressions.extract_images(test_text, repo_url)
+            logo, images = regular_expressions.extract_images(test_text, repo_url, None)
             assert (not logo == "")
 
     def test_issue_images(self):
         """Test designed to check if images are detected"""
         repo_url = "https://github.com/usc-isi-i2/kgtk/"
         with open(test_data_path + "test_issue_images.txt", "r") as data_file:
             test_text = data_file.read()
-            logo, images = regular_expressions.extract_images(test_text, repo_url)
+            logo, images = regular_expressions.extract_images(test_text, repo_url, None)
             assert len(images) > 0
 
     def test_issue_181(self):
@@ -149,28 +150,28 @@ def test_logo(self):
         """Test designed to check if logos are detected"""
         with open(test_data_path + "test_logo.txt", "r") as data_file:
             test_text = data_file.read()
-            logo, images = regular_expressions.extract_images(test_text, "https://github.com/oeg-upm/Chowlk")
+            logo, images = regular_expressions.extract_images(test_text, "https://github.com/oeg-upm/Chowlk", None)
             assert (not logo == "")
 
     def test_logo2(self):
         """Test designed to check if logos are detected"""
         with open(test_data_path + "test_logo2.txt", "r") as data_file:
             test_text = data_file.read()
-            logo, images = regular_expressions.extract_images(test_text, "https://github.com/pytorch/pytorch")
+            logo, images = regular_expressions.extract_images(test_text, "https://github.com/pytorch/pytorch", None)
             assert (not logo == "")
 
     def test_images(self):
         """Test designed to check if images are detected"""
         with open(test_data_path + "test_images.txt", "r") as data_file:
             test_text = data_file.read()
-            logo, images = regular_expressions.extract_images(test_text, "https://github.com/pytorch/pytorch")
+            logo, images = regular_expressions.extract_images(test_text, "https://github.com/pytorch/pytorch", None)
             assert (len(images) > 0 and not logo == "")
 
     def test_issue_320(self):
         """Test designed to check if logos are detected"""
         with open(test_data_path + "README-urllib3.md", "r") as data_file:
             test_text = data_file.read()
-            logo, images = regular_expressions.extract_images(test_text, "https://github.com/urllib3/urllib3")
+            logo, images = regular_expressions.extract_images(test_text, "https://github.com/urllib3/urllib3", None)
             assert (not logo == "")
 
     def test_issue_337(self):
@@ -186,3 +187,10 @@ def test_issue_337(self):
         """
         text = regular_expressions.remove_links_images(text)
         assert text.find("[www.mapshaper.org](http://www.mapshaper.org)") == -1
+
+    def test_issue_427(self):
+        with open(test_data_repositories + "Widoco" + os.path.sep + "README.md", "r") as data_file:
+            test_text = data_file.read()
+            logo, images = regular_expressions.extract_images(test_text, None, test_data_repositories + "Widoco")
+            assert (logo.find('test_data') > 0)
+