
Fix #462
dgarijo committed May 26, 2022
1 parent cb5b678 commit 33b2425
Showing 4 changed files with 161 additions and 108 deletions.
11 changes: 7 additions & 4 deletions src/somef/__main__.py
@@ -143,11 +143,14 @@ class URLParamType(click.types.StringParamType):
help="""JSON report with the missing metadata fields SOMEF was not able to find. The report will be placed in
$PATH_missing.json, where $PATH is -o, -c or -g."""
)
@click.option(
"--keep_tmp",
"-kt",
type=click.Path(),
help="""SOMEF will NOT delete the temporary folder where files are stored for analysis. Files will be stored at the
desired path"""
)
def describe(**kwargs):
from somef import cli
cli.run_cli(**kwargs)
click.secho("Success", fg="green")


#if __name__ == '__main__':
# version()
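
A minimal sketch of exercising the new flag through click's test runner. Only -kt/--keep_tmp and the describe command are confirmed by the hunk above; the somef.__main__ import path and the -r/-o flags are assumptions:

from click.testing import CliRunner
from somef.__main__ import describe  # assumed import path for the click command

runner = CliRunner()
# -kt/--keep_tmp points SOMEF at a folder that survives the run instead of
# a temporary directory that is deleted after analysis
result = runner.invoke(describe, [
    "-r", "https://github.com/dgarijo/Widoco",  # -r/--repo_url assumed
    "-o", "widoco.json",                        # -o/--output assumed
    "-kt", "./somef_tmp",
])
print(result.exit_code, result.output)  # expect 0 and the green "Success"
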
18 changes: 10 additions & 8 deletions src/somef/cli.py
@@ -620,7 +620,7 @@ def create_missing_fields_report(repo_data, out_path):


def cli_get_data(threshold, ignore_classifiers, repo_url=None, doc_src=None, local_repo=None,
ignore_github_metadata=False, readme_only=False):
ignore_github_metadata=False, readme_only=False, keep_tmp=None):
"""
Main function to get the data through the command line
Parameters
@@ -632,6 +632,7 @@ def cli_get_data(threshold, ignore_classifiers, repo_url=None, doc_src=None, loc
local_repo: flag to indicate that the repo is local
ignore_github_metadata: flag used to avoid doing extra requests to the GitHub API
readme_only: flag to indicate that only the readme should be analyzed
keep_tmp: path where to store temporary files when SOMEF is instructed to keep them
Returns
-------
@@ -650,9 +651,9 @@ def cli_get_data(threshold, ignore_classifiers, repo_url=None, doc_src=None, loc
header['Authorization'] = file_paths['Authorization']
header['accept'] = 'application/vnd.github.v3+json'
if repo_url is not None:
assert (doc_src is None)
# assert (doc_src is None)
try:
text, github_data = process_repository.load_github_repository_metadata(repo_url, header, ignore_github_metadata, readme_only)
text, github_data = process_repository.load_online_repository_metadata(repo_url, header, ignore_github_metadata, readme_only, keep_tmp)
if text == "":
print("Warning: README document does not exist in the repository")
except process_repository.GithubUrlError:
@@ -745,7 +746,8 @@ def run_cli(*,
graph_format="turtle",
codemeta_out=None,
pretty=False,
missing=False
missing=False,
keep_tmp=None
):
"""Function to run all the required components of the cli for a repository"""
# check if it is a valid url
@@ -772,18 +774,18 @@
for remove_url in remove_urls:
repo_set.remove(remove_url)
if len(repo_set) > 0:
repo_data = [cli_get_data(threshold, ignore_classifiers, repo_url=repo_url) for repo_url in repo_set]
repo_data = [cli_get_data(threshold, ignore_classifiers, repo_url=repo_url, keep_tmp=keep_tmp) for repo_url in repo_set]
else:
return None

else:
if repo_url:
repo_data = cli_get_data(threshold, ignore_classifiers, repo_url=repo_url,
ignore_github_metadata=ignore_github_metadata, readme_only=readme_only)
ignore_github_metadata=ignore_github_metadata, readme_only=readme_only, keep_tmp=keep_tmp)
elif local_repo:
repo_data = cli_get_data(threshold, ignore_classifiers, local_repo=local_repo)
repo_data = cli_get_data(threshold, ignore_classifiers, local_repo=local_repo, keep_tmp=keep_tmp)
else:
repo_data = cli_get_data(threshold, ignore_classifiers, doc_src=doc_src)
repo_data = cli_get_data(threshold, ignore_classifiers, doc_src=doc_src, keep_tmp=keep_tmp)

if output is not None:
save_json_output(repo_data, output, missing, pretty=pretty)
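
The same plumbing can be driven programmatically, since run_cli forwards keep_tmp into cli_get_data and from there into the repository loaders. A sketch, assuming the keyword names visible in the hunks above (threshold, ignore_classifiers, repo_url and output are taken from the calls shown; the values are illustrative):

from somef import cli

# keep_tmp=None (the default) preserves the old delete-after-analysis behavior
cli.run_cli(
    threshold=0.8,                                   # illustrative value
    ignore_classifiers=False,
    repo_url="https://github.com/dgarijo/Widoco",
    output="widoco.json",
    keep_tmp="./somef_tmp",                          # downloaded files persist here
)
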
210 changes: 118 additions & 92 deletions src/somef/process_repository.py
@@ -41,7 +41,7 @@ def rate_limit_get(*args, backoff_rate=2, initial_backoff=1, **kwargs):
return response, date


def load_gitlab_repository_metadata(repository_url, header, readme_only=False):
def load_gitlab_repository_metadata(repository_url, header, readme_only=False, keep_tmp=None):
"""
Function uses the repository_url provided to load required information from GitLab.
Information kept from the repository is written in keep_keys.
@@ -50,6 +50,7 @@ def load_gitlab_repository_metadata(repository_url, header, readme_only=False):
repository_url: URL of the GitLab repository to analyze
header: headers used for the API requests
readme_only: flag to indicate whether to process the full repo or just the readme
keep_tmp: path where to store temporary files when SOMEF is instructed to keep them
Returns
-------
@@ -128,7 +129,6 @@ def load_gitlab_repository_metadata(repository_url, header, readme_only=False):
if repo_download.status_code != 200:
print(f"Error: Archive request failed with HTTP {repo_download.status_code}")
repo_zip = repo_download.content
print(repo_zip)
text = repo_zip.decode('utf-8')
return text, {}

@@ -225,11 +225,6 @@ def get_path(obj, path):

del filtered_resp['languages_url']

# get default README
# repo_api_base_url https://api.github.com/dgarijo/Widoco/readme
# readme_info, date = rate_limit_get(repo_api_base_url + "/readme",
# headers=topics_headers,
# params=ref_param)
readme_info = {}
if 'message' in readme_info.keys():
print("README Error: " + readme_info['message'])
@@ -243,51 +238,77 @@ def get_path(obj, path):
text = get_readme_content(project_details['readme_url'])
filtered_resp['readmeUrl'] = project_details['readme_url']

# create a temporary directory
with tempfile.TemporaryDirectory() as temp_dir:

# download the repo at the selected branch with the link
# https://gitlab.com/unboundedsystems/adapt/-/archive/master/adapt-master.zip
repo_archive_url = f"https://gitlab.com/{owner}/{repo_name}/-/archive/{repo_ref}/{repo_name}-{repo_ref}.zip"
if len(path_components) == 4:
repo_archive_url = f"https://gitlab.com/{owner}/{repo_name}/-/archive/{repo_ref}/{path_components[3]}.zip"
print(f"Downloading {repo_archive_url}")
repo_download = requests.get(repo_archive_url)
repo_zip = repo_download.content

repo_zip_file = os.path.join(temp_dir, "repo.zip")
repo_extract_dir = os.path.join(temp_dir, "repo")

with open(repo_zip_file, "wb") as f:
f.write(repo_zip)

with zipfile.ZipFile(repo_zip_file, "r") as zip_ref:
zip_ref.extractall(repo_extract_dir)

repo_folders = os.listdir(repo_extract_dir)
assert (len(repo_folders) == 1)

repo_dir = os.path.join(repo_extract_dir, repo_folders[0])
if keep_tmp is not None:
os.makedirs(keep_tmp, exist_ok=True)
text, filtered_resp = download_gitlab_files(keep_tmp, owner, repo_name, repo_ref, filtered_resp,
path_components)
else:
# create a temporary directory (default behavior)
with tempfile.TemporaryDirectory() as temp_dir:
text, filtered_resp = download_gitlab_files(temp_dir, owner, repo_name, repo_ref, filtered_resp,
path_components)

text, filtered_resp = process_repository_files(repo_dir, filtered_resp, constants.RepositoryType.GITLAB,
owner, repo_name, repo_ref)
releases_list = {}
if isinstance(releases_list, dict) and 'message' in releases_list.keys():
print("Releases Error: " + releases_list['message'])
else:
filtered_resp['releases'] = [do_crosswalk(release, constants.release_crosswalk_table) for release in
releases_list]

print("Repository Information Successfully Loaded. \n")
print("Repository information successfully loaded. \n")
return text, filtered_resp
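
The branch added above boils down to one idiom: reuse a caller-supplied folder (and leave the files behind) or fall back to a self-cleaning temporary directory. A standalone sketch of the pattern, with do_download standing in for the download_gitlab_files/download_github_files helpers defined in this commit:

import os
import tempfile

def do_download(directory):
    # stand-in for the real download helpers in this commit
    return os.path.join(directory, "repo")

def fetch(keep_tmp=None):
    if keep_tmp is not None:
        # caller wants the files kept: ensure the folder exists and use it
        os.makedirs(keep_tmp, exist_ok=True)
        return do_download(keep_tmp)   # files persist at keep_tmp
    # default behavior: files vanish when the context manager exits
    with tempfile.TemporaryDirectory() as temp_dir:
        return do_download(temp_dir)
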


def load_github_repository_metadata(repository_url, header, ignore_github_metadata=False, readme_only=False):
def download_gitlab_files(directory, owner, repo_name, repo_ref, filtered_resp, path_components):
"""
Download all repository files from a GitLab repository
Parameters
----------
filtered_resp: the main response object we are building in somef
repo_ref: branch or tag reference of the repo
repo_name: name of the repo
owner: GitLab owner (user or group)
directory: directory where to extract all downloaded files
path_components: components of the GitLab repository URL path
Returns
-------
text and filtered response obtained from the repository
"""
Function uses the repository_url provided to load required information from Github.
repo_archive_url = f"https://gitlab.com/{owner}/{repo_name}/-/archive/{repo_ref}/{repo_name}-{repo_ref}.zip"
if len(path_components) == 4:
repo_archive_url = f"https://gitlab.com/{owner}/{repo_name}/-/archive/{repo_ref}/{path_components[3]}.zip"
print(f"Downloading {repo_archive_url}")
repo_download = requests.get(repo_archive_url)
repo_zip = repo_download.content

repo_zip_file = os.path.join(directory, "repo.zip")
repo_extract_dir = os.path.join(directory, "repo")

with open(repo_zip_file, "wb") as f:
f.write(repo_zip)

with zipfile.ZipFile(repo_zip_file, "r") as zip_ref:
zip_ref.extractall(repo_extract_dir)

repo_folders = os.listdir(repo_extract_dir)
assert (len(repo_folders) == 1)

repo_dir = os.path.join(repo_extract_dir, repo_folders[0])

return process_repository_files(repo_dir, filtered_resp, constants.RepositoryType.GITLAB,
owner, repo_name, repo_ref)
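
As a concrete check, the archive URL built here for the repository cited in the old inline comment (unboundedsystems/adapt at ref master) reproduces that comment's link:

owner, repo_name, repo_ref = "unboundedsystems", "adapt", "master"
url = f"https://gitlab.com/{owner}/{repo_name}/-/archive/{repo_ref}/{repo_name}-{repo_ref}.zip"
# -> https://gitlab.com/unboundedsystems/adapt/-/archive/master/adapt-master.zip
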


def load_online_repository_metadata(repository_url, header, ignore_github_metadata=False, readme_only=False, keep_tmp=None):
"""
Function uses the repository_url provided to load required information from GitHub or GitLab.
Information kept from the repository is written in keep_keys.
Parameters
----------
keep_tmp: path where to store temporary files when SOMEF is instructed to keep them
readme_only: flag to indicate that only the readme should be analyzed
ignore_github_metadata: flag used to avoid doing extra requests to the GitHub API
repository_url: URL of the repository to analyze
header: headers used for the API requests
@@ -296,10 +317,10 @@ def load_github_repository_metadata(repository_url, header, ignore_github_metada
Returns the readme text and required metadata
"""
if repository_url.rfind("gitlab.com") > 0:
return load_gitlab_repository_metadata(repository_url, header, readme_only)
return load_gitlab_repository_metadata(repository_url, header, readme_only, keep_tmp=keep_tmp)

print(f"Loading Repository {repository_url} Information....")
## load general response of the repository
# load general response of the repository
if repository_url[-1] == '/':
repository_url = repository_url[:-1]
url = urlparse(repository_url)
@@ -319,7 +340,6 @@ def load_github_repository_metadata(repository_url, header, ignore_github_metada
repo_api_base_url = f"https://api.github.com/repos/{owner}/{repo_name}"

repo_ref = None
ref_param = None

if len(path_components) >= 5:
if not path_components[3] == "tree":
@@ -370,7 +390,7 @@ def load_github_repository_metadata(repository_url, header, ignore_github_metada
text = repo_zip.decode('utf-8')
return text, {}

## get only the fields that we want
# get only the fields that we want
def do_crosswalk(data, crosswalk_table):
def get_path(obj, path):
if isinstance(path, list) or isinstance(path, tuple):
@@ -458,7 +478,7 @@ def get_path(obj, path):
if len(forks_info.keys()) > 0:
filtered_resp['forksCount'] = forks_info

## get languages
# get languages
if not ignore_github_metadata:
languages, date = rate_limit_get(filtered_resp['languages_url'], headers=header)
if "message" in languages:
@@ -468,56 +488,14 @@ def get_path(obj, path):

del filtered_resp['languages_url']

# get default README
# headers=topics_headers,
# readme_info, date = rate_limit_get(repo_api_base_url + "/readme",
# headers=topics_headers,
# params=ref_param)
# if 'message' in readme_info.keys():
# print("README Error: " + readme_info['message'])
# text = ""
# else:
# readme = base64.b64decode(readme_info['content']).decode("utf-8")
# text = readme
# filtered_resp['readmeUrl'] = readme_info['html_url']

# get full git repository
# todo: maybe it should be optional, as this could take some time?

text = ""
# create a temporary directory
with tempfile.TemporaryDirectory() as temp_dir:

# download the repo at the selected branch with the link
repo_archive_url = f"https://github.com/{owner}/{repo_name}/archive/{repo_ref}.zip"
print(f"Downloading {repo_archive_url}")
repo_download = requests.get(repo_archive_url)
if repo_download.status_code == 404:
print(f"Error: Archive request failed with HTTP {repo_download.status_code}")
repo_archive_url = f"https://github.com/{owner}/{repo_name}/archive/main.zip"
print(f"Trying to download {repo_archive_url}")
repo_download = requests.get(repo_archive_url)

if repo_download.status_code != 200:
sys.exit(f"Error: Archive request failed with HTTP {repo_download.status_code}")
repo_zip = repo_download.content

repo_zip_file = os.path.join(temp_dir, "repo.zip")
repo_extract_dir = os.path.join(temp_dir, "repo")

with open(repo_zip_file, "wb") as f:
f.write(repo_zip)

with zipfile.ZipFile(repo_zip_file, "r") as zip_ref:
zip_ref.extractall(repo_extract_dir)

repo_folders = os.listdir(repo_extract_dir)
assert (len(repo_folders) == 1)

repo_dir = os.path.join(repo_extract_dir, repo_folders[0])

text, filtered_resp = process_repository_files(repo_dir, filtered_resp, constants.RepositoryType.GITHUB,
owner, repo_name, repo_ref)
if keep_tmp is not None:
os.makedirs(keep_tmp, exist_ok=True)
text, filtered_resp = download_github_files(keep_tmp, owner, repo_name, repo_ref, filtered_resp)
else:
# create a temporary directory (default behavior)
with tempfile.TemporaryDirectory() as temp_dir:
text, filtered_resp = download_github_files(temp_dir, owner, repo_name, repo_ref, filtered_resp)

# get releases
if not ignore_github_metadata:
@@ -534,12 +512,60 @@ def get_path(obj, path):
return text, filtered_resp


def download_github_files(directory, owner, repo_name, repo_ref, filtered_resp):
"""
Download all repository files from a GitHub repository
Parameters
----------
filtered_resp: the main response object we are building in somef
repo_ref: branch or tag reference of the repo
repo_name: name of the repo
owner: GitHub owner
directory: directory where to extract all downloaded files
Returns
-------
text and filtered response obtained from the repository
"""
# download the repo at the selected branch with the link
repo_archive_url = f"https://github.com/{owner}/{repo_name}/archive/{repo_ref}.zip"
print(f"Downloading {repo_archive_url}")
repo_download = requests.get(repo_archive_url)
if repo_download.status_code == 404:
print(f"Error: Archive request failed with HTTP {repo_download.status_code}")
repo_archive_url = f"https://github.com/{owner}/{repo_name}/archive/main.zip"
print(f"Trying to download {repo_archive_url}")
repo_download = requests.get(repo_archive_url)

if repo_download.status_code != 200:
sys.exit(f"Error: Archive request failed with HTTP {repo_download.status_code}")
repo_zip = repo_download.content

repo_name_full = owner + "_" + repo_name
repo_zip_file = os.path.join(directory, repo_name_full + ".zip")
repo_extract_dir = os.path.join(directory, repo_name_full)

with open(repo_zip_file, "wb") as f:
f.write(repo_zip)

with zipfile.ZipFile(repo_zip_file, "r") as zip_ref:
zip_ref.extractall(repo_extract_dir)

repo_folders = os.listdir(repo_extract_dir)
# assert (len(repo_folders) == 1)

repo_dir = os.path.join(repo_extract_dir, repo_folders[0])

return process_repository_files(repo_dir, filtered_resp, constants.RepositoryType.GITHUB,
owner, repo_name, repo_ref)
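
The zip and extraction directory are now named owner_repo instead of a fixed "repo", so archives kept in keep_tmp from different owners or repositories do not collide (likely also why the single-folder assert above is commented out). A quick illustration of the resulting paths:

import os

owner, repo_name, directory = "dgarijo", "Widoco", "./somef_tmp"
repo_name_full = owner + "_" + repo_name
print(os.path.join(directory, repo_name_full + ".zip"))  # ./somef_tmp/dgarijo_Widoco.zip
print(os.path.join(directory, repo_name_full))           # ./somef_tmp/dgarijo_Widoco
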


def load_local_repository_metadata(local_repo):
"""Function to apply somef to a local repository (already downloaded)"""
filtered_resp = {}
repo_dir = os.path.abspath(local_repo)
text, filtered_resp = process_repository_files(repo_dir, filtered_resp, constants.RepositoryType.LOCAL)
print("Local Repository Information Successfully Loaded. \n")
print("Local repository information successfully loaded. \n")
return text, filtered_resp

