
Fix #462
dgarijo committed May 26, 2022
1 parent cb5b678 commit 33b2425
Showing 4 changed files with 161 additions and 108 deletions.
11 changes: 7 additions & 4 deletions src/somef/__main__.py
@@ -143,11 +143,14 @@ class URLParamType(click.types.StringParamType):
help="""JSON report with the missing metadata fields SOMEF was not able to find. The report will be placed in
$PATH_missing.json, where $PATH is -o, -c or -g."""
)
@click.option(
"--keep_tmp",
"-kt",
type=click.Path(),
help="""SOMEF will NOT delete the temporary folder where files are stored for analysis. Files will be stored at the
desired path"""
)
def describe(**kwargs):
from somef import cli
cli.run_cli(**kwargs)
click.secho("Success", fg="green")


#if __name__ == '__main__':
# version()
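
A minimal sketch of exercising the new flag through click's test runner. Only -kt/--keep_tmp and the describe command are confirmed by the hunk above; the somef.__main__ import path and the -r/-o flags are assumptions:

from click.testing import CliRunner
from somef.__main__ import describe  # assumed import path for the click command

runner = CliRunner()
# -kt/--keep_tmp points SOMEF at a folder that survives the run instead of
# a temporary directory that is deleted after analysis
result = runner.invoke(describe, [
    "-r", "https://github.com/dgarijo/Widoco",  # -r/--repo_url assumed
    "-o", "widoco.json",                        # -o/--output assumed
    "-kt", "./somef_tmp",
])
print(result.exit_code, result.output)  # expect 0 and the green "Success"
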
18 changes: 10 additions & 8 deletions src/somef/cli.py
@@ -620,7 +620,7 @@ def create_missing_fields_report(repo_data, out_path):


def cli_get_data(threshold, ignore_classifiers, repo_url=None, doc_src=None, local_repo=None,
ignore_github_metadata=False, readme_only=False):
ignore_github_metadata=False, readme_only=False, keep_tmp=None):
"""
Main function to get the data through the command line
Parameters
@@ -632,6 +632,7 @@ def cli_get_data(threshold, ignore_classifiers, repo_url=None, doc_src=None, loc
local_repo: flag to indicate that the repo is local
ignore_github_metadata: flag used to avoid doing extra requests to the GitHub API
readme_only: flag to indicate that only the readme should be analyzed
keep_tmp: path where to store temporary files when SOMEF is instructed to keep them
Returns
-------
@@ -650,9 +651,9 @@ def cli_get_data(threshold, ignore_classifiers, repo_url=None, doc_src=None, loc
header['Authorization'] = file_paths['Authorization']
header['accept'] = 'application/vnd.github.v3+json'
if repo_url is not None:
assert (doc_src is None)
# assert (doc_src is None)
try:
text, github_data = process_repository.load_github_repository_metadata(repo_url, header, ignore_github_metadata, readme_only)
text, github_data = process_repository.load_online_repository_metadata(repo_url, header, ignore_github_metadata, readme_only, keep_tmp)
if text == "":
print("Warning: README document does not exist in the repository")
except process_repository.GithubUrlError:
@@ -745,7 +746,8 @@ def run_cli(*,
graph_format="turtle",
codemeta_out=None,
pretty=False,
missing=False
missing=False,
keep_tmp=None
):
"""Function to run all the required components of the cli for a repository"""
# check if it is a valid url
@@ -772,18 +774,18 @@
for remove_url in remove_urls:
repo_set.remove(remove_url)
if len(repo_set) > 0:
repo_data = [cli_get_data(threshold, ignore_classifiers, repo_url=repo_url) for repo_url in repo_set]
repo_data = [cli_get_data(threshold, ignore_classifiers, repo_url=repo_url, keep_tmp=keep_tmp) for repo_url in repo_set]
else:
return None

else:
if repo_url:
repo_data = cli_get_data(threshold, ignore_classifiers, repo_url=repo_url,
ignore_github_metadata=ignore_github_metadata, readme_only=readme_only)
ignore_github_metadata=ignore_github_metadata, readme_only=readme_only, keep_tmp=keep_tmp)
elif local_repo:
repo_data = cli_get_data(threshold, ignore_classifiers, local_repo=local_repo)
repo_data = cli_get_data(threshold, ignore_classifiers, local_repo=local_repo, keep_tmp=keep_tmp)
else:
repo_data = cli_get_data(threshold, ignore_classifiers, doc_src=doc_src)
repo_data = cli_get_data(threshold, ignore_classifiers, doc_src=doc_src, keep_tmp=keep_tmp)

if output is not None:
save_json_output(repo_data, output, missing, pretty=pretty)
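
The same plumbing can be driven programmatically, since run_cli forwards keep_tmp into cli_get_data and from there into the repository loaders. A sketch, assuming the keyword names visible in the hunks above (threshold, ignore_classifiers, repo_url and output are taken from the calls shown; the values are illustrative):

from somef import cli

# keep_tmp=None (the default) preserves the old delete-after-analysis behavior
cli.run_cli(
    threshold=0.8,                                   # illustrative value
    ignore_classifiers=False,
    repo_url="https://github.com/dgarijo/Widoco",
    output="widoco.json",
    keep_tmp="./somef_tmp",                          # downloaded files persist here
)
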
210 changes: 118 additions & 92 deletions src/somef/process_repository.py
@@ -41,7 +41,7 @@ def rate_limit_get(*args, backoff_rate=2, initial_backoff=1, **kwargs):
return response, date


def load_gitlab_repository_metadata(repository_url, header, readme_only=False):
def load_gitlab_repository_metadata(repository_url, header, readme_only=False, keep_tmp=None):
"""
Function uses the repository_url provided to load required information from GitLab.
Information kept from the repository is written in keep_keys.
@@ -50,6 +50,7 @@ def load_gitlab_repository_metadata(repository_url, header, readme_only=False):
repository_url: URL of the GitLab repository to analyze
header: headers used for the API requests
readme_only: flag to indicate whether to process the full repo or just the readme
keep_tmp: path where to store temporary files when SOMEF is instructed to keep them
Returns
-------
@@ -128,7 +129,6 @@ def load_gitlab_repository_metadata(repository_url, header, readme_only=False):
if repo_download.status_code != 200:
print(f"Error: Archive request failed with HTTP {repo_download.status_code}")
repo_zip = repo_download.content
print(repo_zip)
text = repo_zip.decode('utf-8')
return text, {}

@@ -225,11 +225,6 @@ def get_path(obj, path):

del filtered_resp['languages_url']

# get default README
# repo_api_base_url https://api.github.com/dgarijo/Widoco/readme
# readme_info, date = rate_limit_get(repo_api_base_url + "/readme",
# headers=topics_headers,
# params=ref_param)
readme_info = {}
if 'message' in readme_info.keys():
print("README Error: " + readme_info['message'])
@@ -243,51 +238,77 @@ def get_path(obj, path):
text = get_readme_content(project_details['readme_url'])
filtered_resp['readmeUrl'] = project_details['readme_url']

# create a temporary directory
with tempfile.TemporaryDirectory() as temp_dir:

# download the repo at the selected branch with the link
# https://gitlab.com/unboundedsystems/adapt/-/archive/master/adapt-master.zip
repo_archive_url = f"https://gitlab.com/{owner}/{repo_name}/-/archive/{repo_ref}/{repo_name}-{repo_ref}.zip"
if len(path_components) == 4:
repo_archive_url = f"https://gitlab.com/{owner}/{repo_name}/-/archive/{repo_ref}/{path_components[3]}.zip"
print(f"Downloading {repo_archive_url}")
repo_download = requests.get(repo_archive_url)
repo_zip = repo_download.content

repo_zip_file = os.path.join(temp_dir, "repo.zip")
repo_extract_dir = os.path.join(temp_dir, "repo")

with open(repo_zip_file, "wb") as f:
f.write(repo_zip)

with zipfile.ZipFile(repo_zip_file, "r") as zip_ref:
zip_ref.extractall(repo_extract_dir)

repo_folders = os.listdir(repo_extract_dir)
assert (len(repo_folders) == 1)

repo_dir = os.path.join(repo_extract_dir, repo_folders[0])
if keep_tmp is not None:
os.makedirs(keep_tmp, exist_ok=True)
text, filtered_resp = download_gitlab_files(keep_tmp, owner, repo_name, repo_ref, filtered_resp,
path_components)
else:
# create a temporary directory (default behavior)
with tempfile.TemporaryDirectory() as temp_dir:
text, filtered_resp = download_gitlab_files(temp_dir, owner, repo_name, repo_ref, filtered_resp,
path_components)

text, filtered_resp = process_repository_files(repo_dir, filtered_resp, constants.RepositoryType.GITLAB,
owner, repo_name, repo_ref)
releases_list = {}
if isinstance(releases_list, dict) and 'message' in releases_list.keys():
print("Releases Error: " + releases_list['message'])
else:
filtered_resp['releases'] = [do_crosswalk(release, constants.release_crosswalk_table) for release in
releases_list]

print("Repository Information Successfully Loaded. \n")
print("Repository information successfully loaded. \n")
return text, filtered_resp
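
The branch added above boils down to one idiom: reuse a caller-supplied folder (and leave the files behind) or fall back to a self-cleaning temporary directory. A standalone sketch of the pattern, with do_download standing in for the download_gitlab_files/download_github_files helpers defined in this commit:

import os
import tempfile

def do_download(directory):
    # stand-in for the real download helpers in this commit
    return os.path.join(directory, "repo")

def fetch(keep_tmp=None):
    if keep_tmp is not None:
        # caller wants the files kept: ensure the folder exists and use it
        os.makedirs(keep_tmp, exist_ok=True)
        return do_download(keep_tmp)   # files persist at keep_tmp
    # default behavior: files vanish when the context manager exits
    with tempfile.TemporaryDirectory() as temp_dir:
        return do_download(temp_dir)
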


def load_github_repository_metadata(repository_url, header, ignore_github_metadata=False, readme_only=False):
def download_gitlab_files(directory, owner, repo_name, repo_ref, filtered_resp, path_components):
"""
Download all repository files from a GitLab repository
Parameters
----------
filtered_resp: the main response object we are building in somef
repo_ref: branch or tag reference of the repo
repo_name: name of the repo
owner: GitLab owner (user or group)
directory: directory where to extract all downloaded files
path_components: components of the GitLab repository URL path
Returns
-------
text and filtered response obtained from the repository
"""
Function uses the repository_url provided to load required information from Github.
repo_archive_url = f"https://gitlab.com/{owner}/{repo_name}/-/archive/{repo_ref}/{repo_name}-{repo_ref}.zip"
if len(path_components) == 4:
repo_archive_url = f"https://gitlab.com/{owner}/{repo_name}/-/archive/{repo_ref}/{path_components[3]}.zip"
print(f"Downloading {repo_archive_url}")
repo_download = requests.get(repo_archive_url)
repo_zip = repo_download.content

repo_zip_file = os.path.join(directory, "repo.zip")
repo_extract_dir = os.path.join(directory, "repo")

with open(repo_zip_file, "wb") as f:
f.write(repo_zip)

with zipfile.ZipFile(repo_zip_file, "r") as zip_ref:
zip_ref.extractall(repo_extract_dir)

repo_folders = os.listdir(repo_extract_dir)
assert (len(repo_folders) == 1)

repo_dir = os.path.join(repo_extract_dir, repo_folders[0])

return process_repository_files(repo_dir, filtered_resp, constants.RepositoryType.GITLAB,
owner, repo_name, repo_ref)
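
As a concrete check, the archive URL built here for the repository cited in the old inline comment (unboundedsystems/adapt at ref master) reproduces that comment's link:

owner, repo_name, repo_ref = "unboundedsystems", "adapt", "master"
url = f"https://gitlab.com/{owner}/{repo_name}/-/archive/{repo_ref}/{repo_name}-{repo_ref}.zip"
# -> https://gitlab.com/unboundedsystems/adapt/-/archive/master/adapt-master.zip
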


def load_online_repository_metadata(repository_url, header, ignore_github_metadata=False, readme_only=False, keep_tmp=None):
"""
Function uses the repository_url provided to load required information from GitHub or GitLab.
Information kept from the repository is written in keep_keys.
Parameters
----------
keep_tmp: path where to store temporary files when SOMEF is instructed to keep them
readme_only: flag to indicate that only the readme should be analyzed
ignore_github_metadata: flag used to avoid doing extra requests to the GitHub API
repository_url: URL of the repository to analyze
header: headers used for the API requests
@@ -296,10 +317,10 @@ def load_github_repository_metadata(repository_url, header, ignore_github_metada
Returns the readme text and required metadata
"""
if repository_url.rfind("gitlab.com") > 0:
return load_gitlab_repository_metadata(repository_url, header, readme_only)
return load_gitlab_repository_metadata(repository_url, header, readme_only, keep_tmp=keep_tmp)

print(f"Loading Repository {repository_url} Information....")
## load general response of the repository
# load general response of the repository
if repository_url[-1] == '/':
repository_url = repository_url[:-1]
url = urlparse(repository_url)
@@ -319,7 +340,6 @@ def load_github_repository_metadata(repository_url, header, ignore_github_metada
repo_api_base_url = f"https://api.github.com/repos/{owner}/{repo_name}"

repo_ref = None
ref_param = None

if len(path_components) >= 5:
if not path_components[3] == "tree":
@@ -370,7 +390,7 @@ def load_github_repository_metadata(repository_url, header, ignore_github_metada
text = repo_zip.decode('utf-8')
return text, {}

## get only the fields that we want
# get only the fields that we want
def do_crosswalk(data, crosswalk_table):
def get_path(obj, path):
if isinstance(path, list) or isinstance(path, tuple):
@@ -458,7 +478,7 @@ def get_path(obj, path):
if len(forks_info.keys()) > 0:
filtered_resp['forksCount'] = forks_info

## get languages
# get languages
if not ignore_github_metadata:
languages, date = rate_limit_get(filtered_resp['languages_url'], headers=header)
if "message" in languages:
@@ -468,56 +488,14 @@ def get_path(obj, path):

del filtered_resp['languages_url']

# get default README
# headers=topics_headers,
# readme_info, date = rate_limit_get(repo_api_base_url + "/readme",
# headers=topics_headers,
# params=ref_param)
# if 'message' in readme_info.keys():
# print("README Error: " + readme_info['message'])
# text = ""
# else:
# readme = base64.b64decode(readme_info['content']).decode("utf-8")
# text = readme
# filtered_resp['readmeUrl'] = readme_info['html_url']

# get full git repository
# todo: maybe it should be optional, as this could take some time?

text = ""
# create a temporary directory
with tempfile.TemporaryDirectory() as temp_dir:

# download the repo at the selected branch with the link
repo_archive_url = f"https://github.com/{owner}/{repo_name}/archive/{repo_ref}.zip"
print(f"Downloading {repo_archive_url}")
repo_download = requests.get(repo_archive_url)
if repo_download.status_code == 404:
print(f"Error: Archive request failed with HTTP {repo_download.status_code}")
repo_archive_url = f"https://github.com/{owner}/{repo_name}/archive/main.zip"
print(f"Trying to download {repo_archive_url}")
repo_download = requests.get(repo_archive_url)

if repo_download.status_code != 200:
sys.exit(f"Error: Archive request failed with HTTP {repo_download.status_code}")
repo_zip = repo_download.content

repo_zip_file = os.path.join(temp_dir, "repo.zip")
repo_extract_dir = os.path.join(temp_dir, "repo")

with open(repo_zip_file, "wb") as f:
f.write(repo_zip)

with zipfile.ZipFile(repo_zip_file, "r") as zip_ref:
zip_ref.extractall(repo_extract_dir)

repo_folders = os.listdir(repo_extract_dir)
assert (len(repo_folders) == 1)

repo_dir = os.path.join(repo_extract_dir, repo_folders[0])

text, filtered_resp = process_repository_files(repo_dir, filtered_resp, constants.RepositoryType.GITHUB,
owner, repo_name, repo_ref)
if keep_tmp is not None:
os.makedirs(keep_tmp, exist_ok=True)
text, filtered_resp = download_github_files(keep_tmp, owner, repo_name, repo_ref, filtered_resp)
else:
# create a temporary directory (default behavior)
with tempfile.TemporaryDirectory() as temp_dir:
text, filtered_resp = download_github_files(temp_dir, owner, repo_name, repo_ref, filtered_resp)

# get releases
if not ignore_github_metadata:
@@ -534,12 +512,60 @@ def get_path(obj, path):
return text, filtered_resp


def download_github_files(directory, owner, repo_name, repo_ref, filtered_resp):
"""
Download all repository files from a GitHub repository
Parameters
----------
filtered_resp: the main response object we are building in somef
repo_ref: branch or tag reference of the repo
repo_name: name of the repo
owner: GitHub owner
directory: directory where to extract all downloaded files
Returns
-------
text and filtered response obtained from the repository
"""
# download the repo at the selected branch with the link
repo_archive_url = f"https://github.com/{owner}/{repo_name}/archive/{repo_ref}.zip"
print(f"Downloading {repo_archive_url}")
repo_download = requests.get(repo_archive_url)
if repo_download.status_code == 404:
print(f"Error: Archive request failed with HTTP {repo_download.status_code}")
repo_archive_url = f"https://github.com/{owner}/{repo_name}/archive/main.zip"
print(f"Trying to download {repo_archive_url}")
repo_download = requests.get(repo_archive_url)

if repo_download.status_code != 200:
sys.exit(f"Error: Archive request failed with HTTP {repo_download.status_code}")
repo_zip = repo_download.content

repo_name_full = owner + "_" + repo_name
repo_zip_file = os.path.join(directory, repo_name_full + ".zip")
repo_extract_dir = os.path.join(directory, repo_name_full)

with open(repo_zip_file, "wb") as f:
f.write(repo_zip)

with zipfile.ZipFile(repo_zip_file, "r") as zip_ref:
zip_ref.extractall(repo_extract_dir)

repo_folders = os.listdir(repo_extract_dir)
# assert (len(repo_folders) == 1)

repo_dir = os.path.join(repo_extract_dir, repo_folders[0])

return process_repository_files(repo_dir, filtered_resp, constants.RepositoryType.GITHUB,
owner, repo_name, repo_ref)
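
The zip and extraction directory are now named owner_repo instead of a fixed "repo", so archives kept in keep_tmp from different owners or repositories do not collide (likely also why the single-folder assert above is commented out). A quick illustration of the resulting paths:

import os

owner, repo_name, directory = "dgarijo", "Widoco", "./somef_tmp"
repo_name_full = owner + "_" + repo_name
print(os.path.join(directory, repo_name_full + ".zip"))  # ./somef_tmp/dgarijo_Widoco.zip
print(os.path.join(directory, repo_name_full))           # ./somef_tmp/dgarijo_Widoco
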


def load_local_repository_metadata(local_repo):
"""Function to apply somef to a local repository (already downloaded)"""
filtered_resp = {}
repo_dir = os.path.abspath(local_repo)
text, filtered_resp = process_repository_files(repo_dir, filtered_resp, constants.RepositoryType.LOCAL)
print("Local Repository Information Successfully Loaded. \n")
print("Local repository information successfully loaded. \n")
return text, filtered_resp

