Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sarah afiya/issue 21/improve api extraction #24

Merged
merged 8 commits into from
Sep 26, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 14 additions & 10 deletions documentation_quality_analysis/analyze_library/analyze.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,15 @@
python_match_examples


def debug_metrics(language, library_name, doc_url, gh_url):
def debug_metrics(language, library_name, doc_url, gh_url, depth):
# os.chdir(ROOT_DIR)

# repo_path = clone_repo(gh_url, True)
# print("Done cloning")

# get_functions_and_classes_from_src()

doc_pages: List[DocPage] = get_all_webpages(doc_url, 3)
doc_pages: List[DocPage] = get_all_webpages(doc_url, depth)

doc_api: List[Union[Signature, None]] = get_functions_and_classes_from_doc_api_ref(doc_pages)

Expand Down Expand Up @@ -111,24 +111,28 @@ def get_stats_api_per_example(matched_methods: List[MatchedCall], doc_code_examp


if __name__ == '__main__':
debug_metrics("python", "requests",
"https://requests.readthedocs.io/en/latest/api/",
"https://github.com/psf/requests.git")
# debug_metrics("python", "requests",
# "https://requests.readthedocs.io/en/latest/api/",
# "https://github.com/psf/requests.git", 0)

debug_metrics("python", "pandas",
"https://pandas.pydata.org/docs/reference/api/pandas.Series.__array__.html",
"https://github.com/pandas-dev/pandas", 0)

# debug_metrics("python", "pandas",
# "https://pandas.pydata.org/docs/index.html",
# "https://github.com/pandas-dev/pandas")
# "https://pandas.pydata.org/docs/reference/api/pandas.Series.plot.html",
# "https://github.com/pandas-dev/pandas", 0)

# debug_metrics("python", "GraphQL compiler",
# "https://graphql-compiler.readthedocs.io/",
# "https://github.com/kensho-technologies/graphql-compiler")
# "https://github.com/kensho-technologies/graphql-compiler", 3)

# debug_metrics("python", "collections",
# "https://docs.python.org/3/library/collections.html",
# "https://github.com/python/cpython/tree/3.11/Lib/collections")
# "https://github.com/python/cpython/tree/3.11/Lib/collections", 0)

# debug_metrics("python", "TensorFlow",
# "https://www.tensorflow.org/api_docs/python/tf/all_symbols",
# "https://github.com/tensorflow/docs.git")
# "https://github.com/tensorflow/docs.git", 1)


Original file line number Diff line number Diff line change
Expand Up @@ -127,10 +127,10 @@ def get_all_webpages(doc_home: str, max_depth: int) -> List[DocPage]:


def get_functions_and_classes_from_doc_api_ref(doc_pages: List[DocPage]) -> List[Signature]:
api_ref = ['api', 'reference']
api_ref_keywords = ['api', 'reference']
signatures: List[Signature] = []
for page in doc_pages:
if any(word in page.url for word in api_ref):
if any(word in page.url for word in api_ref_keywords):
signatures.extend(get_signatures_from_doc(page))

return signatures
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,31 +8,23 @@
from documentation_quality_analysis.analyze_library.models.method_signature import MethodSignature
from documentation_quality_analysis.analyze_library.models.parameter import Parameter

REQ_STATEMENT_TYPES = ['method', 'class', 'function', 'exception']
PARSE_GENERATED_HTML = "PARSE_GENERATED_HTML"
PARSE_ANY_DOC = "PARSE_ANY_DOC"


def get_signatures_from_doc(doc_page: DocPage) -> List[Signature]:
page_url = doc_page.url
soup = doc_page.content
signatures: List[Signature] = []

sections = soup.find_all("section")

for section in sections:
if len(section.find_all("section")) == 0 and len(sections) != 1:
signatures.extend(get_signatures_from_section(section))

if len(sections) <= 1:
signatures.extend(get_signatures_from_section(soup))
signatures.extend(get_signatures_from_section(soup, PARSE_GENERATED_HTML))

return signatures


def get_signatures_from_section(section) -> List[Signature]:
def get_signatures_from_section(section, parsing_method) -> List[Signature]:
dts = section.find_all("dt")
# method_signatures: List[MethodSignature] = []
# class_signatures: List[ClassConstructorSignature] = []

signatures: List[Signature] = []
class_name = None

for tag in dts:
desc = []
Expand All @@ -43,33 +35,65 @@ def get_signatures_from_section(section) -> List[Signature]:
description = "".join(desc).strip()

try:
if "class" in description:
class_element = re.findall(re.compile(r'class\s((?:.*)(?:\(.*\))?)'), description)
if len(class_element) > 0:
class_name = re.findall('(.*)\(', class_element[0])[0] if "(" in class_element[0] else class_element[0].strip()
class_signature: ClassConstructorSignature = _get_parsed_class_details(class_element[0])
signatures.append(class_signature)
elif "(" in description:
code = re.findall('[A-Za-z]*\s(.+\(.*\))', description)
if len(code) > 0:
method = _get_parsed_method_details(method=code[0], class_name=class_name)
signatures.append(method)
else:
method = _get_parsed_method_details(method=description, class_name=class_name)
signatures.append(method)
if parsing_method == PARSE_GENERATED_HTML:
_append_signature_from_generated_html_tag(description, signatures, tag)

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does it make sense to keep this "more generic" parsing mechanism available in the code? For example, we could have different parsing strategies that can be called as needed, and then just use the specific one that fits the format.

elif parsing_method == PARSE_ANY_DOC:
_append_signature_from_statement(description, signatures)

except AttributeError as e:
print(e)
print(description)

# signatures.append([description, page])
return signatures


def _get_parsed_method_details(method: str, class_name: Union[str, None]) -> MethodSignature:
# Parse a free-form documentation statement (`description`) and append the
# resulting signature object to `signatures`, which is mutated in place.
# Nothing is returned.
# NOTE(review): indentation was lost in this view, so the exact pairing of the
# `elif` / `else` branches below could not be confirmed - verify against the
# real file before relying on these comments.
def _append_signature_from_statement(description, signatures):
# Substring test: this is true for any description containing "class",
# which also includes words such as "classmethod" or "subclass".
if "class" in description:
# Capture everything that follows "class" plus one whitespace character.
class_element = re.findall(re.compile(r'class\s(.*)'), description)
if len(class_element) > 0:
# The parsed result is appended without a None check.
class_signature: ClassConstructorSignature = _get_parsed_class_details(class_element[0])
signatures.append(class_signature)
elif "(" in description:
# Capture a run of non-whitespace characters followed by a parenthesised
# argument list, e.g. the `pkg.func(a, b)` part of the statement.
# NOTE(review): this pattern is not a raw string although it contains
# backslash escapes; consider an r'' literal.
code = re.findall('([^\s]+\(.*\))', description)
if len(code) > 0:
method = _get_parsed_method_details(method=code[0])
signatures.append(method)
else:
# Fallback: hand the whole description to the method parser and keep the
# result only when it is truthy.
method = _get_parsed_method_details(method=description)
if method:
signatures.append(method)


def _append_signature_from_generated_html_tag(description, signatures, tag):
    """Append the signature described by a generated-HTML tag to ``signatures``.

    The tag's ``id`` attribute is treated as a dotted qualified name. Its last
    component is the member name and everything before it is passed on as the
    ``parent``. The last CSS class of the tag's parent element is the
    statement type and must be listed in ``REQ_STATEMENT_TYPES``.

    A last component starting with an upper-case letter is parsed as a class
    constructor. One starting with a lower-case letter or an underscore is
    parsed as a method/function. Anything else is ignored.

    Args:
        description: Text of the tag, e.g. ``"class Foo(a, b=1)"``.
        signatures: List that receives the parsed signature; mutated in place.
        tag: Parsed HTML element exposing ``attrs`` and ``parent``.

    Returns:
        None. Tags that lack the required attributes, or whose description
        cannot be parsed, are skipped silently instead of raising.
    """
    attrs = getattr(tag, 'attrs', None) or {}
    name = attrs.get('id')
    if not name:
        # No id (or an empty one): there is no qualified name to work from.
        return

    # The parent element may be missing or carry no class attribute; the
    # original indexing raised KeyError here, which the caller does not catch.
    parent_attrs = getattr(getattr(tag, 'parent', None), 'attrs', None) or {}
    css_classes = parent_attrs.get('class') or []
    if not css_classes or css_classes[-1] not in REQ_STATEMENT_TYPES:
        return

    # "pkg.mod.Name" -> parent "pkg.mod", last component "Name".
    parent, _, last_component = name.rpartition('.')
    if not last_component:
        # An id ending in "." has no member name to classify.
        return

    first_char = last_component[0]
    if first_char.isupper():
        # Strip an optional leading keyword and keep the rest of the statement.
        class_element = re.findall(r'(?:class\s|final class\s|exception\s)?(.+)', description)
        if not class_element:
            return
        class_signature = _get_parsed_class_details(
            class_expr=class_element[0],
            parent=parent)
        # The parser may return None; keep such entries out of the result,
        # mirroring the guard already used for methods below.
        if class_signature:
            signatures.append(class_signature)

    elif first_char.islower() or first_char == "_":
        method = re.findall(r'(?:method\s|classmethod\s)?(.+)', description)
        if not method:
            return
        method_signature = _get_parsed_method_details(
            method=method[0],
            parent=parent)
        if method_signature:
            signatures.append(method_signature)


def _get_parsed_method_details(method: str, parent: Union[str, None] = None) -> Union[MethodSignature, None]:
method_name = ""
parent = class_name
parent = parent
req_args = []
opt_args = []
try:
Expand All @@ -84,9 +108,9 @@ def _get_parsed_method_details(method: str, class_name: Union[str, None]) -> Met
return None


def _get_parsed_class_details(class_expr: str) -> Union[ClassConstructorSignature, None]:
def _get_parsed_class_details(class_expr: str, parent: Union[str, None] = "") -> Union[ClassConstructorSignature, None]:
class_name = ""
parent = ""
parent = parent
req_args = []
opt_args = []
raw_text = class_expr
Expand Down Expand Up @@ -115,12 +139,11 @@ def get_ast_parsed_expression(expression, method_name, opt_args, parent, req_arg
method_name = func.attr
elif 'id' in func._fields:
method_name = func.id
if 'value' in func._fields:
if not parent and 'value' in func._fields:
if 'id' in func.value._fields:
parent = ".".join([parent, func.value.id]) if parent else func.value.id
parent = func.value.id
elif 'value' in func.value._fields and 'attr' in func.value._fields:
parent = ".".join([parent, func.value.value.id, func.value.attr]) if parent else ".".join(
[func.value.value.id, func.value.attr])
parent = ".".join([func.value.value.id, func.value.attr])

if 'args' in value._fields:
args = value.args
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,62 +38,67 @@ def python_match_examples(repo_name: str,
matched_call = _get_matched_function(call, doc_apis)
# Check if the function exists in our dictionary
if not matched_call:
function_split = call.split(".")
statement_parts = call.split(".")

if len(function_split) > 1:
if function_split[0] in var_declarations:
actual_function = '.'.join([var_declarations[function_split[0]], function_split[1]])
if len(statement_parts) > 1:
declared_variable = statement_parts[0]
if declared_variable in var_declarations:
statement_parts[0] = var_declarations[declared_variable]
actual_function = '.'.join(statement_parts)
matched_func = _get_matched_function(actual_function, doc_apis)
if matched_func:
matched_apis.append(
MatchedCall(called_signature=matched_func, raw_example=ex, original_call=call,
MatchedCall(called_signature=matched_func,
raw_example=ex,
original_call=call,
url=ex.url))
else:
# If not then maybe it does if we remove the first prefix
# e.g., nltk.nltk.get -> nltk.get
first_term_removed_function = '.'.join(function_split[1:])
matched_call = _get_matched_function(first_term_removed_function, functions)
first_term_removed_function = '.'.join(statement_parts[1:])
matched_call = _get_matched_function(call=first_term_removed_function,
functions=doc_apis,
no_partial_match=True)
if matched_call:
# method_calls.add((ex[1], call))
matched_apis.append(
MatchedCall(called_signature=matched_call, raw_example=ex, original_call=call,
MatchedCall(called_signature=matched_call,
raw_example=ex,
original_call=call,
url=ex.url))
# else:
# if function_split[0] in var_declarations:
# actual_function = '.'.join([var_declarations[function_split[0]], function_split[1]])
# matched_func = _get_matched_function(actual_function, doc_apis)
# if matched_func:
# # method_calls.add((ex[1], call))
# matched_apis.append(
# MatchedCall(called_signature=matched_call, raw_example=ex, original_call=call,
# url=ex.url))

elif matched_call:
# method_calls.add((ex[1], call))
matched_apis.append(
MatchedCall(called_signature=matched_call, raw_example=ex, original_call=call, url=ex.url))

return matched_apis


def _get_matched_function(call: str, functions: List[Signature]) -> Union[Signature, None]:
def _get_matched_function(call: str, functions: List[Signature], no_partial_match=False) -> Union[Signature, None]:
for func in functions:
if func is None:
continue

if call == func.fully_qualified_name:
return func
else:

elif not no_partial_match:
parents = func.parent.split(".") if func.parent else []
if len(parents) > 1:
partial_qualified_name = ".".join([parents[-1], func.name])
partial_terms = parents[1:]
partial_terms.extend([func.name])
partial_qualified_name = ".".join(partial_terms)
if call == partial_qualified_name:
return func

if type(func) == ClassConstructorSignature:
if call == func.name:
return func

if len(call.split('.')) == 1:
same_named_methods = [x for x in functions if x.name == call]
if len(same_named_methods) == 1:
return same_named_methods[0]

return None


Expand Down
Loading