From 4595284498c5c7bba9de24341e40a288d68e18f9 Mon Sep 17 00:00:00 2001 From: Kinnaird McQuade Date: Sat, 13 Mar 2021 06:06:09 -0500 Subject: [PATCH] Removes Table output duplicates --- azure_guardrails/scrapers/compliance_data.py | 109 +++++++++---------- azure_guardrails/scrapers/standard.py | 24 ++-- 2 files changed, 67 insertions(+), 66 deletions(-) diff --git a/azure_guardrails/scrapers/compliance_data.py b/azure_guardrails/scrapers/compliance_data.py index 63bfc02..b0a592c 100644 --- a/azure_guardrails/scrapers/compliance_data.py +++ b/azure_guardrails/scrapers/compliance_data.py @@ -223,78 +223,77 @@ def _matching_metadata(self) -> dict: results[display_name] = benchmark_data return results - def markdown_table(self) -> str: - headers = ["Service", "Policy Definition"] - benchmark_names = ["Azure Security Benchmark", "CIS", "CCMC L3", "ISO 27001", "NIST SP 800-53 R4", - "NIST SP 800-171 R2", "HIPAA HITRUST 9.2", "New Zealand ISM"] - headers.extend(benchmark_names) - results = self.table_summary() - return tabulate(results, headers=headers, tablefmt="github") - def csv_table(self, path: str, verbosity: int): - headers = ["Service", "Policy Definition"] - benchmark_names = ["Azure Security Benchmark", "CIS", "CCMC L3", "ISO 27001", "NIST SP 800-53 R4", - "NIST SP 800-171 R2", "HIPAA HITRUST 9.2", "New Zealand ISM", "Policy Link"] - headers.extend(benchmark_names) - results = [headers] - results.extend(self.table_summary(hyperlink_format=False)) + headers = ["Service", "Policy Definition", "Azure Security Benchmark", "CIS", "CCMC L3", "ISO 27001", "NIST SP 800-53 R4", "NIST SP 800-171 R2", "HIPAA HITRUST 9.2", "New Zealand ISM", "Link"] + + # results = headers.copy() + results = self.table_summary(hyperlink_format=False) if os.path.exists(path): os.remove(path) with open(path, 'w', newline='') as csv_file: - writer = csv.writer(csv_file) + writer = csv.DictWriter(csv_file, fieldnames=headers) + writer.writeheader() for row in results: writer.writerow(row) - # print(f"CSV updated! Wrote {len(results)} rows. Path: {path}") if verbosity >= 1: utils.print_grey(f"Removing the previous file: {path}") + def markdown_table(self) -> str: + results = self.table_summary() + return tabulate(results, headers="keys", tablefmt="github") + def table_summary(self, hyperlink_format: bool = True) -> list: results = [] def get_benchmark_id(benchmark_name: str, this_policy_metadata: dict) -> str: - if this_policy_metadata.benchmarks.get(benchmark_name, None): - # if benchmark_name in this_policy_metadata["benchmarks"].keys(): - # this_policy_metadata.benchmarks['Azure Security Benchmark'].requirement_id - benchmark_id = this_policy_metadata.benchmarks[benchmark_name].requirement_id + if this_policy_metadata.get(benchmark_name, None): + benchmark_id = this_policy_metadata[benchmark_name][benchmark_name] + benchmark_id = benchmark_id.replace(f"{benchmark_name}: ", "") + benchmark_id = benchmark_id.replace(f"ID : ", "") else: benchmark_id = "" return benchmark_id - for policy_definition_name in self.matching_metadata: + # Loop through the matching metadata only, then look within the policy_compliance_data that holds the master details + for policy_definition_name, policy_definition_details in self.matching_metadata.items(): name = policy_definition_name.replace("[Preview]: ", "") - for policy in self.matching_metadata[policy_definition_name]: - service_name = self.policy_compliance_data.policy_definition_metadata[name][policy].service_name - github_link = self.policy_compliance_data.policy_definition_metadata[name][policy].github_link - if hyperlink_format: - policy_definition_string = f"[{policy_definition_name}]({github_link})" - else: - policy_definition_string = policy_definition_name - - policy_metadata = self.policy_compliance_data.policy_definition_metadata[name][policy] - azure_security_benchmark_id = get_benchmark_id("Azure Security Benchmark", policy_metadata) - cis_id = get_benchmark_id("CIS", policy_metadata) - ccmc_id = get_benchmark_id("CCMC L3", policy_metadata) - iso_id = get_benchmark_id("ISO 27001", policy_metadata) - nist_800_171_id = get_benchmark_id("NIST SP 800-171 R2", policy_metadata) - nist_800_53_id = get_benchmark_id("NIST SP 800-53 R4", policy_metadata) - hipaa_id = get_benchmark_id("HIPAA HITRUST 9.2", policy_metadata) - new_zealand_id = get_benchmark_id("NZISM Security Benchmark", policy_metadata) - result = [ - service_name, - policy_definition_string, - azure_security_benchmark_id, - cis_id, - ccmc_id, - iso_id, - nist_800_53_id, - nist_800_171_id, - hipaa_id, - new_zealand_id, - ] - # If hyperlink format is not specified, that means it is not markdown and we want to include the github link in a separate column - if not hyperlink_format: - result.append(github_link) - results.append(result) - results = sorted(results, key=itemgetter(0, 1, 2, 3, 4, 5, 6, 7, 8, 9)) + # for policy in self.matching_metadata[policy_definition_name]: + benchmarks = [] + github_link = "" + service_name = "" + for benchmark, benchmark_details in self.policy_compliance_data.policy_definition_metadata[name].items(): + benchmarks.append(benchmark) + service_name = benchmark_details.service_name + github_link = benchmark_details.github_link + if hyperlink_format: + policy_definition_string = f"[{policy_definition_name}]({github_link})" + else: + policy_definition_string = policy_definition_name + + azure_security_benchmark_id = get_benchmark_id("Azure Security Benchmark", policy_definition_details) + cis_id = get_benchmark_id("CIS", policy_definition_details) + ccmc_id = get_benchmark_id("CCMC L3", policy_definition_details) + iso_id = get_benchmark_id("ISO 27001", policy_definition_details) + nist_800_171_id = get_benchmark_id("NIST SP 800-171 R2", policy_definition_details) + nist_800_53_id = get_benchmark_id("NIST SP 800-53 R4", policy_definition_details) + hipaa_id = get_benchmark_id("HIPAA HITRUST 9.2", policy_definition_details) + new_zealand_id = get_benchmark_id("NZISM Security Benchmark", policy_definition_details) + + result = { + "Service": service_name, + "Policy Definition": policy_definition_string, + # "Name": policy_definition_name, + "Azure Security Benchmark": azure_security_benchmark_id, + "CIS": cis_id, + "CCMC L3": ccmc_id, + "ISO 27001": iso_id, + "NIST SP 800-171 R2": nist_800_171_id, + "NIST SP 800-53 R4": nist_800_53_id, + "HIPAA HITRUST 9.2": hipaa_id, + "New Zealand ISM": new_zealand_id, + "Link": github_link, + } + results.append(result) + results = sorted(results, key=itemgetter("Service", "Policy Definition")) return results diff --git a/azure_guardrails/scrapers/standard.py b/azure_guardrails/scrapers/standard.py index 8a7a45a..9fd8267 100644 --- a/azure_guardrails/scrapers/standard.py +++ b/azure_guardrails/scrapers/standard.py @@ -2,20 +2,22 @@ from azure_guardrails.shared.utils import chomp_keep_single_spaces +def get_requirement_id(input_text: str, replacement_string: str) -> str: + """Pass in table.previous_sibling.previous_sibling.text and get the Azure Benchmark ID""" + id_ownership_string = chomp_keep_single_spaces(input_text) + this_id = id_ownership_string + this_id = this_id.replace(f"ID : {replacement_string} ", "") + this_id = this_id.replace(f"ID : {replacement_string}", "") + this_id = this_id.replace(" Ownership : Customer", "") + this_id = this_id.replace(" Ownership : Shared", "") + return this_id + + def scrape_standard(html_file_path: str, benchmark_name: str, replacement_string: str): with open(html_file_path, "r") as f: soup = BeautifulSoup(f.read(), "html.parser") tables = soup.find_all("table") - def get_iso_id(input_text: str) -> str: - """Pass in table.previous_sibling.previous_sibling.text and get the Azure Benchmark ID""" - id_ownership_string = chomp_keep_single_spaces(input_text) - this_id = id_ownership_string - this_id = this_id.replace(f"ID : {replacement_string} ", "") - this_id = this_id.replace(" Ownership : Customer", "") - this_id = this_id.replace(" Ownership : Shared", "") - return this_id - def get_service_name(github_link: str) -> str: """Pass in the github link and get the name of the service based on folder name""" elements = github_link.split("/") @@ -27,8 +29,8 @@ def get_service_name(github_link: str) -> str: categories = [] for table in tables: table_identifier_sibling = table.previous_sibling.previous_sibling - # Azure Security Benchmark ID - requirement_id = get_iso_id(table_identifier_sibling.text) + # Get requirement ID + requirement_id = get_requirement_id(table_identifier_sibling.text, replacement_string) if replacement_string in table_identifier_sibling.text: # Requirement Name