Skip to content

Commit

Permalink
Merge pull request #60 from OHDSI/59-feat-add-script-for-automaticall…
Browse files Browse the repository at this point in the history
…y-generating-dbt-configs
  • Loading branch information
lawrenceadams authored Sep 28, 2024
2 parents 8c10e19 + 986e5ff commit 5561586
Showing 1 changed file with 217 additions and 0 deletions.
217 changes: 217 additions & 0 deletions scripts/python/generate_dbt_yaml.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,217 @@
# A script to generate dbt YAML files from the OMOP CDM documentation
#
# Requires `BeautifulSoup4` and `ruamel.yaml` to be installed
# Get the OMOP CDM documentation with e.g.:
# `wget https://raw.githubusercontent.com/OHDSI/CommonDataModel/refs/heads/main/docs/cdm54.html`

import argparse
from dataclasses import dataclass
from pathlib import Path

from ruamel.yaml import YAML
from bs4 import BeautifulSoup


@dataclass
class omop_documentation_container:
    """
    One parsed row of a CDM documentation table.

    The fields appear in the same order as the nine cells that `row_handler` reads
    from a row, so the class can also be built positionally.
    """

    # -- Free-text cells, stored after whitespace clean-up in `row_handler` --
    cdm_field: str
    user_guide: str
    etl_conventions: str
    datatype: str

    # -- Flag cells; `row_handler` converts them with `sentinel_to_bool` --
    required: bool
    primary_key: bool
    foreign_key: bool

    # -- Foreign key details; `omop_docs_to_dbt_config` treats an empty domain
    # -- as "no domain constraint"
    foreign_key_table: str
    foreign_key_domain: str


def table_handler(table) -> list[omop_documentation_container]:
    """
    Parse every row of a documentation table.

    `table` is any element exposing `find_all` (`main` passes the `<tbody>` of a
    CDM table). Each `<tr>` found is handed to `row_handler`, and the resulting
    objects are returned in document order.
    """
    parsed_rows = []

    for tr in table.find_all("tr"):
        parsed_rows.append(row_handler(tr))

    return parsed_rows


def row_handler(rows) -> omop_documentation_container:
    """
    Convert one table row of the CDM docs into a documentation object.

    Despite its plural name (kept so existing callers keep working), `rows` is a
    single `<tr>` element. Its first nine `<td>` cells are read positionally; any
    further cells are ignored.

    Returns:
        An `omop_documentation_container` with cleaned text fields and the
        `required` / `primary_key` / `foreign_key` cells converted to booleans.

    Raises:
        IndexError: if the row has fewer than nine `<td>` cells.
    """
    cells = rows.find_all("td")

    # Position of each documentation column within a row.
    field_names = (
        "cdm_field",
        "user_guide",
        "etl_conventions",
        "datatype",
        "required",
        "primary_key",
        "foreign_key",
        "foreign_key_table",
        "foreign_key_domain",
    )

    # Index explicitly (rather than zip) so a short row still fails loudly.
    raw_values = {name: cells[index].text for index, name in enumerate(field_names)}

    # Collapse newlines and whitespace runs to single spaces and trim the ends.
    # Simply deleting "\n" would glue together words that the HTML wrapped onto
    # separate lines (e.g. "different\nunique" -> "differentunique").
    parsed = {name: " ".join(value.split()) for name, value in raw_values.items()}

    # Handle booleans expressed as text
    for flag in ("primary_key", "required", "foreign_key"):
        parsed[flag] = sentinel_to_bool(parsed[flag])

    return omop_documentation_container(**parsed)


def sentinel_to_bool(text) -> bool:
    """
    Translate the docs' textual flag into a boolean.

    Only the exact string "Yes" counts as True; anything else (including "No",
    an empty string or a differently-cased "yes") is False.
    """
    return text == "Yes"


def extract_table_description(table_handle) -> str:
    """
    Return the text that follows a table's "Table Description" heading.

    Looks up the `<p>` whose string is exactly "Table Description" inside
    `table_handle`, takes the node two siblings after it, and returns that node's
    text with every newline replaced by a single space.
    """
    heading = table_handle.find("p", string="Table Description")

    # Two hops: presumably the first sibling is the whitespace between the tags and
    # the second is the element holding the description - verify against the HTML.
    description_node = heading.next_sibling.next_sibling

    return " ".join(description_node.text.split("\n"))


def omop_docs_to_dbt_config(obj: omop_documentation_container) -> dict:
    """
    Build the dbt column config for one documented CDM column.

    The result always carries `name`, `description` and `data_type`. A `tests`
    list is added only when at least one of these rules fires:

    - required column      -> "not_null"
    - primary key column   -> "unique"
    - foreign key column   -> a `relationships` test, or a
      `dbt_utils.relationships_where` test when the docs name a domain
    """
    checks: list = []

    if obj.required:
        checks.append("not_null")

    if obj.primary_key:
        checks.append("unique")

    if obj.foreign_key:
        referenced_table = obj.foreign_key_table.lower()

        # Both flavours of the relationship test point at the same target column.
        relation = {
            "to": f"ref('{referenced_table}')",
            "field": f"{referenced_table}_id",
        }

        if obj.foreign_key_domain == "":
            # No domain given: a plain relationship test is enough
            checks.append({"relationships": relation})
        else:
            # Domain given: compare only non-zero values, and only against target
            # rows whose domain_id matches
            relation["from_condition"] = f"{obj.cdm_field} <> 0"
            relation["to_condition"] = f"domain_id = '{obj.foreign_key_domain}'"
            checks.append({"dbt_utils.relationships_where": relation})

    column_config = {
        "name": obj.cdm_field,
        "description": obj.user_guide,
        "data_type": obj.datatype,
    }

    if checks:
        column_config["tests"] = checks

    return column_config


def extract_table_names(soup_obj: BeautifulSoup) -> list[str]:
    """
    Dynamically extract table names from the OMOP CDM documentation.

    Every `<div>` whose class attribute is "section level3 tabset tabset-pills"
    contributes the text of its `<h3>` heading. The `cohort` and
    `cohort_definition` tables are then dropped from the result.

    Returns:
        The remaining table names, in document order.
    """
    ignored_tables = ("cohort", "cohort_definition")

    table_names = [
        div.find("h3").text
        for div in soup_obj.find_all(
            "div", attrs={"class": "section level3 tabset tabset-pills"}
        )
    ]

    print(" [Note] Ignoring `cohort` and `cohort_definition` tables from documentation")

    # Filter instead of `list.remove`: `remove` raises ValueError when a name is
    # absent, which would abort the whole run for documentation that does not
    # contain these tables.
    return [name for name in table_names if name not in ignored_tables]


def main(
    cdm_docs_path: Path,
    output_dir: Path,
) -> None:
    """
    Generate one dbt YAML file per table from the OMOP CDM documentation.

    Args:
        cdm_docs_path: Path to the CDM documentation HTML file.
        output_dir: Directory that receives one `<table>.yml` file per table. The
            files are opened for writing directly, so the directory must already
            exist.
    """
    # Read as UTF-8 explicitly so parsing does not depend on the platform's
    # default encoding.
    with open(cdm_docs_path, encoding="utf-8") as file_handle:
        file = file_handle.read()

    soup = BeautifulSoup(file, features="html.parser")

    tables = extract_table_names(soup)

    print(f" Found {len(tables)} tables in the OMOP CDM documentation")

    # The emitter settings are the same for every table, so configure it once.
    yaml = YAML()
    yaml.indent(mapping=2, sequence=4, offset=2)
    yaml.width = 100

    for table in tables:
        # For each table generate the desired dbt yaml
        # Get desired div with table (the div id is expected to equal the table name)
        table_handle = soup.find("div", attrs={"id": table})

        tbody_handle = table_handle.find("table").find("tbody")
        parsed_table = table_handler(tbody_handle)
        table_description = extract_table_description(table_handle)

        table_dict = {
            "models": [
                {
                    "name": table,
                    "description": table_description,
                    "columns": [omop_docs_to_dbt_config(obj) for obj in parsed_table],
                }
            ]
        }

        # Use a context manager so each output file is flushed and closed before
        # moving on to the next table.
        output_path = Path(output_dir) / f"{table}.yml"
        with open(output_path, "w", encoding="utf-8") as output_file:
            yaml.dump(table_dict, output_file)

    print(f" Exported to `{output_dir}`")
    print(" Done!")


# == Handle arguments ==
# Only parse the command line and run the generator when executed as a script, so
# that importing this module (e.g. to reuse its helpers) has no side effects.
if __name__ == "__main__":
    # Get cdm54.html from the OMOP CDM documentation, using args
    parser = argparse.ArgumentParser(
        description="Generate dbt YAML files from the OMOP CDM documentation. For example: python generate_dbt_yaml.py cdm54.html ./output"
    )
    parser.add_argument(
        "cdm_html", type=str, help="Path to the OMOP CDM documentation HTML"
    )
    parser.add_argument("output_dir", type=str, help="Path to the output directory")
    args = parser.parse_args()

    cdm_docs_path = Path(args.cdm_html)
    output_dir = Path(args.output_dir)

    # `is_file` / `is_dir` rather than `exists`: a directory passed as the HTML
    # file, or a regular file passed as the output directory, is rejected here
    # with a usage error instead of failing later inside `main`.
    if not cdm_docs_path.is_file():
        parser.error(f"File {cdm_docs_path} does not exist")

    if not output_dir.is_dir():
        parser.error(f"Directory {output_dir} does not exist")

    main(cdm_docs_path, output_dir)

0 comments on commit 5561586

Please sign in to comment.