Merge pull request #60 from OHDSI/59-feat-add-script-for-automaticall…

…y-generating-dbt-configs
OHDSI · Sep 28, 2024 · 5561586 · 5561586
2 parents 8c10e19 + 986e5ff
commit 5561586
Showing 1 changed file with 217 additions and 0 deletions.
diff --git a/scripts/python/generate_dbt_yaml.py b/scripts/python/generate_dbt_yaml.py
@@ -0,0 +1,217 @@
+# A script to generate dbt YAML files from the OMOP CDM documentation
+#
+# Requires `BeautifulSoup4` and `ruamel.yaml` to be installed
+# Get the OMOP CDM documentation with e.g.:
+#   `wget https://raw.githubusercontent.com/OHDSI/CommonDataModel/refs/heads/main/docs/cdm54.html`
+
+import argparse
+from dataclasses import dataclass
+from pathlib import Path
+
+from ruamel.yaml import YAML
+from bs4 import BeautifulSoup
+
+
+@dataclass
+class omop_documentation_container:
+    # One parsed row of an OMOP CDM documentation table. `row_handler` fills the
+    # fields positionally from the row's `td` cells, in the order declared here.
+    cdm_field: str  # used as the dbt column `name`
+    user_guide: str  # used as the dbt column `description`
+    etl_conventions: str  # stored as-is; not read by the dbt config generator
+    datatype: str  # used as the dbt column `data_type`
+    required: bool  # True when the cell text is exactly "Yes"; adds a `not_null` test
+    primary_key: bool  # True when the cell text is exactly "Yes"; adds a `unique` test
+    foreign_key: bool  # True when the cell text is exactly "Yes"; adds a relationship test
+    foreign_key_table: str  # lower-cased to build the `ref(...)` target and `<table>_id` field
+    foreign_key_domain: str  # empty string means the relationship is not domain-constrained
+
+
+def table_handler(table) -> list[omop_documentation_container]:
+    """
+    Parse every `tr` element found under `table` into a documentation object.
+
+    Returns one `omop_documentation_container` per row, in the order `find_all`
+    yields the rows.
+    """
+    parsed_rows = []
+
+    for table_row in table.find_all("tr"):
+        parsed_rows.append(row_handler(table_row))
+
+    return parsed_rows
+
+
+def row_handler(rows) -> omop_documentation_container:
+    """
+    Convert a single table row into an `omop_documentation_container`.
+
+    The first nine `td` cells of the row are read positionally. Newlines are removed
+    from each cell's text and surrounding whitespace is stripped; the three yes/no
+    cells are then turned into booleans with `sentinel_to_bool`.
+    """
+    # Position in this tuple == index of the `td` cell holding that value
+    field_names = (
+        "cdm_field",
+        "user_guide",
+        "etl_conventions",
+        "datatype",
+        "required",
+        "primary_key",
+        "foreign_key",
+        "foreign_key_table",
+        "foreign_key_domain",
+    )
+    boolean_fields = ("primary_key", "required", "foreign_key")
+
+    tds = rows.find_all("td")
+
+    # Collect the raw text first so a short row fails before any cleaning happens
+    raw_values = {name: tds[position].text for position, name in enumerate(field_names)}
+
+    # Remove dangling whitespace and newlines from parsed HTML
+    values = {}
+    for name, raw_text in raw_values.items():
+        values[name] = raw_text.replace("\n", "").strip()
+
+    # Handle booleans expressed as text
+    for name in boolean_fields:
+        values[name] = sentinel_to_bool(values[name])
+
+    return omop_documentation_container(**values)
+
+
+def sentinel_to_bool(text) -> bool:
+    """
+    Return True only when `text` is exactly the string "Yes".
+
+    Every other value (for example "No", "" or "yes") maps to False.
+    """
+    return text == "Yes"
+
+
+def extract_table_description(table_handle) -> str:
+    """
+    Return the description text that follows the "Table Description" paragraph.
+
+    The text is taken from the second sibling after the `p` element whose string is
+    "Table Description"; newlines in it are replaced with single spaces.
+    """
+    heading = table_handle.find("p", string="Table Description")
+
+    # Two hops are needed to reach the description node
+    # NOTE(review): presumably the first sibling is the whitespace between tags - confirm
+    description_node = heading.next_sibling.next_sibling
+
+    return description_node.text.replace("\n", " ")
+
+
+def omop_docs_to_dbt_config(obj: omop_documentation_container) -> dict:
+    """
+    Build the dbt column config (name, description, data type, tests) for one
+    documented CDM column.
+
+    Tests are derived from the documentation flags: `not_null` for required columns,
+    `unique` for primary keys, and a relationship test for foreign keys. The `tests`
+    key is only present when at least one test applies.
+    """
+    # == Create Tests ==
+    tests: list = []
+
+    if obj.required:
+        tests.append("not_null")
+
+    if obj.primary_key:
+        tests.append("unique")
+
+    if obj.foreign_key:
+        target_table = obj.foreign_key_table.lower()
+
+        # Both relationship flavours share the same target and field
+        relationship = {
+            "to": f"ref('{target_table}')",
+            "field": f"{target_table}_id",
+        }
+
+        if obj.foreign_key_domain != "":
+            # Domain-constrained foreign key: only check rows whose value is not 0
+            # against target rows of the documented domain
+            relationship["from_condition"] = f"{obj.cdm_field} <> 0"
+            relationship["to_condition"] = f"domain_id = '{obj.foreign_key_domain}'"
+            tests.append({"dbt_utils.relationships_where": relationship})
+        else:
+            # Simple case: no domain constraint
+            tests.append({"relationships": relationship})
+
+    column_config = {
+        "name": obj.cdm_field,
+        "description": obj.user_guide,
+        "data_type": obj.datatype,
+    }
+
+    if tests:
+        column_config["tests"] = tests
+
+    return column_config
+
+
+def extract_table_names(soup_obj: BeautifulSoup) -> list[str]:
+    """
+    Dynamically extract table names from the OMOP CDM documentation.
+
+    A table name is the text of the `h3` heading inside each `div` whose class is
+    `section level3 tabset tabset-pills`. The `cohort` and `cohort_definition` tables
+    are left out of the result; documentation that does not contain them is handled
+    without raising.
+    """
+    ignored_tables = {"cohort", "cohort_definition"}
+
+    sections = soup_obj.find_all(
+        "div", attrs={"class": "section level3 tabset tabset-pills"}
+    )
+    table_names = [section.find("h3").text for section in sections]
+
+    print(" [Note] Ignoring `cohort` and `cohort_definition` tables from documentation")
+
+    # Filter rather than `list.remove`, which raises ValueError when a name is absent
+    return [name for name in table_names if name not in ignored_tables]
+
+
+def main(
+    cdm_docs_path: Path,
+    output_dir: Path,
+) -> None:
+    """
+    Generate one dbt YAML file per table found in the OMOP CDM documentation.
+
+    Args:
+        cdm_docs_path: Path to the OMOP CDM documentation HTML file.
+        output_dir: Existing directory that receives one `<table>.yml` file per table.
+    """
+    # Read explicitly as UTF-8 so parsing does not depend on the platform default
+    # NOTE(review): assumes the documentation HTML is UTF-8 encoded - confirm
+    with open(cdm_docs_path, encoding="utf-8") as file_handle:
+        soup = BeautifulSoup(file_handle.read(), features="html.parser")
+
+    tables = extract_table_names(soup)
+
+    print(f" Found {len(tables)} tables in the OMOP CDM documentation")
+
+    # The dumper settings are identical for every table, so configure it once
+    yaml = YAML()
+    yaml.indent(mapping=2, sequence=4, offset=2)
+    yaml.width = 100
+
+    for table in tables:
+        # For each table generate the desired dbt yaml
+        # Get desired div with table (the div id is the table name)
+        table_handle = soup.find("div", attrs={"id": table})
+
+        tbody_handle = table_handle.find("table").find("tbody")
+        parsed_table = table_handler(tbody_handle)
+        table_description = extract_table_description(table_handle)
+
+        table_dict = {
+            "models": [
+                {
+                    "name": table,
+                    "description": table_description,
+                    "columns": [omop_docs_to_dbt_config(obj) for obj in parsed_table],
+                }
+            ]
+        }
+
+        # Context manager guarantees the file is flushed and closed after each dump
+        output_path = Path(output_dir) / f"{table}.yml"
+        with open(output_path, "w", encoding="utf-8") as output_handle:
+            yaml.dump(table_dict, output_handle)
+
+    print(f" Exported to `{output_dir}`")
+    print("  Done!")
+
+
+# == Handle arguments ==
+# Two positional arguments: the OMOP CDM documentation HTML (e.g. cdm54.html) and an
+# existing directory to write the YAML files into.
+# NOTE(review): this section is not behind an `if __name__ == "__main__":` guard, so it
+# also runs when the module is imported.
+parser = argparse.ArgumentParser(
+    description=(
+        "Generate dbt YAML files from the OMOP CDM documentation. "
+        "For example: python generate_dbt_yaml.py cdm54.html ./output"
+    )
+)
+parser.add_argument(
+    "cdm_html",
+    type=str,
+    help="Path to the OMOP CDM documentation HTML",
+)
+parser.add_argument(
+    "output_dir",
+    type=str,
+    help="Path to the output directory",
+)
+args = parser.parse_args()
+
+cdm_docs_path, output_dir = Path(args.cdm_html), Path(args.output_dir)
+
+# Validate both paths up front; `parser.error` reports the message and exits
+if not cdm_docs_path.exists():
+    parser.error(f"File {cdm_docs_path} does not exist")
+
+if not output_dir.exists():
+    parser.error(f"Directory {output_dir} does not exist")
+
+main(cdm_docs_path, output_dir)