Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add basic dbt objects and rule definitions #6

Merged
merged 27 commits into from
Mar 25, 2024
Merged
Show file tree
Hide file tree
Changes from 25 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
794e79b
WIP Add basic dbt objects and rule definitions
jochemvandooren Mar 8, 2024
25d1059
Fix decorator and change manifest loading
jochemvandooren Mar 13, 2024
37c0a01
Add newline after docstring
jochemvandooren Mar 13, 2024
8811c28
Improve documentation and remove logging
jochemvandooren Mar 13, 2024
4a620e8
Improve model definitions and process feedback
jochemvandooren Mar 14, 2024
d1ddd4b
Remove debugging lines
jochemvandooren Mar 14, 2024
5e4a4bc
Add more parameters and cleanup
jochemvandooren Mar 18, 2024
7e24cdd
Fix Model.from_node
jochemvandooren Mar 18, 2024
e1e0fb3
Add _values to Constraint
jochemvandooren Mar 18, 2024
787bebf
Fix typehints
jochemvandooren Mar 18, 2024
dce673b
Rename _raw_node to _raw_values in order to align naming
jochemvandooren Mar 18, 2024
c14d7a7
Get rid of self in regular method
jochemvandooren Mar 18, 2024
b8636f2
Add private variables to classmethods
jochemvandooren Mar 18, 2024
db2643b
Load manifest from path in ManifestLoader
jochemvandooren Mar 18, 2024
66e2c73
Improve loading
jochemvandooren Mar 18, 2024
cb82aa2
Add unit tests.
jochemvandooren Mar 20, 2024
2bf66c5
Remove unused variables
jochemvandooren Mar 20, 2024
8256171
Remove non-default rules
jochemvandooren Mar 20, 2024
05c46ac
Clean up conftest and add json resource
jochemvandooren Mar 21, 2024
c77279a
Prettify manifest.json
jochemvandooren Mar 21, 2024
cbca0b2
Update src/dbt_score/models.py
jochemvandooren Mar 22, 2024
23282db
Update tests/test_models.py
jochemvandooren Mar 22, 2024
09b0883
Update tests/conftest.py
jochemvandooren Mar 22, 2024
2a4966c
Naming and fix path
jochemvandooren Mar 22, 2024
a21c5c4
Improve indendation
jochemvandooren Mar 22, 2024
8d94bf9
Ignore missing return value for rules
jochemvandooren Mar 22, 2024
f396137
Add test and improve exceptions
jochemvandooren Mar 22, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
248 changes: 248 additions & 0 deletions src/dbt_score/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,248 @@
"""Objects related to loading the dbt manifest."""
jochemvandooren marked this conversation as resolved.
Show resolved Hide resolved

import json
from collections import defaultdict
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any


@dataclass
class Constraint:
"""Constraint for a column.

Attributes:
type: The type of the constraint, e.g. `foreign_key`.
name: The name of the constraint.
expression: The expression of the constraint, e.g. `schema.other_table`.
_raw_values: The raw values of the constraint in the manifest.
"""

type: str
name: str | None = None
expression: str | None = None
_raw_values: dict[str, Any] = field(default_factory=dict)

@classmethod
def from_raw_values(cls, raw_values: dict[str, Any]) -> "Constraint":
"""Create a constraint object from a constraint node in the manifest."""
return cls(
type=raw_values["type"],
name=raw_values["name"],
expression=raw_values["expression"],
_raw_values=raw_values,
)


@dataclass
class Test:
"""Test for a column or model.

Attributes:
name: The name of the test.
type: The type of the test, e.g. `unique`.
kwargs: The kwargs of the test.
tags: The list of tags attached to the test.
_raw_values: The raw values of the test in the manifest.
"""

name: str
type: str
kwargs: dict[str, Any] = field(default_factory=dict)
tags: list[str] = field(default_factory=list)
_raw_values: dict[str, Any] = field(default_factory=dict)

@classmethod
def from_node(cls, test_node: dict[str, Any]) -> "Test":
"""Create a test object from a test node in the manifest."""
return cls(
name=test_node["name"],
type=test_node["test_metadata"]["name"],
kwargs=test_node["test_metadata"].get("kwargs", {}),
tags=test_node.get("tags", []),
_raw_values=test_node,
)


@dataclass
class Column:
"""Represents a column in a model.

Attributes:
name: The name of the column.
description: The description of the column.
data_type: The data type of the column.
meta: The metadata attached to the column.
constraints: The list of constraints attached to the column.
tags: The list of tags attached to the column.
tests: The list of tests attached to the column.
_raw_values: The raw values of the column as defined in the node.
_raw_test_values: The raw test values of the column as defined in the node.
"""

name: str
description: str
data_type: str | None = None
meta: dict[str, Any] = field(default_factory=dict)
constraints: list[Constraint] = field(default_factory=list)
tags: list[str] = field(default_factory=list)
tests: list[Test] = field(default_factory=list)
_raw_values: dict[str, Any] = field(default_factory=dict)
_raw_test_values: list[dict[str, Any]] = field(default_factory=list)

@classmethod
def from_node_values(
cls, values: dict[str, Any], test_values: list[dict[str, Any]]
) -> "Column":
"""Create a column object from raw values."""
return cls(
name=values["name"],
description=values["description"],
data_type=values["data_type"],
meta=values["meta"],
constraints=[
Constraint.from_raw_values(constraint)
for constraint in values["constraints"]
],
tags=values["tags"],
tests=[Test.from_node(test) for test in test_values],
_raw_values=values,
_raw_test_values=test_values,
)


@dataclass
class Model:
"""Represents a dbt model.

Attributes:
unique_id: The id of the model, e.g. `model.package.model_name`.
name: The name of the model.
relation_name: The relation name of the model, e.g. `db.schema.model_name`.
description: The full description of the model.
original_file_path: The sql path of the model, `e.g. model_dir/dir/file.sql`.
config: The config of the model.
meta: The meta of the model.
columns: The list of columns of the model.
package_name: The package name of the model.
database: The database name of the model.
schema: The schema name of the model.
raw_code: The raw code of the model.
alias: The alias of the model.
patch_path: The yml path of the model, e.g. `package://model_dir/dir/file.yml`.
tags: The list of tags attached to the model.
tests: The list of tests attached to the model.
depends_on: Dictionary of models/sources/macros that the model depends on.
_raw_values: The raw values of the model (node) in the manifest.
_raw_test_values: The raw test values of the model (node) in the manifest.
"""

unique_id: str
name: str
relation_name: str
description: str
original_file_path: str
config: dict[str, Any]
meta: dict[str, Any]
columns: list[Column]
package_name: str
database: str
schema: str
raw_code: str
alias: str | None = None
patch_path: str | None = None
tags: list[str] = field(default_factory=list)
tests: list[Test] = field(default_factory=list)
depends_on: dict[str, list[str]] = field(default_factory=dict)
_raw_values: dict[str, Any] = field(default_factory=dict)
_raw_test_values: list[dict[str, Any]] = field(default_factory=list)

def get_column(self, column_name: str) -> Column | None:
"""Get a column by name."""
for column in self.columns:
if column.name == column_name:
return column

return None

@staticmethod
def _get_columns(
node_values: dict[str, Any], test_values: list[dict[str, Any]]
) -> list[Column]:
"""Get columns from a node and its tests in the manifest."""
return [
Column.from_node_values(
values,
[
test
for test in test_values
if test["test_metadata"]["kwargs"].get("column_name") == name
],
)
for name, values in node_values.get("columns", {}).items()
]

@classmethod
def from_node(
cls, node_values: dict[str, Any], test_values: list[dict[str, Any]]
) -> "Model":
"""Create a model object from a node and it's tests in the manifest."""
return cls(
unique_id=node_values["unique_id"],
name=node_values["name"],
relation_name=node_values["relation_name"],
description=node_values["description"],
original_file_path=node_values["original_file_path"],
config=node_values["config"],
meta=node_values["meta"],
columns=cls._get_columns(node_values, test_values),
package_name=node_values["package_name"],
database=node_values["database"],
schema=node_values["schema"],
raw_code=node_values["raw_code"],
alias=node_values["alias"],
patch_path=node_values["patch_path"],
tags=node_values["tags"],
tests=[
Test.from_node(test)
for test in test_values
if not test["test_metadata"]["kwargs"].get("column_name")
],
depends_on=node_values["depends_on"],
_raw_values=node_values,
_raw_test_values=test_values,
)


class ManifestLoader:
jochemvandooren marked this conversation as resolved.
Show resolved Hide resolved
"""Load the models and tests from the manifest."""

def __init__(self, file_path: Path):
"""Initialize the ManifestLoader.

Args:
file_path: The file path of the JSON manifest.
"""
self.raw_manifest = json.loads(file_path.read_text(encoding="utf-8"))
self.raw_nodes = self.raw_manifest.get("nodes", {})
self.models: list[Model] = []
self.tests: dict[str, list[dict[str, Any]]] = defaultdict(list)

self._reindex_tests()
self._load_models()

def _load_models(self) -> None:
"""Load the models from the manifest."""
for node_id, node_values in self.raw_nodes.items():
if node_values.get("resource_type") == "model":
model = Model.from_node(node_values, self.tests.get(node_id, []))
self.models.append(model)

def _reindex_tests(self) -> None:
"""Index tests based on their model id."""
for node_values in self.raw_nodes.values():
# Only include tests that are attached to a model.
if node_values.get("resource_type") == "test" and (
attached_node := node_values.get("attached_node")
):
self.tests[attached_node].append(node_values)
85 changes: 85 additions & 0 deletions src/dbt_score/rule.py
Copy link
Contributor

@michael-the1 michael-the1 Mar 21, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm sure you've thought a lot about this setup and I'd also like to learn more so here goes:

What is the reason @rule returns a subclass of Rule, rather than a Rule object? That is, instead of:

Callable[..., Type[Rule]]

Why not:

Callable[..., Rule]

Generating subclasses, of which there will only be one object per subclass, feels similar to generating objects, i.e. specific instances of Rule, but the latter is semantically closer to what you'd want (I think).

rule would then look something like this:

def rule(
    description: str | None = None,
    severity: Severity = Severity.MEDIUM,
) -> Callable[[Callable[[Model], RuleViolation | None]], Rule]:
    """Rule decorator.
    The rule decorator creates a Rule and returns it.
    Args:
        description: The description of the rule.
        severity: The severity of the rule.
    """

    def decorator_rule(
        func: Callable[[Model], RuleViolation | None],
    ) -> Rule:
        """Decorator function."""

        ...

        def wrapped_func(self: Rule, *args: Any, **kwargs: Any) -> RuleViolation | None:
            """Wrap func to add `self`."""
            return func(*args, **kwargs)

        rule_obj = Rule(
            description=...,
            severity=...,
            evaluate=wrapped_func,
        )

        return rule_obj

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One of the reasons is to allow users to create more advanced rules by subclassing Rule (simpler rules are encouraged to use the diet @rule decorator). Does that make sense?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes it is indeed very similar, but this way it will be easier for users to create their own complex rules with a class. So this gives full flexibility for users that need it

Copy link
Contributor

@michael-the1 michael-the1 Mar 21, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But can't they still do that even in this setup?

@rule
def some_simple_rule():
    ...

class ComplexRule(Rule):
    """Do the thing, as long as it implements `evaluate`"""
    ...

Then in the rule registry / runner:

# In the real registry this discovery happens differently of course
rules: Rule = [
    some_simple_rule(),  # Instantiate a Rule object
    ComplexRule(),  # Instantiate a ComplexRule object, which is a subclass of Rule
    ...
]

for rule in rules:
    rule.evaluate(model)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The snippet above is how it currently works. But with your suggestion of creating instances instead of classes in the decorator, some_simple_rule would already be an instance, therefore some_simple_rule() would not work, right?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oops, I missed the last return 🙈

def rule(..) -> Callable[[Callable[[Model], RuleViolation | None]], Rule]:
    def decorator_rule(
        func: Callable[[Model], RuleViolation | None],
    ) -> Rule:
        def wrapped_func(self: Rule, *args: Any, **kwargs: Any) -> RuleViolation | None:
            return func(*args, **kwargs)

        rule_obj = Rule()

        return rule_obj
    return decorator_rule

Which should result in:

@rule
def some_simple_rule():
    ...

some_simple_rule # Function object that returns a Rule object
some_simple_rule() # Instance of Rule

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

True. But I think it's more straightforward to go with the approach: "every subclass of Rule is a rule". Such subclass can be created either by subclassing, or by using @rule as syntactic sugar. This puts all rules "at the same level", if that makes sense?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The way I see it, both approaches are the same in the sense that "every Rule is a rule".

I understand that, by always subclassing Rule, all rules are on the same hierarchy. But from another perspective, they're all Rules and in aggregate we will work with lists of Rules. It doesn't matter if you subclass a Rule or use a subclass of a subclass of Rule or use Rule itself.

From a user perspective, nothing will change. You either use @rule as syntactic sugar or subclass Rule.

Where they differ is that I think, semantically, a list of objects is more straightforward than a list of generated subclasses. And I also think it simplifies implementation a bit.

Btw, because nothing changes from a user's perspective, I wouldn't make this point blocking.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have given it another look and decided to go with the current implementation. Going with the instance implementation also has some challenges, e.g. the example

@rule
def some_simple_rule():
    ...

some_simple_rule # Function object that returns a Rule object
some_simple_rule() # Instance of Rule

will not exactly work. To call some_simple_rule() we will need to grab the func from somewhere again as it's an input variable. I think the implementation does not necessarily becomes easier when using instances instead of classes.

Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
"""Rule definitions."""

from dataclasses import dataclass
from enum import Enum
from typing import Any, Callable, Type

from dbt_score.models import Model


class Severity(Enum):
"""The severity/weight of a rule."""

LOW = 1
MEDIUM = 2
HIGH = 3
CRITICAL = 4


@dataclass
class RuleViolation:
"""The violation of a rule."""

message: str | None = None


class Rule:
"""The rule base class."""

description: str
severity: Severity = Severity.MEDIUM

def __init_subclass__(cls, **kwargs) -> None: # type: ignore
"""Initializes the subclass."""
super().__init_subclass__(**kwargs)
if not hasattr(cls, "description"):
jochemvandooren marked this conversation as resolved.
Show resolved Hide resolved
raise TypeError("Subclass must define class attribute `description`.")
jochemvandooren marked this conversation as resolved.
Show resolved Hide resolved

def evaluate(self, model: Model) -> RuleViolation | None:
"""Evaluates the rule."""
raise NotImplementedError("Subclass must implement method `evaluate`.")


def rule(
description: str | None = None,
severity: Severity = Severity.MEDIUM,
) -> Callable[[Callable[[Model], RuleViolation | None]], Type[Rule]]:
"""Rule decorator.

The rule decorator creates a rule class (subclass of Rule) and returns it.

Args:
description: The description of the rule.
severity: The severity of the rule.
"""

def decorator_rule(
func: Callable[[Model], RuleViolation | None],
) -> Type[Rule]:
"""Decorator function."""
if func.__doc__ is None and description is None:
raise TypeError("Rule must define `description` or `func.__doc__`.")

# Get description parameter, otherwise use the docstring.
rule_description = description or (
func.__doc__.split("\n")[0] if func.__doc__ else None
)

def wrapped_func(self: Rule, *args: Any, **kwargs: Any) -> RuleViolation | None:
"""Wrap func to add `self`."""
return func(*args, **kwargs)

# Create the rule class inheriting from Rule.
rule_class = type(
func.__name__,
(Rule,),
{
"description": rule_description,
"severity": severity,
"evaluate": wrapped_func,
},
)

return rule_class

return decorator_rule
1 change: 1 addition & 0 deletions src/dbt_score/rules/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Rules."""
28 changes: 28 additions & 0 deletions src/dbt_score/rules/generic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
"""All generic rules."""

from dbt_score.models import Model
from dbt_score.rule import RuleViolation, rule


@rule()
def has_description(model: Model) -> RuleViolation | None:
"""A model should have a description."""
if not model.description:
return RuleViolation(message="Model lacks a description.")

return None


@rule()
def columns_have_description(model: Model) -> RuleViolation | None:
"""All columns of a model should have a description."""
invalid_column_names = [
column.name for column in model.columns if not column.description
]
if invalid_column_names:
return RuleViolation(
message=f"The following columns lack a description: "
f"{', '.join(invalid_column_names)}."
)

return None
Loading