From 503a0bcda9080430f44eb98dd2a3ecdc92fa8548 Mon Sep 17 00:00:00 2001
From: Paul Yang
Date: Sun, 12 Nov 2023 17:39:32 -0800
Subject: [PATCH] Add a way to print object + fields in a compact and pretty
 format.

This is needed to aid debugging via logs and to make tests easier to write by
allowing for easy generation of descriptive snapshots.
---
 metricflow/collection_helpers/pretty_print.py | 412 ++++++++++++++++++
 metricflow/formatting.py                      |   4 +-
 .../collection_helpers/test_pretty_print.py   |  86 ++++
 3 files changed, 500 insertions(+), 2 deletions(-)
 create mode 100644 metricflow/collection_helpers/pretty_print.py
 create mode 100644 metricflow/test/collection_helpers/test_pretty_print.py

diff --git a/metricflow/collection_helpers/pretty_print.py b/metricflow/collection_helpers/pretty_print.py
new file mode 100644
index 0000000000..d00846b4c6
--- /dev/null
+++ b/metricflow/collection_helpers/pretty_print.py
@@ -0,0 +1,412 @@
+from __future__ import annotations
+
+import logging
+import pprint
+from collections.abc import Mapping
+from dataclasses import fields, is_dataclass
+from enum import Enum
+from typing import Any, List, Optional, Sized, Union
+
+from pydantic import BaseModel
+
+from metricflow.formatting import indent_log_line
+
+logger = logging.getLogger(__name__)
+
+
+class MetricFlowPrettyFormatter:
+    """Creates string representations of objects useful for logging / snapshots."""
+
+    def __init__(
+        self,
+        indent_prefix: str,
+        max_line_length: int,
+        include_object_field_names: bool,
+        include_none_object_fields: bool,
+        include_empty_object_fields: bool,
+    ) -> None:
+        """See mf_pformat() for argument descriptions."""
+        self._indent_prefix = indent_prefix
+        if max_line_length <= 0:
+            raise ValueError(f"max_line_length must be > 0 as required by pprint.pformat(). Got {max_line_length}")
+        self._max_line_width = max_line_length
+        self._include_object_field_names = include_object_field_names
+        self._include_none_object_fields = include_none_object_fields
+        self._include_empty_object_fields = include_empty_object_fields
+
+    @staticmethod
+    def _is_pydantic_base_model(obj: Any) -> bool:  # type:ignore
+        return isinstance(obj, BaseModel)
+
+    def _handle_sequence_obj(self, list_like_obj: Union[list, tuple], remaining_line_width: Optional[int]) -> str:
+        """Pretty prints a sequence object i.e. list or tuple.
+
+        Args:
+            list_like_obj: A list or a tuple.
+            remaining_line_width: If specified, try to make the string representation <= this many columns wide.
+
+        Returns:
+            A string representation of the sequence like (1,) or [1, 2].
+        """
+        if isinstance(list_like_obj, list):
+            left_enclose_str = "["
+            right_enclose_str = "]"
+        elif isinstance(list_like_obj, tuple):
+            left_enclose_str = "("
+            right_enclose_str = ")"
+        else:
+            raise RuntimeError(f"Unhandled type: {type(list_like_obj)}")
+
+        if len(list_like_obj) == 0:
+            return f"{left_enclose_str}{right_enclose_str}"
+
+        # See if this object can be printed in one line.
+        items_as_str = tuple(self._handle_any_obj(list_item, remaining_line_width=None) for list_item in list_like_obj)
+        line_items = [left_enclose_str]
+        if len(items_as_str) > 0:
+            line_items.extend([", ".join(items_as_str)])
+            if len(items_as_str) == 1:
+                line_items.append(",")
+        line_items.append(right_enclose_str)
+        result_without_width_limit = "".join(line_items)
+
+        if remaining_line_width is None or len(result_without_width_limit) <= remaining_line_width:
+            return result_without_width_limit
+
+        # The item can't be printed on one line, so do an indented style like:
+        """
+        [
+          1,
+          2,
+          ...
+        ]
+        """
+
+        # Convert each item to a pretty string.
+        items_as_str = tuple(
+            self._handle_any_obj(
+                list_item, remaining_line_width=max(0, remaining_line_width - len(self._indent_prefix))
+            )
+            for list_item in list_like_obj
+        )
+        lines = [left_enclose_str]
+
+        # item_block is similar to
+        """
+        1,
+        2,
+        3,
+        """
+
+        item_block = ",\n".join(items_as_str)
+        # Indent the item_block and a trailing comma.
+        if len(item_block) > 0:
+            lines.append(indent_log_line(item_block, indent_prefix=self._indent_prefix) + ",")
+        lines.append(right_enclose_str)
+        return "\n".join(lines)
+
+    def _handle_indented_key_value_item(  # type: ignore[misc]
+        self,
+        key: Any,
+        value: Any,
+        key_value_seperator: str,
+        is_dataclass_like_object: bool,
+        remaining_line_width: Optional[int],
+    ) -> str:
+        """Convert a key / value for a mapping-like object to a string that should be placed in an indented block.
+
+        Mapping-like objects include dictionaries, dataclasses, and Pydantic models. The output of this method would
+        look like:
+
+            "'key': [1, 2, 3]" or "arg=Foo()", etc.
+
+        and the caller of this method would add the actual indent.
+
+        Args:
+            key: The object representing the key.
+            value: The object representing the value.
+            key_value_seperator: The string used to separate the key and the value. e.g. ": " for dicts, "=" for
+            dataclasses.
+            is_dataclass_like_object: Set this to True if the given value object is a dataclass to handle some printing
+            options specific to dataclasses.
+            remaining_line_width: If specified, try to make the string representation <= this many columns wide.
+
+        Returns:
+            The string for the key / value item that goes in between the enclosing strings (e.g. "{" / "}") in the
+            string representation of the mapping-like object. An empty string is returned for an omitted None field.
+        """
+        if is_dataclass_like_object:
+            if not self._include_none_object_fields and value is None:
+                return ""
+
+        # See if the string representation can fit on one line. e.g. "'a': [1, 2]"
+        if remaining_line_width is None or remaining_line_width > 0:
+            result_items_without_limit: List[str] = []
+            # Dict keys are pretty-printed; object field names are emitted verbatim or omitted along with "=".
+            key_str = str(key) if is_dataclass_like_object else self._handle_any_obj(key, remaining_line_width=None)
+            if not is_dataclass_like_object or self._include_object_field_names:
+                result_items_without_limit.append(key_str)
+                result_items_without_limit.append(key_value_seperator)
+            result_items_without_limit.append(self._handle_any_obj(value, remaining_line_width=None))
+
+            result_without_limit = "".join(result_items_without_limit)
+            if remaining_line_width is None or len(result_without_limit) <= remaining_line_width:
+                return result_without_limit
+
+        # The string representation can't fit on one line - use multiple. e.g.
+        """
+        'key':
+          [1, 2, 3, 4]
+        """
+
+        # Create the string for the key.
+        result_lines: List[str] = []
+        if is_dataclass_like_object:
+            if self._include_object_field_names:
+                result_lines.append(str(key) + key_value_seperator)
+        else:
+            key_lines = self._handle_any_obj(key, remaining_line_width=remaining_line_width).splitlines()
+            # Different ways of printing the key / value depending on whether the key fits on one line or requires
+            # multiple.
+            if len(key_lines) == 1:
+                result_lines.append(key_lines[0] + key_value_seperator)
+            else:
+                # The key needs to be printed in multiple lines. In that case, we want a result where the key value
+                # separator is on the last line with the key. e.g.
+                """
+                KeyObject(
+                  a='foo',
+                  b='bar',
+                ): ...
+                """
+                result_lines.append(key_lines[0])
+                result_lines.extend(key_lines[1:-1])
+                result_lines.append(key_lines[-1] + key_value_seperator)
+
+        # Create the string for the values.
+        value_lines = self._handle_any_obj(
+            value, remaining_line_width=max(0, remaining_line_width - len(self._indent_prefix))
+        ).splitlines()
+
+        # Combine key and value.
+
+        # Similar to the key, how we print the value depends on whether the value fits on one line or not. e.g.
+        """
+        foo=[1, 2, 3]
+
+        or
+
+        foo=[
+          1,
+          2,
+          3,
+        ]
+        """
+        if len(value_lines) > 1:
+            # For the multi-line case, we want to print the first line of the value on the same line as the last line of
+            # the key. There is no key line when field names are omitted, so fall back to an empty prefix.
+            last_key_line = result_lines.pop() if len(result_lines) > 0 else ""
+            result_lines.append(last_key_line + value_lines[0])
+            result_lines.extend(value_lines[1:])
+            return "\n".join(result_lines)
+
+        result_lines.append(indent_log_line(value_lines[0], indent_prefix=self._indent_prefix))
+
+        return "\n".join(result_lines)
+
+    def _handle_mapping_like_obj(
+        self,
+        mapping: Mapping,
+        left_enclose_str: str,
+        key_value_seperator: str,
+        right_enclose_str: str,
+        is_dataclass_like_object: bool,
+        remaining_line_width: Optional[int],
+    ) -> str:
+        """Convert a mapping-like object to a pretty string.
+
+        This class treats dataclasses as mappings where the field / field values are the keys / values.
+
+
+        Args:
+            mapping: The mapping object to convert.
+            left_enclose_str: The string used on the left side to enclose the object. e.g. "{" for dicts or "Foo(" for
+            dataclasses.
+            key_value_seperator: The string used to separate keys and values. e.g. ": " for dicts, or "=" for
+            dataclasses.
+            right_enclose_str: The string used on the right side to enclose the object. e.g. "}" for dicts or ")" for
+            dataclasses.
+            is_dataclass_like_object: Flag to indicate whether this is a dataclass as there are some differences in
+            formatting those.
+            remaining_line_width: If specified, try to make the string representation <= this many columns wide.
+
+        Returns:
+            A string representation of the mapping. e.g. "{'a': [1, 2]}" or "Foo(a=[1, 2])".
+        """
+        if is_dataclass_like_object and not self._include_empty_object_fields:
+            mapping = {
+                key: value
+                for key, value in mapping.items()
+                if (isinstance(value, Sized) and len(value) > 0) or (not isinstance(value, Sized))
+            }
+
+        if len(mapping) == 0:
+            return f"{left_enclose_str}{right_enclose_str}"
+        # Handle case if the string representation fits on one line.
+        if remaining_line_width is None or remaining_line_width > 0:
+            comma_separated_items: List[str] = []
+            for key, value in mapping.items():
+                if is_dataclass_like_object and not self._include_none_object_fields and value is None:
+                    continue
+                key_value_str_items: List[str] = []
+
+                if is_dataclass_like_object:
+                    if self._include_object_field_names:
+                        key_value_str_items.append(str(key))
+                        key_value_str_items.append(key_value_seperator)
+                else:
+                    key_value_str_items.append(self._handle_any_obj(key, remaining_line_width=None))
+                    key_value_str_items.append(key_value_seperator)
+                key_value_str_items.append(self._handle_any_obj(value, remaining_line_width=None))
+                comma_separated_items.append("".join(key_value_str_items))
+            result_without_limit = "".join((left_enclose_str, ", ".join(comma_separated_items), right_enclose_str))
+
+            if remaining_line_width is None or len(result_without_limit) <= remaining_line_width:
+                return result_without_limit
+
+        # Handle multi-line case.
+        mapping_items_as_str = []
+        for key, value in mapping.items():
+            if is_dataclass_like_object and not self._include_none_object_fields and value is None:
+                continue
+            mapping_items_as_str.append(
+                self._handle_indented_key_value_item(
+                    key=key,
+                    value=value,
+                    key_value_seperator=key_value_seperator,
+                    is_dataclass_like_object=is_dataclass_like_object,
+                    remaining_line_width=max(0, remaining_line_width - len(self._indent_prefix)),
+                )
+            )
+        lines = [left_enclose_str]
+        if len(mapping_items_as_str) > 0:
+            indented_block = indent_log_line(",\n".join(mapping_items_as_str), indent_prefix=self._indent_prefix)
+            lines.append(indented_block + ",")
+        lines.append(right_enclose_str)
+        return "\n".join(lines)
+
+    def _handle_any_obj(self, obj: Any, remaining_line_width: Optional[int]) -> str:  # type: ignore
+        """Convert any object into a pretty string-representation.
+
+        This is called recursively as sequences and mappings have constituent objects of any type.
+
+        Args:
+            obj: The object to convert.
+            remaining_line_width: If specified, try to make the string representation <= this many columns wide.
+
+        Returns:
+            A pretty string-representation of the object.
+        """
+        if isinstance(obj, Enum):
+            return obj.name
+
+        if isinstance(obj, (list, tuple)):
+            return self._handle_sequence_obj(obj, remaining_line_width=remaining_line_width)
+
+        if isinstance(obj, dict):
+            return self._handle_mapping_like_obj(
+                obj,
+                left_enclose_str="{",
+                key_value_seperator=": ",
+                right_enclose_str="}",
+                is_dataclass_like_object=False,
+                remaining_line_width=remaining_line_width,
+            )
+
+        if is_dataclass(obj) and not isinstance(obj, type):
+            # dataclasses.asdict() seems to exclude None fields, so doing this instead.
+            mapping = {field.name: getattr(obj, field.name) for field in fields(obj)}
+            return self._handle_mapping_like_obj(
+                mapping,
+                left_enclose_str=type(obj).__name__ + "(",
+                key_value_seperator="=",
+                right_enclose_str=")",
+                is_dataclass_like_object=True,
+                remaining_line_width=remaining_line_width,
+            )
+
+        if MetricFlowPrettyFormatter._is_pydantic_base_model(obj):
+            mapping = {key: getattr(obj, key) for key in obj.dict().keys()}
+            return self._handle_mapping_like_obj(
+                mapping,
+                left_enclose_str=type(obj).__name__ + "(",
+                key_value_seperator="=",
+                right_enclose_str=")",
+                is_dataclass_like_object=True,
+                remaining_line_width=remaining_line_width,
+            )
+
+        # Any other object that's not handled.
+        return pprint.pformat(obj, width=self._max_line_width, sort_dicts=False)
+
+    def pretty_format(self, obj: Any) -> str:  # type: ignore[misc]
+        """Return a pretty string representation of the object that's suitable for logging."""
+        return self._handle_any_obj(obj, remaining_line_width=self._max_line_width)
+
+
+def mf_pformat(  # type: ignore
+    obj: Any,
+    max_line_length: int = 120,
+    indent_prefix: str = " ",
+    include_object_field_names: bool = True,
+    include_none_object_fields: bool = False,
+    include_empty_object_fields: bool = False,
+) -> str:
+    """Print objects in a pretty way for logging / test snapshots.
+
+    In Python 3.10, the pretty printer class will support dataclasses, so we can remove this once we're on
+    3.10. Also tried the prettyprint package with dataclasses, but that prints full names for the classes
+    e.g. a.b.MyClass and it also always added line breaks, even if an object could fit on one line, so
+    preferring to not use that for compactness.
+
+    e.g.
+        metricflow.specs.DimensionSpec(
+            element_name='country',
+            entity_links=()
+        ),
+
+    Instead, the below will print something like:
+
+        DimensionSpec(element_name='country', entity_links=())
+
+    Also, this simplifies the object representation in some cases (e.g. Enums) and provides options for a more compact
+    string. This is an improvement on pformat_big_objects() in dbt-semantic-interfaces to be more compact and easier
+    to read.
+
+    TODO: Replace calls to pformat_big_objects with this.
+
+    Args:
+        obj: The object to convert to string.
+        max_line_length: If the string representation is going to be longer than this, split into multiple lines.
+        indent_prefix: The prefix to use for hierarchical indents.
+        include_object_field_names: Include field names when printing objects - e.g. Foo(bar='baz') vs Foo('baz')
+        include_none_object_fields: Include fields with a None value - e.g. Foo(bar=None) vs Foo()
+        include_empty_object_fields: Include fields that are empty - e.g. Foo(bar=()) vs Foo()
+
+    Returns:
+        A string representation of the object that's useful for logging / debugging.
+    """
+    # Since this is used in logging calls, wrap with except so that a bug here doesn't result in something breaking.
+    try:
+        formatter = MetricFlowPrettyFormatter(
+            indent_prefix=indent_prefix,
+            max_line_length=max_line_length,
+            include_object_field_names=include_object_field_names,
+            include_none_object_fields=include_none_object_fields,
+            include_empty_object_fields=include_empty_object_fields,
+        )
+        return formatter.pretty_format(obj)
+    except Exception:
+        # This automatically includes the call trace.
+        logger.exception("Error pretty printing due to an exception - using str() instead.")
+        return str(obj)
diff --git a/metricflow/formatting.py b/metricflow/formatting.py
index 647a27a92f..8170c70b5f 100644
--- a/metricflow/formatting.py
+++ b/metricflow/formatting.py
@@ -3,5 +3,5 @@
 import textwrap
 
 
-def indent_log_line(message: str, indent_level: int = 1) -> str:  # noqa: D
-    return textwrap.indent(message, prefix=" " * indent_level)
+def indent_log_line(message: str, indent_level: int = 1, indent_prefix: str = " ") -> str:  # noqa: D
+    return textwrap.indent(message, prefix=indent_prefix * indent_level)
diff --git a/metricflow/test/collection_helpers/test_pretty_print.py b/metricflow/test/collection_helpers/test_pretty_print.py
new file mode 100644
index 0000000000..e073033558
--- /dev/null
+++ b/metricflow/test/collection_helpers/test_pretty_print.py
@@ -0,0 +1,86 @@
+from __future__ import annotations
+
+import logging
+import textwrap
+
+from dbt_semantic_interfaces.implementations.elements.dimension import PydanticDimension
+from dbt_semantic_interfaces.type_enums import DimensionType
+
+from metricflow.collection_helpers.pretty_print import mf_pformat
+from metricflow.test.time.metric_time_dimension import MTD_SPEC_DAY
+
+logger = logging.getLogger(__name__)
+
+
+def test_literals() -> None:  # noqa: D
+    assert mf_pformat(1) == "1"
+    assert mf_pformat(1.0) == "1.0"
+    assert mf_pformat("foo") == "'foo'"
+
+
+def test_containers() -> None:  # noqa: D
+    assert mf_pformat((1,)) == "(1,)"
+    assert mf_pformat(((1, 2), 3)) == "((1, 2), 3)"
+    assert mf_pformat([[1, 2], 3]) == "[[1, 2], 3]"
+    assert mf_pformat({"a": ((1, 2), 3), (1, 2): 3}) == "{'a': ((1, 2), 3), (1, 2): 3}"
+
+
+def test_classes() -> None:  # noqa: D
+    assert "TimeDimensionSpec('metric_time', DAY)" == mf_pformat(
+        MTD_SPEC_DAY,
+        include_object_field_names=False,
+        include_none_object_fields=False,
+        include_empty_object_fields=False,
+    )
+    assert (
+        textwrap.dedent(
+            """\
+            TimeDimensionSpec(
+             element_name='metric_time',
+             entity_links=(),
+             time_granularity=DAY,
+             date_part=None,
+             aggregation_state=None,
+            )
+            """
+        ).rstrip()
+        == mf_pformat(
+            MTD_SPEC_DAY,
+            include_object_field_names=True,
+            include_none_object_fields=True,
+            include_empty_object_fields=True,
+        )
+    )
+
+    assert "TimeDimensionSpec(element_name='metric_time', time_granularity=DAY)" == mf_pformat(MTD_SPEC_DAY)
+
+
+def test_multi_line_key_value() -> None:
+    """Test a dict where the key and value needs to be printed on multiple lines."""
+    assert (
+        textwrap.dedent(
+            """\
+            {
+             (
+              1,
+              2,
+              3,
+             ): (
+              4,
+              5,
+              6,
+             ),
+            }
+            """
+        ).rstrip()
+        == mf_pformat(
+            obj={(1, 2, 3): (4, 5, 6)},
+            max_line_length=1,
+        )
+    )
+
+
+def test_pydantic_model() -> None:  # noqa: D
+    assert "PydanticDimension(name='foo', type=CATEGORICAL, is_partition=False)" == mf_pformat(
+        PydanticDimension(name="foo", type=DimensionType.CATEGORICAL)
+    )