Querying dimensions without metrics #804
Changes from 1 commit
@@ -14,8 +14,11 @@
 )
 from dbt_semantic_interfaces.type_enums.time_granularity import TimeGranularity
 
+from metricflow.dataflow.builder.node_data_set import DataflowPlanNodeOutputDataSetResolver
+from metricflow.dataflow.dataflow_plan import BaseOutput
 from metricflow.filters.time_constraint import TimeRangeConstraint
 from metricflow.model.semantic_manifest_lookup import SemanticManifestLookup
+from metricflow.naming.linkable_spec_name import StructuredLinkableSpecName
 from metricflow.specs.specs import (
     TimeDimensionSpec,
 )
@@ -74,6 +77,9 @@ def validate_time_granularity(
 
         e.g. throw an error if "ds__week" is specified for a metric with a time granularity of MONTH.
         """
+        if not metric_references:
+            return None
+
         valid_group_by_elements = self._semantic_manifest_lookup.metric_lookup.linkable_set_for_metrics(
             metric_references=metric_references,
         )
@@ -97,15 +103,21 @@ def resolve_granularity_for_partial_time_dimension_specs(
         self,
         metric_references: Sequence[MetricReference],
         partial_time_dimension_specs: Sequence[PartialTimeDimensionSpec],
+        read_nodes: Sequence[BaseOutput],
+        node_output_resolver: DataflowPlanNodeOutputDataSetResolver,
     ) -> Dict[PartialTimeDimensionSpec, TimeDimensionSpec]:
         """Figure out the lowest granularity possible for the partially specified time dimension specs.
 
         Returns a dictionary that maps how the partial time dimension spec should be turned into a time dimension spec.
         """
         result: Dict[PartialTimeDimensionSpec, TimeDimensionSpec] = {}
 
         for partial_time_dimension_spec in partial_time_dimension_specs:
             minimum_time_granularity = self.find_minimum_granularity_for_partial_time_dimension_spec(
-                partial_time_dimension_spec=partial_time_dimension_spec, metric_references=metric_references
+                partial_time_dimension_spec=partial_time_dimension_spec,
+                metric_references=metric_references,
+                read_nodes=read_nodes,
+                node_output_resolver=node_output_resolver,
             )
             result[partial_time_dimension_spec] = TimeDimensionSpec(
                 element_name=partial_time_dimension_spec.element_name,
@@ -116,35 +128,75 @@ def resolve_granularity_for_partial_time_dimension_specs(
         return result
 
     def find_minimum_granularity_for_partial_time_dimension_spec(
-        self, partial_time_dimension_spec: PartialTimeDimensionSpec, metric_references: Sequence[MetricReference]
+        self,
+        partial_time_dimension_spec: PartialTimeDimensionSpec,
+        metric_references: Sequence[MetricReference],
+        read_nodes: Sequence[BaseOutput],
+        node_output_resolver: DataflowPlanNodeOutputDataSetResolver,
     ) -> TimeGranularity:
         """Find minimum granularity allowed for time dimension when queried with given metrics."""
-        valid_group_by_elements = self._semantic_manifest_lookup.metric_lookup.linkable_set_for_metrics(
-            metric_references=metric_references,
-        )
-
         minimum_time_granularity: Optional[TimeGranularity] = None
-        for path_key in valid_group_by_elements.path_key_to_linkable_dimensions:
-            if (
-                path_key.element_name == partial_time_dimension_spec.element_name
-                and path_key.entity_links == partial_time_dimension_spec.entity_links
-                and path_key.time_granularity is not None
-            ):
-                minimum_time_granularity = (
-                    path_key.time_granularity
-                    if minimum_time_granularity is None
-                    else min(minimum_time_granularity, path_key.time_granularity)
-                )
-
-        if not minimum_time_granularity:
-            raise RequestTimeGranularityException(
-                f"Unable to resolve the time dimension spec for {partial_time_dimension_spec}. "
-                f"Valid group by elements are:\n"
-                f"{pformat_big_objects([spec.qualified_name for spec in valid_group_by_elements.as_spec_set.as_tuple])}"
-            )
+        if metric_references:
+            valid_group_by_elements = self._semantic_manifest_lookup.metric_lookup.linkable_set_for_metrics(
+                metric_references=metric_references,
+            )
+            for path_key in valid_group_by_elements.path_key_to_linkable_dimensions:
+                if (
+                    path_key.element_name == partial_time_dimension_spec.element_name
+                    and path_key.entity_links == partial_time_dimension_spec.entity_links
+                    and path_key.time_granularity is not None
+                ):
+                    minimum_time_granularity = (
+                        path_key.time_granularity
+                        if minimum_time_granularity is None
+                        else min(minimum_time_granularity, path_key.time_granularity)
+                    )
+            if not minimum_time_granularity:
+                raise RequestTimeGranularityException(
+                    f"Unable to resolve the time dimension spec for {partial_time_dimension_spec}. "
+                    f"Valid group by elements are:\n"
+                    f"{pformat_big_objects([spec.qualified_name for spec in valid_group_by_elements.as_spec_set.as_tuple])}"
+                )
+        else:
+            minimum_time_granularity = self.get_min_granularity_for_partial_time_dimension_without_metrics(
+                read_nodes=read_nodes,
+                node_output_resolver=node_output_resolver,
+                partial_time_dimension_spec=partial_time_dimension_spec,
+            )
+            if not minimum_time_granularity:
+                raise RequestTimeGranularityException(
+                    f"Unable to resolve the time dimension spec for {partial_time_dimension_spec}."
+                )
 
         return minimum_time_granularity
 
+    def get_min_granularity_for_partial_time_dimension_without_metrics(
+        self,
+        read_nodes: Sequence[BaseOutput],
+        node_output_resolver: DataflowPlanNodeOutputDataSetResolver,
+        partial_time_dimension_spec: PartialTimeDimensionSpec,
+    ) -> Optional[TimeGranularity]:
+        """Find the minimum time granularity available for the partial time dimension spec in the read nodes."""
+        granularity_free_qualified_name = StructuredLinkableSpecName(
+            entity_link_names=tuple(
+                [entity_link.element_name for entity_link in partial_time_dimension_spec.entity_links]
+            ),
+            element_name=partial_time_dimension_spec.element_name,
+        ).granularity_free_qualified_name
+        for read_node in read_nodes:
+            output_data_set = node_output_resolver.get_output_data_set(read_node)
Review thread on this loop:

"This feels like potentially a lot of iterating through datasets to me, since we'll call this function once for every partial time dimension spec, but I think it's fine for now."

"Yeah, it would probably be better to create a dict of time dimension instances to their output datasets on init (or something like that) so we only need to do this loop once. Not sure how complicated that change will be; let me look into it and see if it's worth doing before merge."

"FWIW, it turns out we are caching the output dataset, so …"
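For intuition, the caching mentioned in that last reply can be pictured as a per-node memo around the resolver. This is a minimal sketch under that assumption; the class and its resolve_fn parameter are hypothetical, not the actual DataflowPlanNodeOutputDataSetResolver API. The diff resumes below.

```python
class CachingNodeOutputResolver:
    """Hypothetical resolver wrapper that memoizes each node's output data set."""

    def __init__(self, resolve_fn):
        self._resolve_fn = resolve_fn  # computes the output data set for a node
        self._cache: dict = {}

    def get_output_data_set(self, node):
        # Resolve each node at most once; later calls are dictionary hits, so
        # invoking this once per partial time dimension spec stays cheap.
        if node not in self._cache:
            self._cache[node] = self._resolve_fn(node)
        return self._cache[node]
```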
+            for time_dimension_instance in output_data_set.instance_set.time_dimension_instances:
+                if time_dimension_instance.spec.date_part:
+                    continue
+                time_dim_name_without_granularity = StructuredLinkableSpecName.from_name(
+                    time_dimension_instance.spec.qualified_name
+                ).granularity_free_qualified_name
+                if time_dim_name_without_granularity == granularity_free_qualified_name:
+                    return time_dimension_instance.spec.time_granularity
+
+        return None
 
     def adjust_time_range_to_granularity(
         self, time_range_constraint: TimeRangeConstraint, time_granularity: TimeGranularity
     ) -> TimeRangeConstraint:
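The matching in get_min_granularity_for_partial_time_dimension_without_metrics hinges on granularity_free_qualified_name stripping the granularity suffix from a dunder-separated qualified name. A small illustration, assuming the example name "user__created_at__month" parses with "user" as an entity link, "created_at" as the element name, and "month" as the granularity (the names themselves are made up for the example):

```python
from metricflow.naming.linkable_spec_name import StructuredLinkableSpecName

# A time dimension's qualified name encodes entity links, element name, and
# granularity, joined by double underscores.
parsed = StructuredLinkableSpecName.from_name("user__created_at__month")

# Dropping the granularity suffix lets instances at different granularities
# ("user__created_at__day", "user__created_at__month", ...) all match a query
# for the partially specified dimension "user__created_at".
assert parsed.granularity_free_qualified_name == "user__created_at"
```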
Review thread:

"Ok. I think it might be better to keep this totally separate from the new function and branch at the callsite for now. There are only two callsites, one of which is in this class, and that way we aren't tempted to keep adding responsibility to the node output resolver. What do you think? Otherwise it might be worth it to mark the new method with a leading _ for the time being; I'm not sure we'll ever really want to call it on its own if we keep this method signature."

"Not totally sure I understand what you mean. Is it that we don't want to use the node_output_resolver here, so we should pass in the output datasets already resolved? I'm thinking maybe it would be better to do this all on init for this class: iterate through all the datasets and store a dict of all time_dimension_instances to a set of supported granularities. Then it will be super easy to grab that at query time."

"^ Implemented it this way!"
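The index described in that last exchange, and reportedly implemented in a later commit, could look roughly like this. This is a sketch only: the function name and exact shape are assumptions, not the code that was merged, though it reuses the attributes the diff above already touches.

```python
from collections import defaultdict
from typing import Dict, Sequence, Set

from dbt_semantic_interfaces.type_enums.time_granularity import TimeGranularity

from metricflow.dataflow.builder.node_data_set import DataflowPlanNodeOutputDataSetResolver
from metricflow.dataflow.dataflow_plan import BaseOutput
from metricflow.naming.linkable_spec_name import StructuredLinkableSpecName


def build_time_dimension_granularity_index(
    read_nodes: Sequence[BaseOutput],
    node_output_resolver: DataflowPlanNodeOutputDataSetResolver,
) -> Dict[str, Set[TimeGranularity]]:
    """Map each granularity-free time dimension name to its available granularities.

    Built once (e.g. in __init__), so per-query resolution becomes a dict
    lookup instead of a scan over every read node's output data set.
    """
    index: Dict[str, Set[TimeGranularity]] = defaultdict(set)
    for read_node in read_nodes:
        output_data_set = node_output_resolver.get_output_data_set(read_node)
        for instance in output_data_set.instance_set.time_dimension_instances:
            if instance.spec.date_part:
                continue  # skip date-part specs, mirroring the diff above
            name = StructuredLinkableSpecName.from_name(
                instance.spec.qualified_name
            ).granularity_free_qualified_name
            index[name].add(instance.spec.time_granularity)
    return index
```

At query time, the minimum granularity for a partial spec is then min(index[granularity_free_qualified_name]) when the key is present, and None otherwise, matching the Optional[TimeGranularity] contract of the helper in the diff.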