Skip to content

Commit

Permalink
fix(ingest/bigquery): Correctly apply table pattern to read events; fix end time calculation; deprecate match_fully_qualified_names (datahub-project#9077)
Browse files Browse the repository at this point in the history
  • Loading branch information
asikowitz authored Oct 24, 2023
1 parent c849246 commit eb0b03d
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -119,8 +119,8 @@ class BigQueryV2Config(
)

match_fully_qualified_names: bool = Field(
default=False,
description="Whether `dataset_pattern` is matched against fully qualified dataset name `<project_id>.<dataset_name>`.",
default=True,
description="[deprecated] Whether `dataset_pattern` is matched against fully qualified dataset name `<project_id>.<dataset_name>`.",
)

include_external_url: bool = Field(
Expand Down Expand Up @@ -327,8 +327,7 @@ def backward_compatibility_configs_set(cls, values: Dict) -> Dict:
):
logger.warning(
"Please update `dataset_pattern` to match against fully qualified schema name `<project_id>.<dataset_name>` and set config `match_fully_qualified_names : True`."
"Current default `match_fully_qualified_names: False` is only to maintain backward compatibility. "
"The config option `match_fully_qualified_names` will be deprecated in future and the default behavior will assume `match_fully_qualified_names: True`."
"The config option `match_fully_qualified_names` is deprecated and will be removed in a future release."
)
return values

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -548,7 +548,7 @@ def _get_parsed_audit_log_events(self, project_id: str) -> Iterable[QueryEvent]:
# handle the case where the read happens within our time range but the query
# completion event is delayed and happens after the configured end time.
corrected_start_time = self.start_time - self.config.max_query_duration
corrected_end_time = self.end_time + -self.config.max_query_duration
corrected_end_time = self.end_time + self.config.max_query_duration
self.report.log_entry_start_time = corrected_start_time
self.report.log_entry_end_time = corrected_end_time

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -335,8 +335,12 @@ def get_time_window(self) -> Tuple[datetime, datetime]:
def _is_table_allowed(self, table_ref: Optional[BigQueryTableRef]) -> bool:
return (
table_ref is not None
and self.config.dataset_pattern.allowed(table_ref.table_identifier.dataset)
and self.config.table_pattern.allowed(table_ref.table_identifier.table)
and self.config.dataset_pattern.allowed(
f"{table_ref.table_identifier.project_id}.{table_ref.table_identifier.dataset}"
if self.config.match_fully_qualified_names
else table_ref.table_identifier.dataset
)
and self.config.table_pattern.allowed(str(table_ref.table_identifier))
)

def _should_ingest_usage(self) -> bool:
Expand Down Expand Up @@ -844,7 +848,7 @@ def _get_parsed_bigquery_log_events(
# handle the case where the read happens within our time range but the query
# completion event is delayed and happens after the configured end time.
corrected_start_time = self.start_time - self.config.max_query_duration
corrected_end_time = self.end_time + -self.config.max_query_duration
corrected_end_time = self.end_time + self.config.max_query_duration
self.report.audit_start_time = corrected_start_time
self.report.audit_end_time = corrected_end_time

Expand Down

0 comments on commit eb0b03d

Please sign in to comment.