feat(ingest/bigquery): support bigquery profiling with sampling (data…
mayurinehate authored Sep 15, 2023
1 parent f4da939 commit 99d7eb7
Showing 2 changed files with 162 additions and 80 deletions.
222 changes: 144 additions & 78 deletions metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py
@@ -616,6 +616,9 @@ def generate_dataset_profile(  # noqa: C901 (complexity)
logger.debug(f"profiling {self.dataset_name}: flushing stage 1 queries")
self.query_combiner.flush()

if self.config.use_sampling and not self.config.limit:
self.update_dataset_batch_use_sampling(profile)

columns_profiling_queue: List[_SingleColumnSpec] = []
if columns_to_profile:
for column in all_columns:
@@ -737,6 +740,61 @@ def generate_dataset_profile(  # noqa: C901 (complexity)
self.query_combiner.flush()
return profile

def update_dataset_batch_use_sampling(self, profile: DatasetProfileClass) -> None:
if (
self.dataset.engine.dialect.name.lower() == BIGQUERY
and profile.rowCount
and profile.rowCount > self.config.sample_size
):
"""
According to BigQuery Sampling Docs(https://cloud.google.com/bigquery/docs/table-sampling),
BigQuery does not cache the results of a query that includes a TABLESAMPLE clause and the
query may return different results every time. Calculating different column level metrics
on different sampling results is possible however each query execution would incur the cost
of reading data from storage. Also, using different table samples may create non-coherent
representation of column level metrics, for example, minimum value of a column in one sample
can be greater than maximum value of the column in another sample.
It is observed that for a simple select * query with TABLESAMPLE, results are cached and
stored in temporary table. This can be (ab)used and all column level profiling calculations
can be performed against it.
Risks:
1. All the risks mentioned in notes of `create_bigquery_temp_table` are also
applicable here.
2. TABLESAMPLE query may read entire table for small tables that are written
as single data block. This may incorrectly label datasetProfile's partition as
"SAMPLE", although profile is for entire table.
3. Table Sampling in BigQuery is a Pre-GA (Preview) feature.
"""
sample_pc = 100 * self.config.sample_size / profile.rowCount
sql = (
f"SELECT * FROM {str(self.dataset._table)} "
+ f"TABLESAMPLE SYSTEM ({sample_pc:.3f} percent)"
)
temp_table_name = create_bigquery_temp_table(
self,
sql,
self.dataset_name,
self.dataset.engine.engine.raw_connection(),
)
if temp_table_name:
self.dataset._table = sa.text(temp_table_name)
logger.debug(f"Setting table name to be {self.dataset._table}")

if (
profile.partitionSpec
and profile.partitionSpec.type == PartitionTypeClass.FULL_TABLE
):
profile.partitionSpec = PartitionSpecClass(
type=PartitionTypeClass.QUERY, partition="SAMPLE"
)
elif (
profile.partitionSpec
and profile.partitionSpec.type == PartitionTypeClass.PARTITION
):
profile.partitionSpec.partition += " SAMPLE"
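
For illustration, here is a minimal standalone sketch of the TABLESAMPLE statement that the method above builds. The table name, row count, and sample size are hypothetical example values; the real code reads them from the GE dataset, the computed profile, and the ingestion config.

# Sketch only: hypothetical stand-ins for self.dataset._table,
# profile.rowCount, and self.config.sample_size used above.
sample_size = 1000
row_count = 2_000_000
table = "my-project.my_dataset.my_table"

if row_count > sample_size:
    sample_pc = 100 * sample_size / row_count  # 0.05, i.e. sample roughly 0.05% of rows
    sql = (
        f"SELECT * FROM {table} "
        f"TABLESAMPLE SYSTEM ({sample_pc:.3f} percent)"
    )
    # sql == "SELECT * FROM my-project.my_dataset.my_table TABLESAMPLE SYSTEM (0.050 percent)"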


@dataclasses.dataclass
class GEContext:
@@ -961,84 +1019,18 @@ def _generate_single_profile(
if platform == BIGQUERY and (
custom_sql or self.config.limit or self.config.offset
):
# On BigQuery, we need to bypass GE's mechanism for creating temporary tables because
# it requires create/delete table permissions.
import google.cloud.bigquery.job.query
from google.cloud.bigquery.dbapi.cursor import Cursor as BigQueryCursor

raw_connection = self.base_engine.raw_connection()
try:
cursor: "BigQueryCursor" = cast(
"BigQueryCursor", raw_connection.cursor()
)
if custom_sql is not None:
# Note that limit and offset are not supported for custom SQL.
# The presence of custom SQL indicates that the BigQuery table
# is either partitioned or sharded
bq_sql = custom_sql
else:
bq_sql = f"SELECT * FROM `{table}`"
if self.config.limit:
bq_sql += f" LIMIT {self.config.limit}"
if self.config.offset:
bq_sql += f" OFFSET {self.config.offset}"
try:
cursor.execute(bq_sql)
except Exception as e:
if not self.config.catch_exceptions:
raise e
logger.exception(
f"Encountered exception while profiling {pretty_name}"
)
self.report.report_warning(
pretty_name,
f"Profiling exception {e} when running custom sql {bq_sql}",
)
return None

# Great Expectations batch v2 API, which is the one we're using, requires
# a concrete table name against which profiling is executed. Normally, GE
# creates a table with an expiry time of 24 hours. However, we don't want the
# temporary tables to stick around that long, so we'd also have to delete them
# ourselves. As such, the profiler required create and delete table permissions
# on BigQuery.
#
# It turns out that we can (ab)use the BigQuery cached results feature
# to avoid creating temporary tables ourselves. For almost all queries, BigQuery
# will store the results in a temporary, cached results table when an explicit
# destination table is not provided. These tables are pretty easy to identify
# because they live in "anonymous datasets" and have a name that looks like
# "project-id._d60e97aec7f471046a960419adb6d44e98300db7.anon10774d0ea85fd20fe9671456c5c53d5f1b85e1b17bedb232dfce91661a219ee3"
# These tables are per-user and per-project, so there's no risk of permissions escalation.
# As per the docs, the cached results tables typically have a lifetime of 24 hours,
# which should be plenty for our purposes.
# See https://cloud.google.com/bigquery/docs/cached-results for more details.
#
# The code below extracts the name of the cached results table from the query job
# and points GE to that table for profiling.
#
# Risks:
# 1. If the query results are larger than the maximum response size, BigQuery will
# not cache the results. According to the docs https://cloud.google.com/bigquery/quotas,
# the maximum response size is 10 GB compressed.
# 2. The cache lifetime of 24 hours is "best-effort" and hence not guaranteed.
# 3. Tables with column-level security may not be cached, and tables with row-level
# security will not be cached.
# 4. BigQuery "discourages" using cached results directly, but notes that
# the current semantics do allow it.
#
# The better long-term solution would be to use a subquery to avoid this whole
# temporary table dance. However, that would require either a) upgrading to
# use GE's batch v3 API or b) bypassing GE altogether.

query_job: Optional[
"google.cloud.bigquery.job.query.QueryJob"
] = cursor._query_job
assert query_job
temp_destination_table = query_job.destination
bigquery_temp_table = f"{temp_destination_table.project}.{temp_destination_table.dataset_id}.{temp_destination_table.table_id}"
finally:
raw_connection.close()
if custom_sql is not None:
# Note that limit and offset are not supported for custom SQL.
bq_sql = custom_sql
else:
bq_sql = f"SELECT * FROM `{table}`"
if self.config.limit:
bq_sql += f" LIMIT {self.config.limit}"
if self.config.offset:
bq_sql += f" OFFSET {self.config.offset}"
bigquery_temp_table = create_bigquery_temp_table(
self, bq_sql, pretty_name, self.base_engine.raw_connection()
)

if platform == BIGQUERY:
if bigquery_temp_table:
@@ -1128,6 +1120,7 @@ def _get_ge_dataset(
**batch_kwargs,
},
)

if platform == BIGQUERY:
# This is done because GE builds the name as DATASET.TABLE,
# but we want it to be PROJECT.DATASET.TABLE instead for multi-project setups
@@ -1153,3 +1146,76 @@ def _get_column_types_to_ignore(dialect_name: str) -> List[str]:
return ["JSON"]

return []


def create_bigquery_temp_table(
instance: Union[DatahubGEProfiler, _SingleDatasetProfiler],
bq_sql: str,
table_pretty_name: str,
raw_connection: Any,
) -> Optional[str]:
# On BigQuery, we need to bypass GE's mechanism for creating temporary tables because
# it requires create/delete table permissions.
import google.cloud.bigquery.job.query
from google.cloud.bigquery.dbapi.cursor import Cursor as BigQueryCursor

try:
cursor: "BigQueryCursor" = cast("BigQueryCursor", raw_connection.cursor())
try:
cursor.execute(bq_sql)
except Exception as e:
if not instance.config.catch_exceptions:
raise e
logger.exception(
f"Encountered exception while profiling {table_pretty_name}"
)
instance.report.report_warning(
table_pretty_name,
f"Profiling exception {e} when running custom sql {bq_sql}",
)
return None

# Great Expectations batch v2 API, which is the one we're using, requires
# a concrete table name against which profiling is executed. Normally, GE
# creates a table with an expiry time of 24 hours. However, we don't want the
# temporary tables to stick around that long, so we'd also have to delete them
# ourselves. As such, the profiler required create and delete table permissions
# on BigQuery.
#
# It turns out that we can (ab)use the BigQuery cached results feature
# to avoid creating temporary tables ourselves. For almost all queries, BigQuery
# will store the results in a temporary, cached results table when an explicit
# destination table is not provided. These tables are pretty easy to identify
# because they live in "anonymous datasets" and have a name that looks like
# "project-id._d60e97aec7f471046a960419adb6d44e98300db7.anon10774d0ea85fd20fe9671456c5c53d5f1b85e1b17bedb232dfce91661a219ee3"
# These tables are per-user and per-project, so there's no risk of permissions escalation.
# As per the docs, the cached results tables typically have a lifetime of 24 hours,
# which should be plenty for our purposes.
# See https://cloud.google.com/bigquery/docs/cached-results for more details.
#
# The code below extracts the name of the cached results table from the query job
# and points GE to that table for profiling.
#
# Risks:
# 1. If the query results are larger than the maximum response size, BigQuery will
# not cache the results. According to the docs https://cloud.google.com/bigquery/quotas,
# the maximum response size is 10 GB compressed.
# 2. The cache lifetime of 24 hours is "best-effort" and hence not guaranteed.
# 3. Tables with column-level security may not be cached, and tables with row-level
# security will not be cached.
# 4. BigQuery "discourages" using cached results directly, but notes that
# the current semantics do allow it.
#
# The better long-term solution would be to use a subquery to avoid this whole
# temporary table dance. However, that would require either a) upgrading to
# use GE's batch v3 API or b) bypassing GE altogether.

query_job: Optional[
"google.cloud.bigquery.job.query.QueryJob"
] = cursor._query_job
assert query_job
temp_destination_table = query_job.destination
bigquery_temp_table = f"{temp_destination_table.project}.{temp_destination_table.dataset_id}.{temp_destination_table.table_id}"
return bigquery_temp_table
finally:
raw_connection.close()
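
As a rough standalone illustration of the cached-results behavior that the helper above relies on (this is not the helper itself, which goes through the DB-API cursor), the anonymous destination table can also be observed with the google-cloud-bigquery client directly. The project id and query below are hypothetical.

# Sketch, assuming the google-cloud-bigquery package is installed and credentials are configured.
from google.cloud import bigquery

client = bigquery.Client(project="my-project")  # hypothetical project id
job = client.query("SELECT * FROM `my-project.my_dataset.my_table` LIMIT 1000")
job.result()  # wait for the query so the cached destination table is populated

dest = job.destination  # a TableReference into an "anonymous dataset"
cached_table = f"{dest.project}.{dest.dataset_id}.{dest.table_id}"
# cached_table looks like "my-project._d60e97....anon1077..." and is readable for roughly 24 hours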
@@ -145,10 +145,26 @@ class GEProfilingConfig(ConfigModel):
# Hidden option - used for debugging purposes.
catch_exceptions: bool = Field(default=True, description="")

partition_profiling_enabled: bool = Field(default=True, description="")
partition_profiling_enabled: bool = Field(
default=True,
description="Whether to profile partitioned tables. Only BigQuery supports this. "
"If enabled, latest partition data is used for profiling.",
)
partition_datetime: Optional[datetime.datetime] = Field(
default=None,
description="For partitioned datasets profile only the partition which matches the datetime or profile the latest one if not set. Only Bigquery supports this.",
description="If specified, profile only the partition which matches this datetime. "
"If not specified, profile the latest partition. Only Bigquery supports this.",
)
use_sampling: bool = Field(
default=True,
description="Whether to profile column level stats on sample of table. Only BigQuery supports this. "
"If enabled, profiling is done on rows sampled from table. Sampling is not done for smaller tables. ",
)

sample_size: int = Field(
default=1000,
description="Number of rows to be sampled from table for column level profiling."
"Applicable only if `use_sampling` is set to True.",
)

@pydantic.root_validator(pre=True)
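
For reference, a small sketch of how the two new options above might be set programmatically. The import path is an assumption (this diff does not show the second file's path); the field names and defaults come from the change above.

# Sketch only: the module path below is assumed, not taken from this diff.
from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig

profiling_config = GEProfilingConfig(
    use_sampling=True,  # profile column-level stats on a sample (BigQuery only)
    sample_size=5000,   # sample roughly 5000 rows instead of the default 1000
)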
