-
Notifications
You must be signed in to change notification settings - Fork 79
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add num_rows estimate in hub_cache (#2940)
* add num_rows estimate in hub_cache * fix tests * add migration * fix test * fix test * add num_rows_source * bump job version * update tests * Revert "update tests" This reverts commit fd5fec8. * Revert "bump job version" This reverts commit 4ce87cc. * Revert "add num_rows_source" This reverts commit 9207cdb. * support mix of exact and estimated to compute estimated at config level * same for dataset level
- Loading branch information
Showing
9 changed files
with
877 additions
and
3 deletions.
There are no files selected for viewing
83 changes: 83 additions & 0 deletions
83
...ongodb_migration/migrations/_20240624144000_cache_add_estimated_num_rows_field_in_size.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
# SPDX-License-Identifier: Apache-2.0 | ||
# Copyright 2023 The HuggingFace Authors. | ||
|
||
import logging | ||
|
||
from libcommon.constants import CACHE_COLLECTION_RESPONSES, CACHE_MONGOENGINE_ALIAS | ||
from libcommon.simple_cache import CachedResponseDocument | ||
from mongoengine.connection import get_db | ||
|
||
from mongodb_migration.check import check_documents | ||
from mongodb_migration.migration import Migration | ||
|
||
|
||
# connection already occurred in the main.py (caveat: we use globals) | ||
class MigrationAddEstimatedNumRowsToSizeCacheResponse(Migration):
    """Add the 'estimated_num_rows' field to successful size cache entries.

    For cached 'config-size' and 'dataset-size' responses with HTTP status 200,
    ``up`` adds 'estimated_num_rows' (default ``None``) at every level of the
    size payload (dataset, configs, splits); ``down`` removes the field again.
    """

    def up(self) -> None:
        # See https://docs.mongoengine.org/guide/migration.html#example-1-addition-of-a-field
        logging.info(
            "If missing, add the 'estimated_num_rows' field with the default value"
            " None to the cached results of dataset-size and config-size"
        )
        db = get_db(CACHE_MONGOENGINE_ALIAS)
        # Only patch successful responses: error responses have no 'size' payload.
        # The '$exists: False' filter makes the migration idempotent.
        db[CACHE_COLLECTION_RESPONSES].update_many(
            {
                "kind": "config-size",
                "http_status": 200,
                "content.size.config.estimated_num_rows": {"$exists": False},
            },
            {
                "$set": {
                    "content.size.config.estimated_num_rows": None,
                    # '$[]' applies the update to every element of the array.
                    "content.size.splits.$[].estimated_num_rows": None,
                }
            },
        )
        db[CACHE_COLLECTION_RESPONSES].update_many(
            {
                "kind": "dataset-size",
                "http_status": 200,
                "content.size.dataset.estimated_num_rows": {"$exists": False},
            },
            {
                "$set": {
                    "content.size.dataset.estimated_num_rows": None,
                    "content.size.configs.$[].estimated_num_rows": None,
                    "content.size.splits.$[].estimated_num_rows": None,
                }
            },
        )

    def down(self) -> None:
        # Fixed log message: the previous one wrongly said "Remove the
        # 'config-size' field"; we remove the 'estimated_num_rows' field.
        logging.info(
            "Remove the 'estimated_num_rows' field from the cached results of dataset-size and config-size"
        )
        db = get_db(CACHE_MONGOENGINE_ALIAS)
        db[CACHE_COLLECTION_RESPONSES].update_many(
            {
                "kind": "config-size",
                "http_status": 200,
            },
            {
                "$unset": {
                    "content.size.config.estimated_num_rows": "",
                    "content.size.splits.$[].estimated_num_rows": "",
                }
            },
        )
        db[CACHE_COLLECTION_RESPONSES].update_many(
            {
                "kind": "dataset-size",
                "http_status": 200,
            },
            {
                "$unset": {
                    "content.size.dataset.estimated_num_rows": "",
                    "content.size.configs.$[].estimated_num_rows": "",
                    "content.size.splits.$[].estimated_num_rows": "",
                }
            },
        )

    def validate(self) -> None:
        logging.info("Ensure that a random selection of cached results have the 'estimated_num_rows' field")

        check_documents(DocCls=CachedResponseDocument, sample_size=10)
188 changes: 188 additions & 0 deletions
188
...db_migration/tests/migrations/test_20240624144000_cache_add_estimated_num_rows_in_size.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,188 @@ | ||
# SPDX-License-Identifier: Apache-2.0 | ||
# Copyright 2024 The HuggingFace Authors. | ||
from typing import Any | ||
|
||
from libcommon.constants import CACHE_COLLECTION_RESPONSES, CACHE_MONGOENGINE_ALIAS | ||
from libcommon.resources import MongoResource | ||
from mongoengine.connection import get_db | ||
|
||
from mongodb_migration.migrations._20240624144000_cache_add_estimated_num_rows_field_in_size import ( | ||
MigrationAddEstimatedNumRowsToSizeCacheResponse, | ||
) | ||
|
||
|
||
def assert_estimated_num_rows_in_config(dataset: str, kind: str) -> None:
    """Assert the cached config-size entry has 'estimated_num_rows' set to None at every level."""
    collection = get_db(CACHE_MONGOENGINE_ALIAS)[CACHE_COLLECTION_RESPONSES]
    document = collection.find_one({"dataset": dataset, "kind": kind})
    assert document is not None
    size = document["content"]["size"]
    assert size["config"]["estimated_num_rows"] is None
    for split_item in size["splits"]:
        assert split_item["estimated_num_rows"] is None
|
||
|
||
def assert_estimated_num_rows_in_dataset(dataset: str, kind: str) -> None:
    """Assert the cached dataset-size entry has 'estimated_num_rows' set to None at every level."""
    collection = get_db(CACHE_MONGOENGINE_ALIAS)[CACHE_COLLECTION_RESPONSES]
    document = collection.find_one({"dataset": dataset, "kind": kind})
    assert document is not None
    size = document["content"]["size"]
    assert size["dataset"]["estimated_num_rows"] is None
    for item in size["configs"]:
        assert item["estimated_num_rows"] is None
    for item in size["splits"]:
        assert item["estimated_num_rows"] is None
|
||
|
||
def assert_unchanged_in_config(dataset: str, kind: str) -> None:
    """Assert the cached config-size entry carries no 'estimated_num_rows' field anywhere.

    Entries without a 'size' payload (e.g. error responses) are accepted as-is.
    """
    collection = get_db(CACHE_MONGOENGINE_ALIAS)[CACHE_COLLECTION_RESPONSES]
    document = collection.find_one({"dataset": dataset, "kind": kind})
    assert document is not None
    size = document["content"].get("size")
    if size is not None:
        assert "estimated_num_rows" not in size["config"]
        for split_item in size["splits"]:
            assert "estimated_num_rows" not in split_item
|
||
|
||
def assert_unchanged_in_dataset(dataset: str, kind: str) -> None:
    """Assert the cached dataset-size entry carries no 'estimated_num_rows' field anywhere.

    Entries without a 'size' payload (e.g. error responses) are accepted as-is.
    """
    collection = get_db(CACHE_MONGOENGINE_ALIAS)[CACHE_COLLECTION_RESPONSES]
    document = collection.find_one({"dataset": dataset, "kind": kind})
    assert document is not None
    size = document["content"].get("size")
    if size is not None:
        assert "estimated_num_rows" not in size["dataset"]
        for item in size["configs"]:
            assert "estimated_num_rows" not in item
        for item in size["splits"]:
            assert "estimated_num_rows" not in item
|
||
|
||
def test_cache_add_partial(mongo_host: str) -> None:
    """Round-trip the estimated_num_rows migration against a scratch mongo database.

    Inserts one successful 'config-size' entry, one successful 'dataset-size'
    entry, and one failed (HTTP 500) 'config-size' entry; checks that
    ``migration.up()`` adds the field only to the successful entries and that
    ``migration.down()`` restores the original shape for all of them.
    """
    # NOTE(review): the database name mentions "add_tags_to_hub_cache" — presumably
    # copied from another migration test; harmless, but worth renaming. TODO confirm.
    with MongoResource(database="test_cache_add_tags_to_hub_cache", host=mongo_host, mongoengine_alias="cache"):
        db = get_db(CACHE_MONGOENGINE_ALIAS)
        cache: list[dict[str, Any]] = [
            # Successful config-size entry: should gain estimated_num_rows=None
            # on 'config' and on each element of 'splits'.
            {
                "dataset": "dataset",
                "config": "config",
                "kind": "config-size",
                "content": {
                    "size": {
                        "config": {
                            "dataset": "dataset",
                            "config": "config",
                            "num_bytes_original_files": 123,
                            "num_bytes_parquet_files": 123,
                            "num_bytes_memory": 123,
                            "num_rows": 1000,
                            "num_columns": 1,
                        },
                        "splits": [
                            {
                                "dataset": "dataset",
                                "config": "config",
                                "split": "train",
                                "num_bytes_original_files": 120,
                                "num_bytes_parquet_files": 120,
                                "num_bytes_memory": 120,
                                "num_rows": 900,
                                "num_columns": 1,
                            },
                            {
                                "dataset": "dataset",
                                "config": "config",
                                "split": "test",
                                "num_bytes_original_files": 3,
                                "num_bytes_parquet_files": 3,
                                "num_bytes_memory": 3,
                                "num_rows": 100,
                                "num_columns": 1,
                            },
                        ],
                    },
                    "partial": False,
                },
                "http_status": 200,
                "job_runner_version": 1,
                "progress": None,
            },
            # Successful dataset-size entry: should gain estimated_num_rows=None
            # on 'dataset' and on each element of 'configs' and 'splits'.
            {
                "dataset": "dataset",
                "config": "config",
                "kind": "dataset-size",
                "content": {
                    "size": {
                        "dataset": {
                            "dataset": "dataset",
                            "config": "config",
                            "num_bytes_original_files": 123,
                            "num_bytes_parquet_files": 123,
                            "num_bytes_memory": 123,
                            "num_rows": 1000,
                            "num_columns": 1,
                        },
                        "configs": [
                            {
                                "dataset": "dataset",
                                "config": "config",
                                "num_bytes_original_files": 123,
                                "num_bytes_parquet_files": 123,
                                "num_bytes_memory": 123,
                                "num_rows": 1000,
                                "num_columns": 1,
                            }
                        ],
                        "splits": [
                            {
                                "dataset": "dataset",
                                "config": "config",
                                "split": "train",
                                "num_bytes_original_files": 120,
                                "num_bytes_parquet_files": 120,
                                "num_bytes_memory": 120,
                                "num_rows": 900,
                                "num_columns": 1,
                            },
                            {
                                "dataset": "dataset",
                                "config": "config",
                                "split": "test",
                                "num_bytes_original_files": 3,
                                "num_bytes_parquet_files": 3,
                                "num_bytes_memory": 3,
                                "num_rows": 100,
                                "num_columns": 1,
                            },
                        ],
                    },
                    "partial": False,
                },
                "http_status": 200,
                "job_runner_version": 1,
                "progress": None,
            },
            # Failed entry (HTTP 500, no 'size' payload): the migration must
            # leave it untouched. ("UnexpextedError" typo is part of the fixture.)
            {
                "dataset": "dataset_with_error",
                "config": "config_with_error",
                "kind": "config-size",
                "content": {"error": "error"},
                "details": {
                    "error": "error",
                    "cause_exception": "UnexpextedError",
                    "cause_message": "error",
                    "cause_traceback": ["Traceback"],
                },
                "error_code": "UnexpectedError",
                "http_status": 500,
                "job_runner_version": 1,
                "progress": None,
            },
        ]

        db[CACHE_COLLECTION_RESPONSES].insert_many(cache)

        migration = MigrationAddEstimatedNumRowsToSizeCacheResponse(
            version="20240624144000",
            description="add the 'estimated_num_rows' fields to size",
        )
        migration.up()

        # up(): the field appears on the successful entries only.
        assert_estimated_num_rows_in_config("dataset", kind="config-size")
        assert_estimated_num_rows_in_dataset("dataset", kind="dataset-size")
        assert_unchanged_in_config("dataset_with_error", kind="config-size")

        # down(): the field is removed everywhere, restoring the original shape.
        migration.down()
        assert_unchanged_in_config("dataset", kind="config-size")
        assert_unchanged_in_dataset("dataset", kind="dataset-size")
        assert_unchanged_in_config("dataset_with_error", kind="config-size")

        db[CACHE_COLLECTION_RESPONSES].drop()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.