Skip to content

Commit

Permalink
nit: uniformize logs and method names (#2449)
Browse files Browse the repository at this point in the history
  • Loading branch information
severo authored Feb 13, 2024
1 parent bde7cf6 commit d768955
Show file tree
Hide file tree
Showing 28 changed files with 32 additions and 32 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def compute_config_duckdb_index_size_response(dataset: str, config: str) -> Conf
Returns:
`ConfigDuckdbIndexSizeResponse`: An object with the duckdb_index_size_response.
"""
logging.info(f"get 'config-duckdb-index-size' for {dataset=} {config=}")
logging.info(f"compute 'config-duckdb-index-size' for {dataset=} {config=}")
splits = get_split_names(dataset=dataset, config=config)
try:
total = 0
Expand Down
2 changes: 1 addition & 1 deletion services/worker/src/worker/job_runners/config/info.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def compute_config_info_response(dataset: str, config: str) -> ConfigInfoRespons
Returns:
`ConfigInfoResponse`: An object with the dataset_info response for requested config.
"""
logging.info(f"get 'config-info' for {dataset=} and {config=}")
logging.info(f"compute 'config-info' for {dataset=} and {config=}")

previous_step = "config-parquet-and-info"
dataset_info_best_response = get_previous_step_or_raise(kinds=[previous_step], dataset=dataset, config=config)
Expand Down
2 changes: 1 addition & 1 deletion services/worker/src/worker/job_runners/config/is_valid.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def compute_is_valid_response(dataset: str, config: str) -> tuple[IsValidRespons
Returns:
`tuple[IsValidResponse, float]`: The response (viewer, preview, search, filter) and the progress.
"""
logging.info(f"get 'config-is-valid' response for {dataset=} {config=}")
logging.info(f"compute 'config-is-valid' response for {dataset=} {config=}")

preview = False
viewer = False
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@
from worker.utils import get_split_names


def compute_opt_in_out_urls_scan_response(dataset: str, config: str) -> tuple[OptInOutUrlsCountResponse, float]:
logging.info(f"get 'config-opt-in-out-urls-count' for {dataset=} {config=}")
def compute_opt_in_out_urls_count_response(dataset: str, config: str) -> tuple[OptInOutUrlsCountResponse, float]:
logging.info(f"compute 'config-opt-in-out-urls-count' for {dataset=} {config=}")

urls_columns = []
num_opt_in_urls = 0
Expand Down Expand Up @@ -76,5 +76,5 @@ def get_job_type() -> str:
return "config-opt-in-out-urls-count"

def compute(self) -> JobResult:
response_content, progress = compute_opt_in_out_urls_scan_response(dataset=self.dataset, config=self.config)
response_content, progress = compute_opt_in_out_urls_count_response(dataset=self.dataset, config=self.config)
return JobResult(response_content, progress=progress)
2 changes: 1 addition & 1 deletion services/worker/src/worker/job_runners/config/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def compute_parquet_response(dataset: str, config: str) -> ConfigParquetResponse
Returns:
`ConfigParquetResponse`: An object with the parquet_response (list of parquet files).
"""
logging.info(f"get 'config-parquet' for {dataset=} {config=}")
logging.info(f"compute 'config-parquet' for {dataset=} {config=}")

previous_step = "config-parquet-and-info"
config_parquet_and_info_best_response = get_previous_step_or_raise(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1115,7 +1115,7 @@ def compute_config_parquet_and_info_response(
Returns:
`ConfigParquetAndInfoResponse`: An object with the list of parquet files, the dataset info and whether the response is partial or not.
"""
logging.info(f"get 'config-parquet-and-info' for {dataset=} {config=}")
logging.info(f"compute 'config-parquet-and-info' for {dataset=} {config=}")

logging.info(f"getting config names for {dataset=}")
previous_step = "dataset-config-names"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ def compute_parquet_metadata_response(
Returns:
`ConfigParquetMetadataResponse`: An object with the list of parquet metadata files.
"""
logging.info(f"get 'config-parquet-metadata' for {dataset=} {config=}")
logging.info(f"compute 'config-parquet-metadata' for {dataset=} {config=}")

config_parquet_best_response = get_previous_step_or_raise(kinds=["config-parquet"], dataset=dataset, config=config)
try:
Expand Down
2 changes: 1 addition & 1 deletion services/worker/src/worker/job_runners/config/size.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def compute_config_size_response(dataset: str, config: str) -> ConfigSizeRespons
Returns:
`ConfigSizeResponse`: An object with the size_response.
"""
logging.info(f"get 'config-size' for {dataset=} {config=}")
logging.info(f"compute 'config-size' for {dataset=} {config=}")

dataset_info_best_response = get_previous_step_or_raise(
kinds=["config-parquet-and-info"], dataset=dataset, config=config
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def compute_split_names_from_info_response(dataset: str, config: str) -> SplitsL
Returns:
`SplitsList`: An object with the list of split names for the dataset and config.
"""
logging.info(f"get 'config-split-names-from-info' for {dataset=} {config=}")
logging.info(f"compute 'config-split-names-from-info' for {dataset=} {config=}")
config_info_best_response = get_previous_step_or_raise(kinds=["config-info"], dataset=dataset, config=config)

try:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def compute_split_names_from_streaming_response(
Returns:
`SplitsList`: An object with the list of split names for the dataset and config.
"""
logging.info(f"get 'config-split-names-from-streaming' for {dataset=} {config=}")
logging.info(f"compute 'config-split-names-from-streaming' for {dataset=} {config=}")
try:
split_name_items: list[FullSplitItem] = [
{"dataset": dataset, "config": config, "split": str(split)}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def compute_config_names_response(
Returns:
`DatasetConfigNamesResponse`: An object with the list of config names.
"""
logging.info(f"get 'dateset-config-names' for {dataset=}")
logging.info(f"compute 'dataset-config-names' for {dataset=}")
# get the list of splits in streaming mode
try:
config_name_items: list[ConfigNameItem] = [
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def compute_dataset_duckdb_index_size_response(dataset: str) -> tuple[DatasetDuc
Returns:
`tuple[DatasetDuckdbIndexSizeResponse, float]`: An object with the duckdb_index_size_response and the progress.
"""
logging.info(f"get 'config-duckdb-index-sie' for {dataset=}")
logging.info(f"compute 'config-duckdb-index-size' for {dataset=}")
config_names_best_response = get_previous_step_or_raise(kinds=["dataset-config-names"], dataset=dataset)
content = config_names_best_response.response["content"]
if "config_names" not in content:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def compute_hub_cache_response(dataset: str) -> tuple[DatasetHubCacheResponse, f
Returns:
`tuple[DatasetHubCacheResponse, float]`: The response and the progress.
"""
logging.info(f"get 'dateset-hub-cache' for {dataset=}")
logging.info(f"compute 'dataset-hub-cache' for {dataset=}")

is_valid_response = get_previous_step_or_raise(kinds=["dataset-is-valid"], dataset=dataset)
content = is_valid_response.response["content"]
Expand Down
2 changes: 1 addition & 1 deletion services/worker/src/worker/job_runners/dataset/info.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def compute_dataset_info_response(dataset: str) -> tuple[DatasetInfoResponse, fl
correctly processed and included in current response (some configs might not exist in cache yet
or raise errors).
"""
logging.info(f"get 'dataset-info' for {dataset=}")
logging.info(f"compute 'dataset-info' for {dataset=}")

config_names_best_response = get_previous_step_or_raise(kinds=["dataset-config-names"], dataset=dataset)
content = config_names_best_response.response["content"]
Expand Down
2 changes: 1 addition & 1 deletion services/worker/src/worker/job_runners/dataset/is_valid.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def compute_is_valid_response(dataset: str) -> tuple[IsValidResponse, float]:
Returns:
`tuple[IsValidResponse, float]`: The response (viewer, preview, search, filter) and the progress.
"""
logging.info(f"get 'dataset-is-valid' response for {dataset=}")
logging.info(f"compute 'dataset-is-valid' response for {dataset=}")

config_names_response = get_previous_step_or_raise(kinds=["dataset-config-names"], dataset=dataset)
content = config_names_response.response["content"]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def compute_loading_tags_response(dataset: str) -> DatasetLoadingTagsResponse:
Returns:
`DatasetLoadingTagsResponse`: The dataset-loading-tags response (list of tags).
"""
logging.info(f"get 'dataset-loading-tags' for {dataset=}")
logging.info(f"compute 'dataset-loading-tags' for {dataset=}")

dataset_info_best_response = get_previous_step_or_raise(kinds=["dataset-info"], dataset=dataset)
http_status = dataset_info_best_response.response["http_status"]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@


def compute_opt_in_out_urls_count_response(dataset: str) -> tuple[OptInOutUrlsCountResponse, float]:
logging.info(f"get opt-in-out-urls-count for {dataset=}")
logging.info(f"compute 'dataset-opt-in-out-urls-count' for {dataset=}")

config_names_response = get_previous_step_or_raise(kinds=["dataset-config-names"], dataset=dataset)
content = config_names_response.response["content"]
Expand Down
2 changes: 1 addition & 1 deletion services/worker/src/worker/job_runners/dataset/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def compute_parquet_response(dataset: str) -> tuple[DatasetParquetResponse, floa
Returns:
`tuple[DatasetParquetResponse, float]`: A tuple with the parquet_response (list of parquet files) and progress.
"""
logging.info(f"get 'dataset-parquet' for {dataset=}")
logging.info(f"compute 'dataset-parquet' for {dataset=}")

config_names_best_response = get_previous_step_or_raise(kinds=["dataset-config-names"], dataset=dataset)
content = config_names_best_response.response["content"]
Expand Down
2 changes: 1 addition & 1 deletion services/worker/src/worker/job_runners/dataset/size.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def compute_sizes_response(dataset: str) -> tuple[DatasetSizeResponse, float]:
Returns:
`tuple[DatasetSizeResponse, float]`: An object with the sizes_response and the progress.
"""
logging.info(f"get 'dataset-size' for {dataset=}")
logging.info(f"compute 'dataset-size' for {dataset=}")

config_names_best_response = get_previous_step_or_raise(kinds=["dataset-config-names"], dataset=dataset)
content = config_names_best_response.response["content"]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def compute_dataset_split_names_response(dataset: str) -> tuple[DatasetSplitName
An object with a list of split names for the dataset [splits],
a list of pending configs to be processed [pending] and the list of errors [failed] by config.
"""
logging.info(f"get 'dataset-split-names' for {dataset=}")
logging.info(f"compute 'dataset-split-names' for {dataset=}")

# Get the config names from the previous steps
config_names_best_response = get_previous_step_or_raise(kinds=["dataset-config-names"], dataset=dataset)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -501,7 +501,7 @@ def compute_descriptive_statistics_response(
numerical (int and float) or ClassLabel feature.
"""

logging.info(f"get 'split-descriptive-statistics' for {dataset=} {config=} {split=}")
logging.info(f"compute 'split-descriptive-statistics' for {dataset=} {config=} {split=}")

config_parquet_and_info_step = "config-parquet-and-info"
parquet_and_info_best_response = get_previous_step_or_raise(
Expand Down
6 changes: 3 additions & 3 deletions services/worker/src/worker/job_runners/split/duckdb_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ def get_delete_operations(all_repo_files: set[str], split_names: set[str], confi
]


def compute_index_rows(
def compute_split_duckdb_index_response(
job_id: str,
dataset: str,
config: str,
Expand All @@ -120,7 +120,7 @@ def compute_index_rows(
committer_hf_token: Optional[str],
parquet_metadata_directory: StrPath,
) -> SplitDuckdbIndex:
logging.info(f"get split-duckdb-index for {dataset=} {config=} {split=}")
logging.info(f"compute 'split-duckdb-index' for {dataset=} {config=} {split=}")

# get parquet urls and dataset_info
config_parquet_metadata_step = "config-parquet-metadata"
Expand Down Expand Up @@ -357,7 +357,7 @@ def compute(self) -> CompleteJobResult:
if self.cache_subdirectory is None:
raise CacheDirectoryNotInitializedError("Cache directory has not been initialized.")
return CompleteJobResult(
compute_index_rows(
compute_split_duckdb_index_response(
job_id=self.job_info["job_id"],
dataset=self.dataset,
config=self.config,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def compute_first_rows_response(
columns_max_number: int,
indexer: Indexer,
) -> SplitFirstRowsResponse:
logging.info(f"get 'split-first-rows-from-parquet' for {dataset=} {config=} {split=}")
logging.info(f"compute 'split-first-rows-from-parquet' for {dataset=} {config=} {split=}")

try:
rows_index = indexer.get_rows_index(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ def compute_first_rows_response(
Returns:
`SplitFirstRowsResponse`: The list of first rows of the split.
"""
logging.info(f"get 'first-rows-from-streaming' for {dataset=} {config=} {split=}")
logging.info(f"compute 'split-first-rows-from-streaming' for {dataset=} {config=} {split=}")
trust_remote_code = resolve_trust_remote_code(dataset=dataset, allow_list=dataset_scripts_allow_list)
# get the features
try:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def compute_image_url_columns(
Returns:
`ImageUrlColumnsResponse`: The list of image url columns.
"""
logging.info(f"get 'split-image-url-columns' for {dataset=} {config=} {split=}")
logging.info(f"compute 'split-image-url-columns' for {dataset=} {config=} {split=}")

# get the first rows from previous job
upstream_response = get_previous_step_or_raise(
Expand Down
2 changes: 1 addition & 1 deletion services/worker/src/worker/job_runners/split/is_valid.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def compute_is_valid_response(dataset: str, config: str, split: str) -> IsValidR
Returns:
`IsValidResponse`: The response (viewer, preview, search, filter).
"""
logging.info(f"get is-valid response for {dataset=}")
logging.info(f"compute 'split-is-valid' response for {dataset=}")

viewer = has_any_successful_response(
dataset=dataset,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def compute_opt_in_out_urls_count_response(
config: str,
split: str,
) -> OptInOutUrlsCountResponse:
logging.info(f"get opt-in-out-urls-count for {dataset=} {config=} {split=}")
logging.info(f"compute 'split-opt-in-out-urls-count' for {dataset=} {config=} {split=}")

opt_in_out_urls_scan = get_previous_step_or_raise(
kinds=["split-opt-in-out-urls-scan"], dataset=dataset, config=config, split=split
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ def compute_opt_in_out_urls_scan_response(
Returns:
`OptInOutUrlsScanResponse`: An object with the lists of opt-in/opt-out urls
"""
logging.info(f"get 'split-opt-in-out-urls-scan' for {dataset=} {config=} {split=}")
logging.info(f"compute 'split-opt-in-out-urls-scan' for {dataset=} {config=} {split=}")
trust_remote_code = resolve_trust_remote_code(dataset=dataset, allow_list=dataset_scripts_allow_list)

if not spawning_token:
Expand Down

0 comments on commit d768955

Please sign in to comment.