Skip to content

Commit

Permalink
nit: uniformize logs and method names (#2449)
Browse files Browse the repository at this point in the history
  • Loading branch information
severo authored Feb 13, 2024
1 parent bde7cf6 commit d768955
Show file tree
Hide file tree
Showing 28 changed files with 32 additions and 32 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def compute_config_duckdb_index_size_response(dataset: str, config: str) -> Conf
Returns:
`ConfigDuckdbIndexSizeResponse`: An object with the duckdb_index_size_response.
"""
logging.info(f"get 'config-duckdb-index-size' for {dataset=} {config=}")
logging.info(f"compute 'config-duckdb-index-size' for {dataset=} {config=}")
splits = get_split_names(dataset=dataset, config=config)
try:
total = 0
Expand Down
2 changes: 1 addition & 1 deletion services/worker/src/worker/job_runners/config/info.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def compute_config_info_response(dataset: str, config: str) -> ConfigInfoRespons
Returns:
`ConfigInfoResponse`: An object with the dataset_info response for requested config.
"""
logging.info(f"get 'config-info' for {dataset=} and {config=}")
logging.info(f"compute 'config-info' for {dataset=} and {config=}")

previous_step = "config-parquet-and-info"
dataset_info_best_response = get_previous_step_or_raise(kinds=[previous_step], dataset=dataset, config=config)
Expand Down
2 changes: 1 addition & 1 deletion services/worker/src/worker/job_runners/config/is_valid.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def compute_is_valid_response(dataset: str, config: str) -> tuple[IsValidRespons
Returns:
`tuple[IsValidResponse, float]`: The response (viewer, preview, search, filter) and the progress.
"""
logging.info(f"get 'config-is-valid' response for {dataset=} {config=}")
logging.info(f"compute 'config-is-valid' response for {dataset=} {config=}")

preview = False
viewer = False
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@
from worker.utils import get_split_names


def compute_opt_in_out_urls_scan_response(dataset: str, config: str) -> tuple[OptInOutUrlsCountResponse, float]:
logging.info(f"get 'config-opt-in-out-urls-count' for {dataset=} {config=}")
def compute_opt_in_out_urls_count_response(dataset: str, config: str) -> tuple[OptInOutUrlsCountResponse, float]:
logging.info(f"compute 'config-opt-in-out-urls-count' for {dataset=} {config=}")

urls_columns = []
num_opt_in_urls = 0
Expand Down Expand Up @@ -76,5 +76,5 @@ def get_job_type() -> str:
return "config-opt-in-out-urls-count"

def compute(self) -> JobResult:
response_content, progress = compute_opt_in_out_urls_scan_response(dataset=self.dataset, config=self.config)
response_content, progress = compute_opt_in_out_urls_count_response(dataset=self.dataset, config=self.config)
return JobResult(response_content, progress=progress)
2 changes: 1 addition & 1 deletion services/worker/src/worker/job_runners/config/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def compute_parquet_response(dataset: str, config: str) -> ConfigParquetResponse
Returns:
`ConfigParquetResponse`: An object with the parquet_response (list of parquet files).
"""
logging.info(f"get 'config-parquet' for {dataset=} {config=}")
logging.info(f"compute 'config-parquet' for {dataset=} {config=}")

previous_step = "config-parquet-and-info"
config_parquet_and_info_best_response = get_previous_step_or_raise(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1115,7 +1115,7 @@ def compute_config_parquet_and_info_response(
Returns:
`ConfigParquetAndInfoResponse`: An object with the list of parquet files, the dataset info and whether the response is partial or not.
"""
logging.info(f"get 'config-parquet-and-info' for {dataset=} {config=}")
logging.info(f"compute 'config-parquet-and-info' for {dataset=} {config=}")

logging.info(f"getting config names for {dataset=}")
previous_step = "dataset-config-names"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ def compute_parquet_metadata_response(
Returns:
`ConfigParquetMetadataResponse`: An object with the list of parquet metadata files.
"""
logging.info(f"get 'config-parquet-metadata' for {dataset=} {config=}")
logging.info(f"compute 'config-parquet-metadata' for {dataset=} {config=}")

config_parquet_best_response = get_previous_step_or_raise(kinds=["config-parquet"], dataset=dataset, config=config)
try:
Expand Down
2 changes: 1 addition & 1 deletion services/worker/src/worker/job_runners/config/size.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def compute_config_size_response(dataset: str, config: str) -> ConfigSizeRespons
Returns:
`ConfigSizeResponse`: An object with the size_response.
"""
logging.info(f"get 'config-size' for {dataset=} {config=}")
logging.info(f"compute 'config-size' for {dataset=} {config=}")

dataset_info_best_response = get_previous_step_or_raise(
kinds=["config-parquet-and-info"], dataset=dataset, config=config
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def compute_split_names_from_info_response(dataset: str, config: str) -> SplitsL
Returns:
`SplitsList`: An object with the list of split names for the dataset and config.
"""
logging.info(f"get 'config-split-names-from-info' for {dataset=} {config=}")
logging.info(f"compute 'config-split-names-from-info' for {dataset=} {config=}")
config_info_best_response = get_previous_step_or_raise(kinds=["config-info"], dataset=dataset, config=config)

try:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def compute_split_names_from_streaming_response(
Returns:
`SplitsList`: An object with the list of split names for the dataset and config.
"""
logging.info(f"get 'config-split-names-from-streaming' for {dataset=} {config=}")
logging.info(f"compute 'config-split-names-from-streaming' for {dataset=} {config=}")
try:
split_name_items: list[FullSplitItem] = [
{"dataset": dataset, "config": config, "split": str(split)}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def compute_config_names_response(
Returns:
`DatasetConfigNamesResponse`: An object with the list of config names.
"""
logging.info(f"get 'dateset-config-names' for {dataset=}")
logging.info(f"compute 'dataset-config-names' for {dataset=}")
# get the list of splits in streaming mode
try:
config_name_items: list[ConfigNameItem] = [
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def compute_dataset_duckdb_index_size_response(dataset: str) -> tuple[DatasetDuc
Returns:
`tuple[DatasetDuckdbIndexSizeResponse, float]`: An object with the duckdb_index_size_response and the progress.
"""
logging.info(f"get 'config-duckdb-index-sie' for {dataset=}")
logging.info(f"compute 'config-duckdb-index-size' for {dataset=}")
config_names_best_response = get_previous_step_or_raise(kinds=["dataset-config-names"], dataset=dataset)
content = config_names_best_response.response["content"]
if "config_names" not in content:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def compute_hub_cache_response(dataset: str) -> tuple[DatasetHubCacheResponse, f
Returns:
`tuple[DatasetHubCacheResponse, float]`: The response and the progress.
"""
logging.info(f"get 'dateset-hub-cache' for {dataset=}")
logging.info(f"compute 'dataset-hub-cache' for {dataset=}")

is_valid_response = get_previous_step_or_raise(kinds=["dataset-is-valid"], dataset=dataset)
content = is_valid_response.response["content"]
Expand Down
2 changes: 1 addition & 1 deletion services/worker/src/worker/job_runners/dataset/info.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def compute_dataset_info_response(dataset: str) -> tuple[DatasetInfoResponse, fl
correctly processed and included in current response (some configs might not exist in cache yet
or raise errors).
"""
logging.info(f"get 'dataset-info' for {dataset=}")
logging.info(f"compute 'dataset-info' for {dataset=}")

config_names_best_response = get_previous_step_or_raise(kinds=["dataset-config-names"], dataset=dataset)
content = config_names_best_response.response["content"]
Expand Down
2 changes: 1 addition & 1 deletion services/worker/src/worker/job_runners/dataset/is_valid.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def compute_is_valid_response(dataset: str) -> tuple[IsValidResponse, float]:
Returns:
`tuple[IsValidResponse, float]`: The response (viewer, preview, search, filter) and the progress.
"""
logging.info(f"get 'dataset-is-valid' response for {dataset=}")
logging.info(f"compute 'dataset-is-valid' response for {dataset=}")

config_names_response = get_previous_step_or_raise(kinds=["dataset-config-names"], dataset=dataset)
content = config_names_response.response["content"]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def compute_loading_tags_response(dataset: str) -> DatasetLoadingTagsResponse:
Returns:
`DatasetLoadingTagsResponse`: The dataset-loading-tags response (list of tags).
"""
logging.info(f"get 'dataset-loading-tags' for {dataset=}")
logging.info(f"compute 'dataset-loading-tags' for {dataset=}")

dataset_info_best_response = get_previous_step_or_raise(kinds=["dataset-info"], dataset=dataset)
http_status = dataset_info_best_response.response["http_status"]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@


def compute_opt_in_out_urls_count_response(dataset: str) -> tuple[OptInOutUrlsCountResponse, float]:
logging.info(f"get opt-in-out-urls-count for {dataset=}")
logging.info(f"compute 'dataset-opt-in-out-urls-count' for {dataset=}")

config_names_response = get_previous_step_or_raise(kinds=["dataset-config-names"], dataset=dataset)
content = config_names_response.response["content"]
Expand Down
2 changes: 1 addition & 1 deletion services/worker/src/worker/job_runners/dataset/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def compute_parquet_response(dataset: str) -> tuple[DatasetParquetResponse, floa
Returns:
`tuple[DatasetParquetResponse, float]`: A tuple with the parquet_response (list of parquet files) and progress.
"""
logging.info(f"get 'dataset-parquet' for {dataset=}")
logging.info(f"compute 'dataset-parquet' for {dataset=}")

config_names_best_response = get_previous_step_or_raise(kinds=["dataset-config-names"], dataset=dataset)
content = config_names_best_response.response["content"]
Expand Down
2 changes: 1 addition & 1 deletion services/worker/src/worker/job_runners/dataset/size.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def compute_sizes_response(dataset: str) -> tuple[DatasetSizeResponse, float]:
Returns:
`tuple[DatasetSizeResponse, float]`: An object with the sizes_response and the progress.
"""
logging.info(f"get 'dataset-size' for {dataset=}")
logging.info(f"compute 'dataset-size' for {dataset=}")

config_names_best_response = get_previous_step_or_raise(kinds=["dataset-config-names"], dataset=dataset)
content = config_names_best_response.response["content"]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def compute_dataset_split_names_response(dataset: str) -> tuple[DatasetSplitName
An object with a list of split names for the dataset [splits],
a list of pending configs to be processed [pending] and the list of errors [failed] by config.
"""
logging.info(f"get 'dataset-split-names' for {dataset=}")
logging.info(f"compute 'dataset-split-names' for {dataset=}")

# Get the config names from the previous steps
config_names_best_response = get_previous_step_or_raise(kinds=["dataset-config-names"], dataset=dataset)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -501,7 +501,7 @@ def compute_descriptive_statistics_response(
numerical (int and float) or ClassLabel feature.
"""

logging.info(f"get 'split-descriptive-statistics' for {dataset=} {config=} {split=}")
logging.info(f"compute 'split-descriptive-statistics' for {dataset=} {config=} {split=}")

config_parquet_and_info_step = "config-parquet-and-info"
parquet_and_info_best_response = get_previous_step_or_raise(
Expand Down
6 changes: 3 additions & 3 deletions services/worker/src/worker/job_runners/split/duckdb_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ def get_delete_operations(all_repo_files: set[str], split_names: set[str], confi
]


def compute_index_rows(
def compute_split_duckdb_index_response(
job_id: str,
dataset: str,
config: str,
Expand All @@ -120,7 +120,7 @@ def compute_index_rows(
committer_hf_token: Optional[str],
parquet_metadata_directory: StrPath,
) -> SplitDuckdbIndex:
logging.info(f"get split-duckdb-index for {dataset=} {config=} {split=}")
logging.info(f"compute 'split-duckdb-index' for {dataset=} {config=} {split=}")

# get parquet urls and dataset_info
config_parquet_metadata_step = "config-parquet-metadata"
Expand Down Expand Up @@ -357,7 +357,7 @@ def compute(self) -> CompleteJobResult:
if self.cache_subdirectory is None:
raise CacheDirectoryNotInitializedError("Cache directory has not been initialized.")
return CompleteJobResult(
compute_index_rows(
compute_split_duckdb_index_response(
job_id=self.job_info["job_id"],
dataset=self.dataset,
config=self.config,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def compute_first_rows_response(
columns_max_number: int,
indexer: Indexer,
) -> SplitFirstRowsResponse:
logging.info(f"get 'split-first-rows-from-parquet' for {dataset=} {config=} {split=}")
logging.info(f"compute 'split-first-rows-from-parquet' for {dataset=} {config=} {split=}")

try:
rows_index = indexer.get_rows_index(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ def compute_first_rows_response(
Returns:
`SplitFirstRowsResponse`: The list of first rows of the split.
"""
logging.info(f"get 'first-rows-from-streaming' for {dataset=} {config=} {split=}")
logging.info(f"compute 'split-first-rows-from-streaming' for {dataset=} {config=} {split=}")
trust_remote_code = resolve_trust_remote_code(dataset=dataset, allow_list=dataset_scripts_allow_list)
# get the features
try:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def compute_image_url_columns(
Returns:
`ImageUrlColumnsResponse`: The list of image url columns.
"""
logging.info(f"get 'split-image-url-columns' for {dataset=} {config=} {split=}")
logging.info(f"compute 'split-image-url-columns' for {dataset=} {config=} {split=}")

# get the first rows from previous job
upstream_response = get_previous_step_or_raise(
Expand Down
2 changes: 1 addition & 1 deletion services/worker/src/worker/job_runners/split/is_valid.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def compute_is_valid_response(dataset: str, config: str, split: str) -> IsValidR
Returns:
`IsValidResponse`: The response (viewer, preview, search, filter).
"""
logging.info(f"get is-valid response for {dataset=}")
logging.info(f"compute 'split-is-valid' response for {dataset=}")

viewer = has_any_successful_response(
dataset=dataset,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def compute_opt_in_out_urls_count_response(
config: str,
split: str,
) -> OptInOutUrlsCountResponse:
logging.info(f"get opt-in-out-urls-count for {dataset=} {config=} {split=}")
logging.info(f"compute 'split-opt-in-out-urls-count' for {dataset=} {config=} {split=}")

opt_in_out_urls_scan = get_previous_step_or_raise(
kinds=["split-opt-in-out-urls-scan"], dataset=dataset, config=config, split=split
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ def compute_opt_in_out_urls_scan_response(
Returns:
`OptInOutUrlsScanResponse`: An object with the lists of opt-in/opt-out urls
"""
logging.info(f"get 'split-opt-in-out-urls-scan' for {dataset=} {config=} {split=}")
logging.info(f"compute 'split-opt-in-out-urls-scan' for {dataset=} {config=} {split=}")
trust_remote_code = resolve_trust_remote_code(dataset=dataset, allow_list=dataset_scripts_allow_list)

if not spawning_token:
Expand Down

0 comments on commit d768955

Please sign in to comment.