
2023 reduce number comments (#2024)
* only return the dataset name, not the revision

* feat: 🎸 only post one message per dataset

If a dataset already has a discussion from the parquet-converter bot,
whether open or closed, we skip the dataset (see the sketch before the diff).

* update discussion content

* fix types
severo authored Oct 30, 2023
1 parent 3267ee6 commit b22957f
Showing 4 changed files with 104 additions and 183 deletions.
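
Before the diff itself, the per-dataset guard that the commit message describes boils down to the check sketched below. This is a minimal illustration rather than the patch: the helper name has_bot_discussion and its argument list are ours, while HfApi.get_repo_discussions and the Discussion.author field are the huggingface_hub APIs the patch actually uses.

from huggingface_hub import HfApi
from huggingface_hub.constants import REPO_TYPE_DATASET


def has_bot_discussion(hf_api: HfApi, dataset: str, bot_name: str, token: str) -> bool:
    # True if the bot has already opened a discussion on this dataset,
    # whether that discussion is still open or has been closed.
    return any(
        discussion.author == bot_name
        for discussion in hf_api.get_repo_discussions(
            repo_id=dataset, repo_type=REPO_TYPE_DATASET, token=token
        )
    )

When this returns True, the new code counts the dataset under dismissed_discussions and moves on, instead of commenting on the existing thread.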
111 changes: 36 additions & 75 deletions jobs/cache_maintenance/src/cache_maintenance/discussions.py
@@ -8,21 +8,16 @@
 
 from huggingface_hub import HfApi
 from huggingface_hub.constants import REPO_TYPE_DATASET
-from libcommon.simple_cache import (
-    DatasetWithRevision,
-    get_datasets_with_last_updated_kind,
-)
+from libcommon.simple_cache import get_datasets_with_last_updated_kind
 
 PARQUET_CACHE_KIND = "config-parquet"
 DAYS = 1
-CLOSED_STATUS = "closed"
 
 
 class ParquetCounters(TypedDict):
     datasets: int
-    messages: int
-    dismissed_messages: int
     new_discussions: int
+    dismissed_discussions: int
     errors: int
 
 
@@ -55,35 +50,30 @@ def post_messages_on_parquet_conversion(
     bot_token: str,
     parquet_revision: str,
 ) -> ParquetCounters:
-    logging.info("Post messages in Hub discussion to notify about parquet conversion")
-    datasets_with_revision = limit_to_one_dataset_per_namespace(
+    logging.info("Create a Hub discussion to notify about parquet conversion")
+    datasets = limit_to_one_dataset_per_namespace(
         get_datasets_with_last_updated_kind(kind=PARQUET_CACHE_KIND, days=DAYS)
     )
 
-    logging.info(f"Posting messages for {len(datasets_with_revision)} datasets")
+    logging.info(f"Creating discussions for {len(datasets)} datasets")
     log_batch = 100
     counters: ParquetCounters = {
         "datasets": 0,
-        "messages": 0,
-        "dismissed_messages": 0,
         "new_discussions": 0,
+        "dismissed_discussions": 0,
         "errors": 0,
     }
 
     def get_log() -> str:
         return (
-            f"{counters['messages'] } messages posted (total:"
-            f" {len(datasets_with_revision)} datasets): {counters['new_discussions']} discussions have been opened."
-            f" {counters['dismissed_messages']} messages have been dismissed because the discussion had been closed."
-            f" {counters['errors']} errors."
+            f" {counters['new_discussions']} discussions have been opened. A total of"
+            f" {len(datasets)} datasets were selected, but {counters['dismissed_discussions']} datasets"
+            f" already had a discussion (open or closed). {counters['errors']} errors."
        )
 
     hf_api = HfApi(endpoint=hf_endpoint, token=bot_token)
 
-    for dataset_with_revision in datasets_with_revision:
-        dataset = dataset_with_revision.dataset
-        revision = dataset_with_revision.revision
-
+    for dataset in datasets:
         counters["datasets"] += 1
         try:
             bot_discussions = [
@@ -95,40 +85,24 @@ def get_log() -> str:
             ]
 
             if bot_discussions:
-                if len(bot_discussions) > 1:
-                    logging.warning(
-                        f"Found {len(bot_discussions)} discussions for {dataset} with bot {bot_associated_user_name},"
-                        " only the first one will be used."
-                    )
-                discussion = bot_discussions[0]
+                counters["dismissed_discussions"] += 1
+                continue
             else:
-                discussion = hf_api.create_discussion(
+                hf_api.create_discussion(
                     repo_id=dataset,
                     repo_type=REPO_TYPE_DATASET,
-                    title=f"Notifications from {bot_associated_user_name}",
-                    description=create_discussion_description(),
+                    title="[bot] Conversion to Parquet",
+                    description=create_discussion_description(
+                        dataset=dataset,
+                        hf_endpoint=hf_endpoint,
+                        parquet_revision=parquet_revision,
+                        bot_associated_user_name=bot_associated_user_name,
+                    ),
                     token=bot_token,
                 )
                 sleep(1)
                 # ^ see https://github.com/huggingface/moon-landing/issues/7729 (internal)
                 counters["new_discussions"] += 1
-            if discussion.status == CLOSED_STATUS:
-                counters["dismissed_messages"] += 1
-                continue
-            hf_api.comment_discussion(
-                repo_id=dataset,
-                repo_type=REPO_TYPE_DATASET,
-                discussion_num=discussion.num,
-                comment=create_parquet_comment(
-                    dataset=dataset,
-                    hf_endpoint=hf_endpoint,
-                    parquet_revision=parquet_revision,
-                    dataset_revision=revision,
-                ),
-                token=bot_token,
-            )
-
-            counters["messages"] += 1
-
         except Exception as e:
             logging.warning(f"Failed to post a message for {dataset}: {e}")
@@ -144,36 +118,23 @@ def get_log() -> str:
     return counters
 
 
-def temporary_call_to_action_for_feedback() -> str:
-    return "Please comment below if you have any questions or feedback about this new notifications channel. "
-
-
-def create_discussion_description() -> str:
-    return (
-        "The Datasets Server bot will post messages here about operations such as conversion to"
-        " Parquet. There are some advantages associated with having a version of your dataset available in the "
-        "[Parquet format](https://parquet.apache.org/). You can learn more about these in the"
-        f""" [documentation](https://huggingface.co/docs/datasets-server/parquet).
-_{temporary_call_to_action_for_feedback()}Close the discussion if you want to stop receiving notifications._"""
-    )
-
-
-def create_parquet_comment(
-    dataset: str, hf_endpoint: str, parquet_revision: str, dataset_revision: Optional[str]
+def create_discussion_description(
+    dataset: str, hf_endpoint: str, parquet_revision: str, bot_associated_user_name: str
 ) -> str:
-    link_dataset = f" revision {dataset_revision[:7]}" if dataset_revision else ""
-
     link_parquet = create_link(
         text=parquet_revision,
         dataset=dataset,
         hf_endpoint=hf_endpoint,
         revision_type="tree",
         revision=parquet_revision,
     )
-    return f"""Datasets Server has converted the dataset{link_dataset} to Parquet.
-
-The Parquet files are published to the Hub in the {link_parquet} branch."""
+    return (
+        f"The {bot_associated_user_name} bot has created a version of this dataset in the [Parquet"
+        " format](https://parquet.apache.org/). You can learn more about the advantages associated with this format"
+        f""" in the [documentation](https://huggingface.co/docs/datasets-server/parquet).
+The Parquet files are published in the {link_parquet} branch."""
+    )
 
 
 def create_link(
@@ -182,28 +143,28 @@ def create_link(
 ) -> str:
     return f"[`{text}`]({hf_endpoint}/datasets/{dataset}/{revision_type}/{parse.quote(revision, safe='')})"
 
 
-def limit_to_one_dataset_per_namespace(datasets_with_revision: list[DatasetWithRevision]) -> list[DatasetWithRevision]:
+def limit_to_one_dataset_per_namespace(datasets: list[str]) -> list[str]:
     """
     Limit the number of datasets to one per namespace.
 
     For instance, if we have `a/b` and `a/c`, we will only keep one of them.
     The choice is arbitrary. The filtered list has no particular order.
 
     Args:
-        datasets (list[DatasetWithRevision]): The list of datasets (with revision) to filter.
+        datasets (list[str]): The list of datasets to filter.
 
     Returns:
-        list[DatasetWithRevision]: The filtered list of datasets (with revision).
+        list[str]: The filtered list of datasets.
     """
     namespaces: set[str] = set()
-    selected_datasets_with_revision: list[DatasetWithRevision] = []
-    for dataset_with_revision in datasets_with_revision:
-        namespace = get_namespace(dataset_with_revision.dataset)
+    selected_datasets: list[str] = []
+    for dataset in datasets:
+        namespace = get_namespace(dataset)
         if (namespace is None) or (namespace in namespaces):
             continue
         namespaces.add(namespace)
-        selected_datasets_with_revision.append(dataset_with_revision)
-    return selected_datasets_with_revision
+        selected_datasets.append(dataset)
+    return selected_datasets
 
 
 def get_namespace(dataset: str) -> Optional[str]:
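
To make the generated Markdown concrete, here is a standalone sketch of create_link as kept above. The collapsed signature line is assumed, and user/dataset plus refs/convert/parquet are illustrative values (refs/convert/parquet is the branch the converter publishes to).

from urllib import parse


def create_link(text: str, dataset: str, hf_endpoint: str, revision_type: str, revision: str) -> str:
    # Same f-string as in the diff; "/" in the revision is percent-encoded.
    return f"[`{text}`]({hf_endpoint}/datasets/{dataset}/{revision_type}/{parse.quote(revision, safe='')})"


print(
    create_link(
        text="refs/convert/parquet",
        dataset="user/dataset",
        hf_endpoint="https://huggingface.co",
        revision_type="tree",
        revision="refs/convert/parquet",
    )
)
# -> [`refs/convert/parquet`](https://huggingface.co/datasets/user/dataset/tree/refs%2Fconvert%2Fparquet)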
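Similarly, the namespace filter can be exercised on its own. The dataset names below are hypothetical, and since the body of get_namespace is collapsed in the diff, its split-on-"/" behavior is an assumption consistent with how limit_to_one_dataset_per_namespace uses it.

from typing import Optional


def get_namespace(dataset: str) -> Optional[str]:
    # Assumed behavior: "user/name" -> "user", a name without "/" -> None.
    return dataset.split("/")[0] if "/" in dataset else None


def limit_to_one_dataset_per_namespace(datasets: list[str]) -> list[str]:
    # Same logic as the function kept in the diff above.
    namespaces: set[str] = set()
    selected_datasets: list[str] = []
    for dataset in datasets:
        namespace = get_namespace(dataset)
        if (namespace is None) or (namespace in namespaces):
            continue
        namespaces.add(namespace)
        selected_datasets.append(dataset)
    return selected_datasets


print(limit_to_one_dataset_per_namespace(["a/b", "a/c", "org/x", "squad"]))
# -> ['a/b', 'org/x']  ('a/c' shares the 'a' namespace; 'squad' has no namespace)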
