
2023 reduce number comments (#2024)
* only return the dataset name, not the revision

* feat: 🎸 only post one message per dataset

If a dataset already has a discussion from the parquet-converter bot,
whether open or closed, we skip the dataset (see the sketch before the diff).

* update discussion content

* fix types
severo authored Oct 30, 2023
1 parent 3267ee6 commit b22957f
Showing 4 changed files with 104 additions and 183 deletions.
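
Before the diff itself, the per-dataset guard that the commit message describes boils down to the check sketched below. This is a minimal illustration rather than the patch: the helper name has_bot_discussion and its argument list are ours, while HfApi.get_repo_discussions and the Discussion.author field are the huggingface_hub APIs the patch actually uses.

from huggingface_hub import HfApi
from huggingface_hub.constants import REPO_TYPE_DATASET


def has_bot_discussion(hf_api: HfApi, dataset: str, bot_name: str, token: str) -> bool:
    # True if the bot has already opened a discussion on this dataset,
    # whether that discussion is still open or has been closed.
    return any(
        discussion.author == bot_name
        for discussion in hf_api.get_repo_discussions(
            repo_id=dataset, repo_type=REPO_TYPE_DATASET, token=token
        )
    )

When this returns True, the new code counts the dataset under dismissed_discussions and moves on, instead of commenting on the existing thread.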
111 changes: 36 additions & 75 deletions jobs/cache_maintenance/src/cache_maintenance/discussions.py
@@ -8,21 +8,16 @@
 
 from huggingface_hub import HfApi
 from huggingface_hub.constants import REPO_TYPE_DATASET
-from libcommon.simple_cache import (
-    DatasetWithRevision,
-    get_datasets_with_last_updated_kind,
-)
+from libcommon.simple_cache import get_datasets_with_last_updated_kind
 
 PARQUET_CACHE_KIND = "config-parquet"
 DAYS = 1
-CLOSED_STATUS = "closed"
 
 
 class ParquetCounters(TypedDict):
     datasets: int
-    messages: int
-    dismissed_messages: int
     new_discussions: int
+    dismissed_discussions: int
     errors: int
 
 
@@ -55,35 +50,30 @@ def post_messages_on_parquet_conversion(
     bot_token: str,
     parquet_revision: str,
 ) -> ParquetCounters:
-    logging.info("Post messages in Hub discussion to notify about parquet conversion")
-    datasets_with_revision = limit_to_one_dataset_per_namespace(
+    logging.info("Create a Hub discussion to notify about parquet conversion")
+    datasets = limit_to_one_dataset_per_namespace(
         get_datasets_with_last_updated_kind(kind=PARQUET_CACHE_KIND, days=DAYS)
     )
 
-    logging.info(f"Posting messages for {len(datasets_with_revision)} datasets")
+    logging.info(f"Creating discussions for {len(datasets)} datasets")
     log_batch = 100
     counters: ParquetCounters = {
         "datasets": 0,
-        "messages": 0,
-        "dismissed_messages": 0,
         "new_discussions": 0,
+        "dismissed_discussions": 0,
         "errors": 0,
     }
 
     def get_log() -> str:
         return (
-            f"{counters['messages'] } messages posted (total:"
-            f" {len(datasets_with_revision)} datasets): {counters['new_discussions']} discussions have been opened."
-            f" {counters['dismissed_messages']} messages have been dismissed because the discussion had been closed."
-            f" {counters['errors']} errors."
+            f" {counters['new_discussions']} discussions have been opened. A total of"
+            f" {len(datasets)} datasets were selected, but {counters['dismissed_discussions']} datasets"
+            f" already had a discussion (open or closed). {counters['errors']} errors."
        )
 
     hf_api = HfApi(endpoint=hf_endpoint, token=bot_token)
 
-    for dataset_with_revision in datasets_with_revision:
-        dataset = dataset_with_revision.dataset
-        revision = dataset_with_revision.revision
-
+    for dataset in datasets:
         counters["datasets"] += 1
         try:
             bot_discussions = [
@@ -95,40 +85,24 @@ def get_log() -> str:
             ]
 
             if bot_discussions:
-                if len(bot_discussions) > 1:
-                    logging.warning(
-                        f"Found {len(bot_discussions)} discussions for {dataset} with bot {bot_associated_user_name},"
-                        " only the first one will be used."
-                    )
-                discussion = bot_discussions[0]
+                counters["dismissed_discussions"] += 1
+                continue
             else:
-                discussion = hf_api.create_discussion(
+                hf_api.create_discussion(
                     repo_id=dataset,
                     repo_type=REPO_TYPE_DATASET,
-                    title=f"Notifications from {bot_associated_user_name}",
-                    description=create_discussion_description(),
+                    title="[bot] Conversion to Parquet",
+                    description=create_discussion_description(
+                        dataset=dataset,
+                        hf_endpoint=hf_endpoint,
+                        parquet_revision=parquet_revision,
+                        bot_associated_user_name=bot_associated_user_name,
+                    ),
                     token=bot_token,
                 )
                 sleep(1)
                 # ^ see https://github.com/huggingface/moon-landing/issues/7729 (internal)
                 counters["new_discussions"] += 1
-            if discussion.status == CLOSED_STATUS:
-                counters["dismissed_messages"] += 1
-                continue
-            hf_api.comment_discussion(
-                repo_id=dataset,
-                repo_type=REPO_TYPE_DATASET,
-                discussion_num=discussion.num,
-                comment=create_parquet_comment(
-                    dataset=dataset,
-                    hf_endpoint=hf_endpoint,
-                    parquet_revision=parquet_revision,
-                    dataset_revision=revision,
-                ),
-                token=bot_token,
-            )
-
-            counters["messages"] += 1
-
         except Exception as e:
             logging.warning(f"Failed to post a message for {dataset}: {e}")
@@ -144,36 +118,23 @@ def get_log() -> str:
     return counters
 
 
-def temporary_call_to_action_for_feedback() -> str:
-    return "Please comment below if you have any questions or feedback about this new notifications channel. "
-
-
-def create_discussion_description() -> str:
-    return (
-        "The Datasets Server bot will post messages here about operations such as conversion to"
-        " Parquet. There are some advantages associated with having a version of your dataset available in the "
-        "[Parquet format](https://parquet.apache.org/). You can learn more about these in the"
-        f""" [documentation](https://huggingface.co/docs/datasets-server/parquet).
-_{temporary_call_to_action_for_feedback()}Close the discussion if you want to stop receiving notifications._"""
-    )
-
-
-def create_parquet_comment(
-    dataset: str, hf_endpoint: str, parquet_revision: str, dataset_revision: Optional[str]
+def create_discussion_description(
+    dataset: str, hf_endpoint: str, parquet_revision: str, bot_associated_user_name: str
 ) -> str:
-    link_dataset = f" revision {dataset_revision[:7]}" if dataset_revision else ""
-
     link_parquet = create_link(
         text=parquet_revision,
         dataset=dataset,
         hf_endpoint=hf_endpoint,
         revision_type="tree",
         revision=parquet_revision,
     )
-    return f"""Datasets Server has converted the dataset{link_dataset} to Parquet.
-
-The Parquet files are published to the Hub in the {link_parquet} branch."""
+    return (
+        f"The {bot_associated_user_name} bot has created a version of this dataset in the [Parquet"
+        " format](https://parquet.apache.org/). You can learn more about the advantages associated with this format"
+        f""" in the [documentation](https://huggingface.co/docs/datasets-server/parquet).
+The Parquet files are published in the {link_parquet} branch."""
+    )
 
 
 def create_link(
@@ -182,28 +143,28 @@ def create_link(
 ) -> str:
     return f"[`{text}`]({hf_endpoint}/datasets/{dataset}/{revision_type}/{parse.quote(revision, safe='')})"
 
 
-def limit_to_one_dataset_per_namespace(datasets_with_revision: list[DatasetWithRevision]) -> list[DatasetWithRevision]:
+def limit_to_one_dataset_per_namespace(datasets: list[str]) -> list[str]:
     """
     Limit the number of datasets to one per namespace.
 
     For instance, if we have `a/b` and `a/c`, we will only keep one of them.
     The choice is arbitrary. The filtered list has no particular order.
 
     Args:
-        datasets (list[DatasetWithRevision]): The list of datasets (with revision) to filter.
+        datasets (list[str]): The list of datasets to filter.
 
     Returns:
-        list[DatasetWithRevision]: The filtered list of datasets (with revision).
+        list[str]: The filtered list of datasets.
     """
     namespaces: set[str] = set()
-    selected_datasets_with_revision: list[DatasetWithRevision] = []
-    for dataset_with_revision in datasets_with_revision:
-        namespace = get_namespace(dataset_with_revision.dataset)
+    selected_datasets: list[str] = []
+    for dataset in datasets:
+        namespace = get_namespace(dataset)
         if (namespace is None) or (namespace in namespaces):
             continue
         namespaces.add(namespace)
-        selected_datasets_with_revision.append(dataset_with_revision)
-    return selected_datasets_with_revision
+        selected_datasets.append(dataset)
+    return selected_datasets
 
 
 def get_namespace(dataset: str) -> Optional[str]:
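
To make the generated Markdown concrete, here is a standalone sketch of create_link as kept above. The collapsed signature line is assumed, and user/dataset plus refs/convert/parquet are illustrative values (refs/convert/parquet is the branch the converter publishes to).

from urllib import parse


def create_link(text: str, dataset: str, hf_endpoint: str, revision_type: str, revision: str) -> str:
    # Same f-string as in the diff; "/" in the revision is percent-encoded.
    return f"[`{text}`]({hf_endpoint}/datasets/{dataset}/{revision_type}/{parse.quote(revision, safe='')})"


print(
    create_link(
        text="refs/convert/parquet",
        dataset="user/dataset",
        hf_endpoint="https://huggingface.co",
        revision_type="tree",
        revision="refs/convert/parquet",
    )
)
# -> [`refs/convert/parquet`](https://huggingface.co/datasets/user/dataset/tree/refs%2Fconvert%2Fparquet)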
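Similarly, the namespace filter can be exercised on its own. The dataset names below are hypothetical, and since the body of get_namespace is collapsed in the diff, its split-on-"/" behavior is an assumption consistent with how limit_to_one_dataset_per_namespace uses it.

from typing import Optional


def get_namespace(dataset: str) -> Optional[str]:
    # Assumed behavior: "user/name" -> "user", a name without "/" -> None.
    return dataset.split("/")[0] if "/" in dataset else None


def limit_to_one_dataset_per_namespace(datasets: list[str]) -> list[str]:
    # Same logic as the function kept in the diff above.
    namespaces: set[str] = set()
    selected_datasets: list[str] = []
    for dataset in datasets:
        namespace = get_namespace(dataset)
        if (namespace is None) or (namespace in namespaces):
            continue
        namespaces.add(namespace)
        selected_datasets.append(dataset)
    return selected_datasets


print(limit_to_one_dataset_per_namespace(["a/b", "a/c", "org/x", "squad"]))
# -> ['a/b', 'org/x']  ('a/c' shares the 'a' namespace; 'squad' has no namespace)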
