From 53a1f6c8524e20df452583b6d2d8c6e8ead34f31 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Mon, 9 Dec 2024 15:18:35 +0100 Subject: [PATCH 1/2] more compute for fw2 --- libs/libcommon/src/libcommon/queue/past_jobs.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/libs/libcommon/src/libcommon/queue/past_jobs.py b/libs/libcommon/src/libcommon/queue/past_jobs.py index 62e9533c5..f13d08712 100644 --- a/libs/libcommon/src/libcommon/queue/past_jobs.py +++ b/libs/libcommon/src/libcommon/queue/past_jobs.py @@ -47,6 +47,11 @@ def __get__(self, instance: object, cls: type[U]) -> QuerySet[U]: # don't record short durations, because they will not have impact, but can clutter the collection JOB_DURATION_MIN_SECONDS = 30 +# hardcoded list of datasets for which we allocated more compute +# (typically impactful datasets with tons of subsets) +ALLOWED_COMPUTE_MULTIPLIER = { + "HuggingFaceFW/fineweb-2": 100 +} class PastJobDocument(Document): """The duration of a job that has been completed. @@ -96,7 +101,8 @@ def create_past_job(dataset: str, started_at: datetime, finished_at: datetime) - PastJobDocument(dataset=dataset, duration=duration, finished_at=finished_at).save() if not is_blocked(dataset) and duration > JOB_DURATION_CHECK_MIN_SECONDS: - if PastJobDocument.objects(dataset=dataset).sum("duration") > DATASET_BLOCKAGE_THRESHOLD_SECONDS: + max_duration = DATASET_BLOCKAGE_THRESHOLD_SECONDS * ALLOWED_COMPUTE_MULTIPLIER.get(dataset, 1) + if PastJobDocument.objects(dataset=dataset).sum("duration") > max_duration: block_dataset(dataset) return True return False From 36f4ce1cfaa9ba4afe15c5020b3d5a1c35d49933 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Mon, 9 Dec 2024 15:23:38 +0100 Subject: [PATCH 2/2] style --- libs/libcommon/src/libcommon/queue/past_jobs.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/libs/libcommon/src/libcommon/queue/past_jobs.py b/libs/libcommon/src/libcommon/queue/past_jobs.py index f13d08712..08667fec1 100644 --- a/libs/libcommon/src/libcommon/queue/past_jobs.py +++ b/libs/libcommon/src/libcommon/queue/past_jobs.py @@ -49,9 +49,8 @@ def __get__(self, instance: object, cls: type[U]) -> QuerySet[U]: # hardcoded list of datasets for which we allocated more compute # (typically impactful datasets with tons of subsets) -ALLOWED_COMPUTE_MULTIPLIER = { - "HuggingFaceFW/fineweb-2": 100 -} +ALLOWED_COMPUTE_MULTIPLIER = {"HuggingFaceFW/fineweb-2": 100} + class PastJobDocument(Document): """The duration of a job that has been completed.