Detect dataset modalities using dataset-filetypes (#2909)

* trigger dataset-modalities from dataset-filetypes
* prepare for another modality detection method
* add detection of modality per file extension
* fix test and types
* log in case of exception (removes bandit warning) + add tests
* fix test
* refresh all the entries
* Additional modalities detection (#2912)
* extract features modality detection to method
* detect tabular datasets
* add two modalities + fix comment
* add simple time-series detection
* add comment
* add test

---------

Co-authored-by: Quentin Lhoest <[email protected]>

* .tif/.tiff -> image, not geospatial

---------

Co-authored-by: Quentin Lhoest <[email protected]>
huggingface · Jun 14, 2024 · 5a498aa · 5a498aa
1 parent f102b46
commit 5a498aa
Show file tree

Hide file tree

Showing 6 changed files with 434 additions and 36 deletions.
diff --git a/libs/libcommon/src/libcommon/processing_graph.py b/libs/libcommon/src/libcommon/processing_graph.py
@@ -708,8 +708,8 @@ def parse_id(id: str) -> tuple[str, str, Optional[str], Optional[str], str]:
     },
     "dataset-modalities": {
         "input_type": "dataset",
-        "triggered_by": "dataset-info",
-        "job_runner_version": 1,
+        "triggered_by": ["dataset-info", "dataset-filetypes"],
+        "job_runner_version": 2,
         "difficulty": 20,
     },
     "dataset-croissant-crumbs": {

diff --git a/libs/libcommon/tests/test_operations.py b/libs/libcommon/tests/test_operations.py
@@ -427,7 +427,7 @@ def test_2274_only_first_steps(
                 }
             )
 
-        assert len(queue.get_pending_jobs_df(dataset=dataset)) == 7
+        assert len(queue.get_pending_jobs_df(dataset=dataset)) == 8
         assert len(get_cache_entries_df(dataset=dataset)) == 2
 
         # let's delete all the jobs, to get in the same state as the bug

diff --git a/libs/libcommon/tests/test_processing_graph.py b/libs/libcommon/tests/test_processing_graph.py
@@ -168,8 +168,8 @@ def test_graph() -> None:
         (
             "dataset-modalities",
             ["dataset-hub-cache"],
-            ["dataset-info"],
-            ["dataset-config-names", "config-parquet-and-info", "config-info", "dataset-info"],
+            ["dataset-info", "dataset-filetypes"],
+            ["dataset-config-names", "config-parquet-and-info", "config-info", "dataset-info", "dataset-filetypes"],
         ),
         (
             "dataset-is-valid",
@@ -363,6 +363,7 @@ def test_graph() -> None:
                 "config-size",
                 "config-split-names",
                 "dataset-config-names",
+                "dataset-filetypes",
                 "dataset-info",
                 "dataset-is-valid",
                 "dataset-compatible-libraries",
@@ -417,7 +418,7 @@ def test_graph() -> None:
         ),
         (
             "dataset-filetypes",
-            [],
+            ["dataset-modalities"],
             [],
             [],
         ),

diff --git a/services/worker/src/worker/dtos.py b/services/worker/src/worker/dtos.py
@@ -333,7 +333,7 @@ class DatasetCompatibleLibrariesResponse(TypedDict):
     formats: list[DatasetFormat]
 
 
-DatasetModality = Literal["image", "audio", "text"]
+DatasetModality = Literal["image", "audio", "text", "video", "geospatial", "3d", "tabular", "timeseries"]
 
 
 class DatasetModalitiesResponse(TypedDict):

diff --git a/services/worker/src/worker/job_runners/dataset/modalities.py b/services/worker/src/worker/job_runners/dataset/modalities.py
@@ -3,7 +3,7 @@
 
 import logging
 
-from datasets import Audio, Features, Image, Translation, TranslationVariableLanguages, Value
+from datasets import Audio, Features, Image, Sequence, Translation, TranslationVariableLanguages, Value
 from datasets.features.features import FeatureType, _visit
 from libcommon.exceptions import PreviousStepFormatError
 from libcommon.simple_cache import (
@@ -18,9 +18,65 @@
 from worker.job_runners.dataset.dataset_job_runner import DatasetJobRunner
 
 
-def compute_modalities_response(dataset: str) -> DatasetModalitiesResponse:
+def detect_features_modalities(features: Features) -> set[DatasetModality]:
     """
-    Get the response of 'dataset-modalities' for one specific dataset on huggingface.co.
+    Infer the modalities of one config from its features (column types).
+
+    Args:
+        features (`datasets.Features`):
+            The features of a config.
+
+    Returns:
+        `set[DatasetModality]`: The modalities found; empty if none is recognized.
+    """
+    detected: set[DatasetModality] = set()
+
+    def record_modality(feature: FeatureType) -> None:
+        if isinstance(feature, Audio):
+            detected.add("audio")
+            return
+        if isinstance(feature, Image):
+            detected.add("image")
+            return
+        is_string = isinstance(feature, Value) and feature.dtype in ("string", "large_string")
+        if is_string or isinstance(feature, (Translation, TranslationVariableLanguages)):
+            detected.add("text")
+
+    _visit(features, record_modality)
+
+    # tabular data: no "media" column, and at least two top-level numerical columns
+    if "audio" not in detected and "image" not in detected:
+        numerical_columns = [
+            column
+            for column in features.values()
+            if isinstance(column, Value) and ("int" in column.dtype or "float" in column.dtype)
+        ]
+        if len(numerical_columns) >= 2:
+            detected.add("tabular")
+
+    # time series: a top-level list (or Sequence) of float32 values
+    float32 = Value("float32")
+    for column_name, column in features.items():
+        if "emb" in column_name:  # ignore lists of floats that may be embeddings
+            continue
+        if isinstance(column, Sequence):
+            inner = column.feature
+        elif isinstance(column, list):
+            inner = column[0]
+        else:
+            continue
+        if inner == float32:
+            detected.add("timeseries")
+            break
+    # other idea: detect datasets with only numerical columns and one timestamp column
+    # (and ideally be able to detect dates/timestamps even from a column with string type)
+
+    return detected
+
+
+def detect_modalities_from_features(dataset: str) -> set[DatasetModality]:
+    """
+    Detect modalities of a dataset using the features (column types).
 
     Args:
         dataset (`str`):
@@ -33,36 +89,195 @@ def compute_modalities_response(dataset: str) -> DatasetModalitiesResponse:
             If the content of the previous step has not the expected format
 
     Returns:
-        `tuple[DatasetModalitiesResponse, float]`: An object with the modalities_response and the progress.
+        `set[DatasetModality]`: A set of modalities.
     """
-    logging.info(f"compute 'dataset-modalities' for {dataset=}")
-
     dataset_info_response = get_previous_step_or_raise(kind="dataset-info", dataset=dataset)
     content = dataset_info_response["content"]
     if "dataset_info" not in content or not isinstance(content["dataset_info"], dict):
         raise PreviousStepFormatError("Previous step did not return the expected content: 'dataset_info'.")
 
     try:
         modalities: set[DatasetModality] = set()
+        for config_info in content["dataset_info"].values():
+            modalities.update(detect_features_modalities(features=Features.from_dict(config_info["features"])))
+    except Exception as e:
+        raise PreviousStepFormatError("Previous step did not return the expected content.", e) from e
 
-        def classify_modality(feature: FeatureType) -> None:
-            nonlocal modalities
-            if isinstance(feature, Audio):
-                modalities.add("audio")
-            elif isinstance(feature, Image):
-                modalities.add("image")
-            elif isinstance(feature, Value) and feature.dtype in ("string", "large_string"):
-                modalities.add("text")
-            elif isinstance(feature, (Translation, TranslationVariableLanguages)):
-                modalities.add("text")
+    return modalities
 
-        for config_info in content["dataset_info"].values():
-            features = Features.from_dict(config_info["features"])
-            _visit(features, classify_modality)
 
+def detect_modalities_from_filetypes(dataset: str) -> set[DatasetModality]:
+    """
+    Detect modalities of a dataset using the repository file extensions (matched with their leading dot: `.png`).
+
+    Args:
+        dataset (`str`):
+            A namespace (user or an organization) and a repo name separated by a `/`.
+
+    Raises:
+        [~`libcommon.simple_cache.CachedArtifactError`]:
+            If the previous step gave an error.
+        [~`libcommon.exceptions.PreviousStepFormatError`]:
+            If the content of the previous step has not the expected format
+
+    Returns:
+        `set[DatasetModality]`: A set of modalities.
+    """
+    dataset_filetypes_response = get_previous_step_or_raise(kind="dataset-filetypes", dataset=dataset)
+    content = dataset_filetypes_response["content"]
+    if "filetypes" not in content or not isinstance(content["filetypes"], list):
+        raise PreviousStepFormatError("Previous step did not return the expected content: 'filetypes'.")
+
+    # from https://developer.mozilla.org/en-US/docs/Web/Media/Formats/Image_types
+    IMAGE_EXTENSIONS = (
+        ".apng",
+        ".avif",
+        ".gif",
+        ".jpg",
+        ".jpeg",
+        ".jfif",
+        ".pjpeg",
+        ".pjp",
+        ".png",
+        ".svg",
+        ".webp",
+        ".bmp",
+        ".ico",
+        ".cur",
+        ".tif",
+        ".tiff",
+    )
+    # from https://developer.mozilla.org/en-US/docs/Web/Media/Formats/Containers#browser_compatibility + others
+    AUDIO_EXTENSIONS = (
+        ".aac",
+        ".flac",
+        ".mp3",
+        ".m4a",
+        ".oga",
+        ".wav",
+        # other audio formats
+        ".weba",
+        ".opus",
+        ".spx",
+        ".wma",
+        ".aiff",
+        ".ape",
+        ".mka",
+        ".wv",
+        ".tak",
+    )
+    AUDIO_BUT_COULD_ALSO_BE_VIDEO_EXTENSIONS = (".ogg",)
+    VIDEO_EXTENSIONS = (
+        ".m4v",
+        ".m4p",
+        ".ogv",
+        ".mov",
+        ".mkv",
+        # other video formats
+        ".avi",
+        ".wmv",
+        ".flv",
+    )
+    VIDEO_BUT_COULD_ALSO_BE_AUDIO_EXTENSIONS = (".3gp", ".mpg", ".mpeg", ".mp4", ".webm")
+    GEOSPATIAL_EXTENSIONS = (
+        # vector
+        ".shp",
+        ".shx",
+        ".dbf",
+        ".prj",
+        ".cpg",
+        ".kml",
+        ".kmz",
+        ".gpx",
+        ".geojson",
+        ".topojson",
+        ".gml",
+        ".geoparquet",
+        ".fgb",
+        # raster
+        ".img",
+        ".bil",
+        ".bip",
+        ".bsq",
+        # geotiff uses .tif or .tiff, but better to just show "image" modality
+        # than wrongly put "geospatial" if it only contains tif images
+        # ".tif",
+        # ".tiff",
+        # vector or raster
+        ".gpkg",
+        ".mbtiles",
+        ".pmtiles",
+    )
+    _3D_EXTENSIONS = (
+        # from https://docs.unity3d.com/Manual/3D-formats.html
+        ".fbx",
+        ".dae",
+        ".dxf",
+        ".obj",
+        # other 3D formats
+        ".stl",
+        ".ply",
+        ".gltf",
+        ".glb",
+        ".usdz",
+    )
+    TEXT_EXTENSIONS = (".txt",)
+    try:
+        modalities: set[DatasetModality] = set()
+        for filetype in content["filetypes"]:
+            # TODO: should we condition by a number of files (filetype["count"] > threshold) to avoid false positives?
+            if filetype["extension"] in IMAGE_EXTENSIONS:
+                modalities.add("image")
+            elif filetype["extension"] in AUDIO_EXTENSIONS + AUDIO_BUT_COULD_ALSO_BE_VIDEO_EXTENSIONS:
+                modalities.add("audio")
+            elif filetype["extension"] in VIDEO_EXTENSIONS + VIDEO_BUT_COULD_ALSO_BE_AUDIO_EXTENSIONS:
+                modalities.add("video")
+            elif filetype["extension"] in GEOSPATIAL_EXTENSIONS:
+                modalities.add("geospatial")
+            elif filetype["extension"] in _3D_EXTENSIONS:
+                modalities.add("3d")
+            elif filetype["extension"] in TEXT_EXTENSIONS:
+                modalities.add("text")
     except Exception as e:
         raise PreviousStepFormatError("Previous step did not return the expected content.", e) from e
 
+    return modalities
+
+
+def compute_modalities_response(dataset: str) -> DatasetModalitiesResponse:
+    """
+    Get the response of 'dataset-modalities' for one specific dataset on huggingface.co.
+
+    Args:
+        dataset (`str`):
+            A namespace (user or an organization) and a repo name separated by a `/`.
+
+    Raises:
+        [~`libcommon.exceptions.PreviousStepFormatError`]:
+            If the content of the previous step has not the expected format
+
+    Returns:
+        `tuple[DatasetModalitiesResponse, float]`: An object with the modalities_response and the progress.
+    """
+    logging.info(f"compute 'dataset-modalities' for {dataset=}")
+
+    modalities: set[DatasetModality] = set()
+    try:
+        modalities.update(detect_modalities_from_features(dataset))
+    except PreviousStepFormatError:
+        raise
+    except Exception:
+        logging.info(f"failed to detect modalities from features of {dataset=}")
+        pass
+
+    try:
+        modalities.update(detect_modalities_from_filetypes(dataset))
+    except PreviousStepFormatError:
+        raise
+    except Exception:
+        logging.info(f"failed to detect modalities from file types of {dataset=}")
+        pass
+
     return DatasetModalitiesResponse(
         {
             "modalities": sorted(modalities),