Expose embedding indexes of Dataset (#408)

scaleapi · Nov 7, 2023 · dc4a025 · dc4a025
1 parent 0b2f68c
commit dc4a025
Show file tree

Hide file tree

Showing 4 changed files with 71 additions and 0 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 - Allow direct embedding vector upload together with dataset items. `DatasetItem` now has an additional parameter called `embedding_info` which can be used to directly upload embeddings when a dataset is uploaded.
+- Added the `dataset.embedding_indexes` property, which exposes information about every embedding index that belongs to the dataset.
 
 
 ## [0.16.6](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.16.6) - 2023-11-01

diff --git a/nucleus/constants.py b/nucleus/constants.py
@@ -52,6 +52,7 @@
 EMBEDDING_VECTOR_KEY = "embedding_vector"
 EMBEDDINGS_URL_KEY = "embeddings_urls"
 EMBEDDING_DIMENSION_KEY = "embedding_dimension"
+EMBEDDING_TYPE_KEY = "embedding_type"
 ERRORS_KEY = "errors"
 ERROR_CODES = "error_codes"
 ERROR_ITEMS = "upload_errors"
@@ -73,6 +74,8 @@
 INDEX_KEY = "index"
 INDEX_ID_KEY = "index_id"
 INDEX_CONTINUOUS_ENABLE_KEY = "enable"
+INDEX_LEVEL_KEY = "index_level"
+INDEX_TYPE_KEY = "index_type"
 IOU_KEY = "iou"
 ITEMS_KEY = "items"
 ITEM_KEY = "item"

diff --git a/nucleus/dataset.py b/nucleus/dataset.py
@@ -17,6 +17,7 @@
 
 from nucleus.annotation_uploader import AnnotationUploader, PredictionUploader
 from nucleus.async_job import AsyncJob, EmbeddingsExportJob
+from nucleus.embedding_index import EmbeddingIndex
 from nucleus.evaluation_match import EvaluationMatch
 from nucleus.prediction import from_json as prediction_from_json
 from nucleus.track import Track
@@ -194,6 +195,18 @@ def slices(self) -> List[Slice]:
         )
         return [Slice.from_request(info, self._client) for info in response]
 
+    @property
+    def embedding_indexes(self) -> List[EmbeddingIndex]:
+        """List the embedding indexes that belong to this Dataset.
+
+        Each access calls ``self._client.make_request`` with ``requests.get``
+        against the ``dataset/{id}/embeddingIndexes`` route; nothing is
+        cached on the instance.
+
+        Returns:
+            One ``EmbeddingIndex`` per entry found under the
+            ``"embedding_indexes"`` key of the response.
+        """
+        route = f"dataset/{self.id}/embeddingIndexes"
+        payload = self._client.make_request({}, route, requests.get)
+
+        raw_indexes = payload["embedding_indexes"]
+        return list(map(EmbeddingIndex.from_json, raw_indexes))
+
+
     def get_slices(
         self,
         name: Optional[str] = None,

diff --git a/nucleus/embedding_index.py b/nucleus/embedding_index.py
@@ -0,0 +1,54 @@
+from dataclasses import dataclass
+from enum import Enum
+
+from nucleus.constants import (
+    EMBEDDING_DIMENSION_KEY,
+    EMBEDDING_TYPE_KEY,
+    ID_KEY,
+    INDEX_LEVEL_KEY,
+    INDEX_TYPE_KEY,
+    STATUS_KEY,
+)
+
+
+class IndexType(str, Enum):
+    """Type of an embedding index (see ``EmbeddingIndex.index_type``).
+
+    Members are also ``str`` instances through the ``str`` mixin, so they
+    compare equal to their plain-string values.
+    """
+
+    # NOTE(review): presumably an index generated by Nucleus itself - confirm.
+    INTERNAL = "Internal"
+    # NOTE(review): presumably built from user-supplied embeddings - confirm.
+    CUSTOM = "Custom"
+
+
+class IndexLevel(str, Enum):
+    """Level of an embedding index (see ``EmbeddingIndex.index_level``).
+
+    Members are also ``str`` instances through the ``str`` mixin, so they
+    compare equal to their plain-string values.
+    """
+
+    # NOTE(review): presumably one embedding per dataset image - confirm.
+    IMAGE = "Image"
+    # NOTE(review): presumably one embedding per object in an image - confirm.
+    OBJECT = "Object"
+
+
+class IndexStatus(str, Enum):
+    """Status of an embedding index (see ``EmbeddingIndex.status``).
+
+    Members are also ``str`` instances through the ``str`` mixin, so they
+    compare equal to their plain-string values.
+    """
+
+    STARTED = "Started"
+    COMPLETED = "Completed"
+    ERRORED = "Errored"
+
+
+@dataclass
+class EmbeddingIndex:
+    """Represents an Embedding Index belonging to a Dataset.
+
+    Embedding Indexes contain generated embeddings for each item in the dataset,
+    and are used by the Autotag and the Similarity Search functionality.
+
+    Attributes:
+        id: Identifier of the embedding index.
+        status: Status of the index; an ``IndexStatus`` member whenever the
+            payload value is a known status.
+        index_type: Type of the index; an ``IndexType`` member whenever the
+            payload value is a known type.
+        index_level: Level of the index; an ``IndexLevel`` member whenever
+            the payload value is a known level.
+        embedding_type: Embedding type, exactly as found in the payload.
+        embedding_dimension: Embedding dimension, exactly as found in the
+            payload.
+    """
+
+    id: str
+    status: IndexStatus
+    index_type: IndexType
+    index_level: IndexLevel
+    embedding_type: str
+    embedding_dimension: int
+
+    @staticmethod
+    def _to_enum(enum_cls, raw):
+        """Return the ``enum_cls`` member for ``raw``, or ``raw`` if unknown."""
+        try:
+            return enum_cls(raw)
+        except ValueError:
+            # Keep values this client version does not know about instead of
+            # failing to load the whole index.
+            return raw
+
+    @classmethod
+    def from_json(cls, payload: dict) -> "EmbeddingIndex":
+        """Build an ``EmbeddingIndex`` from one JSON payload entry.
+
+        Args:
+            payload: Mapping that holds the index fields under the key
+                constants imported from ``nucleus.constants``.
+
+        Returns:
+            A new instance whose ``status``, ``index_type`` and
+            ``index_level`` are converted to their enum members when the
+            payload value matches one, and left as received otherwise.
+
+        Raises:
+            KeyError: If any of the expected keys is missing from ``payload``.
+        """
+        return cls(
+            id=payload[ID_KEY],
+            status=cls._to_enum(IndexStatus, payload[STATUS_KEY]),
+            index_type=cls._to_enum(IndexType, payload[INDEX_TYPE_KEY]),
+            index_level=cls._to_enum(IndexLevel, payload[INDEX_LEVEL_KEY]),
+            embedding_type=payload[EMBEDDING_TYPE_KEY],
+            embedding_dimension=payload[EMBEDDING_DIMENSION_KEY],
+        )