Skip to content

Commit

Permalink
Expose embedding indexes of Dataset (#408)
Browse files Browse the repository at this point in the history
  • Loading branch information
ntamas92 authored Nov 7, 2023
1 parent 0b2f68c commit dc4a025
Show file tree
Hide file tree
Showing 4 changed files with 71 additions and 0 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Added
- Allow direct embedding vector upload together with dataset items. `DatasetItem` now has an additional parameter called `embedding_info` which can be used to directly upload embeddings when a dataset is uploaded.
- Added the `dataset.embedding_indexes` property, which exposes information about every embedding index that belongs to the dataset.


## [0.16.6](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.16.6) - 2023-11-01
Expand Down
3 changes: 3 additions & 0 deletions nucleus/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@
EMBEDDING_VECTOR_KEY = "embedding_vector"
EMBEDDINGS_URL_KEY = "embeddings_urls"
EMBEDDING_DIMENSION_KEY = "embedding_dimension"  # read by EmbeddingIndex.from_json
EMBEDDING_TYPE_KEY = "embedding_type"  # read by EmbeddingIndex.from_json
ERRORS_KEY = "errors"
ERROR_CODES = "error_codes"
ERROR_ITEMS = "upload_errors"
Expand All @@ -73,6 +74,8 @@
INDEX_KEY = "index"
INDEX_ID_KEY = "index_id"
INDEX_CONTINUOUS_ENABLE_KEY = "enable"
INDEX_LEVEL_KEY = "index_level"  # read by EmbeddingIndex.from_json
INDEX_TYPE_KEY = "index_type"  # read by EmbeddingIndex.from_json
IOU_KEY = "iou"
ITEMS_KEY = "items"
ITEM_KEY = "item"
Expand Down
13 changes: 13 additions & 0 deletions nucleus/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

from nucleus.annotation_uploader import AnnotationUploader, PredictionUploader
from nucleus.async_job import AsyncJob, EmbeddingsExportJob
from nucleus.embedding_index import EmbeddingIndex
from nucleus.evaluation_match import EvaluationMatch
from nucleus.prediction import from_json as prediction_from_json
from nucleus.track import Track
Expand Down Expand Up @@ -194,6 +195,18 @@ def slices(self) -> List[Slice]:
)
return [Slice.from_request(info, self._client) for info in response]

@property
def embedding_indexes(self) -> List[EmbeddingIndex]:
    """Return every embedding index that belongs to this Dataset.

    Sends a GET request (with an empty payload) to the
    ``dataset/{id}/embeddingIndexes`` route and converts each entry found
    under the ``"embedding_indexes"`` key of the response into an
    ``EmbeddingIndex`` via ``EmbeddingIndex.from_json``.
    """
    response = self._client.make_request(
        {},
        f"dataset/{self.id}/embeddingIndexes",
        requests.get,
    )
    index_payloads = response["embedding_indexes"]
    return list(map(EmbeddingIndex.from_json, index_payloads))

def get_slices(
self,
name: Optional[str] = None,
Expand Down
54 changes: 54 additions & 0 deletions nucleus/embedding_index.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
from dataclasses import dataclass
from enum import Enum

from nucleus.constants import (
EMBEDDING_DIMENSION_KEY,
EMBEDDING_TYPE_KEY,
ID_KEY,
INDEX_LEVEL_KEY,
INDEX_TYPE_KEY,
STATUS_KEY,
)


class IndexType(str, Enum):
    """Values for the ``index_type`` field of an ``EmbeddingIndex``.

    The ``str`` mixin makes every member compare equal to its plain string
    value (e.g. ``IndexType.CUSTOM == "Custom"``).
    """

    # NOTE(review): presumably INTERNAL marks platform-generated indexes and
    # CUSTOM marks user-uploaded embeddings -- confirm against the API docs.
    INTERNAL = "Internal"
    CUSTOM = "Custom"


class IndexLevel(str, Enum):
    """Values for the ``index_level`` field of an ``EmbeddingIndex``.

    The ``str`` mixin makes every member compare equal to its plain string
    value (e.g. ``IndexLevel.IMAGE == "Image"``).
    """

    # NOTE(review): presumably distinguishes per-image embeddings from
    # per-object embeddings -- confirm against the API docs.
    IMAGE = "Image"
    OBJECT = "Object"


class IndexStatus(str, Enum):
    """Values for the ``status`` field of an ``EmbeddingIndex``.

    The ``str`` mixin makes every member compare equal to its plain string
    value (e.g. ``IndexStatus.COMPLETED == "Completed"``).
    """

    STARTED = "Started"
    COMPLETED = "Completed"
    ERRORED = "Errored"


@dataclass
class EmbeddingIndex:
    """Represents an Embedding Index belonging to a Dataset.

    Embedding Indexes contain generated embeddings for each item in the
    dataset, and are used by the Autotag and the Similarity Search
    functionality.

    Attributes:
        id: Identifier of the index (``ID_KEY`` of the payload).
        status: Status of the index (``STATUS_KEY`` of the payload).
        index_type: Type of the index (``INDEX_TYPE_KEY`` of the payload).
        index_level: Level of the index (``INDEX_LEVEL_KEY`` of the payload).
        embedding_type: Value of ``EMBEDDING_TYPE_KEY`` in the payload.
        embedding_dimension: Value of ``EMBEDDING_DIMENSION_KEY`` in the
            payload.
    """

    id: str
    status: IndexStatus
    index_type: IndexType
    index_level: IndexLevel
    embedding_type: str
    embedding_dimension: int

    @staticmethod
    def _coerce_enum(enum_cls, raw_value):
        """Return the ``enum_cls`` member for ``raw_value`` when one exists.

        Values the client does not know about are returned unchanged, so a
        new server-side value does not break parsing of the whole payload.
        """
        try:
            return enum_cls(raw_value)
        except ValueError:
            return raw_value

    @classmethod
    def from_json(cls, payload: dict) -> "EmbeddingIndex":
        """Build an ``EmbeddingIndex`` from a decoded JSON payload.

        Args:
            payload: Mapping that contains the ``ID_KEY``, ``STATUS_KEY``,
                ``INDEX_TYPE_KEY``, ``INDEX_LEVEL_KEY``,
                ``EMBEDDING_TYPE_KEY`` and ``EMBEDDING_DIMENSION_KEY``
                entries.

        Returns:
            The populated ``EmbeddingIndex``. ``status``, ``index_type`` and
            ``index_level`` are converted to their enum members so they match
            the declared field types; the members subclass ``str``, so
            comparisons against the raw strings keep working.

        Raises:
            KeyError: If any of the required entries is missing.
        """
        return cls(
            id=payload[ID_KEY],
            status=cls._coerce_enum(IndexStatus, payload[STATUS_KEY]),
            index_type=cls._coerce_enum(IndexType, payload[INDEX_TYPE_KEY]),
            index_level=cls._coerce_enum(
                IndexLevel, payload[INDEX_LEVEL_KEY]
            ),
            embedding_type=payload[EMBEDDING_TYPE_KEY],
            embedding_dimension=payload[EMBEDDING_DIMENSION_KEY],
        )

0 comments on commit dc4a025

Please sign in to comment.