
Commit

Merge pull request #63 from scaleapi/da/validation_serialization
Da/validation serialization
ardila authored Apr 26, 2021
2 parents 66d2b49 + c4815d3 commit 40f9531
Showing 4 changed files with 93 additions and 16 deletions.
37 changes: 24 additions & 13 deletions nucleus/annotation.py
@@ -1,25 +1,27 @@
+import json
 from dataclasses import dataclass
 from enum import Enum
-from typing import Dict, Optional, Any, Union, List
+from typing import Any, Dict, List, Optional, Union
 
 from .constants import (
     ANNOTATION_ID_KEY,
+    ANNOTATIONS_KEY,
+    BOX_TYPE,
     DATASET_ITEM_ID_KEY,
-    REFERENCE_ID_KEY,
-    METADATA_KEY,
-    X_KEY,
-    Y_KEY,
-    WIDTH_KEY,
-    HEIGHT_KEY,
     GEOMETRY_KEY,
-    BOX_TYPE,
-    POLYGON_TYPE,
+    HEIGHT_KEY,
+    INDEX_KEY,
+    ITEM_ID_KEY,
     LABEL_KEY,
+    MASK_URL_KEY,
+    METADATA_KEY,
+    POLYGON_TYPE,
+    REFERENCE_ID_KEY,
     TYPE_KEY,
     VERTICES_KEY,
-    ITEM_ID_KEY,
-    MASK_URL_KEY,
-    INDEX_KEY,
-    ANNOTATIONS_KEY,
+    WIDTH_KEY,
+    X_KEY,
+    Y_KEY,
 )


@@ -42,6 +44,15 @@ def from_json(cls, payload: dict):
         else:
             return SegmentationAnnotation.from_json(payload)
 
+    def to_payload(self):
+        raise NotImplementedError(
+            "For serialization, use a specific subclass (e.g. SegmentationAnnotation), "
+            "not the base annotation class."
+        )
+
+    def to_json(self) -> str:
+        return json.dumps(self.to_payload())
+
 
 @dataclass
 class Segment:
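Taken together, the two hunks above add serialization to the annotation classes: to_payload() must come from a concrete subclass, while to_json() simply JSON-encodes that payload. A minimal usage sketch (the BoxAnnotation constructor fields shown here are assumed from the constants imported above, not confirmed by this diff):

    from nucleus import BoxAnnotation

    # Concrete subclasses serialize through the inherited to_json();
    # the base Annotation class deliberately raises NotImplementedError.
    box = BoxAnnotation(
        label="car",           # assumed field, matching LABEL_KEY
        x=10,
        y=20,
        width=30,
        height=40,
        reference_id="img_1",  # assumed field, matching REFERENCE_ID_KEY
    )
    print(box.to_json())  # JSON string built from box.to_payload()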
11 changes: 8 additions & 3 deletions nucleus/dataset_item.py
@@ -1,12 +1,14 @@
-from dataclasses import dataclass
+import json
 import os.path
+from dataclasses import dataclass
 from typing import Optional
 
 from .constants import (
+    DATASET_ITEM_ID_KEY,
     IMAGE_URL_KEY,
     METADATA_KEY,
-    REFERENCE_ID_KEY,
     ORIGINAL_IMAGE_URL_KEY,
-    DATASET_ITEM_ID_KEY,
+    REFERENCE_ID_KEY,
 )


@@ -51,3 +53,6 @@ def to_payload(self) -> dict:
         if self.item_id:
             payload[DATASET_ITEM_ID_KEY] = self.item_id
         return payload
+
+    def to_json(self) -> str:
+        return json.dumps(self.to_payload())
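DatasetItem gets the same treatment: to_json() wraps the existing to_payload(). A small sketch, using the positional (image URL, reference id) style that the test file at the bottom of this diff also uses; the URL and ids are placeholders:

    from nucleus import DatasetItem

    item = DatasetItem(
        "https://example.com/image.jpg",  # image location (placeholder)
        "img_1",                          # reference id (placeholder)
        metadata={"split": "train"},
    )
    print(item.to_json())  # JSON string built from item.to_payload()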
22 changes: 22 additions & 0 deletions nucleus/utils.py
@@ -70,3 +70,25 @@ def format_dataset_item_response(response: dict) -> dict:
         ITEM_KEY: DatasetItem.from_json(item),
         ANNOTATIONS_KEY: annotation_response,
     }
+
+
+def serialize_and_write(
+    upload_unit: List[Union[DatasetItem, Annotation]], file_pointer
+):
+    for unit in upload_unit:
+        try:
+            file_pointer.write(unit.to_json())
+        except TypeError as e:
+            type_name = type(unit).__name__
+            message = (
+                f"The following {type_name} could not be serialized: {unit}\n"
+            )
+            message += (
+                "This is usually an issue with a custom python object being "
+                "present in the metadata. Please inspect this error and adjust the "
+                "metadata so that it only contains json-serializable python primitives: "
+                "strings, ints, floats, lists, and dicts. For example, you must "
+                "convert numpy arrays into lists or lists of lists.\n"
+            )
+            message += f"The specific error was {e}"
+            raise ValueError(message) from e
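serialize_and_write is what the to_json additions build toward: it streams each item's JSON to an already-open file-like object and converts the opaque TypeError from json.dumps into an actionable ValueError. A rough usage sketch mirroring the test below (the items and the StringIO buffer are illustrative; any writable file object should work):

    import io

    from nucleus import DatasetItem
    from nucleus import utils

    items = [
        DatasetItem("https://example.com/a.jpg", "ref_a", metadata={"split": "train"}),
        DatasetItem("https://example.com/b.jpg", "ref_b"),
    ]

    with io.StringIO() as buffer:
        # Raises ValueError if any item's metadata is not JSON-serializable.
        utils.serialize_and_write(items, buffer)
        payload = buffer.getvalue()  # concatenated JSON objects, no separator is added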
39 changes: 39 additions & 0 deletions tests/test_utils.py
@@ -0,0 +1,39 @@
+import pytest
+from nucleus import DatasetItem
+from nucleus import utils
+
+import io
+
+
+class TestNonSerializableObject:
+    def weird_function():
+        print("can't touch this. Dun dun dun dun.")
+
+
+def test_serialize():
+
+    test_items = [
+        DatasetItem("fake_url1", "fake_id1"),
+        DatasetItem(
+            "fake_url2",
+            "fake_id2",
+            metadata={
+                "ok": "field",
+                "bad": TestNonSerializableObject(),
+            },
+        ),
+    ]
+
+    with io.StringIO() as in_memory_filelike:
+        with pytest.raises(ValueError) as error:
+            utils.serialize_and_write(
+                test_items,
+                in_memory_filelike,
+            )
+        assert "DatasetItem" in str(error.value)
+        assert "fake_id2" in str(error.value)
+        assert "fake_id1" not in str(error.value)
+
+        test_items[1].metadata["bad"] = "fixed"
+
+        utils.serialize_and_write(test_items, in_memory_filelike)
