From 17de994e952b18316d9847f59dd65a432b871bdd Mon Sep 17 00:00:00 2001 From: guyuz Date: Tue, 20 Aug 2024 12:16:11 +0300 Subject: [PATCH] Deprecating ElementType and doc fixes (#12) * deprecate ElemenType basically everywhere * - removed tqdm and prints from the repo because they break the docs (consider removing the dependency) - minor fixes to text in notebooks --- bridge/display/text.py | 17 ++-- bridge/display/vision.py | 27 +++--- bridge/primitives/dataset/dataset.py | 14 ++- bridge/primitives/dataset/sample_api.py | 8 +- bridge/primitives/dataset/singular_dataset.py | 10 +-- bridge/primitives/element/element.py | 5 +- bridge/primitives/element/element_type.py | 12 --- bridge/primitives/sample/sample.py | 21 +++-- bridge/primitives/sample/singular_sample.py | 9 +- .../sample/transform/sample_transform.py | 3 +- bridge/primitives/sample/transform/vision.py | 29 +++---- bridge/providers/dataset_provider.py | 3 +- bridge/providers/text.py | 7 +- bridge/providers/vision.py | 24 +++--- bridge/utils/download_and_extract_archive.py | 22 +++-- docs/source/getting_started.rst | 68 ++++++++++++--- docs/source/index.rst | 1 + .../vision/custom_data/dataset_provider.ipynb | 11 ++- .../vision/custom_data/display_engine.ipynb | 25 +++--- .../vision/custom_data/load_mechanism.ipynb | 8 +- .../vision/fundamentals/coco_eda_demo.ipynb | 20 ++--- .../vision/fundamentals/sample_api.ipynb | 21 +++-- .../vision/fundamentals/table_api.ipynb | 28 +++--- .../processing_data/cache_mechanism.ipynb | 17 ++-- .../processing_data/sample_transform.ipynb | 4 +- .../processing_data/source2tensors_demo.ipynb | 19 ++-- docs/source/user_guide.rst | 86 ++++++------------- tests/core/test_dataset.py | 25 +++--- tests/core/test_dictable.py | 3 +- tests/core/test_element.py | 5 +- 30 files changed, 259 insertions(+), 293 deletions(-) delete mode 100644 bridge/primitives/element/element_type.py diff --git a/bridge/display/text.py b/bridge/display/text.py index 38af19a..d1a0d96 100644 --- a/bridge/display/text.py +++ b/bridge/display/text.py @@ -3,7 +3,6 @@ import pandas as pd import panel as pn -from bridge.primitives.element.element_type import ElementType from bridge.primitives.sample.singular_sample import SingularSample if TYPE_CHECKING: @@ -14,9 +13,9 @@ class TextClassification(DisplayEngine[SingularDataset, SingularSample]): def show_element(self, element: Element, element_plot_kwargs: Dict[str, Any] | None = None): - if element.etype == ElementType.class_label: + if element.etype == "class_label": return pn.pane.Markdown(element.to_pd_series().to_frame().T.to_markdown()) - elif element.etype == ElementType.text: + elif element.etype == "text": return pn.pane.Markdown(element.data) else: raise NotImplementedError() @@ -27,9 +26,7 @@ def show_sample( element_plot_kwargs: Dict[str, Any] | None = None, sample_plot_kwargs: Dict[str, Any] | None = None, ): - annotations_md = pd.DataFrame( - [ann.to_pd_series() for ann in sample.annotations[ElementType.class_label]] - ).to_markdown() + annotations_md = pd.DataFrame([ann.to_pd_series() for ann in sample.annotations["class_label"]]).to_markdown() text_display = pn.pane.Markdown(sample.data) return pn.Column("# Sample Text:", text_display, "# Annotations Data:", annotations_md) @@ -52,9 +49,9 @@ def plot_sample_by_widget(sample_id): # class Panel(DisplayEngine): # def show_element(self, element: Element, element_plot_kwargs: Dict[str, Any] | None = None): -# if element.etype == ElementType.class_label: +# if element.etype == "class_label": # return 
self._show_class_label(element, element_plot_kwargs) -# elif element.etype == ElementType.text: +# elif element.etype == "text": # return self._show_text(element, element_plot_kwargs) # else: # raise NotImplementedError() @@ -66,9 +63,9 @@ def plot_sample_by_widget(sample_id): # sample_plot_kwargs: Dict[str, Any] | None = None, # ): # annotations_md = pd.DataFrame( -# [ann.to_pd_series() for ann in sample.elements[ElementType.class_label]] +# [ann.to_pd_series() for ann in sample.elements["class_label"]] # ).to_markdown() -# text_display = self.show_element(sample.elements[ElementType.text][0]) +# text_display = self.show_element(sample.elements["text"][0]) # return pn.Column("# Sample Text:", text_display, "# Annotations Data:", annotations_md) # # def show_dataset( diff --git a/bridge/display/vision.py b/bridge/display/vision.py index 69f3f99..3e068b9 100644 --- a/bridge/display/vision.py +++ b/bridge/display/vision.py @@ -6,7 +6,6 @@ import pandas as pd from bridge.display.display_engine import DisplayEngine -from bridge.primitives.element.element_type import ElementType from bridge.primitives.sample import Sample from bridge.utils import optional_dependencies @@ -24,9 +23,9 @@ def __init__(self, bbox_format: str = "xyxy") -> None: def show_element(self, element: Element, element_plot_kwargs: Dict[str, Any] | None = None): etype = element.etype - if etype == ElementType.image: + if etype == "image": plot = self._plot_single_image(element) - elif etype == ElementType.bbox: + elif etype == "bbox": plot = self._plot_single_bbox(element) else: raise NotImplementedError(f"Invalid etype: {etype}") @@ -42,13 +41,13 @@ def show_sample( ): import holoviews as hv - imgs = [self._plot_single_image(element) for element in sample.elements[ElementType.image]] - if ElementType.bbox in sample.elements: - bboxes = self._plot_list_of_bbox_or_class_labels(sample.elements[ElementType.bbox]) + imgs = [self._plot_single_image(element) for element in sample.elements["image"]] + if "bbox" in sample.elements: + bboxes = self._plot_list_of_bbox_or_class_labels(sample.elements["bbox"]) else: bboxes = hv.Overlay() - if ElementType.class_label in sample.elements: - class_labels = self._plot_list_of_bbox_or_class_labels(sample.elements[ElementType.class_label]) + if "class_label" in sample.elements: + class_labels = self._plot_list_of_bbox_or_class_labels(sample.elements["class_label"]) else: class_labels = hv.Overlay() for i in range(len(imgs)): @@ -104,10 +103,10 @@ def _plot_list_of_bbox_or_class_labels(self, elements: List[Element]): rectangle_list = [] for element in elements: data: BoundingBox = element.data - if element.etype == ElementType.bbox: + if element.etype == "bbox": xyxy = self._extract_bbox_coords(data) cl = data.class_label - elif element.etype == ElementType.class_label: # assume cls + elif element.etype == "class_label": # assume cls xyxy = [np.nan, np.nan, np.nan, np.nan] cl = data else: @@ -122,7 +121,7 @@ def _plot_list_of_bbox_or_class_labels(self, elements: List[Element]): for i, group in hv_df.groupby("class"): p = hv.Rectangles(group, label=i) plots.append(p) - plots = hv.Overlay(plots).opts(hv.opts.Rectangles(**self._default_kwargs(ElementType.bbox))) + plots = hv.Overlay(plots).opts(hv.opts.Rectangles(**self._default_kwargs("bbox"))) return plots def _extract_bbox_coords(self, data): @@ -143,12 +142,12 @@ def _extract_bbox_coords(self, data): return xyxy @staticmethod - def _default_kwargs(etype: ElementType) -> Dict[str, Any]: + def _default_kwargs(etype: str) -> Dict[str, Any]: import 
holoviews as hv - if etype == ElementType.image: + if etype == "image": return dict(aspect="equal", invert_yaxis=True, legend_position="left", xaxis=None, yaxis=None) - elif etype == ElementType.bbox: + elif etype == "bbox": return dict(fill_alpha=0.0, line_width=3, line_color=hv.Cycle("Category20")) @staticmethod diff --git a/bridge/primitives/dataset/dataset.py b/bridge/primitives/dataset/dataset.py index f99764d..88f63c8 100644 --- a/bridge/primitives/dataset/dataset.py +++ b/bridge/primitives/dataset/dataset.py @@ -5,12 +5,10 @@ from typing import TYPE_CHECKING, Any, Callable, Dict, Hashable, Iterable, Iterator, List, Sequence import pandas as pd -from tqdm.contrib import tmap from typing_extensions import Self from bridge.primitives.dataset.sample_api import SampleAPI from bridge.primitives.dataset.table_api import TableAPI -from bridge.primitives.element.element_type import ElementType from bridge.primitives.sample import Sample from bridge.utils.constants import ELEMENT_COLS, INDICES from bridge.utils.helper import Displayable @@ -27,7 +25,7 @@ def __init__( self, elements: pd.DataFrame, display_engine: DisplayEngine = None, - cache_mechanisms: Dict[ElementType, CacheMechanism | None] | None = None, + cache_mechanisms: Dict[str, CacheMechanism | None] | None = None, ): self._elements = elements self._display_engine = display_engine @@ -59,7 +57,7 @@ def merge( self, other: "Dataset", display_engine: DisplayEngine | None = None, - cache_mechanisms: Dict[ElementType, CacheMechanism | None] | None = None, + cache_mechanisms: Dict[str, CacheMechanism | None] | None = None, ) -> "Dataset": self_element_ids = self.elements.index.get_level_values(ELEMENT_COLS.ID) other_element_ids = other.elements.index.get_level_values(ELEMENT_COLS.ID) @@ -86,8 +84,8 @@ def get(self, sample_id: Hashable) -> Sample: def transform_samples( self, transform: SampleTransform, - map_fn=tmap, - cache_mechanisms: Dict[ElementType, CacheMechanism] | None = None, + map_fn=map, + cache_mechanisms: Dict[str, CacheMechanism] | None = None, display_engine: DisplayEngine | None = None, ) -> Self: fn = functools.partial( @@ -99,7 +97,7 @@ def transform_samples( elements = [element for sample in samples for e_list in sample.elements.values() for element in e_list] return Dataset.from_elements(elements, display_engine=display_engine) - def map_samples(self, function: Callable[[Sample], Any], map_fn=tmap): + def map_samples(self, function: Callable[[Sample], Any], map_fn=map): outputs = map_fn(function, self) if isinstance(outputs, GeneratorType): return list(outputs) @@ -130,7 +128,7 @@ def from_elements( cls, elements: Iterable[Element], display_engine: DisplayEngine = None, - cache_mechanisms: Dict[ElementType, CacheMechanism | None] | None = None, + cache_mechanisms: Dict[str, CacheMechanism | None] | None = None, ) -> Self: element_records = [e.to_pd_series() for e in elements] elements_df = pd.DataFrame(element_records).set_index(INDICES) diff --git a/bridge/primitives/dataset/sample_api.py b/bridge/primitives/dataset/sample_api.py index 084b0bb..6f4a0dd 100644 --- a/bridge/primitives/dataset/sample_api.py +++ b/bridge/primitives/dataset/sample_api.py @@ -3,14 +3,12 @@ import abc from typing import TYPE_CHECKING, Any, Callable, Dict, Hashable, Iterable, Iterator, Sequence -from tqdm.contrib import tmap from typing_extensions import Self if TYPE_CHECKING: from bridge.display.display_engine import DisplayEngine from bridge.primitives.element.data.cache_mechanism import CacheMechanism from 
bridge.primitives.element.element import Element - from bridge.primitives.element.element_type import ElementType from bridge.primitives.sample import Sample from bridge.primitives.sample.transform import SampleTransform @@ -28,14 +26,14 @@ def get(self, sample_id: Hashable) -> Sample: def transform_samples( self, transform: SampleTransform, - map_fn=tmap, - cache_mechanisms: Dict[ElementType, CacheMechanism] | None = None, + map_fn=map, + cache_mechanisms: Dict[str, CacheMechanism] | None = None, display_engine: DisplayEngine | None = None, ) -> Self: pass @abc.abstractmethod - def map_samples(self, function: Callable[[Sample], Any], map_fn=tmap) -> Sequence[Sample]: + def map_samples(self, function: Callable[[Sample], Any], map_fn=map) -> Sequence[Sample]: pass @abc.abstractmethod diff --git a/bridge/primitives/dataset/singular_dataset.py b/bridge/primitives/dataset/singular_dataset.py index c591183..cccb098 100644 --- a/bridge/primitives/dataset/singular_dataset.py +++ b/bridge/primitives/dataset/singular_dataset.py @@ -3,7 +3,6 @@ from typing import TYPE_CHECKING, Callable, Dict, Hashable, List, Sequence import pandas as pd -from tqdm.contrib import tmap from typing_extensions import Self from bridge.primitives.dataset.dataset import Dataset @@ -14,7 +13,6 @@ from bridge.display import DisplayEngine from bridge.primitives.element.data.cache_mechanism import CacheMechanism from bridge.primitives.element.element import Element - from bridge.primitives.element.element_type import ElementType from bridge.primitives.sample.transform import SampleTransform @@ -31,7 +29,7 @@ def __init__( samples: pd.DataFrame, annotations: pd.DataFrame, display_engine: DisplayEngine = None, - cache_mechanisms: Dict[ElementType, CacheMechanism | None] | None = None, + cache_mechanisms: Dict[str, CacheMechanism | None] | None = None, ): assert ( len( @@ -132,8 +130,8 @@ def sort_annotations(self, by: str, ascending: bool = True): def transform_samples( self, transform: SampleTransform, - map_fn=tmap, - cache_mechanisms: Dict[ElementType, CacheMechanism] | None = None, + map_fn=map, + cache_mechanisms: Dict[str, CacheMechanism] | None = None, display_engine: DisplayEngine | None = None, ) -> Self: ds = super().transform_samples( @@ -157,7 +155,7 @@ def from_lists( samples_list: List[Element], annotations_list: List[Element], display_engine: DisplayEngine = None, - cache_mechanisms: Dict[ElementType, CacheMechanism | None] | None = None, + cache_mechanisms: Dict[str, CacheMechanism | None] | None = None, ) -> Self: sample_records = [s.to_dict() for s in samples_list] annotation_records = [a.to_dict() for a in annotations_list] diff --git a/bridge/primitives/element/element.py b/bridge/primitives/element/element.py index 8bed26f..c137d48 100644 --- a/bridge/primitives/element/element.py +++ b/bridge/primitives/element/element.py @@ -13,7 +13,6 @@ from bridge.display import DisplayEngine from bridge.primitives.element.data.cache_mechanism import CacheMechanism from bridge.primitives.element.element_data_type import ELEMENT_DATA_TYPE - from bridge.primitives.element.element_type import ElementType class Element(Displayable): @@ -22,7 +21,7 @@ class Element(Displayable): def __init__( self, element_id: Hashable, - etype: ElementType, + etype: str, load_mechanism: LoadMechanism, sample_id: Hashable, display_engine: DisplayEngine | None = None, @@ -55,7 +54,7 @@ def _data_impl(self): return data @property - def etype(self) -> ElementType: + def etype(self) -> str: return self._etype @property diff --git 
a/bridge/primitives/element/element_type.py b/bridge/primitives/element/element_type.py deleted file mode 100644 index d72a1d6..0000000 --- a/bridge/primitives/element/element_type.py +++ /dev/null @@ -1,12 +0,0 @@ -from __future__ import annotations - -from bridge.utils import StrEnum - - -class ElementType(StrEnum): - image = "image" - class_label = "class_label" - bbox = "bbox" - text = "text" - segmentation = "segmentation" - keypoint = "keypoint" diff --git a/bridge/primitives/sample/sample.py b/bridge/primitives/sample/sample.py index 6113fa6..b7c46c0 100644 --- a/bridge/primitives/sample/sample.py +++ b/bridge/primitives/sample/sample.py @@ -7,7 +7,6 @@ from bridge.primitives.element.data.cache_mechanism import CacheMechanism from bridge.primitives.element.element import Element -from bridge.primitives.element.element_type import ElementType from bridge.utils.constants import ELEMENT_COLS, INDICES from bridge.utils.helper import Displayable @@ -22,7 +21,7 @@ class Sample(Displayable): def __init__( self, - elements: List[Element] | Dict[ElementType, List[Element]], + elements: List[Element] | Dict[str, List[Element]], display_engine: DisplayEngine | None = None, ): if isinstance(elements, dict): @@ -41,11 +40,11 @@ def id(self) -> Hashable: return e_list[0].sample_id @property - def elements(self) -> Dict[ElementType, List[Element]]: + def elements(self) -> Dict[str, List[Element]]: return self._elements @property - def data(self) -> Dict[ElementType, List[ELEMENT_DATA_TYPE]]: + def data(self) -> Dict[str, List[ELEMENT_DATA_TYPE]]: data_dict = defaultdict(list) for etype, elist in self._elements.items(): data_dict[etype].extend([e.data for e in elist]) @@ -57,7 +56,7 @@ def show(self, **kwargs: Any): def transform( self, transform: SampleTransform, - cache_mechanisms: Dict[ElementType, CacheMechanism] | None = None, + cache_mechanisms: Dict[str, CacheMechanism] | None = None, display_engine: DisplayEngine | None = None, ) -> "Sample": cache_mechanisms = self._get_cache_mechanisms_for_transform(self, cache_mechanisms) @@ -71,7 +70,7 @@ def from_pd_dataframe( cls, elements_df: pd.DataFrame, display_engine: DisplayEngine | None, - cache_mechanisms: Dict[ElementType, CacheMechanism | None], + cache_mechanisms: Dict[str, CacheMechanism | None], ): def fast_to_dict_records(df): data = df.values.tolist() @@ -87,7 +86,7 @@ def fast_to_dict_records(df): for element_row in fast_to_dict_records(elements_df): etype = element_row[ELEMENT_COLS.ETYPE] - element_type = ElementType(etype) + element_type = str(etype) elements.append( Element.from_dict( element_row, @@ -108,7 +107,7 @@ def __len__(self) -> int: return sum(map(len, self._elements.values())) @staticmethod - def _assert_valid_elements(elements: Dict[ElementType, List[Element]]): + def _assert_valid_elements(elements: Dict[str, List[Element]]): sample_ids_from_elements = set([e.sample_id for e_list in elements.values() for e in e_list]) assert len(sample_ids_from_elements) == 1, ( f"All elements must contain a single sample id," @@ -116,15 +115,15 @@ def _assert_valid_elements(elements: Dict[ElementType, List[Element]]): ) @staticmethod - def _convert_elements_list_to_dict(elements: List[Element]) -> Dict[ElementType, List[Element]]: - elements_by_type: Dict[ElementType, List[Element]] = defaultdict(list) + def _convert_elements_list_to_dict(elements: List[Element]) -> Dict[str, List[Element]]: + elements_by_type: Dict[str, List[Element]] = defaultdict(list) for element in elements: elements_by_type[element.etype].append(element) d = 
dict(elements_by_type) return d @staticmethod - def _get_cache_mechanisms_for_transform(sample: Sample, cache_mechanisms: Dict[ElementType, CacheMechanism] | None): + def _get_cache_mechanisms_for_transform(sample: Sample, cache_mechanisms: Dict[str, CacheMechanism] | None): if cache_mechanisms is None: cache_mechanisms = {} diff --git a/bridge/primitives/sample/singular_sample.py b/bridge/primitives/sample/singular_sample.py index 2ca780f..f7246e2 100644 --- a/bridge/primitives/sample/singular_sample.py +++ b/bridge/primitives/sample/singular_sample.py @@ -13,14 +13,11 @@ from bridge.primitives.element.data.cache_mechanism import CacheMechanism from bridge.primitives.element.element import Element from bridge.primitives.element.element_data_type import ELEMENT_DATA_TYPE - from bridge.primitives.element.element_type import ElementType from bridge.primitives.sample.transform import SampleTransform class SingularSample(Sample): - def __init__( - self, elements: List[Element] | Dict[ElementType, List[Element]], display_engine: DisplayEngine = None - ): + def __init__(self, elements: List[Element] | Dict[str, List[Element]], display_engine: DisplayEngine = None): super().__init__(elements, display_engine) self._set_element_and_annotations() @@ -52,13 +49,13 @@ def data(self) -> ELEMENT_DATA_TYPE: return self._element.data @property - def annotations(self) -> Dict[ElementType, List[Element]]: + def annotations(self) -> Dict[str, List[Element]]: return self._annotations def transform( self, transform: SampleTransform, - cache_mechanisms: Dict[ElementType, CacheMechanism] | None = None, + cache_mechanisms: Dict[str, CacheMechanism] | None = None, display_engine: DisplayEngine | None = None, ) -> Self: transformed_sample = super().transform(transform, cache_mechanisms, display_engine) diff --git a/bridge/primitives/sample/transform/sample_transform.py b/bridge/primitives/sample/transform/sample_transform.py index 4489669..96d11a2 100644 --- a/bridge/primitives/sample/transform/sample_transform.py +++ b/bridge/primitives/sample/transform/sample_transform.py @@ -6,7 +6,6 @@ if TYPE_CHECKING: from bridge.display import DisplayEngine from bridge.primitives.element.data.cache_mechanism import CacheMechanism - from bridge.primitives.element.element_type import ElementType from bridge.primitives.sample import Sample @@ -15,7 +14,7 @@ class SampleTransform(ABC): def __call__( self, sample: Sample, - cache_mechanisms: Dict[ElementType, CacheMechanism] | None, + cache_mechanisms: Dict[str, CacheMechanism] | None, display_engine: DisplayEngine | None, ) -> Sample: pass diff --git a/bridge/primitives/sample/transform/vision.py b/bridge/primitives/sample/transform/vision.py index e211191..440e1a8 100644 --- a/bridge/primitives/sample/transform/vision.py +++ b/bridge/primitives/sample/transform/vision.py @@ -8,7 +8,6 @@ from PIL.Image import Image from bridge.primitives.element.element import Element -from bridge.primitives.element.element_type import ElementType from bridge.primitives.sample import Sample from bridge.primitives.sample.transform.sample_transform import SampleTransform from bridge.utils import optional_dependencies @@ -33,7 +32,7 @@ def __init__( def __call__( self, sample: Sample, - cache_mechanisms: Dict[ElementType, CacheMechanism], + cache_mechanisms: Dict[str, CacheMechanism], display_engine: DisplayEngine | None, ) -> Sample: elements = copy.deepcopy(sample.elements) @@ -51,32 +50,32 @@ def __call__( sample = Sample(elements=elements, display_engine=display_engine) return sample - def 
_elements_to_albm(self, elements: Dict[ElementType, List[Element]]): + def _elements_to_albm(self, elements: Dict[str, List[Element]]): albm_dict = {} - assert ElementType.image in elements, "Can't apply albumentations without an image element." - for i, img_element in enumerate(elements[ElementType.image]): + assert "image" in elements, "Can't apply albumentations without an image element." + for i, img_element in enumerate(elements["image"]): data = img_element.data albm_dict[f"image_{i}"] = data albm_dict.update({"image": albm_dict["image_0"], "bboxes": [], "keypoints": []}) - if ElementType.bbox in elements: - for i, bbox_element in enumerate(elements[ElementType.bbox]): + if "bbox" in elements: + for i, bbox_element in enumerate(elements["bbox"]): data: BoundingBox = bbox_element.data albm_dict[f"bboxes_{i}"] = [[*(data.coords.tolist()), data.class_label]] - if ElementType.keypoint in elements: + if "keypoint" in elements: raise NotImplementedError("Didn't fully implement keypoints in albumentations yet.") - # for i, keypoint in enumerate(elements[ElementType.keypoint]): + # for i, keypoint in enumerate(elements["keypoint"]): # keypoint: Keypoint # albm_dict[f"keypoints_{i}"] = keypoint.coords return albm_dict def _albm_to_elements( self, - elements: Dict[ElementType, List[Element]], + elements: Dict[str, List[Element]], albm_dict: Dict[str, Any], - cache_mechanisms: Dict[ElementType, CacheMechanism], + cache_mechanisms: Dict[str, CacheMechanism], ): - albm_to_elements = {"bboxes": ElementType.bbox, "keypoints": ElementType.keypoint, "image": ElementType.image} + albm_to_elements = {"bboxes": "bbox", "keypoints": "keypoint", "image": "image"} del albm_dict["image"] del albm_dict["bboxes"] del albm_dict["keypoints"] @@ -97,14 +96,14 @@ def _albm_to_elements( @staticmethod def _update_element_with_transformed_data( albm_data: Union[Image, List, np.ndarray], - cache_mechanisms: Dict[ElementType, CacheMechanism], + cache_mechanisms: Dict[str, CacheMechanism], curr_element: Element, ): - if curr_element.etype == ElementType.bbox: + if curr_element.etype == "bbox": albm_data = np.array(albm_data[0]) new_element_data = BoundingBox(albm_data[:4], class_label=albm_data[4]) # noqa new_category = "obj" - elif curr_element.etype == ElementType.image: + elif curr_element.etype == "image": if isinstance(albm_data, np.ndarray): new_category = "image" else: diff --git a/bridge/providers/dataset_provider.py b/bridge/providers/dataset_provider.py index 43358c2..6fc900d 100644 --- a/bridge/providers/dataset_provider.py +++ b/bridge/providers/dataset_provider.py @@ -9,7 +9,6 @@ if TYPE_CHECKING: from bridge.display import DisplayEngine from bridge.primitives.element.data.cache_mechanism import CacheMechanism - from bridge.primitives.element.element_type import ElementType class DatasetProvider(ABC, Generic[D, S]): @@ -17,6 +16,6 @@ class DatasetProvider(ABC, Generic[D, S]): def build_dataset( self, display_engine: DisplayEngine[D, S] = SimplePrints(), - cache_mechanisms: Dict[ElementType, CacheMechanism | None] | None = None, + cache_mechanisms: Dict[str, CacheMechanism | None] | None = None, ) -> D: pass diff --git a/bridge/providers/text.py b/bridge/providers/text.py index 05bf0f1..292b256 100644 --- a/bridge/providers/text.py +++ b/bridge/providers/text.py @@ -8,7 +8,6 @@ from bridge.primitives.dataset.singular_dataset import SingularDataset from bridge.primitives.element.data.load_mechanism import LoadMechanism from bridge.primitives.element.element import Element -from 
bridge.primitives.element.element_type import ElementType from bridge.primitives.sample.singular_sample import SingularSample from bridge.providers.dataset_provider import DatasetProvider from bridge.utils import download_and_extract_archive @@ -35,7 +34,7 @@ def __init__(self, root: str | os.PathLike, split: str = "train", download: bool def build_dataset( self, display_engine: DisplayEngine[SingularDataset, SingularSample] = SimplePrints(), - cache_mechanisms: Dict[ElementType, CacheMechanism] = None, + cache_mechanisms: Dict[str, CacheMechanism] = None, ) -> SingularDataset: samples = [] annotations = [] @@ -47,14 +46,14 @@ def build_dataset( text_element = Element( element_id=f"text_{textfile.stem}", sample_id=textfile.stem, - etype=ElementType.text, + etype="text", load_mechanism=load_mechanism, ) load_mechanism = LoadMechanism(ClassLabel(class_idx, class_dir.name), category="obj") label_element = Element( element_id=f"label_{textfile.stem}", sample_id=textfile.stem, - etype=ElementType.class_label, + etype="class_label", load_mechanism=load_mechanism, ) samples.append(text_element) diff --git a/bridge/providers/vision.py b/bridge/providers/vision.py index 8a8d046..80f3a47 100644 --- a/bridge/providers/vision.py +++ b/bridge/providers/vision.py @@ -5,13 +5,11 @@ from typing import TYPE_CHECKING, Dict import numpy as np -from tqdm import tqdm from bridge.display.basic import SimplePrints from bridge.primitives.dataset import SingularDataset from bridge.primitives.element.data.load_mechanism import LoadMechanism from bridge.primitives.element.element import Element -from bridge.primitives.element.element_type import ElementType from bridge.primitives.sample.singular_sample import SingularSample from bridge.providers.dataset_provider import DatasetProvider from bridge.utils import download_and_extract_archive, optional_dependencies @@ -26,9 +24,7 @@ class ImageFolder(DatasetProvider[SingularDataset, SingularSample]): def __init__(self, root: str | os.PathLike): self._root = root - def build_dataset( - self, display_engine: DisplayEngine = None, cache_mechanisms: Dict[ElementType, CacheMechanism] = None - ): + def build_dataset(self, display_engine: DisplayEngine = None, cache_mechanisms: Dict[str, CacheMechanism] = None): images = [] classes = [] for i, class_dir in enumerate(sorted(Path(self._root).iterdir())): @@ -37,14 +33,14 @@ def build_dataset( img_element = Element( element_id=f"image_{sample_id}", sample_id=sample_id, - etype=ElementType.image, + etype="image", load_mechanism=LoadMechanism.from_url_string(str(img_file), category="image"), metadata={"filename": img_file.name}, ) class_element = Element( element_id=f"class_{i}", sample_id=sample_id, - etype=ElementType.class_label, + etype="class_label", load_mechanism=LoadMechanism(ClassLabel(i, class_dir.name), category="obj"), metadata={"filename": img_file.name}, ) @@ -95,12 +91,12 @@ def __init__(self, root: str | os.PathLike, split: str = "train", img_source: st def build_dataset( self, display_engine: DisplayEngine = SimplePrints(), - cache_mechanisms: Dict[ElementType, CacheMechanism] = None, + cache_mechanisms: Dict[str, CacheMechanism] = None, ): img_id_list = list(sorted(self._coco.imgs.keys())) images = [] bboxes = [] - for img_id in tqdm(img_id_list): + for img_id in img_id_list: coco_img = self._coco.loadImgs(img_id)[0] img_file = self._images_dir / coco_img["file_name"] @@ -112,7 +108,7 @@ def build_dataset( img_element = Element( element_id=f"{img_id}_img", sample_id=img_id, - etype=ElementType.image, + 
etype="image", load_mechanism=load_mechanism, metadata={k: v for k, v in coco_img.items() if k not in Element.keys and k != "id"}, ) @@ -125,7 +121,7 @@ def build_dataset( bbox_element = Element( element_id=f"{img_id}_{coco_ann_dict['id']}", sample_id=img_id, - etype=ElementType.bbox, + etype="bbox", load_mechanism=load_mechanism, metadata={ "category_id": category_id, @@ -149,20 +145,20 @@ def __init__(self, root: str | os.PathLike, train: bool = True, download: bool = def build_dataset( self, display_engine: DisplayEngine = SimplePrints(), - cache_mechanisms: Dict[ElementType, CacheMechanism | None] | None = None, + cache_mechanisms: Dict[str, CacheMechanism | None] | None = None, ): sample_list = [] annotation_list = [] for i, (img, target) in enumerate(zip(self._ds.data, self._ds.targets)): img_element = Element( element_id=i, - etype=ElementType.image, + etype="image", sample_id=i, load_mechanism=LoadMechanism(url_or_data=img, category="image"), ) label_element = Element( element_id=f"label_{i}", - etype=ElementType.class_label, + etype="class_label", sample_id=i, load_mechanism=LoadMechanism( url_or_data=ClassLabel(class_idx=target, class_name=self._ds.classes[target]), diff --git a/bridge/utils/download_and_extract_archive.py b/bridge/utils/download_and_extract_archive.py index 54f5f9c..09c9d3d 100644 --- a/bridge/utils/download_and_extract_archive.py +++ b/bridge/utils/download_and_extract_archive.py @@ -19,12 +19,19 @@ from tqdm import tqdm -def _urlretrieve(url: str, filename: Union[str, pathlib.Path], chunk_size: int = 1024 * 32) -> None: +def _urlretrieve( + url: str, filename: Union[str, pathlib.Path], chunk_size: int = 1024 * 32, progress_bar: bool = False +) -> None: with urllib.request.urlopen(urllib.request.Request(url)) as response: - with open(filename, "wb") as fh, tqdm(total=response.length) as pbar: - while chunk := response.read(chunk_size): - fh.write(chunk) - pbar.update(len(chunk)) + with open(filename, "wb") as fh: + if progress_bar: + with tqdm(total=response.length) as pbar: + for chunk in iter(lambda: response.read(chunk_size), b""): + fh.write(chunk) + pbar.update(len(chunk)) + else: + for chunk in iter(lambda: response.read(chunk_size), b""): + fh.write(chunk) def calculate_md5(fpath: Union[str, pathlib.Path], chunk_size: int = 1024 * 1024) -> str: @@ -219,10 +226,11 @@ def _extract_tar( for member in tar_members: member_path = os.path.join(to_path, member.name) if member_path not in existing_files: - print(f"Extracting {member.name} to {to_path}") + # print(f"Extracting {member.name} to {to_path}") tar.extract(member, to_path) else: - print(f"{member.name} already exists in {to_path}, skipping.") + pass + # print(f"{member.name} already exists in {to_path}, skipping.") _ZIP_COMPRESSION_MAP: Dict[str, int] = { diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst index 71295cc..689c3cf 100644 --- a/docs/source/getting_started.rst +++ b/docs/source/getting_started.rst @@ -1,8 +1,5 @@ -Getting started +Getting Started =============== - - - Installation ------------ @@ -27,14 +24,57 @@ You can install the latest version of Bridge's from PyPI. It comes in a few flav $ pip install bridge-ds[dev] -Demos ------ - -For high-level demos to show off Bridge's capabilities, consider -browsing the following notebooks: - -#. :doc:`Quick and easy data exploration ` -#. 
:doc:`From sources, through augmentations, to Pytorch `
+Key Concepts
+------------
 
-For a deeper understanding of Bridge, and to connect your custom datasets and data types,
- proceed to the :doc:`user_guide` section.
\ No newline at end of file
+In this section you will learn the basics of Bridge. Start by
+reading about the key concepts, and then proceed to
+the guides in the :doc:`user_guide` section.
+
+Element
+^^^^^^^
+
+An Element is the basic unit of data in a Dataset, ranging from raw data
+objects such as images, text, and audio, to various annotations such
+as class labels, bounding boxes, and segmentation maps. In
+essence,
+anything that constitutes a piece of information within the
+dataset can be an Element.
+
+Sample
+^^^^^^
+
+A Sample is a collection of Elements. It is our representation of
+a typical item-within-a-dataset.
+For example, an image with
+object detections constitutes a Sample, comprising a single image
+Element and multiple bounding box Elements.
+
+Dataset
+^^^^^^^
+
+A Dataset is a collection of Samples. It exposes the Table and
+Sample APIs.
+
+Table API
+^^^^^^^^^
+
+A general term for the set of functions and operators exposed by
+the
+Dataset which allows users to perform
+high-level operations with a user experience similar to Pandas -
+assign, query, sort, map, etc. In short, an API that
+allows users to treat any dataset as a DataFrame, where **every
+row is an element.**
+
+Sample API
+^^^^^^^^^^
+
+A general term for the set of functions and operators exposed by
+the Dataset which allows users to work on
+individual examples in the dataset in a meaningful manner.
+If the Table API is meant for high-level
+dataset management, then the Sample API is used for low-level
+operations
+like loading,
+caching, and transforming raw data (e.g. pixels, strings).
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 4c24beb..0a870f0 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -15,6 +15,7 @@ with a Pandas-like experience, and handle individual samples and raw data using
 class-based, easy-to-work-with interface.
 
 .. 
toctree:: + :maxdepth: 2 getting_started user_guide diff --git a/docs/source/notebooks/vision/custom_data/dataset_provider.ipynb b/docs/source/notebooks/vision/custom_data/dataset_provider.ipynb index d86be11..c0fa95c 100644 --- a/docs/source/notebooks/vision/custom_data/dataset_provider.ipynb +++ b/docs/source/notebooks/vision/custom_data/dataset_provider.ipynb @@ -115,7 +115,7 @@ "\n", " Parameters:\n", " - display_engine (DisplayEngine): The display engine to use for visualization.\n", - " - cache_mechanisms (Dict[ElementType, CacheMechanism | None] | None): Cache mechanisms for different types of elements.\n", + " - cache_mechanisms (Dict[str, CacheMechanism | None] | None): Cache mechanisms for different types of elements.\n", " NOTE: Learn more about cache mechanisms and display engines in more advanced tutorials.\n", " \"\"\"\n", " # Implement dataset building logic here\n", @@ -211,7 +211,6 @@ "from bridge.primitives.dataset.singular_dataset import SingularDataset\n", "from bridge.primitives.element.data.load_mechanism import LoadMechanism\n", "from bridge.primitives.element.element import Element\n", - "from bridge.primitives.element.element_type import ElementType\n", "from bridge.utils.data_objects import ClassLabel\n", "\n", "\n", @@ -243,14 +242,14 @@ " text_element = Element(\n", " element_id=f\"text_{textfile.stem}\",\n", " sample_id=textfile.stem,\n", - " etype=ElementType.text,\n", + " etype=\"text\",\n", " load_mechanism=load_mechanism,\n", " )\n", " load_mechanism = LoadMechanism(ClassLabel(class_idx, class_dir.name), category=\"obj\")\n", " label_element = Element(\n", " element_id=f\"label_{textfile.stem}\",\n", " sample_id=textfile.stem,\n", - " etype=ElementType.class_label,\n", + " etype=\"class_label\",\n", " load_mechanism=load_mechanism,\n", " )\n", " samples.append(text_element)\n", @@ -285,7 +284,7 @@ "text_element = Element(\n", " element_id=f\"text_{textfile.stem}\",\n", " sample_id=textfile.stem,\n", - " etype=ElementType.text,\n", + " etype='text',\n", " load_mechanism=load_mechanism,\n", ")\n", "```\n", @@ -300,7 +299,7 @@ "label_element = Element(\n", " element_id=f\"label_{textfile.stem}\",\n", " sample_id=textfile.stem,\n", - " etype=ElementType.class_label,\n", + " etype='class_label',\n", " load_mechanism=load_mechanism,\n", ")\n", "```\n", diff --git a/docs/source/notebooks/vision/custom_data/display_engine.ipynb b/docs/source/notebooks/vision/custom_data/display_engine.ipynb index 02544d7..e501a4d 100644 --- a/docs/source/notebooks/vision/custom_data/display_engine.ipynb +++ b/docs/source/notebooks/vision/custom_data/display_engine.ipynb @@ -72,7 +72,7 @@ "metadata": {}, "source": [ "## Class Structure\n", - "We can improve this viz by writing our own DisplayEngine. For starters, let's see which methods we need to implement:\n", + "We can improve this \"viz\" by writing our own DisplayEngine. 
For starters, let's see which methods we need to implement:\n", "\n", "```python\n", "class MyDisplayEngine(DisplayEngine):\n", @@ -93,7 +93,7 @@ "\n", " def show_dataset(\n", " self,\n", - " dataset: D,\n", + " dataset,\n", " element_plot_kwargs: Dict[str, Any] | None = None,\n", " sample_plot_kwargs: Dict[str, Any] | None = None,\n", " dataset_plot_kwargs: Dict[str, Any] | None = None,\n", @@ -118,16 +118,15 @@ "import panel as pn\n", "\n", "from bridge.display.basic import DisplayEngine\n", - "from bridge.primitives.element.element_type import ElementType\n", "\n", "pn.extension()\n", "\n", "\n", "class TextClassification(DisplayEngine):\n", " def show_element(self, element, element_plot_kwargs: Dict[str, Any] | None = None):\n", - " if element.etype == ElementType.class_label:\n", + " if element.etype == \"class_label\":\n", " return pn.pane.Markdown(element.to_pd_series().to_frame().T.to_markdown())\n", - " elif element.etype == ElementType.text:\n", + " elif element.etype == \"text\":\n", " return pn.pane.Markdown(element.data)\n", " else:\n", " raise NotImplementedError()\n", @@ -198,9 +197,9 @@ "\n", "class TextClassification(DisplayEngine):\n", " def show_element(self, element, element_plot_kwargs: Dict[str, Any] | None = None):\n", - " if element.etype == ElementType.class_label:\n", + " if element.etype == \"class_label\":\n", " return pn.pane.Markdown(element.to_pd_series().to_frame().T.to_markdown())\n", - " elif element.etype == ElementType.text:\n", + " elif element.etype == \"text\":\n", " return pn.pane.Markdown(element.data)\n", " else:\n", " raise NotImplementedError()\n", @@ -211,9 +210,7 @@ " element_plot_kwargs: Dict[str, Any] | None = None,\n", " sample_plot_kwargs: Dict[str, Any] | None = None,\n", " ):\n", - " annotations_md = pd.DataFrame(\n", - " [ann.to_pd_series() for ann in sample.annotations[ElementType.class_label]]\n", - " ).to_markdown()\n", + " annotations_md = pd.DataFrame([ann.to_pd_series() for ann in sample.annotations[\"class_label\"]]).to_markdown()\n", " text_display = pn.pane.Markdown(sample.data)\n", " return pn.Column(\"# Sample Text:\", text_display, \"# Annotations Data:\", annotations_md)\n", "\n", @@ -258,9 +255,9 @@ "\n", "class TextClassification(DisplayEngine):\n", " def show_element(self, element, element_plot_kwargs: Dict[str, Any] | None = None):\n", - " if element.etype == ElementType.class_label:\n", + " if element.etype == \"class_label\":\n", " return pn.pane.Markdown(element.to_pd_series().to_frame().T.to_markdown())\n", - " elif element.etype == ElementType.text:\n", + " elif element.etype == \"text\":\n", " return pn.pane.Markdown(element.data)\n", " else:\n", " raise NotImplementedError()\n", @@ -271,9 +268,7 @@ " element_plot_kwargs: Dict[str, Any] | None = None,\n", " sample_plot_kwargs: Dict[str, Any] | None = None,\n", " ):\n", - " annotations_md = pd.DataFrame(\n", - " [ann.to_pd_series() for ann in sample.annotations[ElementType.class_label]]\n", - " ).to_markdown()\n", + " annotations_md = pd.DataFrame([ann.to_pd_series() for ann in sample.annotations[\"class_label\"]]).to_markdown()\n", " text_display = pn.pane.Markdown(sample.data)\n", " return pn.Column(\"# Sample Text:\", text_display, \"# Annotations Data:\", annotations_md)\n", "\n", diff --git a/docs/source/notebooks/vision/custom_data/load_mechanism.ipynb b/docs/source/notebooks/vision/custom_data/load_mechanism.ipynb index 5f24724..1d0c40f 100644 --- a/docs/source/notebooks/vision/custom_data/load_mechanism.ipynb +++ 
b/docs/source/notebooks/vision/custom_data/load_mechanism.ipynb
@@ -82,7 +82,7 @@
     "\n",
     "In this tutorial we will learn about the **LoadMechanism**, Bridge's way of loading raw data from different sources.\n",
     "\n",
-    "A quick reminder: to access the raw data within each element, we need to use the **SampleAPI**. The column `data` in the **TableAPI** usually (but not always) contains a reference to the data rather than the data itself:\n"
+    "A quick reminder: to access the raw data within each element, we need to use the **SampleAPI** with `sample.data / element.data`. The column `data` in the **TableAPI** usually (but not always) contains a reference to the data rather than the data itself:\n"
    ]
   },
   {
@@ -100,7 +100,7 @@
    "id": "8",
    "metadata": {},
    "source": [
-    "When we want to access data for a given element, we need to call the `element.data` property. In COCO, we have elements for _images_ and for _bboxes_. Because COCO is a **SingularDataset**, every sample has a special element, in this case the image, and annotation elements, in this case the bboxes."
+    "When we want to access data for a given element, we need to call the `element.data` property. In COCO, we have elements for _images_ and for _bboxes_. Because COCO is a **SingularDataset**, every sample has a special element, in this case the image, and we can access its data directly with `sample.data`."
    ]
   },
   {
@@ -122,7 +122,7 @@
    "id": "10",
    "metadata": {},
    "source": [
-    "Every element holds a **LoadMechanism**, an object responsible for loading data from different sources. In the case of images, `element.data` will perform an HTTP request and load the image in the response. In case for bboxes, which already exist in-memory (i.e. we can see them directly in the `annotations` table, `element.data` will simply load the stored object.\n",
+    "Every element holds a **LoadMechanism**, an object responsible for loading data from different sources. In this case, for images, `element.data` will perform an HTTP request and load the image from the response. For bboxes, which already exist in-memory (note that we can see them directly in the `annotations` table), `element.data` will simply load the stored object.\n",
     "\n",
     "The **LoadMechanism** is defined by two variables:"
    ]
   },
@@ -160,7 +160,7 @@
    "source": [
     "## In summary\n",
     "1. Bridge loads data lazily, only when `element.data` is called\n",
-    "2. The loading mechanism accepts **url_or_data** which defines where to load from (or whatto load), and **category** which defines _how_ to load it."
+    "2. The loading mechanism accepts **url_or_data**, which defines where to load from (or what to load), and **category**, which defines _how_ to load it."
    ]
   }
 ],
diff --git a/docs/source/notebooks/vision/fundamentals/coco_eda_demo.ipynb b/docs/source/notebooks/vision/fundamentals/coco_eda_demo.ipynb
index afedf15..8150e1b 100644
--- a/docs/source/notebooks/vision/fundamentals/coco_eda_demo.ipynb
+++ b/docs/source/notebooks/vision/fundamentals/coco_eda_demo.ipynb
@@ -107,7 +107,7 @@
    "id": "8",
    "metadata": {},
    "source": [
-    "In the previous tutorials we've made a brief introduction into using the Sample and Table APIs. In this demo we'll perform a short step-by-step analysis on COCO, using different toolings available in BridgeDS; emphasizing its ease of use."
+    "In this demo we'll perform a short step-by-step analysis on COCO, using different tools available in BridgeDS." 
    ]
   },
   {
@@ -144,7 +144,7 @@
    "id": "12",
    "metadata": {},
    "source": [
-    "In the annotations table, class names are represented by numerical labels, which may impede readability during dataset analysis. To address this, we may choose to use a third-party file that maps these integer labels to their corresponding text labels."
+    "Observe the annotations table: the class names (within the BoundingBox objects in the `data` column) are represented by numerical labels, which may impede readability during data analysis. To address this, we may choose to use a third-party file that maps these integer labels to their corresponding text labels."
    ]
   },
   {
@@ -168,7 +168,7 @@
    "id": "14",
    "metadata": {},
    "source": [
-    "Like we've seen in the Table API tutorial, we can use `ds.assign_annotations` to replace our bounding box class labels with new ones:"
+    "We can use `ds.assign_annotations` to replace our bounding box class labels with new ones:"
    ]
   },
   {
@@ -199,7 +199,7 @@
    "id": "16",
    "metadata": {},
    "source": [
-    "Another issue is that `ds.samples.date_captured` is actually made of strings, instead of pd.Timestamps. Let's fix that:"
+    "Another issue is that `ds.samples.date_captured` is actually made of strings, instead of `pd.Timestamp`. Let's fix that:"
    ]
   },
   {
@@ -223,7 +223,7 @@
     "\n",
     "```\n",
     "for sample in samples:\n",
-    "    for annotation in samples:\n",
+    "    for annotation in sample:\n",
     "        \n",
     "```\n",
     "\n",
@@ -235,10 +235,10 @@
    "id": "19",
    "metadata": {},
    "source": [
-    "## Plotting some graphs\n",
-    "With our dataframes now in appropriate formats, let's generate some basic plots to gain insights into our data.\n",
+    "## Plotting\n",
+    "With our tables now in appropriate formats, let's generate some basic plots to gain insights into our data.\n",
     "\n",
-    "Note: While our preferred plotting API is [hvplot](https://hvplot.holoviz.org/), [Pandas Plotting](https://pandas.pydata.org/docs/user_guide/visualization.html) remains a viable option."
+    "Note: While our preferred plotting API is [hvplot](https://hvplot.holoviz.org/), [Pandas Plotting](https://pandas.pydata.org/docs/user_guide/visualization.html) remains a viable option, as are other options that support the Pandas API."
    ]
   },
   {
@@ -346,9 +346,9 @@
    "id": "28",
    "metadata": {},
    "source": [
-    "To gain a deeper understanding of the image and the size of the dining table annotation in question, we introduce DisplayEngines, which you've seen briefly in the Sample API tutorial. These objects are injected into Datasets Samples, and Elements, enabling us to manipulate the behavior of the `ds.show() / sample.show / element.show()` methods.\n",
+    "To gain a deeper understanding of the image and the size of the dining table annotation in question, we introduce DisplayEngines. These objects are injected into Datasets, Samples, and Elements, enabling us to manipulate the behavior of the `ds.show() / sample.show() / element.show()` methods.\n",
     "\n",
-    "By default, the **SimplePrints** engine is utilized. Let's switch to the **Holoviews** engine for enhanced visualization:"
+    "By default, the **SimplePrints** engine is used. 
Let's switch to the **Holoviews** engine for enhanced visualization:"
    ]
   },
   {
diff --git a/docs/source/notebooks/vision/fundamentals/sample_api.ipynb b/docs/source/notebooks/vision/fundamentals/sample_api.ipynb
index d67abf9..5e8746b 100644
--- a/docs/source/notebooks/vision/fundamentals/sample_api.ipynb
+++ b/docs/source/notebooks/vision/fundamentals/sample_api.ipynb
@@ -104,9 +104,12 @@
     "In BridgeDS, we use two complementing approaches to view datasets. We call them the **Sample API** and the **Table API**. This tutorial is about the former.\n",
     "\n",
     "Sample API can be loosely described as:\n",
-    "> A dataset can be viewed as a collection of samples, where samples are pythonic objects (Sample) that contain a collection of elements.\n",
     "\n",
-    "Let's demonstrate how to use it:"
+    " A dataset can be viewed as a collection of samples, where each sample is a pythonic object (called Sample) that contains a collection of elements.\n",
+    "\n",
+    "In case any of the terms 'dataset', 'sample' or 'element' is foreign to you, we recommend first going back to the Key Concepts section.\n",
+    "\n",
+    "Let's demonstrate how to use the **Sample API**:"
    ]
   },
   {
@@ -138,11 +141,13 @@
    "metadata": {},
    "source": [
     "## Properties\n",
-    "The sample object itself is rather lean, it exposes only its _id_, its _elements_, and its _display_engine_.\n",
+    "The sample object is fairly minimal, exposing only its _id_, _elements_, and _display_engine_ properties, with limited methods available. This design reflects its role as a container for elements rather than a data object itself.\n",
+    "\n",
+    "Now, let’s shift our focus to the elements:\n",
     "\n",
-    "If you recall, _elements_ in BridgeDS can be anything - from raw data objects like images or text, to annotations such as bboxes, segmaps or class labels.\n",
+    "As a reminder, in BridgeDS, _elements_ can be any type of data—ranging from raw objects like images or text, to annotations like bounding boxes (bboxes), segmentation maps (segmaps), or class labels.\n",
     "\n",
-    "Let's see what elements our current sample has:\n"
+    "Let’s take a look at the elements contained in our current sample:"
    ]
   },
   {
@@ -163,7 +168,7 @@
    "id": "12",
    "metadata": {},
    "source": [
-    "We see one image element and two bboxes. Having one \"raw data\" element (the image) and multiple \"annotation\" elements is actually a common use-case. For this reason, we implement COCO using a sub-class of Sample called **SingularSample**, that exposes a more convenient API where the sample has a special element available with `sample.element`, and the rest of the elements are available at `sample.annotations`:"
+    "We observe one image element and two bbox elements. It's common to have samples composed of a single element representing raw data (the image) alongside multiple elements representing annotations. To accommodate this frequent use case, we implement COCO samples using a subclass of **Sample** called **SingularSample**. 
This subclass provides a more convenient API, where the main element is accessible via `sample.element`, and the remaining elements are organized under `sample.annotations`:" ] }, { @@ -197,7 +202,7 @@ "metadata": {}, "source": [ "## The DisplayEngine\n", - "We will elaborate on how the DisplayEngine works in a separate tutorial, but for basic purposes it's worth noting that both the Dataset and the Sample objects expose a `.show()` method, which takes advantage of the DisplayEngine and produces the following:" + "We will elaborate on how the DisplayEngine works in a separate tutorial, but for basic usage it's worth noting that both the Dataset and the Sample objects expose a `.show()` method, which takes advantage of the DisplayEngine and produces the following:" ] }, { @@ -226,7 +231,7 @@ "id": "18", "metadata": {}, "source": [ - "As you can see, our class labels are integers rather than strings, because that's how the raw data is present in the COCO dataset. If you would like to learn how to change this, or in general how to perform dataset-wide operations, proceed to our next tutorial about the **Table API**" + "As you can see, our class labels are represented as integers rather than strings. This is because the raw COCO dataset stores class labels in this format. If you'd like to learn how to convert these integers into readable strings, or explore how to perform operations across the entire dataset, check out our next tutorial on the **Table API**." ] } ], diff --git a/docs/source/notebooks/vision/fundamentals/table_api.ipynb b/docs/source/notebooks/vision/fundamentals/table_api.ipynb index 49a0fb8..7f75b14 100644 --- a/docs/source/notebooks/vision/fundamentals/table_api.ipynb +++ b/docs/source/notebooks/vision/fundamentals/table_api.ipynb @@ -107,8 +107,9 @@ "source": [ "In BridgeDS, we use two complementing approaches to view datasets. We call them the **Sample API** and the **Table API**. This tutorial is about the latter.\n", "\n", - "The table API can be loosely described as:\n", - "> A dataset can be viewed as a table where every row represents a single element. Elements have unique ids but share the sample_id with other elements from the same Sample." + "The Table API can be described as:\n", + "\n", + " A dataset can be viewed as a table where every row represents a single element. Elements have a unique element_id but share their sample_id with other elements from the same Sample. The element_id and sample_id columns serve as the table's multi-index." ] }, { @@ -117,7 +118,7 @@ "metadata": {}, "source": [ "## Tables\n", - "Like in the previous tutorial, we semantically split the elements into two groups: **ds.samples** containing images and **ds.annotations** containing bboxes:" + "Like in the previous tutorial, we semantically split the elements into two groups: `ds.samples` containing images and `ds.annotations` containing bboxes:" ] }, { @@ -145,7 +146,10 @@ "id": "12", "metadata": {}, "source": [ - "## Filter\n", + "## Methods\n", + "The Table API is designed to expose callables that accept Pandas DataFrames as arguments, due to their simple and familiar API. The following sections showcase methods that allow users to perform different actions on Datasets, and these methods accept tuples of DataFrames (samples, annotations).\n", + "\n", + "### Filter\n", "Using tables allows us to easily filter out images or bboxes using familiar Pandas syntax. 
Note that when filtering samples, BridgeDS automatically filters out corresponding annotations:" ] }, @@ -175,7 +179,7 @@ "id": "14", "metadata": {}, "source": [ - "## Assign\n", + "### Assign\n", "We can assign new columns to either `ds.samples` or `ds.annotations` using familiar syntax. Let's assign the value `n_bboxes` to every sample:" ] }, @@ -189,7 +193,7 @@ "ds = ds.assign_samples(\n", " n_bboxes=lambda samples, anns: anns.groupby(\"sample_id\")\n", " .size()\n", - " .reindex(samples.index.get_level_values(\"sample_id\"))\n", + " .reindex(samples.index.get_level_values(\"sample_id\"), fill_value=0)\n", " .values\n", ")\n", "ds.samples.head()" @@ -200,7 +204,7 @@ "id": "16", "metadata": {}, "source": [ - "## Sorting\n", + "### Sorting\n", "We can sort the tables using familiar Pandas syntax:" ] }, @@ -220,7 +224,7 @@ "id": "18", "metadata": {}, "source": [ - "Note that if we sort the samples table, we can change the positional index used by the Sample API (ds.iget). The next cell will show the dataset in order from most bboxes per image to least:" + "Note that if we sort the samples table, we can change the positional index used by the Sample API (ds.iget) which dictates the order of the samples below. The next cell will show the dataset in order from most bboxes per image to least:" ] }, { @@ -232,14 +236,6 @@ "source": [ "sorted_ds.show()" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "20", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/docs/source/notebooks/vision/processing_data/cache_mechanism.ipynb b/docs/source/notebooks/vision/processing_data/cache_mechanism.ipynb index fe998f3..0625e52 100644 --- a/docs/source/notebooks/vision/processing_data/cache_mechanism.ipynb +++ b/docs/source/notebooks/vision/processing_data/cache_mechanism.ipynb @@ -126,9 +126,9 @@ "source": [ "One way to speed this up is to use a `CacheMechanism`: an object that, once `image_element.data` is called once, stores the data in a different location (e.g. a local file or in-memory). This action is transparent to the user but making subsequent `.data` calls significantly faster. \n", "\n", - "In our scenario, we can assign a cache mechanism for every `ElementType`. The Dataset has two ElementTypes:\n", - "1. `ElementType.bbox` - already stored in-memory, we'll leave them as-is\n", - "2. `ElementType.image` - we want to cache them in the filesystem rather than as variables." + "In our scenario, we can assign a cache mechanism for every `etype`. The Dataset has two etypes:\n", + "1. `'bbox'` - already stored in-memory, no need to re-cache them\n", + "2. `'image'` - we want to cache them in the filesystem." ] }, { @@ -140,7 +140,6 @@ "source": [ "from bridge.primitives.element.data.cache_mechanism import CacheMechanism\n", "from bridge.primitives.element.data.uri_components import URIComponents\n", - "from bridge.primitives.element.element_type import ElementType\n", "\n", "root_dir = TMP_NOTEBOOK_ROOT / \"coco\"\n", "\n", @@ -148,10 +147,10 @@ "stream_ds = provider.build_dataset(\n", " display_engine=Holoviews(bbox_format=\"xywh\"),\n", " cache_mechanisms={\n", - " ElementType.image: CacheMechanism(\n", + " \"image\": CacheMechanism(\n", " root_uri=URIComponents.from_str(str(TMP_NOTEBOOK_ROOT / \"my_local_cache\")),\n", " ),\n", - " ElementType.bbox: None,\n", + " \"bbox\": None,\n", " },\n", ")\n", "stream_ds" @@ -234,9 +233,9 @@ " ...\n", "```\n", "\n", - "2. 
Update `ds.elements` when we call `element.data`, with the new LoadMechanism we got from `cache_mechanism.store()` (So the **TableAPI** will align with the new source)\n",
+    "2. Update the `ds.elements` table (from which `ds.samples` and `ds.annotations` are derived) when we call `element.data`, with the new LoadMechanism we got from `cache_mechanism.store()` (so the **TableAPI** will align with the new source)\n",
     "\n",
-    "In fact, every elements holds a reference to a CacheMechanism just like it holds a LoadMechanism. Using this knowledge, here is the actual code for `element.data`:\n",
+    "In fact, every element holds a reference to a CacheMechanism just like it holds a LoadMechanism. Using this knowledge, here is the actual code for `element.data`:\n",
     "\n",
     "```python\n",
     "@property\n",
@@ -278,7 +277,7 @@
     "flipped_ds = stream_ds.select_samples(lambda samples, anns: samples.index[:20]).transform_samples(\n",
     "    transform=transform,\n",
     "    cache_mechanisms={\n",
-    "        ElementType.image: CacheMechanism(\n",
+    "        \"image\": CacheMechanism(\n",
     "            URIComponents.from_str(str(TMP_NOTEBOOK_ROOT / \"flipped\")),\n",
     "        )\n",
     "    },\n",
diff --git a/docs/source/notebooks/vision/processing_data/sample_transform.ipynb b/docs/source/notebooks/vision/processing_data/sample_transform.ipynb
index fc763ca..528d5cb 100644
--- a/docs/source/notebooks/vision/processing_data/sample_transform.ipynb
+++ b/docs/source/notebooks/vision/processing_data/sample_transform.ipynb
@@ -95,7 +95,7 @@
     "\n",
     "Manipulating data in Bridge is done through SampleTransforms. If you recall, data is stored in Bridge Elements rather than Samples, but in many cases we want to transform all Elements in a Sample together (for example, crop an image and remove all bboxes outside of the crop).\n",
     "\n",
-    "Bridge allows SampleTransforms to be called in two contexts:\n",
+    "Bridge utilizes SampleTransforms in two contexts:\n",
     "\n",
     "- `new_sample = sample.transform(sample_transform)`\n",
     "- `new_ds = ds.transform_samples(sample_transform)` - iterate over all samples and transform each one."
@@ -168,7 +168,7 @@
    "id": "11",
    "metadata": {},
    "source": [
-    "See how `url_or_data` has changed for the flipped sample? Consider that the LoadMechanism for the original sample is just configured to just load an image from a URL; when we apply an augmentation, our new image is not the same as the source. To keep the new image we need to store it somehow, detached from the original source.\n",
+    "See how `url_or_data` has changed for the flipped sample? Consider that the LoadMechanism for the original sample is configured to just load an image from a URL; when we apply an augmentation, our new image is not the same as the source. 
diff --git a/docs/source/notebooks/vision/processing_data/source2tensors_demo.ipynb b/docs/source/notebooks/vision/processing_data/source2tensors_demo.ipynb
index 2798f95..b7f2f89 100644
--- a/docs/source/notebooks/vision/processing_data/source2tensors_demo.ipynb
+++ b/docs/source/notebooks/vision/processing_data/source2tensors_demo.ipynb
@@ -48,7 +48,6 @@
     "from bridge.display.vision import Holoviews\n",
     "from bridge.primitives.element.data.cache_mechanism import CacheMechanism\n",
     "from bridge.primitives.element.data.uri_components import URIComponents\n",
-    "from bridge.primitives.element.element_type import ElementType\n",
     "from bridge.utils import pmap\n",
     "\n",
     "hv.extension(\"bokeh\")\n",
@@ -89,7 +88,7 @@
    "source": [
     "# Demo: Data Processing - From Sources to Pytorch\n",
     "\n",
-    "In this demo, we'll be working with COCO-val. We began by loading it into Bridge Dataset, and we will proceed by applying data augmentations, visualizing the results, and after we approve if our augmentation pipeline we will finally convert this augmented Dataset into a training-ready PyTorch dataset."
+    "In this demo, we'll be working with COCO-val. We began by loading it into a Bridge Dataset, and we will proceed by applying data augmentations and visualizing the results; once we're satisfied with our augmentation pipeline, we will convert the augmented Dataset into a training-ready PyTorch dataset."
    ]
   },
@@ -151,12 +150,12 @@
     "\n",
     "# Cache the resulting augmented images into a local path ${TMP_NOTEBOOK_ROOT}/ds_augs\n",
     "caches = {\n",
-    "    ElementType.image: CacheMechanism(URIComponents.from_str(str(TMP_NOTEBOOK_ROOT / \"ds_augs\"))),\n",
+    "    \"image\": CacheMechanism(URIComponents.from_str(str(TMP_NOTEBOOK_ROOT / \"ds_augs\"))),\n",
     "}\n",
     "\n",
-    "# Function responsible for iteratin and applying the SampleTransform.\n",
+    "# Function responsible for iterating over the samples and applying the SampleTransform.\n",
     "# It could be as simple as `map`, but we can use a multi-process variant for better performance.\n",
-    "map_fn = functools.partial(pmap, backend=\"dataloader\")\n",
+    "map_fn = functools.partial(pmap, backend=\"dataloader\", progress_bar=False)\n",
     "\n",
     "with warnings.catch_warnings():\n",
     "    warnings.simplefilter(\"ignore\", category=UserWarning)  # hide \"low contrast\" warnings\n",
@@ -303,7 +302,7 @@
    "id": "20",
    "metadata": {},
    "source": [
-    "This time, we've lost significantly less annotations to the random crop. We can observe the samples manually as well, if we'd like:"
+    "This time, we've lost significantly fewer annotations to the random crop operation. We can observe the samples manually as well, if we'd like:"
    ]
   },
   {
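Since both the original and augmented Datasets expose the Table API, the "fewer annotations lost" observation can be quantified directly. A sketch, where `aug_ds` is a stand-in name for the augmented dataset built above:

```python
# Count how many bbox annotations survived the augmentation pipeline.
n_before = len(ds.annotations)
n_after = len(aug_ds.annotations)  # `aug_ds` stands in for the augmented dataset's variable name
print(f"annotations kept: {n_after}/{n_before} ({n_after / n_before:.1%})")
```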
@@ -324,7 +323,7 @@
     "### Converting to tensors\n",
     "At this point, we're satisfied with our augmented Dataset. The next step is converting this dataset into viable input for a deep learning model - that is, converting the dataset to tensors. As our engine of choice we'll demonstrate with PyTorch, but this technique should generalize to other deep learning frameworks just as well.\n",
     "\n",
-    "NOTE: up until this point of the tutorial, we have no actual dependency on which DL framework we were using. All of this works just as well if our DL framework of choice were Keras or TF.\n",
+    "NOTE: up to this point in the tutorial, we have had no actual dependency on any particular deep learning framework. All of this works just as well if our framework of choice were Keras or TensorFlow.\n",
     "\n",
     "The transformation into tensors works exactly as before, with `transform_data`:"
    ]
   },
@@ -358,9 +357,7 @@
     "        transform=to_tensor_transform,\n",
     "        map_fn=map_fn,\n",
     "        display_engine=None,  # the output is not images anymore, so a Holoviews renderer won't work\n",
-    "        cache_mechanisms={\n",
-    "            ElementType.image: CacheMechanism(URIComponents.from_str(str(TMP_NOTEBOOK_ROOT / \"ds_tensors\")))\n",
-    "        },\n",
+    "        cache_mechanisms={\"image\": CacheMechanism(URIComponents.from_str(str(TMP_NOTEBOOK_ROOT / \"ds_tensors\")))},\n",
     "    )"
    ]
   },
@@ -436,7 +433,7 @@
    "id": "29",
    "metadata": {},
    "source": [
-    "As we can see, every item in `PytorchEngineDataset` is a dictionary with string keys that match ElementTypes (in our case, 'image' and 'bbox'); the values are lists of objects where the image is a torch.Tensor object, and the bboxes are a class we created, but you can use whatever you like. "
+    "As we can see, every item in `PytorchEngineDataset` is a dictionary with string keys that match etypes (in our case, 'image' and 'bbox'). The values are lists of objects: here the images are `torch.Tensor` objects and the bboxes are a class we created, but you can use whatever you like."
    ]
   }
  ],
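For completeness, a sketch of feeding these items to a training loop, assuming `PytorchEngineDataset` follows the standard torch `Dataset` protocol (as its use in this demo suggests); `torch_ds` and the collate function are illustrative, not part of the library:

```python
import torch
from torch.utils.data import DataLoader

def collate(batch):
    # Each item maps etype -> list of objects, e.g. {"image": [Tensor], "bbox": [bbox, ...]}.
    images = torch.stack([item["image"][0] for item in batch])  # one image per sample
    bboxes = [item["bbox"] for item in batch]                   # variable-length bbox lists
    return images, bboxes

loader = DataLoader(torch_ds, batch_size=4, collate_fn=collate)  # `torch_ds` stands in for the dataset above
images, bboxes = next(iter(loader))
```

A custom collate function is needed here because the number of bboxes varies per sample, so they cannot be stacked into a single tensor.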
diff --git a/docs/source/user_guide.rst b/docs/source/user_guide.rst
index cfcf3ff..123076e 100644
--- a/docs/source/user_guide.rst
+++ b/docs/source/user_guide.rst
@@ -1,79 +1,45 @@
 User Guide
 ==========
+Demos
+-----
 
-Key Concepts
-------------
+For high-level demos that showcase Bridge's capabilities, consider
+browsing the following notebooks:
 
-In this section you will learn the basics of Bridge. Start by
-reading about the key concepts, and then proceed to
-the guides below.
+Exploratory Data Analysis
+^^^^^^^^^^^^^^^^^^^^^^^^^
 
-Element
-^^^^^^^
+* :doc:`COCO Dataset EDA `
 
-An Element is the basic unit of data in a Dataset, from raw data
-objects such as images, text, audio, to various annotations such
-as class labels, bounding boxes, and segmentation maps. In
-essence,
-anything that constitutes a piece of information within the
-dataset can be an Element.
-
-Sample
-^^^^^^
-
-A Sample is a collection of Elements. It is our representation of
-a typical item-within-a-dataset.
-For example, an image with
-object detections constitutes a Sample, comprising a single image
-Element and multiple bounding box Elements.
-
-Dataset
-^^^^^^^
-
-A Dataset is a collection of Samples. It exposes the Table and
-Sample APIs.
-
-Table API
-^^^^^^^^^
-
-A general term for the set of functions and operators exposed by
-the
-Dataset which allows users to perform
-high-level operations with a user experience similar to Pandas -
-assign, query, sort, map, etc. In short, an API that
-allows users to treat any dataset as a DataFrame, where **every
-row is an element.**
-
-Sample API
-^^^^^^^^^^
-
-A general term for the set of functions and operators exposed by
-the Dataset which allows users to work on
-individual examples in the dataset in a meaningful manner.
-If the Table API is meant for high-level
-dataset management, then the Sample API is used for low-level
-operations
-like loading,
-caching, and transforming raw data (e.g. pixels, strings).
+Full Pipeline
+^^^^^^^^^^^^^
+* :doc:`COCO Dataset → Augmentations → PyTorch `
 
 Guides
 ------
-Fundametals
-^^^^^^^^^^^
+While the demos provide a glimpse into Bridge's capabilities, the following guides will introduce you to the core concepts and
+design of Bridge, which will allow you to tailor it to your own needs.
+
+Fundamentals
+^^^^^^^^^^^^
 
-* :doc:`Table API `
-* :doc:`Sample API `
+Start here to learn the basics of Bridge, namely how a Bridge Dataset is designed.
+
+* :doc:`The Sample API `
+* :doc:`The Table API `
 
 Custom datasets
 ^^^^^^^^^^^^^^^
 
-* :doc:`Load Mechanism `
-* :doc:`Dataset Provider `
-* :doc:`Display Engine `
+Learn how to create custom Bridge Datasets.
+
+* :doc:`Connect raw data to Bridge using Load Mechanisms `
+* :doc:`Create a custom dataset with Dataset Providers `
+* :doc:`Interact with your data using Display Engines `
 
 Processing data
 ^^^^^^^^^^^^^^^
 
-* :doc:`Cache Mechanism `
-* :doc:`Sample Transform `
+* :doc:`Augment data using Sample Transforms `
+* :doc:`Save intermediate data with Cache Mechanisms `
diff --git a/tests/core/test_dataset.py b/tests/core/test_dataset.py
index 524c1ee..6c44d3f 100644
--- a/tests/core/test_dataset.py
+++ b/tests/core/test_dataset.py
@@ -8,7 +8,6 @@
 from bridge.primitives.element.data.load_mechanism import LoadMechanism
 from bridge.primitives.element.data.uri_components import URIComponents
 from bridge.primitives.element.element import Element
-from bridge.primitives.element.element_type import ElementType
 from bridge.utils.constants import ELEMENT_COLS
 from bridge.utils.data_objects import ClassLabel
 
@@ -24,7 +23,7 @@ def dummy_elements():
         img_element = Element(
             element_id=i,
             sample_id=i,
-            etype=ElementType.image,
+            etype="image",
             load_mechanism=LoadMechanism(
                 url_or_data=np.random.randint(0, 255, size=(100, 100, 3)).astype("uint8"),
                 category="obj",
@@ -33,7 +32,7 @@
         lbl_element = Element(
             element_id=f"label_{i}",
             sample_id=i,
-            etype=ElementType.class_label,
+            etype="class_label",
             load_mechanism=LoadMechanism(url_or_data=ClassLabel(class_idx=np.random.randint(0, 10)), category="obj"),
         )
         elements.extend([img_element, lbl_element])
@@ -51,7 +50,7 @@ def dummy_elements_2():
         img_element = Element(
             element_id=100 + i,
             sample_id=50 + i,
-            etype=ElementType.image,
+            etype="image",
             load_mechanism=LoadMechanism(
                 url_or_data=np.random.randint(0, 255, size=(100, 100, 3)).astype("uint8"),
                 category="obj",
@@ -60,7 +59,7 @@
         lbl_element = Element(
             element_id=f"label_{100+i}",
             sample_id=50 + i,  # Adjusting sample_id to create overlap
-            etype=ElementType.class_label,
+            etype="class_label",
             load_mechanism=LoadMechanism(url_or_data=ClassLabel(class_idx=np.random.randint(0, 10)), category="obj"),
         )
         elements.extend([img_element, lbl_element])
@@ -80,7 +79,7 @@ def dummy_dataset_2(dummy_elements_2):
 @pytest.fixture
 def dummy_classification_dataset_local_cache(tmp_path, dummy_elements):
     cache = CacheMechanism(URIComponents.from_str(str(tmp_path)))
-    return Dataset.from_elements(elements=dummy_elements, cache_mechanisms={ElementType.image: cache})
+    return Dataset.from_elements(elements=dummy_elements, cache_mechanisms={"image": cache})
 
 
 def test_repr(dummy_dataset):
@@ -89,9 +88,9 @@ def test_select_by_etype(dummy_dataset):
-    ds = dummy_dataset.select(lambda e: e[ELEMENT_COLS.ETYPE] == ElementType.class_label)
+    ds = dummy_dataset.select(lambda e: e[ELEMENT_COLS.ETYPE] == "class_label")
     assert len(ds) == 100
-    assert ElementType.image not in ds.elements[ELEMENT_COLS.ETYPE].drop_duplicates().to_list()
+    assert "image" not in ds.elements[ELEMENT_COLS.ETYPE].drop_duplicates().to_list()
 
 
 def test_select_by_sample_id(dummy_dataset):
@@ -110,15 +109,15 @@ def test_data(dummy_dataset):
     labels = []
     for sample in dummy_dataset:
         data_dict = sample.data
-        images.extend(data_dict[ElementType.image])
-        labels.extend(data_dict[ElementType.class_label])
+        images.extend(data_dict["image"])
+        labels.extend(data_dict["class_label"])
     assert all([isinstance(img, np.ndarray) for img in images])
     assert all([img.shape == (100, 100, 3) for img in images])
     assert all([isinstance(label, ClassLabel) for label in labels])
 
 
 def test_cache(dummy_classification_dataset_local_cache):
-    cache_path = str(dummy_classification_dataset_local_cache._cache_mechanisms[ElementType.image]._root_uri)
+    cache_path = str(dummy_classification_dataset_local_cache._cache_mechanisms["image"]._root_uri)
 
     # assert starting conditions: cache dir exists and is empty, all elements have object data rather than URI
     assert Path(cache_path).exists() and Path(cache_path).is_dir()
@@ -132,9 +131,7 @@
         _ = sample.data
 
     image_schemes = (
-        dummy_classification_dataset_local_cache.elements.pipe(
-            lambda df_: df_.loc[df_[ELEMENT_COLS.ETYPE] == ElementType.image]
-        )
+        dummy_classification_dataset_local_cache.elements.pipe(lambda df_: df_.loc[df_[ELEMENT_COLS.ETYPE] == "image"])
         .data.apply(lambda d: d.scheme)
         .drop_duplicates()
         .values
diff --git a/tests/core/test_dictable.py b/tests/core/test_dictable.py
index 06fb394..8cb1344 100644
--- a/tests/core/test_dictable.py
+++ b/tests/core/test_dictable.py
@@ -5,7 +5,6 @@
 from bridge.primitives.element.data.cache_mechanism import CacheMechanism
 from bridge.primitives.element.data.load_mechanism import LoadMechanism
 from bridge.primitives.element.element import Element
-from bridge.primitives.element.element_type import ElementType
 from bridge.utils.constants import ELEMENT_COLS
 from bridge.utils.data_objects import BoundingBox, ClassLabel, Keypoint
 
@@ -66,7 +65,7 @@ def bad_load_mechanism_dict(request):
 def element_dict(load_mechanism_dict):
     return {
         ELEMENT_COLS.ID: "123",
-        ELEMENT_COLS.ETYPE: ElementType.image,
+        ELEMENT_COLS.ETYPE: "image",
         ELEMENT_COLS.SAMPLE_ID: 0,
         **load_mechanism_dict,
     }
diff --git a/tests/core/test_element.py b/tests/core/test_element.py
index 22c3ed9..04273ad 100644
--- a/tests/core/test_element.py
+++ b/tests/core/test_element.py
@@ -4,7 +4,6 @@
 from bridge.primitives.element.data.cache_mechanism import CacheMechanism
 from bridge.primitives.element.data.load_mechanism import LoadMechanism
 from bridge.primitives.element.element import Element
-from bridge.primitives.element.element_type import ElementType
 
 
 @pytest.fixture
@@ -19,7 +18,7 @@ def dummy_sample_id():
 
 @pytest.fixture
 def dummy_etype():
-    return ElementType.segmentation
+    return "segmentation"
 
 
 @pytest.fixture
@@ -71,7 +70,7 @@ def test_validate_metadata(load_mechanism_mock, display_engine_mock, cache_mecha
     Element(
         element_id=0,
         sample_id=0,
-        etype=ElementType.segmentation,
+        etype="segmentation",
         load_mechanism=load_mechanism_mock,
         display_engine=display_engine_mock,
         cache_mechanism=cache_mechanism_mock,