Don't use the semantic sensor by default #116

Merged · 42 commits · Jan 15, 2025

Commits (42)
All commits authored by scottcanoe.

- 17249c6 (Dec 10, 2024): set use_semantic_sensor=False everywhere
- 950b6c9 (Dec 16, 2024): Merge branch 'main' into use_semantic_sensor
- a4392b3 (Dec 17, 2024): Update make_dataset_configs.py
- a006194 (Dec 17, 2024): Undo accidental rotation change
- e6c50fd (Dec 17, 2024): Change how semantic maps are obtained
- dad74bd (Dec 17, 2024): Update make_dataset_configs.py
- bfcb38d (Dec 17, 2024): Use estimated semantic map for single-object experiments
- a5452f5 (Dec 18, 2024): Update motor_policies.py
- e9b1296 (Dec 21, 2024): Add multi-obj dataset args, default semantics to False
- e2bb36b (Dec 21, 2024): Merge branch 'main' into semantic_sensor
- 7c92374 (Dec 21, 2024): Update policy_test.py
- a33d3d8 (Dec 23, 2024): Fix for surf agent w/o semantic sensor
- 9b08b2b (Dec 23, 2024): Update sandbox.py
- 1fd3b37 (Dec 24, 2024): rename multi_objects_present -> multiple_objects_present, fix find_lo…
- 0035622 (Dec 24, 2024): Update tests
- 4186fae (Dec 25, 2024): Update motor_policies.py
- ede3007 (Dec 30, 2024): Update policy_test.py
- 2f6e7f4 (Dec 30, 2024): Merge branch 'main' into semantic_sensor
- 4c343be (Dec 31, 2024): Update embodied_data.py
- 9184f60 (Dec 31, 2024): Update pretraining_experiments.py
- c71bd2a (Jan 7, 2025): Merge branch 'semantic_sensor' of https://github.com/scottcanoe/tbp.m…
- 1eaa595 (Jan 8, 2025): Add num_distractors logic for pre-episode assertion
- 5c88140 (Jan 8, 2025): Delete sandbox.py
- 23842cd (Jan 8, 2025): Update transforms.py
- e4d3aee (Jan 9, 2025): Merge branch 'main' into semantic_sensor
- edcd230 (Jan 9, 2025): Update tables
- b194f95 (Jan 13, 2025): Improved docstring
- 01f3f02 (Jan 13, 2025): Punctuation in docstring
- c3d48db (Jan 13, 2025): Add comments for tests
- 6d0701e (Jan 13, 2025): Add comments for tests
- 8f83704 (Jan 14, 2025): Add explanations about surface estimation
- cd7bb10 (Jan 14, 2025): type annotation
- b8dd850 (Jan 14, 2025): Update locations to public models
- ae767ce (Jan 14, 2025): Add v10 comment in ycb_experiments.py
- 1a1ea85 (Jan 14, 2025): Convert comment to docstring, add more type annotation
- d9170a7 (Jan 14, 2025): Remove semantic_mask temporary variable
- 1e22061 (Jan 14, 2025): Typo fix
- 7f55693 (Jan 15, 2025): Don't temporarily add to agent_obs. Rename some variables and paramet…
- 310e968 (Jan 15, 2025): Improve docstring
- 69c6edf (Jan 15, 2025): Merge branch 'main' into semantic_sensor
- 21de91b (Jan 15, 2025): Use keywords for some function calls
- 74c33da (Jan 15, 2025): Merge branch 'main' into semantic_sensor
Files changed (showing changes from 8 of the 42 commits)
1 change: 1 addition & 0 deletions benchmarks/configs/ycb_experiments.py
@@ -108,6 +108,7 @@
# v7 : Updated for State class support + using new feature names like pose_vectors
# v8 : Using separate graph per input channel
# v9 : Using models trained on 14 unique rotations
# v10 : Using models trained without the semantic sensor
fe_pretrain_dir = os.path.expanduser(
os.path.join(monty_models_dir, "pretrained_ycb_v10")
)
4 changes: 2 additions & 2 deletions docs/how-to-use-monty/getting-started.md
@@ -99,8 +99,8 @@ A lot of our current experiments are based on the [YCB dataset](https://www.ycbb

| Models | Archive Format | Download Link |
| --- | --- | --- |
| pretrained_ycb_v9 | tgz | [pretrained_ycb_v9.tgz](https://tbp-pretrained-models-public-c9c24aef2e49b897.s3.us-east-2.amazonaws.com/tbp.monty/pretrained_ycb_v9.tgz) |
| pretrained_ycb_v9 | zip | [pretrained_ycb_v9.zip](https://tbp-pretrained-models-public-c9c24aef2e49b897.s3.us-east-2.amazonaws.com/tbp.monty/pretrained_ycb_v9.zip) |
| pretrained_ycb_v10 | tgz | [pretrained_ycb_v10.tgz](https://tbp-pretrained-models-public-c9c24aef2e49b897.s3.us-east-2.amazonaws.com/tbp.monty/pretrained_ycb_v10.tgz) |
| pretrained_ycb_v10 | zip | [pretrained_ycb_v10.zip](https://tbp-pretrained-models-public-c9c24aef2e49b897.s3.us-east-2.amazonaws.com/tbp.monty/pretrained_ycb_v10.zip) |

Unpack the archive in the `~/tbp/results/monty/pretrained_models/` folder. For example:

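The guide's own example command is collapsed above. As a stand-in, here is a minimal sketch of unpacking the tgz archive with Python's standard library; the destination path follows the guide, and the assumption that the archive sits in the current working directory is illustrative:

```python
import os
import tarfile

# Destination folder named in the guide.
dest = os.path.expanduser("~/tbp/results/monty/pretrained_models/")
os.makedirs(dest, exist_ok=True)

# Assumes pretrained_ycb_v10.tgz was downloaded to the current working directory.
with tarfile.open("pretrained_ycb_v10.tgz", "r:gz") as archive:
    archive.extractall(path=dest)
```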
195 changes: 117 additions & 78 deletions src/tbp/monty/frameworks/environment_utils/transforms.py
@@ -9,12 +9,14 @@
# https://opensource.org/licenses/MIT.

from numbers import Number
from typing import Tuple
from typing import Optional, Tuple

import numpy as np
import quaternion as qt
import scipy

from tbp.monty.frameworks.models.states import State

__all__ = [
"AddNoiseToRawDepthImage",
"DepthTo3DLocations",
@@ -308,91 +310,122 @@ def __init__(
self.clip_value = clip_value
self.depth_clip_sensors = depth_clip_sensors

def __call__(self, observations, state=None):
def __call__(self, observations: dict, state: Optional[State] = None):
"""Apply the depth-to-3D-locations transform to sensor observations.

Applies spatial transforms to the observations and generates a mask used
to determine which pixels are the object's surface. This modifies the
observations in-place. Some items may be modified, such as depth values,
while other values are added (see Returns section).

In the first part of this function, we build a mask that indicates
not only which part of the image is on-object, but also which part of
the image is "on-surface". The distinction between on-object and
on-surface arises from the fact that the field of view may contain parts
of the object that are far away from each other. For example, we may
be looking at the front lip of a mug, but the back lip of the mug is
also in the field of view. When we compute surface point normals or
surface curvature for the front lip of the mug, we don't want to include
pixels from the back lip of the mug.

To do this, we first need a semantic map that indicates only whether
the pixel is on- or off-object. We then analyze the depth data to
determine whether the pixels in the field of view appear to belong to the
same part of the object (see `get_surface_from_depth` for details). The
intersection of these two maps forms the on-surface mask (called
`semantic_obs`) that is embedded into the observation dict and is used
later when performing point-normal and curvature estimation.

How we decide to build these masks is dependent on several factors,
such as whether we are using a distant agent or a surface agent, and
whether a ground-truth semantic map should be used. This necessitates
a few different code paths. Here is a brief outline of the parameters
that reflect these factors as they are commonly used in Monty:
- when using a surface agent, self.depth_clip_sensors is a non-empty
tuple. More specifically, we know which sensor is the surface agent
since its index will be in self.depth_clip_sensors. We only apply
depth clipping to the surface agent.
- surface agents also have their depth and semantic data clipped to
a very short range from the sensor. This is done to more closely model
a finger which has short reach.
- `use_semantic_sensor` is currently only used with multi-object
experiments, and when this is `True`, the observation dict will have
an item called "semantic". In the future, we would like to include
semantic estimation for multi-object experiments. But until then, we
continue to support use of the semantic sensor for multi-object
experiments.
- When two parts of an object are visible, we separate the two parts
according to depth. However, the threshold we use to separate the two
parts is dependent on distant vs surface agents. The parameter
`default_on_surface_th` changes based on whether we are using a distant
or surface agent.

After the mask is generated, we unproject the 2D camera coordinates into
3D coordinates relative to the agent. We then add the transformed observations
to the original observations dict.

Args:
observations (dict): Observations returned by the data loader.
state (State, optional): Optionally supplied CMP-compliant state object.

Returns:
dict: The original observations dict with the following possibly added:
- "semantic_3d": 3D coordinates for each pixel. If `self.world_coord`
is `True` (default), then the coordinates are in the world's
reference frame and are in the sensor's reference frame otherwise.
It is structured as a 2D array with shape (n_pixels, 4) with
columns containing x-, y-, z-coordinates, and a semantic ID.
- "world_camera": Sensor-to-world coordinate frame transform. Included
only when `self.world_coord` is `True` (default).
- "sensor_frame_data": 3D coordinates for each pixel relative to the
sensor. Has the same structure as "semantic_3d". Included only
when `self.get_all_points` is `True`.
"""
for i, sensor_id in enumerate(self.sensor_ids):
agent_obs = observations[self.agent_id][sensor_id]
depth_obs = agent_obs["depth"]

# In the first part of this function, we build a mask that indicates
# not only which part of the image is on-object, but also which part of
# the image is "on-surface". The distinction between on-object and
# on-surface arises from the fact that the field of view may contain parts
# of the object that are far away from each other. For example, we may
# be looking at the front lip of a mug, but the back lip of the mug is
# also in the field of view. When we compute surface point normals or
# surface curvature for the front lip of the mug, we don't want to include
# pixels from the back lip of the mug.
# To do this, we first need a semantic map that indicates only whether
# the pixel is on- or off-object. We then analyze the depth data to
# determine whether the pixels in the field of view appear to belong to the
# same part of the object (see `get_surface_from_depth` for details). The
# intersection of these two maps forms the on-surface mask (called
# `semantic_obs`) that is embedded into the observation dict and is used
# later when performing point-normal and curvature estimation.
# How we decide to build these masks is dependent on several factors,
# such as whether we are using a distant agent or a surface agent, and
# whether a ground-truth semantic map should be used. This necessitates
# a few different code paths. Here is a brief outline of the parameters
# that reflect these factors as they are commonly used in Monty:
# - when using a surface agent, self.depth_clip_sensors is a non-empty
# tuple. More specifically, we know which sensor is the surface agent
# since it's index will be in self.depth_clip_sensors. We only apply
# depth clipping to the surface agent.
# - surface agents also have their depth and semantic data clipped to a
# a very short range from the sensor. This is done to more closely model
# a finger which has short reach.
# - `use_semantic_sensor` is currently only used with multi-object
# experiments, and when this is `True`, the observation dict will have
# an item called "semantic". In the future, we would like to include
# semantic estimation for multi-object experiments. But until then, we
# continue to support use of the semantic sensor for multi-object
# experiments.
# - When two parts of an object are visible, we separate the two parts
# according to depth. However, the threshold we use to separate the two
# parts is dependent on distant vs surface agents. The parameter
# `default_on_surface_th` changes based on whether we are using a distant
# or surface agent.
depth_patch = agent_obs["depth"]

# We need a semantic map that masks off-object pixels. We can use the
# ground-truth semantic map if it's available. Otherwise, we generate one
# from the depth map and (temporarily) add it to the observation dict.
if "semantic" in agent_obs.keys():
semantic_added = False
semantic_patch = agent_obs["semantic"]
else:
semantic_mask = np.ones_like(depth_obs, dtype=int)
semantic_mask[depth_obs >= 1] = 0
agent_obs["semantic"] = semantic_mask
semantic_added = True
# The generated map uses depth observations to determine whether
# pixels are on object using 1 meter as a threshold since
# `MissingToMaxDepth` sets the background void to 1.
semantic_patch = np.ones_like(depth_patch, dtype=int)
semantic_patch[depth_patch >= 1] = 0

# Apply depth clipping to the surface agent, and initialize the
# surface-separation threshold for later use.
if i in self.depth_clip_sensors:
# Surface agent: clip depth and semantic data (in place), and set
# the default surface-separation threshold to be very short.
self.clip(agent_obs)
self.clip(depth_patch, semantic_patch)
default_on_surface_th = self.clip_value
else:
# Distance agent: do not clip depth or semantic data, and set the
# default surface-separation threshold to be very long.
# default surface-separation threshold to be very far away.
default_on_surface_th = 1000.0

# Build a mask that only includes the pixels that are on-surface, where
# on-surface means the pixels are on-object and locally connected to
# the center of the patch's field of view. However, if we are using a
# surface agent and are using the semantic sensor, we may use the
# (clipped) ground-truth semantic mask as a shortcut (though it doesn't
# use surface estimation--just ton-objectness).
# use surface estimation--just on-objectness).
if self.depth_clip_sensors and self.use_semantic_sensor:
# NOTE: this particular combination of self.depth_clip_sensors and
# self.use_semantic_sensor is not commonly used at present, if ever.
# self.depth_clip_sensors implies a surface agent, and
# self.use_semantic_sensor implies multi-object experiments.
semantic_obs = agent_obs["semantic"]
surface_patch = agent_obs["semantic"]
else:
semantic_obs = self.get_surface_from_depth(
depth_obs,
surface_patch = self.get_surface_from_depth(
depth_patch,
semantic_patch,
default_on_surface_th,
agent_obs["semantic"],
)

# Approximate true world coordinates
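Before the unprojection step below, here is a rough, hypothetical sketch of the mask-building flow the docstring above describes: estimate an on-object mask from depth, then clip depth and mask for a surface agent. The 1-meter background value comes from the diff; the clip value used here is an illustrative assumption, not Monty's default.

```python
import numpy as np

def estimate_on_object_mask(depth_patch: np.ndarray) -> np.ndarray:
    # MissingToMaxDepth sets the background void to 1, so pixels at or beyond
    # 1 meter are treated as off-object.
    mask = np.ones_like(depth_patch, dtype=int)
    mask[depth_patch >= 1] = 0
    return mask

def clip_for_surface_agent(depth_patch: np.ndarray,
                           semantic_patch: np.ndarray,
                           clip_value: float = 0.05) -> None:
    # Surface agents model a finger with short reach: anything beyond the clip
    # value is marked off-object, and depth is saturated at the clip value
    # (a reading of 0 means nothing was sensed, so it is also set to the clip value).
    semantic_patch[depth_patch >= clip_value] = 0
    depth_patch[depth_patch > clip_value] = clip_value
    depth_patch[depth_patch == 0] = clip_value
```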
@@ -403,7 +436,7 @@
y = y.reshape(1, self.h[i], self.w[i])

# Unproject 2D camera coordinates into 3D coordinates relative to the agent
depth = depth_obs.reshape(1, self.h[i], self.w[i])
depth = depth_patch.reshape(1, self.h[i], self.w[i])
xyz = np.vstack((x * depth, y * depth, -depth, np.ones(depth.shape)))
xyz = xyz.reshape(4, -1)
xyz = np.matmul(self.inv_k[i], xyz)
@@ -439,7 +472,7 @@ def __call__(self, observations, state=None):
observations[self.agent_id][sensor_id]["world_camera"] = world_camera

# Extract 3D coordinates of detected objects (semantic_id != 0)
semantic = semantic_obs.reshape(1, -1)
semantic = surface_patch.reshape(1, -1)
if self.get_all_points:
semantic_3d = xyz.transpose(1, 0)
semantic_3d[:, 3] = semantic[0]
@@ -460,27 +493,32 @@
# a deepcopy because we are appending a new observation
observations[self.agent_id][sensor_id]["semantic_3d"] = semantic_3d

# Delete the added semantic mask if it was added.
if semantic_added:
del agent_obs["semantic"]

return observations

def clip(self, agent_obs):
def clip(self, depth_patch: np.ndarray, semantic_patch: np.ndarray) -> None:
"""Clip the depth and semantic data that lie beyond a certain depth threshold.

Set the values of 0 (infinite depth) to the clip value.
This is currently used for surface agent observations to limit the "reach"
of the agent. Pixels beyond the clip value are set to 0 in the semantic patch,
and the depth values beyond the clip value (or equal to 0) are set to the clip
value.

This function modifies `depth_patch` and `semantic_patch` in-place.

Args:
depth_patch: depth observations
semantic_patch: binary mask indicating on-object locations
"""
agent_obs["semantic"][agent_obs["depth"] >= self.clip_value] = 0
agent_obs["depth"][agent_obs["depth"] > self.clip_value] = self.clip_value
agent_obs["depth"][agent_obs["depth"] == 0] = self.clip_value
semantic_patch[depth_patch >= self.clip_value] = 0
depth_patch[depth_patch > self.clip_value] = self.clip_value
depth_patch[depth_patch == 0] = self.clip_value

def get_on_surface_th(
self,
depth_patch: np.ndarray,
semantic_patch: np.ndarray,
min_depth_range: Number,
default_on_surface_th: Number,
semantic_mask: np.ndarray,
) -> Tuple[Number, bool]:
"""Return a depth threshold if we have a bimodal depth distribution.

@@ -499,16 +537,17 @@ def get_on_surface_th(

Args:
depth_patch: sensor patch observations of depth
semantic_patch: binary mask indicating on-object locations
min_depth_range: minimum range of depth values to even be considered
default_on_surface_th: default threshold to use if no bimodal distribution
is found
semantic_mask: binary mask indicating on-object locations

Returns:
threshold and whether we want to use values above or below threshold
"""
center_loc = (depth_patch.shape[0] // 2, depth_patch.shape[1] // 2)
depth_center = depth_patch[center_loc[0], center_loc[1]]
semantic_center = semantic_mask[center_loc[0], center_loc[1]]
semantic_center = semantic_patch[center_loc[0], center_loc[1]]

depths = np.asarray(depth_patch).flatten()
flip_sign = False
@@ -535,8 +574,8 @@ def get_surface_from_depth(
def get_surface_from_depth(
self,
depth_patch: np.ndarray,
semantic_patch: np.ndarray,
default_on_surface_th: Number,
semantic_mask: np.ndarray,
) -> np.ndarray:
"""Return surface patch information from heuristics on depth patch.

@@ -570,9 +609,10 @@

Args:
depth_patch: sensor patch observations of depth
semantic_patch: binary mask indicating on-object locations
default_on_surface_th: default threshold to use if no bimodal distribution
is found
semantic_mask: binary mask indicating on-object locations

Returns:
sensor patch shaped info about whether each pixel is on the surface or not
"""
@@ -590,14 +630,13 @@
# sign), and apply it to the depth to get the semantic patch.
th, flip_sign = self.get_on_surface_th(
depth_patch,
min_depth_range=0.01,
default_on_surface_th=default_on_surface_th,
semantic_mask=semantic_mask,
semantic_patch,
0.01,
default_on_surface_th,
Review comment (Contributor): Is there a reason you are not using keywords here anymore? I prefer it looking like min_depth_range=0.01, instead of just 0.01, since it is more readable and less error prone.

Reply (Contributor, Author): No problem, I'll change it back. I agree it's more readable to use keywords here. I had switched it to the above to make the calling style uniform (within this class, at least).
)
if flip_sign is False:
semantic_patch = depth_patch < th
surface_patch = depth_patch < th
else:
semantic_patch = depth_patch > th
surface_patch = depth_patch > th

semantic_patch = semantic_patch * semantic_mask
return semantic_patch
return surface_patch * semantic_patch
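Tying the pieces together, here is a hedged sketch of how the final surface patch combines the depth threshold with the on-object mask, followed by a tiny usage example; the function name and values are illustrative and not taken from Monty itself:

```python
import numpy as np

def surface_patch_from_depth(depth_patch, on_object_patch, th, flip_sign):
    # Keep pixels on the near side of the threshold (or the far side if
    # flip_sign), then intersect with the on-object mask.
    near_mask = depth_patch > th if flip_sign else depth_patch < th
    return near_mask * on_object_patch

# Tiny usage example with a 3x3 depth patch (meters): the left columns are the
# near surface, the top-right values are a far part of the object, and the
# bottom-right pixel is background (set to 1 by MissingToMaxDepth).
depth = np.array([[0.30, 0.31, 0.90],
                  [0.30, 0.29, 0.92],
                  [0.31, 0.30, 1.00]])
on_object = (depth < 1).astype(int)
patch = surface_patch_from_depth(depth, on_object, th=0.6, flip_sign=False)
# patch marks only the near surface; the far part and the background are excluded.
```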