thousandbrainsproject · scottcanoe · Dec 20, 2024 · Dec 12, 2024 · Dec 13, 2024 · Dec 13, 2024
diff --git a/benchmarks/results/ycb_10objs.csv b/benchmarks/results/ycb_10objs.csv
@@ -1,13 +1,13 @@
 Experiment,% Correct,% Used MLH,Num Matching Steps,Rotation Error (radians),Run Time,Episode Run Time (s)
-base_config_10distinctobj_dist_agent,98.57%,5.71%,36,0.31,12m,31s
-base_config_10distinctobj_surf_agent,100.00%,0.00%,28,0.17,6m,27s
-randrot_noise_10distinctobj_dist_agent,99.00%,7.00%,51,0.50,10m,56s
-randrot_noise_10distinctobj_dist_on_distm,99.00%,1.00%,35,0.26,7m,48s
-randrot_noise_10distinctobj_surf_agent,100.00%,1.00%,29,0.36,7m,49s
-randrot_10distinctobj_surf_agent,100.00%,0.00%,29,0.37,4m,29s
-randrot_noise_10distinctobj_5lms_dist_agent,100.00%,3.00%,52,0.88,21m,139s
-base_10simobj_surf_agent,95.00%,10.00%,84,0.21,14m,76s
-randrot_noise_10simobj_dist_agent,81.00%,38.00%,193,0.52,26m,206s
-randrot_noise_10simobj_surf_agent,90.00%,35.00%,178,0.45,34m,294s
-randomrot_rawnoise_10distinctobj_surf_agent,65.00%,77.00%,16,1.60,22m,24s
-base_10multi_distinctobj_dist_agent,74.29%,37.14%,27,0.64,1h9m,2s
+base_config_10distinctobj_dist_agent,99.29%,5.71%,36,0.31,10m,31s
+base_config_10distinctobj_surf_agent,100.00%,0.00%,28,0.21,6m,28s
+randrot_noise_10distinctobj_dist_agent,98.00%,7.00%,46,0.50,9m,55s
+randrot_noise_10distinctobj_dist_on_distm,99.00%,3.00%,35,0.26,7m,50s
+randrot_noise_10distinctobj_surf_agent,100.00%,0.00%,31,0.34,8m,62s
+randrot_10distinctobj_surf_agent,100.00%,0.00%,28,0.40,7m,50s
+randrot_noise_10distinctobj_5lms_dist_agent,100.00%,7.00%,50,0.94,44m,189s
+base_10simobj_surf_agent,95.71%,10.71%,82,0.21,18m,104s
+randrot_noise_10simobj_dist_agent,82.00%,37.00%,185,0.52,26m,202s
+randrot_noise_10simobj_surf_agent,89.00%,34.00%,183,0.47,35m,307s
+randomrot_rawnoise_10distinctobj_surf_agent,68.00%,81.00%,15,1.72,22m,23s
+base_10multi_distinctobj_dist_agent,72.86%,43.57%,23,0.75,1h12m,1s
diff --git a/benchmarks/results/ycb_77objs.csv b/benchmarks/results/ycb_77objs.csv
@@ -1,6 +1,6 @@
 Experiment,% Correct,% Used MLH,Num Matching Steps,Rotation Error (radians),Run Time,Episode Run Time (s)
-base_77obj_dist_agent,93.51%,14.29%,90,0.31,1h37m,295s
-base_77obj_surf_agent,98.27%,5.63%,57,0.21,46m,141s
-randrot_noise_77obj_dist_agent,87.01%,28.57%,155,0.64,2h14m,479s
-randrot_noise_77obj_surf_agent,94.93%,19.48%,102,0.62,1h23m,304s
-randrot_noise_77obj_5lms_dist_agent,93.51%,3.90%,71,0.92,54m,1398s
+base_77obj_dist_agent,92.21%,16.02%,88,0.30,1h38m,301s
+base_77obj_surf_agent,98.27%,4.33%,52,0.18,42m,123s
+randrot_noise_77obj_dist_agent,87.01%,29.00%,151,0.63,2h10m,468s
+randrot_noise_77obj_surf_agent,94.37%,21.65%,113,0.61,1h31m,339s
+randrot_noise_77obj_5lms_dist_agent,90.91%,5.19%,70,1.01,1h7m,1439s
diff --git a/docs/overview/benchmark-experiments.md b/docs/overview/benchmark-experiments.md
@@ -51,18 +51,18 @@ The following results are obtained from experiments using the 10-object subsets
 
 | Experiment                                  | % Correct | % Used MLH | Num Matching Steps | Rotation Error (radians) | Run Time | Episode Run Time (s) |
 |---------------------------------------------|-----------|------------|--------------------|--------------------------|----------|----------------------|
-| base_config_10distinctobj_dist_agent        | 98.57%    | 5.71%      | 36                 | 0.31                     | 12m      | 31s                  |
-| base_config_10distinctobj_surf_agent        | 100.00%   | 0.00%      | 28                 | 0.17                     | 6m       | 27s                  |
-| randrot_noise_10distinctobj_dist_agent      | 99.00%    | 7.00%      | 51                 | 0.50                     | 10m      | 56s                  |
-| randrot_noise_10distinctobj_dist_on_distm   | 99.00%    | 1.00%      | 35                 | 0.26                     | 7m       | 48s                  |
-| randrot_noise_10distinctobj_surf_agent      | 100.00%   | 1.00%      | 29                 | 0.36                     | 7m       | 49s                  |
-| randrot_10distinctobj_surf_agent            | 100.00%   | 0.00%      | 29                 | 0.37                     | 4m       | 29s                  |
-| randrot_noise_10distinctobj_5lms_dist_agent | 100.00%   | 3.00%      | 52                 | 0.88                     | 21m      | 139s                 |
-| base_10simobj_surf_agent                    | 95.00%    | 10.00%     | 84                 | 0.21                     | 14m      | 76s                  |
-| randrot_noise_10simobj_dist_agent           | 81.00%    | 38.00%     | 193                | 0.52                     | 26m      | 206s                 |
-| randrot_noise_10simobj_surf_agent           | 90.00%    | 35.00%     | 178                | 0.45                     | 34m      | 294s                 |
-| randomrot_rawnoise_10distinctobj_surf_agent | 65.00%    | 77.00%     | 16                 | 1.60                     | 22m      | 24s                  |
-| base_10multi_distinctobj_dist_agent         | 74.29%    | 37.14%     | 27                 | 0.64                     | 1h9m     | 2s                   |
+| base_config_10distinctobj_dist_agent        | 99.29%    | 5.71%      | 36                 | 0.31                     | 10m      | 31s                  |
+| base_config_10distinctobj_surf_agent        | 100.00%   | 0.00%      | 28                 | 0.21                     | 6m       | 28s                  |
+| randrot_noise_10distinctobj_dist_agent      | 98.00%    | 7.00%      | 46                 | 0.50                     | 9m       | 55s                  |
+| randrot_noise_10distinctobj_dist_on_distm   | 99.00%    | 3.00%      | 35                 | 0.26                     | 7m       | 50s                  |
+| randrot_noise_10distinctobj_surf_agent      | 100.00%   | 0.00%      | 31                 | 0.34                     | 8m       | 62s                  |
+| randrot_10distinctobj_surf_agent            | 100.00%   | 0.00%      | 28                 | 0.40                     | 7m       | 50s                  |
+| randrot_noise_10distinctobj_5lms_dist_agent | 100.00%   | 7.00%      | 50                 | 0.94                     | 44m      | 189s                 |
+| base_10simobj_surf_agent                    | 95.71%    | 10.71%     | 82                 | 0.21                     | 18m      | 104s                 |
+| randrot_noise_10simobj_dist_agent           | 82.00%    | 37.00%     | 185                | 0.52                     | 26m      | 202s                 |
+| randrot_noise_10simobj_surf_agent           | 89.00%    | 34.00%     | 183                | 0.47                     | 35m      | 307s                 |
+| randomrot_rawnoise_10distinctobj_surf_agent | 68.00%    | 81.00%     | 15                 | 1.72                     | 22m      | 23s                  |
+| base_10multi_distinctobj_dist_agent         | 72.86%    | 43.57%     | 23                 | 0.75                     | 1h12m    | 1s                   |
 
 ## Longer Experiments with all 77 YCB Objects
 
@@ -75,11 +75,11 @@ The following results are obtained from experiments on the entire YCB dataset (7
 
 | Experiment                          | % Correct | % Used MLH | Num Matching Steps | Rotation Error (radians) | Run Time | Episode Run Time (s) |
 |-------------------------------------|-----------|------------|--------------------|--------------------------|----------|----------------------|
-| base_77obj_dist_agent               | 93.51%    | 14.29%     | 90                 | 0.31                     | 1h37m    | 295s                 |
-| base_77obj_surf_agent               | 98.27%    | 5.63%      | 57                 | 0.21                     | 46m      | 141s                 |
-| randrot_noise_77obj_dist_agent      | 87.01%    | 28.57%     | 155                | 0.64                     | 2h14m    | 479s                 |
-| randrot_noise_77obj_surf_agent      | 94.93%    | 19.48%     | 102                | 0.62                     | 1h23m    | 304s                 |
-| randrot_noise_77obj_5lms_dist_agent | 93.51%    | 3.90%      | 71                 | 0.92                     | 54m      | 1398s                |
+| base_77obj_dist_agent               | 92.21%    | 16.02%     | 88                 | 0.30                     | 1h38m    | 301s                 |
+| base_77obj_surf_agent               | 98.27%    | 4.33%      | 52                 | 0.18                     | 42m      | 123s                 |
+| randrot_noise_77obj_dist_agent      | 87.01%    | 29.00%     | 151                | 0.63                     | 2h10m    | 468s                 |
+| randrot_noise_77obj_surf_agent      | 94.37%    | 21.65%     | 113                | 0.61                     | 1h31m    | 339s                 |
+| randrot_noise_77obj_5lms_dist_agent | 90.91%    | 5.19%      | 70                 | 1.01                     | 1h7m     | 1439s                |
 
 ### Explanation of Some of the Results
 

diff --git a/src/tbp/monty/frameworks/environments/embodied_data.py b/src/tbp/monty/frameworks/environments/embodied_data.py
@@ -483,7 +483,7 @@ def __next__(self):
     def pre_episode(self):
         super().pre_episode()
         if not self.dataset.env._agents[0].action_space_type == "surface_agent":
-            self.get_good_view("view_finder")
+            self.get_good_view_with_patch_refinement()
 
     def first_step(self):
         """Carry out particular motor-system state updates required on the first step.
@@ -509,26 +509,36 @@ def first_step(self):
 
         return self._observation
 
-    def get_good_view(self, view_sensor_id):
+    def get_good_view(
+        self, view_sensor_id: str, allow_translation: bool = True
+    ) -> None:
         """Policy to get a good view of the object before an episode starts.
 
-        Used by the distant agent - the surface agent makes use of the
-        touch_object method instead. Also currently used by the distant
-        after a "jump" has been initialized by a model-based policy.
-
-        Move towards object until it fills n percent of the view sensor
-        or the closest point of the object is <0.03 distance from the
-        sensor (-> won't be rendered properly anymore). This makes sure
-        that big and small objects all fill similar amount of space in the
-        sensor field of view. Otherwise small objects may be too small to
-        perform saccades or the sensor ends up inside of big objects.
+        Used by the distant agent to find the initial view of an object at the
+        beginning of an episode with respect to a given sensor (the surface agent
+        makes use of the `touch_object` method instead). Also currently used
+        by the distant agent after a "jump" has been initialized by a model-based
+        policy.
+
+        First, the agent moves towards the object until it fills a minimum percentage
+        (given by `motor_system.good_view_percentage`) of the sensor's field of view
+        or the closest point of the object is less than a given distance
+        (`motor_system.desired_object_distance`) from the sensor. This makes sure
+        that big and small objects all fill a similar amount of space in the sensor's
+        field of view. Otherwise small objects may be too small to perform saccades or
+        the sensor ends up inside of big objects. This step is performed by default
+        but can be skipped by setting `allow_translation=False`.
+
+        Second, the agent will then be oriented towards the object so that the
+        sensor's central pixel is on-object. In the case of multi-object experiments
+        (i.e., when `num_distractors > 0`), there is an additional orientation step
+        performed prior to the translational movement step.
 
         Args:
-            view_sensor_id: The name of the sensor used as view finder.
-                This sensor should ideally be a zoomed out version of the
-                sensor patch such that it can contain the whole object
-                while the sensor patch always only sees a small patch of
-                the object.
+            view_sensor_id: The name of the sensor used to inform movements.
+            allow_translation: Whether to allow movement toward the object via
+                the motor system's `move_close_enough` method. If `False`, only
+                orienting movements are performed. Default is `True`.
 
         TODO M : move most of this to the motor systems, shouldn't be in embodied_data
             class
@@ -547,25 +557,24 @@ def get_good_view(self, view_sensor_id):
                 for action in actions:
                     self._observation, self.motor_system.state = self.dataset[action]
 
-        # Move closer to the object, if not already close enough
-        action, close_enough = self.motor_system.move_close_enough(
-            self._observation,
-            view_sensor_id,
-            target_semantic_id=self.primary_target["semantic_id"],
-            multi_objects_present=self.num_distactors > 0,
-        )
-
-        # Continue moving to a close distance to the object
-        while not close_enough:
-            logging.debug("moving closer!")
-            self._observation, self.motor_system.state = self.dataset[action]
-
+        if allow_translation:
+            # Move closer to the object, if not already close enough
             action, close_enough = self.motor_system.move_close_enough(
                 self._observation,
                 view_sensor_id,
                 target_semantic_id=self.primary_target["semantic_id"],
                 multi_objects_present=self.num_distactors > 0,
             )
+            # Continue moving to a close distance to the object
+            while not close_enough:
+                logging.debug("moving closer!")
+                self._observation, self.motor_system.state = self.dataset[action]
+                action, close_enough = self.motor_system.move_close_enough(
+                    self._observation,
+                    view_sensor_id,
+                    target_semantic_id=self.primary_target["semantic_id"],
+                    multi_objects_present=self.num_distactors > 0,
+                )
 
         # Re-center ourselves (if necessary) after having moved closer
         actions, on_object = self.motor_system.orient_to_object(
@@ -590,6 +599,26 @@ def get_good_view(self, view_sensor_id):
         # )
         # assert on_object, "Primary target must be visible at the start of the episode"
 
+    def get_good_view_with_patch_refinement(self) -> None:
+        """Policy to get a good view of the object for the central patch.
+
+        Used by the distant agent to move and orient toward an object such that the
+        central patch is on-object. This is done by first moving and orienting the
+        agent toward the object using the view finder. A second orienting movement is
+        then performed using the central patch (i.e., the sensor module with id
+        "patch" or "patch_0") to ensure that the patch's central pixel is on-object.
+
+        Also currently used by the distant agent after a "jump" has been initialized
+        by a model-based policy.
+
+
+        """
+        self.get_good_view("view_finder")
+        for patch_id in ("patch", "patch_0"):
+            if patch_id in self._observation["agent_id_0"].keys():
+                self.get_good_view(patch_id, allow_translation=False)
+                break
+
     def execute_jump_attempt(self):
         """Attempt a hypothesis-testing "jump" onto a location of the object.
 
@@ -714,7 +743,7 @@ def handle_successful_jump(self):
             self.motor_system.action_details["z_defined_pc"].append(None)
 
         else:
-            self.get_good_view("view_finder")
+            self.get_good_view_with_patch_refinement()
             # TODO implement better way to get better view after the jump that isn't
             # "cheating" by using get_good_view (which uses the semantic sensor)
 

diff --git a/src/tbp/monty/frameworks/models/evidence_matching.py b/src/tbp/monty/frameworks/models/evidence_matching.py
@@ -366,6 +366,12 @@ def reset(self):
         self.symmetry_evidence = 0
         self.last_possible_hypotheses = None
 
+        self.current_mlh["graph_id"] = "no_observations_yet"
+        self.current_mlh["location"] = [0, 0, 0]
+        self.current_mlh["rotation"] = Rotation.from_euler("xyz", [0, 0, 0])
+        self.current_mlh["scale"] = 1
+        self.current_mlh["evidence"] = 0
+
     def receive_votes(self, vote_data):
         """Get evidence count votes and use to update own evidence counts.
 

diff --git a/src/tbp/monty/frameworks/models/motor_policies.py b/src/tbp/monty/frameworks/models/motor_policies.py
@@ -664,8 +664,9 @@ def orient_to_object(
 
         logging.debug("Searching for object")
 
-        # Check if the center of the view finder is on the object
-        if sem_obs[obs_dim[0] // 2][obs_dim[1] // 2] == target_semantic_id:
+        # Check if the central pixel is on-object.
+        y_mid, x_mid = obs_dim[0] // 2, obs_dim[1] // 2
+        if sem_obs[y_mid, x_mid] == target_semantic_id:
             logging.debug("Already centered on the object")
             return [], True
 
@@ -724,7 +725,7 @@ def find_location_to_look_at(self, sem3d_obs, image_shape, target_semantic_id):
         # as expected, which can otherwise break if e.g. on_object_image is passed
         # as an int or boolean rather than float
         smoothed_on_object_image = scipy.ndimage.gaussian_filter(
-            on_object_image, sem3d_obs.shape[0] / 10, mode="constant"
+            on_object_image, 2, mode="constant"
         )
         idx_loc_to_look_at = np.argmax(smoothed_on_object_image * on_object_image)
         idx_loc_to_look_at = np.unravel_index(idx_loc_to_look_at, on_object_image.shape)