diff --git a/.github/scripts/ci-style.sh b/.github/scripts/ci-style.sh index c4e9f4123b..0482321b52 100755 --- a/.github/scripts/ci-style.sh +++ b/.github/scripts/ci-style.sh @@ -13,36 +13,51 @@ if [[ $CLIPPY_VERSION == "clippy 0.1.72"* ]]; then export CARGO_INCREMENTAL=0 fi +if [[ $CLIPPY_VERSION == "clippy 0.1.77"* && $CARGO_BUILD_TARGET == "x86_64-apple-darwin" ]]; then + export SKIP_CLIPPY=1 +fi + # --- Check main crate --- -# check base -cargo clippy -# check all features -for_all_features "cargo clippy" -# check release -for_all_features "cargo clippy --release" -# check for tests -for_all_features "cargo clippy --tests" - -# target-specific features -if [[ $arch == "x86_64" && $os == "linux" ]]; then - cargo clippy --features perf_counter - cargo clippy --release --features perf_counter - cargo clippy --tests --features perf_counter -fi +if [[ $SKIP_CLIPPY == 1 ]]; then + echo "Skipping clippy version $CLIPPY_VERSION on $CARGO_BUILD_TARGET" +else + # check base + cargo clippy + # check all features + for_all_features "cargo clippy" + # check release + for_all_features "cargo clippy --release" + # check for tests + for_all_features "cargo clippy --tests" -# mock tests -cargo clippy --features mock_test -cargo clippy --features mock_test --tests -cargo clippy --features mock_test --benches + # target-specific features + if [[ $arch == "x86_64" && $os == "linux" ]]; then + cargo clippy --features perf_counter + cargo clippy --release --features perf_counter + cargo clippy --tests --features perf_counter + fi + + # mock tests + cargo clippy --features mock_test + cargo clippy --features mock_test --tests + cargo clippy --features mock_test --benches + + # non-mock benchmarks + cargo clippy --features test_private --benches +fi # --- Check auxiliary crate --- style_check_auxiliary_crate() { crate_path=$1 - cargo clippy --manifest-path=$crate_path/Cargo.toml - cargo fmt --manifest-path=$crate_path/Cargo.toml -- --check + if [[ $SKIP_CLIPPY == 1 ]]; then + echo "Skipping clippy test for $crate_path" + else + cargo clippy --manifest-path=$crate_path/Cargo.toml + cargo fmt --manifest-path=$crate_path/Cargo.toml -- --check + fi } style_check_auxiliary_crate macros diff --git a/.github/workflows/merge-check.yml b/.github/workflows/merge-check.yml index 41297ccd5e..8dd53785e9 100644 --- a/.github/workflows/merge-check.yml +++ b/.github/workflows/merge-check.yml @@ -11,7 +11,7 @@ env: # Ignore some actions for the merge check: # - This action itself # - Public API check, doc broken link check: we allow them to fail. - # - Minimal tests for stable Rust: we allow them to fail. + # - Minimal tests and style checks for stable Rust: we allow them to fail. # - Extended binding tests: it may take long to run. We don't want to wait for them. # Note: The action name for openjdk tests is different due to the reused workflow. 
IGNORED_ACTIONS: | @@ -23,6 +23,9 @@ env: "minimal-tests-core/x86_64-unknown-linux-gnu/stable", "minimal-tests-core/i686-unknown-linux-gnu/stable", "minimal-tests-core/x86_64-apple-darwin/stable", + "style-check/x86_64-unknown-linux-gnu/stable", + "style-check/i686-unknown-linux-gnu/stable", + "style-check/x86_64-apple-darwin/stable", "extended-tests-openjdk / test", "extended-tests-v8", "extended-tests-jikesrvm", diff --git a/.github/workflows/minimal-tests-core.yml b/.github/workflows/minimal-tests-core.yml index f1f2f4c34b..3a72c531e7 100644 --- a/.github/workflows/minimal-tests-core.yml +++ b/.github/workflows/minimal-tests-core.yml @@ -79,6 +79,49 @@ jobs: - name: Test run: ./.github/scripts/ci-test.sh + style-check: + needs: setup-test-matrix + strategy: + fail-fast: false + matrix: + target: + - { os: ubuntu-22.04, triple: x86_64-unknown-linux-gnu } + - { os: ubuntu-22.04, triple: i686-unknown-linux-gnu } + - { os: macos-12, triple: x86_64-apple-darwin } + rust: ${{ fromJson(needs.setup-test-matrix.outputs.rust )}} + + name: style-check/${{ matrix.target.triple }}/${{ matrix.rust }} + runs-on: ${{ matrix.target.os }} + + env: + # This determines the default target which cargo-build, cargo-test, etc. use. + CARGO_BUILD_TARGET: "${{ matrix.target.triple }}" + + steps: + - uses: actions/checkout@v4 + - name: Install Rust + run: | + # "rustup toolchain install" should always install the host toolchain, + # so we don't specify the triple. + rustup toolchain install ${{ matrix.rust }} + rustup override set ${{ matrix.rust }} + + # Ensure we install the target support for the target we are testing for. + # This is especially important for i686-unknown-linux-gnu + # because it's different from the host. + rustup target add ${{ matrix.target.triple }} + + rustup component add rustfmt clippy + + # Show the Rust toolchain and target we are actually using + - run: rustup show + - run: cargo --version + - run: cargo rustc -- --print cfg + + # Setup Environments + - name: Setup Environments + run: ./.github/scripts/ci-setup-${{ matrix.target.triple }}.sh + # Style checks - name: Style checks run: ./.github/scripts/ci-style.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 75f479c763..308e14da96 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,30 @@ +0.27.0 (2024-08-09) +=== + +## What's Changed + +### Policy +* Clear pin bits by @udesou in https://github.com/mmtk/mmtk-core/pull/1166 +* Set the unlog bits for objects in the bootimage by @k-sareen in https://github.com/mmtk/mmtk-core/pull/1168 +* Fix bug in line mark when `MARK_LINE_AT_SCAN_TIME=false` by @k-sareen in https://github.com/mmtk/mmtk-core/pull/1171 + +### API +* Require object reference to be aligned by @qinsoon in https://github.com/mmtk/mmtk-core/pull/1159 +* Internal pointer support by @qinsoon in https://github.com/mmtk/mmtk-core/pull/1165 +* Heap traversal by @wks in https://github.com/mmtk/mmtk-core/pull/1174 + +### CI +* Parallelize and workaround CI style checks by @wks in https://github.com/mmtk/mmtk-core/pull/1184 + +### Misc +* Extensible eBPF timeline attributes by @wks in https://github.com/mmtk/mmtk-core/pull/1162 +* Only map with executable permissions when using code space by @k-sareen in https://github.com/mmtk/mmtk-core/pull/1176 +* Fix style check for Rust 1.80 by @qinsoon in https://github.com/mmtk/mmtk-core/pull/1178 +* Extra assertions for mutators and epilogues by @wks in https://github.com/mmtk/mmtk-core/pull/1182 +* Refactor iterate_meta_bits by @wks in https://github.com/mmtk/mmtk-core/pull/1181 + 
+**Full Changelog**: https://github.com/mmtk/mmtk-core/compare/v0.26.0...v0.27.0 + 0.26.0 (2024-07-01) === diff --git a/Cargo.toml b/Cargo.toml index b089819007..b92291190f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "mmtk" -version = "0.26.0" +version = "0.27.0" authors = ["The MMTk Developers <>"] edition = "2021" license = "MIT OR Apache-2.0" @@ -38,7 +38,7 @@ log = { version = "0.4", features = ["max_level_trace", "release_max_level_off"] memoffset = "0.9" mimalloc-sys = { version = "0.1.6", optional = true } # MMTk macros - we have to specify a version here in order to publish the crate, even though we use the dependency from a local path. -mmtk-macros = { version="0.26.0", path = "macros/" } +mmtk-macros = { version="0.27.0", path = "macros/" } num_cpus = "1.8" num-traits = "0.2" pfm = { version = "0.1.1", optional = true } @@ -55,6 +55,7 @@ sysinfo = "0.30.9" [dev-dependencies] paste = "1.0.8" rand = "0.8.5" +rand_chacha = "0.3.1" criterion = "0.4" [build-dependencies] @@ -73,7 +74,10 @@ perf_counter = ["pfm"] # This feature is only used for tests with MockVM. # CI scripts run those tests with this feature. -mock_test = [] +mock_test = ["test_private"] + +# This feature will expose some private functions for testings or benchmarking. +test_private = [] # .github/scripts/ci-common.sh extracts features from the following part (including from comments). # So be careful when editing or adding stuff to the section below. diff --git a/benches/main.rs b/benches/main.rs index 6c735ce4e2..f68bc3c071 100644 --- a/benches/main.rs +++ b/benches/main.rs @@ -2,42 +2,25 @@ use criterion::criterion_group; use criterion::criterion_main; use criterion::Criterion; -// As we can only initialize one MMTk instance, we have to run each benchmark in a separate process. -// So we only register one benchmark to criterion ('bench_main'), and based on the env var MMTK_BENCH, -// we pick the right benchmark to run. +#[cfg(all(feature = "mock_test", feature = "test_private"))] +pub mod mock_bench; -// The benchmark can be executed with the following command. The feature `mock_test` is required, as the tests use MockVM. -// MMTK_BENCH=alloc cargo bench --features mock_test -// MMTK_BENCH=sft cargo bench --features mock_test - -// [Yi] I am not sure if these benchmarks are helpful any more after the MockVM refactoring. MockVM is really slow, as it -// is accessed with a lock, and it dispatches every call to function pointers in a struct. These tests may use MockVM, -// so they become slower as well. And the slowdown -// from MockVM may hide the actual performance difference when we change the functions that are benchmarked. -// We may want to improve the MockVM implementation so we can skip dispatching for benchmarking, or introduce another MockVM -// implementation for benchmarking. -// However, I will just keep these benchmarks here. If we find it not useful, and we do not plan to improve MockVM, we can delete -// them. 
- -#[cfg(feature = "mock_test")] -mod mock_bench; +#[cfg(all(not(feature = "mock_test"), feature = "test_private"))] +pub mod regular_bench; pub fn bench_main(_c: &mut Criterion) { - #[cfg(feature = "mock_test")] - match std::env::var("MMTK_BENCH") { - Ok(bench) => match bench.as_str() { - "alloc" => mock_bench::alloc::bench(_c), - "internal_pointer" => mock_bench::internal_pointer::bench(_c), - "sft" => mock_bench::sft::bench(_c), - _ => panic!("Unknown benchmark {:?}", bench), - }, - Err(_) => panic!("Need to name a benchmark by the env var MMTK_BENCH"), - } - - #[cfg(not(feature = "mock_test"))] - { - eprintln!("ERROR: Currently there are no benchmarks when the \"mock_test\" feature is not enabled."); - std::process::exit(1); + cfg_if::cfg_if! { + if #[cfg(feature = "mock_test")] { + // If the "mock_test" feature is enabled, we only run mock test. + mock_bench::bench(_c); + } else if #[cfg(feature = "test_private")] { + regular_bench::bench(_c); + } else { + eprintln!("ERROR: Benchmarks in mmtk_core requires the test_priavte feature (implied by mock_test) to run."); + eprintln!(" Rerun with `MMTK_BENCH=\"bench_name\" cargo bench --features mock_test` to run mock-test benchmarks."); + eprintln!(" Rerun with `cargo bench --features test_private -- bench_name` to run other benchmarks."); + std::process::exit(1); + } } } diff --git a/benches/mock_bench/mod.rs b/benches/mock_bench/mod.rs index f4ca1c4428..95466bfdc7 100644 --- a/benches/mock_bench/mod.rs +++ b/benches/mock_bench/mod.rs @@ -1,3 +1,34 @@ +use criterion::Criterion; + pub mod alloc; pub mod internal_pointer; pub mod sft; + +// As we can only initialize one MMTk instance, we have to run each benchmark in a separate process. +// So we only register one benchmark to criterion ('bench_main'), and based on the env var MMTK_BENCH, +// we pick the right benchmark to run. + +// The benchmark can be executed with the following command. The feature `mock_test` is required, as the tests use MockVM. +// MMTK_BENCH=alloc cargo bench --features mock_test +// MMTK_BENCH=sft cargo bench --features mock_test + +// [Yi] I am not sure if these benchmarks are helpful any more after the MockVM refactoring. MockVM is really slow, as it +// is accessed with a lock, and it dispatches every call to function pointers in a struct. These tests may use MockVM, +// so they become slower as well. And the slowdown +// from MockVM may hide the actual performance difference when we change the functions that are benchmarked. +// We may want to improve the MockVM implementation so we can skip dispatching for benchmarking, or introduce another MockVM +// implementation for benchmarking. +// However, I will just keep these benchmarks here. If we find it not useful, and we do not plan to improve MockVM, we can delete +// them. + +pub fn bench(c: &mut Criterion) { + match std::env::var("MMTK_BENCH") { + Ok(bench) => match bench.as_str() { + "alloc" => alloc::bench(c), + "internal_pointer" => internal_pointer::bench(c), + "sft" => sft::bench(c), + _ => panic!("Unknown benchmark {:?}", bench), + }, + Err(_) => panic!("Need to name a benchmark by the env var MMTK_BENCH"), + } +} diff --git a/benches/regular_bench/bulk_meta/bscan.rs b/benches/regular_bench/bulk_meta/bscan.rs new file mode 100644 index 0000000000..b8d57187ce --- /dev/null +++ b/benches/regular_bench/bulk_meta/bscan.rs @@ -0,0 +1,87 @@ +//! Benchmarks for scanning side metadata for non-zero bits. 
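+//!
+//! Given the benchmark dispatch in `benches/main.rs` above, this benchmark should be runnable
+//! with `cargo bench --features test_private -- bscan_block` (the Criterion benchmark below is
+//! registered under the name "bscan_block").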
+ +use criterion::Criterion; +use mmtk::util::{ + constants::LOG_BITS_IN_WORD, test_private::scan_non_zero_bits_in_metadata_bytes, Address, +}; +use rand::{seq::IteratorRandom, SeedableRng}; +use rand_chacha::ChaCha8Rng; + +fn allocate_aligned(size: usize) -> Address { + let ptr = unsafe { + std::alloc::alloc_zeroed(std::alloc::Layout::from_size_align(size, size).unwrap()) + }; + Address::from_mut_ptr(ptr) +} + +const BLOCK_BYTES: usize = 32768usize; // Match an Immix block size. + +// Asssume one-bit-per-word metadata (matching VO bits). +const BLOCK_META_BYTES: usize = BLOCK_BYTES >> LOG_BITS_IN_WORD; + +/// Set this many distinct bits in the bitmap. +const NUM_OBJECTS: usize = 200; + +/// Get a deterministic seeded Rng. +fn get_rng() -> ChaCha8Rng { + // Create an Rng from a seed and an explicit Rng type. + // Not secure at all, but completely deterministic and reproducible. + // The following seed is read from /dev/random + const SEED64: u64 = 0x4050cb1b5ab26c70; + ChaCha8Rng::seed_from_u64(SEED64) +} + +/// A bitmap, with known location of each bit for assertion. +struct PreparedBitmap { + start: Address, + end: Address, + set_bits: Vec<(Address, u8)>, +} + +/// Make a bitmap of the desired size and set bits. +fn make_standard_bitmap() -> PreparedBitmap { + let start = allocate_aligned(BLOCK_META_BYTES); + let end = start + BLOCK_META_BYTES; + let mut rng = get_rng(); + + let mut set_bits = (0..(BLOCK_BYTES >> LOG_BITS_IN_WORD)) + .choose_multiple(&mut rng, NUM_OBJECTS) + .iter() + .map(|total_bit_offset| { + let word_offset = total_bit_offset >> LOG_BITS_IN_WORD; + let bit_offset = total_bit_offset & ((1 << LOG_BITS_IN_WORD) - 1); + (start + (word_offset << LOG_BITS_IN_WORD), bit_offset as u8) + }) + .collect::>(); + + set_bits.sort(); + + for (addr, bit) in set_bits.iter() { + let word = unsafe { addr.load::() }; + let new_word = word | (1 << bit); + unsafe { addr.store::(new_word) }; + } + + PreparedBitmap { + start, + end, + set_bits, + } +} + +pub fn bench(c: &mut Criterion) { + c.bench_function("bscan_block", |b| { + let bitmap = make_standard_bitmap(); + let mut holder: Vec<(Address, u8)> = Vec::with_capacity(NUM_OBJECTS); + + b.iter(|| { + holder.clear(); + scan_non_zero_bits_in_metadata_bytes(bitmap.start, bitmap.end, &mut |addr, shift| { + holder.push((addr, shift)); + }); + }); + + assert_eq!(holder.len(), NUM_OBJECTS); + assert_eq!(holder, bitmap.set_bits); + }); +} diff --git a/benches/regular_bench/bulk_meta/bzero_bset.rs b/benches/regular_bench/bulk_meta/bzero_bset.rs new file mode 100644 index 0000000000..fc01e58b4c --- /dev/null +++ b/benches/regular_bench/bulk_meta/bzero_bset.rs @@ -0,0 +1,62 @@ +//! Benchmarks for bulk zeroing and setting. + +use std::os::raw::c_void; + +use criterion::Criterion; +use mmtk::util::{constants::LOG_BITS_IN_WORD, test_private, Address}; + +fn allocate_aligned(size: usize) -> Address { + let ptr = unsafe { + std::alloc::alloc_zeroed(std::alloc::Layout::from_size_align(size, size).unwrap()) + }; + Address::from_mut_ptr(ptr) +} + +const LINE_BYTES: usize = 256usize; // Match an Immix line size. +const BLOCK_BYTES: usize = 32768usize; // Match an Immix block size. + +// Asssume one-bit-per-word metadata (matching VO bits). 
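+// As a concrete illustration (assuming a 64-bit target, where LOG_BITS_IN_WORD == 6, i.e. one
+// metadata bit covers one 8-byte data word):
+//   LINE_META_BYTES  = 256   >> 6 = 4 metadata bytes per 256-byte line
+//   BLOCK_META_BYTES = 32768 >> 6 = 512 metadata bytes per 32 KiB block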
+const LINE_META_BYTES: usize = LINE_BYTES >> LOG_BITS_IN_WORD; +const BLOCK_META_BYTES: usize = BLOCK_BYTES >> LOG_BITS_IN_WORD; + +pub fn bench(c: &mut Criterion) { + c.bench_function("bzero_bset_line", |b| { + let start = allocate_aligned(LINE_META_BYTES); + let end = start + LINE_META_BYTES; + + b.iter(|| { + test_private::set_meta_bits(start, 0, end, 0); + test_private::zero_meta_bits(start, 0, end, 0); + }) + }); + + c.bench_function("bzero_bset_line_memset", |b| { + let start = allocate_aligned(LINE_META_BYTES); + let end = start + LINE_META_BYTES; + + b.iter(|| unsafe { + libc::memset(start.as_mut_ref() as *mut c_void, 0xff, end - start); + libc::memset(start.as_mut_ref() as *mut c_void, 0x00, end - start); + }) + }); + + c.bench_function("bzero_bset_block", |b| { + let start = allocate_aligned(BLOCK_META_BYTES); + let end = start + BLOCK_META_BYTES; + + b.iter(|| { + test_private::set_meta_bits(start, 0, end, 0); + test_private::zero_meta_bits(start, 0, end, 0); + }) + }); + + c.bench_function("bzero_bset_block_memset", |b| { + let start = allocate_aligned(BLOCK_META_BYTES); + let end = start + BLOCK_META_BYTES; + + b.iter(|| unsafe { + libc::memset(start.as_mut_ref() as *mut c_void, 0xff, end - start); + libc::memset(start.as_mut_ref() as *mut c_void, 0x00, end - start); + }) + }); +} diff --git a/benches/regular_bench/bulk_meta/mod.rs b/benches/regular_bench/bulk_meta/mod.rs new file mode 100644 index 0000000000..2abc48b849 --- /dev/null +++ b/benches/regular_bench/bulk_meta/mod.rs @@ -0,0 +1,9 @@ +pub mod bscan; +pub mod bzero_bset; + +pub use criterion::Criterion; + +pub fn bench(c: &mut Criterion) { + bscan::bench(c); + bzero_bset::bench(c); +} diff --git a/benches/regular_bench/mod.rs b/benches/regular_bench/mod.rs new file mode 100644 index 0000000000..63b1f3a1a5 --- /dev/null +++ b/benches/regular_bench/mod.rs @@ -0,0 +1,7 @@ +pub use criterion::Criterion; + +mod bulk_meta; + +pub fn bench(c: &mut Criterion) { + bulk_meta::bench(c); +} diff --git a/macros/Cargo.toml b/macros/Cargo.toml index 258b375b83..44b8848119 100644 --- a/macros/Cargo.toml +++ b/macros/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "mmtk-macros" # the macro crate uses the same version as mmtk-core -version = "0.26.0" +version = "0.27.0" edition = "2021" license = "MIT OR Apache-2.0" description = "MMTk macros provides procedural macros used by mmtk-core." diff --git a/src/mmtk.rs b/src/mmtk.rs index ed69f31e70..f5f7622130 100644 --- a/src/mmtk.rs +++ b/src/mmtk.rs @@ -6,6 +6,8 @@ use crate::plan::Plan; use crate::policy::sft_map::{create_sft_map, SFTMap}; use crate::scheduler::GCWorkScheduler; +#[cfg(feature = "vo_bit")] +use crate::util::address::ObjectReference; #[cfg(feature = "analysis")] use crate::util::analysis::AnalysisManager; use crate::util::finalizable_processor::FinalizableProcessor; @@ -467,4 +469,54 @@ impl MMTK { pub fn get_options(&self) -> &Options { &self.options } + + /// Enumerate objects in all spaces in this MMTK instance. + /// + /// The call-back function `f` is called for every object that has the valid object bit (VO + /// bit), i.e. objects that are allocated in the heap of this MMTK instance, but has not been + /// reclaimed, yet. + /// + /// # Notes about object initialization and finalization + /// + /// When this function visits an object, it only guarantees that its VO bit must have been set. + /// It is not guaranteed if the object has been "fully initialized" in the sense of the + /// programming language the VM is implementing. 
For example, the object header and the type + /// information may not have been written. + /// + /// It will also visit objects that have been "finalized" in the sense of the programming + /// langauge the VM is implementing, as long as the object has not been reclaimed by the GC, + /// yet. Be careful. If the object header is destroyed, it may not be safe to access such + /// objects in the high-level language. + /// + /// # Interaction with allocation and GC + /// + /// This function does not mutate the heap. It is safe if multiple threads execute this + /// function concurrently during mutator time. + /// + /// It has *undefined behavior* if allocation or GC happens while this function is being + /// executed. The VM binding must ensure no threads are allocating and GC does not start while + /// executing this function. One way to do this is stopping all mutators before calling this + /// function. + /// + /// Some high-level languages may provide an API that allows the user to allocate objects and + /// trigger GC while enumerating objects. One example is [`ObjectSpace::each_object`][os_eo] in + /// Ruby. The VM binding may use the callback of this function to save all visited object + /// references and let the user visit those references after this function returns. Make sure + /// those saved references are in the root set or in an object that will live through GCs before + /// the high-level language finishes visiting the saved object references. + /// + /// [os_eo]: https://docs.ruby-lang.org/en/master/ObjectSpace.html#method-c-each_object + #[cfg(feature = "vo_bit")] + pub fn enumerate_objects(&self, f: F) + where + F: FnMut(ObjectReference), + { + use crate::util::object_enum; + + let mut enumerator = object_enum::ClosureObjectEnumerator::<_, VM>::new(f); + let plan = self.get_plan(); + plan.for_each_space(&mut |space| { + space.enumerate_objects(&mut enumerator); + }) + } } diff --git a/src/plan/marksweep/global.rs b/src/plan/marksweep/global.rs index fb6c271cab..4a19f77841 100644 --- a/src/plan/marksweep/global.rs +++ b/src/plan/marksweep/global.rs @@ -65,6 +65,10 @@ impl Plan for MarkSweep { self.common.release(tls, true); } + fn end_of_gc(&mut self, _tls: VMWorkerThread) { + self.ms.end_of_gc(); + } + fn collection_required(&self, space_full: bool, _space: Option>) -> bool { self.base().collection_required(self, space_full) } diff --git a/src/policy/copyspace.rs b/src/policy/copyspace.rs index bf73255033..84e875191e 100644 --- a/src/policy/copyspace.rs +++ b/src/policy/copyspace.rs @@ -6,10 +6,11 @@ use crate::policy::sft::SFT; use crate::policy::space::{CommonSpace, Space}; use crate::scheduler::GCWorker; use crate::util::alloc::allocator::AllocatorContext; -use crate::util::copy::*; use crate::util::heap::{MonotonePageResource, PageResource}; use crate::util::metadata::{extract_side_metadata, MetadataSpec}; +use crate::util::object_enum::ObjectEnumerator; use crate::util::object_forwarding; +use crate::util::{copy::*, object_enum}; use crate::util::{Address, ObjectReference}; use crate::vm::*; use libc::{mprotect, PROT_EXEC, PROT_NONE, PROT_READ, PROT_WRITE}; @@ -133,6 +134,10 @@ impl Space for CopySpace { fn set_copy_for_sft_trace(&mut self, semantics: Option) { self.common.copy = semantics; } + + fn enumerate_objects(&self, enumerator: &mut dyn ObjectEnumerator) { + object_enum::enumerate_blocks_from_monotonic_page_resource(enumerator, &self.pr); + } } impl crate::policy::gc_work::PolicyTraceObject for CopySpace { diff --git a/src/policy/immix/block.rs 
b/src/policy/immix/block.rs index 8c772924c8..85ed76ddef 100644 --- a/src/policy/immix/block.rs +++ b/src/policy/immix/block.rs @@ -10,6 +10,7 @@ use crate::util::metadata::side_metadata::{MetadataByteArrayRef, SideMetadataSpe use crate::util::metadata::vo_bit; #[cfg(feature = "object_pinning")] use crate::util::metadata::MetadataSpec; +use crate::util::object_enum::BlockMayHaveObjects; use crate::util::Address; use crate::vm::*; use std::sync::atomic::Ordering; @@ -86,6 +87,12 @@ impl Region for Block { } } +impl BlockMayHaveObjects for Block { + fn may_have_objects(&self) -> bool { + self.get_state() != BlockState::Unallocated + } +} + impl Block { /// Log pages in block pub const LOG_PAGES: usize = Self::LOG_BYTES - LOG_BYTES_IN_PAGE as usize; diff --git a/src/policy/immix/immixspace.rs b/src/policy/immix/immixspace.rs index 73756d2f90..0e3d303202 100644 --- a/src/policy/immix/immixspace.rs +++ b/src/policy/immix/immixspace.rs @@ -9,7 +9,6 @@ use crate::policy::sft_map::SFTMap; use crate::policy::space::{CommonSpace, Space}; use crate::util::alloc::allocator::AllocatorContext; use crate::util::constants::LOG_BYTES_IN_PAGE; -use crate::util::copy::*; use crate::util::heap::chunk_map::*; use crate::util::heap::BlockPageResource; use crate::util::heap::PageResource; @@ -18,7 +17,9 @@ use crate::util::metadata::side_metadata::SideMetadataSpec; #[cfg(feature = "vo_bit")] use crate::util::metadata::vo_bit; use crate::util::metadata::{self, MetadataSpec}; +use crate::util::object_enum::ObjectEnumerator; use crate::util::object_forwarding; +use crate::util::{copy::*, epilogue, object_enum}; use crate::util::{Address, ObjectReference}; use crate::vm::*; use crate::{ @@ -189,6 +190,10 @@ impl Space for ImmixSpace { fn set_copy_for_sft_trace(&mut self, _semantics: Option) { panic!("We do not use SFT to trace objects for Immix. 
set_copy_context() cannot be used.") } + + fn enumerate_objects(&self, enumerator: &mut dyn ObjectEnumerator) { + object_enum::enumerate_blocks_from_chunk_map::(enumerator, &self.chunk_map); + } } impl crate::policy::gc_work::PolicyTraceObject for ImmixSpace { @@ -979,6 +984,12 @@ impl FlushPageResource { } } +impl Drop for FlushPageResource { + fn drop(&mut self) { + epilogue::debug_assert_counter_zero(&self.counter, "FlushPageResource::counter"); + } +} + use crate::policy::copy_context::PolicyCopyContext; use crate::util::alloc::Allocator; use crate::util::alloc::ImmixAllocator; diff --git a/src/policy/immortalspace.rs b/src/policy/immortalspace.rs index 8b29584166..ba142202ba 100644 --- a/src/policy/immortalspace.rs +++ b/src/policy/immortalspace.rs @@ -6,6 +6,7 @@ use crate::util::address::Address; use crate::util::heap::{MonotonePageResource, PageResource}; use crate::util::metadata::mark_bit::MarkState; +use crate::util::object_enum::{self, ObjectEnumerator}; use crate::util::{metadata, ObjectReference}; use crate::plan::{ObjectQueue, VectorObjectQueue}; @@ -112,6 +113,10 @@ impl Space for ImmortalSpace { fn release_multiple_pages(&mut self, _start: Address) { panic!("immortalspace only releases pages enmasse") } + + fn enumerate_objects(&self, enumerator: &mut dyn ObjectEnumerator) { + object_enum::enumerate_blocks_from_monotonic_page_resource(enumerator, &self.pr); + } } use crate::scheduler::GCWorker; diff --git a/src/policy/largeobjectspace.rs b/src/policy/largeobjectspace.rs index 623f6fbbe4..b0145b167f 100644 --- a/src/policy/largeobjectspace.rs +++ b/src/policy/largeobjectspace.rs @@ -8,6 +8,7 @@ use crate::policy::space::{CommonSpace, Space}; use crate::util::constants::BYTES_IN_PAGE; use crate::util::heap::{FreeListPageResource, PageResource}; use crate::util::metadata; +use crate::util::object_enum::ObjectEnumerator; use crate::util::opaque_pointer::*; use crate::util::treadmill::TreadMill; use crate::util::{Address, ObjectReference}; @@ -175,6 +176,10 @@ impl Space for LargeObjectSpace { fn release_multiple_pages(&mut self, start: Address) { self.pr.release_pages(start); } + + fn enumerate_objects(&self, enumerator: &mut dyn ObjectEnumerator) { + self.treadmill.enumerate_objects(enumerator); + } } use crate::scheduler::GCWorker; diff --git a/src/policy/lockfreeimmortalspace.rs b/src/policy/lockfreeimmortalspace.rs index 80b65e288d..dee3205181 100644 --- a/src/policy/lockfreeimmortalspace.rs +++ b/src/policy/lockfreeimmortalspace.rs @@ -16,6 +16,7 @@ use crate::util::heap::VMRequest; use crate::util::memory::MmapStrategy; use crate::util::metadata::side_metadata::SideMetadataContext; use crate::util::metadata::side_metadata::SideMetadataSanity; +use crate::util::object_enum::ObjectEnumerator; use crate::util::opaque_pointer::*; use crate::util::ObjectReference; use crate::vm::VMBinding; @@ -166,6 +167,10 @@ impl Space for LockFreeImmortalSpace { side_metadata_sanity_checker .verify_metadata_context(std::any::type_name::(), &self.metadata) } + + fn enumerate_objects(&self, enumerator: &mut dyn ObjectEnumerator) { + enumerator.visit_address_range(self.start, self.start + self.total_bytes); + } } use crate::plan::{ObjectQueue, VectorObjectQueue}; diff --git a/src/policy/markcompactspace.rs b/src/policy/markcompactspace.rs index 9db05e56aa..bc0f5659c2 100644 --- a/src/policy/markcompactspace.rs +++ b/src/policy/markcompactspace.rs @@ -11,6 +11,7 @@ use crate::util::constants::LOG_BYTES_IN_WORD; use crate::util::copy::CopySemantics; use crate::util::heap::{MonotonePageResource, 
PageResource}; use crate::util::metadata::{extract_side_metadata, vo_bit}; +use crate::util::object_enum::{self, ObjectEnumerator}; use crate::util::{Address, ObjectReference}; use crate::{vm::*, ObjectQueue}; use atomic::Ordering; @@ -131,6 +132,10 @@ impl Space for MarkCompactSpace { fn release_multiple_pages(&mut self, _start: Address) { panic!("markcompactspace only releases pages enmasse") } + + fn enumerate_objects(&self, enumerator: &mut dyn ObjectEnumerator) { + object_enum::enumerate_blocks_from_monotonic_page_resource(enumerator, &self.pr); + } } impl crate::policy::gc_work::PolicyTraceObject for MarkCompactSpace { diff --git a/src/policy/marksweepspace/malloc_ms/global.rs b/src/policy/marksweepspace/malloc_ms/global.rs index d95aa8437b..dbe386db5c 100644 --- a/src/policy/marksweepspace/malloc_ms/global.rs +++ b/src/policy/marksweepspace/malloc_ms/global.rs @@ -15,6 +15,7 @@ use crate::util::metadata::side_metadata::{ SideMetadataContext, SideMetadataSanity, SideMetadataSpec, }; use crate::util::metadata::MetadataSpec; +use crate::util::object_enum::ObjectEnumerator; use crate::util::opaque_pointer::*; use crate::util::Address; use crate::util::ObjectReference; @@ -229,6 +230,10 @@ impl Space for MallocSpace { side_metadata_sanity_checker .verify_metadata_context(std::any::type_name::(), &self.metadata) } + + fn enumerate_objects(&self, _enumerator: &mut dyn ObjectEnumerator) { + unimplemented!() + } } use crate::scheduler::GCWorker; diff --git a/src/policy/marksweepspace/native_ms/block.rs b/src/policy/marksweepspace/native_ms/block.rs index 6727430e9e..a150d974b5 100644 --- a/src/policy/marksweepspace/native_ms/block.rs +++ b/src/policy/marksweepspace/native_ms/block.rs @@ -7,6 +7,7 @@ use super::MarkSweepSpace; use crate::util::constants::LOG_BYTES_IN_PAGE; use crate::util::heap::chunk_map::*; use crate::util::linear_scan::Region; +use crate::util::object_enum::BlockMayHaveObjects; use crate::vm::ObjectModel; use crate::{ util::{ @@ -48,6 +49,12 @@ impl Region for Block { } } +impl BlockMayHaveObjects for Block { + fn may_have_objects(&self) -> bool { + self.get_state() != BlockState::Unallocated + } +} + impl Block { /// Log pages in block pub const LOG_PAGES: usize = Self::LOG_BYTES - LOG_BYTES_IN_PAGE as usize; diff --git a/src/policy/marksweepspace/native_ms/global.rs b/src/policy/marksweepspace/native_ms/global.rs index 41773be244..4a86dda6fa 100644 --- a/src/policy/marksweepspace/native_ms/global.rs +++ b/src/policy/marksweepspace/native_ms/global.rs @@ -8,8 +8,10 @@ use crate::{ scheduler::{GCWorkScheduler, GCWorker, WorkBucketStage}, util::{ copy::CopySemantics, + epilogue, heap::{BlockPageResource, PageResource}, metadata::{self, side_metadata::SideMetadataSpec, MetadataSpec}, + object_enum::{self, ObjectEnumerator}, ObjectReference, }, vm::{ActivePlan, VMBinding}, @@ -246,6 +248,10 @@ impl Space for MarkSweepSpace { fn release_multiple_pages(&mut self, _start: crate::util::Address) { todo!() } + + fn enumerate_objects(&self, enumerator: &mut dyn ObjectEnumerator) { + object_enum::enumerate_blocks_from_chunk_map::(enumerator, &self.chunk_map); + } } impl crate::policy::gc_work::PolicyTraceObject for MarkSweepSpace { @@ -373,6 +379,13 @@ impl MarkSweepSpace { self.scheduler.work_buckets[crate::scheduler::WorkBucketStage::Release].add(work_packet); } + pub fn end_of_gc(&mut self) { + epilogue::debug_assert_counter_zero( + &self.pending_release_packets, + "pending_release_packets", + ); + } + /// Release a block. 
pub fn release_block(&self, block: Block) { self.block_clear_metadata(block); @@ -581,3 +594,9 @@ impl RecycleBlocks { } } } + +impl Drop for RecycleBlocks { + fn drop(&mut self) { + epilogue::debug_assert_counter_zero(&self.counter, "RecycleBlocks::counter"); + } +} diff --git a/src/policy/space.rs b/src/policy/space.rs index 556e0c6bfe..0329c995b7 100644 --- a/src/policy/space.rs +++ b/src/policy/space.rs @@ -5,6 +5,7 @@ use crate::util::conversions::*; use crate::util::metadata::side_metadata::{ SideMetadataContext, SideMetadataSanity, SideMetadataSpec, }; +use crate::util::object_enum::ObjectEnumerator; use crate::util::Address; use crate::util::ObjectReference; @@ -348,6 +349,28 @@ pub trait Space: 'static + SFT + Sync + Downcast { side_metadata_sanity_checker .verify_metadata_context(std::any::type_name::(), &self.common().metadata) } + + /// Enumerate objects in the current space. + /// + /// Implementers can use the `enumerator` to report + /// + /// - individual objects within the space using `enumerator.visit_object`, and + /// - ranges of address that may contain objects using `enumerator.visit_address_range`. The + /// caller will then enumerate objects in the range using the VO bits metadata. + /// + /// Each object in the space shall be covered by one of the two methods above. + /// + /// # Implementation considerations + /// + /// **Skipping empty ranges**: When enumerating address ranges, spaces can skip ranges (blocks, + /// chunks, etc.) that are guarenteed not to contain objects. + /// + /// **Dynamic dispatch**: Because `Space` is a trait object type and `enumerator` is a `dyn` + /// reference, invoking methods of `enumerator` involves a dynamic dispatching. But the + /// overhead is OK if we call it a block at a time because scanning the VO bits will dominate + /// the execution time. For LOS, it will be cheaper to enumerate individual objects than + /// scanning VO bits because it is sparse. + fn enumerate_objects(&self, enumerator: &mut dyn ObjectEnumerator); } /// Print the VM map for a space. diff --git a/src/policy/vmspace.rs b/src/policy/vmspace.rs index bf881e84e4..786930d0cf 100644 --- a/src/policy/vmspace.rs +++ b/src/policy/vmspace.rs @@ -11,6 +11,7 @@ use crate::util::heap::PageResource; use crate::util::metadata::mark_bit::MarkState; #[cfg(feature = "set_unlog_bits_vm_space")] use crate::util::metadata::MetadataSpec; +use crate::util::object_enum::ObjectEnumerator; use crate::util::opaque_pointer::*; use crate::util::ObjectReference; use crate::vm::{ObjectModel, VMBinding}; @@ -145,6 +146,13 @@ impl Space for VMSpace { // mmapped by the runtime rather than us). So we we use SFT here. 
SFT_MAP.get_checked(start).name() == self.name() } + + fn enumerate_objects(&self, enumerator: &mut dyn ObjectEnumerator) { + let external_pages = self.pr.get_external_pages(); + for ep in external_pages.iter() { + enumerator.visit_address_range(ep.start, ep.end); + } + } } use crate::scheduler::GCWorker; diff --git a/src/scheduler/gc_work.rs b/src/scheduler/gc_work.rs index 4335ceea16..ff2fb75511 100644 --- a/src/scheduler/gc_work.rs +++ b/src/scheduler/gc_work.rs @@ -60,11 +60,17 @@ impl GCWork for Prepare { plan_mut.prepare(worker.tls); if plan_mut.constraints().needs_prepare_mutator { - for mutator in ::VMActivePlan::mutators() { - mmtk.scheduler.work_buckets[WorkBucketStage::Prepare] - .add(PrepareMutator::::new(mutator)); - } + let prepare_mutator_packets = ::VMActivePlan::mutators() + .map(|mutator| Box::new(PrepareMutator::::new(mutator)) as _) + .collect::>(); + // Just in case the VM binding is inconsistent about the number of mutators and the actual mutator list. + debug_assert_eq!( + prepare_mutator_packets.len(), + ::VMActivePlan::number_of_mutators() + ); + mmtk.scheduler.work_buckets[WorkBucketStage::Prepare].bulk_add(prepare_mutator_packets); } + for w in &mmtk.scheduler.worker_group.workers_shared { let result = w.designated_work.push(Box::new(PrepareCollector)); debug_assert!(result.is_ok()); @@ -133,10 +139,16 @@ impl GCWork for Release { let plan_mut: &mut C::PlanType = unsafe { &mut *(self.plan as *const _ as *mut _) }; plan_mut.release(worker.tls); - for mutator in ::VMActivePlan::mutators() { - mmtk.scheduler.work_buckets[WorkBucketStage::Release] - .add(ReleaseMutator::::new(mutator)); - } + let release_mutator_packets = ::VMActivePlan::mutators() + .map(|mutator| Box::new(ReleaseMutator::::new(mutator)) as _) + .collect::>(); + // Just in case the VM binding is inconsistent about the number of mutators and the actual mutator list. + debug_assert_eq!( + release_mutator_packets.len(), + ::VMActivePlan::number_of_mutators() + ); + mmtk.scheduler.work_buckets[WorkBucketStage::Release].bulk_add(release_mutator_packets); + for w in &mmtk.scheduler.worker_group.workers_shared { let result = w.designated_work.push(Box::new(ReleaseCollector)); debug_assert!(result.is_ok()); diff --git a/src/util/epilogue.rs b/src/util/epilogue.rs new file mode 100644 index 0000000000..b97a88fc74 --- /dev/null +++ b/src/util/epilogue.rs @@ -0,0 +1,13 @@ +//! Utilities for implementing epilogues. +//! +//! Epilogues are operations that are done when several other operations are done. + +use std::sync::atomic::{AtomicUsize, Ordering}; + +/// A debugging method for detecting the case when the epilogue is never executed. +pub fn debug_assert_counter_zero(counter: &AtomicUsize, what: &'static str) { + let value = counter.load(Ordering::SeqCst); + if value != 0 { + panic!("{what} is still {value}."); + } +} diff --git a/src/util/metadata/side_metadata/global.rs b/src/util/metadata/side_metadata/global.rs index 39df588b41..ad39a3d565 100644 --- a/src/util/metadata/side_metadata/global.rs +++ b/src/util/metadata/side_metadata/global.rs @@ -8,6 +8,7 @@ use crate::util::metadata::metadata_val_traits::*; use crate::util::metadata::vo_bit::VO_BIT_SIDE_METADATA_SPEC; use crate::util::Address; use num_traits::FromPrimitive; +use ranges::BitByteRange; use std::fmt; use std::io::Result; use std::sync::atomic::{AtomicU8, Ordering}; @@ -154,188 +155,77 @@ impl SideMetadataSpec { MMAPPER.is_mapped_address(meta_addr) } - /// This method is used for iterating side metadata for a data address range. 
As we cannot guarantee - /// that the data address range can be mapped to whole metadata bytes, we have to deal with cases that - /// we need to mask and zero certain bits in a metadata byte. The end address and the end bit are exclusive. - /// The end bit for update_bits could be 8, so overflowing needs to be taken care of. - /// - /// Returns true if we iterate through every bits in the range. Return false if we abort iteration early. - /// - /// Arguments: - /// * `forwards`: If true, we iterate forwards (from start/low address to end/high address). Otherwise, - /// we iterate backwards (from end/high address to start/low address). - /// * `visit_bytes`/`visit_bits`: The closures returns whether the itertion is early terminated. - pub(super) fn iterate_meta_bits( + /// This method is used for bulk zeroing side metadata for a data address range. + pub(crate) fn zero_meta_bits( meta_start_addr: Address, meta_start_bit: u8, meta_end_addr: Address, meta_end_bit: u8, - forwards: bool, - visit_bytes: &impl Fn(Address, Address) -> bool, - visit_bits: &impl Fn(Address, u8, u8) -> bool, - ) -> bool { - trace!( - "iterate_meta_bits: {} {}, {} {}", - meta_start_addr, - meta_start_bit, - meta_end_addr, - meta_end_bit - ); - // Start/end is the same, we don't need to do anything. - if meta_start_addr == meta_end_addr && meta_start_bit == meta_end_bit { - return false; - } - - // zeroing bytes - if meta_start_bit == 0 && meta_end_bit == 0 { - return visit_bytes(meta_start_addr, meta_end_addr); - } - - if meta_start_addr == meta_end_addr { - // Update bits in the same byte between start and end bit - visit_bits(meta_start_addr, meta_start_bit, meta_end_bit) - } else if meta_start_addr + 1usize == meta_end_addr && meta_end_bit == 0 { - // Update bits in the same byte after the start bit (between start bit and 8) - visit_bits(meta_start_addr, meta_start_bit, 8) - } else { - // Update each segments. - // Clippy wants to move this if block up as a else-if block. But I think this is logically more clear. So disable the clippy warning. 
- #[allow(clippy::collapsible_else_if)] - if forwards { - // update bits in the first byte - if Self::iterate_meta_bits( - meta_start_addr, - meta_start_bit, - meta_start_addr + 1usize, - 0, - forwards, - visit_bytes, - visit_bits, - ) { - return true; - } - // update bytes in the middle - if Self::iterate_meta_bits( - meta_start_addr + 1usize, - 0, - meta_end_addr, - 0, - forwards, - visit_bytes, - visit_bits, - ) { - return true; - } - // update bits in the last byte - if Self::iterate_meta_bits( - meta_end_addr, - 0, - meta_end_addr, - meta_end_bit, - forwards, - visit_bytes, - visit_bits, - ) { - return true; - } - false - } else { - // update bits in the last byte - if Self::iterate_meta_bits( - meta_end_addr, - 0, - meta_end_addr, - meta_end_bit, - forwards, - visit_bytes, - visit_bits, - ) { - return true; - } - // update bytes in the middle - if Self::iterate_meta_bits( - meta_start_addr + 1usize, - 0, - meta_end_addr, - 0, - forwards, - visit_bytes, - visit_bits, - ) { - return true; + ) { + let mut visitor = |range| { + match range { + BitByteRange::Bytes { start, end } => { + memory::zero(start, end - start); + false } - // update bits in the first byte - if Self::iterate_meta_bits( - meta_start_addr, - meta_start_bit, - meta_start_addr + 1usize, - 0, - forwards, - visit_bytes, - visit_bits, - ) { - return true; + BitByteRange::BitsInByte { + addr, + bit_start, + bit_end, + } => { + // we are zeroing selected bit in one byte + // Get a mask that the bits we need to zero are set to zero, and the other bits are 1. + let mask: u8 = + u8::MAX.checked_shl(bit_end as u32).unwrap_or(0) | !(u8::MAX << bit_start); + unsafe { addr.as_ref::() }.fetch_and(mask, Ordering::SeqCst); + false } - false } - } - } - - /// This method is used for bulk zeroing side metadata for a data address range. - pub(super) fn zero_meta_bits( - meta_start_addr: Address, - meta_start_bit: u8, - meta_end_addr: Address, - meta_end_bit: u8, - ) { - let zero_bytes = |start: Address, end: Address| -> bool { - memory::zero(start, end - start); - false - }; - let zero_bits = |addr: Address, start_bit: u8, end_bit: u8| -> bool { - // we are zeroing selected bits in one byte - let mask: u8 = - u8::MAX.checked_shl(end_bit.into()).unwrap_or(0) | !(u8::MAX << start_bit); // Get a mask that the bits we need to zero are set to zero, and the other bits are 1. - unsafe { addr.as_ref::() }.fetch_and(mask, Ordering::SeqCst); - false }; - Self::iterate_meta_bits( + ranges::break_bit_range( meta_start_addr, meta_start_bit, meta_end_addr, meta_end_bit, true, - &zero_bytes, - &zero_bits, + &mut visitor, ); } /// This method is used for bulk setting side metadata for a data address range. - pub(super) fn set_meta_bits( + pub(crate) fn set_meta_bits( meta_start_addr: Address, meta_start_bit: u8, meta_end_addr: Address, meta_end_bit: u8, ) { - let set_bytes = |start: Address, end: Address| -> bool { - memory::set(start, 0xff, end - start); - false - }; - let set_bits = |addr: Address, start_bit: u8, end_bit: u8| -> bool { - // we are setting selected bits in one byte - let mask: u8 = - !(u8::MAX.checked_shl(end_bit.into()).unwrap_or(0)) & (u8::MAX << start_bit); // Get a mask that the bits we need to set are 1, and the other bits are 0. 
- unsafe { addr.as_ref::() }.fetch_or(mask, Ordering::SeqCst); - false + let mut visitor = |range| { + match range { + BitByteRange::Bytes { start, end } => { + memory::set(start, 0xff, end - start); + false + } + BitByteRange::BitsInByte { + addr, + bit_start, + bit_end, + } => { + // we are setting selected bits in one byte + // Get a mask that the bits we need to set are 1, and the other bits are 0. + let mask: u8 = !(u8::MAX.checked_shl(bit_end as u32).unwrap_or(0)) + & (u8::MAX << bit_start); + unsafe { addr.as_ref::() }.fetch_or(mask, Ordering::SeqCst); + false + } + } }; - Self::iterate_meta_bits( + ranges::break_bit_range( meta_start_addr, meta_start_bit, meta_end_addr, meta_end_bit, true, - &set_bytes, - &set_bits, + &mut visitor, ); } @@ -498,37 +388,44 @@ impl SideMetadataSpec { debug_assert_eq!(dst_meta_start_bit, src_meta_start_bit); - let copy_bytes = |dst_start: Address, dst_end: Address| -> bool { - unsafe { - let byte_offset = dst_start - dst_meta_start_addr; - let src_start = src_meta_start_addr + byte_offset; - let size = dst_end - dst_start; - std::ptr::copy::(src_start.to_ptr(), dst_start.to_mut_ptr(), size); - false + let mut visitor = |range| { + match range { + BitByteRange::Bytes { + start: dst_start, + end: dst_end, + } => unsafe { + let byte_offset = dst_start - dst_meta_start_addr; + let src_start = src_meta_start_addr + byte_offset; + let size = dst_end - dst_start; + std::ptr::copy::(src_start.to_ptr(), dst_start.to_mut_ptr(), size); + false + }, + BitByteRange::BitsInByte { + addr: dst, + bit_start, + bit_end, + } => { + let byte_offset = dst - dst_meta_start_addr; + let src = src_meta_start_addr + byte_offset; + // we are setting selected bits in one byte + let mask: u8 = !(u8::MAX.checked_shl(bit_end as u32).unwrap_or(0)) + & (u8::MAX << bit_start); // Get a mask that the bits we need to set are 1, and the other bits are 0. + let old_src = unsafe { src.as_ref::() }.load(Ordering::Relaxed); + let old_dst = unsafe { dst.as_ref::() }.load(Ordering::Relaxed); + let new = (old_src & mask) | (old_dst & !mask); + unsafe { dst.as_ref::() }.store(new, Ordering::Relaxed); + false + } } }; - let copy_bits = |dst: Address, start_bit: u8, end_bit: u8| -> bool { - let byte_offset = dst - dst_meta_start_addr; - let src = src_meta_start_addr + byte_offset; - // we are setting selected bits in one byte - let mask: u8 = - !(u8::MAX.checked_shl(end_bit.into()).unwrap_or(0)) & (u8::MAX << start_bit); // Get a mask that the bits we need to set are 1, and the other bits are 0. - let old_src = unsafe { src.as_ref::() }.load(Ordering::Relaxed); - let old_dst = unsafe { dst.as_ref::() }.load(Ordering::Relaxed); - let new = (old_src & mask) | (old_dst & !mask); - unsafe { dst.as_ref::() }.store(new, Ordering::Relaxed); - false - }; - - Self::iterate_meta_bits( + ranges::break_bit_range( dst_meta_start_addr, dst_meta_start_bit, dst_meta_end_addr, dst_meta_end_bit, true, - ©_bytes, - ©_bits, + &mut visitor, ); } @@ -1169,47 +1066,157 @@ impl SideMetadataSpec { // The result will be set by one of the following closures. // Use Cell so it doesn't need to be mutably borrowed by the two closures which Rust will complain. - let res = std::cell::Cell::new(None); - - let check_bytes_backwards = |start: Address, end: Address| -> bool { - match helpers::find_last_non_zero_bit_in_metadata_bytes(start, end) { - helpers::FindMetaBitResult::Found { addr, bit } => { - res.set(Some(contiguous_meta_address_to_address(self, addr, bit))); - // Return true to abort the search. We found the bit. 
- true + let mut res = None; + + let mut visitor = |range: BitByteRange| { + match range { + BitByteRange::Bytes { start, end } => { + match helpers::find_last_non_zero_bit_in_metadata_bytes(start, end) { + helpers::FindMetaBitResult::Found { addr, bit } => { + res = Some(contiguous_meta_address_to_address(self, addr, bit)); + // Return true to abort the search. We found the bit. + true + } + // If we see unmapped metadata, we don't need to search any more. + helpers::FindMetaBitResult::UnmappedMetadata => true, + // Return false to continue searching. + helpers::FindMetaBitResult::NotFound => false, + } + } + BitByteRange::BitsInByte { + addr, + bit_start, + bit_end, + } => { + match helpers::find_last_non_zero_bit_in_metadata_bits(addr, bit_start, bit_end) + { + helpers::FindMetaBitResult::Found { addr, bit } => { + res = Some(contiguous_meta_address_to_address(self, addr, bit)); + // Return true to abort the search. We found the bit. + true + } + // If we see unmapped metadata, we don't need to search any more. + helpers::FindMetaBitResult::UnmappedMetadata => true, + // Return false to continue searching. + helpers::FindMetaBitResult::NotFound => false, + } } - // If we see unmapped metadata, we don't need to search any more. - helpers::FindMetaBitResult::UnmappedMetadata => true, - // Return false to continue searching. - helpers::FindMetaBitResult::NotFound => false, } }; - let check_bits_backwards = |addr: Address, start_bit: u8, end_bit: u8| -> bool { - match helpers::find_last_non_zero_bit_in_metadata_bits(addr, start_bit, end_bit) { - helpers::FindMetaBitResult::Found { addr, bit } => { - res.set(Some(contiguous_meta_address_to_address(self, addr, bit))); - // Return true to abort the search. We found the bit. - true + + ranges::break_bit_range( + start_meta_addr, + start_meta_shift, + end_meta_addr, + end_meta_shift, + false, + &mut visitor, + ); + + res.map(|addr| addr.align_down(1 << self.log_bytes_in_region)) + } + + /// Search for data addresses that have non zero values in the side metadata. This method is + /// primarily used for heap traversal by scanning the VO bits. + /// + /// This function searches the side metadata for the data address range from `data_start_addr` + /// (inclusive) to `data_end_addr` (exclusive). The data address range must be fully mapped. + /// + /// For each data region that has non-zero side metadata, `visit_data` is called with the lowest + /// address of that region. Note that it may not be the original address used to set the + /// metadata bits. + pub fn scan_non_zero_values( + &self, + data_start_addr: Address, + data_end_addr: Address, + visit_data: &mut impl FnMut(Address), + ) { + if self.uses_contiguous_side_metadata() && self.log_num_of_bits == 0 { + // Contiguous one-bit-per-region side metadata + // TODO: VO bits is one-bit-per-word. But if we want to scan other metadata (such as + // the forwarding bits which has two bits per word), we will need to refactor the + // algorithm of `scan_non_zero_values_fast`. + self.scan_non_zero_values_fast(data_start_addr, data_end_addr, visit_data); + } else { + // TODO: VO bits are always contiguous. But if we want to scan other metadata, such as + // side mark bits, we need to refactor `bulk_update_metadata` to support `FnMut`, too, + // and use it to apply `scan_non_zero_values_fast` on each contiguous side metadata + // range. + warn!( + "We are trying to search for non zero bits in a discontiguous side metadata \ + or the metadata has more than one bit per region. 
\ + The performance is slow, as MMTk does not optimize for this case." + ); + self.scan_non_zero_values_simple::(data_start_addr, data_end_addr, visit_data); + } + } + + fn scan_non_zero_values_simple( + &self, + data_start_addr: Address, + data_end_addr: Address, + visit_data: &mut impl FnMut(Address), + ) { + let region_bytes = 1usize << self.log_bytes_in_region; + + let mut cursor = data_start_addr; + while cursor < data_end_addr { + debug_assert!(cursor.is_mapped()); + + // If we find non-zero value, just call back. + if !unsafe { self.load::(cursor).is_zero() } { + visit_data(cursor); + } + cursor += region_bytes; + } + } + + fn scan_non_zero_values_fast( + &self, + data_start_addr: Address, + data_end_addr: Address, + visit_data: &mut impl FnMut(Address), + ) { + debug_assert!(self.uses_contiguous_side_metadata()); + debug_assert_eq!(self.log_num_of_bits, 0); + + // Then figure out the start and end metadata address and bits. + let start_meta_addr = address_to_contiguous_meta_address(self, data_start_addr); + let start_meta_shift = meta_byte_lshift(self, data_start_addr); + let end_meta_addr = address_to_contiguous_meta_address(self, data_end_addr); + let end_meta_shift = meta_byte_lshift(self, data_end_addr); + + let mut visitor = |range| { + match range { + BitByteRange::Bytes { start, end } => { + helpers::scan_non_zero_bits_in_metadata_bytes(start, end, &mut |addr, bit| { + visit_data(helpers::contiguous_meta_address_to_address(self, addr, bit)); + }); } - // If we see unmapped metadata, we don't need to search any more. - helpers::FindMetaBitResult::UnmappedMetadata => true, - // Return false to continue searching. - helpers::FindMetaBitResult::NotFound => false, + BitByteRange::BitsInByte { + addr, + bit_start, + bit_end, + } => helpers::scan_non_zero_bits_in_metadata_bits( + addr, + bit_start, + bit_end, + &mut |addr, bit| { + visit_data(helpers::contiguous_meta_address_to_address(self, addr, bit)); + }, + ), } + false }; - Self::iterate_meta_bits( + ranges::break_bit_range( start_meta_addr, start_meta_shift, end_meta_addr, end_meta_shift, false, - &check_bytes_backwards, - &check_bits_backwards, + &mut visitor, ); - - res.get() - .map(|addr| addr.align_down(1 << self.log_bytes_in_region)) } } diff --git a/src/util/metadata/side_metadata/helpers.rs b/src/util/metadata/side_metadata/helpers.rs index 845ad79666..9eb8823649 100644 --- a/src/util/metadata/side_metadata/helpers.rs +++ b/src/util/metadata/side_metadata/helpers.rs @@ -1,3 +1,4 @@ +use super::ranges::BitOffset; use super::SideMetadataSpec; use crate::util::constants::LOG_BYTES_IN_PAGE; use crate::util::constants::{BITS_IN_WORD, BYTES_IN_PAGE, LOG_BITS_IN_BYTE}; @@ -269,6 +270,59 @@ fn find_last_non_zero_bit_in_u8(byte_value: u8) -> Option { } } +pub fn scan_non_zero_bits_in_metadata_bytes( + meta_start: Address, + meta_end: Address, + visit_bit: &mut impl FnMut(Address, BitOffset), +) { + use crate::util::constants::BYTES_IN_ADDRESS; + + let mut cursor = meta_start; + while cursor < meta_end && !cursor.is_aligned_to(BYTES_IN_ADDRESS) { + let byte = unsafe { cursor.load::() }; + scan_non_zero_bits_in_metadata_word(cursor, byte as usize, visit_bit); + cursor += 1usize; + } + + while cursor + BYTES_IN_ADDRESS < meta_end { + let word = unsafe { cursor.load::() }; + scan_non_zero_bits_in_metadata_word(cursor, word, visit_bit); + cursor += BYTES_IN_ADDRESS; + } + + while cursor < meta_end { + let byte = unsafe { cursor.load::() }; + scan_non_zero_bits_in_metadata_word(cursor, byte as usize, visit_bit); + cursor += 1usize; + 
} +} + +fn scan_non_zero_bits_in_metadata_word( + meta_addr: Address, + mut word: usize, + visit_bit: &mut impl FnMut(Address, BitOffset), +) { + while word != 0 { + let bit = word.trailing_zeros(); + visit_bit(meta_addr, bit as u8); + word = word & (word - 1); + } +} + +pub fn scan_non_zero_bits_in_metadata_bits( + meta_addr: Address, + bit_start: BitOffset, + bit_end: BitOffset, + visit_bit: &mut impl FnMut(Address, BitOffset), +) { + let byte = unsafe { meta_addr.load::() }; + for bit in bit_start..bit_end { + if byte & (1 << bit) != 0 { + visit_bit(meta_addr, bit); + } + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/src/util/metadata/side_metadata/mod.rs b/src/util/metadata/side_metadata/mod.rs index 406f91167d..16111b42c1 100644 --- a/src/util/metadata/side_metadata/mod.rs +++ b/src/util/metadata/side_metadata/mod.rs @@ -2,11 +2,12 @@ // For convenience, this module is public and the bindings may create and use side metadata for their purpose. mod constants; -mod helpers; +pub(crate) mod helpers; #[cfg(target_pointer_width = "32")] mod helpers_32; mod global; +pub(crate) mod ranges; mod sanity; mod side_metadata_tests; pub(crate) mod spec_defs; diff --git a/src/util/metadata/side_metadata/ranges.rs b/src/util/metadata/side_metadata/ranges.rs new file mode 100644 index 0000000000..48d12a7068 --- /dev/null +++ b/src/util/metadata/side_metadata/ranges.rs @@ -0,0 +1,369 @@ +//! Data types for visiting metadata ranges at different granularities. +//! +//! Currently, the `break_bit_range` function can break a bit range into sub-ranges of whole bytes +//! and in-byte bits. +//! +//! TODO: +//! +//! - Add a function to break a byte range into sub-ranges of whole words and in-word bytes. +//! - And use it for searching side metadata for non-zero bits. +//! - Add a function to break a byte range at chunk boundaries. +//! - And use it for visiting discontiguous side metadata in bulk. + +use crate::util::Address; + +/// The type for bit offset in a byte. +pub type BitOffset = u8; + +/// A range of bytes or bits within a byte. It is the unit of visiting a contiguous bit range of a +/// side metadata. +/// +/// In general, a bit range of a bitmap starts with multiple bits in the byte, followed by many +/// whole bytes, and ends with multiple bits in the last byte. +/// +/// A range is never empty. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum BitByteRange { + /// A range of whole bytes. + Bytes { + /// The starting address (inclusive) of the bytes. + start: Address, + /// The ending address (exclusive) of the bytes. + end: Address, + }, + /// A range of bits within a byte. + BitsInByte { + /// The address of the byte. + addr: Address, + /// The starting bit index (inclusive), starting with zero from the low-order bit. + bit_start: BitOffset, + /// The ending bit index (exclusive), starting with zero from the low-order bit. This may + /// be 8 which means the range includes the highest bit. Be careful when shifting a `u8` + /// value because shifting an `u8` by 8 is considered an overflow in Rust. + bit_end: BitOffset, + }, +} + +/// Break a bit range into sub-ranges of whole bytes and in-byte bits. +/// +/// This method is primarily used for iterating side metadata for a data address range. As we cannot +/// guarantee that the data address range can be mapped to whole metadata bytes, we have to deal +/// with visiting only a bit range in a metadata byte. 
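+///
+/// For example (with a hypothetical byte address `A`), breaking the range from bit 3 of the byte
+/// at `A` to bit 5 of the byte at `A + 3`, iterating forwards, visits three sub-ranges in order:
+/// `BitsInByte { addr: A, bit_start: 3, bit_end: 8 }`, then `Bytes { start: A + 1, end: A + 3 }`,
+/// then `BitsInByte { addr: A + 3, bit_start: 0, bit_end: 5 }`.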
+/// +/// The bit range starts at the bit at index `start_bit` in the byte at address `start_addr`, and +/// ends at (but does not include) the bit at index `end_bit` in the byte at address `end_addr`. +/// +/// Arguments: +/// * `forwards`: If true, we iterate forwards (from start/low address to end/high address). +/// Otherwise, we iterate backwards (from end/high address to start/low address). +/// * `visitor`: The callback that visits ranges of bits or bytes. It returns whether the +/// iteration is terminated early. +/// +/// Returns true if we iterate through every bit in the range. Returns false if we abort +/// the iteration early. +pub fn break_bit_range<V>( + start_addr: Address, + start_bit: BitOffset, + end_addr: Address, + end_bit: BitOffset, + forwards: bool, + visitor: &mut V, +) -> bool +where + V: FnMut(BitByteRange) -> bool, +{ + // The start and the end are the same. We don't need to do anything. + if start_addr == end_addr && start_bit == end_bit { + return false; + } + + // If the range is already byte-aligned, visit the entire range as whole bytes. + if start_bit == 0 && end_bit == 0 { + return visitor(BitByteRange::Bytes { + start: start_addr, + end: end_addr, + }); + } + + // If the start and the end are within the same byte, + // visit the bit range within the byte. + if start_addr == end_addr { + return visitor(BitByteRange::BitsInByte { + addr: start_addr, + bit_start: start_bit, + bit_end: end_bit, + }); + } + + // If the end is the 0th bit of the next byte of the start, + // visit the bit range from the start to the end (bit 8) of the same byte. + if start_addr + 1usize == end_addr && end_bit == 0 { + return visitor(BitByteRange::BitsInByte { + addr: start_addr, + bit_start: start_bit, + bit_end: 8_u8, + }); + } + + // Otherwise, the range spans over multiple bytes, and is bit-unaligned at either the start or + // the end. Try to break it into (at most) three sub-ranges. + + let start_aligned = start_bit == 0; + let end_aligned = end_bit == 0; + + // We cannot let multiple closures capture `visitor` mutably at the same time, so we pass the + // visitor in as `v` every time. + + // visit bits within the first byte + let visit_start = |v: &mut V| { + if !start_aligned { + v(BitByteRange::BitsInByte { + addr: start_addr, + bit_start: start_bit, + bit_end: 8_u8, + }) + } else { + // The start is already aligned. No sub-byte range at the start. + false + } + }; + + // visit whole bytes in the middle + let visit_middle = |v: &mut V| { + let start = if start_aligned { + start_addr + } else { + // If the start is not aligned, the whole-byte range starts after the first byte. + start_addr + 1usize + }; + let end = end_addr; + if start < end { + v(BitByteRange::Bytes { start, end }) + } else { + // There are no whole bytes in the middle. + false + } + }; + + // visit bits within the last byte + let visit_end = |v: &mut V| { + if !end_aligned { + v(BitByteRange::BitsInByte { + addr: end_addr, + bit_start: 0_u8, + bit_end: end_bit, + }) + } else { + // The end is aligned. No sub-byte range at the end. + false + } + }; + + // Visit the three segments forwards or backwards.
+ if forwards { + visit_start(visitor) || visit_middle(visitor) || visit_end(visitor) + } else { + visit_end(visitor) || visit_middle(visitor) || visit_start(visitor) + } +} + +#[cfg(test)] +mod tests { + use crate::util::constants::BITS_IN_BYTE; + + use super::*; + + fn mk_addr(addr: usize) -> Address { + unsafe { Address::from_usize(addr) } + } + + fn break_bit_range_wrapped( + start_addr: Address, + start_bit: usize, + end_addr: Address, + end_bit: usize, + ) -> Vec { + let mut vec = vec![]; + break_bit_range( + start_addr, + start_bit as u8, + end_addr, + end_bit as u8, + true, + &mut |range| { + vec.push(range); + false + }, + ); + vec + } + + #[test] + fn test_empty_range() { + let base = mk_addr(0x1000); + for bit in 0..BITS_IN_BYTE { + let result = break_bit_range_wrapped(base, bit, base, bit); + assert!( + result.is_empty(), + "Not empty. bit: {bit}, result: {result:?}" + ); + } + } + + #[test] + fn test_subbyte_range() { + let base = mk_addr(0x1000); + for bit0 in 0..BITS_IN_BYTE { + for bit1 in (bit0 + 1)..BITS_IN_BYTE { + let result = break_bit_range_wrapped(base, bit0, base, bit1); + assert_eq!( + result, + vec![BitByteRange::BitsInByte { + addr: base, + bit_start: bit0 as u8, + bit_end: bit1 as u8 + }], + "Not equal. bit0: {bit0}, bit1: {bit1}", + ); + } + } + } + + #[test] + fn test_end_byte_range() { + let base = mk_addr(0x1000); + for bit0 in 1..BITS_IN_BYTE { + let result = break_bit_range_wrapped(base, bit0, base + 1usize, 0); + assert_eq!( + result, + vec![BitByteRange::BitsInByte { + addr: base, + bit_start: bit0 as u8, + bit_end: BITS_IN_BYTE as u8 + }], + "Not equal. bit0: {bit0}", + ); + } + } + + #[test] + fn test_adjacent_grain_range() { + let base = mk_addr(0x1000); + for bit0 in 1..BITS_IN_BYTE { + for bit1 in 1..BITS_IN_BYTE { + let result = break_bit_range_wrapped(base, bit0, base + 1usize, bit1); + assert_eq!( + result, + vec![ + BitByteRange::BitsInByte { + addr: base, + bit_start: bit0 as u8, + bit_end: BITS_IN_BYTE as u8, + }, + BitByteRange::BitsInByte { + addr: base + 1usize, + bit_start: 0, + bit_end: bit1 as u8, + }, + ], + "Not equal. bit0: {bit0}, bit1: {bit1}", + ); + } + } + } + + #[test] + fn test_left_and_whole_range() { + let base = mk_addr(0x1000); + for bit0 in 1..BITS_IN_BYTE { + for byte1 in 2usize..8 { + let result = break_bit_range_wrapped(base, bit0, base + byte1, 0); + assert_eq!( + result, + vec![ + BitByteRange::BitsInByte { + addr: base, + bit_start: bit0 as u8, + bit_end: BITS_IN_BYTE as u8, + }, + BitByteRange::Bytes { + start: base + 1usize, + end: base + byte1, + }, + ], + "Not equal. bit0: {bit0}, byte1: {byte1}", + ); + } + } + } + + #[test] + fn test_whole_and_right_range() { + let base = mk_addr(0x1000); + for byte0 in 1..8 { + for bit1 in 1..BITS_IN_BYTE { + let result = break_bit_range_wrapped(base - byte0, 0, base, bit1); + assert_eq!( + result, + vec![ + BitByteRange::Bytes { + start: base - byte0, + end: base, + }, + BitByteRange::BitsInByte { + addr: base, + bit_start: 0, + bit_end: bit1 as u8, + }, + ], + "Not equal. 
byte0: {byte0}, bit1: {bit1}", + ); + } + } + } + + #[test] + fn test_whole_range() { + let base = mk_addr(0x1000); + let result = break_bit_range_wrapped(base, 0, base + 42usize, 0); + assert_eq!( + result, + vec![BitByteRange::Bytes { + start: base, + end: base + 42usize, + },], + ); + } + + #[test] + fn test_left_whole_right_range() { + let base0 = mk_addr(0x1000); + let base1 = mk_addr(0x2000); + + for bit0 in 1..BITS_IN_BYTE { + for bit1 in 1..BITS_IN_BYTE { + let result = break_bit_range_wrapped(base0 - 1usize, bit0, base1, bit1); + assert_eq!( + result, + vec![ + BitByteRange::BitsInByte { + addr: base0 - 1usize, + bit_start: bit0 as u8, + bit_end: BITS_IN_BYTE as u8, + }, + BitByteRange::Bytes { + start: base0, + end: base1, + }, + BitByteRange::BitsInByte { + addr: base1, + bit_start: 0, + bit_end: bit1 as u8, + }, + ], + "Not equal. bit0: {bit0}, bit1: {bit1}", + ); + } + } + } +} diff --git a/src/util/metadata/vo_bit/mod.rs b/src/util/metadata/vo_bit/mod.rs index e7b0f2551b..3ae0aee8d7 100644 --- a/src/util/metadata/vo_bit/mod.rs +++ b/src/util/metadata/vo_bit/mod.rs @@ -201,7 +201,7 @@ fn get_in_object_address_for_potential_object(potential_obj: Addr } /// Get the object reference from an aligned address where VO bit is set. -fn get_object_ref_for_vo_addr(vo_addr: Address) -> ObjectReference { +pub(crate) fn get_object_ref_for_vo_addr(vo_addr: Address) -> ObjectReference { let addr = vo_addr.offset(-VM::VMObjectModel::IN_OBJECT_ADDRESS_OFFSET); let aligned = addr.align_up(ObjectReference::ALIGNMENT); unsafe { ObjectReference::from_raw_address_unchecked(aligned) } diff --git a/src/util/mod.rs b/src/util/mod.rs index 8036ad7ae1..d22c29a2e3 100644 --- a/src/util/mod.rs +++ b/src/util/mod.rs @@ -36,6 +36,8 @@ pub mod metadata; pub mod opaque_pointer; /// MMTk command line options. pub mod options; +#[cfg(feature = "test_private")] +pub mod test_private; /// Test utilities. We need this module for `MockVM` in criterion benches, which does not include code with `cfg(test)`. #[cfg(any(test, feature = "mock_test"))] pub mod test_util; @@ -44,12 +46,14 @@ pub mod test_util; /// An analysis framework for collecting data and profiling in GC. #[cfg(feature = "analysis")] pub(crate) mod analysis; +pub(crate) mod epilogue; /// Non-generic refs to generic types of ``. pub(crate) mod erase_vm; /// Finalization implementation. pub(crate) mod finalizable_processor; /// Logger initialization pub(crate) mod logger; +pub(crate) mod object_enum; /// Forwarding word in object copying. pub(crate) mod object_forwarding; /// Reference processing implementation. diff --git a/src/util/object_enum.rs b/src/util/object_enum.rs new file mode 100644 index 0000000000..61378f7a57 --- /dev/null +++ b/src/util/object_enum.rs @@ -0,0 +1,107 @@ +//! Helper types for object enumeration + +use std::marker::PhantomData; + +use crate::vm::VMBinding; + +use super::{ + heap::{ + chunk_map::{ChunkMap, ChunkState}, + MonotonePageResource, + }, + linear_scan::Region, + metadata::{side_metadata::spec_defs::VO_BIT, vo_bit}, + Address, ObjectReference, +}; + +/// A trait for enumerating objects in spaces, used by [`Space::enumerate_objects`]. +/// +/// [`Space::enumerate_objects`]: crate::policy::space::Space::enumerate_objects +pub trait ObjectEnumerator { + /// Visit a single object. + fn visit_object(&mut self, object: ObjectReference); + /// Visit an address range that may contain objects. 
+ fn visit_address_range(&mut self, start: Address, end: Address); +} + +/// An implementation of `ObjectEnumerator` that wraps a callback. +pub struct ClosureObjectEnumerator<F, VM> +where + F: FnMut(ObjectReference), + VM: VMBinding, +{ + object_callback: F, + phantom_data: PhantomData<VM>, +} + +impl<F, VM> ClosureObjectEnumerator<F, VM> +where + F: FnMut(ObjectReference), + VM: VMBinding, +{ + pub fn new(object_callback: F) -> Self { + Self { + object_callback, + phantom_data: PhantomData, + } + } +} + +impl<F, VM> ObjectEnumerator for ClosureObjectEnumerator<F, VM> +where + F: FnMut(ObjectReference), + VM: VMBinding, +{ + fn visit_object(&mut self, object: ObjectReference) { + (self.object_callback)(object); + } + + fn visit_address_range(&mut self, start: Address, end: Address) { + VO_BIT.scan_non_zero_values::<u8>(start, end, &mut |address| { + let object = vo_bit::get_object_ref_for_vo_addr::<VM>(address); + (self.object_callback)(object); + }) + } +} + +/// Allow querying if a block may have objects. `MarkSweepSpace` and `ImmixSpace` use different +/// `Block` types, and they have different block states. This trait lets both `Block` types provide +/// the same `may_have_objects` method. +pub(crate) trait BlockMayHaveObjects: Region { + /// Return `true` if the block may contain valid objects (objects with the VO bit set). Return + /// `false` otherwise. + /// + /// This function is used during object enumeration to filter out memory regions that do not + /// contain objects. Because object enumeration may happen at mutator time, and other mutators + /// may be allocating objects, this function only needs to reflect the state of the block at the + /// time of calling, and is allowed to conservatively return `true`. + fn may_have_objects(&self) -> bool; +} + +pub(crate) fn enumerate_blocks_from_chunk_map<B>( + enumerator: &mut dyn ObjectEnumerator, + chunk_map: &ChunkMap, +) where + B: BlockMayHaveObjects, +{ + for chunk in chunk_map.all_chunks() { + if chunk_map.get(chunk) == ChunkState::Allocated { + for block in chunk.iter_region::<B>() { + if block.may_have_objects() { + enumerator.visit_address_range(block.start(), block.end()); + } + } + } + } +} + +pub(crate) fn enumerate_blocks_from_monotonic_page_resource<VM>( + enumerator: &mut dyn ObjectEnumerator, + pr: &MonotonePageResource<VM>, +) where + VM: VMBinding, +{ + for (start, size) in pr.iterate_allocated_regions() { + enumerator.visit_address_range(start, start + size); + } +} diff --git a/src/util/test_private/mod.rs b/src/util/test_private/mod.rs new file mode 100644 index 0000000000..c3f4dfceae --- /dev/null +++ b/src/util/test_private/mod.rs @@ -0,0 +1,36 @@ +//! This module exposes private items in mmtk-core for testing and benchmarking. They must not be +//! used in production. +//! +//! # Notes on inlining +//! +//! In mmtk-core, we refrain from inserting inlining hints manually. But we use `#[inline(always)]` +//! in this module explicitly because the functions here are simple wrappers of private functions, +//! and the compiler usually fails to make the right decision given that those functions are not +//! used often, and we don't compile the benchmarks using feedback-directed optimizations. + +pub use crate::util::metadata::side_metadata::helpers::scan_non_zero_bits_in_metadata_bytes; +use crate::util::metadata::side_metadata::SideMetadataSpec; + +use super::Address; + +/// Expose `zero_meta_bits` when running `cargo bench`.
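Putting the pieces together, a space backed by a chunk map can implement object enumeration by delegating to `enumerate_blocks_from_chunk_map`. The sketch below is hypothetical: the `Block` type and the `chunk_map` field are assumptions used only to show how the helper is intended to be called.

```rust
// Hypothetical `enumerate_objects` method on a block-structured space.
// `Block` is assumed to implement `BlockMayHaveObjects`, and `self.chunk_map` is assumed
// to be the `ChunkMap` owned by the space; both names are illustrative.
fn enumerate_objects(&self, enumerator: &mut dyn ObjectEnumerator) {
    // Visit every block that may contain live objects; the enumerator then decodes the
    // VO bits within each block's address range.
    object_enum::enumerate_blocks_from_chunk_map::<Block>(enumerator, &self.chunk_map);
}
```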
+#[inline(always)] +pub fn zero_meta_bits( + meta_start_addr: Address, + meta_start_bit: u8, + meta_end_addr: Address, + meta_end_bit: u8, +) { + SideMetadataSpec::zero_meta_bits(meta_start_addr, meta_start_bit, meta_end_addr, meta_end_bit) +} + +/// Expose `set_meta_bits` when running `cargo bench`. +#[inline(always)] +pub fn set_meta_bits( + meta_start_addr: Address, + meta_start_bit: u8, + meta_end_addr: Address, + meta_end_bit: u8, +) { + SideMetadataSpec::set_meta_bits(meta_start_addr, meta_start_bit, meta_end_addr, meta_end_bit) +} diff --git a/src/util/treadmill.rs b/src/util/treadmill.rs index e8cfe92d4f..96b5eb9e7d 100644 --- a/src/util/treadmill.rs +++ b/src/util/treadmill.rs @@ -4,6 +4,8 @@ use std::sync::Mutex; use crate::util::ObjectReference; +use super::object_enum::ObjectEnumerator; + pub struct TreadMill { from_space: Mutex<HashSet<ObjectReference>>, to_space: Mutex<HashSet<ObjectReference>>, @@ -99,6 +101,17 @@ impl TreadMill { trace!("Flipped from_space and to_space"); } } + + pub(crate) fn enumerate_objects(&self, enumerator: &mut dyn ObjectEnumerator) { + let mut visit_objects = |set: &Mutex<HashSet<ObjectReference>>| { + let set = set.lock().unwrap(); + for object in set.iter() { + enumerator.visit_object(*object); + } + }; + visit_objects(&self.alloc_nursery); + visit_objects(&self.to_space); + } } impl Default for TreadMill { diff --git a/src/vm/tests/mock_tests/mock_test_heap_traversal.rs b/src/vm/tests/mock_tests/mock_test_heap_traversal.rs new file mode 100644 index 0000000000..430c46ff2a --- /dev/null +++ b/src/vm/tests/mock_tests/mock_test_heap_traversal.rs @@ -0,0 +1,87 @@ +// GITHUB-CI: MMTK_PLAN=NoGC,MarkSweep,MarkCompact,SemiSpace,Immix +// GITHUB-CI: FEATURES=vo_bit + +// Note on the plans chosen for CI: +// - Those plans cover the MarkSweepSpace, MarkCompactSpace, CopySpace and ImmixSpace. +// Each plan other than NoGC also includes ImmortalSpace and LOS. +// - PageProtect consumes too much memory and the test will fail with the default heap size +// chosen by the MutatorFixture. + +use std::collections::HashSet; + +use constants::BYTES_IN_WORD; + +use super::mock_test_prelude::*; + +use crate::{util::*, AllocationSemantics, MMTK}; + +lazy_static! { + static ref FIXTURE: Fixture<MutatorFixture> = Fixture::new(); +} + +pub fn get_all_objects(mmtk: &'static MMTK<MockVM>) -> HashSet<ObjectReference> { + let mut result = HashSet::new(); + mmtk.enumerate_objects(|object| { + result.insert(object); + }); + result +} + +#[test] +pub fn test_heap_traversal() { + with_mockvm( + default_setup, + || { + FIXTURE.with_fixture_mut(|fixture| { + let mmtk = fixture.mmtk(); + let traversal0 = get_all_objects(mmtk); + assert!(traversal0.is_empty()); + + let mutator = &mut fixture.mutator; + + let align = BYTES_IN_WORD; + + let mut new_obj = |size: usize, semantics: AllocationSemantics| { + let start = memory_manager::alloc(mutator, size, align, 0, semantics); + let object = MockVM::object_start_to_ref(start); + memory_manager::post_alloc(mutator, object, size, semantics); + object + }; + + let mut known_objects = HashSet::new(); + + let mut new_and_assert = |size: usize, semantics: AllocationSemantics| { + let object = new_obj(size, semantics); + known_objects.insert(object); + let traversal = get_all_objects(mmtk); + assert_eq!(traversal, known_objects); + }; + + { + use AllocationSemantics::*; + + // Add some single objects. Size doesn't matter.
+ new_and_assert(40, Default); + new_and_assert(64, Default); + new_and_assert(96, Immortal); + new_and_assert(131000, Los); + + // Allocate enough memory to fill up a 64KB Immix block + for _ in 0..1000 { + new_and_assert(112, Default); + } + // Allocate a few objects in the immortal space. + for _ in 0..10 { + new_and_assert(64, Immortal); + } + // And a few objects in the LOS. + for _ in 0..3 { + // The MutatorFixture only has 1MB of memory. Don't allocate too much. + new_and_assert(65504, Los); + } + } + }); + }, + no_cleanup, + ) +} diff --git a/src/vm/tests/mock_tests/mod.rs b/src/vm/tests/mock_tests/mod.rs index 751bc58c08..114d8f1859 100644 --- a/src/vm/tests/mock_tests/mod.rs +++ b/src/vm/tests/mock_tests/mod.rs @@ -35,6 +35,8 @@ mod mock_test_conservatism; #[cfg(target_os = "linux")] mod mock_test_handle_mmap_conflict; mod mock_test_handle_mmap_oom; +#[cfg(feature = "vo_bit")] +mod mock_test_heap_traversal; mod mock_test_init_fork; #[cfg(feature = "is_mmtk_object")] mod mock_test_internal_ptr_before_object_ref;
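From a binding's point of view, the new heap-traversal API boils down to `MMTK::enumerate_objects` taking a closure that is called for every object with the VO bit set, exactly as the mock test above uses it. Below is a hedged sketch of a helper a binding might write on top of it; the `count_objects` name is illustrative and not part of this patch.

```rust
// Illustrative helper a VM binding might build on the new heap-traversal API.
// Assumes the `vo_bit` feature is enabled, as in the mock test above.
use mmtk::util::ObjectReference;
use mmtk::vm::VMBinding;
use mmtk::MMTK;

fn count_objects<VM: VMBinding>(mmtk: &MMTK<VM>) -> usize {
    let mut count = 0usize;
    mmtk.enumerate_objects(|_object: ObjectReference| {
        count += 1;
    });
    count
}
```

Because enumeration may run while mutators are allocating (see the note on `BlockMayHaveObjects` earlier in this patch), such a count is best treated as a snapshot rather than an exact census.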