From a4dceebd8ffb65d3edcd9bf5c3dfc5df3abbbd46 Mon Sep 17 00:00:00 2001
From: Matt Keeter <matt@oxide.computer>
Date: Fri, 20 Oct 2023 08:28:21 -0400
Subject: [PATCH] Encode active context in metadata region

---
 downstairs/src/extent_inner_raw.rs    | 213 +++++++++++++++-----------
 downstairs/src/extent_inner_sqlite.rs |   6 +
 2 files changed, 133 insertions(+), 86 deletions(-)

diff --git a/downstairs/src/extent_inner_raw.rs b/downstairs/src/extent_inner_raw.rs
index 5db52f9c9..1eb3a833a 100644
--- a/downstairs/src/extent_inner_raw.rs
+++ b/downstairs/src/extent_inner_raw.rs
@@ -62,13 +62,18 @@ const DEFRAGMENT_THRESHOLD: u64 = 3;
 /// - [`BLOCK_META_SIZE_BYTES`], which contains an [`OnDiskMeta`] serialized
 ///   using `bincode`.  The first byte of this range is `dirty`, serialized as a
 ///   `u8` (where `1` is dirty and `0` is clean).
-/// - Block contexts (for encryption).  Each block index (in the range
-///   `0..extent_size`) has two context slots; we use a ping-pong strategy when
-///   writing to ensure that one slot is always valid.  Each slot is
-///   [`BLOCK_CONTEXT_SLOT_SIZE_BYTES`] in size, so this region is
-///   `BLOCK_CONTEXT_SLOT_SIZE_BYTES * extent_size * 2` bytes in total.  The
-///   slots contain an `Option<OnDiskDownstairsBlockContext>`, serialized using
-///   `bincode`.
+/// - Active context slots, stored as a bit-packed array (where 0 is
+///   [`ContextSlot::A`] and 1 is [`ContextSlot::B`]).  This array contains
+///   `(extent_size + 7) / 8` bytes.  It is only valid when the `dirty` bit is
+///   cleared.  This is an optimization that speeds up opening a clean extent
+///   file; otherwise, we would have to rehash every block to find the active
+///   context slot.
+/// - Block contexts (for encryption).  There are two arrays of context slots,
+///   each containing `extent_size` elements (i.e. one slot for each block).
+///   Each slot is [`BLOCK_CONTEXT_SLOT_SIZE_BYTES`] in size, so this section of
+///   the file is `BLOCK_CONTEXT_SLOT_SIZE_BYTES * extent_size * 2` bytes in
+///   total.  The slots contain an `Option<OnDiskDownstairsBlockContext>`,
+///   serialized using `bincode`.
 #[derive(Debug)]
 pub struct RawInner {
     file: File,
@@ -517,6 +522,7 @@ impl RawInner {
         let bcount = def.extent_size().value;
         let size = def.block_size().checked_mul(bcount).unwrap()
             + BLOCK_META_SIZE_BYTES
+            + (bcount + 7) / 8
             + BLOCK_CONTEXT_SLOT_SIZE_BYTES * bcount * 2;
 
         mkdir_for_file(&path)?;
@@ -571,8 +577,9 @@ impl RawInner {
         let extent_size = def.extent_size();
         let bcount = extent_size.value;
         let size = def.block_size().checked_mul(bcount).unwrap()
-            + BLOCK_CONTEXT_SLOT_SIZE_BYTES * bcount * 2
-            + BLOCK_META_SIZE_BYTES;
+            + BLOCK_META_SIZE_BYTES
+            + (bcount + 7) / 8
+            + BLOCK_CONTEXT_SLOT_SIZE_BYTES * bcount * 2;
 
         /*
          * Open the extent file and verify the size is as we expect.
@@ -620,84 +627,112 @@ impl RawInner {
         // Buffer the file so we don't spend all day waiting on syscalls
         let mut file_buffered = BufReader::with_capacity(64 * 1024, &file);
 
-        // Read the metadata block
-        let dirty = {
-            let mut meta_buf = [0u8; BLOCK_META_SIZE_BYTES as usize];
-            file_buffered.read_exact(&mut meta_buf)?;
-            match meta_buf[0] {
-                0 => false,
-                1 => true,
-                i => bail!("invalid dirty value: {i}"),
-            }
+        // Read the metadata block and active slots
+        let mut meta_buf = [0u8; BLOCK_META_SIZE_BYTES as usize];
+        file_buffered.read_exact(&mut meta_buf)?;
+        let dirty = match meta_buf[0] {
+            0 => false,
+            1 => true,
+            i => bail!("invalid dirty value: {i}"),
         };
 
-        // Read the two context slot arrays
-        let mut context_arrays = vec![];
-        for _slot in [ContextSlot::A, ContextSlot::B] {
-            let mut contexts = Vec::with_capacity(bcount as usize);
-            let mut buf = vec![0; BLOCK_CONTEXT_SLOT_SIZE_BYTES as usize];
-            for _block in 0..bcount as usize {
-                file_buffered.read_exact(&mut buf)?;
-                let context: Option<OnDiskDownstairsBlockContext> =
-                    bincode::deserialize(&buf).map_err(|e| {
-                        CrucibleError::IoError(format!(
-                            "context deserialization failed: {e}"
-                        ))
-                    })?;
-                contexts.push(context);
+        // If the file is dirty, then we have to recompute which context slot is
+        // active for every block.  This is slow, but can't be avoided; we
+        // closed the file without a flush so we can't be confident about the
+        // data that was on disk.
+        let active_context = if !dirty {
+            // Easy case first: if it's **not** dirty, then just assign active
+            // slots based on trailing bytes in the metadata section of the file
+            let mut active_context = vec![];
+            let mut buf = vec![0u8; (bcount as usize + 7) / 8];
+            file_buffered.read_exact(&mut buf)?;
+            for b in buf[BLOCK_META_SIZE_BYTES as usize..].iter() {
+                // Unpack bits from each byte
+                for i in 0..8 {
+                    active_context.push(if b & (1 << i) == 0 {
+                        ContextSlot::A
+                    } else {
+                        ContextSlot::B
+                    });
+                }
             }
-            context_arrays.push(contexts);
-        }
-
-        file.seek(SeekFrom::Start(0))?;
-        let mut file_buffered = BufReader::with_capacity(64 * 1024, &file);
-        let mut active_context = vec![];
-        let mut buf = vec![0; extent_size.block_size_in_bytes() as usize];
-        let mut last_seek_block = 0;
-        for (block, (context_a, context_b)) in context_arrays[0]
-            .iter()
-            .zip(context_arrays[1].iter())
-            .enumerate()
-        {
-            let slot = if context_a == context_b {
-                // If both slots are identical, then either they're both None or
-                // we have defragmented recently (which copies the active slot
-                // to the inactive one).  That makes life easy!
-                ContextSlot::A
-            } else {
-                // Otherwise, we have to compute hashes from the file.
-                if block != last_seek_block {
-                    file_buffered.seek_relative(
-                        (block - last_seek_block) as i64
-                            * extent_size.block_size_in_bytes() as i64,
-                    )?;
+            // It's possible that block count isn't a multiple of 8; in that
+            // case, shrink down the active context array.
+            assert!(bcount as usize <= active_context.len());
+            active_context.resize(bcount as usize, ContextSlot::A);
+            active_context
+        } else {
+            // Read the two context slot arrays
+            let mut context_arrays = vec![];
+            for _slot in [ContextSlot::A, ContextSlot::B] {
+                let mut contexts = Vec::with_capacity(bcount as usize);
+                let mut buf = vec![0; BLOCK_CONTEXT_SLOT_SIZE_BYTES as usize];
+                for _block in 0..bcount as usize {
+                    file_buffered.read_exact(&mut buf)?;
+                    let context: Option<OnDiskDownstairsBlockContext> =
+                        bincode::deserialize(&buf).map_err(|e| {
+                            CrucibleError::IoError(format!(
+                                "context deserialization failed: {e}"
+                            ))
+                        })?;
+                    contexts.push(context);
                 }
-                file_buffered.read_exact(&mut buf)?;
-                last_seek_block = block + 1; // since we just read a block
-                let hash = integrity_hash(&[&buf]);
-
-                let mut matching_slot = None;
-                let mut empty_slot = None;
-
-                for slot in [ContextSlot::A, ContextSlot::B] {
-                    let context = [context_a, context_b][slot as usize];
-                    if let Some(context) = context {
-                        if context.on_disk_hash == hash {
-                            matching_slot = Some(slot);
+                context_arrays.push(contexts);
+            }
+
+            file.seek(SeekFrom::Start(0))?;
+            let mut file_buffered = BufReader::with_capacity(64 * 1024, &file);
+            let mut active_context = vec![];
+            let mut buf = vec![0; extent_size.block_size_in_bytes() as usize];
+            let mut last_seek_block = 0;
+            for (block, (context_a, context_b)) in context_arrays[0]
+                .iter()
+                .zip(context_arrays[1].iter())
+                .enumerate()
+            {
+                let slot = if context_a == context_b {
+                    // If both slots are identical, then either they're both None or
+                    // we have defragmented recently (which copies the active slot
+                    // to the inactive one).  That makes life easy!
+                    ContextSlot::A
+                } else {
+                    // Otherwise, we have to compute hashes from the file.
+                    if block != last_seek_block {
+                        file_buffered.seek_relative(
+                            (block - last_seek_block) as i64
+                                * extent_size.block_size_in_bytes() as i64,
+                        )?;
+                    }
+                    file_buffered.read_exact(&mut buf)?;
+                    last_seek_block = block + 1; // since we just read a block
+                    let hash = integrity_hash(&[&buf]);
+
+                    let mut matching_slot = None;
+                    let mut empty_slot = None;
+
+                    for slot in [ContextSlot::A, ContextSlot::B] {
+                        let context = [context_a, context_b][slot as usize];
+                        if let Some(context) = context {
+                            if context.on_disk_hash == hash {
+                                matching_slot = Some(slot);
+                            }
+                        } else if empty_slot.is_none() {
+                            empty_slot = Some(slot);
                         }
-                    } else if empty_slot.is_none() {
-                        empty_slot = Some(slot);
                     }
-                }
 
-                matching_slot.or(empty_slot).ok_or(CrucibleError::IoError(
-                    format!("open: no slot found for {block}"),
-                ))?
-            };
-            active_context.push(slot);
-        }
+                    matching_slot.or(empty_slot).ok_or(
+                        CrucibleError::IoError(format!(
+                            "open: no slot found for {block}"
+                        )),
+                    )?
+                };
+                active_context.push(slot);
+            }
+            active_context
+        };
 
-        let mut out = Self {
+        Ok(Self {
             file,
             active_context,
             dirty,
@@ -710,11 +745,7 @@ impl RawInner {
             extra_syscall_count: 0,
             extra_syscall_denominator: 0,
             sync_index: 0,
-        };
-        if !read_only {
-            out.defragment()?;
-        }
-        Ok(out)
+        })
     }
 
     fn set_dirty(&mut self) -> Result<(), CrucibleError> {
@@ -907,6 +938,7 @@ impl RawInner {
     fn context_slot_offset(&self, block: u64, slot: ContextSlot) -> u64 {
         self.extent_size.block_size_in_bytes() as u64 * self.extent_size.value
             + BLOCK_META_SIZE_BYTES
+            + (self.extent_size.value + 7) / 8
             + (self.extent_size.value * slot as u64 + block)
                 * BLOCK_CONTEXT_SLOT_SIZE_BYTES
     }
@@ -930,11 +962,20 @@ impl RawInner {
             gen_number: new_gen,
             ext_version: EXTENT_META_RAW,
         };
-        // Byte 0 is the dirty byte
-        let mut buf = [0u8; BLOCK_META_SIZE_BYTES as usize];
+        let mut buf = vec![0u8; BLOCK_META_SIZE_BYTES as usize];
 
         bincode::serialize_into(buf.as_mut_slice(), &d)?;
+
+        // Serialize bitpacked active slot values
         let offset = self.meta_offset();
+        for c in self.active_context.chunks(8) {
+            let mut v = 0;
+            for (i, slot) in c.iter().enumerate() {
+                v |= (*slot as u8) << i;
+            }
+            buf.push(v);
+        }
+
         nix::sys::uio::pwrite(self.file.as_raw_fd(), &buf, offset as i64)
             .map_err(|e| CrucibleError::IoError(e.to_string()))?;
         self.dirty = false;
diff --git a/downstairs/src/extent_inner_sqlite.rs b/downstairs/src/extent_inner_sqlite.rs
index 71cecd965..8183784e0 100644
--- a/downstairs/src/extent_inner_sqlite.rs
+++ b/downstairs/src/extent_inner_sqlite.rs
@@ -548,6 +548,12 @@ impl SqliteInner {
         bincode::serialize_into(buf.as_mut_slice(), &meta)
             .map_err(|e| CrucibleError::IoError(e.to_string()))?;
 
+        // Add bitpacked data indicating which slot is active; this is always A
+        buf.extend(
+            std::iter::repeat(0)
+                .take((self.extent_size.value as usize + 7) / 8),
+        );
+
         // Put the context data after the metadata, all in slot A
         for c in ctxs {
             let ctx = match c.len() {