Store ciphertext size in the DB instead of the plaintext size

This is useful as a first cheap check to see if backup data on the backend still has the expected size. The plaintext size wasn't really used for anything anyway. Existing installation that still store the plaintext size will log different file size as suspicious and prefer those chunks in checks, but there shouldn't be other negative consequences from this.
grote · Nov 27, 2024 · da26663 · da26663
1 parent e5ba72e
commit da26663
Show file tree

Hide file tree

Showing 5 changed files with 29 additions and 20 deletions.
diff --git a/storage/lib/src/main/java/org/calyxos/backup/storage/backup/ChunkWriter.kt b/storage/lib/src/main/java/org/calyxos/backup/storage/backup/ChunkWriter.kt
@@ -49,6 +49,7 @@ internal class ChunkWriter(
         var writtenBytes = 0L
         chunks.forEach { chunk ->
             val cachedChunk = chunksCache.get(chunk.id)
+            // TODO missing chunks used by several files will get uploaded several times
             val isMissing = chunk.id in missingChunkIds
             val notCached = cachedChunk == null
             if (isMissing) Log.w(TAG, "Chunk ${chunk.id} is missing (cached: ${!notCached})")
@@ -58,14 +59,16 @@ internal class ChunkWriter(
                 }
                 if (notCached) chunksCache.insert(chunk.toCachedChunk())
                 writtenChunks++
-                writtenBytes += chunk.size
+                writtenBytes += chunk.plaintextSize
             } else { // chunk already uploaded
-                val skipped = inputStream.skip(chunk.size)
-                check(chunk.size == skipped) { "skipping error" }
+                val skipped = inputStream.skip(chunk.plaintextSize)
+                check(chunk.plaintextSize == skipped) { "skipping error" }
             }
         }
         val endByte = inputStream.read()
         check(endByte == -1) { "Stream did continue with $endByte" }
+        // FIXME the writtenBytes are based on plaintext size, not ciphertext size
+        //  However, they don't seem to be really used for anything at the moment.
         return ChunkWriterResult(writtenChunks, writtenBytes)
     }
 
@@ -89,14 +92,14 @@ internal class ChunkWriter(
     ) {
         var totalBytesRead = 0L
         do {
-            val sizeLeft = (chunk.size - totalBytesRead).toInt()
+            val sizeLeft = (chunk.plaintextSize - totalBytesRead).toInt()
             val bytesRead = inputStream.read(buffer, 0, min(bufferSize, sizeLeft))
             if (bytesRead == -1) throw IOException("unexpected end of stream for ${chunk.id}")
             outputStream.write(buffer, 0, bytesRead)
             totalBytesRead += bytesRead
-        } while (bytesRead >= 0 && totalBytesRead < chunk.size)
-        check(totalBytesRead == chunk.size) {
-            "copyChunkFromInputStream: $totalBytesRead != ${chunk.size}"
+        } while (bytesRead >= 0 && totalBytesRead < chunk.plaintextSize)
+        check(totalBytesRead == chunk.plaintextSize) {
+            "copyChunkFromInputStream: $totalBytesRead != ${chunk.plaintextSize}"
         }
     }
 

diff --git a/storage/lib/src/main/java/org/calyxos/backup/storage/backup/Chunker.kt b/storage/lib/src/main/java/org/calyxos/backup/storage/backup/Chunker.kt
@@ -6,6 +6,7 @@
 package org.calyxos.backup.storage.backup
 
 import org.calyxos.backup.storage.db.CachedChunk
+import org.calyxos.seedvault.core.crypto.CoreCrypto
 import org.calyxos.seedvault.core.toHexString
 import java.io.IOException
 import java.io.InputStream
@@ -15,9 +16,14 @@ import kotlin.math.min
 internal data class Chunk(
     val id: String,
     val offset: Long,
-    val size: Long,
+    val plaintextSize: Long,
 ) {
-    fun toCachedChunk() = CachedChunk(id, 0, size)
+    fun toCachedChunk() = CachedChunk(
+        id = id,
+        refCount = 0,
+        // FIXME sometimes, the ciphertext size is not as expected
+        size = 1 + CoreCrypto.expectedCiphertextSize(plaintextSize),
+    )
 }
 
 internal class Chunker(

diff --git a/storage/lib/src/main/java/org/calyxos/backup/storage/check/Checker.kt b/storage/lib/src/main/java/org/calyxos/backup/storage/check/Checker.kt
@@ -25,7 +25,6 @@ import org.calyxos.backup.storage.restore.readVersion
 import org.calyxos.seedvault.core.backends.FileBackupFileType
 import org.calyxos.seedvault.core.backends.IBackendManager
 import org.calyxos.seedvault.core.backends.TopLevelFolder
-import org.calyxos.seedvault.core.crypto.CoreCrypto
 import org.calyxos.seedvault.core.crypto.KeyManager
 import org.calyxos.seedvault.core.toHexString
 import java.io.IOException
@@ -219,11 +218,8 @@ internal class Checker(
                     Log.i(TAG, "Chunk ${chunk.id} known to be corrupted.")
                     return@forEach
                 }
-                val plaintextSize = chunk?.size ?: 0L
-                // expectedSize is one version byte plus ciphertext size
-                val expectedSize = 1 + CoreCrypto.expectedCiphertextSize(plaintextSize)
-                if (size != expectedSize) {
-                    Log.i(TAG, "Expected size $expectedSize, but chunk had $size: $chunkId")
+                if (size != chunk?.size) {
+                    Log.i(TAG, "Expected size ${chunk?.size}, but chunk had $size: $chunkId")
                     suspiciousChunkIds.add(chunkId)
                 }
             }
@@ -257,7 +253,7 @@ internal class Checker(
         val size = getBackupSize()
         val targetSize = (size * (percent.toDouble() / 100)).roundToLong()
         val blobSample = mutableListOf<String>()
-        val priorityChunksIds = referencedChunkIds.intersect(suspiciousChunkIds)
+        val priorityChunksIds = referencedChunkIds.intersect(suspiciousChunkIds).shuffled()
         val iterator = (priorityChunksIds + referencedChunkIds.shuffled()).iterator()
         var currentSize = 0L
         while (currentSize < targetSize && iterator.hasNext()) {

diff --git a/storage/lib/src/main/java/org/calyxos/backup/storage/db/ChunksCache.kt b/storage/lib/src/main/java/org/calyxos/backup/storage/db/ChunksCache.kt
@@ -26,6 +26,10 @@ internal data class CachedChunk(
      * Note that this is *not* about how many files across various snapshots are referencing it.
      */
     @ColumnInfo(name = "ref_count") val refCount: Long,
+    /**
+     * The ciphertext size of the chunk on disk, including the version byte.
+     * Not the plaintext size.
+     */
     val size: Long,
     val version: Byte = Backup.VERSION,
     @ColumnInfo(defaultValue = "0")

diff --git a/storage/lib/src/test/java/org/calyxos/backup/storage/backup/ChunkWriterTest.kt b/storage/lib/src/test/java/org/calyxos/backup/storage/backup/ChunkWriterTest.kt
@@ -214,15 +214,15 @@ internal class ChunkWriterTest {
         }
 
         // check that output matches chunk data
-        assertEquals(1 + chunks[0].size.toInt(), chunk1Output.size())
+        assertEquals(1 + chunks[0].plaintextSize.toInt(), chunk1Output.size())
         assertArrayEquals(
             chunk1Bytes,
-            chunk1Output.toByteArray().copyOfRange(1, 1 + chunks[0].size.toInt())
+            chunk1Output.toByteArray().copyOfRange(1, 1 + chunks[0].plaintextSize.toInt())
         )
-        assertEquals(1 + chunks[2].size.toInt(), chunk3Output.size())
+        assertEquals(1 + chunks[2].plaintextSize.toInt(), chunk3Output.size())
         assertArrayEquals(
             chunk3Bytes,
-            chunk3Output.toByteArray().copyOfRange(1, 1 + chunks[2].size.toInt())
+            chunk3Output.toByteArray().copyOfRange(1, 1 + chunks[2].plaintextSize.toInt())
         )
     }