From 8968e863089ddeb32dafe906a37e6bb76ead1193 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= <philipp.krueger1@gmail.com>
Date: Wed, 26 Jul 2023 19:44:14 +0200
Subject: [PATCH 01/35] Implement some DAG walking

---
 Cargo.lock            | 983 +++++++++++++++++++++++++++++++++++++++++-
 car-mirror/Cargo.toml |   7 +
 car-mirror/src/lib.rs |  76 +++-
 3 files changed, 1042 insertions(+), 24 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 4747b69..32b1cd0 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2,6 +2,21 @@
 # It is not intended for manual editing.
 version = 3
 
+[[package]]
+name = "android-tzdata"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0"
+
+[[package]]
+name = "android_system_properties"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311"
+dependencies = [
+ "libc",
+]
+
 [[package]]
 name = "anes"
 version = "0.1.6"
@@ -14,6 +29,175 @@ version = "1.0.71"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9c7d0618f0e0b7e8ff11427422b64564d5fb0be1940354bfe2e0529b18a9d9b8"
 
+[[package]]
+name = "arrayref"
+version = "0.3.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6b4930d2cb77ce62f89ee5d5289b4ac049559b1c45539271f5ed4fdc7db34545"
+
+[[package]]
+name = "arrayvec"
+version = "0.7.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711"
+
+[[package]]
+name = "async-attributes"
+version = "1.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a3203e79f4dd9bdda415ed03cf14dae5a2bf775c683a00f94e9cd1faf0f596e5"
+dependencies = [
+ "quote",
+ "syn 1.0.109",
+]
+
+[[package]]
+name = "async-channel"
+version = "1.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "81953c529336010edd6d8e358f886d9581267795c61b19475b71314bffa46d35"
+dependencies = [
+ "concurrent-queue",
+ "event-listener",
+ "futures-core",
+]
+
+[[package]]
+name = "async-executor"
+version = "1.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6fa3dc5f2a8564f07759c008b9109dc0d39de92a88d5588b8a5036d286383afb"
+dependencies = [
+ "async-lock",
+ "async-task",
+ "concurrent-queue",
+ "fastrand",
+ "futures-lite",
+ "slab",
+]
+
+[[package]]
+name = "async-global-executor"
+version = "2.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f1b6f5d7df27bd294849f8eec66ecfc63d11814df7a4f5d74168a2394467b776"
+dependencies = [
+ "async-channel",
+ "async-executor",
+ "async-io",
+ "async-lock",
+ "blocking",
+ "futures-lite",
+ "once_cell",
+]
+
+[[package]]
+name = "async-io"
+version = "1.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0fc5b45d93ef0529756f812ca52e44c221b35341892d3dcc34132ac02f3dd2af"
+dependencies = [
+ "async-lock",
+ "autocfg",
+ "cfg-if",
+ "concurrent-queue",
+ "futures-lite",
+ "log",
+ "parking",
+ "polling",
+ "rustix",
+ "slab",
+ "socket2",
+ "waker-fn",
+]
+
+[[package]]
+name = "async-lock"
+version = "2.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fa24f727524730b077666307f2734b4a1a1c57acb79193127dcc8914d5242dd7"
+dependencies = [
+ "event-listener",
+]
+
+[[package]]
+name = "async-once-cell"
+version = "0.4.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5b49bd4c5b769125ea6323601c39815848972880efd33ffb2d01f9f909adc699"
+
+[[package]]
+name = "async-std"
+version = "1.12.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "62565bb4402e926b29953c785397c6dc0391b7b446e45008b0049eb43cec6f5d"
+dependencies = [
+ "async-attributes",
+ "async-channel",
+ "async-global-executor",
+ "async-io",
+ "async-lock",
+ "crossbeam-utils",
+ "futures-channel",
+ "futures-core",
+ "futures-io",
+ "futures-lite",
+ "gloo-timers",
+ "kv-log-macro",
+ "log",
+ "memchr",
+ "once_cell",
+ "pin-project-lite",
+ "pin-utils",
+ "slab",
+ "wasm-bindgen-futures",
+]
+
+[[package]]
+name = "async-stream"
+version = "0.3.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cd56dd203fef61ac097dd65721a419ddccb106b2d2b70ba60a6b529f03961a51"
+dependencies = [
+ "async-stream-impl",
+ "futures-core",
+ "pin-project-lite",
+]
+
+[[package]]
+name = "async-stream-impl"
+version = "0.3.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.27",
+]
+
+[[package]]
+name = "async-task"
+version = "4.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ecc7ab41815b3c653ccd2978ec3255c81349336702dfdf62ee6f7069b12a3aae"
+
+[[package]]
+name = "async-trait"
+version = "0.1.69"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7b2d0f03b3640e3a630367e40c468cb7f309529c708ed1d88597047b0e7c6ef7"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.27",
+]
+
+[[package]]
+name = "atomic-waker"
+version = "1.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1181e1e0d1fce796a03db1ae795d67167da795f9cf4a39c37589e85ef57f26d3"
+
 [[package]]
 name = "atty"
 version = "0.2.14"
@@ -31,6 +215,12 @@ version = "1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
 
+[[package]]
+name = "base-x"
+version = "0.2.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4cbbc9d0964165b47557570cce6c952866c2678457aca742aafc9fb771d30270"
+
 [[package]]
 name = "bit-set"
 version = "0.5.3"
@@ -52,6 +242,65 @@ version = "1.3.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
 
+[[package]]
+name = "blake2b_simd"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3c2f0dc9a68c6317d884f97cc36cf5a3d20ba14ce404227df55e1af708ab04bc"
+dependencies = [
+ "arrayref",
+ "arrayvec",
+ "constant_time_eq 0.2.6",
+]
+
+[[package]]
+name = "blake2s_simd"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6637f448b9e61dfadbdcbae9a885fadee1f3eaffb1f8d3c1965d3ade8bdfd44f"
+dependencies = [
+ "arrayref",
+ "arrayvec",
+ "constant_time_eq 0.2.6",
+]
+
+[[package]]
+name = "blake3"
+version = "1.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "199c42ab6972d92c9f8995f086273d25c42fc0f7b2a1fcefba465c1352d25ba5"
+dependencies = [
+ "arrayref",
+ "arrayvec",
+ "cc",
+ "cfg-if",
+ "constant_time_eq 0.3.0",
+]
+
+[[package]]
+name = "block-buffer"
+version = "0.10.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71"
+dependencies = [
+ "generic-array",
+]
+
+[[package]]
+name = "blocking"
+version = "1.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "77231a1c8f801696fc0123ec6150ce92cffb8e164a02afb9c8ddee0e9b65ad65"
+dependencies = [
+ "async-channel",
+ "async-lock",
+ "async-task",
+ "atomic-waker",
+ "fastrand",
+ "futures-lite",
+ "log",
+]
+
 [[package]]
 name = "bumpalo"
 version = "3.13.0"
@@ -64,14 +313,30 @@ version = "1.4.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610"
 
+[[package]]
+name = "bytes"
+version = "1.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "89b2fd2a0dcf38d7971e2194b6b6eebab45ae01067456a7fd93d5547a61b70be"
+dependencies = [
+ "serde",
+]
+
 [[package]]
 name = "car-mirror"
 version = "0.1.0"
 dependencies = [
  "anyhow",
+ "async-std",
+ "async-stream",
+ "bytes",
+ "futures",
+ "libipld",
+ "libipld-core",
  "proptest",
  "tracing",
  "tracing-subscriber",
+ "wnfs-common",
 ]
 
 [[package]]
@@ -113,6 +378,18 @@ version = "1.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
 
+[[package]]
+name = "chrono"
+version = "0.4.26"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ec837a71355b28f6556dbd569b37b3f363091c0bd4b2e735674521b4c5fd9bc5"
+dependencies = [
+ "android-tzdata",
+ "iana-time-zone",
+ "num-traits",
+ "winapi",
+]
+
 [[package]]
 name = "ciborium"
 version = "0.2.1"
@@ -140,6 +417,20 @@ dependencies = [
  "half",
 ]
 
+[[package]]
+name = "cid"
+version = "0.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fd94671561e36e4e7de75f753f577edafb0e7c05d6e4547229fdf7938fbcd2c3"
+dependencies = [
+ "core2",
+ "multibase",
+ "multihash",
+ "serde",
+ "serde_bytes",
+ "unsigned-varint",
+]
+
 [[package]]
 name = "clap"
 version = "3.2.25"
@@ -161,6 +452,15 @@ dependencies = [
  "os_str_bytes",
 ]
 
+[[package]]
+name = "concurrent-queue"
+version = "2.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "62ec6771ecfa0762d24683ee5a32ad78487a3d3afdc0fb8cae19d2c5deb50b7c"
+dependencies = [
+ "crossbeam-utils",
+]
+
 [[package]]
 name = "console_error_panic_hook"
 version = "0.1.7"
@@ -171,6 +471,42 @@ dependencies = [
  "wasm-bindgen",
 ]
 
+[[package]]
+name = "constant_time_eq"
+version = "0.2.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "21a53c0a4d288377e7415b53dcfc3c04da5cdc2cc95c8d5ac178b58f0b861ad6"
+
+[[package]]
+name = "constant_time_eq"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f7144d30dcf0fafbce74250a3963025d8d52177934239851c917d29f1df280c2"
+
+[[package]]
+name = "core-foundation-sys"
+version = "0.8.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa"
+
+[[package]]
+name = "core2"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b49ba7ef1ad6107f8824dbe97de947cbaac53c44e7f9756a1fba0d37c1eec505"
+dependencies = [
+ "memchr",
+]
+
+[[package]]
+name = "cpufeatures"
+version = "0.2.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a17b76ff3a4162b0b27f354a0c87015ddad39d35f9c0c36607a3bdd175dde1f1"
+dependencies = [
+ "libc",
+]
+
 [[package]]
 name = "criterion"
 version = "0.4.0"
@@ -205,6 +541,61 @@ dependencies = [
  "itertools",
 ]
 
+[[package]]
+name = "crossbeam-utils"
+version = "0.8.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5a22b2d63d4d1dc0b7f1b6b2747dd0088008a9be28b6ddf0b1e7d335e3037294"
+dependencies = [
+ "cfg-if",
+]
+
+[[package]]
+name = "crypto-common"
+version = "0.1.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3"
+dependencies = [
+ "generic-array",
+ "typenum",
+]
+
+[[package]]
+name = "data-encoding"
+version = "2.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c2e66c9d817f1720209181c316d28635c050fa304f9c79e47a520882661b7308"
+
+[[package]]
+name = "data-encoding-macro"
+version = "0.1.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c904b33cc60130e1aeea4956ab803d08a3f4a0ca82d64ed757afac3891f2bb99"
+dependencies = [
+ "data-encoding",
+ "data-encoding-macro-internal",
+]
+
+[[package]]
+name = "data-encoding-macro-internal"
+version = "0.1.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8fdf3fce3ce863539ec1d7fd1b6dcc3c645663376b43ed376bbf887733e4f772"
+dependencies = [
+ "data-encoding",
+ "syn 1.0.109",
+]
+
+[[package]]
+name = "digest"
+version = "0.10.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292"
+dependencies = [
+ "block-buffer",
+ "crypto-common",
+]
+
 [[package]]
 name = "either"
 version = "1.8.1"
@@ -232,6 +623,12 @@ dependencies = [
  "libc",
 ]
 
+[[package]]
+name = "event-listener"
+version = "2.5.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0"
+
 [[package]]
 name = "examples"
 version = "0.1.0"
@@ -254,15 +651,141 @@ version = "1.0.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
 
+[[package]]
+name = "futures"
+version = "0.3.28"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "23342abe12aba583913b2e62f22225ff9c950774065e4bfb61a19cd9770fec40"
+dependencies = [
+ "futures-channel",
+ "futures-core",
+ "futures-executor",
+ "futures-io",
+ "futures-sink",
+ "futures-task",
+ "futures-util",
+]
+
+[[package]]
+name = "futures-channel"
+version = "0.3.28"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "955518d47e09b25bbebc7a18df10b81f0c766eaf4c4f1cccef2fca5f2a4fb5f2"
+dependencies = [
+ "futures-core",
+ "futures-sink",
+]
+
+[[package]]
+name = "futures-core"
+version = "0.3.28"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4bca583b7e26f571124fe5b7561d49cb2868d79116cfa0eefce955557c6fee8c"
+
+[[package]]
+name = "futures-executor"
+version = "0.3.28"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ccecee823288125bd88b4d7f565c9e58e41858e47ab72e8ea2d64e93624386e0"
+dependencies = [
+ "futures-core",
+ "futures-task",
+ "futures-util",
+]
+
+[[package]]
+name = "futures-io"
+version = "0.3.28"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4fff74096e71ed47f8e023204cfd0aa1289cd54ae5430a9523be060cdb849964"
+
+[[package]]
+name = "futures-lite"
+version = "1.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "49a9d51ce47660b1e808d3c990b4709f2f415d928835a17dfd16991515c46bce"
+dependencies = [
+ "fastrand",
+ "futures-core",
+ "futures-io",
+ "memchr",
+ "parking",
+ "pin-project-lite",
+ "waker-fn",
+]
+
+[[package]]
+name = "futures-macro"
+version = "0.3.28"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.27",
+]
+
+[[package]]
+name = "futures-sink"
+version = "0.3.28"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f43be4fe21a13b9781a69afa4985b0f6ee0e1afab2c6f454a8cf30e2b2237b6e"
+
+[[package]]
+name = "futures-task"
+version = "0.3.28"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "76d3d132be6c0e6aa1534069c705a74a5997a356c0dc2f86a47765e5617c5b65"
+
+[[package]]
+name = "futures-util"
+version = "0.3.28"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "26b01e40b772d54cf6c6d721c1d1abd0647a0106a12ecaa1c186273392a69533"
+dependencies = [
+ "futures-channel",
+ "futures-core",
+ "futures-io",
+ "futures-macro",
+ "futures-sink",
+ "futures-task",
+ "memchr",
+ "pin-project-lite",
+ "pin-utils",
+ "slab",
+]
+
+[[package]]
+name = "generic-array"
+version = "0.14.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a"
+dependencies = [
+ "typenum",
+ "version_check",
+]
+
 [[package]]
 name = "getrandom"
 version = "0.2.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427"
 dependencies = [
- "cfg-if",
- "libc",
- "wasi",
+ "cfg-if",
+ "libc",
+ "wasi",
+]
+
+[[package]]
+name = "gloo-timers"
+version = "0.2.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9b995a66bb87bebce9a0f4a95aed01daca4872c050bfcb21653361c03bc35e5c"
+dependencies = [
+ "futures-channel",
+ "futures-core",
+ "js-sys",
+ "wasm-bindgen",
 ]
 
 [[package]]
@@ -292,6 +815,29 @@ version = "0.3.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286"
 
+[[package]]
+name = "iana-time-zone"
+version = "0.1.57"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2fad5b825842d2b38bd206f3e81d6957625fd7f0a361e345c30e01a0ae2dd613"
+dependencies = [
+ "android_system_properties",
+ "core-foundation-sys",
+ "iana-time-zone-haiku",
+ "js-sys",
+ "wasm-bindgen",
+ "windows",
+]
+
+[[package]]
+name = "iana-time-zone-haiku"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f"
+dependencies = [
+ "cc",
+]
+
 [[package]]
 name = "indexmap"
 version = "1.9.3"
@@ -346,6 +892,24 @@ dependencies = [
  "wasm-bindgen",
 ]
 
+[[package]]
+name = "keccak"
+version = "0.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8f6d5ed8676d904364de097082f4e7d240b571b67989ced0240f08b7f966f940"
+dependencies = [
+ "cpufeatures",
+]
+
+[[package]]
+name = "kv-log-macro"
+version = "1.0.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0de8b303297635ad57c9f5059fd9cee7a47f8e8daa09df0fcd07dd39fb22977f"
+dependencies = [
+ "log",
+]
+
 [[package]]
 name = "lazy_static"
 version = "1.4.0"
@@ -358,6 +922,96 @@ version = "0.2.146"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f92be4933c13fd498862a9e02a3055f8a8d9c039ce33db97306fd5a6caa7f29b"
 
+[[package]]
+name = "libipld"
+version = "0.16.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f1ccd6b8ffb3afee7081fcaec00e1b099fd1c7ccf35ba5729d88538fcc3b4599"
+dependencies = [
+ "fnv",
+ "libipld-cbor",
+ "libipld-cbor-derive",
+ "libipld-core",
+ "libipld-json",
+ "libipld-macro",
+ "libipld-pb",
+ "log",
+ "multihash",
+ "thiserror",
+]
+
+[[package]]
+name = "libipld-cbor"
+version = "0.16.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "77d98c9d1747aa5eef1cf099cd648c3fd2d235249f5fed07522aaebc348e423b"
+dependencies = [
+ "byteorder",
+ "libipld-core",
+ "thiserror",
+]
+
+[[package]]
+name = "libipld-cbor-derive"
+version = "0.16.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7d5ba3a729b72973e456a1812b0afe2e176a376c1836cc1528e9fc98ae8cb838"
+dependencies = [
+ "proc-macro-crate",
+ "proc-macro2",
+ "quote",
+ "syn 1.0.109",
+ "synstructure",
+]
+
+[[package]]
+name = "libipld-core"
+version = "0.16.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5acd707e8d8b092e967b2af978ed84709eaded82b75effe6cb6f6cc797ef8158"
+dependencies = [
+ "anyhow",
+ "cid",
+ "core2",
+ "multibase",
+ "multihash",
+ "serde",
+ "thiserror",
+]
+
+[[package]]
+name = "libipld-json"
+version = "0.16.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "25856def940047b07b25c33d4e66d248597049ab0202085215dc4dca0487731c"
+dependencies = [
+ "libipld-core",
+ "multihash",
+ "serde",
+ "serde_json",
+]
+
+[[package]]
+name = "libipld-macro"
+version = "0.16.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "71171c54214f866ae6722f3027f81dff0931e600e5a61e6b1b6a49ca0b5ed4ae"
+dependencies = [
+ "libipld-core",
+]
+
+[[package]]
+name = "libipld-pb"
+version = "0.16.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c3f2d0f866c4cd5dc9aa8068c429ba478d2882a3a4b70ab56f7e9a0eddf5d16f"
+dependencies = [
+ "bytes",
+ "libipld-core",
+ "quick-protobuf",
+ "thiserror",
+]
+
 [[package]]
 name = "libm"
 version = "0.2.7"
@@ -375,6 +1029,59 @@ name = "log"
 version = "0.4.19"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b06a4cde4c0f271a446782e3eff8de789548ce57dbc8eca9292c27f4a42004b4"
+dependencies = [
+ "value-bag",
+]
+
+[[package]]
+name = "memchr"
+version = "2.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
+
+[[package]]
+name = "multibase"
+version = "0.9.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9b3539ec3c1f04ac9748a260728e855f261b4977f5c3406612c884564f329404"
+dependencies = [
+ "base-x",
+ "data-encoding",
+ "data-encoding-macro",
+]
+
+[[package]]
+name = "multihash"
+version = "0.18.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cfd8a792c1694c6da4f68db0a9d707c72bd260994da179e6030a5dcee00bb815"
+dependencies = [
+ "blake2b_simd",
+ "blake2s_simd",
+ "blake3",
+ "core2",
+ "digest",
+ "multihash-derive",
+ "serde",
+ "serde-big-array",
+ "sha2",
+ "sha3",
+ "unsigned-varint",
+]
+
+[[package]]
+name = "multihash-derive"
+version = "0.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1d6d4752e6230d8ef7adf7bd5d8c4b1f6561c1014c5ba9a37445ccefe18aa1db"
+dependencies = [
+ "proc-macro-crate",
+ "proc-macro-error",
+ "proc-macro2",
+ "quote",
+ "syn 1.0.109",
+ "synstructure",
+]
 
 [[package]]
 name = "nu-ansi-term"
@@ -420,23 +1127,85 @@ version = "0.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
 
+[[package]]
+name = "parking"
+version = "2.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "14f2252c834a40ed9bb5422029649578e63aa341ac401f74e719dd1afda8394e"
+
 [[package]]
 name = "pin-project-lite"
 version = "0.2.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e0a7ae3ac2f1173085d398531c705756c94a4c56843785df85a60c1a0afac116"
 
+[[package]]
+name = "pin-utils"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
+
+[[package]]
+name = "polling"
+version = "2.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4b2d323e8ca7996b3e23126511a523f7e62924d93ecd5ae73b333815b0eb3dce"
+dependencies = [
+ "autocfg",
+ "bitflags",
+ "cfg-if",
+ "concurrent-queue",
+ "libc",
+ "log",
+ "pin-project-lite",
+ "windows-sys",
+]
+
 [[package]]
 name = "ppv-lite86"
 version = "0.2.17"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
 
+[[package]]
+name = "proc-macro-crate"
+version = "1.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e17d47ce914bf4de440332250b0edd23ce48c005f59fab39d3335866b114f11a"
+dependencies = [
+ "thiserror",
+ "toml",
+]
+
+[[package]]
+name = "proc-macro-error"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c"
+dependencies = [
+ "proc-macro-error-attr",
+ "proc-macro2",
+ "quote",
+ "syn 1.0.109",
+ "version_check",
+]
+
+[[package]]
+name = "proc-macro-error-attr"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "version_check",
+]
+
 [[package]]
 name = "proc-macro2"
-version = "1.0.60"
+version = "1.0.66"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dec2b086b7a862cf4de201096214fa870344cf922b2b30c167badb3af3195406"
+checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9"
 dependencies = [
  "unicode-ident",
 ]
@@ -467,6 +1236,15 @@ version = "1.2.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0"
 
+[[package]]
+name = "quick-protobuf"
+version = "0.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9d6da84cc204722a989e01ba2f6e1e276e190f22263d0cb6ce8526fcdb0d2e1f"
+dependencies = [
+ "byteorder",
+]
+
 [[package]]
 name = "quote"
 version = "1.0.28"
@@ -594,22 +1372,40 @@ checksum = "e1cf6437eb19a8f4a6cc0f7dca544973b0b78843adbfeb3683d1a94a0024a294"
 
 [[package]]
 name = "serde"
-version = "1.0.164"
+version = "1.0.175"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9e8c8cf938e98f769bc164923b06dce91cea1751522f46f8466461af04c9027d"
+checksum = "5d25439cd7397d044e2748a6fe2432b5e85db703d6d097bd014b3c0ad1ebff0b"
 dependencies = [
  "serde_derive",
 ]
 
+[[package]]
+name = "serde-big-array"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cd31f59f6fe2b0c055371bb2f16d7f0aa7d8881676c04a55b1596d1a17cd10a4"
+dependencies = [
+ "serde",
+]
+
+[[package]]
+name = "serde_bytes"
+version = "0.11.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ab33ec92f677585af6d88c65593ae2375adde54efdbf16d597f2cbc7a6d368ff"
+dependencies = [
+ "serde",
+]
+
 [[package]]
 name = "serde_derive"
-version = "1.0.164"
+version = "1.0.175"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d9735b638ccc51c28bf6914d90a2e9725b377144fc612c49a611fddd1b631d68"
+checksum = "b23f7ade6f110613c0d63858ddb8b94c1041f550eab58a16b371bdf2c9c80ab4"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.27",
 ]
 
 [[package]]
@@ -623,6 +1419,27 @@ dependencies = [
  "serde",
 ]
 
+[[package]]
+name = "sha2"
+version = "0.10.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "479fb9d862239e610720565ca91403019f2f00410f1864c5aa7479b950a76ed8"
+dependencies = [
+ "cfg-if",
+ "cpufeatures",
+ "digest",
+]
+
+[[package]]
+name = "sha3"
+version = "0.10.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "75872d278a8f37ef87fa0ddbda7802605cb18344497949862c0d4dcb291eba60"
+dependencies = [
+ "digest",
+ "keccak",
+]
+
 [[package]]
 name = "sharded-slab"
 version = "0.1.4"
@@ -632,23 +1449,65 @@ dependencies = [
  "lazy_static",
 ]
 
+[[package]]
+name = "slab"
+version = "0.4.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6528351c9bc8ab22353f9d776db39a20288e8d6c37ef8cfe3317cf875eecfc2d"
+dependencies = [
+ "autocfg",
+]
+
 [[package]]
 name = "smallvec"
 version = "1.10.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0"
 
+[[package]]
+name = "socket2"
+version = "0.4.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "64a4a911eed85daf18834cfaa86a79b7d266ff93ff5ba14005426219480ed662"
+dependencies = [
+ "libc",
+ "winapi",
+]
+
+[[package]]
+name = "syn"
+version = "1.0.109"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
 [[package]]
 name = "syn"
-version = "2.0.18"
+version = "2.0.27"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "32d41677bcbe24c20c52e7c70b0d8db04134c5d1066bf98662e2871ad200ea3e"
+checksum = "b60f673f44a8255b9c8c657daf66a596d435f2da81a555b06dc644d080ba45e0"
 dependencies = [
  "proc-macro2",
  "quote",
  "unicode-ident",
 ]
 
+[[package]]
+name = "synstructure"
+version = "0.12.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f36bdaa60a83aca3921b5259d5400cbf5e90fc51931376a9bd4a0eb79aa7210f"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 1.0.109",
+ "unicode-xid",
+]
+
 [[package]]
 name = "tempfile"
 version = "3.6.0"
@@ -669,6 +1528,26 @@ version = "0.16.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "222a222a5bfe1bba4a77b45ec488a741b3cb8872e5e499451fd7d0129c9c7c3d"
 
+[[package]]
+name = "thiserror"
+version = "1.0.40"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "978c9a314bd8dc99be594bc3c175faaa9794be04a5a5e153caba6915336cebac"
+dependencies = [
+ "thiserror-impl",
+]
+
+[[package]]
+name = "thiserror-impl"
+version = "1.0.40"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.27",
+]
+
 [[package]]
 name = "thread_local"
 version = "1.1.7"
@@ -689,6 +1568,15 @@ dependencies = [
  "serde_json",
 ]
 
+[[package]]
+name = "toml"
+version = "0.5.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f4f7f0dd8d50a853a531c426359045b1998f04219d88799810762cd4ad314234"
+dependencies = [
+ "serde",
+]
+
 [[package]]
 name = "tracing"
 version = "0.1.37"
@@ -709,7 +1597,7 @@ checksum = "8803eee176538f94ae9a14b55b2804eb7e1441f8210b1c31290b3bccdccff73b"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.27",
 ]
 
 [[package]]
@@ -747,6 +1635,12 @@ dependencies = [
  "tracing-log",
 ]
 
+[[package]]
+name = "typenum"
+version = "1.16.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba"
+
 [[package]]
 name = "unarray"
 version = "0.1.4"
@@ -759,12 +1653,36 @@ version = "1.0.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b15811caf2415fb889178633e7724bad2509101cde276048e013b9def5e51fa0"
 
+[[package]]
+name = "unicode-xid"
+version = "0.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f962df74c8c05a667b5ee8bcf162993134c104e96440b663c8daa176dc772d8c"
+
+[[package]]
+name = "unsigned-varint"
+version = "0.7.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d86a8dc7f45e4c1b0d30e43038c38f274e77af056aa5f74b93c2cf9eb3c1c836"
+
 [[package]]
 name = "valuable"
 version = "0.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d"
 
+[[package]]
+name = "value-bag"
+version = "1.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d92ccd67fb88503048c01b59152a04effd0782d035a83a6d256ce6085f08f4a3"
+
+[[package]]
+name = "version_check"
+version = "0.9.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
+
 [[package]]
 name = "wait-timeout"
 version = "0.2.0"
@@ -774,6 +1692,12 @@ dependencies = [
  "libc",
 ]
 
+[[package]]
+name = "waker-fn"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9d5b2c62b4012a3e1eca5a7e077d13b3bf498c4073e33ccd58626607748ceeca"
+
 [[package]]
 name = "walkdir"
 version = "2.3.3"
@@ -813,7 +1737,7 @@ dependencies = [
  "once_cell",
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.27",
  "wasm-bindgen-shared",
 ]
 
@@ -847,7 +1771,7 @@ checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.27",
  "wasm-bindgen-backend",
  "wasm-bindgen-shared",
 ]
@@ -923,6 +1847,15 @@ version = "0.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
 
+[[package]]
+name = "windows"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f"
+dependencies = [
+ "windows-targets",
+]
+
 [[package]]
 name = "windows-sys"
 version = "0.48.0"
@@ -988,3 +1921,23 @@ name = "windows_x86_64_msvc"
 version = "0.48.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a"
+
+[[package]]
+name = "wnfs-common"
+version = "0.1.23"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5dfcb4584f3866ead49adae8c05cec6f633139d19283448aa7807280612e24b7"
+dependencies = [
+ "anyhow",
+ "async-once-cell",
+ "async-trait",
+ "bytes",
+ "chrono",
+ "futures",
+ "libipld",
+ "multihash",
+ "once_cell",
+ "rand_core",
+ "serde",
+ "thiserror",
+]
diff --git a/car-mirror/Cargo.toml b/car-mirror/Cargo.toml
index 7a95196..12fbca9 100644
--- a/car-mirror/Cargo.toml
+++ b/car-mirror/Cargo.toml
@@ -24,12 +24,19 @@ doc = true
 
 [dependencies]
 anyhow = "1.0"
+async-stream = "0.3.5"
+bytes = "1.4.0"
+futures = "0.3.28"
+libipld = "0.16.0"
+libipld-core = "0.16.0"
 proptest = { version = "1.1", optional = true }
 tracing = "0.1"
 tracing-subscriber = "0.3"
+wnfs-common = "0.1.23"
 
 [dev-dependencies]
 proptest = "1.1"
+async-std = { version = "1.11", features = ["attributes"] }
 
 [features]
 default = []
diff --git a/car-mirror/src/lib.rs b/car-mirror/src/lib.rs
index 2da8f41..ce4eaa8 100644
--- a/car-mirror/src/lib.rs
+++ b/car-mirror/src/lib.rs
@@ -4,27 +4,85 @@
 
 //! car-mirror
 
+use anyhow::Result;
+use async_stream::try_stream;
+use bytes::Bytes;
+use futures::Stream;
+use libipld::{Ipld, IpldCodec};
+use libipld_core::{cid::Cid, codec::References};
+use std::{
+    collections::{HashSet, VecDeque},
+    io::Cursor,
+};
+use wnfs_common::BlockStore;
+
 /// Test utilities.
 #[cfg(any(test, feature = "test_utils"))]
 #[cfg_attr(docsrs, doc(cfg(feature = "test_utils")))]
 pub mod test_utils;
 
-/// Add two integers together.
-pub fn add(a: i32, b: i32) -> i32 {
-    a + b
+/// walks a DAG from given root breadth-first along IPLD links
+pub fn walk_dag_in_order_breadth_first<'a>(
+    root: Cid,
+    store: &'a impl BlockStore,
+) -> impl Stream<Item = Result<(Cid, Bytes)>> + 'a {
+    try_stream! {
+        let mut visited = HashSet::new();
+        let mut frontier = VecDeque::from([root]);
+        while let Some(cid) = frontier.pop_front() {
+            if visited.contains(&cid) {
+                continue;
+            }
+            visited.insert(cid);
+            let block = store.get_block(&cid).await?;
+            let codec = IpldCodec::try_from(cid.codec())?;
+            frontier.extend(references(codec, &block)?);
+            yield (cid, block);
+        }
+    }
 }
 
-/// Multiplies two integers together.
-pub fn mult(a: i32, b: i32) -> i32 {
-    a * b
+fn references(codec: IpldCodec, block: impl AsRef<[u8]>) -> Result<Vec<Cid>> {
+    let mut refs = Vec::new();
+    <Ipld as References<IpldCodec>>::references(codec, &mut Cursor::new(block), &mut refs)?;
+    Ok(refs)
 }
 
 #[cfg(test)]
 mod tests {
     use super::*;
+    use futures::TryStreamExt;
+    use wnfs_common::MemoryBlockStore;
+
+    #[async_std::test]
+    async fn test_walk_dag_breadth_first() -> Result<()> {
+        let store = &MemoryBlockStore::new();
+
+        let cid_1 = store.put_serializable(&Ipld::String("1".into())).await?;
+        let cid_2 = store.put_serializable(&Ipld::String("2".into())).await?;
+        let cid_3 = store.put_serializable(&Ipld::String("3".into())).await?;
+
+        let cid_1_wrap = store
+            .put_serializable(&Ipld::List(vec![Ipld::Link(cid_1)]))
+            .await?;
+
+        let cid_root = store
+            .put_serializable(&Ipld::List(vec![
+                Ipld::Link(cid_1_wrap),
+                Ipld::Link(cid_2),
+                Ipld::Link(cid_3),
+            ]))
+            .await?;
+
+        let cids = walk_dag_in_order_breadth_first(cid_root, store)
+            .try_collect::<Vec<_>>()
+            .await?
+            .into_iter()
+            .map(|(cid, _block)| cid)
+            .collect::<Vec<_>>();
+
+        assert_eq!(cids, vec![cid_root, cid_1_wrap, cid_2, cid_3, cid_1]);
 
-    #[test]
-    fn test_mult() {
-        assert_eq!(mult(3, 2), 6);
+        Ok(())
     }
 }

From 77b1f1138ef43b1c8ec74512202b359808bf6785 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= <philipp.krueger1@gmail.com>
Date: Tue, 15 Aug 2023 15:47:00 +0200
Subject: [PATCH 02/35] Update to new `roaring_graphs` library version

Co-authored-by: James Walker <walkah@walkah.net>
---
 Cargo.lock                                | 86 +++++++++++++++++++++++
 car-mirror/Cargo.toml                     |  6 +-
 car-mirror/proptest-regressions/lib.txt   |  7 ++
 car-mirror/src/lib.rs                     | 60 ++++++++++++++++
 car-mirror/src/test_utils/dag_strategy.rs | 51 ++++++++++++++
 car-mirror/src/test_utils/mod.rs          |  4 ++
 6 files changed, 213 insertions(+), 1 deletion(-)
 create mode 100644 car-mirror/proptest-regressions/lib.txt
 create mode 100644 car-mirror/src/test_utils/dag_strategy.rs

diff --git a/Cargo.lock b/Cargo.lock
index 32b1cd0..45b7775 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -307,6 +307,12 @@ version = "3.13.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1"
 
+[[package]]
+name = "bytemuck"
+version = "1.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "17febce684fd15d89027105661fec94afb475cb995fbc59d2865198446ba2eea"
+
 [[package]]
 name = "byteorder"
 version = "1.4.3"
@@ -330,10 +336,14 @@ dependencies = [
  "async-std",
  "async-stream",
  "bytes",
+ "car-mirror",
+ "fixedbitset",
  "futures",
  "libipld",
  "libipld-core",
  "proptest",
+ "roaring-graphs",
+ "test-strategy",
  "tracing",
  "tracing-subscriber",
  "wnfs-common",
@@ -498,6 +508,12 @@ dependencies = [
  "memchr",
 ]
 
+[[package]]
+name = "cov-mark"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9ffa3d3e0138386cd4361f63537765cac7ee40698028844635a54495a92f67f3"
+
 [[package]]
 name = "cpufeatures"
 version = "0.2.9"
@@ -645,6 +661,12 @@ dependencies = [
  "instant",
 ]
 
+[[package]]
+name = "fixedbitset"
+version = "0.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80"
+
 [[package]]
 name = "fnv"
 version = "1.0.7"
@@ -1323,6 +1345,35 @@ version = "0.7.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "436b050e76ed2903236f032a59761c1eb99e1b0aead2c257922771dab1fc8c78"
 
+[[package]]
+name = "retain_mut"
+version = "0.1.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8c31b5c4033f8fdde8700e4657be2c497e7288f01515be52168c631e2e4d4086"
+
+[[package]]
+name = "roaring"
+version = "0.10.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6106b5cf8587f5834158895e9715a3c6c9716c8aefab57f1f7680917191c7873"
+dependencies = [
+ "bytemuck",
+ "byteorder",
+ "retain_mut",
+]
+
+[[package]]
+name = "roaring-graphs"
+version = "0.12.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ff3b6db6a957b3ee92cf83d4a107d37827c4aa7a92ca71a933f0bea83a35d61f"
+dependencies = [
+ "cov-mark",
+ "proptest",
+ "rand",
+ "roaring",
+]
+
 [[package]]
 name = "rustix"
 version = "0.37.20"
@@ -1474,6 +1525,29 @@ dependencies = [
  "winapi",
 ]
 
+[[package]]
+name = "structmeta"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "78ad9e09554f0456d67a69c1584c9798ba733a5b50349a6c0d0948710523922d"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "structmeta-derive",
+ "syn 2.0.27",
+]
+
+[[package]]
+name = "structmeta-derive"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a60bcaff7397072dca0017d1db428e30d5002e00b6847703e2e42005c95fbe00"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.27",
+]
+
 [[package]]
 name = "syn"
 version = "1.0.109"
@@ -1522,6 +1596,18 @@ dependencies = [
  "windows-sys",
 ]
 
+[[package]]
+name = "test-strategy"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b8361c808554228ad09bfed70f5c823caf8a3450b6881cc3a38eb57e8c08c1d9"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "structmeta",
+ "syn 2.0.27",
+]
+
 [[package]]
 name = "textwrap"
 version = "0.16.0"
diff --git a/car-mirror/Cargo.toml b/car-mirror/Cargo.toml
index 12fbca9..30f2028 100644
--- a/car-mirror/Cargo.toml
+++ b/car-mirror/Cargo.toml
@@ -26,17 +26,21 @@ doc = true
 anyhow = "1.0"
 async-stream = "0.3.5"
 bytes = "1.4.0"
+fixedbitset = "0.4.2"
 futures = "0.3.28"
 libipld = "0.16.0"
 libipld-core = "0.16.0"
 proptest = { version = "1.1", optional = true }
+roaring-graphs = "0.12"
 tracing = "0.1"
 tracing-subscriber = "0.3"
 wnfs-common = "0.1.23"
 
 [dev-dependencies]
-proptest = "1.1"
 async-std = { version = "1.11", features = ["attributes"] }
+car-mirror = { path = ".", features = ["test_utils"] }
+proptest = "1.1"
+test-strategy = "0.3"
 
 [features]
 default = []
diff --git a/car-mirror/proptest-regressions/lib.txt b/car-mirror/proptest-regressions/lib.txt
new file mode 100644
index 0000000..b3f356f
--- /dev/null
+++ b/car-mirror/proptest-regressions/lib.txt
@@ -0,0 +1,7 @@
+# Seeds for failure cases proptest has generated in the past. It is
+# automatically read and these particular cases re-run before any
+# novel cases are generated.
+#
+# It is recommended to check this file in to source control so that
+# everyone who runs the test benefits from these saved cases.
+cc ecfcb732e093de600a3a3012674efabacf88f817b47d75c0ad0da9762ca3b6f7 # shrinks to input = _WalkDagNeverIteratesBlockTwiceArgs { dag: [(Cid(bafyreigvk2vd4s7ecxqhr7vlf5ei5tpdpalx73wbt53zokomvab5ouzq2a), b"\x86\xd8*X%\0\x01q\x12 pg\xacO\xb3\xfe\xacy\xd9k\xee\xc10\xdd\x8b\xbbc\x81\xc1\x06\x12\xf5Uw\x9e\xed\x11n\r?8\xdc\xd8*X%\0\x01q\x12 \xe0\xed\xdb\xa6\x0c\x8b\xf9\xe2fk\x12\xdd3\xf9\xb9Y%[\x85\xf6\xd9\xd8\x15\x8a\xce3\xbb\xdfN\xcfM\x93\xd8*X%\0\x01q\x12 \xfei\x9fJr\x7f\xed\xfd\t\0\x02lz5\x0eD\xc5\xf9\xe2\xda\"\x9ez5y\xd5\xc8\x02\xab+\xc0\x92\xd8*X%\0\x01q\x12 \xc6M\xa8\xd1B\x12\xcfT\xbfC\xd0\x1e\x89\x8c\xaa\x11\xafq\xed^sF\xb5\xda\x19\x98\xf2B\xb9\xe2\x9f\xb3\xd8*X%\0\x01q\x12 v\xbe\x8bR\x8d\0u\xf7\xaa\xe9\x8do\xa5zm<\x83\xaeH\n\x84i\xe6h\xd7\xb0\xaf\x96\x89\x95\xacq\xd8*X%\0\x01q\x12 \x92`zHK\xcfR\xb8\x10G\xa3+t\t\xb7_\xc9\xf6\x9b+\xee\xe0\x83S\"#\xba\xe8Q\xdd\x10\x02")] }
diff --git a/car-mirror/src/lib.rs b/car-mirror/src/lib.rs
index ce4eaa8..533ffda 100644
--- a/car-mirror/src/lib.rs
+++ b/car-mirror/src/lib.rs
@@ -86,3 +86,63 @@ mod tests {
         Ok(())
     }
 }
+
+#[cfg(test)]
+mod proptests {
+    use crate::{
+        test_utils::{encode, generate_dag},
+        walk_dag_in_order_breadth_first,
+    };
+    use futures::TryStreamExt;
+    use libipld::{
+        multihash::{Code, MultihashDigest},
+        Cid, Ipld, IpldCodec,
+    };
+    use proptest::strategy::Strategy;
+    use std::collections::BTreeSet;
+    use test_strategy::proptest;
+    use wnfs_common::{BlockStore, MemoryBlockStore};
+
+    fn ipld_dags() -> impl Strategy<Value = (Vec<(Cid, Ipld)>, Cid)> {
+        generate_dag(256, |cids| {
+            let ipld = Ipld::List(cids.into_iter().map(Ipld::Link).collect());
+            let cid = Cid::new_v1(
+                IpldCodec::DagCbor.into(),
+                Code::Blake3_256.digest(&encode(&ipld)),
+            );
+            (cid, ipld)
+        })
+    }
+
+    #[proptest(max_shrink_iters = 100_000)]
+    fn walk_dag_never_iterates_block_twice(#[strategy(ipld_dags())] dag: (Vec<(Cid, Ipld)>, Cid)) {
+        async_std::task::block_on(async {
+            let (dag, root) = dag;
+            let store = &MemoryBlockStore::new();
+            for (cid, ipld) in dag.iter() {
+                let cid_store = store
+                    .put_block(encode(ipld), IpldCodec::DagCbor.into())
+                    .await
+                    .unwrap();
+                assert_eq!(*cid, cid_store);
+            }
+
+            let mut cids = walk_dag_in_order_breadth_first(root, store)
+                .map_ok(|(cid, _)| cid)
+                .try_collect::<Vec<_>>()
+                .await
+                .unwrap();
+
+            cids.sort();
+
+            let unique_cids = cids
+                .iter()
+                .cloned()
+                .collect::<BTreeSet<_>>()
+                .into_iter()
+                .collect::<Vec<_>>();
+
+            assert_eq!(cids, unique_cids);
+        });
+    }
+}
diff --git a/car-mirror/src/test_utils/dag_strategy.rs b/car-mirror/src/test_utils/dag_strategy.rs
new file mode 100644
index 0000000..003c25e
--- /dev/null
+++ b/car-mirror/src/test_utils/dag_strategy.rs
@@ -0,0 +1,51 @@
+use std::{collections::HashSet, fmt::Debug};
+
+use bytes::Bytes;
+use libipld::{Cid, Ipld, IpldCodec};
+use libipld_core::codec::Encode;
+use proptest::strategy::Strategy;
+use roaring_graphs::{arb_dag, DirectedAcyclicGraph, Vertex};
+
+pub fn encode(ipld: &Ipld) -> Bytes {
+    let mut vec = Vec::new();
+    ipld.encode(IpldCodec::DagCbor, &mut vec).unwrap(); // TODO(matheus23) unwrap
+    Bytes::from(vec)
+}
+
+pub fn generate_dag<T: Debug + Clone>(
+    max_nodes: u16,
+    generate_block: fn(Vec<Cid>) -> (Cid, T),
+) -> impl Strategy<Value = (Vec<(Cid, T)>, Cid)> {
+    arb_dag(1..max_nodes, 0.5).prop_map(move |dag| dag_to_nodes(&dag, generate_block))
+}
+
+pub fn dag_to_nodes<T>(
+    dag: &DirectedAcyclicGraph,
+    generate_node: fn(Vec<Cid>) -> (Cid, T),
+) -> (Vec<(Cid, T)>, Cid) {
+    let mut blocks = Vec::new();
+    let mut visited = HashSet::new();
+    let (cid, block) = dag_to_nodes_helper(dag, 0, generate_node, &mut blocks, &mut visited);
+    blocks.push((cid, block));
+    (blocks, cid)
+}
+
+pub fn dag_to_nodes_helper<T>(
+    dag: &DirectedAcyclicGraph,
+    root: Vertex,
+    generate_node: fn(Vec<Cid>) -> (Cid, T),
+    arr: &mut Vec<(Cid, T)>,
+    visited: &mut HashSet<Vertex>,
+) -> (Cid, T) {
+    let mut child_blocks = Vec::new();
+    for child in dag.iter_children(root) {
+        if visited.contains(&child) {
+            continue;
+        }
+        visited.insert(child);
+        child_blocks.push(dag_to_nodes_helper(dag, child, generate_node, arr, visited));
+    }
+    let result = generate_node(child_blocks.iter().map(|(cid, _)| *cid).collect());
+    arr.extend(child_blocks);
+    result
+}
diff --git a/car-mirror/src/test_utils/mod.rs b/car-mirror/src/test_utils/mod.rs
index 4a30e2a..890a5ad 100644
--- a/car-mirror/src/test_utils/mod.rs
+++ b/car-mirror/src/test_utils/mod.rs
@@ -1,5 +1,9 @@
+#[cfg(feature = "test_utils")]
+mod dag_strategy;
 /// Random value generator for sampling data.
 #[cfg(feature = "test_utils")]
 mod rvg;
 #[cfg(feature = "test_utils")]
+pub use dag_strategy::*;
+#[cfg(feature = "test_utils")]
 pub use rvg::*;

From ca386907a258256baf9c25fac03d7fb9372381cd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= <philipp.krueger1@gmail.com>
Date: Wed, 16 Aug 2023 11:27:30 +0200
Subject: [PATCH 03/35] Write a stream of blocks into a CAR file

---
 Cargo.lock            | 113 ++++++++++++++++++++++++++++++++++++++++--
 car-mirror/Cargo.toml |   4 ++
 car-mirror/src/lib.rs |  53 ++++++++++++++++++--
 3 files changed, 163 insertions(+), 7 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 45b7775..d4cff6a 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2,6 +2,21 @@
 # It is not intended for manual editing.
 version = 3
 
+[[package]]
+name = "addr2line"
+version = "0.20.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f4fa78e18c64fce05e902adecd7a5eed15a5e0a3439f7b0e169f0252214865e3"
+dependencies = [
+ "gimli",
+]
+
+[[package]]
+name = "adler"
+version = "1.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
+
 [[package]]
 name = "android-tzdata"
 version = "0.1.1"
@@ -183,9 +198,9 @@ checksum = "ecc7ab41815b3c653ccd2978ec3255c81349336702dfdf62ee6f7069b12a3aae"
 
 [[package]]
 name = "async-trait"
-version = "0.1.69"
+version = "0.1.73"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7b2d0f03b3640e3a630367e40c468cb7f309529c708ed1d88597047b0e7c6ef7"
+checksum = "bc00ceb34980c03614e35a3a4e218276a0a824e911d07651cd0d858a51e8c0f0"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -215,6 +230,21 @@ version = "1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
 
+[[package]]
+name = "backtrace"
+version = "0.3.68"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4319208da049c43661739c5fade2ba182f09d1dc2299b32298d3a31692b17e12"
+dependencies = [
+ "addr2line",
+ "cc",
+ "cfg-if",
+ "libc",
+ "miniz_oxide",
+ "object",
+ "rustc-demangle",
+]
+
 [[package]]
 name = "base-x"
 version = "0.2.11"
@@ -335,15 +365,19 @@ dependencies = [
  "anyhow",
  "async-std",
  "async-stream",
+ "async-trait",
  "bytes",
  "car-mirror",
  "fixedbitset",
  "futures",
+ "iroh-car",
  "libipld",
  "libipld-core",
  "proptest",
  "roaring-graphs",
  "test-strategy",
+ "tokio",
+ "tokio-util",
  "tracing",
  "tracing-subscriber",
  "wnfs-common",
@@ -798,6 +832,12 @@ dependencies = [
  "wasi",
 ]
 
+[[package]]
+name = "gimli"
+version = "0.27.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b6c80984affa11d98d1b88b66ac8853f143217b399d3c74116778ff8fdb4ed2e"
+
 [[package]]
 name = "gloo-timers"
 version = "0.2.6"
@@ -890,6 +930,21 @@ dependencies = [
  "windows-sys",
 ]
 
+[[package]]
+name = "iroh-car"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0a291220adb48738bdea587156c5f44ca5ec4ad31fdeb8fb88fda1dcd7886a24"
+dependencies = [
+ "anyhow",
+ "cid",
+ "futures",
+ "libipld",
+ "thiserror",
+ "tokio",
+ "unsigned-varint",
+]
+
 [[package]]
 name = "itertools"
 version = "0.10.5"
@@ -1061,6 +1116,15 @@ version = "2.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
 
+[[package]]
+name = "miniz_oxide"
+version = "0.7.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7"
+dependencies = [
+ "adler",
+]
+
 [[package]]
 name = "multibase"
 version = "0.9.1"
@@ -1125,6 +1189,15 @@ dependencies = [
  "libm",
 ]
 
+[[package]]
+name = "object"
+version = "0.31.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8bda667d9f2b5051b8833f59f3bf748b28ef54f850f4fcb389a252aa383866d1"
+dependencies = [
+ "memchr",
+]
+
 [[package]]
 name = "once_cell"
 version = "1.18.0"
@@ -1269,9 +1342,9 @@ dependencies = [
 
 [[package]]
 name = "quote"
-version = "1.0.28"
+version = "1.0.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1b9ab9c7eadfd8df19006f1cf1a4aed13540ed5cbc047010ece5826e10825488"
+checksum = "50f3b39ccfb720540debaa0164757101c08ecb8d326b15358ce76a62c7e85965"
 dependencies = [
  "proc-macro2",
 ]
@@ -1374,6 +1447,12 @@ dependencies = [
  "roaring",
 ]
 
+[[package]]
+name = "rustc-demangle"
+version = "0.1.23"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76"
+
 [[package]]
 name = "rustix"
 version = "0.37.20"
@@ -1654,6 +1733,32 @@ dependencies = [
  "serde_json",
 ]
 
+[[package]]
+name = "tokio"
+version = "1.29.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "532826ff75199d5833b9d2c5fe410f29235e25704ee5f0ef599fb51c21f4a4da"
+dependencies = [
+ "autocfg",
+ "backtrace",
+ "bytes",
+ "pin-project-lite",
+]
+
+[[package]]
+name = "tokio-util"
+version = "0.7.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "806fe8c2c87eccc8b3267cbae29ed3ab2d0bd37fca70ab622e46aaa9375ddb7d"
+dependencies = [
+ "bytes",
+ "futures-core",
+ "futures-io",
+ "futures-sink",
+ "pin-project-lite",
+ "tokio",
+]
+
 [[package]]
 name = "toml"
 version = "0.5.11"
diff --git a/car-mirror/Cargo.toml b/car-mirror/Cargo.toml
index 30f2028..5d7094f 100644
--- a/car-mirror/Cargo.toml
+++ b/car-mirror/Cargo.toml
@@ -28,13 +28,17 @@ async-stream = "0.3.5"
 bytes = "1.4.0"
 fixedbitset = "0.4.2"
 futures = "0.3.28"
+iroh-car = "0.3.0"
 libipld = "0.16.0"
 libipld-core = "0.16.0"
 proptest = { version = "1.1", optional = true }
 roaring-graphs = "0.12"
+tokio-util = { version = "0.7.8", features = ["compat"] }
+tokio = { version = "^1", features = ["io-util"] }
 tracing = "0.1"
 tracing-subscriber = "0.3"
 wnfs-common = "0.1.23"
+async-trait = "0.1.73"
 
 [dev-dependencies]
 async-std = { version = "1.11", features = ["attributes"] }
diff --git a/car-mirror/src/lib.rs b/car-mirror/src/lib.rs
index 533ffda..3ee79e2 100644
--- a/car-mirror/src/lib.rs
+++ b/car-mirror/src/lib.rs
@@ -7,13 +7,15 @@
 use anyhow::Result;
 use async_stream::try_stream;
 use bytes::Bytes;
-use futures::Stream;
+use futures::{Stream, StreamExt};
+use iroh_car::CarWriter;
 use libipld::{Ipld, IpldCodec};
 use libipld_core::{cid::Cid, codec::References};
 use std::{
     collections::{HashSet, VecDeque},
     io::Cursor,
 };
+use tokio::io::AsyncWrite;
 use wnfs_common::BlockStore;
 
 /// Test utilities.
@@ -25,8 +27,8 @@ pub mod test_utils;
 pub fn walk_dag_in_order_breadth_first<'a>(
     root: Cid,
     store: &'a impl BlockStore,
-) -> impl Stream<Item = Result<(Cid, Bytes)>> + 'a {
-    try_stream! {
+) -> impl Stream<Item = Result<(Cid, Bytes)>> + Unpin + 'a {
+    Box::pin(try_stream! {
         let mut visited = HashSet::new();
         let mut frontier = VecDeque::from([root]);
         while let Some(cid) = frontier.pop_front() {
@@ -39,7 +41,19 @@ pub fn walk_dag_in_order_breadth_first<'a>(
             frontier.extend(references(codec, &block)?);
             yield (cid, block);
         }
+    })
+}
+
+/// Writes a stream of blocks into a car file
+pub async fn stream_into_car<W: AsyncWrite + Send + Unpin>(
+    mut blocks: impl Stream<Item = Result<(Cid, Bytes)>> + Unpin,
+    writer: &mut CarWriter<W>,
+) -> Result<()> {
+    while let Some(result) = blocks.next().await {
+        let (cid, bytes) = result?;
+        writer.write(cid, bytes).await?;
     }
+    Ok(())
 }
 
 fn references(codec: IpldCodec, block: impl AsRef<[u8]>) -> Result<Vec<Cid>> {
@@ -50,10 +64,43 @@ fn references(codec: IpldCodec, block: impl AsRef<[u8]>) -> Result<Vec<Cid>> {
 
 #[cfg(test)]
 mod tests {
+    use crate::test_utils::{encode, generate_dag, Rvg};
+
     use super::*;
+    use async_std::fs::File;
     use futures::TryStreamExt;
+    use iroh_car::CarHeader;
+    use libipld_core::multihash::{Code, MultihashDigest};
+    use tokio_util::compat::FuturesAsyncWriteCompatExt;
     use wnfs_common::MemoryBlockStore;
 
+    #[async_std::test]
+    async fn test_write_into_car() -> Result<()> {
+        let (blocks, root) = Rvg::new().sample(&generate_dag(256, |cids| {
+            let ipld = Ipld::List(cids.into_iter().map(Ipld::Link).collect());
+            let bytes = encode(&ipld);
+            let cid = Cid::new_v1(IpldCodec::DagCbor.into(), Code::Blake3_256.digest(&bytes));
+            (cid, bytes)
+        }));
+
+        let store = &MemoryBlockStore::new();
+        for (cid, bytes) in blocks.iter() {
+            let cid_store = store
+                .put_block(bytes.clone(), IpldCodec::DagCbor.into())
+                .await?;
+            assert_eq!(*cid, cid_store);
+        }
+
+        let file = File::create("./my-car3.car").await?;
+        let mut writer = CarWriter::new(CarHeader::new_v1(vec![root]), file.compat_write());
+        writer.write_header().await?;
+        let block_stream = walk_dag_in_order_breadth_first(root, store);
+        stream_into_car(block_stream, &mut writer).await?;
+        let file = writer.finish().await?;
+
+        Ok(())
+    }
+
     #[async_std::test]
     async fn test_walk_dag_breadth_first() -> Result<()> {
         let store = &MemoryBlockStore::new();

From 9e4fcf860914c426076880ce7ac7f52e1f9e35aa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= <philipp.krueger1@gmail.com>
Date: Wed, 16 Aug 2023 12:14:09 +0200
Subject: [PATCH 04/35] Read back car files and make sure they're incrementally
 verified

---
 car-mirror/src/lib.rs | 75 +++++++++++++++++++++++++++++++++++++------
 1 file changed, 66 insertions(+), 9 deletions(-)

diff --git a/car-mirror/src/lib.rs b/car-mirror/src/lib.rs
index 3ee79e2..f2e8cb7 100644
--- a/car-mirror/src/lib.rs
+++ b/car-mirror/src/lib.rs
@@ -4,18 +4,21 @@
 
 //! car-mirror
 
-use anyhow::Result;
+use anyhow::{anyhow, bail, Result};
 use async_stream::try_stream;
 use bytes::Bytes;
 use futures::{Stream, StreamExt};
-use iroh_car::CarWriter;
+use iroh_car::{CarReader, CarWriter};
 use libipld::{Ipld, IpldCodec};
-use libipld_core::{cid::Cid, codec::References};
+use libipld_core::{
+    cid::Cid,
+    codec::References,
+    multihash::{Code, MultihashDigest},
+};
 use std::{
     collections::{HashSet, VecDeque},
     io::Cursor,
 };
-use tokio::io::AsyncWrite;
 use wnfs_common::BlockStore;
 
 /// Test utilities.
@@ -45,7 +48,7 @@ pub fn walk_dag_in_order_breadth_first<'a>(
 }
 
 /// Writes a stream of blocks into a car file
-pub async fn stream_into_car<W: AsyncWrite + Send + Unpin>(
+pub async fn stream_into_car<W: tokio::io::AsyncWrite + Send + Unpin>(
     mut blocks: impl Stream<Item = Result<(Cid, Bytes)>> + Unpin,
     writer: &mut CarWriter<W>,
 ) -> Result<()> {
@@ -56,6 +59,49 @@ pub async fn stream_into_car<W: AsyncWrite + Send + Unpin>(
     Ok(())
 }
 
+/// Read a directed acyclic graph from a CAR file, making sure it's read in-order and
+/// only blocks reachable from the root are included.
+pub fn read_in_order_dag_from_car<'a, R: tokio::io::AsyncRead + Unpin>(
+    root: Cid,
+    reader: &'a mut CarReader<R>,
+) -> impl Stream<Item = Result<(Cid, Bytes)>> + Unpin + 'a {
+    Box::pin(try_stream! {
+        let mut reachable_from_root = HashSet::from([root]);
+        while let Some((cid, vec)) = reader.next_block().await.map_err(|e| anyhow!(e))? {
+            let block = Bytes::from(vec);
+
+            let code: Code = cid
+                .hash()
+                .code()
+                .try_into()
+                .map_err(|_| anyhow!("Unsupported hash code in Cid: {cid}"))?;
+
+            let codec: IpldCodec = cid
+                .codec()
+                .try_into()
+                .map_err(|_| anyhow!("Unsupported codec in Cid: {cid}"))?;
+
+            let digest = code.digest(&block);
+
+            if cid.hash() != &digest {
+                Err(anyhow!(
+                    "Digest mismatch in CAR file: expected {:?}, got {:?}",
+                    digest,
+                    cid.hash()
+                ))?;
+            }
+
+            if !reachable_from_root.contains(&cid) {
+                Err(anyhow!("Unexpected block or block out of order: {cid}"))?;
+            }
+
+            reachable_from_root.extend(references(codec, &block)?);
+
+            yield (cid, block);
+        }
+    })
+}
+
 fn references(codec: IpldCodec, block: impl AsRef<[u8]>) -> Result<Vec<Cid>> {
     let mut refs = Vec::new();
     <Ipld as References<IpldCodec>>::references(codec, &mut Cursor::new(block), &mut refs)?;
@@ -68,10 +114,10 @@ mod tests {
 
     use super::*;
     use async_std::fs::File;
-    use futures::TryStreamExt;
+    use futures::{future, TryStreamExt};
     use iroh_car::CarHeader;
     use libipld_core::multihash::{Code, MultihashDigest};
-    use tokio_util::compat::FuturesAsyncWriteCompatExt;
+    use tokio_util::compat::{FuturesAsyncReadCompatExt, FuturesAsyncWriteCompatExt};
     use wnfs_common::MemoryBlockStore;
 
     #[async_std::test]
@@ -91,12 +137,23 @@ mod tests {
             assert_eq!(*cid, cid_store);
         }
 
-        let file = File::create("./my-car3.car").await?;
+        let filename = "./my-car.car";
+
+        let file = File::create(filename).await?;
         let mut writer = CarWriter::new(CarHeader::new_v1(vec![root]), file.compat_write());
         writer.write_header().await?;
         let block_stream = walk_dag_in_order_breadth_first(root, store);
         stream_into_car(block_stream, &mut writer).await?;
-        let file = writer.finish().await?;
+        writer.finish().await?;
+
+        let mut reader = CarReader::new(File::open(filename).await?.compat()).await?;
+
+        read_in_order_dag_from_car(root, &mut reader)
+            .try_for_each(|(cid, _)| {
+                println!("Got {cid}");
+                future::ready(Ok(()))
+            })
+            .await?;
 
         Ok(())
     }

From aa22ea06722fe797e4cb453375be896f6a25abd6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= <philipp.krueger1@gmail.com>
Date: Wed, 16 Aug 2023 14:18:44 +0200
Subject: [PATCH 05/35] Add over-the-wire datatypes

---
 Cargo.lock                 | 63 ++++++++++++++++++++++++++++----------
 car-mirror/Cargo.toml      |  2 ++
 car-mirror/src/lib.rs      |  5 ++-
 car-mirror/src/messages.rs | 60 ++++++++++++++++++++++++++++++++++++
 4 files changed, 112 insertions(+), 18 deletions(-)
 create mode 100644 car-mirror/src/messages.rs

diff --git a/Cargo.lock b/Cargo.lock
index d4cff6a..15f9f73 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -187,7 +187,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.27",
+ "syn 2.0.28",
 ]
 
 [[package]]
@@ -204,7 +204,7 @@ checksum = "bc00ceb34980c03614e35a3a4e218276a0a824e911d07651cd0d858a51e8c0f0"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.27",
+ "syn 2.0.28",
 ]
 
 [[package]]
@@ -375,6 +375,8 @@ dependencies = [
  "libipld-core",
  "proptest",
  "roaring-graphs",
+ "serde",
+ "serde_ipld_dagcbor",
  "test-strategy",
  "tokio",
  "tokio-util",
@@ -410,6 +412,15 @@ version = "0.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
 
+[[package]]
+name = "cbor4ii"
+version = "0.2.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b544cf8c89359205f4f990d0e6f3828db42df85b5dac95d09157a250eb0749c4"
+dependencies = [
+ "serde",
+]
+
 [[package]]
 name = "cc"
 version = "1.0.79"
@@ -778,7 +789,7 @@ checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.27",
+ "syn 2.0.28",
 ]
 
 [[package]]
@@ -1500,11 +1511,17 @@ version = "1.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e1cf6437eb19a8f4a6cc0f7dca544973b0b78843adbfeb3683d1a94a0024a294"
 
+[[package]]
+name = "scopeguard"
+version = "1.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
+
 [[package]]
 name = "serde"
-version = "1.0.175"
+version = "1.0.183"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5d25439cd7397d044e2748a6fe2432b5e85db703d6d097bd014b3c0ad1ebff0b"
+checksum = "32ac8da02677876d532745a130fc9d8e6edfa81a269b107c5b00829b91d8eb3c"
 dependencies = [
  "serde_derive",
 ]
@@ -1529,13 +1546,25 @@ dependencies = [
 
 [[package]]
 name = "serde_derive"
-version = "1.0.175"
+version = "1.0.183"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b23f7ade6f110613c0d63858ddb8b94c1041f550eab58a16b371bdf2c9c80ab4"
+checksum = "aafe972d60b0b9bee71a91b92fee2d4fb3c9d7e8f6b179aa99f27203d99a4816"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.27",
+ "syn 2.0.28",
+]
+
+[[package]]
+name = "serde_ipld_dagcbor"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ace39c1b7526be78c755a4c698313f699cf44e62408c0029bf9ab9450fe836da"
+dependencies = [
+ "cbor4ii",
+ "cid",
+ "scopeguard",
+ "serde",
 ]
 
 [[package]]
@@ -1613,7 +1642,7 @@ dependencies = [
  "proc-macro2",
  "quote",
  "structmeta-derive",
- "syn 2.0.27",
+ "syn 2.0.28",
 ]
 
 [[package]]
@@ -1624,7 +1653,7 @@ checksum = "a60bcaff7397072dca0017d1db428e30d5002e00b6847703e2e42005c95fbe00"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.27",
+ "syn 2.0.28",
 ]
 
 [[package]]
@@ -1640,9 +1669,9 @@ dependencies = [
 
 [[package]]
 name = "syn"
-version = "2.0.27"
+version = "2.0.28"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b60f673f44a8255b9c8c657daf66a596d435f2da81a555b06dc644d080ba45e0"
+checksum = "04361975b3f5e348b2189d8dc55bc942f278b2d482a6a0365de5bdd62d351567"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -1684,7 +1713,7 @@ dependencies = [
  "proc-macro2",
  "quote",
  "structmeta",
- "syn 2.0.27",
+ "syn 2.0.28",
 ]
 
 [[package]]
@@ -1710,7 +1739,7 @@ checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.27",
+ "syn 2.0.28",
 ]
 
 [[package]]
@@ -1788,7 +1817,7 @@ checksum = "8803eee176538f94ae9a14b55b2804eb7e1441f8210b1c31290b3bccdccff73b"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.27",
+ "syn 2.0.28",
 ]
 
 [[package]]
@@ -1928,7 +1957,7 @@ dependencies = [
  "once_cell",
  "proc-macro2",
  "quote",
- "syn 2.0.27",
+ "syn 2.0.28",
  "wasm-bindgen-shared",
 ]
 
@@ -1962,7 +1991,7 @@ checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.27",
+ "syn 2.0.28",
  "wasm-bindgen-backend",
  "wasm-bindgen-shared",
 ]
diff --git a/car-mirror/Cargo.toml b/car-mirror/Cargo.toml
index 5d7094f..a25201f 100644
--- a/car-mirror/Cargo.toml
+++ b/car-mirror/Cargo.toml
@@ -39,6 +39,8 @@ tracing = "0.1"
 tracing-subscriber = "0.3"
 wnfs-common = "0.1.23"
 async-trait = "0.1.73"
+serde_ipld_dagcbor = "0.4.0"
+serde = "1.0.183"
 
 [dev-dependencies]
 async-std = { version = "1.11", features = ["attributes"] }
diff --git a/car-mirror/src/lib.rs b/car-mirror/src/lib.rs
index f2e8cb7..d3fc7e2 100644
--- a/car-mirror/src/lib.rs
+++ b/car-mirror/src/lib.rs
@@ -4,7 +4,7 @@
 
 //! car-mirror
 
-use anyhow::{anyhow, bail, Result};
+use anyhow::{anyhow, Result};
 use async_stream::try_stream;
 use bytes::Bytes;
 use futures::{Stream, StreamExt};
@@ -26,6 +26,9 @@ use wnfs_common::BlockStore;
 #[cfg_attr(docsrs, doc(cfg(feature = "test_utils")))]
 pub mod test_utils;
 
+/// Contains the data types that are sent over-the-wire and relevant serialization code.
+pub mod messages;
+
 /// walks a DAG from given root breadth-first along IPLD links
 pub fn walk_dag_in_order_breadth_first<'a>(
     root: Cid,
diff --git a/car-mirror/src/messages.rs b/car-mirror/src/messages.rs
new file mode 100644
index 0000000..815fe45
--- /dev/null
+++ b/car-mirror/src/messages.rs
@@ -0,0 +1,60 @@
+use libipld_core::cid::Cid;
+use serde::{Deserialize, Serialize};
+
+/// Initial message for pull requests.
+///
+/// Over-the-wire data type from the [specification].
+///
+/// [specification]: https://github.com/fission-codes/spec/blob/86fcfb07d507f1df4fdaaf49088abecbb1dda76a/car-pool/car-mirror/http.md#12-requestor-payload
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub struct PullRequest {
+    /// Requested CID roots
+    #[serde(rename = "rs")]
+    pub resources: Vec<Cid>,
+
+    /// Bloom filter hash count
+    #[serde(rename = "bk")]
+    pub bloom_k: u32,
+
+    /// Bloom filter Binary
+    #[serde(rename = "bb")]
+    pub bloom: Vec<u8>,
+}
+
+/// Part of the initial message for push requests.
+/// The other part is simply tupled together with the actual initial
+/// CAR file.
+///
+/// Wire data type from the [specification].
+///
+/// [specification]: https://github.com/fission-codes/spec/blob/86fcfb07d507f1df4fdaaf49088abecbb1dda76a/car-pool/car-mirror/http.md#22-requestor-payload
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub struct PushRequestHeader {
+    /// Bloom filter hash count
+    #[serde(rename = "bk")]
+    pub bloom_k: u32,
+
+    /// Bloom filter Binary
+    #[serde(rename = "bb")]
+    pub bloom: Vec<u8>,
+}
+
+/// The response sent after the initial and subsequent push requests.
+///
+/// Wire data type from the [specification].
+///
+/// [specification]: https://github.com/fission-codes/spec/blob/86fcfb07d507f1df4fdaaf49088abecbb1dda76a/car-pool/car-mirror/http.md#23-provider-payload
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub struct PushResponse {
+    /// Incomplete subgraph roots
+    #[serde(rename = "sr")]
+    pub subgraph_roots: Vec<Cid>,
+
+    /// Bloom filter hash count
+    #[serde(rename = "bk")]
+    pub bloom_k: u32,
+
+    /// Bloom filter Binary
+    #[serde(rename = "bb")]
+    pub bloom: Vec<u8>,
+}

From 0c197aad717dba70ae642a4050946c8a6a831858 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= <philipp.krueger1@gmail.com>
Date: Wed, 16 Aug 2023 18:42:51 +0200
Subject: [PATCH 06/35] Small demo of running CAR mirror in-memory

---
 car-mirror/src/lib.rs | 213 ++++++++++++++++++++++++++++++++++++++----
 1 file changed, 196 insertions(+), 17 deletions(-)

diff --git a/car-mirror/src/lib.rs b/car-mirror/src/lib.rs
index d3fc7e2..2701d8e 100644
--- a/car-mirror/src/lib.rs
+++ b/car-mirror/src/lib.rs
@@ -4,17 +4,18 @@
 
 //! car-mirror
 
-use anyhow::{anyhow, Result};
+use anyhow::{anyhow, bail, Result};
 use async_stream::try_stream;
 use bytes::Bytes;
-use futures::{Stream, StreamExt};
-use iroh_car::{CarReader, CarWriter};
+use futures::{stream::LocalBoxStream, Stream, StreamExt, TryStreamExt};
+use iroh_car::{CarHeader, CarReader, CarWriter};
 use libipld::{Ipld, IpldCodec};
 use libipld_core::{
     cid::Cid,
     codec::References,
     multihash::{Code, MultihashDigest},
 };
+use messages::PushResponse;
 use std::{
     collections::{HashSet, VecDeque},
     io::Cursor,
@@ -29,14 +30,137 @@ pub mod test_utils;
 /// Contains the data types that are sent over-the-wire and relevant serialization code.
 pub mod messages;
 
+pub struct PushSenderSession<'a, B: BlockStore> {
+    last_response: PushResponse,
+    send_limit: usize,
+    store: &'a B,
+}
+
+impl<'a, B: BlockStore> PushSenderSession<'a, B> {
+    pub fn new(root: Cid, store: &'a B) -> Self {
+        Self {
+            last_response: PushResponse {
+                subgraph_roots: vec![root],
+                // Just putting an empty bloom here initially
+                bloom_k: 3,
+                bloom: Vec::new(),
+            },
+            send_limit: 256 * 1024, // 256KiB
+            store,
+        }
+    }
+
+    pub fn handle_response(&mut self, response: PushResponse) -> bool {
+        self.last_response = response;
+        self.last_response.subgraph_roots.is_empty()
+    }
+
+    pub async fn next_request(&mut self) -> Result<Bytes> {
+        let mut writer = CarWriter::new(
+            CarHeader::new_v1(
+                // TODO(matheus23): This is stupid
+                // CAR files *must* have at least one CID in them, and all of them
+                // need to appear as a block in the payload.
+                // It would probably make most sense to just write all subgraph roots into this,
+                // but we don't know how many of the subgraph roots fit into this round yet,
+                // so we're simply writing the first one in here, since we know
+                // at least one block will be written (and it'll be that one).
+                self.last_response
+                    .subgraph_roots
+                    .iter()
+                    .take(1)
+                    .cloned()
+                    .collect(),
+            ),
+            Vec::new(),
+        );
+        writer.write_header().await?;
+
+        let mut block_bytes = 0;
+        let mut stream =
+            walk_dag_in_order_breadth_first(self.last_response.subgraph_roots.clone(), self.store);
+        while let Some((cid, block)) = stream.try_next().await? {
+            // TODO Eventually we'll need to turn the `LocalBoxStream` into a more configurable
+            // "external iterator", and then this will be the point where we prune parts of the DAG
+            // that the recipient already has.
+
+            // TODO(matheus23): Count the actual bytes sent?
+            block_bytes += block.len();
+            if block_bytes > self.send_limit {
+                break;
+            }
+
+            writer.write(cid, &block).await?;
+        }
+
+        Ok(writer.finish().await?.into())
+    }
+}
+
+pub struct PushReceiverSession<'a, B: BlockStore> {
+    accepted_roots: Vec<Cid>,
+    receive_limit: usize,
+    store: &'a B,
+}
+
+impl<'a, B: BlockStore> PushReceiverSession<'a, B> {
+    pub fn new(root: Cid, store: &'a B) -> Self {
+        Self {
+            accepted_roots: vec![root],
+            receive_limit: 256 * 1024, // 256KiB
+            store,
+        }
+    }
+
+    pub async fn handle_request(&mut self, request: Bytes) -> Result<PushResponse> {
+        let mut reader = CarReader::new(Cursor::new(request)).await?;
+        let mut stream = read_in_order_dag_from_car(self.accepted_roots.clone(), &mut reader);
+
+        let mut missing_subgraphs: HashSet<_> = self.accepted_roots.iter().cloned().collect();
+
+        let mut block_bytes = 0;
+        while let Some((cid, block)) = stream.try_next().await? {
+            block_bytes += block.len();
+            if block_bytes > self.receive_limit {
+                bail!(
+                    "Received more than {} bytes ({block_bytes}), aborting request.",
+                    self.receive_limit
+                );
+            }
+
+            let codec: IpldCodec = cid
+                .codec()
+                .try_into()
+                .map_err(|_| anyhow!("Unsupported codec in Cid: {cid}"))?;
+
+            missing_subgraphs.remove(&cid);
+            missing_subgraphs.extend(references(codec, &block)?);
+
+            self.store.put_block(block, cid.codec()).await?;
+        }
+
+        let subgraph_roots: Vec<_> = missing_subgraphs.into_iter().collect();
+
+        self.accepted_roots = subgraph_roots.clone();
+
+        Ok(PushResponse {
+            subgraph_roots,
+            // We ignore blooms for now
+            bloom_k: 3,
+            bloom: Vec::new(),
+        })
+    }
+}
+
 /// walks a DAG from given root breadth-first along IPLD links
-pub fn walk_dag_in_order_breadth_first<'a>(
-    root: Cid,
-    store: &'a impl BlockStore,
-) -> impl Stream<Item = Result<(Cid, Bytes)>> + Unpin + 'a {
+pub fn walk_dag_in_order_breadth_first(
+    roots: impl IntoIterator<Item = Cid>,
+    store: &impl BlockStore,
+) -> LocalBoxStream<'_, Result<(Cid, Bytes)>> {
+    let mut frontier: VecDeque<_> = roots.into_iter().collect();
+
     Box::pin(try_stream! {
         let mut visited = HashSet::new();
-        let mut frontier = VecDeque::from([root]);
         while let Some(cid) = frontier.pop_front() {
             if visited.contains(&cid) {
                 continue;
@@ -64,12 +188,12 @@ pub async fn stream_into_car<W: tokio::io::AsyncWrite + Send + Unpin>(
 
 /// Read a directed acyclic graph from a CAR file, making sure it's read in-order and
 /// only blocks reachable from the root are included.
-pub fn read_in_order_dag_from_car<'a, R: tokio::io::AsyncRead + Unpin>(
-    root: Cid,
-    reader: &'a mut CarReader<R>,
-) -> impl Stream<Item = Result<(Cid, Bytes)>> + Unpin + 'a {
+pub fn read_in_order_dag_from_car<R: tokio::io::AsyncRead + Unpin>(
+    roots: impl IntoIterator<Item = Cid>,
+    reader: &mut CarReader<R>,
+) -> LocalBoxStream<'_, Result<(Cid, Bytes)>> {
+    let mut reachable_from_root: HashSet<_> = roots.into_iter().collect();
     Box::pin(try_stream! {
-        let mut reachable_from_root = HashSet::from([root]);
         while let Some((cid, vec)) = reader.next_block().await.map_err(|e| anyhow!(e))? {
             let block = Bytes::from(vec);
 
@@ -113,6 +237,8 @@ fn references(codec: IpldCodec, block: impl AsRef<[u8]>) -> Result<Vec<Cid>> {
 
 #[cfg(test)]
 mod tests {
+    use std::collections::BTreeMap;
+
     use crate::test_utils::{encode, generate_dag, Rvg};
 
     use super::*;
@@ -145,13 +271,13 @@ mod tests {
         let file = File::create(filename).await?;
         let mut writer = CarWriter::new(CarHeader::new_v1(vec![root]), file.compat_write());
         writer.write_header().await?;
-        let block_stream = walk_dag_in_order_breadth_first(root, store);
+        let block_stream = walk_dag_in_order_breadth_first([root], store);
         stream_into_car(block_stream, &mut writer).await?;
         writer.finish().await?;
 
         let mut reader = CarReader::new(File::open(filename).await?.compat()).await?;
 
-        read_in_order_dag_from_car(root, &mut reader)
+        read_in_order_dag_from_car([root], &mut reader)
             .try_for_each(|(cid, _)| {
                 println!("Got {cid}");
                 future::ready(Ok(()))
@@ -161,6 +287,59 @@ mod tests {
         Ok(())
     }
 
+    #[async_std::test]
+    async fn test_transfer() -> Result<()> {
+        let (blocks, root) = Rvg::new().sample(&generate_dag(256, |cids| {
+            let ipld = Ipld::Map(BTreeMap::from([
+                ("data".into(), Ipld::Bytes(vec![0u8; 10 * 1024])),
+                (
+                    "links".into(),
+                    Ipld::List(cids.into_iter().map(Ipld::Link).collect()),
+                ),
+            ]));
+            let bytes = encode(&ipld);
+            let cid = Cid::new_v1(IpldCodec::DagCbor.into(), Code::Blake3_256.digest(&bytes));
+            (cid, bytes)
+        }));
+
+        let sender_store = &MemoryBlockStore::new();
+        for (cid, bytes) in blocks.iter() {
+            let cid_store = sender_store
+                .put_block(bytes.clone(), IpldCodec::DagCbor.into())
+                .await?;
+            assert_eq!(*cid, cid_store);
+        }
+
+        let receiver_store = &MemoryBlockStore::new();
+
+        let mut sender = PushSenderSession::new(root, sender_store);
+        let mut receiver = PushReceiverSession::new(root, receiver_store);
+
+        loop {
+            let request = sender.next_request().await?;
+            println!("Sending request {} bytes", request.len());
+            let response = receiver.handle_request(request).await?;
+            if sender.handle_response(response) {
+                // Should be done
+                break;
+            }
+        }
+
+        // receiver should have all data
+        let sender_cids = walk_dag_in_order_breadth_first([root], sender_store)
+            .map_ok(|(cid, _)| cid)
+            .try_collect::<Vec<_>>()
+            .await?;
+        let receiver_cids = walk_dag_in_order_breadth_first([root], receiver_store)
+            .map_ok(|(cid, _)| cid)
+            .try_collect::<Vec<_>>()
+            .await?;
+
+        assert_eq!(sender_cids, receiver_cids);
+
+        Ok(())
+    }
+
     #[async_std::test]
     async fn test_walk_dag_breadth_first() -> Result<()> {
         let store = &MemoryBlockStore::new();
@@ -181,7 +360,7 @@ mod tests {
             ]))
             .await?;
 
-        let cids = walk_dag_in_order_breadth_first(cid_root, store)
+        let cids = walk_dag_in_order_breadth_first([cid_root], store)
             .try_collect::<Vec<_>>()
             .await?
             .into_iter()
@@ -234,7 +413,7 @@ mod proptests {
                 assert_eq!(*cid, cid_store);
             }
 
-            let mut cids = walk_dag_in_order_breadth_first(root, store)
+            let mut cids = walk_dag_in_order_breadth_first([root], store)
                 .map_ok(|(cid, _)| cid)
                 .try_collect::<Vec<_>>()
                 .await

From 1f2a8590446b3db4382d032b5c3404414273b18e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= <philipp.krueger1@gmail.com>
Date: Fri, 18 Aug 2023 11:09:50 +0200
Subject: [PATCH 07/35] Make protocol stateless

---
 car-mirror/src/lib.rs      | 477 ++++++++++++++++++++-----------------
 car-mirror/src/messages.rs |   6 +
 2 files changed, 266 insertions(+), 217 deletions(-)

diff --git a/car-mirror/src/lib.rs b/car-mirror/src/lib.rs
index 2701d8e..ad547c3 100644
--- a/car-mirror/src/lib.rs
+++ b/car-mirror/src/lib.rs
@@ -5,22 +5,18 @@
 //! car-mirror
 
 use anyhow::{anyhow, bail, Result};
-use async_stream::try_stream;
 use bytes::Bytes;
-use futures::{stream::LocalBoxStream, Stream, StreamExt, TryStreamExt};
+use futures::{stream::try_unfold, Stream, StreamExt, TryStreamExt};
 use iroh_car::{CarHeader, CarReader, CarWriter};
 use libipld::{Ipld, IpldCodec};
-use libipld_core::{
-    cid::Cid,
-    codec::References,
-    multihash::{Code, MultihashDigest},
-};
+use libipld_core::{cid::Cid, codec::References};
 use messages::PushResponse;
 use std::{
     collections::{HashSet, VecDeque},
+    eprintln,
     io::Cursor,
 };
-use wnfs_common::BlockStore;
+use wnfs_common::{BlockStore, BlockStoreError};
 
 /// Test utilities.
 #[cfg(any(test, feature = "test_utils"))]
@@ -30,148 +26,199 @@ pub mod test_utils;
 /// Contains the data types that are sent over-the-wire and relevant serialization code.
 pub mod messages;
 
-pub struct PushSenderSession<'a, B: BlockStore> {
-    last_response: PushResponse,
-    send_limit: usize,
-    store: &'a B,
+pub struct PushConfig {
+    send_minimum: usize,
+    receive_maximum: usize,
+    max_roots_per_round: usize,
 }
 
-impl<'a, B: BlockStore> PushSenderSession<'a, B> {
-    pub fn new(root: Cid, store: &'a B) -> Self {
+impl Default for PushConfig {
+    fn default() -> Self {
         Self {
-            last_response: PushResponse {
-                subgraph_roots: vec![root],
-                // Just putting an empty bloom here initially
-                bloom_k: 3,
-                bloom: Vec::new(),
-            },
-            send_limit: 256 * 1024, // 256KiB
-            store,
+            send_minimum: 128 * 1024,    // 128KiB
+            receive_maximum: 512 * 1024, // 512KiB
+            max_roots_per_round: 1000,   // max. ~41KB of CIDs
         }
     }
+}
+
+pub async fn client_initiate_push(
+    root: Cid,
+    config: &PushConfig,
+    store: &impl BlockStore,
+) -> Result<Bytes> {
+    let fake_response = PushResponse {
+        subgraph_roots: vec![root],
+        // Just putting an empty bloom here
+        bloom_k: 3,
+        bloom: Vec::new(),
+    };
+    client_push(root, &fake_response, config, store).await
+}
 
-    pub fn handle_response(&mut self, response: PushResponse) -> bool {
-        self.last_response = response;
-        self.last_response.subgraph_roots.is_empty()
+pub async fn client_push(
+    root: Cid,
+    last_response: &PushResponse,
+    config: &PushConfig,
+    store: &impl BlockStore,
+) -> Result<Bytes> {
+    // Verify that all subgraph roots are in the relevant DAG:
+    let subgraph_roots: Vec<Cid> = DagWalk::breadth_first([root])
+        .stream(store)
+        .try_filter_map(|(cid, _)| async move {
+            Ok(last_response.subgraph_roots.contains(&cid).then_some(cid))
+        })
+        .try_collect()
+        .await?;
+
+    let mut writer = CarWriter::new(
+        CarHeader::new_v1(
+            // TODO(matheus23): This is stupid
+            // CAR files *must* have at least one CID in them, and all of them
+            // need to appear as a block in the payload.
+            // It would probably make most sense to just write all subgraph roots into this,
+            // but we don't know how many of the subgraph roots fit into this round yet,
+            // so we're simply writing the first one in here, since we know
+            // at least one block will be written (and it'll be that one).
+            subgraph_roots.iter().take(1).cloned().collect(),
+        ),
+        Vec::new(),
+    );
+
+    writer.write_header().await?;
+
+    let mut block_bytes = 0;
+    let mut dag_walk = DagWalk::breadth_first(subgraph_roots);
+    while let Some((cid, block)) = dag_walk.next(store).await? {
+        writer.write(cid, &block).await?;
+        println!("Sending {cid}");
+
+        // TODO(matheus23): Count the actual bytes sent?
+        block_bytes += block.len();
+        if block_bytes > config.send_minimum {
+            break;
+        }
     }
 
-    pub async fn next_request(&mut self) -> Result<Bytes> {
-        let mut writer = CarWriter::new(
-            CarHeader::new_v1(
-                // TODO(matheus23): This is stupid
-                // CAR files *must* have at least one CID in them, and all of them
-                // need to appear as a block in the payload.
-                // It would probably make most sense to just write all subgraph roots into this,
-                // but we don't know how many of the subgraph roots fit into this round yet,
-                // so we're simply writing the first one in here, since we know
-                // at least one block will be written (and it'll be that one).
-                self.last_response
-                    .subgraph_roots
-                    .iter()
-                    .take(1)
-                    .cloned()
-                    .collect(),
-            ),
-            Vec::new(),
-        );
-        writer.write_header().await?;
-
-        let mut block_bytes = 0;
-        let mut stream =
-            walk_dag_in_order_breadth_first(self.last_response.subgraph_roots.clone(), self.store);
-        while let Some((cid, block)) = stream.try_next().await? {
-            // TODO Eventually we'll need to turn the `LocalBoxStream` into a more configurable
-            // "external iterator", and then this will be the point where we prune parts of the DAG
-            // that the recipient already has.
-
-            // TODO(matheus23): Count the actual bytes sent?
-            block_bytes += block.len();
-            if block_bytes > self.send_limit {
-                break;
-            }
+    Ok(writer.finish().await?.into())
+}
+
+pub async fn server_push_response(
+    root: Cid,
+    request: Bytes,
+    config: &PushConfig,
+    store: &impl BlockStore,
+) -> Result<PushResponse> {
+    let mut dag_verification = IncrementalDagVerification::new([root], store).await?;
 
-            writer.write(cid, &block).await?;
+    let mut reader = CarReader::new(Cursor::new(request)).await?;
+    let mut block_bytes = 0;
+
+    while let Some((cid, vec)) = reader.next_block().await? {
+        let block = Bytes::from(vec);
+        println!("Received {cid}");
+
+        block_bytes += block.len();
+        if block_bytes > config.receive_maximum {
+            bail!(
+                "Received more than {} bytes ({block_bytes}), aborting request.",
+                config.receive_maximum
+            );
         }
 
-        Ok(writer.finish().await?.into())
+        dag_verification
+            .verify_and_store_block((cid, block), store)
+            .await?;
     }
+
+    let subgraph_roots = dag_verification
+        .want_cids
+        .iter()
+        .take(config.max_roots_per_round)
+        .cloned()
+        .collect();
+
+    Ok(PushResponse {
+        subgraph_roots,
+        // We ignore blooms for now
+        bloom_k: 3,
+        bloom: Vec::new(),
+    })
 }
 
-pub struct PushReceiverSession<'a, B: BlockStore> {
-    accepted_roots: Vec<Cid>,
-    receive_limit: usize,
-    store: &'a B,
+pub struct DagWalk {
+    pub frontier: VecDeque<Cid>,
+    pub visited: HashSet<Cid>,
+    pub breadth_first: bool,
 }
 
-impl<'a, B: BlockStore> PushReceiverSession<'a, B> {
-    pub fn new(root: Cid, store: &'a B) -> Self {
-        Self {
-            accepted_roots: vec![root],
-            receive_limit: 256 * 1024, // 256KiB
-            store,
-        }
+impl DagWalk {
+    pub fn breadth_first(roots: impl IntoIterator<Item = Cid>) -> Self {
+        Self::new(roots, true)
     }
 
-    pub async fn handle_request(&mut self, request: Bytes) -> Result<PushResponse> {
-        let mut reader = CarReader::new(Cursor::new(request)).await?;
-        let mut stream = read_in_order_dag_from_car(self.accepted_roots.clone(), &mut reader);
+    pub fn depth_first(roots: impl IntoIterator<Item = Cid>) -> Self {
+        Self::new(roots, false)
+    }
 
-        let mut missing_subgraphs: HashSet<_> = self.accepted_roots.iter().cloned().collect();
+    pub fn new(roots: impl IntoIterator<Item = Cid>, breadth_first: bool) -> Self {
+        let frontier = roots.into_iter().collect();
+        let visited = HashSet::new();
+        Self {
+            frontier,
+            visited,
+            breadth_first,
+        }
+    }
 
-        let mut block_bytes = 0;
-        while let Some((cid, block)) = stream.try_next().await? {
-            block_bytes += block.len();
-            if block_bytes > self.receive_limit {
-                bail!(
-                    "Received more than {} bytes ({block_bytes}), aborting request.",
-                    self.receive_limit
-                );
+    pub async fn next(&mut self, store: &impl BlockStore) -> Result<Option<(Cid, Bytes)>> {
+        let cid = loop {
+            let popped = if self.breadth_first {
+                self.frontier.pop_front()
+            } else {
+                self.frontier.pop_back()
+            };
+
+            let Some(cid) = popped else {
+                return Ok(None);
+            };
+
+            // We loop until we find an unvisited block
+            if self.visited.insert(cid) {
+                break cid;
             }
+        };
 
-            let codec: IpldCodec = cid
-                .codec()
-                .try_into()
-                .map_err(|_| anyhow!("Unsupported codec in Cid: {cid}"))?;
-
-            missing_subgraphs.remove(&cid);
-            missing_subgraphs.extend(references(codec, &block)?);
-
-            self.store.put_block(block, cid.codec()).await?;
+        let block = store.get_block(&cid).await?;
+        let codec = IpldCodec::try_from(cid.codec())?;
+        for ref_cid in references(codec, &block)? {
+            if !self.visited.contains(&ref_cid) {
+                self.frontier.push_back(ref_cid);
+            }
         }
 
-        let subgraph_roots: Vec<_> = missing_subgraphs.into_iter().collect();
-
-        self.accepted_roots = subgraph_roots.clone();
+        Ok(Some((cid, block)))
+    }
 
-        Ok(PushResponse {
-            subgraph_roots,
-            // We ignore blooms for now
-            bloom_k: 3,
-            bloom: Vec::new(),
-        })
+    pub fn stream(
+        self,
+        store: &impl BlockStore,
+    ) -> impl Stream<Item = Result<(Cid, Bytes)>> + Unpin + '_ {
+        Box::pin(try_unfold(self, move |mut this| async move {
+            let maybe_block = this.next(store).await?;
+            Ok(maybe_block.map(|b| (b, this)))
+        }))
     }
-}
 
-/// walks a DAG from given root breadth-first along IPLD links
-pub fn walk_dag_in_order_breadth_first(
-    roots: impl IntoIterator<Item = Cid>,
-    store: &impl BlockStore,
-) -> LocalBoxStream<'_, Result<(Cid, Bytes)>> {
-    let mut frontier: VecDeque<_> = roots.into_iter().collect();
-
-    Box::pin(try_stream! {
-        let mut visited = HashSet::new();
-        while let Some(cid) = frontier.pop_front() {
-            if visited.contains(&cid) {
-                continue;
-            }
-            visited.insert(cid);
-            let block = store.get_block(&cid).await?;
-            let codec = IpldCodec::try_from(cid.codec())?;
-            frontier.extend(references(codec, &block)?);
-            yield (cid, block);
-        }
-    })
+    pub fn is_finished(&self) -> bool {
+        // We're finished if the frontier does not contain any CIDs that we have not visited yet.
+        // Put differently:
+        // We're not finished if there exist unvisited CIDs in the frontier.
+        !self
+            .frontier
+            .iter()
+            .any(|frontier_cid| !self.visited.contains(frontier_cid))
+    }
 }
 
 /// Writes a stream of blocks into a car file
@@ -186,47 +233,84 @@ pub async fn stream_into_car<W: tokio::io::AsyncWrite + Send + Unpin>(
     Ok(())
 }
 
-/// Read a directed acyclic graph from a CAR file, making sure it's read in-order and
-/// only blocks reachable from the root are included.
-pub fn read_in_order_dag_from_car<R: tokio::io::AsyncRead + Unpin>(
-    roots: impl IntoIterator<Item = Cid>,
-    reader: &mut CarReader<R>,
-) -> LocalBoxStream<'_, Result<(Cid, Bytes)>> {
-    let mut reachable_from_root: HashSet<_> = roots.into_iter().collect();
-    Box::pin(try_stream! {
-        while let Some((cid, vec)) = reader.next_block().await.map_err(|e| anyhow!(e))? {
-            let block = Bytes::from(vec);
-
-            let code: Code = cid
-                .hash()
-                .code()
-                .try_into()
-                .map_err(|_| anyhow!("Unsupported hash code in Cid: {cid}"))?;
-
-            let codec: IpldCodec = cid
-                .codec()
-                .try_into()
-                .map_err(|_| anyhow!("Unsupported codec in Cid: {cid}"))?;
-
-            let digest = code.digest(&block);
-
-            if cid.hash() != &digest {
-                Err(anyhow!(
-                    "Digest mismatch in CAR file: expected {:?}, got {:?}",
-                    digest,
-                    cid.hash()
-                ))?;
+pub struct IncrementalDagVerification {
+    pub want_cids: HashSet<Cid>,
+    pub have_cids: HashSet<Cid>,
+}
+
+impl IncrementalDagVerification {
+    pub async fn new(
+        roots: impl IntoIterator<Item = Cid>,
+        store: &impl BlockStore,
+    ) -> Result<Self> {
+        let mut want_cids = HashSet::new();
+        let mut have_cids = HashSet::new();
+        let mut dag_walk = DagWalk::breadth_first(roots);
+
+        loop {
+            match dag_walk.next(store).await {
+                Err(e) => {
+                    if let Some(BlockStoreError::CIDNotFound(not_found)) =
+                        e.downcast_ref::<BlockStoreError>()
+                    {
+                        want_cids.insert(*not_found);
+                    } else {
+                        bail!(e);
+                    }
+                }
+                Ok(Some((cid, _))) => {
+                    have_cids.insert(cid);
+                }
+                Ok(None) => {
+                    break;
+                }
             }
+        }
 
-            if !reachable_from_root.contains(&cid) {
-                Err(anyhow!("Unexpected block or block out of order: {cid}"))?;
+        Ok(Self {
+            want_cids,
+            have_cids,
+        })
+    }
+
+    pub async fn verify_and_store_block(
+        &mut self,
+        block: (Cid, Bytes),
+        store: &impl BlockStore,
+    ) -> Result<()> {
+        let (cid, bytes) = block;
+
+        let codec: IpldCodec = cid
+            .codec()
+            .try_into()
+            .map_err(|_| anyhow!("Unsupported codec in Cid: {cid}"))?;
+
+        if !self.want_cids.contains(&cid) {
+            if self.have_cids.contains(&cid) {
+                eprintln!("Warn: Received {cid}, even though we already have it");
+            } else {
+                bail!("Unexpected block or block out of order: {cid}");
             }
+        }
 
-            reachable_from_root.extend(references(codec, &block)?);
+        let refs = references(codec, &bytes)?;
+        let result_cid = store.put_block(bytes, codec.into()).await?;
 
-            yield (cid, block);
+        if result_cid != cid {
+            bail!("Digest mismatch in CAR file: expected {cid}, got {result_cid}");
         }
-    })
+
+        for ref_cid in refs {
+            if !self.have_cids.contains(&ref_cid) {
+                self.want_cids.insert(ref_cid);
+            }
+        }
+
+        self.want_cids.remove(&cid);
+        self.have_cids.insert(cid);
+
+        Ok(())
+    }
 }
 
 fn references(codec: IpldCodec, block: impl AsRef<[u8]>) -> Result<Vec<Cid>> {
@@ -237,56 +321,13 @@ fn references(codec: IpldCodec, block: impl AsRef<[u8]>) -> Result<Vec<Cid>> {
 
 #[cfg(test)]
 mod tests {
-    use std::collections::BTreeMap;
-
-    use crate::test_utils::{encode, generate_dag, Rvg};
-
     use super::*;
-    use async_std::fs::File;
-    use futures::{future, TryStreamExt};
-    use iroh_car::CarHeader;
+    use crate::test_utils::{encode, generate_dag, Rvg};
+    use futures::TryStreamExt;
     use libipld_core::multihash::{Code, MultihashDigest};
-    use tokio_util::compat::{FuturesAsyncReadCompatExt, FuturesAsyncWriteCompatExt};
+    use std::collections::BTreeMap;
     use wnfs_common::MemoryBlockStore;
 
-    #[async_std::test]
-    async fn test_write_into_car() -> Result<()> {
-        let (blocks, root) = Rvg::new().sample(&generate_dag(256, |cids| {
-            let ipld = Ipld::List(cids.into_iter().map(Ipld::Link).collect());
-            let bytes = encode(&ipld);
-            let cid = Cid::new_v1(IpldCodec::DagCbor.into(), Code::Blake3_256.digest(&bytes));
-            (cid, bytes)
-        }));
-
-        let store = &MemoryBlockStore::new();
-        for (cid, bytes) in blocks.iter() {
-            let cid_store = store
-                .put_block(bytes.clone(), IpldCodec::DagCbor.into())
-                .await?;
-            assert_eq!(*cid, cid_store);
-        }
-
-        let filename = "./my-car.car";
-
-        let file = File::create(filename).await?;
-        let mut writer = CarWriter::new(CarHeader::new_v1(vec![root]), file.compat_write());
-        writer.write_header().await?;
-        let block_stream = walk_dag_in_order_breadth_first([root], store);
-        stream_into_car(block_stream, &mut writer).await?;
-        writer.finish().await?;
-
-        let mut reader = CarReader::new(File::open(filename).await?.compat()).await?;
-
-        read_in_order_dag_from_car([root], &mut reader)
-            .try_for_each(|(cid, _)| {
-                println!("Got {cid}");
-                future::ready(Ok(()))
-            })
-            .await?;
-
-        Ok(())
-    }
-
     #[async_std::test]
     async fn test_transfer() -> Result<()> {
         let (blocks, root) = Rvg::new().sample(&generate_dag(256, |cids| {
@@ -311,26 +352,26 @@ mod tests {
         }
 
         let receiver_store = &MemoryBlockStore::new();
-
-        let mut sender = PushSenderSession::new(root, sender_store);
-        let mut receiver = PushReceiverSession::new(root, receiver_store);
-
+        let config = &PushConfig::default();
+        let mut request = client_initiate_push(root, config, sender_store).await?;
         loop {
-            let request = sender.next_request().await?;
             println!("Sending request {} bytes", request.len());
-            let response = receiver.handle_request(request).await?;
-            if sender.handle_response(response) {
-                // Should be done
+            let response = server_push_response(root, request, config, receiver_store).await?;
+            println!("Response: {:?}", response.subgraph_roots);
+            if response.indicates_finished() {
                 break;
             }
+            request = client_push(root, &response, config, sender_store).await?;
         }
 
         // receiver should have all data
-        let sender_cids = walk_dag_in_order_breadth_first([root], sender_store)
+        let sender_cids = DagWalk::breadth_first([root])
+            .stream(sender_store)
             .map_ok(|(cid, _)| cid)
             .try_collect::<Vec<_>>()
             .await?;
-        let receiver_cids = walk_dag_in_order_breadth_first([root], receiver_store)
+        let receiver_cids = DagWalk::breadth_first([root])
+            .stream(receiver_store)
             .map_ok(|(cid, _)| cid)
             .try_collect::<Vec<_>>()
             .await?;
@@ -360,7 +401,8 @@ mod tests {
             ]))
             .await?;
 
-        let cids = walk_dag_in_order_breadth_first([cid_root], store)
+        let cids = DagWalk::breadth_first([cid_root])
+            .stream(store)
             .try_collect::<Vec<_>>()
             .await?
             .into_iter()
@@ -377,7 +419,7 @@ mod tests {
 mod proptests {
     use crate::{
         test_utils::{encode, generate_dag},
-        walk_dag_in_order_breadth_first,
+        DagWalk,
     };
     use futures::TryStreamExt;
     use libipld::{
@@ -413,7 +455,8 @@ mod proptests {
                 assert_eq!(*cid, cid_store);
             }
 
-            let mut cids = walk_dag_in_order_breadth_first([root], store)
+            let mut cids = DagWalk::breadth_first([root])
+                .stream(store)
                 .map_ok(|(cid, _)| cid)
                 .try_collect::<Vec<_>>()
                 .await
diff --git a/car-mirror/src/messages.rs b/car-mirror/src/messages.rs
index 815fe45..c5a778f 100644
--- a/car-mirror/src/messages.rs
+++ b/car-mirror/src/messages.rs
@@ -58,3 +58,9 @@ pub struct PushResponse {
     #[serde(rename = "bb")]
     pub bloom: Vec<u8>,
 }
+
+impl PushResponse {
+    pub fn indicates_finished(&self) -> bool {
+        self.subgraph_roots.is_empty()
+    }
+}

From 7535cf84a2b7b159dac0cf55e92810cab55364cd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= <philipp.krueger1@gmail.com>
Date: Fri, 18 Aug 2023 11:58:40 +0200
Subject: [PATCH 08/35] Implement bloom filters

---
 Cargo.lock            | 90 +++++++++++++++++++++++++++++++++++++++++++
 car-mirror/Cargo.toml |  1 +
 car-mirror/src/lib.rs | 87 +++++++++++++++++++++++++++++++----------
 3 files changed, 158 insertions(+), 20 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 15f9f73..952b6fb 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -272,6 +272,19 @@ version = "1.3.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
 
+[[package]]
+name = "bitvec"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1bc2832c24239b0141d5674bb9174f9d68a8b5b3f2753311927c172ca46f7e9c"
+dependencies = [
+ "funty",
+ "radium",
+ "serde",
+ "tap",
+ "wyz",
+]
+
 [[package]]
 name = "blake2b_simd"
 version = "1.0.1"
@@ -368,6 +381,7 @@ dependencies = [
  "async-trait",
  "bytes",
  "car-mirror",
+ "deterministic-bloom",
  "fixedbitset",
  "futures",
  "iroh-car",
@@ -647,6 +661,20 @@ dependencies = [
  "syn 1.0.109",
 ]
 
+[[package]]
+name = "deterministic-bloom"
+version = "0.1.0"
+source = "git+https://github.com/wnfs-wg/deterministic-bloom#a8cd85b#a8cd85b1d71da9f79f5058c0a20e53a83a283230"
+dependencies = [
+ "bitvec",
+ "miette",
+ "rand_core",
+ "serde",
+ "thiserror",
+ "tracing",
+ "xxhash-rust",
+]
+
 [[package]]
 name = "digest"
 version = "0.10.7"
@@ -718,6 +746,12 @@ version = "1.0.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
 
+[[package]]
+name = "funty"
+version = "2.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c"
+
 [[package]]
 name = "futures"
 version = "0.3.28"
@@ -1127,6 +1161,29 @@ version = "2.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
 
+[[package]]
+name = "miette"
+version = "5.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "59bb584eaeeab6bd0226ccf3509a69d7936d148cf3d036ad350abe35e8c6856e"
+dependencies = [
+ "miette-derive",
+ "once_cell",
+ "thiserror",
+ "unicode-width",
+]
+
+[[package]]
+name = "miette-derive"
+version = "5.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "49e7bc1560b95a3c4a25d03de42fe76ca718ab92d1a22a55b9b4cf67b3ae635c"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.28",
+]
+
 [[package]]
 name = "miniz_oxide"
 version = "0.7.1"
@@ -1360,6 +1417,12 @@ dependencies = [
  "proc-macro2",
 ]
 
+[[package]]
+name = "radium"
+version = "0.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dc33ff2d4973d518d823d61aa239014831e521c75da58e3df4840d3f47749d09"
+
 [[package]]
 name = "rand"
 version = "0.8.5"
@@ -1690,6 +1753,12 @@ dependencies = [
  "unicode-xid",
 ]
 
+[[package]]
+name = "tap"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"
+
 [[package]]
 name = "tempfile"
 version = "3.6.0"
@@ -1873,6 +1942,12 @@ version = "1.0.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b15811caf2415fb889178633e7724bad2509101cde276048e013b9def5e51fa0"
 
+[[package]]
+name = "unicode-width"
+version = "0.1.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b"
+
 [[package]]
 name = "unicode-xid"
 version = "0.2.4"
@@ -2161,3 +2236,18 @@ dependencies = [
  "serde",
  "thiserror",
 ]
+
+[[package]]
+name = "wyz"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "05f360fc0b24296329c78fda852a1e9ae82de9cf7b27dae4b7f62f118f77b9ed"
+dependencies = [
+ "tap",
+]
+
+[[package]]
+name = "xxhash-rust"
+version = "0.8.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "735a71d46c4d68d71d4b24d03fdc2b98e38cea81730595801db779c04fe80d70"
diff --git a/car-mirror/Cargo.toml b/car-mirror/Cargo.toml
index a25201f..e5216c2 100644
--- a/car-mirror/Cargo.toml
+++ b/car-mirror/Cargo.toml
@@ -26,6 +26,7 @@ doc = true
 anyhow = "1.0"
 async-stream = "0.3.5"
 bytes = "1.4.0"
+deterministic-bloom = { git = "https://github.com/wnfs-wg/deterministic-bloom#a8cd85b" }
 fixedbitset = "0.4.2"
 futures = "0.3.28"
 iroh-car = "0.3.0"
diff --git a/car-mirror/src/lib.rs b/car-mirror/src/lib.rs
index ad547c3..7f1871a 100644
--- a/car-mirror/src/lib.rs
+++ b/car-mirror/src/lib.rs
@@ -6,6 +6,7 @@
 
 use anyhow::{anyhow, bail, Result};
 use bytes::Bytes;
+use deterministic_bloom::runtime_size::BloomFilter;
 use futures::{stream::try_unfold, Stream, StreamExt, TryStreamExt};
 use iroh_car::{CarHeader, CarReader, CarWriter};
 use libipld::{Ipld, IpldCodec};
@@ -30,6 +31,7 @@ pub struct PushConfig {
     send_minimum: usize,
     receive_maximum: usize,
     max_roots_per_round: usize,
+    bloom_fpr: f64,
 }
 
 impl Default for PushConfig {
@@ -38,6 +40,7 @@ impl Default for PushConfig {
             send_minimum: 128 * 1024,    // 128KiB
             receive_maximum: 512 * 1024, // 512KiB
             max_roots_per_round: 1000,   // max. ~41KB of CIDs
+            bloom_fpr: 1.0 / 1_000.0,    // 0.1%
         }
     }
 }
@@ -53,24 +56,34 @@ pub async fn client_initiate_push(
         bloom_k: 3,
         bloom: Vec::new(),
     };
-    client_push(root, &fake_response, config, store).await
+    client_push(root, fake_response, config, store).await
 }
 
 pub async fn client_push(
     root: Cid,
-    last_response: &PushResponse,
+    last_response: PushResponse,
     config: &PushConfig,
     store: &impl BlockStore,
 ) -> Result<Bytes> {
+    let PushResponse {
+        ref subgraph_roots,
+        bloom_k,
+        bloom,
+    } = last_response;
+
     // Verify that all subgraph roots are in the relevant DAG:
     let subgraph_roots: Vec<Cid> = DagWalk::breadth_first([root])
         .stream(store)
-        .try_filter_map(|(cid, _)| async move {
-            Ok(last_response.subgraph_roots.contains(&cid).then_some(cid))
-        })
+        .try_filter_map(|(cid, _)| async move { Ok(subgraph_roots.contains(&cid).then_some(cid)) })
         .try_collect()
         .await?;
 
+    let bloom = if bloom.is_empty() {
+        BloomFilter::new_with(1, Box::new([0])) // An empty bloom that contains nothing
+    } else {
+        BloomFilter::new_with(bloom_k as usize, bloom.into_boxed_slice())
+    };
+
     let mut writer = CarWriter::new(
         CarHeader::new_v1(
             // TODO(matheus23): This is stupid
@@ -88,8 +101,21 @@ pub async fn client_push(
     writer.write_header().await?;
 
     let mut block_bytes = 0;
-    let mut dag_walk = DagWalk::breadth_first(subgraph_roots);
+    let mut dag_walk = DagWalk::breadth_first(subgraph_roots.clone());
     while let Some((cid, block)) = dag_walk.next(store).await? {
+        if bloom.contains(&cid.to_bytes()) && !subgraph_roots.contains(&cid) {
+            // TODO(matheus23) I think the spec means to prune the whole subgraph.
+            // But
+            // 1. That requires the receiver to check the whole subgraph at that CID to find out whether there's a missing block at the subgraph.
+            // 2. It requires the sender to go through every block under this subgraph down to the leaves to mark all of these CIDs as visited.
+            // Both of these are *huge* traversals. I'd say likely not worth it. The only case I can imagine they're worth it, is if the DAG
+            // is *heavily* using structural sharing and not tree-like.
+            // Also: This fails completely if the sender is just missing a single leaf. It couldn't add the block to the bloom in that case.
+            dag_walk.skip_walking((cid, block))?;
+            println!("Skipped walking {cid} due to bloom");
+            break;
+        }
+
         writer.write(cid, &block).await?;
         println!("Sending {cid}");
 
@@ -138,11 +164,19 @@ pub async fn server_push_response(
         .cloned()
         .collect();
 
+    let mut bloom =
+        BloomFilter::new_from_fpr_po2(dag_verification.have_cids.len() as u64, config.bloom_fpr);
+
+    dag_verification
+        .have_cids
+        .iter()
+        .for_each(|cid| bloom.insert(&cid.to_bytes()));
+
     Ok(PushResponse {
         subgraph_roots,
         // We ignore blooms for now
-        bloom_k: 3,
-        bloom: Vec::new(),
+        bloom_k: bloom.hash_count() as u32,
+        bloom: bloom.as_bytes().to_vec(),
     })
 }
 
@@ -190,8 +224,7 @@ impl DagWalk {
         };
 
         let block = store.get_block(&cid).await?;
-        let codec = IpldCodec::try_from(cid.codec())?;
-        for ref_cid in references(codec, &block)? {
+        for ref_cid in references(cid, &block)? {
             if !self.visited.contains(&ref_cid) {
                 self.frontier.push_back(ref_cid);
             }
@@ -219,6 +252,16 @@ impl DagWalk {
             .iter()
             .any(|frontier_cid| !self.visited.contains(frontier_cid))
     }
+
+    pub fn skip_walking(&mut self, block: (Cid, Bytes)) -> Result<()> {
+        let (cid, bytes) = block;
+        let refs = references(cid, bytes)?;
+        self.visited.insert(cid);
+        self.frontier
+            .retain(|frontier_cid| !refs.contains(frontier_cid));
+
+        Ok(())
+    }
 }
 
 /// Writes a stream of blocks into a car file
@@ -280,11 +323,6 @@ impl IncrementalDagVerification {
     ) -> Result<()> {
         let (cid, bytes) = block;
 
-        let codec: IpldCodec = cid
-            .codec()
-            .try_into()
-            .map_err(|_| anyhow!("Unsupported codec in Cid: {cid}"))?;
-
         if !self.want_cids.contains(&cid) {
             if self.have_cids.contains(&cid) {
                 eprintln!("Warn: Received {cid}, even though we already have it");
@@ -293,8 +331,8 @@ impl IncrementalDagVerification {
             }
         }
 
-        let refs = references(codec, &bytes)?;
-        let result_cid = store.put_block(bytes, codec.into()).await?;
+        let refs = references(cid, &bytes)?;
+        let result_cid = store.put_block(bytes, cid.codec()).await?;
 
         if result_cid != cid {
             bail!("Digest mismatch in CAR file: expected {cid}, got {result_cid}");
@@ -313,7 +351,12 @@ impl IncrementalDagVerification {
     }
 }
 
-fn references(codec: IpldCodec, block: impl AsRef<[u8]>) -> Result<Vec<Cid>> {
+fn references(cid: Cid, block: impl AsRef<[u8]>) -> Result<Vec<Cid>> {
+    let codec: IpldCodec = cid
+        .codec()
+        .try_into()
+        .map_err(|_| anyhow!("Unsupported codec in Cid: {cid}"))?;
+
     let mut refs = Vec::new();
     <Ipld as References<IpldCodec>>::references(codec, &mut Cursor::new(block), &mut refs)?;
     Ok(refs)
@@ -357,11 +400,15 @@ mod tests {
         loop {
             println!("Sending request {} bytes", request.len());
             let response = server_push_response(root, request, config, receiver_store).await?;
-            println!("Response: {:?}", response.subgraph_roots);
+            println!(
+                "Response (bloom bytes: {}): {:?}",
+                response.bloom.len(),
+                response.subgraph_roots,
+            );
             if response.indicates_finished() {
                 break;
             }
-            request = client_push(root, &response, config, sender_store).await?;
+            request = client_push(root, response, config, sender_store).await?;
         }
 
         // receiver should have all data

From 5c4baa72dd950b62da9ee6162a246023edaa9089 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= <philipp.krueger1@gmail.com>
Date: Fri, 18 Aug 2023 14:26:37 +0200
Subject: [PATCH 09/35] Add some docs

---
 car-mirror/src/lib.rs                     | 91 +++++++++++++++++++++--
 car-mirror/src/messages.rs                |  1 +
 car-mirror/src/test_utils/dag_strategy.rs |  8 +-
 3 files changed, 90 insertions(+), 10 deletions(-)

diff --git a/car-mirror/src/lib.rs b/car-mirror/src/lib.rs
index 7f1871a..5982219 100644
--- a/car-mirror/src/lib.rs
+++ b/car-mirror/src/lib.rs
@@ -27,11 +27,20 @@ pub mod test_utils;
 /// Contains the data types that are sent over-the-wire and relevant serialization code.
 pub mod messages;
 
+/// Configuration values (such as byte limits) for the CAR mirror push protocol
+#[derive(Clone, Debug)]
 pub struct PushConfig {
-    send_minimum: usize,
-    receive_maximum: usize,
-    max_roots_per_round: usize,
-    bloom_fpr: f64,
+    /// A client will try to send at least `send_minimum` bytes of block data
+    /// in each request, except if close to the end of the protocol (when there's
+    /// not that much data left).
+    pub send_minimum: usize,
+    /// The maximum number of bytes per request that the server accepts.
+    pub receive_maximum: usize,
+    /// The maximum number of roots per request that the server will send to the client,
+    /// and that the client will consume.
+    pub max_roots_per_round: usize,
+    /// The target false positive rate for the bloom filter that the server sends.
+    pub bloom_fpr: f64,
 }
 
 impl Default for PushConfig {
@@ -40,11 +49,17 @@ impl Default for PushConfig {
             send_minimum: 128 * 1024,    // 128KiB
             receive_maximum: 512 * 1024, // 512KiB
             max_roots_per_round: 1000,   // max. ~41KB of CIDs
-            bloom_fpr: 1.0 / 1_000.0,    // 0.1%
+            bloom_fpr: 1.0 / 10_000.0,   // 0.01%
         }
     }
 }
 
+/// Initiate a car mirror push request.
+///
+/// The goal is to transfer the DAG below the root CID to
+/// the server.
+///
+/// The return value is a CAR file.
 pub async fn client_initiate_push(
     root: Cid,
     config: &PushConfig,
@@ -59,6 +74,13 @@ pub async fn client_initiate_push(
     client_push(root, fake_response, config, store).await
 }
 
+/// Send a subsequent car mirror push request, following up on
+/// a response retrieved from an initial `client_initiate_push` request.
+///
+/// Make sure to call `response.indicates_finished()` before initiating
+/// a follow-up `client_push` request.
+///
+/// The return value is another CAR file with more blocks from the DAG below the root.
 pub async fn client_push(
     root: Cid,
     last_response: PushResponse,
@@ -129,6 +151,12 @@ pub async fn client_push(
     Ok(writer.finish().await?.into())
 }
 
+/// This handles a car mirror push request on the server side.
+///
+/// The root is the root CID of the DAG that is pushed, the request is a CAR file
+/// with some blocks from the cold call.
+///
+/// Returns a response to answer the client's request with.
 pub async fn server_push_response(
     root: Cid,
     request: Bytes,
@@ -180,21 +208,41 @@ pub async fn server_push_response(
     })
 }
 
+/// A struct that represents an ongoing walk through the Dag.
+#[derive(Clone, Debug)]
 pub struct DagWalk {
+    /// A queue of CIDs to visit next
     pub frontier: VecDeque<Cid>,
+    /// The set of already visited CIDs. This prevents re-visiting.
     pub visited: HashSet<Cid>,
+    /// Whether to do a breadth-first or depth-first traversal.
+    /// This controls whether newly discovered links are appended or prepended to the frontier.
     pub breadth_first: bool,
 }
 
 impl DagWalk {
+    /// Start a breadth-first traversal of given roots.
+    ///
+    /// Breadth-first is explained the easiest in the simple case of a tree (which is a DAG):
+    /// It will visit each node in the tree layer-by-layer.
+    ///
+    /// So the first nodes it will visit are going to be all roots in order.
     pub fn breadth_first(roots: impl IntoIterator<Item = Cid>) -> Self {
         Self::new(roots, true)
     }
 
+    /// Start a depth-first traversal of given roots.
+    ///
+    /// Depth-first will follow links immediately after discovering them, taking the fastest
+    /// path towards leaves.
+    ///
+    /// The very first node is guaranteed to be the first root, but subsequent nodes may not be
+    /// from the initial roots.
     pub fn depth_first(roots: impl IntoIterator<Item = Cid>) -> Self {
         Self::new(roots, false)
     }
 
+    /// Start a DAG traversal of given roots. See also `breadth_first` and `depth_first`.
     pub fn new(roots: impl IntoIterator<Item = Cid>, breadth_first: bool) -> Self {
         let frontier = roots.into_iter().collect();
         let visited = HashSet::new();
@@ -205,12 +253,15 @@ impl DagWalk {
         }
     }
 
+    /// Return the next node in the traversal.
+    ///
+    /// Returns `None` if no nodes are left to be visited.
     pub async fn next(&mut self, store: &impl BlockStore) -> Result<Option<(Cid, Bytes)>> {
         let cid = loop {
             let popped = if self.breadth_first {
-                self.frontier.pop_front()
-            } else {
                 self.frontier.pop_back()
+            } else {
+                self.frontier.pop_front()
             };
 
             let Some(cid) = popped else {
@@ -226,13 +277,14 @@ impl DagWalk {
         let block = store.get_block(&cid).await?;
         for ref_cid in references(cid, &block)? {
             if !self.visited.contains(&ref_cid) {
-                self.frontier.push_back(ref_cid);
+                self.frontier.push_front(ref_cid);
             }
         }
 
         Ok(Some((cid, block)))
     }
 
+    /// Turn this traversal into a stream
     pub fn stream(
         self,
         store: &impl BlockStore,
@@ -243,6 +295,9 @@ impl DagWalk {
         }))
     }
 
+    /// Find out whether the traversal is finished.
+    ///
+    /// The next call to `next` would result in `None` if this returns true.
     pub fn is_finished(&self) -> bool {
         // We're finished if the frontier does not contain any CIDs that we have not visited yet.
         // Put differently:
@@ -253,6 +308,7 @@ impl DagWalk {
             .any(|frontier_cid| !self.visited.contains(frontier_cid))
     }
 
+    /// Skip a node from the traversal for now.
     pub fn skip_walking(&mut self, block: (Cid, Bytes)) -> Result<()> {
         let (cid, bytes) = block;
         let refs = references(cid, bytes)?;
@@ -276,12 +332,20 @@ pub async fn stream_into_car<W: tokio::io::AsyncWrite + Send + Unpin>(
     Ok(())
 }
 
+/// A data structure that keeps state about incremental DAG verification.
+#[derive(Clone, Debug)]
 pub struct IncrementalDagVerification {
+    /// All the CIDs that have been discovered to be missing from the DAG.
     pub want_cids: HashSet<Cid>,
+    /// All the CIDs that are available locally.
     pub have_cids: HashSet<Cid>,
 }
 
 impl IncrementalDagVerification {
+    /// Initiate incremental DAG verification of given roots.
+    ///
+    /// This will already run a traversal to find missing subgraphs and
+    /// CIDs that are already present.
     pub async fn new(
         roots: impl IntoIterator<Item = Cid>,
         store: &impl BlockStore,
@@ -316,6 +380,17 @@ impl IncrementalDagVerification {
         })
     }
 
+    /// Verify that
+    /// - the block actually hashes to the hash from given CID and
+    /// - the block is part of the graph below the roots.
+    ///
+    /// And finally stores the block in the blockstore.
+    ///
+    /// This *may* fail, even if the block is part of the graph below the roots,
+    /// if intermediate blocks between the roots and this block are missing.
+    ///
+    /// This *may* add the block to the blockstore, but still fail to verify, specifically
+    /// if the block's bytes don't match the hash in the CID.
     pub async fn verify_and_store_block(
         &mut self,
         block: (Cid, Bytes),
diff --git a/car-mirror/src/messages.rs b/car-mirror/src/messages.rs
index c5a778f..283d55c 100644
--- a/car-mirror/src/messages.rs
+++ b/car-mirror/src/messages.rs
@@ -60,6 +60,7 @@ pub struct PushResponse {
 }
 
 impl PushResponse {
+    /// Whether this response indicates that the protocol is finished.
     pub fn indicates_finished(&self) -> bool {
         self.subgraph_roots.is_empty()
     }
diff --git a/car-mirror/src/test_utils/dag_strategy.rs b/car-mirror/src/test_utils/dag_strategy.rs
index 003c25e..fc7ce00 100644
--- a/car-mirror/src/test_utils/dag_strategy.rs
+++ b/car-mirror/src/test_utils/dag_strategy.rs
@@ -6,12 +6,16 @@ use libipld_core::codec::Encode;
 use proptest::strategy::Strategy;
 use roaring_graphs::{arb_dag, DirectedAcyclicGraph, Vertex};
 
+/// Encode some IPLD as dag-cbor
 pub fn encode(ipld: &Ipld) -> Bytes {
     let mut vec = Vec::new();
     ipld.encode(IpldCodec::DagCbor, &mut vec).unwrap(); // TODO(matheus23) unwrap
     Bytes::from(vec)
 }
 
+/// A strategy for use with proptest to generate random DAGs (directed acyclic graphs).
+/// The strategy generates a list of blocks of type T and their CIDs, as well as
+/// the root block's CID.
 pub fn generate_dag<T: Debug + Clone>(
     max_nodes: u16,
     generate_block: fn(Vec<Cid>) -> (Cid, T),
@@ -19,7 +23,7 @@ pub fn generate_dag<T: Debug + Clone>(
     arb_dag(1..max_nodes, 0.5).prop_map(move |dag| dag_to_nodes(&dag, generate_block))
 }
 
-pub fn dag_to_nodes<T>(
+fn dag_to_nodes<T>(
     dag: &DirectedAcyclicGraph,
     generate_node: fn(Vec<Cid>) -> (Cid, T),
 ) -> (Vec<(Cid, T)>, Cid) {
@@ -30,7 +34,7 @@ pub fn dag_to_nodes<T>(
     (blocks, cid)
 }
 
-pub fn dag_to_nodes_helper<T>(
+fn dag_to_nodes_helper<T>(
     dag: &DirectedAcyclicGraph,
     root: Vertex,
     generate_node: fn(Vec<Cid>) -> (Cid, T),

From b0da652a43cb677a36d885cb99987137f784c392 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= <philipp.krueger1@gmail.com>
Date: Fri, 18 Aug 2023 14:39:41 +0200
Subject: [PATCH 10/35] Split into modules

---
 car-mirror/src/common.rs                   |  20 +
 car-mirror/src/dag_walk.rs                 | 219 ++++++++
 car-mirror/src/incremental_verification.rs | 100 ++++
 car-mirror/src/lib.rs                      | 581 +--------------------
 car-mirror/src/push.rs                     | 259 +++++++++
 5 files changed, 607 insertions(+), 572 deletions(-)
 create mode 100644 car-mirror/src/common.rs
 create mode 100644 car-mirror/src/dag_walk.rs
 create mode 100644 car-mirror/src/incremental_verification.rs
 create mode 100644 car-mirror/src/push.rs

diff --git a/car-mirror/src/common.rs b/car-mirror/src/common.rs
new file mode 100644
index 0000000..bbf0cad
--- /dev/null
+++ b/car-mirror/src/common.rs
@@ -0,0 +1,20 @@
+use anyhow::{anyhow, Result};
+use libipld::{Ipld, IpldCodec};
+use libipld_core::{cid::Cid, codec::References};
+use std::io::Cursor;
+
+/// Find all CIDs that a block references.
+///
+/// This will error out if
+/// - the codec is not supported
+/// - the block can't be parsed.
+pub fn references(cid: Cid, block: impl AsRef<[u8]>) -> Result<Vec<Cid>> {
+    let codec: IpldCodec = cid
+        .codec()
+        .try_into()
+        .map_err(|_| anyhow!("Unsupported codec in Cid: {cid}"))?;
+
+    let mut refs = Vec::new();
+    <Ipld as References<IpldCodec>>::references(codec, &mut Cursor::new(block), &mut refs)?;
+    Ok(refs)
+}
diff --git a/car-mirror/src/dag_walk.rs b/car-mirror/src/dag_walk.rs
new file mode 100644
index 0000000..0f57740
--- /dev/null
+++ b/car-mirror/src/dag_walk.rs
@@ -0,0 +1,219 @@
+use crate::common::references;
+use anyhow::Result;
+use bytes::Bytes;
+use futures::{stream::try_unfold, Stream};
+use libipld_core::cid::Cid;
+use std::collections::{HashSet, VecDeque};
+use wnfs_common::BlockStore;
+
+/// A struct that represents an ongoing walk through the Dag.
+#[derive(Clone, Debug)]
+pub struct DagWalk {
+    /// A queue of CIDs to visit next
+    pub frontier: VecDeque<Cid>,
+    /// The set of already visited CIDs. This prevents re-visiting.
+    pub visited: HashSet<Cid>,
+    /// Whether to do a breadth-first or depth-first traversal.
+    /// This controls whether newly discovered links are appended or prepended to the frontier.
+    pub breadth_first: bool,
+}
+
+impl DagWalk {
+    /// Start a breadth-first traversal of given roots.
+    ///
+    /// Breadth-first is explained the easiest in the simple case of a tree (which is a DAG):
+    /// It will visit each node in the tree layer-by-layer.
+    ///
+    /// So the first nodes it will visit are going to be all roots in order.
+    pub fn breadth_first(roots: impl IntoIterator<Item = Cid>) -> Self {
+        Self::new(roots, true)
+    }
+
+    /// Start a depth-first traversal of given roots.
+    ///
+    /// Depth-first will follow links immediately after discovering them, taking the fastest
+    /// path towards leaves.
+    ///
+    /// The very first node is guaranteed to be the first root, but subsequent nodes may not be
+    /// from the initial roots.
+    pub fn depth_first(roots: impl IntoIterator<Item = Cid>) -> Self {
+        Self::new(roots, false)
+    }
+
+    /// Start a DAG traversal of given roots. See also `breadth_first` and `depth_first`.
+    pub fn new(roots: impl IntoIterator<Item = Cid>, breadth_first: bool) -> Self {
+        let frontier = roots.into_iter().collect();
+        let visited = HashSet::new();
+        Self {
+            frontier,
+            visited,
+            breadth_first,
+        }
+    }
+
+    /// Return the next node in the traversal.
+    ///
+    /// Returns `None` if no nodes are left to be visited.
+    pub async fn next(&mut self, store: &impl BlockStore) -> Result<Option<(Cid, Bytes)>> {
+        let cid = loop {
+            // Always take from the front, so that multiple roots are visited in order.
+            let Some(cid) = self.frontier.pop_front() else {
+                return Ok(None);
+            };
+
+            // We loop until we find an unvisited block
+            if self.visited.insert(cid) {
+                break cid;
+            }
+        };
+
+        let block = store.get_block(&cid).await?;
+        for ref_cid in references(cid, &block)? {
+            if !self.visited.contains(&ref_cid) {
+                // Append for breadth-first (FIFO), prepend for depth-first (LIFO).
+                if self.breadth_first {
+                    self.frontier.push_back(ref_cid);
+                } else {
+                    self.frontier.push_front(ref_cid);
+                }
+            }
+        }
+
+        Ok(Some((cid, block)))
+    }
+
+    /// Turn this traversal into a stream
+    pub fn stream(
+        self,
+        store: &impl BlockStore,
+    ) -> impl Stream<Item = Result<(Cid, Bytes)>> + Unpin + '_ {
+        Box::pin(try_unfold(self, move |mut this| async move {
+            let maybe_block = this.next(store).await?;
+            Ok(maybe_block.map(|b| (b, this)))
+        }))
+    }
+
+    /// Find out whether the traversal is finished.
+    ///
+    /// The next call to `next` would result in `None` if this returns true.
+    pub fn is_finished(&self) -> bool {
+        // We're finished if the frontier does not contain any CIDs that we have not visited yet.
+        // Put differently:
+        // We're not finished if there exist unvisited CIDs in the frontier.
+        !self
+            .frontier
+            .iter()
+            .any(|frontier_cid| !self.visited.contains(frontier_cid))
+    }
+
+    /// Skip a node for now: mark it visited and remove the CIDs it links to from the frontier.
+    pub fn skip_walking(&mut self, block: (Cid, Bytes)) -> Result<()> {
+        let (cid, bytes) = block;
+        let refs = references(cid, bytes)?;
+        self.visited.insert(cid);
+        self.frontier
+            .retain(|frontier_cid| !refs.contains(frontier_cid));
+
+        Ok(())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use futures::TryStreamExt;
+    use libipld::Ipld;
+    use wnfs_common::MemoryBlockStore;
+
+    #[async_std::test]
+    async fn test_walk_dag_breadth_first() -> Result<()> {
+        let store = &MemoryBlockStore::new();
+
+        let cid_1 = store.put_serializable(&Ipld::String("1".into())).await?;
+        let cid_2 = store.put_serializable(&Ipld::String("2".into())).await?;
+        let cid_3 = store.put_serializable(&Ipld::String("3".into())).await?;
+
+        let cid_1_wrap = store
+            .put_serializable(&Ipld::List(vec![Ipld::Link(cid_1)]))
+            .await?;
+
+        let cid_root = store
+            .put_serializable(&Ipld::List(vec![
+                Ipld::Link(cid_1_wrap),
+                Ipld::Link(cid_2),
+                Ipld::Link(cid_3),
+            ]))
+            .await?;
+
+        let cids = DagWalk::breadth_first([cid_root])
+            .stream(store)
+            .try_collect::<Vec<_>>()
+            .await?
+            .into_iter()
+            .map(|(cid, _block)| cid)
+            .collect::<Vec<_>>();
+
+        assert_eq!(cids, vec![cid_root, cid_1_wrap, cid_2, cid_3, cid_1]);
+
+        Ok(())
+    }
+}
+
+#[cfg(test)]
+mod proptests {
+    use super::*;
+    use crate::test_utils::{encode, generate_dag};
+    use futures::TryStreamExt;
+    use libipld::{
+        multihash::{Code, MultihashDigest},
+        Cid, Ipld, IpldCodec,
+    };
+    use proptest::strategy::Strategy;
+    use std::collections::BTreeSet;
+    use test_strategy::proptest;
+    use wnfs_common::{BlockStore, MemoryBlockStore};
+
+    fn ipld_dags() -> impl Strategy<Value = (Vec<(Cid, Ipld)>, Cid)> {
+        generate_dag(256, |cids| {
+            let ipld = Ipld::List(cids.into_iter().map(Ipld::Link).collect());
+            let cid = Cid::new_v1(
+                IpldCodec::DagCbor.into(),
+                Code::Blake3_256.digest(&encode(&ipld)),
+            );
+            (cid, ipld)
+        })
+    }
+
+    #[proptest(max_shrink_iters = 100_000)]
+    fn walk_dag_never_iterates_block_twice(#[strategy(ipld_dags())] dag: (Vec<(Cid, Ipld)>, Cid)) {
+        async_std::task::block_on(async {
+            let (dag, root) = dag;
+            let store = &MemoryBlockStore::new();
+            for (cid, ipld) in dag.iter() {
+                let cid_store = store
+                    .put_block(encode(ipld), IpldCodec::DagCbor.into())
+                    .await
+                    .unwrap();
+                assert_eq!(*cid, cid_store);
+            }
+
+            let mut cids = DagWalk::breadth_first([root])
+                .stream(store)
+                .map_ok(|(cid, _)| cid)
+                .try_collect::<Vec<_>>()
+                .await
+                .unwrap();
+
+            cids.sort();
+
+            let unique_cids = cids
+                .iter()
+                .cloned()
+                .collect::<BTreeSet<_>>()
+                .into_iter()
+                .collect::<Vec<_>>();
+
+            assert_eq!(cids, unique_cids);
+        });
+    }
+}
diff --git a/car-mirror/src/incremental_verification.rs b/car-mirror/src/incremental_verification.rs
new file mode 100644
index 0000000..24edb3b
--- /dev/null
+++ b/car-mirror/src/incremental_verification.rs
@@ -0,0 +1,100 @@
+use crate::{common::references, dag_walk::DagWalk};
+use anyhow::{bail, Result};
+use bytes::Bytes;
+use libipld_core::cid::Cid;
+use std::{collections::HashSet, eprintln};
+use wnfs_common::{BlockStore, BlockStoreError};
+
+/// A data structure that keeps state about incremental DAG verification.
+#[derive(Clone, Debug)]
+pub struct IncrementalDagVerification {
+    /// All the CIDs that have been discovered to be missing from the DAG.
+    pub want_cids: HashSet<Cid>,
+    /// All the CIDs that are available locally.
+    pub have_cids: HashSet<Cid>,
+}
+
+impl IncrementalDagVerification {
+    /// Initiate incremental DAG verification of given roots.
+    ///
+    /// This will already run a traversal to find missing subgraphs and
+    /// CIDs that are already present.
+    pub async fn new(
+        roots: impl IntoIterator<Item = Cid>,
+        store: &impl BlockStore,
+    ) -> Result<Self> {
+        let mut want_cids = HashSet::new();
+        let mut have_cids = HashSet::new();
+        let mut dag_walk = DagWalk::breadth_first(roots);
+
+        loop {
+            match dag_walk.next(store).await {
+                Err(e) => {
+                    if let Some(BlockStoreError::CIDNotFound(not_found)) =
+                        e.downcast_ref::<BlockStoreError>()
+                    {
+                        want_cids.insert(*not_found);
+                    } else {
+                        bail!(e);
+                    }
+                }
+                Ok(Some((cid, _))) => {
+                    have_cids.insert(cid);
+                }
+                Ok(None) => {
+                    break;
+                }
+            }
+        }
+
+        Ok(Self {
+            want_cids,
+            have_cids,
+        })
+    }
+
+    /// Verify that
+    /// - the block actually hashes to the hash from given CID and
+    /// - the block is part of the graph below the roots.
+    ///
+    /// And finally stores the block in the blockstore.
+    ///
+    /// This *may* fail, even if the block is part of the graph below the roots,
+    /// if intermediate blocks between the roots and this block are missing.
+    ///
+    /// This *may* add the block to the blockstore, but still fail to verify, specifically
+    /// if the block's bytes don't match the hash in the CID.
+    pub async fn verify_and_store_block(
+        &mut self,
+        block: (Cid, Bytes),
+        store: &impl BlockStore,
+    ) -> Result<()> {
+        let (cid, bytes) = block;
+
+        if !self.want_cids.contains(&cid) {
+            if self.have_cids.contains(&cid) {
+                eprintln!("Warn: Received {cid}, even though we already have it");
+            } else {
+                bail!("Unexpected block or block out of order: {cid}");
+            }
+        }
+
+        let refs = references(cid, &bytes)?;
+        let result_cid = store.put_block(bytes, cid.codec()).await?;
+
+        if result_cid != cid {
+            bail!("Digest mismatch in CAR file: expected {cid}, got {result_cid}");
+        }
+
+        for ref_cid in refs {
+            if !self.have_cids.contains(&ref_cid) {
+                self.want_cids.insert(ref_cid);
+            }
+        }
+
+        self.want_cids.remove(&cid);
+        self.have_cids.insert(cid);
+
+        Ok(())
+    }
+}
diff --git a/car-mirror/src/lib.rs b/car-mirror/src/lib.rs
index 5982219..2ed447b 100644
--- a/car-mirror/src/lib.rs
+++ b/car-mirror/src/lib.rs
@@ -24,576 +24,13 @@ use wnfs_common::{BlockStore, BlockStoreError};
 #[cfg_attr(docsrs, doc(cfg(feature = "test_utils")))]
 pub mod test_utils;
 
-/// Contains the data types that are sent over-the-wire and relevant serialization code.
+/// Common utilities
+pub mod common;
+/// Algorithms for walking IPLD directed acyclic graphs
+pub mod dag_walk;
+/// Algorithms for doing incremental verification of IPLD DAGs on the receiving end.
+pub mod incremental_verification;
+/// Data types that are sent over-the-wire and relevant serialization code.
 pub mod messages;
-
-/// Configuration values (such as byte limits) for the CAR mirror push protocol
-#[derive(Clone, Debug)]
-pub struct PushConfig {
-    /// A client will try to send at least `send_minimum` bytes of block data
-    /// in each request, except if close to the end of the protocol (when there's)
-    /// not that much data left.
-    pub send_minimum: usize,
-    /// The maximum number of bytes per request that the server accepts.
-    pub receive_maximum: usize,
-    /// The maximum number of roots per request that the server will send to the client,
-    /// and that the client will consume.
-    pub max_roots_per_round: usize,
-    /// The target false positive rate for the bloom filter that the server sends.
-    pub bloom_fpr: f64,
-}
-
-impl Default for PushConfig {
-    fn default() -> Self {
-        Self {
-            send_minimum: 128 * 1024,    // 128KiB
-            receive_maximum: 512 * 1024, // 512KiB
-            max_roots_per_round: 1000,   // max. ~41KB of CIDs
-            bloom_fpr: 1.0 / 10_000.0,   // 0.1%
-        }
-    }
-}
-
-/// Initiate a car mirror push request.
-///
-/// The goal is to transfer the DAG below the root CID to
-/// the server.
-///
-/// The return value is a CAR file.
-pub async fn client_initiate_push(
-    root: Cid,
-    config: &PushConfig,
-    store: &impl BlockStore,
-) -> Result<Bytes> {
-    let fake_response = PushResponse {
-        subgraph_roots: vec![root],
-        // Just putting an empty bloom here
-        bloom_k: 3,
-        bloom: Vec::new(),
-    };
-    client_push(root, fake_response, config, store).await
-}
-
-/// Send a subsequent car mirror push request, following up on
-/// a response retrieved from an initial `client_initiate_push` request.
-///
-/// Make sure to call `response.indicates_finished()` before initiating
-/// a follow-up `client_push` request.
-///
-/// The return value is another CAR file with more blocks from the DAG below the root.
-pub async fn client_push(
-    root: Cid,
-    last_response: PushResponse,
-    config: &PushConfig,
-    store: &impl BlockStore,
-) -> Result<Bytes> {
-    let PushResponse {
-        ref subgraph_roots,
-        bloom_k,
-        bloom,
-    } = last_response;
-
-    // Verify that all subgraph roots are in the relevant DAG:
-    let subgraph_roots: Vec<Cid> = DagWalk::breadth_first([root])
-        .stream(store)
-        .try_filter_map(|(cid, _)| async move { Ok(subgraph_roots.contains(&cid).then_some(cid)) })
-        .try_collect()
-        .await?;
-
-    let bloom = if bloom.is_empty() {
-        BloomFilter::new_with(1, Box::new([0])) // An empty bloom that contains nothing
-    } else {
-        BloomFilter::new_with(bloom_k as usize, bloom.into_boxed_slice())
-    };
-
-    let mut writer = CarWriter::new(
-        CarHeader::new_v1(
-            // TODO(matheus23): This is stupid
-            // CAR files *must* have at least one CID in them, and all of them
-            // need to appear as a block in the payload.
-            // It would probably make most sense to just write all subgraph roots into this,
-            // but we don't know how many of the subgraph roots fit into this round yet,
-            // so we're simply writing the first one in here, since we know
-            // at least one block will be written (and it'll be that one).
-            subgraph_roots.iter().take(1).cloned().collect(),
-        ),
-        Vec::new(),
-    );
-
-    writer.write_header().await?;
-
-    let mut block_bytes = 0;
-    let mut dag_walk = DagWalk::breadth_first(subgraph_roots.clone());
-    while let Some((cid, block)) = dag_walk.next(store).await? {
-        if bloom.contains(&cid.to_bytes()) && !subgraph_roots.contains(&cid) {
-            // TODO(matheus23) I think the spec means to prune the whole subgraph.
-            // But
-            // 1. That requires the receiver to check the whole subgraph at that CID to find out whether there's a missing block at the subgraph.
-            // 2. It requires the sender to go through every block under this subgraph down to the leaves to mark all of these CIDs as visited.
-            // Both of these are *huge* traversals. I'd say likely not worth it. The only case I can image they're worth it, is if the DAG
-            // is *heavily* using structural sharing and not tree-like.
-            // Also: This fails completely if the sender is just missing a single leaf. It couldn't add the block to the bloom in that case.
-            dag_walk.skip_walking((cid, block))?;
-            println!("Skipped walking {cid} due to bloom");
-            break;
-        }
-
-        writer.write(cid, &block).await?;
-        println!("Sending {cid}");
-
-        // TODO(matheus23): Count the actual bytes sent?
-        block_bytes += block.len();
-        if block_bytes > config.send_minimum {
-            break;
-        }
-    }
-
-    Ok(writer.finish().await?.into())
-}
-
-/// This handles a car mirror push request on the server side.
-///
-/// The root is the root CID of the DAG that is pushed, the request is a CAR file
-/// with some blocks from the cold call.
-///
-/// Returns a response to answer the client's request with.
-pub async fn server_push_response(
-    root: Cid,
-    request: Bytes,
-    config: &PushConfig,
-    store: &impl BlockStore,
-) -> Result<PushResponse> {
-    let mut dag_verification = IncrementalDagVerification::new([root], store).await?;
-
-    let mut reader = CarReader::new(Cursor::new(request)).await?;
-    let mut block_bytes = 0;
-
-    while let Some((cid, vec)) = reader.next_block().await? {
-        let block = Bytes::from(vec);
-        println!("Received {cid}");
-
-        block_bytes += block.len();
-        if block_bytes > config.receive_maximum {
-            bail!(
-                "Received more than {} bytes ({block_bytes}), aborting request.",
-                config.receive_maximum
-            );
-        }
-
-        dag_verification
-            .verify_and_store_block((cid, block), store)
-            .await?;
-    }
-
-    let subgraph_roots = dag_verification
-        .want_cids
-        .iter()
-        .take(config.max_roots_per_round)
-        .cloned()
-        .collect();
-
-    let mut bloom =
-        BloomFilter::new_from_fpr_po2(dag_verification.have_cids.len() as u64, config.bloom_fpr);
-
-    dag_verification
-        .have_cids
-        .iter()
-        .for_each(|cid| bloom.insert(&cid.to_bytes()));
-
-    Ok(PushResponse {
-        subgraph_roots,
-        // We ignore blooms for now
-        bloom_k: bloom.hash_count() as u32,
-        bloom: bloom.as_bytes().to_vec(),
-    })
-}
-
-/// A struct that represents an ongoing walk through the Dag.
-#[derive(Clone, Debug)]
-pub struct DagWalk {
-    /// A queue of CIDs to visit next
-    pub frontier: VecDeque<Cid>,
-    /// The set of already visited CIDs. This prevents re-visiting.
-    pub visited: HashSet<Cid>,
-    /// Whether to do a breadth-first or depth-first traversal.
-    /// This controls whether newly discovered links are appended or prepended to the frontier.
-    pub breadth_first: bool,
-}
-
-impl DagWalk {
-    /// Start a breadth-first traversal of given roots.
-    ///
-    /// Breadth-first is explained the easiest in the simple case of a tree (which is a DAG):
-    /// It will visit each node in the tree layer-by-layer.
-    ///
-    /// So the first nodes it will visit are going to be all roots in order.
-    pub fn breadth_first(roots: impl IntoIterator<Item = Cid>) -> Self {
-        Self::new(roots, true)
-    }
-
-    /// Start a depth-first traversal of given roots.
-    ///
-    /// Depth-first will follow links immediately after discovering them, taking the fastest
-    /// path towards leaves.
-    ///
-    /// The very first node is guaranteed to be the first root, but subsequent nodes may not be
-    /// from the initial roots.
-    pub fn depth_first(roots: impl IntoIterator<Item = Cid>) -> Self {
-        Self::new(roots, false)
-    }
-
-    /// Start a DAG traversal of given roots. See also `breadth_first` and `depth_first`.
-    pub fn new(roots: impl IntoIterator<Item = Cid>, breadth_first: bool) -> Self {
-        let frontier = roots.into_iter().collect();
-        let visited = HashSet::new();
-        Self {
-            frontier,
-            visited,
-            breadth_first,
-        }
-    }
-
-    /// Return the next node in the traversal.
-    ///
-    /// Returns `None` if no nodes are left to be visited.
-    pub async fn next(&mut self, store: &impl BlockStore) -> Result<Option<(Cid, Bytes)>> {
-        let cid = loop {
-            let popped = if self.breadth_first {
-                self.frontier.pop_back()
-            } else {
-                self.frontier.pop_front()
-            };
-
-            let Some(cid) = popped else {
-                return Ok(None);
-            };
-
-            // We loop until we find an unvisited block
-            if self.visited.insert(cid) {
-                break cid;
-            }
-        };
-
-        let block = store.get_block(&cid).await?;
-        for ref_cid in references(cid, &block)? {
-            if !self.visited.contains(&ref_cid) {
-                self.frontier.push_front(ref_cid);
-            }
-        }
-
-        Ok(Some((cid, block)))
-    }
-
-    /// Turn this traversal into a stream
-    pub fn stream(
-        self,
-        store: &impl BlockStore,
-    ) -> impl Stream<Item = Result<(Cid, Bytes)>> + Unpin + '_ {
-        Box::pin(try_unfold(self, move |mut this| async move {
-            let maybe_block = this.next(store).await?;
-            Ok(maybe_block.map(|b| (b, this)))
-        }))
-    }
-
-    /// Find out whether the traversal is finished.
-    ///
-    /// The next call to `next` would result in `None` if this returns true.
-    pub fn is_finished(&self) -> bool {
-        // We're finished if the frontier does not contain any CIDs that we have not visited yet.
-        // Put differently:
-        // We're not finished if there exist unvisited CIDs in the frontier.
-        !self
-            .frontier
-            .iter()
-            .any(|frontier_cid| !self.visited.contains(frontier_cid))
-    }
-
-    /// Skip a node from the traversal for now.
-    pub fn skip_walking(&mut self, block: (Cid, Bytes)) -> Result<()> {
-        let (cid, bytes) = block;
-        let refs = references(cid, bytes)?;
-        self.visited.insert(cid);
-        self.frontier
-            .retain(|frontier_cid| !refs.contains(frontier_cid));
-
-        Ok(())
-    }
-}
-
-/// Writes a stream of blocks into a car file
-pub async fn stream_into_car<W: tokio::io::AsyncWrite + Send + Unpin>(
-    mut blocks: impl Stream<Item = Result<(Cid, Bytes)>> + Unpin,
-    writer: &mut CarWriter<W>,
-) -> Result<()> {
-    while let Some(result) = blocks.next().await {
-        let (cid, bytes) = result?;
-        writer.write(cid, bytes).await?;
-    }
-    Ok(())
-}
-
-/// A data structure that keeps state about incremental DAG verification.
-#[derive(Clone, Debug)]
-pub struct IncrementalDagVerification {
-    /// All the CIDs that have been discovered to be missing from the DAG.
-    pub want_cids: HashSet<Cid>,
-    /// All the CIDs that are available locally.
-    pub have_cids: HashSet<Cid>,
-}
-
-impl IncrementalDagVerification {
-    /// Initiate incremental DAG verification of given roots.
-    ///
-    /// This will already run a traversal to find missing subgraphs and
-    /// CIDs that are already present.
-    pub async fn new(
-        roots: impl IntoIterator<Item = Cid>,
-        store: &impl BlockStore,
-    ) -> Result<Self> {
-        let mut want_cids = HashSet::new();
-        let mut have_cids = HashSet::new();
-        let mut dag_walk = DagWalk::breadth_first(roots);
-
-        loop {
-            match dag_walk.next(store).await {
-                Err(e) => {
-                    if let Some(BlockStoreError::CIDNotFound(not_found)) =
-                        e.downcast_ref::<BlockStoreError>()
-                    {
-                        want_cids.insert(*not_found);
-                    } else {
-                        bail!(e);
-                    }
-                }
-                Ok(Some((cid, _))) => {
-                    have_cids.insert(cid);
-                }
-                Ok(None) => {
-                    break;
-                }
-            }
-        }
-
-        Ok(Self {
-            want_cids,
-            have_cids,
-        })
-    }
-
-    /// Verify that
-    /// - the block actually hashes to the hash from given CID and
-    /// - the block is part of the graph below the roots.
-    ///
-    /// And finally stores the block in the blockstore.
-    ///
-    /// This *may* fail, even if the block is part of the graph below the roots,
-    /// if intermediate blocks between the roots and this block are missing.
-    ///
-    /// This *may* add the block to the blockstore, but still fail to verify, specifically
-    /// if the block's bytes don't match the hash in the CID.
-    pub async fn verify_and_store_block(
-        &mut self,
-        block: (Cid, Bytes),
-        store: &impl BlockStore,
-    ) -> Result<()> {
-        let (cid, bytes) = block;
-
-        if !self.want_cids.contains(&cid) {
-            if self.have_cids.contains(&cid) {
-                eprintln!("Warn: Received {cid}, even though we already have it");
-            } else {
-                bail!("Unexpected block or block out of order: {cid}");
-            }
-        }
-
-        let refs = references(cid, &bytes)?;
-        let result_cid = store.put_block(bytes, cid.codec()).await?;
-
-        if result_cid != cid {
-            bail!("Digest mismatch in CAR file: expected {cid}, got {result_cid}");
-        }
-
-        for ref_cid in refs {
-            if !self.have_cids.contains(&ref_cid) {
-                self.want_cids.insert(ref_cid);
-            }
-        }
-
-        self.want_cids.remove(&cid);
-        self.have_cids.insert(cid);
-
-        Ok(())
-    }
-}
-
-fn references(cid: Cid, block: impl AsRef<[u8]>) -> Result<Vec<Cid>> {
-    let codec: IpldCodec = cid
-        .codec()
-        .try_into()
-        .map_err(|_| anyhow!("Unsupported codec in Cid: {cid}"))?;
-
-    let mut refs = Vec::new();
-    <Ipld as References<IpldCodec>>::references(codec, &mut Cursor::new(block), &mut refs)?;
-    Ok(refs)
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use crate::test_utils::{encode, generate_dag, Rvg};
-    use futures::TryStreamExt;
-    use libipld_core::multihash::{Code, MultihashDigest};
-    use std::collections::BTreeMap;
-    use wnfs_common::MemoryBlockStore;
-
-    #[async_std::test]
-    async fn test_transfer() -> Result<()> {
-        let (blocks, root) = Rvg::new().sample(&generate_dag(256, |cids| {
-            let ipld = Ipld::Map(BTreeMap::from([
-                ("data".into(), Ipld::Bytes(vec![0u8; 10 * 1024])),
-                (
-                    "links".into(),
-                    Ipld::List(cids.into_iter().map(Ipld::Link).collect()),
-                ),
-            ]));
-            let bytes = encode(&ipld);
-            let cid = Cid::new_v1(IpldCodec::DagCbor.into(), Code::Blake3_256.digest(&bytes));
-            (cid, bytes)
-        }));
-
-        let sender_store = &MemoryBlockStore::new();
-        for (cid, bytes) in blocks.iter() {
-            let cid_store = sender_store
-                .put_block(bytes.clone(), IpldCodec::DagCbor.into())
-                .await?;
-            assert_eq!(*cid, cid_store);
-        }
-
-        let receiver_store = &MemoryBlockStore::new();
-        let config = &PushConfig::default();
-        let mut request = client_initiate_push(root, config, sender_store).await?;
-        loop {
-            println!("Sending request {} bytes", request.len());
-            let response = server_push_response(root, request, config, receiver_store).await?;
-            println!(
-                "Response (bloom bytes: {}): {:?}",
-                response.bloom.len(),
-                response.subgraph_roots,
-            );
-            if response.indicates_finished() {
-                break;
-            }
-            request = client_push(root, response, config, sender_store).await?;
-        }
-
-        // receiver should have all data
-        let sender_cids = DagWalk::breadth_first([root])
-            .stream(sender_store)
-            .map_ok(|(cid, _)| cid)
-            .try_collect::<Vec<_>>()
-            .await?;
-        let receiver_cids = DagWalk::breadth_first([root])
-            .stream(receiver_store)
-            .map_ok(|(cid, _)| cid)
-            .try_collect::<Vec<_>>()
-            .await?;
-
-        assert_eq!(sender_cids, receiver_cids);
-
-        Ok(())
-    }
-
-    #[async_std::test]
-    async fn test_walk_dag_breadth_first() -> Result<()> {
-        let store = &MemoryBlockStore::new();
-
-        let cid_1 = store.put_serializable(&Ipld::String("1".into())).await?;
-        let cid_2 = store.put_serializable(&Ipld::String("2".into())).await?;
-        let cid_3 = store.put_serializable(&Ipld::String("3".into())).await?;
-
-        let cid_1_wrap = store
-            .put_serializable(&Ipld::List(vec![Ipld::Link(cid_1)]))
-            .await?;
-
-        let cid_root = store
-            .put_serializable(&Ipld::List(vec![
-                Ipld::Link(cid_1_wrap),
-                Ipld::Link(cid_2),
-                Ipld::Link(cid_3),
-            ]))
-            .await?;
-
-        let cids = DagWalk::breadth_first([cid_root])
-            .stream(store)
-            .try_collect::<Vec<_>>()
-            .await?
-            .into_iter()
-            .map(|(cid, _block)| cid)
-            .collect::<Vec<_>>();
-
-        assert_eq!(cids, vec![cid_root, cid_1_wrap, cid_2, cid_3, cid_1]);
-
-        Ok(())
-    }
-}
-
-#[cfg(test)]
-mod proptests {
-    use crate::{
-        test_utils::{encode, generate_dag},
-        DagWalk,
-    };
-    use futures::TryStreamExt;
-    use libipld::{
-        multihash::{Code, MultihashDigest},
-        Cid, Ipld, IpldCodec,
-    };
-    use proptest::strategy::Strategy;
-    use std::collections::BTreeSet;
-    use test_strategy::proptest;
-    use wnfs_common::{BlockStore, MemoryBlockStore};
-
-    fn ipld_dags() -> impl Strategy<Value = (Vec<(Cid, Ipld)>, Cid)> {
-        generate_dag(256, |cids| {
-            let ipld = Ipld::List(cids.into_iter().map(Ipld::Link).collect());
-            let cid = Cid::new_v1(
-                IpldCodec::DagCbor.into(),
-                Code::Blake3_256.digest(&encode(&ipld)),
-            );
-            (cid, ipld)
-        })
-    }
-
-    #[proptest(max_shrink_iters = 100_000)]
-    fn walk_dag_never_iterates_block_twice(#[strategy(ipld_dags())] dag: (Vec<(Cid, Ipld)>, Cid)) {
-        async_std::task::block_on(async {
-            let (dag, root) = dag;
-            let store = &MemoryBlockStore::new();
-            for (cid, ipld) in dag.iter() {
-                let cid_store = store
-                    .put_block(encode(ipld), IpldCodec::DagCbor.into())
-                    .await
-                    .unwrap();
-                assert_eq!(*cid, cid_store);
-            }
-
-            let mut cids = DagWalk::breadth_first([root])
-                .stream(store)
-                .map_ok(|(cid, _)| cid)
-                .try_collect::<Vec<_>>()
-                .await
-                .unwrap();
-
-            cids.sort();
-
-            let unique_cids = cids
-                .iter()
-                .cloned()
-                .collect::<BTreeSet<_>>()
-                .into_iter()
-                .collect::<Vec<_>>();
-
-            assert_eq!(cids, unique_cids);
-        });
-    }
-}
+/// The CAR mirror push protocol
+pub mod push;
diff --git a/car-mirror/src/push.rs b/car-mirror/src/push.rs
new file mode 100644
index 0000000..f7819f8
--- /dev/null
+++ b/car-mirror/src/push.rs
@@ -0,0 +1,259 @@
+use crate::{
+    dag_walk::DagWalk, incremental_verification::IncrementalDagVerification, messages::PushResponse,
+};
+use anyhow::{bail, Result};
+use bytes::Bytes;
+use deterministic_bloom::runtime_size::BloomFilter;
+use futures::TryStreamExt;
+use iroh_car::{CarHeader, CarReader, CarWriter};
+use libipld_core::cid::Cid;
+use std::io::Cursor;
+use wnfs_common::BlockStore;
+
+/// Configuration values (such as byte limits) for the CAR mirror push protocol
+#[derive(Clone, Debug)]
+pub struct PushConfig {
+    /// A client will try to send at least `send_minimum` bytes of block data
+    /// in each request, except if close to the end of the protocol (when there's
+    /// not that much data left).
+    pub send_minimum: usize,
+    /// The maximum number of bytes per request that the server accepts.
+    pub receive_maximum: usize,
+    /// The maximum number of roots per request that the server will send to the client,
+    /// and that the client will consume.
+    pub max_roots_per_round: usize,
+    /// The target false positive rate for the bloom filter that the server sends.
+    pub bloom_fpr: f64,
+}
+
+impl Default for PushConfig {
+    fn default() -> Self {
+        Self {
+            send_minimum: 128 * 1024,    // 128KiB
+            receive_maximum: 512 * 1024, // 512KiB
+            max_roots_per_round: 1000,   // max. ~41KB of CIDs
+            bloom_fpr: 1.0 / 10_000.0,   // 0.1%
+        }
+    }
+}
+
+/// Initiate a car mirror push request.
+///
+/// The goal is to transfer the DAG below the root CID to
+/// the server.
+///
+/// The return value is a CAR file.
+pub async fn client_initiate_push(
+    root: Cid,
+    config: &PushConfig,
+    store: &impl BlockStore,
+) -> Result<Bytes> {
+    let fake_response = PushResponse {
+        subgraph_roots: vec![root],
+        // Just putting an empty bloom here
+        bloom_k: 3,
+        bloom: Vec::new(),
+    };
+    client_push(root, fake_response, config, store).await
+}
+
+/// Send a subsequent car mirror push request, following up on
+/// a response retrieved from an initial `client_initiate_push` request.
+///
+/// Make sure to call `response.indicates_finished()` before initiating
+/// a follow-up `client_push` request.
+///
+/// The return value is another CAR file with more blocks from the DAG below the root.
+pub async fn client_push(
+    root: Cid,
+    last_response: PushResponse,
+    config: &PushConfig,
+    store: &impl BlockStore,
+) -> Result<Bytes> {
+    let PushResponse {
+        ref subgraph_roots,
+        bloom_k,
+        bloom,
+    } = last_response;
+
+    // Verify that all subgraph roots are in the relevant DAG:
+    let subgraph_roots: Vec<Cid> = DagWalk::breadth_first([root])
+        .stream(store)
+        .try_filter_map(|(cid, _)| async move { Ok(subgraph_roots.contains(&cid).then_some(cid)) })
+        .try_collect()
+        .await?;
+
+    let bloom = if bloom.is_empty() {
+        BloomFilter::new_with(1, Box::new([0])) // An empty bloom that contains nothing
+    } else {
+        BloomFilter::new_with(bloom_k as usize, bloom.into_boxed_slice())
+    };
+
+    let mut writer = CarWriter::new(
+        CarHeader::new_v1(
+            // TODO(matheus23): This is stupid
+            // CAR files *must* have at least one CID in them, and all of them
+            // need to appear as a block in the payload.
+            // It would probably make most sense to just write all subgraph roots into this,
+            // but we don't know how many of the subgraph roots fit into this round yet,
+            // so we're simply writing the first one in here, since we know
+            // at least one block will be written (and it'll be that one).
+            subgraph_roots.iter().take(1).cloned().collect(),
+        ),
+        Vec::new(),
+    );
+
+    writer.write_header().await?;
+
+    let mut block_bytes = 0;
+    let mut dag_walk = DagWalk::breadth_first(subgraph_roots.clone());
+    while let Some((cid, block)) = dag_walk.next(store).await? {
+        if bloom.contains(&cid.to_bytes()) && !subgraph_roots.contains(&cid) {
+            // TODO(matheus23) I think the spec means to prune the whole subgraph.
+            // But
+            // 1. That requires the receiver to check the whole subgraph at that CID to find out whether there's a missing block at the subgraph.
+            // 2. It requires the sender to go through every block under this subgraph down to the leaves to mark all of these CIDs as visited.
+            // Both of these are *huge* traversals. I'd say likely not worth it. The only case I can image they're worth it, is if the DAG
+            // is *heavily* using structural sharing and not tree-like.
+            // Also: This fails completely if the sender is just missing a single leaf. It couldn't add the block to the bloom in that case.
+            dag_walk.skip_walking((cid, block))?;
+            println!("Skipped walking {cid} due to bloom");
+            break;
+        }
+
+        writer.write(cid, &block).await?;
+        println!("Sending {cid}");
+
+        // TODO(matheus23): Count the actual bytes sent?
+        block_bytes += block.len();
+        if block_bytes > config.send_minimum {
+            break;
+        }
+    }
+
+    Ok(writer.finish().await?.into())
+}
+
+/// This handles a car mirror push request on the server side.
+///
+/// The root is the root CID of the DAG that is pushed, the request is a CAR file
+/// with some blocks from the cold call.
+///
+/// Returns a response to answer the client's request with.
+pub async fn server_push_response(
+    root: Cid,
+    request: Bytes,
+    config: &PushConfig,
+    store: &impl BlockStore,
+) -> Result<PushResponse> {
+    let mut dag_verification = IncrementalDagVerification::new([root], store).await?;
+
+    let mut reader = CarReader::new(Cursor::new(request)).await?;
+    let mut block_bytes = 0;
+
+    while let Some((cid, vec)) = reader.next_block().await? {
+        let block = Bytes::from(vec);
+        println!("Received {cid}");
+
+        block_bytes += block.len();
+        if block_bytes > config.receive_maximum {
+            bail!(
+                "Received more than {} bytes ({block_bytes}), aborting request.",
+                config.receive_maximum
+            );
+        }
+
+        dag_verification
+            .verify_and_store_block((cid, block), store)
+            .await?;
+    }
+
+    let subgraph_roots = dag_verification
+        .want_cids
+        .iter()
+        .take(config.max_roots_per_round)
+        .cloned()
+        .collect();
+
+    let mut bloom =
+        BloomFilter::new_from_fpr_po2(dag_verification.have_cids.len() as u64, config.bloom_fpr);
+
+    dag_verification
+        .have_cids
+        .iter()
+        .for_each(|cid| bloom.insert(&cid.to_bytes()));
+
+    Ok(PushResponse {
+        subgraph_roots,
+        // We ignore blooms for now
+        bloom_k: bloom.hash_count() as u32,
+        bloom: bloom.as_bytes().to_vec(),
+    })
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::test_utils::{encode, generate_dag, Rvg};
+    use libipld::{Ipld, IpldCodec};
+    use libipld_core::multihash::{Code, MultihashDigest};
+    use std::collections::BTreeMap;
+    use wnfs_common::MemoryBlockStore;
+
+    #[async_std::test]
+    async fn test_transfer() -> Result<()> {
+        let (blocks, root) = Rvg::new().sample(&generate_dag(256, |cids| {
+            let ipld = Ipld::Map(BTreeMap::from([
+                ("data".into(), Ipld::Bytes(vec![0u8; 10 * 1024])),
+                (
+                    "links".into(),
+                    Ipld::List(cids.into_iter().map(Ipld::Link).collect()),
+                ),
+            ]));
+            let bytes = encode(&ipld);
+            let cid = Cid::new_v1(IpldCodec::DagCbor.into(), Code::Blake3_256.digest(&bytes));
+            (cid, bytes)
+        }));
+
+        let sender_store = &MemoryBlockStore::new();
+        for (cid, bytes) in blocks.iter() {
+            let cid_store = sender_store
+                .put_block(bytes.clone(), IpldCodec::DagCbor.into())
+                .await?;
+            assert_eq!(*cid, cid_store);
+        }
+
+        let receiver_store = &MemoryBlockStore::new();
+        let config = &PushConfig::default();
+        let mut request = client_initiate_push(root, config, sender_store).await?;
+        loop {
+            println!("Sending request {} bytes", request.len());
+            let response = server_push_response(root, request, config, receiver_store).await?;
+            println!(
+                "Response (bloom bytes: {}): {:?}",
+                response.bloom.len(),
+                response.subgraph_roots,
+            );
+            if response.indicates_finished() {
+                break;
+            }
+            request = client_push(root, response, config, sender_store).await?;
+        }
+
+        // receiver should have all data
+        let sender_cids = DagWalk::breadth_first([root])
+            .stream(sender_store)
+            .map_ok(|(cid, _)| cid)
+            .try_collect::<Vec<_>>()
+            .await?;
+        let receiver_cids = DagWalk::breadth_first([root])
+            .stream(receiver_store)
+            .map_ok(|(cid, _)| cid)
+            .try_collect::<Vec<_>>()
+            .await?;
+
+        assert_eq!(sender_cids, receiver_cids);
+
+        Ok(())
+    }
+}

From 51c7dc96b3b815a8bd7cdd9118b9e91c12f2cdc4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= <philipp.krueger1@gmail.com>
Date: Fri, 18 Aug 2023 15:12:22 +0200
Subject: [PATCH 11/35] Remove `println`s, add rough test for round trips

---
 car-mirror/src/lib.rs                     | 15 -----
 car-mirror/src/push.rs                    | 67 ++++++++++++++++-------
 car-mirror/src/test_utils/dag_strategy.rs |  4 +-
 3 files changed, 49 insertions(+), 37 deletions(-)

diff --git a/car-mirror/src/lib.rs b/car-mirror/src/lib.rs
index 2ed447b..5b7779b 100644
--- a/car-mirror/src/lib.rs
+++ b/car-mirror/src/lib.rs
@@ -4,21 +4,6 @@
 
 //! car-mirror
 
-use anyhow::{anyhow, bail, Result};
-use bytes::Bytes;
-use deterministic_bloom::runtime_size::BloomFilter;
-use futures::{stream::try_unfold, Stream, StreamExt, TryStreamExt};
-use iroh_car::{CarHeader, CarReader, CarWriter};
-use libipld::{Ipld, IpldCodec};
-use libipld_core::{cid::Cid, codec::References};
-use messages::PushResponse;
-use std::{
-    collections::{HashSet, VecDeque},
-    eprintln,
-    io::Cursor,
-};
-use wnfs_common::{BlockStore, BlockStoreError};
-
 /// Test utilities.
 #[cfg(any(test, feature = "test_utils"))]
 #[cfg_attr(docsrs, doc(cfg(feature = "test_utils")))]
diff --git a/car-mirror/src/push.rs b/car-mirror/src/push.rs
index f7819f8..13d95f6 100644
--- a/car-mirror/src/push.rs
+++ b/car-mirror/src/push.rs
@@ -116,13 +116,10 @@ pub async fn client_push(
             // Both of these are *huge* traversals. I'd say likely not worth it. The only case I can image they're worth it, is if the DAG
             // is *heavily* using structural sharing and not tree-like.
             // Also: This fails completely if the sender is just missing a single leaf. It couldn't add the block to the bloom in that case.
-            dag_walk.skip_walking((cid, block))?;
-            println!("Skipped walking {cid} due to bloom");
             break;
         }
 
         writer.write(cid, &block).await?;
-        println!("Sending {cid}");
 
         // TODO(matheus23): Count the actual bytes sent?
         block_bytes += block.len();
@@ -153,7 +150,6 @@ pub async fn server_push_response(
 
     while let Some((cid, vec)) = reader.next_block().await? {
         let block = Bytes::from(vec);
-        println!("Received {cid}");
 
         block_bytes += block.len();
         if block_bytes > config.receive_maximum {
@@ -185,7 +181,6 @@ pub async fn server_push_response(
 
     Ok(PushResponse {
         subgraph_roots,
-        // We ignore blooms for now
         bloom_k: bloom.hash_count() as u32,
         bloom: bloom.as_bytes().to_vec(),
     })
@@ -200,11 +195,12 @@ mod tests {
     use std::collections::BTreeMap;
     use wnfs_common::MemoryBlockStore;
 
-    #[async_std::test]
-    async fn test_transfer() -> Result<()> {
-        let (blocks, root) = Rvg::new().sample(&generate_dag(256, |cids| {
+    async fn setup_random_dag<const BLOCK_PADDING: usize>(
+        dag_size: u16,
+    ) -> Result<(Cid, MemoryBlockStore)> {
+        let (blocks, root) = Rvg::new().sample(&generate_dag(dag_size, |cids| {
             let ipld = Ipld::Map(BTreeMap::from([
-                ("data".into(), Ipld::Bytes(vec![0u8; 10 * 1024])),
+                ("data".into(), Ipld::Bytes(vec![0u8; BLOCK_PADDING])),
                 (
                     "links".into(),
                     Ipld::List(cids.into_iter().map(Ipld::Link).collect()),
@@ -215,25 +211,24 @@ mod tests {
             (cid, bytes)
         }));
 
-        let sender_store = &MemoryBlockStore::new();
-        for (cid, bytes) in blocks.iter() {
-            let cid_store = sender_store
-                .put_block(bytes.clone(), IpldCodec::DagCbor.into())
-                .await?;
-            assert_eq!(*cid, cid_store);
+        let store = MemoryBlockStore::new();
+        for (cid, bytes) in blocks.into_iter() {
+            let cid_store = store.put_block(bytes, IpldCodec::DagCbor.into()).await?;
+            assert_eq!(cid, cid_store);
         }
 
+        Ok((root, store))
+    }
+
+    #[async_std::test]
+    async fn test_transfer() -> Result<()> {
+        const BLOCK_PADDING: usize = 10 * 1024;
+        let (root, ref sender_store) = setup_random_dag::<BLOCK_PADDING>(256).await?;
         let receiver_store = &MemoryBlockStore::new();
         let config = &PushConfig::default();
         let mut request = client_initiate_push(root, config, sender_store).await?;
         loop {
-            println!("Sending request {} bytes", request.len());
             let response = server_push_response(root, request, config, receiver_store).await?;
-            println!(
-                "Response (bloom bytes: {}): {:?}",
-                response.bloom.len(),
-                response.subgraph_roots,
-            );
             if response.indicates_finished() {
                 break;
             }
@@ -256,4 +251,34 @@ mod tests {
 
         Ok(())
     }
+
+    #[async_std::test]
+    async fn print_average_number_of_rounds() -> Result<()> {
+        const TESTS: usize = 200;
+        const DAG_SIZE: u16 = 256;
+        const BLOCK_PADDING: usize = 10 * 1024;
+
+        let mut total_rounds = 0;
+        for _ in 0..TESTS {
+            let (root, ref sender_store) = setup_random_dag::<BLOCK_PADDING>(DAG_SIZE).await?;
+            let receiver_store = &MemoryBlockStore::new();
+            let config = &PushConfig::default();
+            let mut request = client_initiate_push(root, config, sender_store).await?;
+            loop {
+                let response = server_push_response(root, request, config, receiver_store).await?;
+                total_rounds += 1;
+                if response.indicates_finished() {
+                    break;
+                }
+                request = client_push(root, response, config, sender_store).await?;
+            }
+        }
+
+        println!(
+            "Average # of rounds: {}",
+            total_rounds as f64 / TESTS as f64
+        );
+
+        Ok(())
+    }
 }
diff --git a/car-mirror/src/test_utils/dag_strategy.rs b/car-mirror/src/test_utils/dag_strategy.rs
index fc7ce00..7e0ac52 100644
--- a/car-mirror/src/test_utils/dag_strategy.rs
+++ b/car-mirror/src/test_utils/dag_strategy.rs
@@ -23,7 +23,9 @@ pub fn generate_dag<T: Debug + Clone>(
     arb_dag(1..max_nodes, 0.5).prop_map(move |dag| dag_to_nodes(&dag, generate_block))
 }
 
-fn dag_to_nodes<T>(
+/// Turn a directed acyclic graph into a list of nodes (with their CID) and a root CID.
+/// This will select only the DAG that's reachable from the root.
+pub fn dag_to_nodes<T>(
     dag: &DirectedAcyclicGraph,
     generate_node: fn(Vec<Cid>) -> (Cid, T),
 ) -> (Vec<(Cid, T)>, Cid) {

From d4c7729faaaadd6d62e69f0f34f98cfd8085ba44 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= <philipp.krueger1@gmail.com>
Date: Fri, 18 Aug 2023 17:21:06 +0200
Subject: [PATCH 12/35] Generate bigger random DAGs

---
 Cargo.toml                                |   1 +
 car-mirror/src/dag_walk.rs                |   2 +-
 car-mirror/src/push.rs                    | 117 ++++++++++++++++------
 car-mirror/src/test_utils/dag_strategy.rs |  26 +++--
 4 files changed, 108 insertions(+), 38 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 408bb4f..96fa3b8 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -23,3 +23,4 @@ opt-level = "s" # or 'z' to optimize "aggressively" for size
 # See https://blog.rust-lang.org/2021/03/25/Rust-1.51.0.html#splitting-debug-information
 [profile.dev]
 split-debuginfo = "unpacked"
+opt-level = 3
\ No newline at end of file
diff --git a/car-mirror/src/dag_walk.rs b/car-mirror/src/dag_walk.rs
index 0f57740..6210272 100644
--- a/car-mirror/src/dag_walk.rs
+++ b/car-mirror/src/dag_walk.rs
@@ -174,7 +174,7 @@ mod proptests {
     use wnfs_common::{BlockStore, MemoryBlockStore};
 
     fn ipld_dags() -> impl Strategy<Value = (Vec<(Cid, Ipld)>, Cid)> {
-        generate_dag(256, |cids| {
+        generate_dag(256, |cids, _| {
             let ipld = Ipld::List(cids.into_iter().map(Ipld::Link).collect());
             let cid = Cid::new_v1(
                 IpldCodec::DagCbor.into(),
diff --git a/car-mirror/src/push.rs b/car-mirror/src/push.rs
index 13d95f6..e1e2311 100644
--- a/car-mirror/src/push.rs
+++ b/car-mirror/src/push.rs
@@ -192,15 +192,25 @@ mod tests {
     use crate::test_utils::{encode, generate_dag, Rvg};
     use libipld::{Ipld, IpldCodec};
     use libipld_core::multihash::{Code, MultihashDigest};
+    use proptest::prelude::Rng;
     use std::collections::BTreeMap;
     use wnfs_common::MemoryBlockStore;
 
+    #[derive(Clone, Debug)]
+    struct Metrics {
+        request_bytes: usize,
+        response_bytes: usize,
+    }
+
     async fn setup_random_dag<const BLOCK_PADDING: usize>(
         dag_size: u16,
     ) -> Result<(Cid, MemoryBlockStore)> {
-        let (blocks, root) = Rvg::new().sample(&generate_dag(dag_size, |cids| {
+        let (blocks, root) = Rvg::new().sample(&generate_dag(dag_size, |cids, rng| {
             let ipld = Ipld::Map(BTreeMap::from([
-                ("data".into(), Ipld::Bytes(vec![0u8; BLOCK_PADDING])),
+                (
+                    "data".into(),
+                    Ipld::Bytes((0..BLOCK_PADDING).map(|_| rng.gen::<u8>()).collect()),
+                ),
                 (
                     "links".into(),
                     Ipld::List(cids.into_iter().map(Ipld::Link).collect()),
@@ -220,64 +230,113 @@ mod tests {
         Ok((root, store))
     }
 
-    #[async_std::test]
-    async fn test_transfer() -> Result<()> {
-        const BLOCK_PADDING: usize = 10 * 1024;
-        let (root, ref sender_store) = setup_random_dag::<BLOCK_PADDING>(256).await?;
-        let receiver_store = &MemoryBlockStore::new();
-        let config = &PushConfig::default();
-        let mut request = client_initiate_push(root, config, sender_store).await?;
+    async fn simulate_protocol(
+        root: Cid,
+        config: &PushConfig,
+        client_store: &MemoryBlockStore,
+        server_store: &MemoryBlockStore,
+    ) -> Result<Vec<Metrics>> {
+        let mut metrics = Vec::new();
+        let mut request = client_initiate_push(root, config, client_store).await?;
         loop {
-            let response = server_push_response(root, request, config, receiver_store).await?;
+            let request_bytes = request.len();
+            let response = server_push_response(root, request, config, server_store).await?;
+            let response_bytes = serde_ipld_dagcbor::to_vec(&response)?.len();
+
+            metrics.push(Metrics {
+                request_bytes,
+                response_bytes,
+            });
+
             if response.indicates_finished() {
                 break;
             }
-            request = client_push(root, response, config, sender_store).await?;
+            request = client_push(root, response, config, client_store).await?;
         }
 
+        Ok(metrics)
+    }
+
+    async fn total_dag_bytes(root: Cid, store: &impl BlockStore) -> Result<usize> {
+        Ok(DagWalk::breadth_first([root])
+            .stream(store)
+            .map_ok(|(_, block)| block.len())
+            .try_collect::<Vec<_>>()
+            .await?
+            .into_iter()
+            .sum::<usize>())
+    }
+
+    async fn total_dag_blocks(root: Cid, store: &impl BlockStore) -> Result<usize> {
+        Ok(DagWalk::breadth_first([root])
+            .stream(store)
+            .map_ok(|(_, block)| block.len())
+            .try_collect::<Vec<_>>()
+            .await?
+            .len())
+    }
+
+    #[async_std::test]
+    async fn test_transfer() -> Result<()> {
+        const BLOCK_PADDING: usize = 10 * 1024;
+        let (root, ref client_store) = setup_random_dag::<BLOCK_PADDING>(256).await?;
+        let server_store = &MemoryBlockStore::new();
+        simulate_protocol(root, &PushConfig::default(), client_store, server_store).await?;
+
         // receiver should have all data
-        let sender_cids = DagWalk::breadth_first([root])
-            .stream(sender_store)
+        let client_cids = DagWalk::breadth_first([root])
+            .stream(client_store)
             .map_ok(|(cid, _)| cid)
             .try_collect::<Vec<_>>()
             .await?;
-        let receiver_cids = DagWalk::breadth_first([root])
-            .stream(receiver_store)
+        let server_cids = DagWalk::breadth_first([root])
+            .stream(server_store)
             .map_ok(|(cid, _)| cid)
             .try_collect::<Vec<_>>()
             .await?;
 
-        assert_eq!(sender_cids, receiver_cids);
+        assert_eq!(client_cids, server_cids);
 
         Ok(())
     }
 
     #[async_std::test]
-    async fn print_average_number_of_rounds() -> Result<()> {
+    async fn print_metrics() -> Result<()> {
         const TESTS: usize = 200;
         const DAG_SIZE: u16 = 256;
         const BLOCK_PADDING: usize = 10 * 1024;
 
         let mut total_rounds = 0;
+        let mut total_blocks = 0;
+        let mut total_block_bytes = 0;
+        let mut total_network_bytes = 0;
         for _ in 0..TESTS {
-            let (root, ref sender_store) = setup_random_dag::<BLOCK_PADDING>(DAG_SIZE).await?;
-            let receiver_store = &MemoryBlockStore::new();
-            let config = &PushConfig::default();
-            let mut request = client_initiate_push(root, config, sender_store).await?;
-            loop {
-                let response = server_push_response(root, request, config, receiver_store).await?;
-                total_rounds += 1;
-                if response.indicates_finished() {
-                    break;
-                }
-                request = client_push(root, response, config, sender_store).await?;
-            }
+            let (root, ref client_store) = setup_random_dag::<BLOCK_PADDING>(DAG_SIZE).await?;
+            let server_store = &MemoryBlockStore::new();
+            let metrics =
+                simulate_protocol(root, &PushConfig::default(), client_store, server_store).await?;
+
+            total_rounds += metrics.len();
+            total_blocks += total_dag_blocks(root, client_store).await?;
+            total_block_bytes += total_dag_bytes(root, client_store).await?;
+            total_network_bytes += metrics
+                .iter()
+                .map(|metric| metric.request_bytes + metric.response_bytes)
+                .sum::<usize>();
         }
 
         println!(
             "Average # of rounds: {}",
             total_rounds as f64 / TESTS as f64
         );
+        println!(
+            "Average # of blocks: {}",
+            total_blocks as f64 / TESTS as f64
+        );
+        println!(
+            "Average network overhead: {}%",
+            (total_network_bytes as f64 / total_block_bytes as f64 - 1.0) * 100.0
+        );
 
         Ok(())
     }
diff --git a/car-mirror/src/test_utils/dag_strategy.rs b/car-mirror/src/test_utils/dag_strategy.rs
index 7e0ac52..9e91b0b 100644
--- a/car-mirror/src/test_utils/dag_strategy.rs
+++ b/car-mirror/src/test_utils/dag_strategy.rs
@@ -3,7 +3,7 @@ use std::{collections::HashSet, fmt::Debug};
 use bytes::Bytes;
 use libipld::{Cid, Ipld, IpldCodec};
 use libipld_core::codec::Encode;
-use proptest::strategy::Strategy;
+use proptest::{strategy::Strategy, test_runner::TestRng};
 use roaring_graphs::{arb_dag, DirectedAcyclicGraph, Vertex};
 
 /// Encode some IPLD as dag-cbor
@@ -18,20 +18,22 @@ pub fn encode(ipld: &Ipld) -> Bytes {
 /// the root block's CID.
 pub fn generate_dag<T: Debug + Clone>(
     max_nodes: u16,
-    generate_block: fn(Vec<Cid>) -> (Cid, T),
+    generate_block: fn(Vec<Cid>, rng: &mut TestRng) -> (Cid, T),
 ) -> impl Strategy<Value = (Vec<(Cid, T)>, Cid)> {
-    arb_dag(1..max_nodes, 0.5).prop_map(move |dag| dag_to_nodes(&dag, generate_block))
+    arb_dag(1..max_nodes, 0.5)
+        .prop_perturb(move |dag, mut rng| dag_to_nodes(&dag, &mut rng, generate_block))
 }
 
 /// Turn a directed acyclic graph into a list of nodes (with their CID) and a root CID.
 /// This will select only the DAG that's reachable from the root.
 pub fn dag_to_nodes<T>(
     dag: &DirectedAcyclicGraph,
-    generate_node: fn(Vec<Cid>) -> (Cid, T),
+    rng: &mut TestRng,
+    generate_node: fn(Vec<Cid>, &mut TestRng) -> (Cid, T),
 ) -> (Vec<(Cid, T)>, Cid) {
     let mut blocks = Vec::new();
     let mut visited = HashSet::new();
-    let (cid, block) = dag_to_nodes_helper(dag, 0, generate_node, &mut blocks, &mut visited);
+    let (cid, block) = dag_to_nodes_helper(dag, 0, rng, generate_node, &mut blocks, &mut visited);
     blocks.push((cid, block));
     (blocks, cid)
 }
@@ -39,7 +41,8 @@ pub fn dag_to_nodes<T>(
 fn dag_to_nodes_helper<T>(
     dag: &DirectedAcyclicGraph,
     root: Vertex,
-    generate_node: fn(Vec<Cid>) -> (Cid, T),
+    rng: &mut TestRng,
+    generate_node: fn(Vec<Cid>, &mut TestRng) -> (Cid, T),
     arr: &mut Vec<(Cid, T)>,
     visited: &mut HashSet<Vertex>,
 ) -> (Cid, T) {
@@ -49,9 +52,16 @@ fn dag_to_nodes_helper<T>(
             continue;
         }
         visited.insert(child);
-        child_blocks.push(dag_to_nodes_helper(dag, child, generate_node, arr, visited));
+        child_blocks.push(dag_to_nodes_helper(
+            dag,
+            child,
+            rng,
+            generate_node,
+            arr,
+            visited,
+        ));
     }
-    let result = generate_node(child_blocks.iter().map(|(cid, _)| *cid).collect());
+    let result = generate_node(child_blocks.iter().map(|(cid, _)| *cid).collect(), rng);
     arr.extend(child_blocks);
     result
 }

From 57a0d9f5294cd372ca0656b7f4f4841c3d31c6ec Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= <philipp.krueger1@gmail.com>
Date: Fri, 18 Aug 2023 17:30:14 +0200
Subject: [PATCH 13/35] Use recommended FPR computation from the spec

---
 car-mirror/src/push.rs | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/car-mirror/src/push.rs b/car-mirror/src/push.rs
index e1e2311..d3da5a7 100644
--- a/car-mirror/src/push.rs
+++ b/car-mirror/src/push.rs
@@ -23,7 +23,7 @@ pub struct PushConfig {
     /// and that the client will consume.
     pub max_roots_per_round: usize,
     /// The target false positive rate for the bloom filter that the server sends.
-    pub bloom_fpr: f64,
+    pub bloom_fpr: fn(u64) -> f64,
 }
 
 impl Default for PushConfig {
@@ -32,7 +32,7 @@ impl Default for PushConfig {
             send_minimum: 128 * 1024,    // 128KiB
             receive_maximum: 512 * 1024, // 512KiB
             max_roots_per_round: 1000,   // max. ~41KB of CIDs
-            bloom_fpr: 1.0 / 10_000.0,   // 0.1%
+            bloom_fpr: |num_of_elems| 0.1 / num_of_elems as f64,
         }
     }
 }
@@ -171,8 +171,10 @@ pub async fn server_push_response(
         .cloned()
         .collect();
 
+    let bloom_capacity = dag_verification.have_cids.len() as u64;
+
     let mut bloom =
-        BloomFilter::new_from_fpr_po2(dag_verification.have_cids.len() as u64, config.bloom_fpr);
+        BloomFilter::new_from_fpr_po2(bloom_capacity, (config.bloom_fpr)(bloom_capacity));
 
     dag_verification
         .have_cids

From 61cda9556b72ec002d36b95c983578399dc60dc2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= <philipp.krueger1@gmail.com>
Date: Fri, 18 Aug 2023 17:56:43 +0200
Subject: [PATCH 14/35] Test case for deduplicating transfer

---
 car-mirror/src/push.rs                    | 34 ++++++++++++++++++++--
 car-mirror/src/test_utils/dag_strategy.rs | 11 +------
 car-mirror/src/test_utils/mod.rs          | 35 +++++++++++++++++++++++
 3 files changed, 68 insertions(+), 12 deletions(-)

diff --git a/car-mirror/src/push.rs b/car-mirror/src/push.rs
index d3da5a7..8b0e2e9 100644
--- a/car-mirror/src/push.rs
+++ b/car-mirror/src/push.rs
@@ -191,10 +191,10 @@ pub async fn server_push_response(
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::test_utils::{encode, generate_dag, Rvg};
+    use crate::test_utils::{encode, generate_dag, get_cid_at_approx_path, Rvg};
     use libipld::{Ipld, IpldCodec};
     use libipld_core::multihash::{Code, MultihashDigest};
-    use proptest::prelude::Rng;
+    use proptest::{collection::vec, prelude::Rng};
     use std::collections::BTreeMap;
     use wnfs_common::MemoryBlockStore;
 
@@ -302,6 +302,31 @@ mod tests {
         Ok(())
     }
 
+    #[async_std::test]
+    async fn test_deduplicating_transfer() -> Result<()> {
+        const BLOCK_PADDING: usize = 10 * 1024;
+        let (root, ref client_store) = setup_random_dag::<BLOCK_PADDING>(256).await?;
+        let total_bytes = total_dag_bytes(root, client_store).await?;
+        let path = Rvg::new().sample(&vec(0usize..128, 0..64));
+        let second_root = get_cid_at_approx_path(path, root, client_store).await?;
+
+        let server_store = &MemoryBlockStore::new();
+        let config = &PushConfig::default();
+        let metrics1 = simulate_protocol(second_root, config, client_store, server_store).await?;
+        let metrics2 = simulate_protocol(root, config, client_store, server_store).await?;
+
+        let total_network_bytes = metrics1
+            .into_iter()
+            .chain(metrics2.into_iter())
+            .map(|metric| metric.request_bytes + metric.response_bytes)
+            .sum::<usize>();
+
+        println!("Total DAG bytes: {total_bytes}");
+        println!("Total network bytes: {total_network_bytes}");
+
+        Ok(())
+    }
+
     #[async_std::test]
     async fn print_metrics() -> Result<()> {
         const TESTS: usize = 200;
@@ -343,3 +368,8 @@ mod tests {
         Ok(())
     }
 }
+
+#[cfg(test)]
+mod proptests {
+    use super::*;
+}
diff --git a/car-mirror/src/test_utils/dag_strategy.rs b/car-mirror/src/test_utils/dag_strategy.rs
index 9e91b0b..144a927 100644
--- a/car-mirror/src/test_utils/dag_strategy.rs
+++ b/car-mirror/src/test_utils/dag_strategy.rs
@@ -1,18 +1,9 @@
 use std::{collections::HashSet, fmt::Debug};
 
-use bytes::Bytes;
-use libipld::{Cid, Ipld, IpldCodec};
-use libipld_core::codec::Encode;
+use libipld::Cid;
 use proptest::{strategy::Strategy, test_runner::TestRng};
 use roaring_graphs::{arb_dag, DirectedAcyclicGraph, Vertex};
 
-/// Encode some IPLD as dag-cbor
-pub fn encode(ipld: &Ipld) -> Bytes {
-    let mut vec = Vec::new();
-    ipld.encode(IpldCodec::DagCbor, &mut vec).unwrap(); // TODO(matheus23) unwrap
-    Bytes::from(vec)
-}
-
 /// A strategy for use with proptest to generate random DAGs (directed acyclic graphs).
 /// The strategy generates a list of blocks of type T and their CIDs, as well as
 /// the root block's CID.
diff --git a/car-mirror/src/test_utils/mod.rs b/car-mirror/src/test_utils/mod.rs
index 890a5ad..8442d38 100644
--- a/car-mirror/src/test_utils/mod.rs
+++ b/car-mirror/src/test_utils/mod.rs
@@ -1,3 +1,10 @@
+use crate::common::references;
+use anyhow::Result;
+use bytes::Bytes;
+use libipld::{Cid, Ipld, IpldCodec};
+use libipld_core::codec::Encode;
+use wnfs_common::BlockStore;
+
 #[cfg(feature = "test_utils")]
 mod dag_strategy;
 /// Random value generator for sampling data.
@@ -7,3 +14,31 @@ mod rvg;
 pub use dag_strategy::*;
 #[cfg(feature = "test_utils")]
 pub use rvg::*;
+
+/// Encode some IPLD as dag-cbor
+pub fn encode(ipld: &Ipld) -> Bytes {
+    let mut vec = Vec::new();
+    ipld.encode(IpldCodec::DagCbor, &mut vec).unwrap(); // TODO(matheus23) unwrap
+    Bytes::from(vec)
+}
+
+/// Walk a root DAG along some path.
+/// At each node, take the `n % numlinks`th link,
+/// and only walk the path as long as there are further links.
+pub async fn get_cid_at_approx_path(
+    path: Vec<usize>,
+    root: Cid,
+    store: &impl BlockStore,
+) -> Result<Cid> {
+    let mut working_cid = root;
+    for nth in path {
+        let block = store.get_block(&working_cid).await?;
+        let refs = references(working_cid, block)?;
+        if refs.is_empty() {
+            break;
+        }
+
+        working_cid = refs[nth % refs.len()];
+    }
+    Ok(working_cid)
+}

From eb53a4460acdf001d2c3f6a67b80497eca124291 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= <philipp.krueger1@gmail.com>
Date: Mon, 21 Aug 2023 10:44:22 +0200
Subject: [PATCH 15/35] Delete irrelevant TODO

---
 car-mirror/src/push.rs | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/car-mirror/src/push.rs b/car-mirror/src/push.rs
index 8b0e2e9..c3679f3 100644
--- a/car-mirror/src/push.rs
+++ b/car-mirror/src/push.rs
@@ -109,13 +109,6 @@ pub async fn client_push(
     let mut dag_walk = DagWalk::breadth_first(subgraph_roots.clone());
     while let Some((cid, block)) = dag_walk.next(store).await? {
         if bloom.contains(&cid.to_bytes()) && !subgraph_roots.contains(&cid) {
-            // TODO(matheus23) I think the spec means to prune the whole subgraph.
-            // But
-            // 1. That requires the receiver to check the whole subgraph at that CID to find out whether there's a missing block at the subgraph.
-            // 2. It requires the sender to go through every block under this subgraph down to the leaves to mark all of these CIDs as visited.
-            // Both of these are *huge* traversals. I'd say likely not worth it. The only case I can image they're worth it, is if the DAG
-            // is *heavily* using structural sharing and not tree-like.
-            // Also: This fails completely if the sender is just missing a single leaf. It couldn't add the block to the bloom in that case.
             break;
         }
 

From b926d54e2828dba211410dfb78c98cb762cd83c5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= <philipp.krueger1@gmail.com>
Date: Mon, 21 Aug 2023 11:54:51 +0200
Subject: [PATCH 16/35] Implement pull protocol.

Also:
- Put `CarFile` bytes into a newtype
- Abstract out push&pull protocol parts into `common`
- Abstract out test utilities
---
 car-mirror/src/common.rs         | 261 ++++++++++++++++++++++++++++++-
 car-mirror/src/lib.rs            |   4 +-
 car-mirror/src/messages.rs       |   7 +
 car-mirror/src/pull.rs           |  93 +++++++++++
 car-mirror/src/push.rs           | 260 +++++-------------------------
 car-mirror/src/test_utils/mod.rs |  70 ++++++++-
 6 files changed, 466 insertions(+), 229 deletions(-)
 create mode 100644 car-mirror/src/pull.rs

diff --git a/car-mirror/src/common.rs b/car-mirror/src/common.rs
index bbf0cad..cd3229a 100644
--- a/car-mirror/src/common.rs
+++ b/car-mirror/src/common.rs
@@ -1,7 +1,175 @@
-use anyhow::{anyhow, Result};
+use anyhow::{anyhow, bail, Result};
+use bytes::Bytes;
+use deterministic_bloom::runtime_size::BloomFilter;
+use futures::TryStreamExt;
+use iroh_car::{CarHeader, CarReader, CarWriter};
 use libipld::{Ipld, IpldCodec};
 use libipld_core::{cid::Cid, codec::References};
 use std::io::Cursor;
+use wnfs_common::BlockStore;
+
+use crate::{
+    dag_walk::DagWalk,
+    incremental_verification::IncrementalDagVerification,
+    messages::{PullRequest, PushResponse},
+};
+
+//--------------------------------------------------------------------------------------------------
+// Types
+//--------------------------------------------------------------------------------------------------
+
+/// Configuration values (such as byte limits) for the CAR mirror protocol
+#[derive(Clone, Debug)]
+pub struct Config {
+    /// A client will try to send at least `send_minimum` bytes of block data
+    /// in each request, except if close to the end of the protocol (when there's
+    /// not that much data left).
+    pub send_minimum: usize,
+    /// The maximum number of bytes per request that the server accepts.
+    pub receive_maximum: usize,
+    /// The maximum number of roots per request that the server will send to the client,
+    /// and that the client will consume.
+    pub max_roots_per_round: usize,
+    /// The target false positive rate for the bloom filter that the server sends.
+    pub bloom_fpr: fn(u64) -> f64,
+}
+
+#[derive(Debug, Clone)]
+pub struct ReceiverState {
+    pub missing_subgraph_roots: Vec<Cid>,
+    pub have_cids_bloom: Option<BloomFilter>,
+}
+
+/// Newtype around bytes that are supposed to represent a CAR file
+#[derive(Debug, Clone)]
+pub struct CarFile {
+    pub bytes: Bytes,
+}
+
+//--------------------------------------------------------------------------------------------------
+// Functions
+//--------------------------------------------------------------------------------------------------
+
+pub async fn block_send(
+    root: Cid,
+    last_state: Option<ReceiverState>,
+    config: &Config,
+    store: &impl BlockStore,
+) -> Result<CarFile> {
+    let ReceiverState {
+        ref missing_subgraph_roots,
+        have_cids_bloom,
+    } = last_state.unwrap_or(ReceiverState {
+        missing_subgraph_roots: vec![root],
+        have_cids_bloom: None,
+    });
+
+    // Verify that all missing subgraph roots are in the relevant DAG:
+    let subgraph_roots: Vec<Cid> = DagWalk::breadth_first([root])
+        .stream(store)
+        .try_filter_map(|(cid, _)| async move {
+            Ok(missing_subgraph_roots.contains(&cid).then_some(cid))
+        })
+        .try_collect()
+        .await?;
+
+    let bloom = have_cids_bloom.unwrap_or(BloomFilter::new_with(1, Box::new([0]))); // An empty bloom that contains nothing
+
+    let mut writer = CarWriter::new(
+        CarHeader::new_v1(
+            // TODO(matheus23): This is stupid
+            // CAR files *must* have at least one CID in them, and all of them
+            // need to appear as a block in the payload.
+            // It would probably make most sense to just write all subgraph roots into this,
+            // but we don't know how many of the subgraph roots fit into this round yet,
+            // so we're simply writing the first one in here, since we know
+            // at least one block will be written (and it'll be that one).
+            subgraph_roots.iter().take(1).cloned().collect(),
+        ),
+        Vec::new(),
+    );
+
+    writer.write_header().await?;
+
+    let mut block_bytes = 0;
+    let mut dag_walk = DagWalk::breadth_first(subgraph_roots.clone());
+    while let Some((cid, block)) = dag_walk.next(store).await? {
+        if bloom.contains(&cid.to_bytes()) && !subgraph_roots.contains(&cid) {
+            break;
+        }
+
+        writer.write(cid, &block).await?;
+
+        // TODO(matheus23): Count the actual bytes sent?
+        block_bytes += block.len();
+        if block_bytes > config.send_minimum {
+            break;
+        }
+    }
+
+    Ok(CarFile {
+        bytes: writer.finish().await?.into(),
+    })
+}
+
+pub async fn block_receive(
+    root: Cid,
+    last_car: Option<CarFile>,
+    config: &Config,
+    store: &impl BlockStore,
+) -> Result<ReceiverState> {
+    let mut dag_verification = IncrementalDagVerification::new([root], store).await?;
+
+    if let Some(car) = last_car {
+        let mut reader = CarReader::new(Cursor::new(car.bytes)).await?;
+        let mut block_bytes = 0;
+
+        while let Some((cid, vec)) = reader.next_block().await? {
+            let block = Bytes::from(vec);
+
+            block_bytes += block.len();
+            if block_bytes > config.receive_maximum {
+                bail!(
+                    "Received more than {} bytes ({block_bytes}), aborting request.",
+                    config.receive_maximum
+                );
+            }
+
+            dag_verification
+                .verify_and_store_block((cid, block), store)
+                .await?;
+        }
+    }
+
+    let missing_subgraph_roots = dag_verification
+        .want_cids
+        .iter()
+        .take(config.max_roots_per_round)
+        .cloned()
+        .collect();
+
+    let bloom_capacity = dag_verification.have_cids.len() as u64;
+
+    if bloom_capacity == 0 {
+        return Ok(ReceiverState {
+            missing_subgraph_roots,
+            have_cids_bloom: None,
+        });
+    }
+
+    let mut bloom =
+        BloomFilter::new_from_fpr_po2(bloom_capacity, (config.bloom_fpr)(bloom_capacity));
+
+    dag_verification
+        .have_cids
+        .iter()
+        .for_each(|cid| bloom.insert(&cid.to_bytes()));
+
+    Ok(ReceiverState {
+        missing_subgraph_roots,
+        have_cids_bloom: Some(bloom),
+    })
+}
 
 /// Find all CIDs that a block references.
 ///
@@ -18,3 +186,94 @@ pub fn references(cid: Cid, block: impl AsRef<[u8]>) -> Result<Vec<Cid>> {
     <Ipld as References<IpldCodec>>::references(codec, &mut Cursor::new(block), &mut refs)?;
     Ok(refs)
 }
+
+//--------------------------------------------------------------------------------------------------
+// Implementations
+//--------------------------------------------------------------------------------------------------
+
+impl ReceiverState {
+    pub fn from_push_response(push: PushResponse) -> Self {
+        let PushResponse {
+            subgraph_roots,
+            bloom_k,
+            bloom,
+        } = push;
+
+        Self {
+            missing_subgraph_roots: subgraph_roots,
+            have_cids_bloom: Self::bloom_deserialize(bloom_k, bloom),
+        }
+    }
+
+    pub fn from_pull_request(pull: PullRequest) -> Self {
+        let PullRequest {
+            resources,
+            bloom_k,
+            bloom,
+        } = pull;
+
+        Self {
+            missing_subgraph_roots: resources,
+            have_cids_bloom: Self::bloom_deserialize(bloom_k, bloom),
+        }
+    }
+
+    pub fn into_push_response(self) -> PushResponse {
+        let ReceiverState {
+            missing_subgraph_roots,
+            have_cids_bloom,
+        } = self;
+
+        let (bloom_k, bloom) = Self::bloom_serialize(have_cids_bloom);
+
+        PushResponse {
+            subgraph_roots: missing_subgraph_roots,
+            bloom_k,
+            bloom,
+        }
+    }
+
+    pub fn into_pull_request(self) -> PullRequest {
+        let ReceiverState {
+            missing_subgraph_roots,
+            have_cids_bloom,
+        } = self;
+
+        let (bloom_k, bloom) = Self::bloom_serialize(have_cids_bloom);
+
+        PullRequest {
+            resources: missing_subgraph_roots,
+            bloom_k,
+            bloom,
+        }
+    }
+
+    fn bloom_serialize(bloom: Option<BloomFilter>) -> (u32, Vec<u8>) {
+        match bloom {
+            Some(bloom) => (bloom.hash_count() as u32, bloom.as_bytes().to_vec()),
+            None => (3, Vec::new()),
+        }
+    }
+
+    fn bloom_deserialize(bloom_k: u32, bloom: Vec<u8>) -> Option<BloomFilter> {
+        if bloom.is_empty() {
+            None
+        } else {
+            Some(BloomFilter::new_with(
+                bloom_k as usize,
+                bloom.into_boxed_slice(),
+            ))
+        }
+    }
+}
+
+impl Default for Config {
+    fn default() -> Self {
+        Self {
+            send_minimum: 128 * 1024,    // 128KiB
+            receive_maximum: 512 * 1024, // 512KiB
+            max_roots_per_round: 1000,   // max. ~41KB of CIDs
+            bloom_fpr: |num_of_elems| 0.1 / num_of_elems as f64,
+        }
+    }
+}
diff --git a/car-mirror/src/lib.rs b/car-mirror/src/lib.rs
index 5b7779b..04ef44a 100644
--- a/car-mirror/src/lib.rs
+++ b/car-mirror/src/lib.rs
@@ -17,5 +17,7 @@ pub mod dag_walk;
 pub mod incremental_verification;
 /// Data types that are sent over-the-wire and relevant serialization code.
 pub mod messages;
-/// The CAR mirror push protocol
+/// The CAR mirror pull protocol. Meant to be used qualified, i.e. `pull::request` and `pull::response`
+pub mod pull;
+/// The CAR mirror push protocol. Meant to be used qualified, i.e. `push::request` and `push::response`
 pub mod push;
diff --git a/car-mirror/src/messages.rs b/car-mirror/src/messages.rs
index 283d55c..e1471c1 100644
--- a/car-mirror/src/messages.rs
+++ b/car-mirror/src/messages.rs
@@ -65,3 +65,10 @@ impl PushResponse {
         self.subgraph_roots.is_empty()
     }
 }
+
+impl PullRequest {
+    /// Whether the protocol is finished. If true, there is nothing left to request and this request doesn't need to be sent.
+    pub fn indicates_finished(&self) -> bool {
+        self.resources.is_empty()
+    }
+}
diff --git a/car-mirror/src/pull.rs b/car-mirror/src/pull.rs
new file mode 100644
index 0000000..20d6d1b
--- /dev/null
+++ b/car-mirror/src/pull.rs
@@ -0,0 +1,93 @@
+use crate::{
+    common::{block_receive, block_send, CarFile, Config, ReceiverState},
+    messages::PullRequest,
+};
+use anyhow::Result;
+use libipld::Cid;
+use wnfs_common::BlockStore;
+
+pub async fn request(
+    root: Cid,
+    last_response: Option<CarFile>,
+    config: &Config,
+    store: &impl BlockStore,
+) -> Result<PullRequest> {
+    Ok(block_receive(root, last_response, config, store)
+        .await?
+        .into_pull_request())
+}
+
+pub async fn response(
+    root: Cid,
+    request: PullRequest,
+    config: &Config,
+    store: &impl BlockStore,
+) -> Result<CarFile> {
+    let receiver_state = Some(ReceiverState::from_pull_request(request));
+    block_send(root, receiver_state, config, store).await
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::{
+        common::Config,
+        dag_walk::DagWalk,
+        test_utils::{setup_random_dag, Metrics},
+    };
+    use anyhow::Result;
+    use futures::TryStreamExt;
+    use libipld::Cid;
+    use wnfs_common::MemoryBlockStore;
+
+    async fn simulate_protocol(
+        root: Cid,
+        config: &Config,
+        client_store: &MemoryBlockStore,
+        server_store: &MemoryBlockStore,
+    ) -> Result<Vec<Metrics>> {
+        let mut metrics = Vec::new();
+        let mut request = crate::pull::request(root, None, config, client_store).await?;
+        loop {
+            let request_bytes = serde_ipld_dagcbor::to_vec(&request)?.len();
+            let response = crate::pull::response(root, request, config, server_store).await?;
+            let response_bytes = response.bytes.len();
+
+            metrics.push(Metrics {
+                request_bytes,
+                response_bytes,
+            });
+
+            request = crate::pull::request(root, Some(response), config, client_store).await?;
+            if request.indicates_finished() {
+                break;
+            }
+        }
+
+        Ok(metrics)
+    }
+
+    #[async_std::test]
+    async fn test_transfer() -> Result<()> {
+        const BLOCK_PADDING: usize = 10 * 1024; // 10KiB
+        let client_store = &MemoryBlockStore::new();
+        let (root, ref server_store) = setup_random_dag::<BLOCK_PADDING>(256).await?;
+
+        simulate_protocol(root, &Config::default(), client_store, server_store).await?;
+
+        // client should have all data
+        let client_cids = DagWalk::breadth_first([root])
+            .stream(client_store)
+            .map_ok(|(cid, _)| cid)
+            .try_collect::<Vec<_>>()
+            .await?;
+        let server_cids = DagWalk::breadth_first([root])
+            .stream(server_store)
+            .map_ok(|(cid, _)| cid)
+            .try_collect::<Vec<_>>()
+            .await?;
+
+        assert_eq!(client_cids, server_cids);
+
+        Ok(())
+    }
+}
diff --git a/car-mirror/src/push.rs b/car-mirror/src/push.rs
index c3679f3..3d37940 100644
--- a/car-mirror/src/push.rs
+++ b/car-mirror/src/push.rs
@@ -1,62 +1,13 @@
 use crate::{
-    dag_walk::DagWalk, incremental_verification::IncrementalDagVerification, messages::PushResponse,
+    common::{block_receive, block_send, CarFile, Config, ReceiverState},
+    messages::PushResponse,
 };
-use anyhow::{bail, Result};
-use bytes::Bytes;
-use deterministic_bloom::runtime_size::BloomFilter;
-use futures::TryStreamExt;
-use iroh_car::{CarHeader, CarReader, CarWriter};
+use anyhow::Result;
 use libipld_core::cid::Cid;
-use std::io::Cursor;
 use wnfs_common::BlockStore;
 
-/// Configuration values (such as byte limits) for the CAR mirror push protocol
-#[derive(Clone, Debug)]
-pub struct PushConfig {
-    /// A client will try to send at least `send_minimum` bytes of block data
-    /// in each request, except if close to the end of the protocol (when there's)
-    /// not that much data left.
-    pub send_minimum: usize,
-    /// The maximum number of bytes per request that the server accepts.
-    pub receive_maximum: usize,
-    /// The maximum number of roots per request that the server will send to the client,
-    /// and that the client will consume.
-    pub max_roots_per_round: usize,
-    /// The target false positive rate for the bloom filter that the server sends.
-    pub bloom_fpr: fn(u64) -> f64,
-}
-
-impl Default for PushConfig {
-    fn default() -> Self {
-        Self {
-            send_minimum: 128 * 1024,    // 128KiB
-            receive_maximum: 512 * 1024, // 512KiB
-            max_roots_per_round: 1000,   // max. ~41KB of CIDs
-            bloom_fpr: |num_of_elems| 0.1 / num_of_elems as f64,
-        }
-    }
-}
-
-/// Initiate a car mirror push request.
-///
-/// The goal is to transfer the DAG below the root CID to
-/// the server.
+/// TODO(matheus23) update docs
 ///
-/// The return value is a CAR file.
-pub async fn client_initiate_push(
-    root: Cid,
-    config: &PushConfig,
-    store: &impl BlockStore,
-) -> Result<Bytes> {
-    let fake_response = PushResponse {
-        subgraph_roots: vec![root],
-        // Just putting an empty bloom here
-        bloom_k: 3,
-        bloom: Vec::new(),
-    };
-    client_push(root, fake_response, config, store).await
-}
-
 /// Send a subsequent car mirror push request, following up on
 /// a response retrieved from an initial `client_initiate_push` request.
 ///
@@ -64,178 +15,62 @@ pub async fn client_initiate_push(
 /// a follow-up `client_push` request.
 ///
 /// The return value is another CAR file with more blocks from the DAG below the root.
-pub async fn client_push(
+pub async fn request(
     root: Cid,
-    last_response: PushResponse,
-    config: &PushConfig,
+    last_response: Option<PushResponse>,
+    config: &Config,
     store: &impl BlockStore,
-) -> Result<Bytes> {
-    let PushResponse {
-        ref subgraph_roots,
-        bloom_k,
-        bloom,
-    } = last_response;
-
-    // Verify that all subgraph roots are in the relevant DAG:
-    let subgraph_roots: Vec<Cid> = DagWalk::breadth_first([root])
-        .stream(store)
-        .try_filter_map(|(cid, _)| async move { Ok(subgraph_roots.contains(&cid).then_some(cid)) })
-        .try_collect()
-        .await?;
-
-    let bloom = if bloom.is_empty() {
-        BloomFilter::new_with(1, Box::new([0])) // An empty bloom that contains nothing
-    } else {
-        BloomFilter::new_with(bloom_k as usize, bloom.into_boxed_slice())
-    };
-
-    let mut writer = CarWriter::new(
-        CarHeader::new_v1(
-            // TODO(matheus23): This is stupid
-            // CAR files *must* have at least one CID in them, and all of them
-            // need to appear as a block in the payload.
-            // It would probably make most sense to just write all subgraph roots into this,
-            // but we don't know how many of the subgraph roots fit into this round yet,
-            // so we're simply writing the first one in here, since we know
-            // at least one block will be written (and it'll be that one).
-            subgraph_roots.iter().take(1).cloned().collect(),
-        ),
-        Vec::new(),
-    );
-
-    writer.write_header().await?;
-
-    let mut block_bytes = 0;
-    let mut dag_walk = DagWalk::breadth_first(subgraph_roots.clone());
-    while let Some((cid, block)) = dag_walk.next(store).await? {
-        if bloom.contains(&cid.to_bytes()) && !subgraph_roots.contains(&cid) {
-            break;
-        }
-
-        writer.write(cid, &block).await?;
-
-        // TODO(matheus23): Count the actual bytes sent?
-        block_bytes += block.len();
-        if block_bytes > config.send_minimum {
-            break;
-        }
-    }
-
-    Ok(writer.finish().await?.into())
+) -> Result<CarFile> {
+    let receiver_state = last_response.map(ReceiverState::from_push_response);
+    block_send(root, receiver_state, config, store).await
 }
 
+/// TODO(matheus23) update docs
+///
 /// This handles a car mirror push request on the server side.
 ///
 /// The root is the root CID of the DAG that is pushed, the request is a CAR file
 /// with some blocks from the cold call.
 ///
 /// Returns a response to answer the client's request with.
-pub async fn server_push_response(
+pub async fn response(
     root: Cid,
-    request: Bytes,
-    config: &PushConfig,
+    request: CarFile,
+    config: &Config,
     store: &impl BlockStore,
 ) -> Result<PushResponse> {
-    let mut dag_verification = IncrementalDagVerification::new([root], store).await?;
-
-    let mut reader = CarReader::new(Cursor::new(request)).await?;
-    let mut block_bytes = 0;
-
-    while let Some((cid, vec)) = reader.next_block().await? {
-        let block = Bytes::from(vec);
-
-        block_bytes += block.len();
-        if block_bytes > config.receive_maximum {
-            bail!(
-                "Received more than {} bytes ({block_bytes}), aborting request.",
-                config.receive_maximum
-            );
-        }
-
-        dag_verification
-            .verify_and_store_block((cid, block), store)
-            .await?;
-    }
-
-    let subgraph_roots = dag_verification
-        .want_cids
-        .iter()
-        .take(config.max_roots_per_round)
-        .cloned()
-        .collect();
-
-    let bloom_capacity = dag_verification.have_cids.len() as u64;
-
-    let mut bloom =
-        BloomFilter::new_from_fpr_po2(bloom_capacity, (config.bloom_fpr)(bloom_capacity));
-
-    dag_verification
-        .have_cids
-        .iter()
-        .for_each(|cid| bloom.insert(&cid.to_bytes()));
-
-    Ok(PushResponse {
-        subgraph_roots,
-        bloom_k: bloom.hash_count() as u32,
-        bloom: bloom.as_bytes().to_vec(),
-    })
+    Ok(block_receive(root, Some(request), config, store)
+        .await?
+        .into_push_response())
 }
 
 #[cfg(test)]
 mod tests {
-    use super::*;
-    use crate::test_utils::{encode, generate_dag, get_cid_at_approx_path, Rvg};
-    use libipld::{Ipld, IpldCodec};
-    use libipld_core::multihash::{Code, MultihashDigest};
-    use proptest::{collection::vec, prelude::Rng};
-    use std::collections::BTreeMap;
+    use crate::{
+        common::Config,
+        dag_walk::DagWalk,
+        test_utils::{
+            get_cid_at_approx_path, setup_random_dag, total_dag_blocks, total_dag_bytes, Metrics,
+            Rvg,
+        },
+    };
+    use anyhow::Result;
+    use futures::TryStreamExt;
+    use libipld::Cid;
+    use proptest::collection::vec;
     use wnfs_common::MemoryBlockStore;
 
-    #[derive(Clone, Debug)]
-    struct Metrics {
-        request_bytes: usize,
-        response_bytes: usize,
-    }
-
-    async fn setup_random_dag<const BLOCK_PADDING: usize>(
-        dag_size: u16,
-    ) -> Result<(Cid, MemoryBlockStore)> {
-        let (blocks, root) = Rvg::new().sample(&generate_dag(dag_size, |cids, rng| {
-            let ipld = Ipld::Map(BTreeMap::from([
-                (
-                    "data".into(),
-                    Ipld::Bytes((0..BLOCK_PADDING).map(|_| rng.gen::<u8>()).collect()),
-                ),
-                (
-                    "links".into(),
-                    Ipld::List(cids.into_iter().map(Ipld::Link).collect()),
-                ),
-            ]));
-            let bytes = encode(&ipld);
-            let cid = Cid::new_v1(IpldCodec::DagCbor.into(), Code::Blake3_256.digest(&bytes));
-            (cid, bytes)
-        }));
-
-        let store = MemoryBlockStore::new();
-        for (cid, bytes) in blocks.into_iter() {
-            let cid_store = store.put_block(bytes, IpldCodec::DagCbor.into()).await?;
-            assert_eq!(cid, cid_store);
-        }
-
-        Ok((root, store))
-    }
-
     async fn simulate_protocol(
         root: Cid,
-        config: &PushConfig,
+        config: &Config,
         client_store: &MemoryBlockStore,
         server_store: &MemoryBlockStore,
     ) -> Result<Vec<Metrics>> {
         let mut metrics = Vec::new();
-        let mut request = client_initiate_push(root, config, client_store).await?;
+        let mut request = crate::push::request(root, None, config, client_store).await?;
         loop {
-            let request_bytes = request.len();
-            let response = server_push_response(root, request, config, server_store).await?;
+            let request_bytes = request.bytes.len();
+            let response = crate::push::response(root, request, config, server_store).await?;
             let response_bytes = serde_ipld_dagcbor::to_vec(&response)?.len();
 
             metrics.push(Metrics {
@@ -246,37 +81,18 @@ mod tests {
             if response.indicates_finished() {
                 break;
             }
-            request = client_push(root, response, config, client_store).await?;
+            request = crate::push::request(root, Some(response), config, client_store).await?;
         }
 
         Ok(metrics)
     }
 
-    async fn total_dag_bytes(root: Cid, store: &impl BlockStore) -> Result<usize> {
-        Ok(DagWalk::breadth_first([root])
-            .stream(store)
-            .map_ok(|(_, block)| block.len())
-            .try_collect::<Vec<_>>()
-            .await?
-            .into_iter()
-            .sum::<usize>())
-    }
-
-    async fn total_dag_blocks(root: Cid, store: &impl BlockStore) -> Result<usize> {
-        Ok(DagWalk::breadth_first([root])
-            .stream(store)
-            .map_ok(|(_, block)| block.len())
-            .try_collect::<Vec<_>>()
-            .await?
-            .len())
-    }
-
     #[async_std::test]
     async fn test_transfer() -> Result<()> {
         const BLOCK_PADDING: usize = 10 * 1024;
         let (root, ref client_store) = setup_random_dag::<BLOCK_PADDING>(256).await?;
         let server_store = &MemoryBlockStore::new();
-        simulate_protocol(root, &PushConfig::default(), client_store, server_store).await?;
+        simulate_protocol(root, &Config::default(), client_store, server_store).await?;
 
         // receiver should have all data
         let client_cids = DagWalk::breadth_first([root])
@@ -304,7 +120,7 @@ mod tests {
         let second_root = get_cid_at_approx_path(path, root, client_store).await?;
 
         let server_store = &MemoryBlockStore::new();
-        let config = &PushConfig::default();
+        let config = &Config::default();
         let metrics1 = simulate_protocol(second_root, config, client_store, server_store).await?;
         let metrics2 = simulate_protocol(root, config, client_store, server_store).await?;
 
@@ -334,7 +150,7 @@ mod tests {
             let (root, ref client_store) = setup_random_dag::<BLOCK_PADDING>(DAG_SIZE).await?;
             let server_store = &MemoryBlockStore::new();
             let metrics =
-                simulate_protocol(root, &PushConfig::default(), client_store, server_store).await?;
+                simulate_protocol(root, &Config::default(), client_store, server_store).await?;
 
             total_rounds += metrics.len();
             total_blocks += total_dag_blocks(root, client_store).await?;
diff --git a/car-mirror/src/test_utils/mod.rs b/car-mirror/src/test_utils/mod.rs
index 8442d38..92bb0a0 100644
--- a/car-mirror/src/test_utils/mod.rs
+++ b/car-mirror/src/test_utils/mod.rs
@@ -1,9 +1,16 @@
-use crate::common::references;
+use std::collections::BTreeMap;
+
+use crate::{common::references, dag_walk::DagWalk};
 use anyhow::Result;
 use bytes::Bytes;
+use futures::TryStreamExt;
 use libipld::{Cid, Ipld, IpldCodec};
-use libipld_core::codec::Encode;
-use wnfs_common::BlockStore;
+use libipld_core::{
+    codec::Encode,
+    multihash::{Code, MultihashDigest},
+};
+use proptest::prelude::Rng;
+use wnfs_common::{BlockStore, MemoryBlockStore};
 
 #[cfg(feature = "test_utils")]
 mod dag_strategy;
@@ -15,8 +22,14 @@ pub use dag_strategy::*;
 #[cfg(feature = "test_utils")]
 pub use rvg::*;
 
+#[derive(Clone, Debug)]
+pub(crate) struct Metrics {
+    pub(crate) request_bytes: usize,
+    pub(crate) response_bytes: usize,
+}
+
 /// Encode some IPLD as dag-cbor
-pub fn encode(ipld: &Ipld) -> Bytes {
+pub(crate) fn encode(ipld: &Ipld) -> Bytes {
     let mut vec = Vec::new();
     ipld.encode(IpldCodec::DagCbor, &mut vec).unwrap(); // TODO(matheus23) unwrap
     Bytes::from(vec)
@@ -25,7 +38,7 @@ pub fn encode(ipld: &Ipld) -> Bytes {
 /// Walk a root DAG along some path.
 /// At each node, take the `n % numlinks`th link,
 /// and only walk the path as long as there are further links.
-pub async fn get_cid_at_approx_path(
+pub(crate) async fn get_cid_at_approx_path(
     path: Vec<usize>,
     root: Cid,
     store: &impl BlockStore,
@@ -42,3 +55,50 @@ pub async fn get_cid_at_approx_path(
     }
     Ok(working_cid)
 }
+
+pub(crate) async fn setup_random_dag<const BLOCK_PADDING: usize>(
+    dag_size: u16,
+) -> Result<(Cid, MemoryBlockStore)> {
+    let (blocks, root) = Rvg::new().sample(&generate_dag(dag_size, |cids, rng| {
+        let ipld = Ipld::Map(BTreeMap::from([
+            (
+                "data".into(),
+                Ipld::Bytes((0..BLOCK_PADDING).map(|_| rng.gen::<u8>()).collect()),
+            ),
+            (
+                "links".into(),
+                Ipld::List(cids.into_iter().map(Ipld::Link).collect()),
+            ),
+        ]));
+        let bytes = encode(&ipld);
+        let cid = Cid::new_v1(IpldCodec::DagCbor.into(), Code::Blake3_256.digest(&bytes));
+        (cid, bytes)
+    }));
+
+    let store = MemoryBlockStore::new();
+    for (cid, bytes) in blocks.into_iter() {
+        let cid_store = store.put_block(bytes, IpldCodec::DagCbor.into()).await?;
+        assert_eq!(cid, cid_store);
+    }
+
+    Ok((root, store))
+}
+
+pub(crate) async fn total_dag_bytes(root: Cid, store: &impl BlockStore) -> Result<usize> {
+    Ok(DagWalk::breadth_first([root])
+        .stream(store)
+        .map_ok(|(_, block)| block.len())
+        .try_collect::<Vec<_>>()
+        .await?
+        .into_iter()
+        .sum::<usize>())
+}
+
+pub(crate) async fn total_dag_blocks(root: Cid, store: &impl BlockStore) -> Result<usize> {
+    Ok(DagWalk::breadth_first([root])
+        .stream(store)
+        .map_ok(|(_, block)| block.len())
+        .try_collect::<Vec<_>>()
+        .await?
+        .len())
+}

From 2c1b7035e4a835c45101cfb30319adfe5352eeec Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= <philipp.krueger1@gmail.com>
Date: Mon, 21 Aug 2023 12:03:45 +0200
Subject: [PATCH 17/35] Make `generate_dag`'s function be able to capture

---
 car-mirror/src/dag_walk.rs                |  2 +-
 car-mirror/src/pull.rs                    |  3 +--
 car-mirror/src/push.rs                    |  8 +++-----
 car-mirror/src/test_utils/dag_strategy.rs |  8 ++++----
 car-mirror/src/test_utils/mod.rs          | 15 +++++++++------
 5 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/car-mirror/src/dag_walk.rs b/car-mirror/src/dag_walk.rs
index 6210272..1ec03ba 100644
--- a/car-mirror/src/dag_walk.rs
+++ b/car-mirror/src/dag_walk.rs
@@ -174,7 +174,7 @@ mod proptests {
     use wnfs_common::{BlockStore, MemoryBlockStore};
 
     fn ipld_dags() -> impl Strategy<Value = (Vec<(Cid, Ipld)>, Cid)> {
-        generate_dag(256, |cids, _| {
+        generate_dag(256, &|cids, _| {
             let ipld = Ipld::List(cids.into_iter().map(Ipld::Link).collect());
             let cid = Cid::new_v1(
                 IpldCodec::DagCbor.into(),
diff --git a/car-mirror/src/pull.rs b/car-mirror/src/pull.rs
index 20d6d1b..4db2365 100644
--- a/car-mirror/src/pull.rs
+++ b/car-mirror/src/pull.rs
@@ -68,9 +68,8 @@ mod tests {
 
     #[async_std::test]
     async fn test_transfer() -> Result<()> {
-        const BLOCK_PADDING: usize = 10 * 1024; // 10KiB
         let client_store = &MemoryBlockStore::new();
-        let (root, ref server_store) = setup_random_dag::<BLOCK_PADDING>(256).await?;
+        let (root, ref server_store) = setup_random_dag(256, 10 * 1024 /* 10 KiB */).await?;
 
         simulate_protocol(root, &Config::default(), client_store, server_store).await?;
 
diff --git a/car-mirror/src/push.rs b/car-mirror/src/push.rs
index 3d37940..8e652a2 100644
--- a/car-mirror/src/push.rs
+++ b/car-mirror/src/push.rs
@@ -89,8 +89,7 @@ mod tests {
 
     #[async_std::test]
     async fn test_transfer() -> Result<()> {
-        const BLOCK_PADDING: usize = 10 * 1024;
-        let (root, ref client_store) = setup_random_dag::<BLOCK_PADDING>(256).await?;
+        let (root, ref client_store) = setup_random_dag(256, 10 * 1024 /* 10 KiB */).await?;
         let server_store = &MemoryBlockStore::new();
         simulate_protocol(root, &Config::default(), client_store, server_store).await?;
 
@@ -113,8 +112,7 @@ mod tests {
 
     #[async_std::test]
     async fn test_deduplicating_transfer() -> Result<()> {
-        const BLOCK_PADDING: usize = 10 * 1024;
-        let (root, ref client_store) = setup_random_dag::<BLOCK_PADDING>(256).await?;
+        let (root, ref client_store) = setup_random_dag(256, 10 * 1024 /* 10 KiB */).await?;
         let total_bytes = total_dag_bytes(root, client_store).await?;
         let path = Rvg::new().sample(&vec(0usize..128, 0..64));
         let second_root = get_cid_at_approx_path(path, root, client_store).await?;
@@ -147,7 +145,7 @@ mod tests {
         let mut total_block_bytes = 0;
         let mut total_network_bytes = 0;
         for _ in 0..TESTS {
-            let (root, ref client_store) = setup_random_dag::<BLOCK_PADDING>(DAG_SIZE).await?;
+            let (root, ref client_store) = setup_random_dag(DAG_SIZE, BLOCK_PADDING).await?;
             let server_store = &MemoryBlockStore::new();
             let metrics =
                 simulate_protocol(root, &Config::default(), client_store, server_store).await?;
diff --git a/car-mirror/src/test_utils/dag_strategy.rs b/car-mirror/src/test_utils/dag_strategy.rs
index 144a927..60e1ab6 100644
--- a/car-mirror/src/test_utils/dag_strategy.rs
+++ b/car-mirror/src/test_utils/dag_strategy.rs
@@ -9,8 +9,8 @@ use roaring_graphs::{arb_dag, DirectedAcyclicGraph, Vertex};
 /// the root block's CID.
 pub fn generate_dag<T: Debug + Clone>(
     max_nodes: u16,
-    generate_block: fn(Vec<Cid>, rng: &mut TestRng) -> (Cid, T),
-) -> impl Strategy<Value = (Vec<(Cid, T)>, Cid)> {
+    generate_block: &impl Fn(Vec<Cid>, &mut TestRng) -> (Cid, T),
+) -> impl Strategy<Value = (Vec<(Cid, T)>, Cid)> + '_ {
     arb_dag(1..max_nodes, 0.5)
         .prop_perturb(move |dag, mut rng| dag_to_nodes(&dag, &mut rng, generate_block))
 }
@@ -20,7 +20,7 @@ pub fn generate_dag<T: Debug + Clone>(
 pub fn dag_to_nodes<T>(
     dag: &DirectedAcyclicGraph,
     rng: &mut TestRng,
-    generate_node: fn(Vec<Cid>, &mut TestRng) -> (Cid, T),
+    generate_node: &impl Fn(Vec<Cid>, &mut TestRng) -> (Cid, T),
 ) -> (Vec<(Cid, T)>, Cid) {
     let mut blocks = Vec::new();
     let mut visited = HashSet::new();
@@ -33,7 +33,7 @@ fn dag_to_nodes_helper<T>(
     dag: &DirectedAcyclicGraph,
     root: Vertex,
     rng: &mut TestRng,
-    generate_node: fn(Vec<Cid>, &mut TestRng) -> (Cid, T),
+    generate_node: &impl Fn(Vec<Cid>, &mut TestRng) -> (Cid, T),
     arr: &mut Vec<(Cid, T)>,
     visited: &mut HashSet<Vertex>,
 ) -> (Cid, T) {
diff --git a/car-mirror/src/test_utils/mod.rs b/car-mirror/src/test_utils/mod.rs
index 92bb0a0..ed74678 100644
--- a/car-mirror/src/test_utils/mod.rs
+++ b/car-mirror/src/test_utils/mod.rs
@@ -56,15 +56,18 @@ pub(crate) async fn get_cid_at_approx_path(
     Ok(working_cid)
 }
 
-pub(crate) async fn setup_random_dag<const BLOCK_PADDING: usize>(
+pub(crate) async fn setup_random_dag(
     dag_size: u16,
+    block_padding: usize,
 ) -> Result<(Cid, MemoryBlockStore)> {
-    let (blocks, root) = Rvg::new().sample(&generate_dag(dag_size, |cids, rng| {
+    let (blocks, root) = Rvg::new().sample(&generate_dag(dag_size, &|cids, rng| {
+        let mut padding = Vec::with_capacity(block_padding);
+        for _ in 0..block_padding {
+            padding.push(rng.gen::<u8>());
+        }
+
         let ipld = Ipld::Map(BTreeMap::from([
-            (
-                "data".into(),
-                Ipld::Bytes((0..BLOCK_PADDING).map(|_| rng.gen::<u8>()).collect()),
-            ),
+            ("data".into(), Ipld::Bytes(padding)),
             (
                 "links".into(),
                 Ipld::List(cids.into_iter().map(Ipld::Link).collect()),

From 5a7ba80cd8ac6ac44b590d98e44b917f014cd821 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= <philipp.krueger1@gmail.com>
Date: Mon, 21 Aug 2023 18:15:56 +0200
Subject: [PATCH 18/35] Handle TODOs, implement proptests

---
 car-mirror/src/common.rs                  |  42 ++++++-
 car-mirror/src/dag_walk.rs                |   2 +-
 car-mirror/src/pull.rs                    |  65 ++++++++++-
 car-mirror/src/push.rs                    |  75 ++++++++++---
 car-mirror/src/test_utils/dag_strategy.rs |  15 ++-
 car-mirror/src/test_utils/local_utils.rs  | 128 ++++++++++++++++++++++
 car-mirror/src/test_utils/mod.rs          | 101 +----------------
 7 files changed, 299 insertions(+), 129 deletions(-)
 create mode 100644 car-mirror/src/test_utils/local_utils.rs

diff --git a/car-mirror/src/common.rs b/car-mirror/src/common.rs
index cd3229a..ef48acc 100644
--- a/car-mirror/src/common.rs
+++ b/car-mirror/src/common.rs
@@ -34,15 +34,21 @@ pub struct Config {
     pub bloom_fpr: fn(u64) -> f64,
 }
 
+/// Some information that the block receiving end provides the block sending end
+/// in order to deduplicate block transfers.
 #[derive(Debug, Clone)]
 pub struct ReceiverState {
+    /// At least *some* of the subgraph roots that are missing for sure on the receiving end.
     pub missing_subgraph_roots: Vec<Cid>,
+    /// An optional bloom filter of all CIDs below the root that the receiving end has.
     pub have_cids_bloom: Option<BloomFilter>,
 }
 
 /// Newtype around bytes that are supposed to represent a CAR file
 #[derive(Debug, Clone)]
 pub struct CarFile {
+    /// The car file contents as bytes.
+    /// (`CarFile` is cheap to clone, since `Bytes` is an `Arc` wrapper around a byte buffer.)
     pub bytes: Bytes,
 }
 
@@ -50,6 +56,13 @@ pub struct CarFile {
 // Functions
 //--------------------------------------------------------------------------------------------------
 
+/// This function is run on the block sending side of the protocol.
+///
+/// It's used on the client during the push protocol, or on the server
+/// during the pull protocol.
+///
+/// It returns a `CarFile` containing (a subset of) the blocks below `root`
+/// that are thought to be missing on the receiving end.
 pub async fn block_send(
     root: Cid,
     last_state: Option<ReceiverState>,
@@ -77,7 +90,7 @@ pub async fn block_send(
 
     let mut writer = CarWriter::new(
         CarHeader::new_v1(
-            // TODO(matheus23): This is stupid
+            // https://github.com/wnfs-wg/car-mirror-spec/issues/6
             // CAR files *must* have at least one CID in them, and all of them
             // need to appear as a block in the payload.
             // It would probably make most sense to just write all subgraph roots into this,
@@ -101,6 +114,7 @@ pub async fn block_send(
         writer.write(cid, &block).await?;
 
         // TODO(matheus23): Count the actual bytes sent?
+        // At the moment, this is a rough estimate. iroh-car could be improved to return the written bytes.
         block_bytes += block.len();
         if block_bytes > config.send_minimum {
             break;
@@ -112,6 +126,14 @@ pub async fn block_send(
     })
 }
 
+/// This function is run on the block receiving end of the protocol.
+///
+/// It's used on the client during the pull protocol and on the server
+/// during the push protocol.
+///
+/// It takes a `CarFile`, verifies that its contents are related to the
+/// `root` and returns some information to help the block sending side
+/// figure out what blocks to send next.
 pub async fn block_receive(
     root: Cid,
     last_car: Option<CarFile>,
@@ -191,8 +213,8 @@ pub fn references(cid: Cid, block: impl AsRef<[u8]>) -> Result<Vec<Cid>> {
 // Implementations
 //--------------------------------------------------------------------------------------------------
 
-impl ReceiverState {
-    pub fn from_push_response(push: PushResponse) -> Self {
+impl From<PushResponse> for ReceiverState {
+    fn from(push: PushResponse) -> Self {
         let PushResponse {
             subgraph_roots,
             bloom_k,
@@ -204,8 +226,10 @@ impl ReceiverState {
             have_cids_bloom: Self::bloom_deserialize(bloom_k, bloom),
         }
     }
+}
 
-    pub fn from_pull_request(pull: PullRequest) -> Self {
+impl From<PullRequest> for ReceiverState {
+    fn from(pull: PullRequest) -> Self {
         let PullRequest {
             resources,
             bloom_k,
@@ -217,8 +241,10 @@ impl ReceiverState {
             have_cids_bloom: Self::bloom_deserialize(bloom_k, bloom),
         }
     }
+}
 
-    pub fn into_push_response(self) -> PushResponse {
+impl Into<PushResponse> for ReceiverState {
+    fn into(self) -> PushResponse {
         let ReceiverState {
             missing_subgraph_roots,
             have_cids_bloom,
@@ -232,8 +258,10 @@ impl ReceiverState {
             bloom,
         }
     }
+}
 
-    pub fn into_pull_request(self) -> PullRequest {
+impl Into<PullRequest> for ReceiverState {
+    fn into(self) -> PullRequest {
         let ReceiverState {
             missing_subgraph_roots,
             have_cids_bloom,
@@ -247,7 +275,9 @@ impl ReceiverState {
             bloom,
         }
     }
+}
 
+impl ReceiverState {
     fn bloom_serialize(bloom: Option<BloomFilter>) -> (u32, Vec<u8>) {
         match bloom {
             Some(bloom) => (bloom.hash_count() as u32, bloom.as_bytes().to_vec()),
diff --git a/car-mirror/src/dag_walk.rs b/car-mirror/src/dag_walk.rs
index 1ec03ba..6210272 100644
--- a/car-mirror/src/dag_walk.rs
+++ b/car-mirror/src/dag_walk.rs
@@ -174,7 +174,7 @@ mod proptests {
     use wnfs_common::{BlockStore, MemoryBlockStore};
 
     fn ipld_dags() -> impl Strategy<Value = (Vec<(Cid, Ipld)>, Cid)> {
-        generate_dag(256, &|cids, _| {
+        generate_dag(256, |cids, _| {
             let ipld = Ipld::List(cids.into_iter().map(Ipld::Link).collect());
             let cid = Cid::new_v1(
                 IpldCodec::DagCbor.into(),
diff --git a/car-mirror/src/pull.rs b/car-mirror/src/pull.rs
index 4db2365..8dc1292 100644
--- a/car-mirror/src/pull.rs
+++ b/car-mirror/src/pull.rs
@@ -6,6 +6,17 @@ use anyhow::Result;
 use libipld::Cid;
 use wnfs_common::BlockStore;
 
+/// Create a CAR mirror pull request.
+///
+/// If this is the first request that's sent for this
+/// particular root CID, then set `last_response` to `None`.
+///
+/// On subsequent requests, set `last_response` to the
+/// last successfully received response.
+///
+/// Before actually sending the request over the network,
+/// make sure to check the `request.indicates_finished()`.
+/// If true, the client already has all data.
 pub async fn request(
     root: Cid,
     last_response: Option<CarFile>,
@@ -14,16 +25,17 @@ pub async fn request(
 ) -> Result<PullRequest> {
     Ok(block_receive(root, last_response, config, store)
         .await?
-        .into_pull_request())
+        .into())
 }
 
+/// Respond to a CAR mirror pull request.
 pub async fn response(
     root: Cid,
     request: PullRequest,
     config: &Config,
     store: &impl BlockStore,
 ) -> Result<CarFile> {
-    let receiver_state = Some(ReceiverState::from_pull_request(request));
+    let receiver_state = Some(ReceiverState::from(request));
     block_send(root, receiver_state, config, store).await
 }
 
@@ -39,7 +51,7 @@ mod tests {
     use libipld::Cid;
     use wnfs_common::MemoryBlockStore;
 
-    async fn simulate_protocol(
+    pub(crate) async fn simulate_protocol(
         root: Cid,
         config: &Config,
         client_store: &MemoryBlockStore,
@@ -90,3 +102,50 @@ mod tests {
         Ok(())
     }
 }
+
+#[cfg(test)]
+mod proptests {
+    use crate::{
+        common::Config,
+        dag_walk::DagWalk,
+        test_utils::{setup_blockstore, variable_blocksize_dag},
+    };
+    use futures::TryStreamExt;
+    use libipld::{Cid, Ipld};
+    use test_strategy::proptest;
+    use wnfs_common::MemoryBlockStore;
+
+    #[proptest]
+    fn cold_transfer_completes(#[strategy(variable_blocksize_dag())] dag: (Vec<(Cid, Ipld)>, Cid)) {
+        let (blocks, root) = dag;
+        async_std::task::block_on(async {
+            let server_store = &setup_blockstore(blocks).await.unwrap();
+            let client_store = &MemoryBlockStore::new();
+
+            crate::pull::tests::simulate_protocol(
+                root,
+                &Config::default(),
+                client_store,
+                server_store,
+            )
+            .await
+            .unwrap();
+
+            // client should have all data
+            let client_cids = DagWalk::breadth_first([root])
+                .stream(client_store)
+                .map_ok(|(cid, _)| cid)
+                .try_collect::<Vec<_>>()
+                .await
+                .unwrap();
+            let server_cids = DagWalk::breadth_first([root])
+                .stream(server_store)
+                .map_ok(|(cid, _)| cid)
+                .try_collect::<Vec<_>>()
+                .await
+                .unwrap();
+
+            assert_eq!(client_cids, server_cids);
+        })
+    }
+}
diff --git a/car-mirror/src/push.rs b/car-mirror/src/push.rs
index 8e652a2..bf37d7a 100644
--- a/car-mirror/src/push.rs
+++ b/car-mirror/src/push.rs
@@ -6,33 +6,34 @@ use anyhow::Result;
 use libipld_core::cid::Cid;
 use wnfs_common::BlockStore;
 
-/// TODO(matheus23) update docs
+/// Create a CAR mirror push request.
 ///
-/// Send a subsequent car mirror push request, following up on
-/// a response retrieved from an initial `client_initiate_push` request.
+/// On the first request for a particular `root`, set
+/// `last_response` to `None`.
 ///
-/// Make sure to call `response.indicates_finished()` before initiating
-/// a follow-up `client_push` request.
+/// For subsequent requests, set it to the last successful
+/// response from a request with the same `root`.
 ///
-/// The return value is another CAR file with more blocks from the DAG below the root.
+/// The returned request body is a CAR file from some of the first
+/// blocks below the root.
 pub async fn request(
     root: Cid,
     last_response: Option<PushResponse>,
     config: &Config,
     store: &impl BlockStore,
 ) -> Result<CarFile> {
-    let receiver_state = last_response.map(ReceiverState::from_push_response);
+    let receiver_state = last_response.map(ReceiverState::from);
     block_send(root, receiver_state, config, store).await
 }
 
-/// TODO(matheus23) update docs
+/// Create a response for a CAR mirror push request.
 ///
-/// This handles a car mirror push request on the server side.
+/// This takes in the CAR file from the request body and stores its blocks
+/// in the given `store`, if the blocks can be shown to relate
+/// to the `root` CID.
 ///
-/// The root is the root CID of the DAG that is pushed, the request is a CAR file
-/// with some blocks from the cold call.
-///
-/// Returns a response to answer the client's request with.
+/// Returns a response that gives the client information about what
+/// other data remains to be fetched.
 pub async fn response(
     root: Cid,
     request: CarFile,
@@ -41,7 +42,7 @@ pub async fn response(
 ) -> Result<PushResponse> {
     Ok(block_receive(root, Some(request), config, store)
         .await?
-        .into_push_response())
+        .into())
 }
 
 #[cfg(test)]
@@ -60,7 +61,7 @@ mod tests {
     use proptest::collection::vec;
     use wnfs_common::MemoryBlockStore;
 
-    async fn simulate_protocol(
+    pub(crate) async fn simulate_protocol(
         root: Cid,
         config: &Config,
         client_store: &MemoryBlockStore,
@@ -178,5 +179,47 @@ mod tests {
 
 #[cfg(test)]
 mod proptests {
-    use super::*;
+    use crate::{
+        common::Config,
+        dag_walk::DagWalk,
+        test_utils::{setup_blockstore, variable_blocksize_dag},
+    };
+    use futures::TryStreamExt;
+    use libipld::{Cid, Ipld};
+    use test_strategy::proptest;
+    use wnfs_common::MemoryBlockStore;
+
+    #[proptest]
+    fn cold_transfer_completes(#[strategy(variable_blocksize_dag())] dag: (Vec<(Cid, Ipld)>, Cid)) {
+        let (blocks, root) = dag;
+        async_std::task::block_on(async {
+            let client_store = &setup_blockstore(blocks).await.unwrap();
+            let server_store = &MemoryBlockStore::new();
+
+            crate::push::tests::simulate_protocol(
+                root,
+                &Config::default(),
+                client_store,
+                server_store,
+            )
+            .await
+            .unwrap();
+
+            // server should have all data
+            let client_cids = DagWalk::breadth_first([root])
+                .stream(client_store)
+                .map_ok(|(cid, _)| cid)
+                .try_collect::<Vec<_>>()
+                .await
+                .unwrap();
+            let server_cids = DagWalk::breadth_first([root])
+                .stream(server_store)
+                .map_ok(|(cid, _)| cid)
+                .try_collect::<Vec<_>>()
+                .await
+                .unwrap();
+
+            assert_eq!(client_cids, server_cids);
+        })
+    }
 }
diff --git a/car-mirror/src/test_utils/dag_strategy.rs b/car-mirror/src/test_utils/dag_strategy.rs
index 60e1ab6..e8fa931 100644
--- a/car-mirror/src/test_utils/dag_strategy.rs
+++ b/car-mirror/src/test_utils/dag_strategy.rs
@@ -9,10 +9,10 @@ use roaring_graphs::{arb_dag, DirectedAcyclicGraph, Vertex};
 /// the root block's CID.
 pub fn generate_dag<T: Debug + Clone>(
     max_nodes: u16,
-    generate_block: &impl Fn(Vec<Cid>, &mut TestRng) -> (Cid, T),
-) -> impl Strategy<Value = (Vec<(Cid, T)>, Cid)> + '_ {
+    generate_block: impl Fn(Vec<Cid>, &mut TestRng) -> (Cid, T) + Clone,
+) -> impl Strategy<Value = (Vec<(Cid, T)>, Cid)> {
     arb_dag(1..max_nodes, 0.5)
-        .prop_perturb(move |dag, mut rng| dag_to_nodes(&dag, &mut rng, generate_block))
+        .prop_perturb(move |dag, mut rng| dag_to_nodes(&dag, &mut rng, generate_block.clone()))
 }
 
 /// Turn a directed acyclic graph into a list of nodes (with their CID) and a root CID.
@@ -20,7 +20,7 @@ pub fn generate_dag<T: Debug + Clone>(
 pub fn dag_to_nodes<T>(
     dag: &DirectedAcyclicGraph,
     rng: &mut TestRng,
-    generate_node: &impl Fn(Vec<Cid>, &mut TestRng) -> (Cid, T),
+    generate_node: impl Fn(Vec<Cid>, &mut TestRng) -> (Cid, T) + Clone,
 ) -> (Vec<(Cid, T)>, Cid) {
     let mut blocks = Vec::new();
     let mut visited = HashSet::new();
@@ -33,11 +33,14 @@ fn dag_to_nodes_helper<T>(
     dag: &DirectedAcyclicGraph,
     root: Vertex,
     rng: &mut TestRng,
-    generate_node: &impl Fn(Vec<Cid>, &mut TestRng) -> (Cid, T),
+    generate_node: impl Fn(Vec<Cid>, &mut TestRng) -> (Cid, T) + Clone,
     arr: &mut Vec<(Cid, T)>,
     visited: &mut HashSet<Vertex>,
 ) -> (Cid, T) {
     let mut child_blocks = Vec::new();
+    if root >= dag.get_vertex_count() {
+        println!("{root}, {}", dag.get_vertex_count());
+    }
     for child in dag.iter_children(root) {
         if visited.contains(&child) {
             continue;
@@ -47,7 +50,7 @@ fn dag_to_nodes_helper<T>(
             dag,
             child,
             rng,
-            generate_node,
+            generate_node.clone(),
             arr,
             visited,
         ));
diff --git a/car-mirror/src/test_utils/local_utils.rs b/car-mirror/src/test_utils/local_utils.rs
new file mode 100644
index 0000000..becf29f
--- /dev/null
+++ b/car-mirror/src/test_utils/local_utils.rs
@@ -0,0 +1,128 @@
+//! Crate-local test utilities
+use super::{generate_dag, Rvg};
+use crate::{common::references, dag_walk::DagWalk};
+use anyhow::Result;
+use bytes::Bytes;
+use futures::TryStreamExt;
+use libipld::{Cid, Ipld, IpldCodec};
+use libipld_core::{
+    codec::Encode,
+    multihash::{Code, MultihashDigest},
+};
+use proptest::{prelude::Rng, strategy::Strategy};
+use std::collections::BTreeMap;
+use wnfs_common::{BlockStore, MemoryBlockStore};
+
+#[derive(Clone, Debug)]
+pub(crate) struct Metrics {
+    pub(crate) request_bytes: usize,
+    pub(crate) response_bytes: usize,
+}
+
+/// Walk a root DAG along some path.
+/// At each node, take the `n % numlinks`th link,
+/// and only walk the path as long as there are further links.
+pub(crate) async fn get_cid_at_approx_path(
+    path: Vec<usize>,
+    root: Cid,
+    store: &impl BlockStore,
+) -> Result<Cid> {
+    let mut working_cid = root;
+    for nth in path {
+        let block = store.get_block(&working_cid).await?;
+        let refs = references(working_cid, block)?;
+        if refs.is_empty() {
+            break;
+        }
+
+        working_cid = refs[nth % refs.len()];
+    }
+    Ok(working_cid)
+}
+
+pub(crate) fn padded_dag_strategy(
+    dag_size: u16,
+    block_padding: usize,
+) -> impl Strategy<Value = (Vec<(Cid, Ipld)>, Cid)> {
+    generate_dag(dag_size, move |cids, rng| {
+        let mut padding = Vec::with_capacity(block_padding);
+        for _ in 0..block_padding {
+            padding.push(rng.gen::<u8>());
+        }
+
+        let ipld = Ipld::Map(BTreeMap::from([
+            ("data".into(), Ipld::Bytes(padding)),
+            (
+                "links".into(),
+                Ipld::List(cids.into_iter().map(Ipld::Link).collect()),
+            ),
+        ]));
+        let bytes = encode(&ipld);
+        let cid = Cid::new_v1(IpldCodec::DagCbor.into(), Code::Blake3_256.digest(&bytes));
+        (cid, ipld)
+    })
+}
+
+pub(crate) fn variable_blocksize_dag() -> impl Strategy<Value = (Vec<(Cid, Ipld)>, Cid)> {
+    const MAX_DAG_NODES: u16 = 128; // with this proptests run ~15 sec for me
+    const MAX_LINK_BYTES: usize = MAX_DAG_NODES as usize * 42; // 1 byte cbor CID tag, 1 byte multibase indicator, 40 bytes CID
+
+    // 1 byte cbor tag for whole object,
+    // 1 byte cbor tag for block padding bytes
+    // up to ~3 bytes for block padding size
+    // 1 bytes cbor tag for list (of cids)
+    // up to ~2 bytes for list size
+    const EST_OVERHEAD: usize = 1 + 1 + 3 + 1 + 2;
+    const MAX_BLOCK_SIZE: usize = 256 * 1024;
+    const MAX_BLOCK_PADDING: usize = MAX_BLOCK_SIZE - EST_OVERHEAD - MAX_LINK_BYTES;
+
+    (32..MAX_BLOCK_PADDING)
+        .prop_ind_flat_map(move |block_padding| padded_dag_strategy(MAX_DAG_NODES, block_padding))
+}
+
+pub(crate) async fn setup_blockstore(blocks: Vec<(Cid, Ipld)>) -> Result<MemoryBlockStore> {
+    let store = MemoryBlockStore::new();
+    for (cid, ipld) in blocks.into_iter() {
+        let cid_store = store
+            .put_block(encode(&ipld), IpldCodec::DagCbor.into())
+            .await?;
+        debug_assert_eq!(cid, cid_store);
+    }
+
+    Ok(store)
+}
+
+pub(crate) async fn setup_random_dag(
+    dag_size: u16,
+    block_padding: usize,
+) -> Result<(Cid, MemoryBlockStore)> {
+    let (blocks, root) = Rvg::new().sample(&padded_dag_strategy(dag_size, block_padding));
+    let store = setup_blockstore(blocks).await?;
+    Ok((root, store))
+}
+
+pub(crate) async fn total_dag_bytes(root: Cid, store: &impl BlockStore) -> Result<usize> {
+    Ok(DagWalk::breadth_first([root])
+        .stream(store)
+        .map_ok(|(_, block)| block.len())
+        .try_collect::<Vec<_>>()
+        .await?
+        .into_iter()
+        .sum::<usize>())
+}
+
+pub(crate) async fn total_dag_blocks(root: Cid, store: &impl BlockStore) -> Result<usize> {
+    Ok(DagWalk::breadth_first([root])
+        .stream(store)
+        .map_ok(|(_, block)| block.len())
+        .try_collect::<Vec<_>>()
+        .await?
+        .len())
+}
+
+/// Encode some IPLD as dag-cbor
+pub(crate) fn encode(ipld: &Ipld) -> Bytes {
+    let mut vec = Vec::new();
+    ipld.encode(IpldCodec::DagCbor, &mut vec).unwrap();
+    Bytes::from(vec)
+}
diff --git a/car-mirror/src/test_utils/mod.rs b/car-mirror/src/test_utils/mod.rs
index ed74678..aa4d5d3 100644
--- a/car-mirror/src/test_utils/mod.rs
+++ b/car-mirror/src/test_utils/mod.rs
@@ -1,17 +1,3 @@
-use std::collections::BTreeMap;
-
-use crate::{common::references, dag_walk::DagWalk};
-use anyhow::Result;
-use bytes::Bytes;
-use futures::TryStreamExt;
-use libipld::{Cid, Ipld, IpldCodec};
-use libipld_core::{
-    codec::Encode,
-    multihash::{Code, MultihashDigest},
-};
-use proptest::prelude::Rng;
-use wnfs_common::{BlockStore, MemoryBlockStore};
-
 #[cfg(feature = "test_utils")]
 mod dag_strategy;
 /// Random value generator for sampling data.
@@ -22,86 +8,7 @@ pub use dag_strategy::*;
 #[cfg(feature = "test_utils")]
 pub use rvg::*;
 
-#[derive(Clone, Debug)]
-pub(crate) struct Metrics {
-    pub(crate) request_bytes: usize,
-    pub(crate) response_bytes: usize,
-}
-
-/// Encode some IPLD as dag-cbor
-pub(crate) fn encode(ipld: &Ipld) -> Bytes {
-    let mut vec = Vec::new();
-    ipld.encode(IpldCodec::DagCbor, &mut vec).unwrap(); // TODO(matheus23) unwrap
-    Bytes::from(vec)
-}
-
-/// Walk a root DAG along some path.
-/// At each node, take the `n % numlinks`th link,
-/// and only walk the path as long as there are further links.
-pub(crate) async fn get_cid_at_approx_path(
-    path: Vec<usize>,
-    root: Cid,
-    store: &impl BlockStore,
-) -> Result<Cid> {
-    let mut working_cid = root;
-    for nth in path {
-        let block = store.get_block(&working_cid).await?;
-        let refs = references(working_cid, block)?;
-        if refs.is_empty() {
-            break;
-        }
-
-        working_cid = refs[nth % refs.len()];
-    }
-    Ok(working_cid)
-}
-
-pub(crate) async fn setup_random_dag(
-    dag_size: u16,
-    block_padding: usize,
-) -> Result<(Cid, MemoryBlockStore)> {
-    let (blocks, root) = Rvg::new().sample(&generate_dag(dag_size, &|cids, rng| {
-        let mut padding = Vec::with_capacity(block_padding);
-        for _ in 0..block_padding {
-            padding.push(rng.gen::<u8>());
-        }
-
-        let ipld = Ipld::Map(BTreeMap::from([
-            ("data".into(), Ipld::Bytes(padding)),
-            (
-                "links".into(),
-                Ipld::List(cids.into_iter().map(Ipld::Link).collect()),
-            ),
-        ]));
-        let bytes = encode(&ipld);
-        let cid = Cid::new_v1(IpldCodec::DagCbor.into(), Code::Blake3_256.digest(&bytes));
-        (cid, bytes)
-    }));
-
-    let store = MemoryBlockStore::new();
-    for (cid, bytes) in blocks.into_iter() {
-        let cid_store = store.put_block(bytes, IpldCodec::DagCbor.into()).await?;
-        assert_eq!(cid, cid_store);
-    }
-
-    Ok((root, store))
-}
-
-pub(crate) async fn total_dag_bytes(root: Cid, store: &impl BlockStore) -> Result<usize> {
-    Ok(DagWalk::breadth_first([root])
-        .stream(store)
-        .map_ok(|(_, block)| block.len())
-        .try_collect::<Vec<_>>()
-        .await?
-        .into_iter()
-        .sum::<usize>())
-}
-
-pub(crate) async fn total_dag_blocks(root: Cid, store: &impl BlockStore) -> Result<usize> {
-    Ok(DagWalk::breadth_first([root])
-        .stream(store)
-        .map_ok(|(_, block)| block.len())
-        .try_collect::<Vec<_>>()
-        .await?
-        .len())
-}
+#[cfg(test)]
+mod local_utils;
+#[cfg(test)]
+pub(crate) use local_utils::*;

From 0c6388b06ffadf8ab0f27357d571830bf51d36af Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= <philipp.krueger1@gmail.com>
Date: Mon, 21 Aug 2023 18:17:45 +0200
Subject: [PATCH 19/35] Use `HashSet` for comparing CID sets

---
 car-mirror/src/pull.rs | 10 ++++++----
 car-mirror/src/push.rs | 10 ++++++----
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/car-mirror/src/pull.rs b/car-mirror/src/pull.rs
index 8dc1292..8bfdbb8 100644
--- a/car-mirror/src/pull.rs
+++ b/car-mirror/src/pull.rs
@@ -49,6 +49,7 @@ mod tests {
     use anyhow::Result;
     use futures::TryStreamExt;
     use libipld::Cid;
+    use std::collections::HashSet;
     use wnfs_common::MemoryBlockStore;
 
     pub(crate) async fn simulate_protocol(
@@ -89,12 +90,12 @@ mod tests {
         let client_cids = DagWalk::breadth_first([root])
             .stream(client_store)
             .map_ok(|(cid, _)| cid)
-            .try_collect::<Vec<_>>()
+            .try_collect::<HashSet<_>>()
             .await?;
         let server_cids = DagWalk::breadth_first([root])
             .stream(server_store)
             .map_ok(|(cid, _)| cid)
-            .try_collect::<Vec<_>>()
+            .try_collect::<HashSet<_>>()
             .await?;
 
         assert_eq!(client_cids, server_cids);
@@ -112,6 +113,7 @@ mod proptests {
     };
     use futures::TryStreamExt;
     use libipld::{Cid, Ipld};
+    use std::collections::HashSet;
     use test_strategy::proptest;
     use wnfs_common::MemoryBlockStore;
 
@@ -135,13 +137,13 @@ mod proptests {
             let client_cids = DagWalk::breadth_first([root])
                 .stream(client_store)
                 .map_ok(|(cid, _)| cid)
-                .try_collect::<Vec<_>>()
+                .try_collect::<HashSet<_>>()
                 .await
                 .unwrap();
             let server_cids = DagWalk::breadth_first([root])
                 .stream(server_store)
                 .map_ok(|(cid, _)| cid)
-                .try_collect::<Vec<_>>()
+                .try_collect::<HashSet<_>>()
                 .await
                 .unwrap();
 
diff --git a/car-mirror/src/push.rs b/car-mirror/src/push.rs
index bf37d7a..03f54fc 100644
--- a/car-mirror/src/push.rs
+++ b/car-mirror/src/push.rs
@@ -59,6 +59,7 @@ mod tests {
     use futures::TryStreamExt;
     use libipld::Cid;
     use proptest::collection::vec;
+    use std::collections::HashSet;
     use wnfs_common::MemoryBlockStore;
 
     pub(crate) async fn simulate_protocol(
@@ -98,12 +99,12 @@ mod tests {
         let client_cids = DagWalk::breadth_first([root])
             .stream(client_store)
             .map_ok(|(cid, _)| cid)
-            .try_collect::<Vec<_>>()
+            .try_collect::<HashSet<_>>()
             .await?;
         let server_cids = DagWalk::breadth_first([root])
             .stream(server_store)
             .map_ok(|(cid, _)| cid)
-            .try_collect::<Vec<_>>()
+            .try_collect::<HashSet<_>>()
             .await?;
 
         assert_eq!(client_cids, server_cids);
@@ -186,6 +187,7 @@ mod proptests {
     };
     use futures::TryStreamExt;
     use libipld::{Cid, Ipld};
+    use std::collections::HashSet;
     use test_strategy::proptest;
     use wnfs_common::MemoryBlockStore;
 
@@ -209,13 +211,13 @@ mod proptests {
             let client_cids = DagWalk::breadth_first([root])
                 .stream(client_store)
                 .map_ok(|(cid, _)| cid)
-                .try_collect::<Vec<_>>()
+                .try_collect::<HashSet<_>>()
                 .await
                 .unwrap();
             let server_cids = DagWalk::breadth_first([root])
                 .stream(server_store)
                 .map_ok(|(cid, _)| cid)
-                .try_collect::<Vec<_>>()
+                .try_collect::<HashSet<_>>()
                 .await
                 .unwrap();
 

From 6bf868aee04b4a790af83c3ce0d1b6bd725a6dd6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= <philipp.krueger1@gmail.com>
Date: Mon, 21 Aug 2023 18:57:38 +0200
Subject: [PATCH 20/35] Implement benchmarks

---
 Cargo.lock                                    |  3 +
 car-mirror-benches/Cargo.toml                 |  5 +-
 car-mirror-benches/benches/a_benchmark.rs     | 15 ----
 car-mirror-benches/benches/in_memory.rs       | 88 +++++++++++++++++++
 car-mirror/src/dag_walk.rs                    |  4 +-
 car-mirror/src/test_utils/blockstore_utils.rs | 26 ++++++
 car-mirror/src/test_utils/dag_strategy.rs     | 65 ++++++++++++--
 car-mirror/src/test_utils/local_utils.rs      | 54 ++----------
 car-mirror/src/test_utils/mod.rs              |  4 +
 car-mirror/tests/integration_test.rs          |  4 -
 10 files changed, 192 insertions(+), 76 deletions(-)
 delete mode 100644 car-mirror-benches/benches/a_benchmark.rs
 create mode 100644 car-mirror-benches/benches/in_memory.rs
 create mode 100644 car-mirror/src/test_utils/blockstore_utils.rs
 delete mode 100644 car-mirror/tests/integration_test.rs

diff --git a/Cargo.lock b/Cargo.lock
index 952b6fb..54cf3d0 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -403,8 +403,11 @@ dependencies = [
 name = "car-mirror-benches"
 version = "0.1.0"
 dependencies = [
+ "anyhow",
+ "async-std",
  "car-mirror",
  "criterion",
+ "wnfs-common",
 ]
 
 [[package]]
diff --git a/car-mirror-benches/Cargo.toml b/car-mirror-benches/Cargo.toml
index 4d5161e..7b2df9e 100644
--- a/car-mirror-benches/Cargo.toml
+++ b/car-mirror-benches/Cargo.toml
@@ -7,10 +7,13 @@ authors = ["Stephen Akinyemi <appcypher@outlook.com>"]
 
 [dependencies]
 car-mirror = { path = "../car-mirror", version = "0.1", features = ["test_utils"] }
+wnfs-common = "0.1.23"
+async-std = { version = "1.11", features = ["attributes"] }
+anyhow = "1.0"
 
 [dev-dependencies]
 criterion = { version = "0.4", default-features = false }
 
 [[bench]]
-name = "a_benchmark"
+name = "in_memory"
 harness = false
diff --git a/car-mirror-benches/benches/a_benchmark.rs b/car-mirror-benches/benches/a_benchmark.rs
deleted file mode 100644
index 6650d1a..0000000
--- a/car-mirror-benches/benches/a_benchmark.rs
+++ /dev/null
@@ -1,15 +0,0 @@
-use criterion::{criterion_group, criterion_main, Criterion};
-
-pub fn add_benchmark(c: &mut Criterion) {
-    let mut rvg = car_mirror::test_utils::Rvg::deterministic();
-    let int_val_1 = rvg.sample(&(0..100i32));
-    let int_val_2 = rvg.sample(&(0..100i32));
-
-    c.bench_function("add", |b| {
-        b.iter(|| {
-            car_mirror::add(int_val_1, int_val_2);
-        })
-    });
-}
-criterion_group!(benches, add_benchmark);
-criterion_main!(benches);
diff --git a/car-mirror-benches/benches/in_memory.rs b/car-mirror-benches/benches/in_memory.rs
new file mode 100644
index 0000000..84b4291
--- /dev/null
+++ b/car-mirror-benches/benches/in_memory.rs
@@ -0,0 +1,88 @@
+use car_mirror::{
+    common::Config,
+    pull, push,
+    test_utils::{arb_ipld_dag, links_to_padded_ipld, setup_blockstore},
+};
+use criterion::{criterion_group, criterion_main, BatchSize, Criterion};
+use wnfs_common::MemoryBlockStore;
+
+pub fn push(c: &mut Criterion) {
+    let mut rvg = car_mirror::test_utils::Rvg::deterministic();
+
+    c.bench_function("push cold", |b| {
+        b.iter_batched(
+            || {
+                let (blocks, root) = rvg.sample(&arb_ipld_dag(
+                    250..256,
+                    0.9, // Very highly connected
+                    links_to_padded_ipld(10 * 1024),
+                ));
+                let store = async_std::task::block_on(setup_blockstore(blocks)).unwrap();
+                (store, root)
+            },
+            |(ref client_store, root)| {
+                let server_store = &MemoryBlockStore::new();
+                let config = &Config::default();
+
+                // Simulate a multi-round protocol run in-memory
+                async_std::task::block_on(async move {
+                    let mut request = push::request(root, None, config, client_store).await?;
+                    loop {
+                        let response = push::response(root, request, config, server_store).await?;
+
+                        if response.indicates_finished() {
+                            break;
+                        }
+                        request = push::request(root, Some(response), config, client_store).await?;
+                    }
+
+                    Ok::<(), anyhow::Error>(())
+                })
+                .unwrap();
+            },
+            BatchSize::LargeInput,
+        )
+    });
+}
+
+pub fn pull(c: &mut Criterion) {
+    let mut rvg = car_mirror::test_utils::Rvg::deterministic();
+
+    c.bench_function("pull cold", |b| {
+        b.iter_batched(
+            || {
+                let (blocks, root) = rvg.sample(&arb_ipld_dag(
+                    250..256,
+                    0.9,                             // Very highly connected
+                    links_to_padded_ipld(10 * 1024), // 10KiB random data per block
+                ));
+                let store = async_std::task::block_on(setup_blockstore(blocks)).unwrap();
+                (store, root)
+            },
+            |(ref server_store, root)| {
+                let client_store = &MemoryBlockStore::new();
+                let config = &Config::default();
+
+                // Simulate a multi-round protocol run in-memory
+                async_std::task::block_on(async move {
+                    let mut request = pull::request(root, None, config, client_store).await?;
+                    loop {
+                        let response = pull::response(root, request, config, server_store).await?;
+                        request = pull::request(root, Some(response), config, client_store).await?;
+
+                        if request.indicates_finished() {
+                            break;
+                        }
+                    }
+
+                    Ok::<(), anyhow::Error>(())
+                })
+                .unwrap();
+            },
+            BatchSize::LargeInput,
+        )
+    });
+}
+
+criterion_group!(benches, push, pull);
+criterion_main!(benches);
diff --git a/car-mirror/src/dag_walk.rs b/car-mirror/src/dag_walk.rs
index 6210272..707c617 100644
--- a/car-mirror/src/dag_walk.rs
+++ b/car-mirror/src/dag_walk.rs
@@ -162,7 +162,7 @@ mod tests {
 #[cfg(test)]
 mod proptests {
     use super::*;
-    use crate::test_utils::{encode, generate_dag};
+    use crate::test_utils::{arb_ipld_dag, encode};
     use futures::TryStreamExt;
     use libipld::{
         multihash::{Code, MultihashDigest},
@@ -174,7 +174,7 @@ mod proptests {
     use wnfs_common::{BlockStore, MemoryBlockStore};
 
     fn ipld_dags() -> impl Strategy<Value = (Vec<(Cid, Ipld)>, Cid)> {
-        generate_dag(256, |cids, _| {
+        arb_ipld_dag(1..256, 0.5, |cids, _| {
             let ipld = Ipld::List(cids.into_iter().map(Ipld::Link).collect());
             let cid = Cid::new_v1(
                 IpldCodec::DagCbor.into(),
diff --git a/car-mirror/src/test_utils/blockstore_utils.rs b/car-mirror/src/test_utils/blockstore_utils.rs
new file mode 100644
index 0000000..4bf9b4c
--- /dev/null
+++ b/car-mirror/src/test_utils/blockstore_utils.rs
@@ -0,0 +1,26 @@
+use anyhow::Result;
+use bytes::Bytes;
+use libipld::{Cid, Ipld, IpldCodec};
+use libipld_core::codec::Encode;
+use wnfs_common::{BlockStore, MemoryBlockStore};
+
+/// Take a list of dag-cbor IPLD blocks and store all of them as dag-cbor in a
+/// MemoryBlockStore & return it.
+pub async fn setup_blockstore(blocks: Vec<(Cid, Ipld)>) -> Result<MemoryBlockStore> {
+    let store = MemoryBlockStore::new();
+    for (cid, ipld) in blocks.into_iter() {
+        let cid_store = store
+            .put_block(encode(&ipld), IpldCodec::DagCbor.into())
+            .await?;
+        debug_assert_eq!(cid, cid_store);
+    }
+
+    Ok(store)
+}
+
+/// Encode some IPLD as dag-cbor.
+pub fn encode(ipld: &Ipld) -> Bytes {
+    let mut vec = Vec::new();
+    ipld.encode(IpldCodec::DagCbor, &mut vec).unwrap();
+    Bytes::from(vec)
+}
diff --git a/car-mirror/src/test_utils/dag_strategy.rs b/car-mirror/src/test_utils/dag_strategy.rs
index e8fa931..e751f55 100644
--- a/car-mirror/src/test_utils/dag_strategy.rs
+++ b/car-mirror/src/test_utils/dag_strategy.rs
@@ -1,20 +1,71 @@
-use std::{collections::HashSet, fmt::Debug};
-
-use libipld::Cid;
-use proptest::{strategy::Strategy, test_runner::TestRng};
+use super::encode;
+use bytes::Bytes;
+use libipld::{Cid, Ipld, IpldCodec};
+use libipld_core::multihash::{Code, MultihashDigest};
+use proptest::{prelude::Rng, strategy::Strategy, test_runner::TestRng};
 use roaring_graphs::{arb_dag, DirectedAcyclicGraph, Vertex};
+use std::{
+    collections::{BTreeMap, HashSet},
+    fmt::Debug,
+    ops::Range,
+};
 
 /// A strategy for use with proptest to generate random DAGs (directed acyclic graphs).
 /// The strategy generates a list of blocks of type T and their CIDs, as well as
 /// the root block's CID.
-pub fn generate_dag<T: Debug + Clone>(
-    max_nodes: u16,
+pub fn arb_ipld_dag<T: Debug + Clone>(
+    vertex_count: impl Into<Range<Vertex>>,
+    edge_probability: f64,
     generate_block: impl Fn(Vec<Cid>, &mut TestRng) -> (Cid, T) + Clone,
 ) -> impl Strategy<Value = (Vec<(Cid, T)>, Cid)> {
-    arb_dag(1..max_nodes, 0.5)
+    arb_dag(vertex_count, edge_probability)
         .prop_perturb(move |dag, mut rng| dag_to_nodes(&dag, &mut rng, generate_block.clone()))
 }
 
+/// A block-generating function for use with `arb_ipld_dag`.
+pub fn links_to_ipld(cids: Vec<Cid>, _: &mut TestRng) -> (Cid, Ipld) {
+    let ipld = Ipld::List(cids.into_iter().map(Ipld::Link).collect());
+    let cid = Cid::new_v1(
+        IpldCodec::DagCbor.into(),
+        Code::Blake3_256.digest(&encode(&ipld)),
+    );
+    (cid, ipld)
+}
+
+/// A block-generating function for use with `arb_ipld_dag`.
+pub fn links_to_dag_cbor(cids: Vec<Cid>, _: &mut TestRng) -> (Cid, Bytes) {
+    let ipld = Ipld::List(cids.into_iter().map(Ipld::Link).collect());
+    let bytes = encode(&ipld);
+    let cid = Cid::new_v1(IpldCodec::DagCbor.into(), Code::Blake3_256.digest(&bytes));
+    (cid, bytes)
+}
+
+/// A block-generating function for use with `arb_ipld_dag`.
+///
+/// Creates (a function that creates) an IPLD block with given links & some
+/// random `padding_bytes` bytes attached.
+pub fn links_to_padded_ipld(
+    padding_bytes: usize,
+) -> impl Fn(Vec<Cid>, &mut TestRng) -> (Cid, Ipld) + Clone {
+    move |cids, rng| {
+        let mut padding = Vec::with_capacity(padding_bytes);
+        for _ in 0..padding_bytes {
+            padding.push(rng.gen::<u8>());
+        }
+
+        let ipld = Ipld::Map(BTreeMap::from([
+            ("data".into(), Ipld::Bytes(padding)),
+            (
+                "links".into(),
+                Ipld::List(cids.into_iter().map(Ipld::Link).collect()),
+            ),
+        ]));
+        let bytes = encode(&ipld);
+        let cid = Cid::new_v1(IpldCodec::DagCbor.into(), Code::Blake3_256.digest(&bytes));
+        (cid, ipld)
+    }
+}
+
 /// Turn a directed acyclic graph into a list of nodes (with their CID) and a root CID.
 /// This will select only the DAG that's reachable from the root.
 pub fn dag_to_nodes<T>(
diff --git a/car-mirror/src/test_utils/local_utils.rs b/car-mirror/src/test_utils/local_utils.rs
index becf29f..5c7323f 100644
--- a/car-mirror/src/test_utils/local_utils.rs
+++ b/car-mirror/src/test_utils/local_utils.rs
@@ -1,16 +1,10 @@
 ///! Crate-local test utilities
-use super::{generate_dag, Rvg};
+use super::{arb_ipld_dag, links_to_padded_ipld, setup_blockstore, Rvg};
 use crate::{common::references, dag_walk::DagWalk};
 use anyhow::Result;
-use bytes::Bytes;
 use futures::TryStreamExt;
-use libipld::{Cid, Ipld, IpldCodec};
-use libipld_core::{
-    codec::Encode,
-    multihash::{Code, MultihashDigest},
-};
-use proptest::{prelude::Rng, strategy::Strategy};
-use std::collections::BTreeMap;
+use libipld::{Cid, Ipld};
+use proptest::strategy::Strategy;
 use wnfs_common::{BlockStore, MemoryBlockStore};
 
 #[derive(Clone, Debug)]
@@ -44,23 +38,7 @@ pub(crate) fn padded_dag_strategy(
     dag_size: u16,
     block_padding: usize,
 ) -> impl Strategy<Value = (Vec<(Cid, Ipld)>, Cid)> {
-    generate_dag(dag_size, move |cids, rng| {
-        let mut padding = Vec::with_capacity(block_padding);
-        for _ in 0..block_padding {
-            padding.push(rng.gen::<u8>());
-        }
-
-        let ipld = Ipld::Map(BTreeMap::from([
-            ("data".into(), Ipld::Bytes(padding)),
-            (
-                "links".into(),
-                Ipld::List(cids.into_iter().map(Ipld::Link).collect()),
-            ),
-        ]));
-        let bytes = encode(&ipld);
-        let cid = Cid::new_v1(IpldCodec::DagCbor.into(), Code::Blake3_256.digest(&bytes));
-        (cid, ipld)
-    })
+    arb_ipld_dag(1..dag_size, 0.5, links_to_padded_ipld(block_padding))
 }
 
 pub(crate) fn variable_blocksize_dag() -> impl Strategy<Value = (Vec<(Cid, Ipld)>, Cid)> {
@@ -76,20 +54,9 @@ pub(crate) fn variable_blocksize_dag() -> impl Strategy<Value = (Vec<(Cid, Ipld)
     const MAX_BLOCK_SIZE: usize = 256 * 1024;
     const MAX_BLOCK_PADDING: usize = MAX_BLOCK_SIZE - EST_OVERHEAD - MAX_LINK_BYTES;
 
-    (32..MAX_BLOCK_PADDING)
-        .prop_ind_flat_map(move |block_padding| padded_dag_strategy(MAX_DAG_NODES, block_padding))
-}
-
-pub(crate) async fn setup_blockstore(blocks: Vec<(Cid, Ipld)>) -> Result<MemoryBlockStore> {
-    let store = MemoryBlockStore::new();
-    for (cid, ipld) in blocks.into_iter() {
-        let cid_store = store
-            .put_block(encode(&ipld), IpldCodec::DagCbor.into())
-            .await?;
-        debug_assert_eq!(cid, cid_store);
-    }
-
-    Ok(store)
+    (32..MAX_BLOCK_PADDING).prop_ind_flat_map(move |block_padding| {
+        arb_ipld_dag(1..MAX_DAG_NODES, 0.5, links_to_padded_ipld(block_padding))
+    })
 }
 
 pub(crate) async fn setup_random_dag(
@@ -119,10 +86,3 @@ pub(crate) async fn total_dag_blocks(root: Cid, store: &impl BlockStore) -> Resu
         .await?
         .len())
 }
-
-/// Encode some IPLD as dag-cbor
-pub(crate) fn encode(ipld: &Ipld) -> Bytes {
-    let mut vec = Vec::new();
-    ipld.encode(IpldCodec::DagCbor, &mut vec).unwrap();
-    Bytes::from(vec)
-}
diff --git a/car-mirror/src/test_utils/mod.rs b/car-mirror/src/test_utils/mod.rs
index aa4d5d3..11cc566 100644
--- a/car-mirror/src/test_utils/mod.rs
+++ b/car-mirror/src/test_utils/mod.rs
@@ -7,6 +7,10 @@ mod rvg;
 pub use dag_strategy::*;
 #[cfg(feature = "test_utils")]
 pub use rvg::*;
+#[cfg(feature = "test_utils")]
+mod blockstore_utils;
+#[cfg(feature = "test_utils")]
+pub use blockstore_utils::*;
 
 #[cfg(test)]
 mod local_utils;
diff --git a/car-mirror/tests/integration_test.rs b/car-mirror/tests/integration_test.rs
deleted file mode 100644
index d60e3f3..0000000
--- a/car-mirror/tests/integration_test.rs
+++ /dev/null
@@ -1,4 +0,0 @@
-#[test]
-fn test_add() {
-    assert_eq!(car_mirror::add(3, 2), 5);
-}

From e3358fd3e97fa23c4c73583d1e4f6bb7cfda890e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= <philipp.krueger1@gmail.com>
Date: Mon, 21 Aug 2023 19:28:41 +0200
Subject: [PATCH 21/35] Create benchmarks with throttled `get_block`s

---
 Cargo.lock                                    |   3 +
 car-mirror-benches/Cargo.toml                 |  11 +-
 .../benches/artificially_slow_blockstore.rs   | 117 ++++++++++++++++++
 car-mirror/src/test_utils/blockstore_utils.rs |  12 +-
 4 files changed, 140 insertions(+), 3 deletions(-)
 create mode 100644 car-mirror-benches/benches/artificially_slow_blockstore.rs

diff --git a/Cargo.lock b/Cargo.lock
index 54cf3d0..d80f0db 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -405,8 +405,11 @@ version = "0.1.0"
 dependencies = [
  "anyhow",
  "async-std",
+ "async-trait",
+ "bytes",
  "car-mirror",
  "criterion",
+ "libipld",
  "wnfs-common",
 ]
 
diff --git a/car-mirror-benches/Cargo.toml b/car-mirror-benches/Cargo.toml
index 7b2df9e..ba1c467 100644
--- a/car-mirror-benches/Cargo.toml
+++ b/car-mirror-benches/Cargo.toml
@@ -6,10 +6,13 @@ edition = "2021"
 authors = ["Stephen Akinyemi <appcypher@outlook.com>"]
 
 [dependencies]
+anyhow = "1.0"
+async-std = { version = "1.11", features = ["attributes"] }
+async-trait = "0.1"
+bytes = "1.4.0"
 car-mirror = { path = "../car-mirror", version = "0.1", features = ["test_utils"] }
+libipld = "0.16.0"
 wnfs-common = "0.1.23"
-async-std = { version = "1.11", features = ["attributes"] }
-anyhow = "1.0"
 
 [dev-dependencies]
 criterion = { version = "0.4", default-features = false }
@@ -17,3 +20,7 @@ criterion = { version = "0.4", default-features = false }
 [[bench]]
 name = "in_memory"
 harness = false
+
+[[bench]]
+name = "artificially_slow_blockstore"
+harness = false
diff --git a/car-mirror-benches/benches/artificially_slow_blockstore.rs b/car-mirror-benches/benches/artificially_slow_blockstore.rs
new file mode 100644
index 0000000..f74ec72
--- /dev/null
+++ b/car-mirror-benches/benches/artificially_slow_blockstore.rs
@@ -0,0 +1,117 @@
+use anyhow::Result;
+use async_trait::async_trait;
+use bytes::Bytes;
+use car_mirror::{
+    common::Config,
+    pull, push,
+    test_utils::{arb_ipld_dag, links_to_padded_ipld, setup_blockstore},
+};
+use criterion::{criterion_group, criterion_main, BatchSize, Criterion};
+use libipld::Cid;
+use std::time::Duration;
+use wnfs_common::{BlockStore, MemoryBlockStore};
+
+pub fn push_throttled(c: &mut Criterion) {
+    let mut rvg = car_mirror::test_utils::Rvg::deterministic();
+
+    c.bench_function("push cold, get_block throttled", |b| {
+        b.iter_batched(
+            || {
+                let (blocks, root) = rvg.sample(&arb_ipld_dag(
+                    60..64,
+                    0.9, // Very highly connected
+                    links_to_padded_ipld(10 * 1024),
+                ));
+                let store = async_std::task::block_on(setup_blockstore(blocks)).unwrap();
+                (store, root)
+            },
+            |(client_store, root)| {
+                let client_store = &ThrottledBlockStore(client_store);
+                let server_store = &ThrottledBlockStore::new();
+                let config = &Config::default();
+
+                // Simulate a multi-round protocol run in-memory
+                async_std::task::block_on(async move {
+                    let mut request = push::request(root, None, config, client_store).await?;
+                    loop {
+                        let response = push::response(root, request, config, server_store).await?;
+
+                        if response.indicates_finished() {
+                            break;
+                        }
+                        request = push::request(root, Some(response), config, client_store).await?;
+                    }
+
+                    Ok::<(), anyhow::Error>(())
+                })
+                .unwrap();
+            },
+            BatchSize::LargeInput,
+        )
+    });
+}
+
+pub fn pull_throttled(c: &mut Criterion) {
+    let mut rvg = car_mirror::test_utils::Rvg::deterministic();
+
+    c.bench_function("pull cold, get_block throttled", |b| {
+        b.iter_batched(
+            || {
+                let (blocks, root) = rvg.sample(&arb_ipld_dag(
+                    60..64,
+                    0.9,                             // Very highly connected
+                    links_to_padded_ipld(10 * 1024), // 10KiB random data added
+                ));
+                let store = async_std::task::block_on(setup_blockstore(blocks)).unwrap();
+                (store, root)
+            },
+            |(server_store, root)| {
+                let server_store = &ThrottledBlockStore(server_store);
+                let client_store = &ThrottledBlockStore::new();
+                let config = &Config::default();
+
+                // Simulate a multi-round protocol run in-memory
+                async_std::task::block_on(async move {
+                    let mut request = pull::request(root, None, config, client_store).await?;
+                    loop {
+                        let response = pull::response(root, request, config, server_store).await?;
+                        request = pull::request(root, Some(response), config, client_store).await?;
+
+                        if request.indicates_finished() {
+                            break;
+                        }
+                    }
+
+                    Ok::<(), anyhow::Error>(())
+                })
+                .unwrap();
+            },
+            BatchSize::LargeInput,
+        )
+    });
+}
+
+#[derive(Debug, Clone)]
+struct ThrottledBlockStore(MemoryBlockStore);
+
+#[async_trait(?Send)]
+impl BlockStore for ThrottledBlockStore {
+    async fn get_block(&self, cid: &Cid) -> Result<Bytes> {
+        let bytes = self.0.get_block(cid).await?;
+        async_std::task::sleep(Duration::from_micros(50)).await; // Block fetching is artificially slowed by 50 microseconds
+        Ok(bytes)
+    }
+
+    async fn put_block(&self, bytes: impl Into<Bytes>, codec: u64) -> Result<Cid> {
+        self.0.put_block(bytes, codec).await
+    }
+}
+
+impl ThrottledBlockStore {
+    pub fn new() -> Self {
+        Self(MemoryBlockStore::new())
+    }
+}
+
+criterion_group!(benches, push_throttled, pull_throttled);
+criterion_main!(benches);
diff --git a/car-mirror/src/test_utils/blockstore_utils.rs b/car-mirror/src/test_utils/blockstore_utils.rs
index 4bf9b4c..c3ab2c2 100644
--- a/car-mirror/src/test_utils/blockstore_utils.rs
+++ b/car-mirror/src/test_utils/blockstore_utils.rs
@@ -8,6 +8,16 @@ use wnfs_common::{BlockStore, MemoryBlockStore};
 /// MemoryBlockStore & return it.
 pub async fn setup_blockstore(blocks: Vec<(Cid, Ipld)>) -> Result<MemoryBlockStore> {
     let store = MemoryBlockStore::new();
+    setup_existing_blockstore(blocks, &store).await?;
+    Ok(store)
+}
+
+/// Take a list of dag-cbor IPLD blocks and store all of them as dag-cbor in
+/// the given `BlockStore`.
+pub async fn setup_existing_blockstore(
+    blocks: Vec<(Cid, Ipld)>,
+    store: &impl BlockStore,
+) -> Result<()> {
     for (cid, ipld) in blocks.into_iter() {
         let cid_store = store
             .put_block(encode(&ipld), IpldCodec::DagCbor.into())
@@ -15,7 +25,7 @@ pub async fn setup_blockstore(blocks: Vec<(Cid, Ipld)>) -> Result<MemoryBlockSto
         debug_assert_eq!(cid, cid_store);
     }
 
-    Ok(store)
+    Ok(())
 }
 
 /// Encode some IPLD as dag-cbor.

From 6b45fab652a7ba1f455107f3f2971b773d0f6e81 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= <philipp.krueger1@gmail.com>
Date: Mon, 21 Aug 2023 19:37:28 +0200
Subject: [PATCH 22/35] Add some perf improvement idea comments

---
 car-mirror/src/dag_walk.rs | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/car-mirror/src/dag_walk.rs b/car-mirror/src/dag_walk.rs
index 707c617..2f9b3f0 100644
--- a/car-mirror/src/dag_walk.rs
+++ b/car-mirror/src/dag_walk.rs
@@ -72,6 +72,9 @@ impl DagWalk {
             }
         };
 
+        // TODO: Two opportunities for performance improvement:
+        // - skip Raw CIDs. They can't have further links (but needs adjustment to this function's return type)
+        // - run multiple `get_block` calls concurrently
         let block = store.get_block(&cid).await?;
         for ref_cid in references(cid, &block)? {
             if !self.visited.contains(&ref_cid) {

From 8b3d6494e536210148c76666dfed4dc76d147449 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= <philipp.krueger1@gmail.com>
Date: Mon, 21 Aug 2023 19:40:35 +0200
Subject: [PATCH 23/35] Fix lints

---
 car-mirror/src/common.rs | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/car-mirror/src/common.rs b/car-mirror/src/common.rs
index ef48acc..6ede3e5 100644
--- a/car-mirror/src/common.rs
+++ b/car-mirror/src/common.rs
@@ -243,14 +243,14 @@ impl From<PullRequest> for ReceiverState {
     }
 }
 
-impl Into<PushResponse> for ReceiverState {
-    fn into(self) -> PushResponse {
+impl From<ReceiverState> for PushResponse {
+    fn from(receiver_state: ReceiverState) -> PushResponse {
         let ReceiverState {
             missing_subgraph_roots,
             have_cids_bloom,
-        } = self;
+        } = receiver_state;
 
-        let (bloom_k, bloom) = Self::bloom_serialize(have_cids_bloom);
+        let (bloom_k, bloom) = ReceiverState::bloom_serialize(have_cids_bloom);
 
         PushResponse {
             subgraph_roots: missing_subgraph_roots,
@@ -260,14 +260,14 @@ impl Into<PushResponse> for ReceiverState {
     }
 }
 
-impl Into<PullRequest> for ReceiverState {
-    fn into(self) -> PullRequest {
+impl From<ReceiverState> for PullRequest {
+    fn from(receiver_state: ReceiverState) -> PullRequest {
         let ReceiverState {
             missing_subgraph_roots,
             have_cids_bloom,
-        } = self;
+        } = receiver_state;
 
-        let (bloom_k, bloom) = Self::bloom_serialize(have_cids_bloom);
+        let (bloom_k, bloom) = ReceiverState::bloom_serialize(have_cids_bloom);
 
         PullRequest {
             resources: missing_subgraph_roots,

From 10e638aee948381c81c2e35e1005e7ea8d793f96 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= <philipp.krueger1@gmail.com>
Date: Mon, 21 Aug 2023 19:47:28 +0200
Subject: [PATCH 24/35] set MSRV to 1.66

---
 .github/workflows/tests_and_checks.yml |  2 +-
 Cargo.toml                             |  6 +++---
 car-mirror-wasm/Cargo.toml             |  2 +-
 car-mirror/Cargo.toml                  | 10 +++++-----
 4 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/tests_and_checks.yml b/.github/workflows/tests_and_checks.yml
index b728ad7..8acc4f8 100644
--- a/.github/workflows/tests_and_checks.yml
+++ b/.github/workflows/tests_and_checks.yml
@@ -21,7 +21,7 @@ jobs:
           - stable
           - nightly
           # minimum version
-          - 1.64
+          - 1.66
     steps:
       - name: Checkout Repository
         uses: actions/checkout@v3
diff --git a/Cargo.toml b/Cargo.toml
index 96fa3b8..aa734b0 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,10 +1,10 @@
 [workspace]
 members = [
-	"examples",
 	"car-mirror",
 	"car-mirror-benches",
 	"car-mirror-wasm"
-]
+,
+	"examples"]
 
 # See https://doc.rust-lang.org/cargo/reference/profiles.html for more info.
 [profile.release.package.car-mirror-wasm]
@@ -23,4 +23,4 @@ opt-level = "s" # or 'z' to optimize "aggressively" for size
 # See https://blog.rust-lang.org/2021/03/25/Rust-1.51.0.html#splitting-debug-information
 [profile.dev]
 split-debuginfo = "unpacked"
-opt-level = 3
\ No newline at end of file
+opt-level = 3
diff --git a/car-mirror-wasm/Cargo.toml b/car-mirror-wasm/Cargo.toml
index 97e75a3..0dcdd3a 100644
--- a/car-mirror-wasm/Cargo.toml
+++ b/car-mirror-wasm/Cargo.toml
@@ -8,7 +8,7 @@ include = ["/src", "README.md", "LICENSE-APACHE", "LICENSE-MIT"]
 license = "Apache-2.0 or MIT"
 readme = "README.md"
 edition = "2021"
-rust-version = "1.64"
+rust-version = "1.66"
 documentation = "https://docs.rs/car-mirror-wasm"
 repository = "https://github.com/fission-codes/rs-car-mirror/tree/main/car-mirror-wasm"
 authors = ["Stephen Akinyemi <appcypher@outlook.com>"]
diff --git a/car-mirror/Cargo.toml b/car-mirror/Cargo.toml
index e5216c2..8771faf 100644
--- a/car-mirror/Cargo.toml
+++ b/car-mirror/Cargo.toml
@@ -8,7 +8,7 @@ include = ["/src", "README.md", "LICENSE-APACHE", "LICENSE-MIT"]
 license = "Apache-2.0 or MIT"
 readme = "README.md"
 edition = "2021"
-rust-version = "1.64"
+rust-version = "1.66"
 documentation = "https://docs.rs/car-mirror"
 repository = "https://github.com/fission-codes/rs-car-mirror/tree/main/car-mirror"
 authors = ["Stephen Akinyemi <appcypher@outlook.com>"]
@@ -25,6 +25,7 @@ doc = true
 [dependencies]
 anyhow = "1.0"
 async-stream = "0.3.5"
+async-trait = "0.1.73"
 bytes = "1.4.0"
 deterministic-bloom = { git = "https://github.com/wnfs-wg/deterministic-bloom#a8cd85b" }
 fixedbitset = "0.4.2"
@@ -34,14 +35,13 @@ libipld = "0.16.0"
 libipld-core = "0.16.0"
 proptest = { version = "1.1", optional = true }
 roaring-graphs = "0.12"
-tokio-util = { version = "0.7.8", features = ["compat"] }
+serde = "1.0.183"
+serde_ipld_dagcbor = "0.4.0"
 tokio = { version = "^1", features = ["io-util"] }
+tokio-util = { version = "0.7.8", features = ["compat"] }
 tracing = "0.1"
 tracing-subscriber = "0.3"
 wnfs-common = "0.1.23"
-async-trait = "0.1.73"
-serde_ipld_dagcbor = "0.4.0"
-serde = "1.0.183"
 
 [dev-dependencies]
 async-std = { version = "1.11", features = ["attributes"] }

From 2bce61a12035f3dc8226a8cefc9d9545b7c7f16b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= <philipp.krueger1@gmail.com>
Date: Mon, 21 Aug 2023 19:55:03 +0200
Subject: [PATCH 25/35] Specifically allow the BSL license

---
 deny.toml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/deny.toml b/deny.toml
index f532111..a6c5ca1 100644
--- a/deny.toml
+++ b/deny.toml
@@ -76,7 +76,8 @@ allow = [
     "BSD-2-Clause",
     "BSD-3-Clause",
     "ISC",
-    "Zlib"
+    "Zlib",
+    "BSL-1.0"
 ]
 # List of explicitly disallowed licenses
 # See https://spdx.org/licenses/ for list of possible licenses

From 208519a0ad0cb3f713c256b7046c530987ec603e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= <philipp.krueger1@gmail.com>
Date: Mon, 21 Aug 2023 20:14:52 +0200
Subject: [PATCH 26/35] Depend on published `deterministic-bloom`

---
 Cargo.lock            | 3 ++-
 car-mirror/Cargo.toml | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index d80f0db..6278cd4 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -670,7 +670,8 @@ dependencies = [
 [[package]]
 name = "deterministic-bloom"
 version = "0.1.0"
-source = "git+https://github.com/wnfs-wg/deterministic-bloom#a8cd85b#a8cd85b1d71da9f79f5058c0a20e53a83a283230"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "12a3873e91e360aee2403cbafd2beb42f02ace06da9b053574518f003aa2490d"
 dependencies = [
  "bitvec",
  "miette",
diff --git a/car-mirror/Cargo.toml b/car-mirror/Cargo.toml
index 8771faf..77855f8 100644
--- a/car-mirror/Cargo.toml
+++ b/car-mirror/Cargo.toml
@@ -27,7 +27,7 @@ anyhow = "1.0"
 async-stream = "0.3.5"
 async-trait = "0.1.73"
 bytes = "1.4.0"
-deterministic-bloom = { git = "https://github.com/wnfs-wg/deterministic-bloom#a8cd85b" }
+deterministic-bloom = "0.1"
 fixedbitset = "0.4.2"
 futures = "0.3.28"
 iroh-car = "0.3.0"

From cbe9246cba3b6bef30b98964c8fdf3c155dbd532 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= <philipp.krueger1@gmail.com>
Date: Mon, 21 Aug 2023 20:15:21 +0200
Subject: [PATCH 27/35] Lint

---
 car-mirror/src/common.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/car-mirror/src/common.rs b/car-mirror/src/common.rs
index 6ede3e5..bdc91b7 100644
--- a/car-mirror/src/common.rs
+++ b/car-mirror/src/common.rs
@@ -86,7 +86,7 @@ pub async fn block_send(
         .try_collect()
         .await?;
 
-    let bloom = have_cids_bloom.unwrap_or(BloomFilter::new_with(1, Box::new([0]))); // An empty bloom that contains nothing
+    let bloom = have_cids_bloom.unwrap_or_else(|| BloomFilter::new_with(1, Box::new([0]))); // An empty bloom that contains nothing
 
     let mut writer = CarWriter::new(
         CarHeader::new_v1(

From d48e3828707384c0bdb160323993f94e87f7aa97 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= <philipp.krueger1@gmail.com>
Date: Mon, 21 Aug 2023 20:17:57 +0200
Subject: [PATCH 28/35] Some helpful doc

---
 car-mirror/src/lib.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/car-mirror/src/lib.rs b/car-mirror/src/lib.rs
index 04ef44a..b40d1a0 100644
--- a/car-mirror/src/lib.rs
+++ b/car-mirror/src/lib.rs
@@ -4,7 +4,7 @@
 
 //! car-mirror
 
-/// Test utilities.
+/// Test utilities. Enabled with the `test_utils` feature flag.
 #[cfg(any(test, feature = "test_utils"))]
 #[cfg_attr(docsrs, doc(cfg(feature = "test_utils")))]
 pub mod test_utils;

From a257f6e9210cf82e0849a4008c711d18804d34e7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= <philipp.krueger1@gmail.com>
Date: Fri, 25 Aug 2023 11:40:44 +0200
Subject: [PATCH 29/35] Updates from feedback

---
 Cargo.lock                                    | 23 ------
 car-mirror/Cargo.toml                         |  8 +-
 car-mirror/src/common.rs                      | 19 +++--
 car-mirror/src/dag_walk.rs                    |  9 ++-
 car-mirror/src/incremental_verification.rs    | 79 ++++++++++++-------
 car-mirror/src/test_utils/blockstore_utils.rs | 48 ++++++++---
 car-mirror/src/test_utils/dag_strategy.rs     | 12 ++-
 7 files changed, 117 insertions(+), 81 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 6278cd4..32f2802 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -382,7 +382,6 @@ dependencies = [
  "bytes",
  "car-mirror",
  "deterministic-bloom",
- "fixedbitset",
  "futures",
  "iroh-car",
  "libipld",
@@ -392,8 +391,6 @@ dependencies = [
  "serde",
  "serde_ipld_dagcbor",
  "test-strategy",
- "tokio",
- "tokio-util",
  "tracing",
  "tracing-subscriber",
  "wnfs-common",
@@ -741,12 +738,6 @@ dependencies = [
  "instant",
 ]
 
-[[package]]
-name = "fixedbitset"
-version = "0.4.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80"
-
 [[package]]
 name = "fnv"
 version = "1.0.7"
@@ -1850,20 +1841,6 @@ dependencies = [
  "pin-project-lite",
 ]
 
-[[package]]
-name = "tokio-util"
-version = "0.7.8"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "806fe8c2c87eccc8b3267cbae29ed3ab2d0bd37fca70ab622e46aaa9375ddb7d"
-dependencies = [
- "bytes",
- "futures-core",
- "futures-io",
- "futures-sink",
- "pin-project-lite",
- "tokio",
-]
-
 [[package]]
 name = "toml"
 version = "0.5.11"
diff --git a/car-mirror/Cargo.toml b/car-mirror/Cargo.toml
index 77855f8..3a59ad5 100644
--- a/car-mirror/Cargo.toml
+++ b/car-mirror/Cargo.toml
@@ -28,17 +28,14 @@ async-stream = "0.3.5"
 async-trait = "0.1.73"
 bytes = "1.4.0"
 deterministic-bloom = "0.1"
-fixedbitset = "0.4.2"
 futures = "0.3.28"
 iroh-car = "0.3.0"
 libipld = "0.16.0"
 libipld-core = "0.16.0"
 proptest = { version = "1.1", optional = true }
-roaring-graphs = "0.12"
+roaring-graphs = { version = "0.12", optional = true }
 serde = "1.0.183"
 serde_ipld_dagcbor = "0.4.0"
-tokio = { version = "^1", features = ["io-util"] }
-tokio-util = { version = "0.7.8", features = ["compat"] }
 tracing = "0.1"
 tracing-subscriber = "0.3"
 wnfs-common = "0.1.23"
@@ -47,11 +44,12 @@ wnfs-common = "0.1.23"
 async-std = { version = "1.11", features = ["attributes"] }
 car-mirror = { path = ".", features = ["test_utils"] }
 proptest = "1.1"
+roaring-graphs = "0.12"
 test-strategy = "0.3"
 
 [features]
 default = []
-test_utils = ["proptest"]
+test_utils = ["proptest", "roaring-graphs"]
 
 [package.metadata.docs.rs]
 all-features = true
diff --git a/car-mirror/src/common.rs b/car-mirror/src/common.rs
index bdc91b7..cc26d8c 100644
--- a/car-mirror/src/common.rs
+++ b/car-mirror/src/common.rs
@@ -10,7 +10,7 @@ use wnfs_common::BlockStore;
 
 use crate::{
     dag_walk::DagWalk,
-    incremental_verification::IncrementalDagVerification,
+    incremental_verification::{BlockState, IncrementalDagVerification},
     messages::{PullRequest, PushResponse},
 };
 
@@ -108,7 +108,7 @@ pub async fn block_send(
     let mut dag_walk = DagWalk::breadth_first(subgraph_roots.clone());
     while let Some((cid, block)) = dag_walk.next(store).await? {
         if bloom.contains(&cid.to_bytes()) && !subgraph_roots.contains(&cid) {
-            break;
+            continue;
         }
 
         writer.write(cid, &block).await?;
@@ -157,9 +157,18 @@ pub async fn block_receive(
                 );
             }
 
-            dag_verification
-                .verify_and_store_block((cid, block), store)
-                .await?;
+            match dag_verification.block_state(cid) {
+                BlockState::Have => continue,
+                BlockState::Unexpected => {
+                    eprintln!("Warn: Received block {cid} out of order, may be due to bloom false positive.");
+                    break;
+                }
+                BlockState::Want => {
+                    dag_verification
+                        .verify_and_store_block((cid, block), store)
+                        .await?;
+                }
+            }
         }
     }
 
diff --git a/car-mirror/src/dag_walk.rs b/car-mirror/src/dag_walk.rs
index 2f9b3f0..9050648 100644
--- a/car-mirror/src/dag_walk.rs
+++ b/car-mirror/src/dag_walk.rs
@@ -165,7 +165,7 @@ mod tests {
 #[cfg(test)]
 mod proptests {
     use super::*;
-    use crate::test_utils::{arb_ipld_dag, encode};
+    use crate::test_utils::arb_ipld_dag;
     use futures::TryStreamExt;
     use libipld::{
         multihash::{Code, MultihashDigest},
@@ -174,14 +174,14 @@ mod proptests {
     use proptest::strategy::Strategy;
     use std::collections::BTreeSet;
     use test_strategy::proptest;
-    use wnfs_common::{BlockStore, MemoryBlockStore};
+    use wnfs_common::{dagcbor::encode, BlockStore, MemoryBlockStore};
 
     fn ipld_dags() -> impl Strategy<Value = (Vec<(Cid, Ipld)>, Cid)> {
         arb_ipld_dag(1..256, 0.5, |cids, _| {
             let ipld = Ipld::List(cids.into_iter().map(Ipld::Link).collect());
             let cid = Cid::new_v1(
                 IpldCodec::DagCbor.into(),
-                Code::Blake3_256.digest(&encode(&ipld)),
+                Code::Blake3_256.digest(&encode(&ipld).unwrap()),
             );
             (cid, ipld)
         })
@@ -193,8 +193,9 @@ mod proptests {
             let (dag, root) = dag;
             let store = &MemoryBlockStore::new();
             for (cid, ipld) in dag.iter() {
+                let block: Bytes = encode(ipld).unwrap().into();
                 let cid_store = store
-                    .put_block(encode(ipld), IpldCodec::DagCbor.into())
+                    .put_block(block, IpldCodec::DagCbor.into())
                     .await
                     .unwrap();
                 assert_eq!(*cid, cid_store);
diff --git a/car-mirror/src/incremental_verification.rs b/car-mirror/src/incremental_verification.rs
index 24edb3b..506c0c7 100644
--- a/car-mirror/src/incremental_verification.rs
+++ b/car-mirror/src/incremental_verification.rs
@@ -1,8 +1,8 @@
-use crate::{common::references, dag_walk::DagWalk};
+use crate::dag_walk::DagWalk;
 use anyhow::{bail, Result};
 use bytes::Bytes;
 use libipld_core::cid::Cid;
-use std::{collections::HashSet, eprintln};
+use std::{collections::HashSet, matches};
 use wnfs_common::{BlockStore, BlockStoreError};
 
 /// A data structure that keeps state about incremental DAG verification.
@@ -14,6 +14,17 @@ pub struct IncrementalDagVerification {
     pub have_cids: HashSet<Cid>,
 }
 
+/// The state of a block retrieval
+#[derive(Clone, Debug, PartialEq, Eq)]
+pub enum BlockState {
+    /// The block was already received/is already stored
+    Have,
+    /// We know we will need this block
+    Want,
+    /// We don't know whether we'll need this block
+    Unexpected,
+}
+
 impl IncrementalDagVerification {
     /// Initiate incremental DAG verification of given roots.
     ///
@@ -23,9 +34,18 @@ impl IncrementalDagVerification {
         roots: impl IntoIterator<Item = Cid>,
         store: &impl BlockStore,
     ) -> Result<Self> {
-        let mut want_cids = HashSet::new();
-        let mut have_cids = HashSet::new();
-        let mut dag_walk = DagWalk::breadth_first(roots);
+        let mut this = Self {
+            want_cids: roots.into_iter().collect(),
+            have_cids: HashSet::new(),
+        };
+
+        this.update_have_cids(store).await?;
+
+        Ok(this)
+    }
+
+    async fn update_have_cids(&mut self, store: &impl BlockStore) -> Result<()> {
+        let mut dag_walk = DagWalk::breadth_first(self.want_cids.iter().cloned());
 
         loop {
             match dag_walk.next(store).await {
@@ -33,13 +53,14 @@ impl IncrementalDagVerification {
                     if let Some(BlockStoreError::CIDNotFound(not_found)) =
                         e.downcast_ref::<BlockStoreError>()
                     {
-                        want_cids.insert(*not_found);
+                        self.want_cids.insert(*not_found);
                     } else {
                         bail!(e);
                     }
                 }
                 Ok(Some((cid, _))) => {
-                    have_cids.insert(cid);
+                    self.want_cids.remove(&cid);
+                    self.have_cids.insert(cid);
                 }
                 Ok(None) => {
                     break;
@@ -47,15 +68,27 @@ impl IncrementalDagVerification {
             }
         }
 
-        Ok(Self {
-            want_cids,
-            have_cids,
-        })
+        Ok(())
+    }
+
+    /// Check the state of a CID to find out whether
+    /// - we expect it as one of the next possible blocks to receive (Want)
+    /// - we have already stored it (Have)
+    /// - we don't know whether we need it (Unexpected)
+    pub fn block_state(&self, cid: Cid) -> BlockState {
+        if self.want_cids.contains(&cid) {
+            BlockState::Want
+        } else if self.have_cids.contains(&cid) {
+            BlockState::Have
+        } else {
+            BlockState::Unexpected
+        }
     }
 
     /// Verify that
-    /// - the block actually hashes to the hash from given CID and
     /// - the block is part of the graph below the roots.
+    /// - the block hasn't been received before, and
+    /// - the block actually hashes to the hash from the given CID.
     ///
     /// And finally stores the block in the blockstore.
     ///
@@ -71,29 +104,21 @@ impl IncrementalDagVerification {
     ) -> Result<()> {
         let (cid, bytes) = block;
 
-        if !self.want_cids.contains(&cid) {
-            if self.have_cids.contains(&cid) {
-                eprintln!("Warn: Received {cid}, even though we already have it");
-            } else {
-                bail!("Unexpected block or block out of order: {cid}");
-            }
+        let block_state = self.block_state(cid);
+        if !matches!(block_state, BlockState::Want) {
+            bail!("Incremental verification failed. Block state is: {block_state:?}, expected BlockState::Want");
         }
 
-        let refs = references(cid, &bytes)?;
+        // TODO(matheus23): Verify hash before putting it into the blockstore.
         let result_cid = store.put_block(bytes, cid.codec()).await?;
 
+        // TODO(matheus23): The BlockStore chooses the hashing function,
+        // so it may choose a different hashing function, causing a mismatch
         if result_cid != cid {
             bail!("Digest mismatch in CAR file: expected {cid}, got {result_cid}");
         }
 
-        for ref_cid in refs {
-            if !self.have_cids.contains(&ref_cid) {
-                self.want_cids.insert(ref_cid);
-            }
-        }
-
-        self.want_cids.remove(&cid);
-        self.have_cids.insert(cid);
+        self.update_have_cids(store).await?;
 
         Ok(())
     }
diff --git a/car-mirror/src/test_utils/blockstore_utils.rs b/car-mirror/src/test_utils/blockstore_utils.rs
index c3ab2c2..4bce5aa 100644
--- a/car-mirror/src/test_utils/blockstore_utils.rs
+++ b/car-mirror/src/test_utils/blockstore_utils.rs
@@ -1,8 +1,9 @@
+use crate::common::references;
 use anyhow::Result;
 use bytes::Bytes;
 use libipld::{Cid, Ipld, IpldCodec};
-use libipld_core::codec::Encode;
-use wnfs_common::{BlockStore, MemoryBlockStore};
+use std::io::Write;
+use wnfs_common::{dagcbor::encode, BlockStore, MemoryBlockStore};
 
 /// Take a list of dag-cbor IPLD blocks and store all of them as dag-cbor in a
 /// MemoryBlockStore & return it.
@@ -19,18 +20,45 @@ pub async fn setup_existing_blockstore(
     store: &impl BlockStore,
 ) -> Result<()> {
     for (cid, ipld) in blocks.into_iter() {
-        let cid_store = store
-            .put_block(encode(&ipld), IpldCodec::DagCbor.into())
-            .await?;
+        let block: Bytes = encode(&ipld)?.into();
+        let cid_store = store.put_block(block, IpldCodec::DagCbor.into()).await?;
         debug_assert_eq!(cid, cid_store);
     }
 
     Ok(())
 }
 
-/// Encode some IPLD as dag-cbor.
-pub fn encode(ipld: &Ipld) -> Bytes {
-    let mut vec = Vec::new();
-    ipld.encode(IpldCodec::DagCbor, &mut vec).unwrap();
-    Bytes::from(vec)
+/// Print a DAG as a dot file with truncated CIDs
+pub fn dag_to_dot(
+    writer: &mut impl Write,
+    blocks: impl IntoIterator<Item = (Cid, Ipld)>,
+) -> Result<()> {
+    writeln!(writer, "digraph {{")?;
+
+    for (cid, ipld) in blocks {
+        let bytes = encode(&ipld)?;
+        let refs = references(cid, &bytes)?;
+        for to_cid in refs {
+            print_truncated_string(writer, cid.to_string())?;
+            write!(writer, " -> ")?;
+            print_truncated_string(writer, to_cid.to_string())?;
+            writeln!(writer)?;
+        }
+    }
+
+    writeln!(writer, "}}")?;
+
+    Ok(())
+}
+
+fn print_truncated_string(writer: &mut impl Write, mut string: String) -> Result<()> {
+    if string.len() > 20 {
+        let mut string_rest = string.split_off(10);
+        let string_end = string_rest.split_off(std::cmp::max(string_rest.len(), 10) - 10);
+        write!(writer, "\"{string}...{string_end}\"")?;
+    } else {
+        write!(writer, "\"{string}\"")?;
+    }
+
+    Ok(())
 }
diff --git a/car-mirror/src/test_utils/dag_strategy.rs b/car-mirror/src/test_utils/dag_strategy.rs
index e751f55..571d376 100644
--- a/car-mirror/src/test_utils/dag_strategy.rs
+++ b/car-mirror/src/test_utils/dag_strategy.rs
@@ -1,4 +1,3 @@
-use super::encode;
 use bytes::Bytes;
 use libipld::{Cid, Ipld, IpldCodec};
 use libipld_core::multihash::{Code, MultihashDigest};
@@ -9,6 +8,7 @@ use std::{
     fmt::Debug,
     ops::Range,
 };
+use wnfs_common::dagcbor::encode;
 
 /// A strategy for use with proptest to generate random DAGs (directed acyclic graphs).
 /// The strategy generates a list of blocks of type T and their CIDs, as well as
@@ -25,17 +25,15 @@ pub fn arb_ipld_dag<T: Debug + Clone>(
 /// A block-generating function for use with `arb_ipld_dag`.
 pub fn links_to_ipld(cids: Vec<Cid>, _: &mut TestRng) -> (Cid, Ipld) {
     let ipld = Ipld::List(cids.into_iter().map(Ipld::Link).collect());
-    let cid = Cid::new_v1(
-        IpldCodec::DagCbor.into(),
-        Code::Blake3_256.digest(&encode(&ipld)),
-    );
+    let bytes = encode(&ipld).unwrap();
+    let cid = Cid::new_v1(IpldCodec::DagCbor.into(), Code::Blake3_256.digest(&bytes));
     (cid, ipld)
 }
 
 /// A block-generating function for use with `arb_ipld_dag`.
 pub fn links_to_dag_cbor(cids: Vec<Cid>, _: &mut TestRng) -> (Cid, Bytes) {
     let ipld = Ipld::List(cids.into_iter().map(Ipld::Link).collect());
-    let bytes = encode(&ipld);
+    let bytes: Bytes = encode(&ipld).unwrap().into();
     let cid = Cid::new_v1(IpldCodec::DagCbor.into(), Code::Blake3_256.digest(&bytes));
     (cid, bytes)
 }
@@ -60,7 +58,7 @@ pub fn links_to_padded_ipld(
                 Ipld::List(cids.into_iter().map(Ipld::Link).collect()),
             ),
         ]));
-        let bytes = encode(&ipld);
+        let bytes = encode(&ipld).unwrap();
         let cid = Cid::new_v1(IpldCodec::DagCbor.into(), Code::Blake3_256.digest(&bytes));
         (cid, ipld)
     }

From 9d55be9f17c31620b3774e5441a0f60b969d5005 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= <philipp.krueger1@gmail.com>
Date: Fri, 25 Aug 2023 11:43:23 +0200
Subject: [PATCH 30/35] Fix typo

---
 car-mirror/src/push.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/car-mirror/src/push.rs b/car-mirror/src/push.rs
index 03f54fc..0fb229d 100644
--- a/car-mirror/src/push.rs
+++ b/car-mirror/src/push.rs
@@ -32,7 +32,7 @@ pub async fn request(
 /// in the given `store`, if the blocks can be shown to relate
 /// to the `root` CID.
 ///
-/// Returnes a response that gives the client information about what
+/// Returns a response that gives the client information about what
 /// other data remains to be fetched.
 pub async fn response(
     root: Cid,

From 1cd43c7ed4e5ce815ba26ffd0315316d7ce12d18 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= <philipp.krueger1@gmail.com>
Date: Fri, 25 Aug 2023 11:48:51 +0200
Subject: [PATCH 31/35] Check the block hash prior to storing the block

---
 car-mirror/src/incremental_verification.rs | 23 ++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/car-mirror/src/incremental_verification.rs b/car-mirror/src/incremental_verification.rs
index 506c0c7..0960699 100644
--- a/car-mirror/src/incremental_verification.rs
+++ b/car-mirror/src/incremental_verification.rs
@@ -1,7 +1,10 @@
 use crate::dag_walk::DagWalk;
-use anyhow::{bail, Result};
+use anyhow::{anyhow, bail, Result};
 use bytes::Bytes;
-use libipld_core::cid::Cid;
+use libipld_core::{
+    cid::Cid,
+    multihash::{Code, MultihashDigest},
+};
 use std::{collections::HashSet, matches};
 use wnfs_common::{BlockStore, BlockStoreError};
 
@@ -109,13 +112,25 @@ impl IncrementalDagVerification {
             bail!("Incremental verification failed. Block state is: {block_state:?}, expected BlockState::Want");
         }
 
-        // TODO(matheus23): Verify hash before putting it into the blockstore.
+        let hash_func: Code = cid
+            .hash()
+            .code()
+            .try_into()
+            .map_err(|_| anyhow!("Unsupported hash code in CID {cid}"))?;
+
+        let hash = hash_func.digest(bytes.as_ref());
+
+        if &hash != cid.hash() {
+            let result_cid = Cid::new_v1(cid.codec(), hash);
+            bail!("Digest mismatch in CAR file: expected {cid}, got {result_cid}");
+        }
+
         let result_cid = store.put_block(bytes, cid.codec()).await?;
 
         // TODO(matheus23): The BlockStore chooses the hashing function,
         // so it may choose a different hashing function, causing a mismatch
         if result_cid != cid {
-            bail!("Digest mismatch in CAR file: expected {cid}, got {result_cid}");
+            bail!("BlockStore uses an incompatible hashing function: CID mismatched, expected {cid}, got {result_cid}");
         }
 
         self.update_have_cids(store).await?;

From a6d33ebaee81e4baa85801a495abf24e91c7ad3f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= <philipp.krueger1@gmail.com>
Date: Fri, 25 Aug 2023 11:59:39 +0200
Subject: [PATCH 32/35] Use `#[serde(flatten)]` to clean up serialization

---
 car-mirror/src/common.rs   | 39 ++++++++++++++++++-------------------
 car-mirror/src/messages.rs | 40 +++++++++++++-------------------------
 2 files changed, 32 insertions(+), 47 deletions(-)

diff --git a/car-mirror/src/common.rs b/car-mirror/src/common.rs
index cc26d8c..acbd0bf 100644
--- a/car-mirror/src/common.rs
+++ b/car-mirror/src/common.rs
@@ -11,7 +11,7 @@ use wnfs_common::BlockStore;
 use crate::{
     dag_walk::DagWalk,
     incremental_verification::{BlockState, IncrementalDagVerification},
-    messages::{PullRequest, PushResponse},
+    messages::{Bloom, PullRequest, PushResponse},
 };
 
 //--------------------------------------------------------------------------------------------------
@@ -226,28 +226,23 @@ impl From<PushResponse> for ReceiverState {
     fn from(push: PushResponse) -> Self {
         let PushResponse {
             subgraph_roots,
-            bloom_k,
             bloom,
         } = push;
 
         Self {
             missing_subgraph_roots: subgraph_roots,
-            have_cids_bloom: Self::bloom_deserialize(bloom_k, bloom),
+            have_cids_bloom: Self::bloom_deserialize(bloom),
         }
     }
 }
 
 impl From<PullRequest> for ReceiverState {
     fn from(pull: PullRequest) -> Self {
-        let PullRequest {
-            resources,
-            bloom_k,
-            bloom,
-        } = pull;
+        let PullRequest { resources, bloom } = pull;
 
         Self {
             missing_subgraph_roots: resources,
-            have_cids_bloom: Self::bloom_deserialize(bloom_k, bloom),
+            have_cids_bloom: Self::bloom_deserialize(bloom),
         }
     }
 }
@@ -259,11 +254,10 @@ impl From<ReceiverState> for PushResponse {
             have_cids_bloom,
         } = receiver_state;
 
-        let (bloom_k, bloom) = ReceiverState::bloom_serialize(have_cids_bloom);
+        let bloom = ReceiverState::bloom_serialize(have_cids_bloom);
 
         PushResponse {
             subgraph_roots: missing_subgraph_roots,
-            bloom_k,
             bloom,
         }
     }
@@ -276,31 +270,36 @@ impl From<ReceiverState> for PullRequest {
             have_cids_bloom,
         } = receiver_state;
 
-        let (bloom_k, bloom) = ReceiverState::bloom_serialize(have_cids_bloom);
+        let bloom = ReceiverState::bloom_serialize(have_cids_bloom);
 
         PullRequest {
             resources: missing_subgraph_roots,
-            bloom_k,
             bloom,
         }
     }
 }
 
 impl ReceiverState {
-    fn bloom_serialize(bloom: Option<BloomFilter>) -> (u32, Vec<u8>) {
+    fn bloom_serialize(bloom: Option<BloomFilter>) -> Bloom {
         match bloom {
-            Some(bloom) => (bloom.hash_count() as u32, bloom.as_bytes().to_vec()),
-            None => (3, Vec::new()),
+            Some(bloom) => Bloom {
+                hash_count: bloom.hash_count() as u32,
+                bytes: bloom.as_bytes().to_vec(),
+            },
+            None => Bloom {
+                hash_count: 3,
+                bytes: Vec::new(),
+            },
         }
     }
 
-    fn bloom_deserialize(bloom_k: u32, bloom: Vec<u8>) -> Option<BloomFilter> {
-        if bloom.is_empty() {
+    fn bloom_deserialize(bloom: Bloom) -> Option<BloomFilter> {
+        if bloom.bytes.is_empty() {
             None
         } else {
             Some(BloomFilter::new_with(
-                bloom_k as usize,
-                bloom.into_boxed_slice(),
+                bloom.hash_count as usize,
+                bloom.bytes.into_boxed_slice(),
             ))
         }
     }
diff --git a/car-mirror/src/messages.rs b/car-mirror/src/messages.rs
index e1471c1..85b1640 100644
--- a/car-mirror/src/messages.rs
+++ b/car-mirror/src/messages.rs
@@ -12,31 +12,9 @@ pub struct PullRequest {
     #[serde(rename = "rs")]
     pub resources: Vec<Cid>,
 
-    /// Bloom filter hash count
-    #[serde(rename = "bk")]
-    pub bloom_k: u32,
-
-    /// Bloom filter Binary
-    #[serde(rename = "bb")]
-    pub bloom: Vec<u8>,
-}
-
-/// Part of the initial message for push requests.
-/// The other part is simply tupled together with the actual initial
-/// CAR file.
-///
-/// Wire data type from the [specification].
-///
-/// [specification]: https://github.com/fission-codes/spec/blob/86fcfb07d507f1df4fdaaf49088abecbb1dda76a/car-pool/car-mirror/http.md#22-requestor-payload
-#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
-pub struct PushRequestHeader {
-    /// Bloom filter hash count
-    #[serde(rename = "bk")]
-    pub bloom_k: u32,
-
-    /// Bloom filter Binary
-    #[serde(rename = "bb")]
-    pub bloom: Vec<u8>,
+    /// A bloom containing already stored blocks
+    #[serde(flatten)]
+    pub bloom: Bloom,
 }
 
 /// The response sent after the initial and subsequent push requests.
@@ -50,13 +28,21 @@ pub struct PushResponse {
     #[serde(rename = "sr")]
     pub subgraph_roots: Vec<Cid>,
 
+    /// A bloom containing already stored blocks
+    #[serde(flatten)]
+    pub bloom: Bloom,
+}
+
+/// The serialization format for bloom filters in CAR mirror
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub struct Bloom {
     /// Bloom filter hash count
     #[serde(rename = "bk")]
-    pub bloom_k: u32,
+    pub hash_count: u32,
 
     /// Bloom filter Binary
     #[serde(rename = "bb")]
-    pub bloom: Vec<u8>,
+    pub bytes: Vec<u8>,
 }
 
 impl PushResponse {

From cad84c80b5dda08e617db20363b81da5df0e0914 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= <philipp.krueger1@gmail.com>
Date: Fri, 25 Aug 2023 12:00:42 +0200
Subject: [PATCH 33/35] Lint

---
 car-mirror/src/test_utils/blockstore_utils.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/car-mirror/src/test_utils/blockstore_utils.rs b/car-mirror/src/test_utils/blockstore_utils.rs
index 4bce5aa..1bcd9e5 100644
--- a/car-mirror/src/test_utils/blockstore_utils.rs
+++ b/car-mirror/src/test_utils/blockstore_utils.rs
@@ -37,7 +37,7 @@ pub fn dag_to_dot(
 
     for (cid, ipld) in blocks {
         let bytes = encode(&ipld)?;
-        let refs = references(cid, &bytes)?;
+        let refs = references(cid, bytes)?;
         for to_cid in refs {
             print_truncated_string(writer, cid.to_string())?;
             write!(writer, " -> ")?;

From b3c51ae923c098e4fad991f10d2612e6d1ac0148 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= <philipp.krueger1@gmail.com>
Date: Fri, 25 Aug 2023 12:04:09 +0200
Subject: [PATCH 34/35] Choose appropriate datastructures for `references`

---
 car-mirror/src/common.rs                      | 3 +--
 car-mirror/src/dag_walk.rs                    | 4 ++--
 car-mirror/src/test_utils/blockstore_utils.rs | 2 +-
 car-mirror/src/test_utils/local_utils.rs      | 2 +-
 4 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/car-mirror/src/common.rs b/car-mirror/src/common.rs
index acbd0bf..900aa32 100644
--- a/car-mirror/src/common.rs
+++ b/car-mirror/src/common.rs
@@ -207,13 +207,12 @@ pub async fn block_receive(
 /// This will error out if
 /// - the codec is not supported
 /// - the block can't be parsed.
-pub fn references(cid: Cid, block: impl AsRef<[u8]>) -> Result<Vec<Cid>> {
+pub fn references<E: Extend<Cid>>(cid: Cid, block: impl AsRef<[u8]>, mut refs: E) -> Result<E> {
     let codec: IpldCodec = cid
         .codec()
         .try_into()
         .map_err(|_| anyhow!("Unsupported codec in Cid: {cid}"))?;
 
-    let mut refs = Vec::new();
     <Ipld as References<IpldCodec>>::references(codec, &mut Cursor::new(block), &mut refs)?;
     Ok(refs)
 }
diff --git a/car-mirror/src/dag_walk.rs b/car-mirror/src/dag_walk.rs
index 9050648..3b45c69 100644
--- a/car-mirror/src/dag_walk.rs
+++ b/car-mirror/src/dag_walk.rs
@@ -76,7 +76,7 @@ impl DagWalk {
         // - skip Raw CIDs. They can't have further links (but needs adjustment to this function's return type)
         // - run multiple `get_block` calls concurrently
         let block = store.get_block(&cid).await?;
-        for ref_cid in references(cid, &block)? {
+        for ref_cid in references(cid, &block, HashSet::new())? {
             if !self.visited.contains(&ref_cid) {
                 self.frontier.push_front(ref_cid);
             }
@@ -112,7 +112,7 @@ impl DagWalk {
     /// Skip a node from the traversal for now.
     pub fn skip_walking(&mut self, block: (Cid, Bytes)) -> Result<()> {
         let (cid, bytes) = block;
-        let refs = references(cid, bytes)?;
+        let refs = references(cid, bytes, HashSet::new())?;
         self.visited.insert(cid);
         self.frontier
             .retain(|frontier_cid| !refs.contains(frontier_cid));
diff --git a/car-mirror/src/test_utils/blockstore_utils.rs b/car-mirror/src/test_utils/blockstore_utils.rs
index 1bcd9e5..3394271 100644
--- a/car-mirror/src/test_utils/blockstore_utils.rs
+++ b/car-mirror/src/test_utils/blockstore_utils.rs
@@ -37,7 +37,7 @@ pub fn dag_to_dot(
 
     for (cid, ipld) in blocks {
         let bytes = encode(&ipld)?;
-        let refs = references(cid, bytes)?;
+        let refs = references(cid, bytes, Vec::new())?;
         for to_cid in refs {
             print_truncated_string(writer, cid.to_string())?;
             write!(writer, " -> ")?;
diff --git a/car-mirror/src/test_utils/local_utils.rs b/car-mirror/src/test_utils/local_utils.rs
index 5c7323f..51f021d 100644
--- a/car-mirror/src/test_utils/local_utils.rs
+++ b/car-mirror/src/test_utils/local_utils.rs
@@ -24,7 +24,7 @@ pub(crate) async fn get_cid_at_approx_path(
     let mut working_cid = root;
     for nth in path {
         let block = store.get_block(&working_cid).await?;
-        let refs = references(working_cid, block)?;
+        let refs = references(working_cid, block, Vec::new())?;
         if refs.is_empty() {
             break;
         }

From 28b3c2672b8fcedc1ddf4a49f78cfdf591d058ba Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= <philipp.krueger1@gmail.com>
Date: Fri, 25 Aug 2023 12:12:56 +0200
Subject: [PATCH 35/35] Make `DagWalk` deterministic again

---
 car-mirror/src/dag_walk.rs | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/car-mirror/src/dag_walk.rs b/car-mirror/src/dag_walk.rs
index 3b45c69..3f27e7e 100644
--- a/car-mirror/src/dag_walk.rs
+++ b/car-mirror/src/dag_walk.rs
@@ -76,7 +76,7 @@ impl DagWalk {
         // - skip Raw CIDs. They can't have further links (but needs adjustment to this function's return type)
         // - run multiple `get_block` calls concurrently
         let block = store.get_block(&cid).await?;
-        for ref_cid in references(cid, &block, HashSet::new())? {
+        for ref_cid in references(cid, &block, Vec::new())? {
             if !self.visited.contains(&ref_cid) {
                 self.frontier.push_front(ref_cid);
             }
@@ -132,6 +132,10 @@ mod tests {
     async fn test_walk_dag_breadth_first() -> Result<()> {
         let store = &MemoryBlockStore::new();
 
+        // cid_root ---> cid_1_wrap ---> cid_1
+        //            -> cid_2
+        //            -> cid_3
+
         let cid_1 = store.put_serializable(&Ipld::String("1".into())).await?;
         let cid_2 = store.put_serializable(&Ipld::String("2".into())).await?;
         let cid_3 = store.put_serializable(&Ipld::String("3".into())).await?;