From 8968e863089ddeb32dafe906a37e6bb76ead1193 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= Date: Wed, 26 Jul 2023 19:44:14 +0200 Subject: [PATCH 01/35] Implement some DAG walking --- Cargo.lock | 983 +++++++++++++++++++++++++++++++++++++++++- car-mirror/Cargo.toml | 7 + car-mirror/src/lib.rs | 76 +++- 3 files changed, 1042 insertions(+), 24 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4747b69..32b1cd0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,21 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "android-tzdata" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0" + +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + [[package]] name = "anes" version = "0.1.6" @@ -14,6 +29,175 @@ version = "1.0.71" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c7d0618f0e0b7e8ff11427422b64564d5fb0be1940354bfe2e0529b18a9d9b8" +[[package]] +name = "arrayref" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b4930d2cb77ce62f89ee5d5289b4ac049559b1c45539271f5ed4fdc7db34545" + +[[package]] +name = "arrayvec" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" + +[[package]] +name = "async-attributes" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a3203e79f4dd9bdda415ed03cf14dae5a2bf775c683a00f94e9cd1faf0f596e5" +dependencies = [ + "quote", + "syn 1.0.109", +] + +[[package]] +name = "async-channel" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "81953c529336010edd6d8e358f886d9581267795c61b19475b71314bffa46d35" +dependencies = [ + "concurrent-queue", + "event-listener", + "futures-core", +] + +[[package]] +name = "async-executor" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fa3dc5f2a8564f07759c008b9109dc0d39de92a88d5588b8a5036d286383afb" +dependencies = [ + "async-lock", + "async-task", + "concurrent-queue", + "fastrand", + "futures-lite", + "slab", +] + +[[package]] +name = "async-global-executor" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1b6f5d7df27bd294849f8eec66ecfc63d11814df7a4f5d74168a2394467b776" +dependencies = [ + "async-channel", + "async-executor", + "async-io", + "async-lock", + "blocking", + "futures-lite", + "once_cell", +] + +[[package]] +name = "async-io" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fc5b45d93ef0529756f812ca52e44c221b35341892d3dcc34132ac02f3dd2af" +dependencies = [ + "async-lock", + "autocfg", + "cfg-if", + "concurrent-queue", + "futures-lite", + "log", + "parking", + "polling", + "rustix", + "slab", + "socket2", + "waker-fn", +] + +[[package]] +name = "async-lock" +version = "2.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa24f727524730b077666307f2734b4a1a1c57acb79193127dcc8914d5242dd7" +dependencies = [ + "event-listener", +] + +[[package]] +name = "async-once-cell" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b49bd4c5b769125ea6323601c39815848972880efd33ffb2d01f9f909adc699" + +[[package]] +name = "async-std" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62565bb4402e926b29953c785397c6dc0391b7b446e45008b0049eb43cec6f5d" +dependencies = [ + "async-attributes", + "async-channel", + "async-global-executor", + "async-io", + "async-lock", + "crossbeam-utils", + 
"futures-channel", + "futures-core", + "futures-io", + "futures-lite", + "gloo-timers", + "kv-log-macro", + "log", + "memchr", + "once_cell", + "pin-project-lite", + "pin-utils", + "slab", + "wasm-bindgen-futures", +] + +[[package]] +name = "async-stream" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd56dd203fef61ac097dd65721a419ddccb106b2d2b70ba60a6b529f03961a51" +dependencies = [ + "async-stream-impl", + "futures-core", + "pin-project-lite", +] + +[[package]] +name = "async-stream-impl" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.27", +] + +[[package]] +name = "async-task" +version = "4.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ecc7ab41815b3c653ccd2978ec3255c81349336702dfdf62ee6f7069b12a3aae" + +[[package]] +name = "async-trait" +version = "0.1.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b2d0f03b3640e3a630367e40c468cb7f309529c708ed1d88597047b0e7c6ef7" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.27", +] + +[[package]] +name = "atomic-waker" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1181e1e0d1fce796a03db1ae795d67167da795f9cf4a39c37589e85ef57f26d3" + [[package]] name = "atty" version = "0.2.14" @@ -31,6 +215,12 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" +[[package]] +name = "base-x" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4cbbc9d0964165b47557570cce6c952866c2678457aca742aafc9fb771d30270" + [[package]] name = "bit-set" version = "0.5.3" @@ -52,6 +242,65 @@ version = "1.3.2" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" +[[package]] +name = "blake2b_simd" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c2f0dc9a68c6317d884f97cc36cf5a3d20ba14ce404227df55e1af708ab04bc" +dependencies = [ + "arrayref", + "arrayvec", + "constant_time_eq 0.2.6", +] + +[[package]] +name = "blake2s_simd" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6637f448b9e61dfadbdcbae9a885fadee1f3eaffb1f8d3c1965d3ade8bdfd44f" +dependencies = [ + "arrayref", + "arrayvec", + "constant_time_eq 0.2.6", +] + +[[package]] +name = "blake3" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "199c42ab6972d92c9f8995f086273d25c42fc0f7b2a1fcefba465c1352d25ba5" +dependencies = [ + "arrayref", + "arrayvec", + "cc", + "cfg-if", + "constant_time_eq 0.3.0", +] + +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + +[[package]] +name = "blocking" +version = "1.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77231a1c8f801696fc0123ec6150ce92cffb8e164a02afb9c8ddee0e9b65ad65" +dependencies = [ + "async-channel", + "async-lock", + "async-task", + "atomic-waker", + "fastrand", + "futures-lite", + "log", +] + [[package]] name = "bumpalo" version = "3.13.0" @@ -64,14 +313,30 @@ version = "1.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" +[[package]] +name = "bytes" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89b2fd2a0dcf38d7971e2194b6b6eebab45ae01067456a7fd93d5547a61b70be" +dependencies = [ + 
"serde", +] + [[package]] name = "car-mirror" version = "0.1.0" dependencies = [ "anyhow", + "async-std", + "async-stream", + "bytes", + "futures", + "libipld", + "libipld-core", "proptest", "tracing", "tracing-subscriber", + "wnfs-common", ] [[package]] @@ -113,6 +378,18 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "chrono" +version = "0.4.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec837a71355b28f6556dbd569b37b3f363091c0bd4b2e735674521b4c5fd9bc5" +dependencies = [ + "android-tzdata", + "iana-time-zone", + "num-traits", + "winapi", +] + [[package]] name = "ciborium" version = "0.2.1" @@ -140,6 +417,20 @@ dependencies = [ "half", ] +[[package]] +name = "cid" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd94671561e36e4e7de75f753f577edafb0e7c05d6e4547229fdf7938fbcd2c3" +dependencies = [ + "core2", + "multibase", + "multihash", + "serde", + "serde_bytes", + "unsigned-varint", +] + [[package]] name = "clap" version = "3.2.25" @@ -161,6 +452,15 @@ dependencies = [ "os_str_bytes", ] +[[package]] +name = "concurrent-queue" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62ec6771ecfa0762d24683ee5a32ad78487a3d3afdc0fb8cae19d2c5deb50b7c" +dependencies = [ + "crossbeam-utils", +] + [[package]] name = "console_error_panic_hook" version = "0.1.7" @@ -171,6 +471,42 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "constant_time_eq" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21a53c0a4d288377e7415b53dcfc3c04da5cdc2cc95c8d5ac178b58f0b861ad6" + +[[package]] +name = "constant_time_eq" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"f7144d30dcf0fafbce74250a3963025d8d52177934239851c917d29f1df280c2" + +[[package]] +name = "core-foundation-sys" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" + +[[package]] +name = "core2" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b49ba7ef1ad6107f8824dbe97de947cbaac53c44e7f9756a1fba0d37c1eec505" +dependencies = [ + "memchr", +] + +[[package]] +name = "cpufeatures" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a17b76ff3a4162b0b27f354a0c87015ddad39d35f9c0c36607a3bdd175dde1f1" +dependencies = [ + "libc", +] + [[package]] name = "criterion" version = "0.4.0" @@ -205,6 +541,61 @@ dependencies = [ "itertools", ] +[[package]] +name = "crossbeam-utils" +version = "0.8.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a22b2d63d4d1dc0b7f1b6b2747dd0088008a9be28b6ddf0b1e7d335e3037294" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "crypto-common" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" +dependencies = [ + "generic-array", + "typenum", +] + +[[package]] +name = "data-encoding" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2e66c9d817f1720209181c316d28635c050fa304f9c79e47a520882661b7308" + +[[package]] +name = "data-encoding-macro" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c904b33cc60130e1aeea4956ab803d08a3f4a0ca82d64ed757afac3891f2bb99" +dependencies = [ + "data-encoding", + "data-encoding-macro-internal", +] + +[[package]] +name = "data-encoding-macro-internal" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"8fdf3fce3ce863539ec1d7fd1b6dcc3c645663376b43ed376bbf887733e4f772" +dependencies = [ + "data-encoding", + "syn 1.0.109", +] + +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "crypto-common", +] + [[package]] name = "either" version = "1.8.1" @@ -232,6 +623,12 @@ dependencies = [ "libc", ] +[[package]] +name = "event-listener" +version = "2.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0" + [[package]] name = "examples" version = "0.1.0" @@ -254,15 +651,141 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" +[[package]] +name = "futures" +version = "0.3.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23342abe12aba583913b2e62f22225ff9c950774065e4bfb61a19cd9770fec40" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-channel" +version = "0.3.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "955518d47e09b25bbebc7a18df10b81f0c766eaf4c4f1cccef2fca5f2a4fb5f2" +dependencies = [ + "futures-core", + "futures-sink", +] + +[[package]] +name = "futures-core" +version = "0.3.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4bca583b7e26f571124fe5b7561d49cb2868d79116cfa0eefce955557c6fee8c" + +[[package]] +name = "futures-executor" +version = "0.3.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccecee823288125bd88b4d7f565c9e58e41858e47ab72e8ea2d64e93624386e0" +dependencies = [ + "futures-core", + "futures-task", + 
"futures-util", +] + +[[package]] +name = "futures-io" +version = "0.3.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fff74096e71ed47f8e023204cfd0aa1289cd54ae5430a9523be060cdb849964" + +[[package]] +name = "futures-lite" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49a9d51ce47660b1e808d3c990b4709f2f415d928835a17dfd16991515c46bce" +dependencies = [ + "fastrand", + "futures-core", + "futures-io", + "memchr", + "parking", + "pin-project-lite", + "waker-fn", +] + +[[package]] +name = "futures-macro" +version = "0.3.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.27", +] + +[[package]] +name = "futures-sink" +version = "0.3.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f43be4fe21a13b9781a69afa4985b0f6ee0e1afab2c6f454a8cf30e2b2237b6e" + +[[package]] +name = "futures-task" +version = "0.3.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76d3d132be6c0e6aa1534069c705a74a5997a356c0dc2f86a47765e5617c5b65" + +[[package]] +name = "futures-util" +version = "0.3.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26b01e40b772d54cf6c6d721c1d1abd0647a0106a12ecaa1c186273392a69533" +dependencies = [ + "futures-channel", + "futures-core", + "futures-io", + "futures-macro", + "futures-sink", + "futures-task", + "memchr", + "pin-project-lite", + "pin-utils", + "slab", +] + +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + [[package]] name = "getrandom" version = "0.2.10" source = "registry+https://github.com/rust-lang/crates.io-index" 
checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427" dependencies = [ - "cfg-if", - "libc", - "wasi", + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "gloo-timers" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b995a66bb87bebce9a0f4a95aed01daca4872c050bfcb21653361c03bc35e5c" +dependencies = [ + "futures-channel", + "futures-core", + "js-sys", + "wasm-bindgen", ] [[package]] @@ -292,6 +815,29 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286" +[[package]] +name = "iana-time-zone" +version = "0.1.57" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2fad5b825842d2b38bd206f3e81d6957625fd7f0a361e345c30e01a0ae2dd613" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "wasm-bindgen", + "windows", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + [[package]] name = "indexmap" version = "1.9.3" @@ -346,6 +892,24 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "keccak" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f6d5ed8676d904364de097082f4e7d240b571b67989ced0240f08b7f966f940" +dependencies = [ + "cpufeatures", +] + +[[package]] +name = "kv-log-macro" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0de8b303297635ad57c9f5059fd9cee7a47f8e8daa09df0fcd07dd39fb22977f" +dependencies = [ + "log", +] + [[package]] name = "lazy_static" version = "1.4.0" @@ -358,6 +922,96 @@ version = "0.2.146" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"f92be4933c13fd498862a9e02a3055f8a8d9c039ce33db97306fd5a6caa7f29b" +[[package]] +name = "libipld" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1ccd6b8ffb3afee7081fcaec00e1b099fd1c7ccf35ba5729d88538fcc3b4599" +dependencies = [ + "fnv", + "libipld-cbor", + "libipld-cbor-derive", + "libipld-core", + "libipld-json", + "libipld-macro", + "libipld-pb", + "log", + "multihash", + "thiserror", +] + +[[package]] +name = "libipld-cbor" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77d98c9d1747aa5eef1cf099cd648c3fd2d235249f5fed07522aaebc348e423b" +dependencies = [ + "byteorder", + "libipld-core", + "thiserror", +] + +[[package]] +name = "libipld-cbor-derive" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d5ba3a729b72973e456a1812b0afe2e176a376c1836cc1528e9fc98ae8cb838" +dependencies = [ + "proc-macro-crate", + "proc-macro2", + "quote", + "syn 1.0.109", + "synstructure", +] + +[[package]] +name = "libipld-core" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5acd707e8d8b092e967b2af978ed84709eaded82b75effe6cb6f6cc797ef8158" +dependencies = [ + "anyhow", + "cid", + "core2", + "multibase", + "multihash", + "serde", + "thiserror", +] + +[[package]] +name = "libipld-json" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25856def940047b07b25c33d4e66d248597049ab0202085215dc4dca0487731c" +dependencies = [ + "libipld-core", + "multihash", + "serde", + "serde_json", +] + +[[package]] +name = "libipld-macro" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "71171c54214f866ae6722f3027f81dff0931e600e5a61e6b1b6a49ca0b5ed4ae" +dependencies = [ + "libipld-core", +] + +[[package]] +name = "libipld-pb" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "c3f2d0f866c4cd5dc9aa8068c429ba478d2882a3a4b70ab56f7e9a0eddf5d16f" +dependencies = [ + "bytes", + "libipld-core", + "quick-protobuf", + "thiserror", +] + [[package]] name = "libm" version = "0.2.7" @@ -375,6 +1029,59 @@ name = "log" version = "0.4.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b06a4cde4c0f271a446782e3eff8de789548ce57dbc8eca9292c27f4a42004b4" +dependencies = [ + "value-bag", +] + +[[package]] +name = "memchr" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" + +[[package]] +name = "multibase" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b3539ec3c1f04ac9748a260728e855f261b4977f5c3406612c884564f329404" +dependencies = [ + "base-x", + "data-encoding", + "data-encoding-macro", +] + +[[package]] +name = "multihash" +version = "0.18.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfd8a792c1694c6da4f68db0a9d707c72bd260994da179e6030a5dcee00bb815" +dependencies = [ + "blake2b_simd", + "blake2s_simd", + "blake3", + "core2", + "digest", + "multihash-derive", + "serde", + "serde-big-array", + "sha2", + "sha3", + "unsigned-varint", +] + +[[package]] +name = "multihash-derive" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d6d4752e6230d8ef7adf7bd5d8c4b1f6561c1014c5ba9a37445ccefe18aa1db" +dependencies = [ + "proc-macro-crate", + "proc-macro-error", + "proc-macro2", + "quote", + "syn 1.0.109", + "synstructure", +] [[package]] name = "nu-ansi-term" @@ -420,23 +1127,85 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" +[[package]] +name = "parking" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"14f2252c834a40ed9bb5422029649578e63aa341ac401f74e719dd1afda8394e" + [[package]] name = "pin-project-lite" version = "0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e0a7ae3ac2f1173085d398531c705756c94a4c56843785df85a60c1a0afac116" +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + +[[package]] +name = "polling" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b2d323e8ca7996b3e23126511a523f7e62924d93ecd5ae73b333815b0eb3dce" +dependencies = [ + "autocfg", + "bitflags", + "cfg-if", + "concurrent-queue", + "libc", + "log", + "pin-project-lite", + "windows-sys", +] + [[package]] name = "ppv-lite86" version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" +[[package]] +name = "proc-macro-crate" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e17d47ce914bf4de440332250b0edd23ce48c005f59fab39d3335866b114f11a" +dependencies = [ + "thiserror", + "toml", +] + +[[package]] +name = "proc-macro-error" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" +dependencies = [ + "proc-macro-error-attr", + "proc-macro2", + "quote", + "syn 1.0.109", + "version_check", +] + +[[package]] +name = "proc-macro-error-attr" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" +dependencies = [ + "proc-macro2", + "quote", + "version_check", +] + [[package]] name = "proc-macro2" -version = "1.0.60" +version = "1.0.66" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"dec2b086b7a862cf4de201096214fa870344cf922b2b30c167badb3af3195406" +checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9" dependencies = [ "unicode-ident", ] @@ -467,6 +1236,15 @@ version = "1.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" +[[package]] +name = "quick-protobuf" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d6da84cc204722a989e01ba2f6e1e276e190f22263d0cb6ce8526fcdb0d2e1f" +dependencies = [ + "byteorder", +] + [[package]] name = "quote" version = "1.0.28" @@ -594,22 +1372,40 @@ checksum = "e1cf6437eb19a8f4a6cc0f7dca544973b0b78843adbfeb3683d1a94a0024a294" [[package]] name = "serde" -version = "1.0.164" +version = "1.0.175" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e8c8cf938e98f769bc164923b06dce91cea1751522f46f8466461af04c9027d" +checksum = "5d25439cd7397d044e2748a6fe2432b5e85db703d6d097bd014b3c0ad1ebff0b" dependencies = [ "serde_derive", ] +[[package]] +name = "serde-big-array" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd31f59f6fe2b0c055371bb2f16d7f0aa7d8881676c04a55b1596d1a17cd10a4" +dependencies = [ + "serde", +] + +[[package]] +name = "serde_bytes" +version = "0.11.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab33ec92f677585af6d88c65593ae2375adde54efdbf16d597f2cbc7a6d368ff" +dependencies = [ + "serde", +] + [[package]] name = "serde_derive" -version = "1.0.164" +version = "1.0.175" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9735b638ccc51c28bf6914d90a2e9725b377144fc612c49a611fddd1b631d68" +checksum = "b23f7ade6f110613c0d63858ddb8b94c1041f550eab58a16b371bdf2c9c80ab4" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.27", ] [[package]] @@ -623,6 +1419,27 @@ dependencies = [ "serde", ] +[[package]] +name 
= "sha2" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "479fb9d862239e610720565ca91403019f2f00410f1864c5aa7479b950a76ed8" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "sha3" +version = "0.10.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75872d278a8f37ef87fa0ddbda7802605cb18344497949862c0d4dcb291eba60" +dependencies = [ + "digest", + "keccak", +] + [[package]] name = "sharded-slab" version = "0.1.4" @@ -632,23 +1449,65 @@ dependencies = [ "lazy_static", ] +[[package]] +name = "slab" +version = "0.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6528351c9bc8ab22353f9d776db39a20288e8d6c37ef8cfe3317cf875eecfc2d" +dependencies = [ + "autocfg", +] + [[package]] name = "smallvec" version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0" +[[package]] +name = "socket2" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64a4a911eed85daf18834cfaa86a79b7d266ff93ff5ba14005426219480ed662" +dependencies = [ + "libc", + "winapi", +] + +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + [[package]] name = "syn" -version = "2.0.18" +version = "2.0.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32d41677bcbe24c20c52e7c70b0d8db04134c5d1066bf98662e2871ad200ea3e" +checksum = "b60f673f44a8255b9c8c657daf66a596d435f2da81a555b06dc644d080ba45e0" dependencies = [ "proc-macro2", "quote", "unicode-ident", ] +[[package]] +name = "synstructure" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"f36bdaa60a83aca3921b5259d5400cbf5e90fc51931376a9bd4a0eb79aa7210f" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", + "unicode-xid", +] + [[package]] name = "tempfile" version = "3.6.0" @@ -669,6 +1528,26 @@ version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "222a222a5bfe1bba4a77b45ec488a741b3cb8872e5e499451fd7d0129c9c7c3d" +[[package]] +name = "thiserror" +version = "1.0.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "978c9a314bd8dc99be594bc3c175faaa9794be04a5a5e153caba6915336cebac" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.27", +] + [[package]] name = "thread_local" version = "1.1.7" @@ -689,6 +1568,15 @@ dependencies = [ "serde_json", ] +[[package]] +name = "toml" +version = "0.5.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4f7f0dd8d50a853a531c426359045b1998f04219d88799810762cd4ad314234" +dependencies = [ + "serde", +] + [[package]] name = "tracing" version = "0.1.37" @@ -709,7 +1597,7 @@ checksum = "8803eee176538f94ae9a14b55b2804eb7e1441f8210b1c31290b3bccdccff73b" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.27", ] [[package]] @@ -747,6 +1635,12 @@ dependencies = [ "tracing-log", ] +[[package]] +name = "typenum" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba" + [[package]] name = "unarray" version = "0.1.4" @@ -759,12 +1653,36 @@ version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b15811caf2415fb889178633e7724bad2509101cde276048e013b9def5e51fa0" +[[package]] +name = "unicode-xid" +version = "0.2.4" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f962df74c8c05a667b5ee8bcf162993134c104e96440b663c8daa176dc772d8c" + +[[package]] +name = "unsigned-varint" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d86a8dc7f45e4c1b0d30e43038c38f274e77af056aa5f74b93c2cf9eb3c1c836" + [[package]] name = "valuable" version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" +[[package]] +name = "value-bag" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d92ccd67fb88503048c01b59152a04effd0782d035a83a6d256ce6085f08f4a3" + +[[package]] +name = "version_check" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" + [[package]] name = "wait-timeout" version = "0.2.0" @@ -774,6 +1692,12 @@ dependencies = [ "libc", ] +[[package]] +name = "waker-fn" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d5b2c62b4012a3e1eca5a7e077d13b3bf498c4073e33ccd58626607748ceeca" + [[package]] name = "walkdir" version = "2.3.3" @@ -813,7 +1737,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn", + "syn 2.0.27", "wasm-bindgen-shared", ] @@ -847,7 +1771,7 @@ checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.27", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -923,6 +1847,15 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +[[package]] +name = "windows" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f" +dependencies = [ + "windows-targets", +] + [[package]] name = "windows-sys" version = "0.48.0" @@ -988,3 +1921,23 @@ name = "windows_x86_64_msvc" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" + +[[package]] +name = "wnfs-common" +version = "0.1.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5dfcb4584f3866ead49adae8c05cec6f633139d19283448aa7807280612e24b7" +dependencies = [ + "anyhow", + "async-once-cell", + "async-trait", + "bytes", + "chrono", + "futures", + "libipld", + "multihash", + "once_cell", + "rand_core", + "serde", + "thiserror", +] diff --git a/car-mirror/Cargo.toml b/car-mirror/Cargo.toml index 7a95196..12fbca9 100644 --- a/car-mirror/Cargo.toml +++ b/car-mirror/Cargo.toml @@ -24,12 +24,19 @@ doc = true [dependencies] anyhow = "1.0" +async-stream = "0.3.5" +bytes = "1.4.0" +futures = "0.3.28" +libipld = "0.16.0" +libipld-core = "0.16.0" proptest = { version = "1.1", optional = true } tracing = "0.1" tracing-subscriber = "0.3" +wnfs-common = "0.1.23" [dev-dependencies] proptest = "1.1" +async-std = { version = "1.11", features = ["attributes"] } [features] default = [] diff --git a/car-mirror/src/lib.rs b/car-mirror/src/lib.rs index 2da8f41..ce4eaa8 100644 --- a/car-mirror/src/lib.rs +++ b/car-mirror/src/lib.rs @@ -4,27 +4,85 @@ //! car-mirror +use anyhow::Result; +use async_stream::try_stream; +use bytes::Bytes; +use futures::Stream; +use libipld::{Ipld, IpldCodec}; +use libipld_core::{cid::Cid, codec::References}; +use std::{ + collections::{HashSet, VecDeque}, + io::Cursor, +}; +use wnfs_common::BlockStore; + /// Test utilities. #[cfg(any(test, feature = "test_utils"))] #[cfg_attr(docsrs, doc(cfg(feature = "test_utils")))] pub mod test_utils; -/// Add two integers together. 
-pub fn add(a: i32, b: i32) -> i32 { - a + b +/// walks a DAG from given root breadth-first along IPLD links +pub fn walk_dag_in_order_breadth_first<'a>( + root: Cid, + store: &'a impl BlockStore, +) -> impl Stream> + 'a { + try_stream! { + let mut visited = HashSet::new(); + let mut frontier = VecDeque::from([root]); + while let Some(cid) = frontier.pop_front() { + if visited.contains(&cid) { + continue; + } + visited.insert(cid); + let block = store.get_block(&cid).await?; + let codec = IpldCodec::try_from(cid.codec())?; + frontier.extend(references(codec, &block)?); + yield (cid, block); + } + } } -/// Multiplies two integers together. -pub fn mult(a: i32, b: i32) -> i32 { - a * b +fn references(codec: IpldCodec, block: impl AsRef<[u8]>) -> Result> { + let mut refs = Vec::new(); + >::references(codec, &mut Cursor::new(block), &mut refs)?; + Ok(refs) } #[cfg(test)] mod tests { use super::*; + use futures::TryStreamExt; + use wnfs_common::MemoryBlockStore; + + #[async_std::test] + async fn test_walk_dag_breadth_first() -> Result<()> { + let store = &MemoryBlockStore::new(); + + let cid_1 = store.put_serializable(&Ipld::String("1".into())).await?; + let cid_2 = store.put_serializable(&Ipld::String("2".into())).await?; + let cid_3 = store.put_serializable(&Ipld::String("3".into())).await?; + + let cid_1_wrap = store + .put_serializable(&Ipld::List(vec![Ipld::Link(cid_1)])) + .await?; + + let cid_root = store + .put_serializable(&Ipld::List(vec![ + Ipld::Link(cid_1_wrap), + Ipld::Link(cid_2), + Ipld::Link(cid_3), + ])) + .await?; + + let cids = walk_dag_in_order_breadth_first(cid_root, store) + .try_collect::>() + .await? 
+ .into_iter() + .map(|(cid, _block)| cid) + .collect::>(); + + assert_eq!(cids, vec![cid_root, cid_1_wrap, cid_2, cid_3, cid_1]); - #[test] - fn test_mult() { - assert_eq!(mult(3, 2), 6); + Ok(()) } } From 77b1f1138ef43b1c8ec74512202b359808bf6785 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= Date: Tue, 15 Aug 2023 15:47:00 +0200 Subject: [PATCH 02/35] Update to new `roaring_graphs` library version Co-authored-by: James Walker --- Cargo.lock | 86 +++++++++++++++++++++++ car-mirror/Cargo.toml | 6 +- car-mirror/proptest-regressions/lib.txt | 7 ++ car-mirror/src/lib.rs | 60 ++++++++++++++++ car-mirror/src/test_utils/dag_strategy.rs | 51 ++++++++++++++ car-mirror/src/test_utils/mod.rs | 4 ++ 6 files changed, 213 insertions(+), 1 deletion(-) create mode 100644 car-mirror/proptest-regressions/lib.txt create mode 100644 car-mirror/src/test_utils/dag_strategy.rs diff --git a/Cargo.lock b/Cargo.lock index 32b1cd0..45b7775 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -307,6 +307,12 @@ version = "3.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1" +[[package]] +name = "bytemuck" +version = "1.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17febce684fd15d89027105661fec94afb475cb995fbc59d2865198446ba2eea" + [[package]] name = "byteorder" version = "1.4.3" @@ -330,10 +336,14 @@ dependencies = [ "async-std", "async-stream", "bytes", + "car-mirror", + "fixedbitset", "futures", "libipld", "libipld-core", "proptest", + "roaring-graphs", + "test-strategy", "tracing", "tracing-subscriber", "wnfs-common", @@ -498,6 +508,12 @@ dependencies = [ "memchr", ] +[[package]] +name = "cov-mark" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ffa3d3e0138386cd4361f63537765cac7ee40698028844635a54495a92f67f3" + [[package]] name = "cpufeatures" version = "0.2.9" @@ -645,6 +661,12 @@ dependencies = 
[ "instant", ] +[[package]] +name = "fixedbitset" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" + [[package]] name = "fnv" version = "1.0.7" @@ -1323,6 +1345,35 @@ version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "436b050e76ed2903236f032a59761c1eb99e1b0aead2c257922771dab1fc8c78" +[[package]] +name = "retain_mut" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c31b5c4033f8fdde8700e4657be2c497e7288f01515be52168c631e2e4d4086" + +[[package]] +name = "roaring" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6106b5cf8587f5834158895e9715a3c6c9716c8aefab57f1f7680917191c7873" +dependencies = [ + "bytemuck", + "byteorder", + "retain_mut", +] + +[[package]] +name = "roaring-graphs" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff3b6db6a957b3ee92cf83d4a107d37827c4aa7a92ca71a933f0bea83a35d61f" +dependencies = [ + "cov-mark", + "proptest", + "rand", + "roaring", +] + [[package]] name = "rustix" version = "0.37.20" @@ -1474,6 +1525,29 @@ dependencies = [ "winapi", ] +[[package]] +name = "structmeta" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78ad9e09554f0456d67a69c1584c9798ba733a5b50349a6c0d0948710523922d" +dependencies = [ + "proc-macro2", + "quote", + "structmeta-derive", + "syn 2.0.27", +] + +[[package]] +name = "structmeta-derive" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a60bcaff7397072dca0017d1db428e30d5002e00b6847703e2e42005c95fbe00" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.27", +] + [[package]] name = "syn" version = "1.0.109" @@ -1522,6 +1596,18 @@ dependencies = [ "windows-sys", ] +[[package]] +name = "test-strategy" +version = 
"0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8361c808554228ad09bfed70f5c823caf8a3450b6881cc3a38eb57e8c08c1d9" +dependencies = [ + "proc-macro2", + "quote", + "structmeta", + "syn 2.0.27", +] + [[package]] name = "textwrap" version = "0.16.0" diff --git a/car-mirror/Cargo.toml b/car-mirror/Cargo.toml index 12fbca9..30f2028 100644 --- a/car-mirror/Cargo.toml +++ b/car-mirror/Cargo.toml @@ -26,17 +26,21 @@ doc = true anyhow = "1.0" async-stream = "0.3.5" bytes = "1.4.0" +fixedbitset = "0.4.2" futures = "0.3.28" libipld = "0.16.0" libipld-core = "0.16.0" proptest = { version = "1.1", optional = true } +roaring-graphs = "0.12" tracing = "0.1" tracing-subscriber = "0.3" wnfs-common = "0.1.23" [dev-dependencies] -proptest = "1.1" async-std = { version = "1.11", features = ["attributes"] } +car-mirror = { path = ".", features = ["test_utils"] } +proptest = "1.1" +test-strategy = "0.3" [features] default = [] diff --git a/car-mirror/proptest-regressions/lib.txt b/car-mirror/proptest-regressions/lib.txt new file mode 100644 index 0000000..b3f356f --- /dev/null +++ b/car-mirror/proptest-regressions/lib.txt @@ -0,0 +1,7 @@ +# Seeds for failure cases proptest has generated in the past. It is +# automatically read and these particular cases re-run before any +# novel cases are generated. +# +# It is recommended to check this file in to source control so that +# everyone who runs the test benefits from these saved cases. 
+cc ecfcb732e093de600a3a3012674efabacf88f817b47d75c0ad0da9762ca3b6f7 # shrinks to input = _WalkDagNeverIteratesBlockTwiceArgs { dag: [(Cid(bafyreigvk2vd4s7ecxqhr7vlf5ei5tpdpalx73wbt53zokomvab5ouzq2a), b"\x86\xd8*X%\0\x01q\x12 pg\xacO\xb3\xfe\xacy\xd9k\xee\xc10\xdd\x8b\xbbc\x81\xc1\x06\x12\xf5Uw\x9e\xed\x11n\r?8\xdc\xd8*X%\0\x01q\x12 \xe0\xed\xdb\xa6\x0c\x8b\xf9\xe2fk\x12\xdd3\xf9\xb9Y%[\x85\xf6\xd9\xd8\x15\x8a\xce3\xbb\xdfN\xcfM\x93\xd8*X%\0\x01q\x12 \xfei\x9fJr\x7f\xed\xfd\t\0\x02lz5\x0eD\xc5\xf9\xe2\xda\"\x9ez5y\xd5\xc8\x02\xab+\xc0\x92\xd8*X%\0\x01q\x12 \xc6M\xa8\xd1B\x12\xcfT\xbfC\xd0\x1e\x89\x8c\xaa\x11\xafq\xed^sF\xb5\xda\x19\x98\xf2B\xb9\xe2\x9f\xb3\xd8*X%\0\x01q\x12 v\xbe\x8bR\x8d\0u\xf7\xaa\xe9\x8do\xa5zm<\x83\xaeH\n\x84i\xe6h\xd7\xb0\xaf\x96\x89\x95\xacq\xd8*X%\0\x01q\x12 \x92`zHK\xcfR\xb8\x10G\xa3+t\t\xb7_\xc9\xf6\x9b+\xee\xe0\x83S\"#\xba\xe8Q\xdd\x10\x02")] } diff --git a/car-mirror/src/lib.rs b/car-mirror/src/lib.rs index ce4eaa8..533ffda 100644 --- a/car-mirror/src/lib.rs +++ b/car-mirror/src/lib.rs @@ -86,3 +86,63 @@ mod tests { Ok(()) } } + +#[cfg(test)] +mod proptests { + use crate::{ + test_utils::{encode, generate_dag}, + walk_dag_in_order_breadth_first, + }; + use futures::TryStreamExt; + use libipld::{ + multihash::{Code, MultihashDigest}, + Cid, Ipld, IpldCodec, + }; + use proptest::strategy::Strategy; + use std::collections::BTreeSet; + use test_strategy::proptest; + use wnfs_common::{BlockStore, MemoryBlockStore}; + + fn ipld_dags() -> impl Strategy, Cid)> { + generate_dag(256, |cids| { + let ipld = Ipld::List(cids.into_iter().map(Ipld::Link).collect()); + let cid = Cid::new_v1( + IpldCodec::DagCbor.into(), + Code::Blake3_256.digest(&encode(&ipld)), + ); + (cid, ipld) + }) + } + + #[proptest(max_shrink_iters = 100_000)] + fn walk_dag_never_iterates_block_twice(#[strategy(ipld_dags())] dag: (Vec<(Cid, Ipld)>, Cid)) { + async_std::task::block_on(async { + let (dag, root) = dag; + let store = &MemoryBlockStore::new(); + for (cid, ipld) in 
dag.iter() { + let cid_store = store + .put_block(encode(ipld), IpldCodec::DagCbor.into()) + .await + .unwrap(); + assert_eq!(*cid, cid_store); + } + + let mut cids = walk_dag_in_order_breadth_first(root, store) + .map_ok(|(cid, _)| cid) + .try_collect::>() + .await + .unwrap(); + + cids.sort(); + + let unique_cids = cids + .iter() + .cloned() + .collect::>() + .into_iter() + .collect::>(); + + assert_eq!(cids, unique_cids); + }); + } +} diff --git a/car-mirror/src/test_utils/dag_strategy.rs b/car-mirror/src/test_utils/dag_strategy.rs new file mode 100644 index 0000000..003c25e --- /dev/null +++ b/car-mirror/src/test_utils/dag_strategy.rs @@ -0,0 +1,51 @@ +use std::{collections::HashSet, fmt::Debug}; + +use bytes::Bytes; +use libipld::{Cid, Ipld, IpldCodec}; +use libipld_core::codec::Encode; +use proptest::strategy::Strategy; +use roaring_graphs::{arb_dag, DirectedAcyclicGraph, Vertex}; + +pub fn encode(ipld: &Ipld) -> Bytes { + let mut vec = Vec::new(); + ipld.encode(IpldCodec::DagCbor, &mut vec).unwrap(); // TODO(matheus23) unwrap + Bytes::from(vec) +} + +pub fn generate_dag( + max_nodes: u16, + generate_block: fn(Vec) -> (Cid, T), +) -> impl Strategy, Cid)> { + arb_dag(1..max_nodes, 0.5).prop_map(move |dag| dag_to_nodes(&dag, generate_block)) +} + +pub fn dag_to_nodes( + dag: &DirectedAcyclicGraph, + generate_node: fn(Vec) -> (Cid, T), +) -> (Vec<(Cid, T)>, Cid) { + let mut blocks = Vec::new(); + let mut visited = HashSet::new(); + let (cid, block) = dag_to_nodes_helper(dag, 0, generate_node, &mut blocks, &mut visited); + blocks.push((cid, block)); + (blocks, cid) +} + +pub fn dag_to_nodes_helper( + dag: &DirectedAcyclicGraph, + root: Vertex, + generate_node: fn(Vec) -> (Cid, T), + arr: &mut Vec<(Cid, T)>, + visited: &mut HashSet, +) -> (Cid, T) { + let mut child_blocks = Vec::new(); + for child in dag.iter_children(root) { + if visited.contains(&child) { + continue; + } + visited.insert(child); + child_blocks.push(dag_to_nodes_helper(dag, child, generate_node, 
arr, visited)); + } + let result = generate_node(child_blocks.iter().map(|(cid, _)| *cid).collect()); + arr.extend(child_blocks); + result +} diff --git a/car-mirror/src/test_utils/mod.rs b/car-mirror/src/test_utils/mod.rs index 4a30e2a..890a5ad 100644 --- a/car-mirror/src/test_utils/mod.rs +++ b/car-mirror/src/test_utils/mod.rs @@ -1,5 +1,9 @@ +#[cfg(feature = "test_utils")] +mod dag_strategy; /// Random value generator for sampling data. #[cfg(feature = "test_utils")] mod rvg; #[cfg(feature = "test_utils")] +pub use dag_strategy::*; +#[cfg(feature = "test_utils")] pub use rvg::*; From ca386907a258256baf9c25fac03d7fb9372381cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= Date: Wed, 16 Aug 2023 11:27:30 +0200 Subject: [PATCH 03/35] Write a stream of blocks into a CAR file --- Cargo.lock | 113 ++++++++++++++++++++++++++++++++++++++++-- car-mirror/Cargo.toml | 4 ++ car-mirror/src/lib.rs | 53 ++++++++++++++++++-- 3 files changed, 163 insertions(+), 7 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 45b7775..d4cff6a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,21 @@ # It is not intended for manual editing. 
version = 3 +[[package]] +name = "addr2line" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4fa78e18c64fce05e902adecd7a5eed15a5e0a3439f7b0e169f0252214865e3" +dependencies = [ + "gimli", +] + +[[package]] +name = "adler" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" + [[package]] name = "android-tzdata" version = "0.1.1" @@ -183,9 +198,9 @@ checksum = "ecc7ab41815b3c653ccd2978ec3255c81349336702dfdf62ee6f7069b12a3aae" [[package]] name = "async-trait" -version = "0.1.69" +version = "0.1.73" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b2d0f03b3640e3a630367e40c468cb7f309529c708ed1d88597047b0e7c6ef7" +checksum = "bc00ceb34980c03614e35a3a4e218276a0a824e911d07651cd0d858a51e8c0f0" dependencies = [ "proc-macro2", "quote", @@ -215,6 +230,21 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" +[[package]] +name = "backtrace" +version = "0.3.68" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4319208da049c43661739c5fade2ba182f09d1dc2299b32298d3a31692b17e12" +dependencies = [ + "addr2line", + "cc", + "cfg-if", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", +] + [[package]] name = "base-x" version = "0.2.11" @@ -335,15 +365,19 @@ dependencies = [ "anyhow", "async-std", "async-stream", + "async-trait", "bytes", "car-mirror", "fixedbitset", "futures", + "iroh-car", "libipld", "libipld-core", "proptest", "roaring-graphs", "test-strategy", + "tokio", + "tokio-util", "tracing", "tracing-subscriber", "wnfs-common", @@ -798,6 +832,12 @@ dependencies = [ "wasi", ] +[[package]] +name = "gimli" +version = "0.27.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"b6c80984affa11d98d1b88b66ac8853f143217b399d3c74116778ff8fdb4ed2e" + [[package]] name = "gloo-timers" version = "0.2.6" @@ -890,6 +930,21 @@ dependencies = [ "windows-sys", ] +[[package]] +name = "iroh-car" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a291220adb48738bdea587156c5f44ca5ec4ad31fdeb8fb88fda1dcd7886a24" +dependencies = [ + "anyhow", + "cid", + "futures", + "libipld", + "thiserror", + "tokio", + "unsigned-varint", +] + [[package]] name = "itertools" version = "0.10.5" @@ -1061,6 +1116,15 @@ version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" +[[package]] +name = "miniz_oxide" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7" +dependencies = [ + "adler", +] + [[package]] name = "multibase" version = "0.9.1" @@ -1125,6 +1189,15 @@ dependencies = [ "libm", ] +[[package]] +name = "object" +version = "0.31.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bda667d9f2b5051b8833f59f3bf748b28ef54f850f4fcb389a252aa383866d1" +dependencies = [ + "memchr", +] + [[package]] name = "once_cell" version = "1.18.0" @@ -1269,9 +1342,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.28" +version = "1.0.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b9ab9c7eadfd8df19006f1cf1a4aed13540ed5cbc047010ece5826e10825488" +checksum = "50f3b39ccfb720540debaa0164757101c08ecb8d326b15358ce76a62c7e85965" dependencies = [ "proc-macro2", ] @@ -1374,6 +1447,12 @@ dependencies = [ "roaring", ] +[[package]] +name = "rustc-demangle" +version = "0.1.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" + [[package]] name = "rustix" version = 
"0.37.20" @@ -1654,6 +1733,32 @@ dependencies = [ "serde_json", ] +[[package]] +name = "tokio" +version = "1.29.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "532826ff75199d5833b9d2c5fe410f29235e25704ee5f0ef599fb51c21f4a4da" +dependencies = [ + "autocfg", + "backtrace", + "bytes", + "pin-project-lite", +] + +[[package]] +name = "tokio-util" +version = "0.7.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "806fe8c2c87eccc8b3267cbae29ed3ab2d0bd37fca70ab622e46aaa9375ddb7d" +dependencies = [ + "bytes", + "futures-core", + "futures-io", + "futures-sink", + "pin-project-lite", + "tokio", +] + [[package]] name = "toml" version = "0.5.11" diff --git a/car-mirror/Cargo.toml b/car-mirror/Cargo.toml index 30f2028..5d7094f 100644 --- a/car-mirror/Cargo.toml +++ b/car-mirror/Cargo.toml @@ -28,13 +28,17 @@ async-stream = "0.3.5" bytes = "1.4.0" fixedbitset = "0.4.2" futures = "0.3.28" +iroh-car = "0.3.0" libipld = "0.16.0" libipld-core = "0.16.0" proptest = { version = "1.1", optional = true } roaring-graphs = "0.12" +tokio-util = { version = "0.7.8", features = ["compat"] } +tokio = { version = "^1", features = ["io-util"] } tracing = "0.1" tracing-subscriber = "0.3" wnfs-common = "0.1.23" +async-trait = "0.1.73" [dev-dependencies] async-std = { version = "1.11", features = ["attributes"] } diff --git a/car-mirror/src/lib.rs b/car-mirror/src/lib.rs index 533ffda..3ee79e2 100644 --- a/car-mirror/src/lib.rs +++ b/car-mirror/src/lib.rs @@ -7,13 +7,15 @@ use anyhow::Result; use async_stream::try_stream; use bytes::Bytes; -use futures::Stream; +use futures::{Stream, StreamExt}; +use iroh_car::CarWriter; use libipld::{Ipld, IpldCodec}; use libipld_core::{cid::Cid, codec::References}; use std::{ collections::{HashSet, VecDeque}, io::Cursor, }; +use tokio::io::AsyncWrite; use wnfs_common::BlockStore; /// Test utilities. 
@@ -25,8 +27,8 @@ pub mod test_utils; pub fn walk_dag_in_order_breadth_first<'a>( root: Cid, store: &'a impl BlockStore, -) -> impl Stream> + 'a { - try_stream! { +) -> impl Stream> + Unpin + 'a { + Box::pin(try_stream! { let mut visited = HashSet::new(); let mut frontier = VecDeque::from([root]); while let Some(cid) = frontier.pop_front() { @@ -39,7 +41,19 @@ pub fn walk_dag_in_order_breadth_first<'a>( frontier.extend(references(codec, &block)?); yield (cid, block); } + }) +} + +/// Writes a stream of blocks into a car file +pub async fn stream_into_car( + mut blocks: impl Stream> + Unpin, + writer: &mut CarWriter, +) -> Result<()> { + while let Some(result) = blocks.next().await { + let (cid, bytes) = result?; + writer.write(cid, bytes).await?; } + Ok(()) } fn references(codec: IpldCodec, block: impl AsRef<[u8]>) -> Result> { @@ -50,10 +64,43 @@ fn references(codec: IpldCodec, block: impl AsRef<[u8]>) -> Result> { #[cfg(test)] mod tests { + use crate::test_utils::{encode, generate_dag, Rvg}; + use super::*; + use async_std::fs::File; use futures::TryStreamExt; + use iroh_car::CarHeader; + use libipld_core::multihash::{Code, MultihashDigest}; + use tokio_util::compat::FuturesAsyncWriteCompatExt; use wnfs_common::MemoryBlockStore; + #[async_std::test] + async fn test_write_into_car() -> Result<()> { + let (blocks, root) = Rvg::new().sample(&generate_dag(256, |cids| { + let ipld = Ipld::List(cids.into_iter().map(Ipld::Link).collect()); + let bytes = encode(&ipld); + let cid = Cid::new_v1(IpldCodec::DagCbor.into(), Code::Blake3_256.digest(&bytes)); + (cid, bytes) + })); + + let store = &MemoryBlockStore::new(); + for (cid, bytes) in blocks.iter() { + let cid_store = store + .put_block(bytes.clone(), IpldCodec::DagCbor.into()) + .await?; + assert_eq!(*cid, cid_store); + } + + let file = File::create("./my-car3.car").await?; + let mut writer = CarWriter::new(CarHeader::new_v1(vec![root]), file.compat_write()); + writer.write_header().await?; + let block_stream = 
walk_dag_in_order_breadth_first(root, store); + stream_into_car(block_stream, &mut writer).await?; + let file = writer.finish().await?; + + Ok(()) + } + #[async_std::test] async fn test_walk_dag_breadth_first() -> Result<()> { let store = &MemoryBlockStore::new(); From 9e4fcf860914c426076880ce7ac7f52e1f9e35aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= Date: Wed, 16 Aug 2023 12:14:09 +0200 Subject: [PATCH 04/35] Read back car files and make sure they're incrementally verified --- car-mirror/src/lib.rs | 75 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 66 insertions(+), 9 deletions(-) diff --git a/car-mirror/src/lib.rs b/car-mirror/src/lib.rs index 3ee79e2..f2e8cb7 100644 --- a/car-mirror/src/lib.rs +++ b/car-mirror/src/lib.rs @@ -4,18 +4,21 @@ //! car-mirror -use anyhow::Result; +use anyhow::{anyhow, bail, Result}; use async_stream::try_stream; use bytes::Bytes; use futures::{Stream, StreamExt}; -use iroh_car::CarWriter; +use iroh_car::{CarReader, CarWriter}; use libipld::{Ipld, IpldCodec}; -use libipld_core::{cid::Cid, codec::References}; +use libipld_core::{ + cid::Cid, + codec::References, + multihash::{Code, MultihashDigest}, +}; use std::{ collections::{HashSet, VecDeque}, io::Cursor, }; -use tokio::io::AsyncWrite; use wnfs_common::BlockStore; /// Test utilities. @@ -45,7 +48,7 @@ pub fn walk_dag_in_order_breadth_first<'a>( } /// Writes a stream of blocks into a car file -pub async fn stream_into_car( +pub async fn stream_into_car( mut blocks: impl Stream> + Unpin, writer: &mut CarWriter, ) -> Result<()> { @@ -56,6 +59,49 @@ pub async fn stream_into_car( Ok(()) } +/// Read a directed acyclic graph from a CAR file, making sure it's read in-order and +/// only blocks reachable from the root are included. +pub fn read_in_order_dag_from_car<'a, R: tokio::io::AsyncRead + Unpin>( + root: Cid, + reader: &'a mut CarReader, +) -> impl Stream> + Unpin + 'a { + Box::pin(try_stream! 
{ + let mut reachable_from_root = HashSet::from([root]); + while let Some((cid, vec)) = reader.next_block().await.map_err(|e| anyhow!(e))? { + let block = Bytes::from(vec); + + let code: Code = cid + .hash() + .code() + .try_into() + .map_err(|_| anyhow!("Unsupported hash code in Cid: {cid}"))?; + + let codec: IpldCodec = cid + .codec() + .try_into() + .map_err(|_| anyhow!("Unsupported codec in Cid: {cid}"))?; + + let digest = code.digest(&block); + + if cid.hash() != &digest { + Err(anyhow!( + "Digest mismatch in CAR file: expected {:?}, got {:?}", + digest, + cid.hash() + ))?; + } + + if !reachable_from_root.contains(&cid) { + Err(anyhow!("Unexpected block or block out of order: {cid}"))?; + } + + reachable_from_root.extend(references(codec, &block)?); + + yield (cid, block); + } + }) +} + fn references(codec: IpldCodec, block: impl AsRef<[u8]>) -> Result> { let mut refs = Vec::new(); >::references(codec, &mut Cursor::new(block), &mut refs)?; @@ -68,10 +114,10 @@ mod tests { use super::*; use async_std::fs::File; - use futures::TryStreamExt; + use futures::{future, TryStreamExt}; use iroh_car::CarHeader; use libipld_core::multihash::{Code, MultihashDigest}; - use tokio_util::compat::FuturesAsyncWriteCompatExt; + use tokio_util::compat::{FuturesAsyncReadCompatExt, FuturesAsyncWriteCompatExt}; use wnfs_common::MemoryBlockStore; #[async_std::test] @@ -91,12 +137,23 @@ mod tests { assert_eq!(*cid, cid_store); } - let file = File::create("./my-car3.car").await?; + let filename = "./my-car.car"; + + let file = File::create(filename).await?; let mut writer = CarWriter::new(CarHeader::new_v1(vec![root]), file.compat_write()); writer.write_header().await?; let block_stream = walk_dag_in_order_breadth_first(root, store); stream_into_car(block_stream, &mut writer).await?; - let file = writer.finish().await?; + writer.finish().await?; + + let mut reader = CarReader::new(File::open(filename).await?.compat()).await?; + + read_in_order_dag_from_car(root, &mut reader) + 
.try_for_each(|(cid, _)| { + println!("Got {cid}"); + future::ready(Ok(())) + }) + .await?; Ok(()) } From aa22ea06722fe797e4cb453375be896f6a25abd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= Date: Wed, 16 Aug 2023 14:18:44 +0200 Subject: [PATCH 05/35] Add over-the-wire datatypes --- Cargo.lock | 63 ++++++++++++++++++++++++++++---------- car-mirror/Cargo.toml | 2 ++ car-mirror/src/lib.rs | 5 ++- car-mirror/src/messages.rs | 60 ++++++++++++++++++++++++++++++++++++ 4 files changed, 112 insertions(+), 18 deletions(-) create mode 100644 car-mirror/src/messages.rs diff --git a/Cargo.lock b/Cargo.lock index d4cff6a..15f9f73 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -187,7 +187,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" dependencies = [ "proc-macro2", "quote", - "syn 2.0.27", + "syn 2.0.28", ] [[package]] @@ -204,7 +204,7 @@ checksum = "bc00ceb34980c03614e35a3a4e218276a0a824e911d07651cd0d858a51e8c0f0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.27", + "syn 2.0.28", ] [[package]] @@ -375,6 +375,8 @@ dependencies = [ "libipld-core", "proptest", "roaring-graphs", + "serde", + "serde_ipld_dagcbor", "test-strategy", "tokio", "tokio-util", @@ -410,6 +412,15 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" +[[package]] +name = "cbor4ii" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b544cf8c89359205f4f990d0e6f3828db42df85b5dac95d09157a250eb0749c4" +dependencies = [ + "serde", +] + [[package]] name = "cc" version = "1.0.79" @@ -778,7 +789,7 @@ checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" dependencies = [ "proc-macro2", "quote", - "syn 2.0.27", + "syn 2.0.28", ] [[package]] @@ -1500,11 +1511,17 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"e1cf6437eb19a8f4a6cc0f7dca544973b0b78843adbfeb3683d1a94a0024a294" +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + [[package]] name = "serde" -version = "1.0.175" +version = "1.0.183" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d25439cd7397d044e2748a6fe2432b5e85db703d6d097bd014b3c0ad1ebff0b" +checksum = "32ac8da02677876d532745a130fc9d8e6edfa81a269b107c5b00829b91d8eb3c" dependencies = [ "serde_derive", ] @@ -1529,13 +1546,25 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.175" +version = "1.0.183" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b23f7ade6f110613c0d63858ddb8b94c1041f550eab58a16b371bdf2c9c80ab4" +checksum = "aafe972d60b0b9bee71a91b92fee2d4fb3c9d7e8f6b179aa99f27203d99a4816" dependencies = [ "proc-macro2", "quote", - "syn 2.0.27", + "syn 2.0.28", +] + +[[package]] +name = "serde_ipld_dagcbor" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ace39c1b7526be78c755a4c698313f699cf44e62408c0029bf9ab9450fe836da" +dependencies = [ + "cbor4ii", + "cid", + "scopeguard", + "serde", ] [[package]] @@ -1613,7 +1642,7 @@ dependencies = [ "proc-macro2", "quote", "structmeta-derive", - "syn 2.0.27", + "syn 2.0.28", ] [[package]] @@ -1624,7 +1653,7 @@ checksum = "a60bcaff7397072dca0017d1db428e30d5002e00b6847703e2e42005c95fbe00" dependencies = [ "proc-macro2", "quote", - "syn 2.0.27", + "syn 2.0.28", ] [[package]] @@ -1640,9 +1669,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.27" +version = "2.0.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b60f673f44a8255b9c8c657daf66a596d435f2da81a555b06dc644d080ba45e0" +checksum = "04361975b3f5e348b2189d8dc55bc942f278b2d482a6a0365de5bdd62d351567" dependencies = [ "proc-macro2", "quote", @@ -1684,7 
+1713,7 @@ dependencies = [ "proc-macro2", "quote", "structmeta", - "syn 2.0.27", + "syn 2.0.28", ] [[package]] @@ -1710,7 +1739,7 @@ checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.27", + "syn 2.0.28", ] [[package]] @@ -1788,7 +1817,7 @@ checksum = "8803eee176538f94ae9a14b55b2804eb7e1441f8210b1c31290b3bccdccff73b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.27", + "syn 2.0.28", ] [[package]] @@ -1928,7 +1957,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.27", + "syn 2.0.28", "wasm-bindgen-shared", ] @@ -1962,7 +1991,7 @@ checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.27", + "syn 2.0.28", "wasm-bindgen-backend", "wasm-bindgen-shared", ] diff --git a/car-mirror/Cargo.toml b/car-mirror/Cargo.toml index 5d7094f..a25201f 100644 --- a/car-mirror/Cargo.toml +++ b/car-mirror/Cargo.toml @@ -39,6 +39,8 @@ tracing = "0.1" tracing-subscriber = "0.3" wnfs-common = "0.1.23" async-trait = "0.1.73" +serde_ipld_dagcbor = "0.4.0" +serde = "1.0.183" [dev-dependencies] async-std = { version = "1.11", features = ["attributes"] } diff --git a/car-mirror/src/lib.rs b/car-mirror/src/lib.rs index f2e8cb7..d3fc7e2 100644 --- a/car-mirror/src/lib.rs +++ b/car-mirror/src/lib.rs @@ -4,7 +4,7 @@ //! car-mirror -use anyhow::{anyhow, bail, Result}; +use anyhow::{anyhow, Result}; use async_stream::try_stream; use bytes::Bytes; use futures::{Stream, StreamExt}; @@ -26,6 +26,9 @@ use wnfs_common::BlockStore; #[cfg_attr(docsrs, doc(cfg(feature = "test_utils")))] pub mod test_utils; +/// Contains the data types that are sent over-the-wire and relevant serialization code. 
+pub mod messages; + /// walks a DAG from given root breadth-first along IPLD links pub fn walk_dag_in_order_breadth_first<'a>( root: Cid, diff --git a/car-mirror/src/messages.rs b/car-mirror/src/messages.rs new file mode 100644 index 0000000..815fe45 --- /dev/null +++ b/car-mirror/src/messages.rs @@ -0,0 +1,60 @@ +use libipld_core::cid::Cid; +use serde::{Deserialize, Serialize}; + +/// Initial message for pull requests. +/// +/// Over-the-wire data type from the [specification]. +/// +/// [specification]: https://github.com/fission-codes/spec/blob/86fcfb07d507f1df4fdaaf49088abecbb1dda76a/car-pool/car-mirror/http.md#12-requestor-payload +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct PullRequest { + /// Requested CID roots + #[serde(rename = "rs")] + pub resources: Vec, + + /// Bloom filter hash count + #[serde(rename = "bk")] + pub bloom_k: u32, + + /// Bloom filter Binary + #[serde(rename = "bb")] + pub bloom: Vec, +} + +/// Part of the initial message for push requests. +/// The other part is simply tupled together with the actual initial +/// CAR file. +/// +/// Wire data type from the [specification]. +/// +/// [specification]: https://github.com/fission-codes/spec/blob/86fcfb07d507f1df4fdaaf49088abecbb1dda76a/car-pool/car-mirror/http.md#22-requestor-payload +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct PushRequestHeader { + /// Bloom filter hash count + #[serde(rename = "bk")] + pub bloom_k: u32, + + /// Bloom filter Binary + #[serde(rename = "bb")] + pub bloom: Vec, +} + +/// The response sent after the initial and subsequent push requests. +/// +/// Wire data type from the [specification]. 
+/// +/// [specification]: https://github.com/fission-codes/spec/blob/86fcfb07d507f1df4fdaaf49088abecbb1dda76a/car-pool/car-mirror/http.md#23-provider-payload +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct PushResponse { + /// Incomplete subgraph roots + #[serde(rename = "sr")] + pub subgraph_roots: Vec, + + /// Bloom filter hash count + #[serde(rename = "bk")] + pub bloom_k: u32, + + /// Bloom filter Binary + #[serde(rename = "bb")] + pub bloom: Vec, +} From 0c197aad717dba70ae642a4050946c8a6a831858 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= Date: Wed, 16 Aug 2023 18:42:51 +0200 Subject: [PATCH 06/35] Small demo of running CAR mirror in-memory --- car-mirror/src/lib.rs | 213 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 196 insertions(+), 17 deletions(-) diff --git a/car-mirror/src/lib.rs b/car-mirror/src/lib.rs index d3fc7e2..2701d8e 100644 --- a/car-mirror/src/lib.rs +++ b/car-mirror/src/lib.rs @@ -4,17 +4,18 @@ //! car-mirror -use anyhow::{anyhow, Result}; +use anyhow::{anyhow, bail, Result}; use async_stream::try_stream; use bytes::Bytes; -use futures::{Stream, StreamExt}; -use iroh_car::{CarReader, CarWriter}; +use futures::{stream::LocalBoxStream, Stream, StreamExt, TryStreamExt}; +use iroh_car::{CarHeader, CarReader, CarWriter}; use libipld::{Ipld, IpldCodec}; use libipld_core::{ cid::Cid, codec::References, multihash::{Code, MultihashDigest}, }; +use messages::PushResponse; use std::{ collections::{HashSet, VecDeque}, io::Cursor, @@ -29,14 +30,137 @@ pub mod test_utils; /// Contains the data types that are sent over-the-wire and relevant serialization code. 
pub mod messages; +pub struct PushSenderSession<'a, B: BlockStore> { + last_response: PushResponse, + send_limit: usize, + store: &'a B, +} + +impl<'a, B: BlockStore> PushSenderSession<'a, B> { + pub fn new(root: Cid, store: &'a B) -> Self { + Self { + last_response: PushResponse { + subgraph_roots: vec![root], + // Just putting an empty bloom here initially + bloom_k: 3, + bloom: Vec::new(), + }, + send_limit: 256 * 1024, // 256KiB + store, + } + } + + pub fn handle_response(&mut self, response: PushResponse) -> bool { + self.last_response = response; + self.last_response.subgraph_roots.is_empty() + } + + pub async fn next_request(&mut self) -> Result { + let mut writer = CarWriter::new( + CarHeader::new_v1( + // TODO(matheus23): This is stupid + // CAR files *must* have at least one CID in them, and all of them + // need to appear as a block in the payload. + // It would probably make most sense to just write all subgraph roots into this, + // but we don't know how many of the subgraph roots fit into this round yet, + // so we're simply writing the first one in here, since we know + // at least one block will be written (and it'll be that one). + self.last_response + .subgraph_roots + .iter() + .take(1) + .cloned() + .collect(), + ), + Vec::new(), + ); + writer.write_header().await?; + + let mut block_bytes = 0; + let mut stream = + walk_dag_in_order_breadth_first(self.last_response.subgraph_roots.clone(), self.store); + while let Some((cid, block)) = stream.try_next().await? { + // TODO Eventually we'll need to turn the `LocalBoxStream` into a more configurable + // "external iterator", and then this will be the point where we prune parts of the DAG + // that the recipient already has. + + // TODO(matheus23): Count the actual bytes sent? 
+ block_bytes += block.len(); + if block_bytes > self.send_limit { + break; + } + + writer.write(cid, &block).await?; + } + + Ok(writer.finish().await?.into()) + } +} + +pub struct PushReceiverSession<'a, B: BlockStore> { + accepted_roots: Vec, + receive_limit: usize, + store: &'a B, +} + +impl<'a, B: BlockStore> PushReceiverSession<'a, B> { + pub fn new(root: Cid, store: &'a B) -> Self { + Self { + accepted_roots: vec![root], + receive_limit: 256 * 1024, // 256KiB + store, + } + } + + pub async fn handle_request(&mut self, request: Bytes) -> Result { + let mut reader = CarReader::new(Cursor::new(request)).await?; + let mut stream = read_in_order_dag_from_car(self.accepted_roots.clone(), &mut reader); + + let mut missing_subgraphs: HashSet<_> = self.accepted_roots.iter().cloned().collect(); + + let mut block_bytes = 0; + while let Some((cid, block)) = stream.try_next().await? { + block_bytes += block.len(); + if block_bytes > self.receive_limit { + bail!( + "Received more than {} bytes ({block_bytes}), aborting request.", + self.receive_limit + ); + } + + let codec: IpldCodec = cid + .codec() + .try_into() + .map_err(|_| anyhow!("Unsupported codec in Cid: {cid}"))?; + + missing_subgraphs.remove(&cid); + missing_subgraphs.extend(references(codec, &block)?); + + self.store.put_block(block, cid.codec()).await?; + } + + let subgraph_roots: Vec<_> = missing_subgraphs.into_iter().collect(); + + self.accepted_roots = subgraph_roots.clone(); + + Ok(PushResponse { + subgraph_roots, + // We ignore blooms for now + bloom_k: 3, + bloom: Vec::new(), + }) + } +} + /// walks a DAG from given root breadth-first along IPLD links -pub fn walk_dag_in_order_breadth_first<'a>( - root: Cid, - store: &'a impl BlockStore, -) -> impl Stream> + Unpin + 'a { +pub fn walk_dag_in_order_breadth_first( + roots: impl IntoIterator, + store: &impl BlockStore, +) -> LocalBoxStream<'_, Result<(Cid, Bytes)>> { + let mut frontier: VecDeque<_> = roots.into_iter().collect(); + Box::pin(try_stream! 
{ let mut visited = HashSet::new(); - let mut frontier = VecDeque::from([root]); while let Some(cid) = frontier.pop_front() { if visited.contains(&cid) { continue; @@ -64,12 +188,12 @@ pub async fn stream_into_car( /// Read a directed acyclic graph from a CAR file, making sure it's read in-order and /// only blocks reachable from the root are included. -pub fn read_in_order_dag_from_car<'a, R: tokio::io::AsyncRead + Unpin>( - root: Cid, - reader: &'a mut CarReader, -) -> impl Stream> + Unpin + 'a { +pub fn read_in_order_dag_from_car( + roots: impl IntoIterator, + reader: &mut CarReader, +) -> LocalBoxStream<'_, Result<(Cid, Bytes)>> { + let mut reachable_from_root: HashSet<_> = roots.into_iter().collect(); Box::pin(try_stream! { - let mut reachable_from_root = HashSet::from([root]); while let Some((cid, vec)) = reader.next_block().await.map_err(|e| anyhow!(e))? { let block = Bytes::from(vec); @@ -113,6 +237,8 @@ fn references(codec: IpldCodec, block: impl AsRef<[u8]>) -> Result> { #[cfg(test)] mod tests { + use std::collections::BTreeMap; + use crate::test_utils::{encode, generate_dag, Rvg}; use super::*; @@ -145,13 +271,13 @@ mod tests { let file = File::create(filename).await?; let mut writer = CarWriter::new(CarHeader::new_v1(vec![root]), file.compat_write()); writer.write_header().await?; - let block_stream = walk_dag_in_order_breadth_first(root, store); + let block_stream = walk_dag_in_order_breadth_first([root], store); stream_into_car(block_stream, &mut writer).await?; writer.finish().await?; let mut reader = CarReader::new(File::open(filename).await?.compat()).await?; - read_in_order_dag_from_car(root, &mut reader) + read_in_order_dag_from_car([root], &mut reader) .try_for_each(|(cid, _)| { println!("Got {cid}"); future::ready(Ok(())) @@ -161,6 +287,59 @@ mod tests { Ok(()) } + #[async_std::test] + async fn test_transfer() -> Result<()> { + let (blocks, root) = Rvg::new().sample(&generate_dag(256, |cids| { + let ipld = Ipld::Map(BTreeMap::from([ + 
("data".into(), Ipld::Bytes(vec![0u8; 10 * 1024])), + ( + "links".into(), + Ipld::List(cids.into_iter().map(Ipld::Link).collect()), + ), + ])); + let bytes = encode(&ipld); + let cid = Cid::new_v1(IpldCodec::DagCbor.into(), Code::Blake3_256.digest(&bytes)); + (cid, bytes) + })); + + let sender_store = &MemoryBlockStore::new(); + for (cid, bytes) in blocks.iter() { + let cid_store = sender_store + .put_block(bytes.clone(), IpldCodec::DagCbor.into()) + .await?; + assert_eq!(*cid, cid_store); + } + + let receiver_store = &MemoryBlockStore::new(); + + let mut sender = PushSenderSession::new(root, sender_store); + let mut receiver = PushReceiverSession::new(root, receiver_store); + + loop { + let request = sender.next_request().await?; + println!("Sending request {} bytes", request.len()); + let response = receiver.handle_request(request).await?; + if sender.handle_response(response) { + // Should be done + break; + } + } + + // receiver should have all data + let sender_cids = walk_dag_in_order_breadth_first([root], sender_store) + .map_ok(|(cid, _)| cid) + .try_collect::>() + .await?; + let receiver_cids = walk_dag_in_order_breadth_first([root], receiver_store) + .map_ok(|(cid, _)| cid) + .try_collect::>() + .await?; + + assert_eq!(sender_cids, receiver_cids); + + Ok(()) + } + #[async_std::test] async fn test_walk_dag_breadth_first() -> Result<()> { let store = &MemoryBlockStore::new(); @@ -181,7 +360,7 @@ mod tests { ])) .await?; - let cids = walk_dag_in_order_breadth_first(cid_root, store) + let cids = walk_dag_in_order_breadth_first([cid_root], store) .try_collect::>() .await? 
.into_iter() @@ -234,7 +413,7 @@ mod proptests { assert_eq!(*cid, cid_store); } - let mut cids = walk_dag_in_order_breadth_first(root, store) + let mut cids = walk_dag_in_order_breadth_first([root], store) .map_ok(|(cid, _)| cid) .try_collect::>() .await From 1f2a8590446b3db4382d032b5c3404414273b18e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= Date: Fri, 18 Aug 2023 11:09:50 +0200 Subject: [PATCH 07/35] Make protocol stateless --- car-mirror/src/lib.rs | 477 ++++++++++++++++++++----------------- car-mirror/src/messages.rs | 6 + 2 files changed, 266 insertions(+), 217 deletions(-) diff --git a/car-mirror/src/lib.rs b/car-mirror/src/lib.rs index 2701d8e..ad547c3 100644 --- a/car-mirror/src/lib.rs +++ b/car-mirror/src/lib.rs @@ -5,22 +5,18 @@ //! car-mirror use anyhow::{anyhow, bail, Result}; -use async_stream::try_stream; use bytes::Bytes; -use futures::{stream::LocalBoxStream, Stream, StreamExt, TryStreamExt}; +use futures::{stream::try_unfold, Stream, StreamExt, TryStreamExt}; use iroh_car::{CarHeader, CarReader, CarWriter}; use libipld::{Ipld, IpldCodec}; -use libipld_core::{ - cid::Cid, - codec::References, - multihash::{Code, MultihashDigest}, -}; +use libipld_core::{cid::Cid, codec::References}; use messages::PushResponse; use std::{ collections::{HashSet, VecDeque}, + eprintln, io::Cursor, }; -use wnfs_common::BlockStore; +use wnfs_common::{BlockStore, BlockStoreError}; /// Test utilities. #[cfg(any(test, feature = "test_utils"))] @@ -30,148 +26,199 @@ pub mod test_utils; /// Contains the data types that are sent over-the-wire and relevant serialization code. 
pub mod messages; -pub struct PushSenderSession<'a, B: BlockStore> { - last_response: PushResponse, - send_limit: usize, - store: &'a B, +pub struct PushConfig { + send_minimum: usize, + receive_maximum: usize, + max_roots_per_round: usize, } -impl<'a, B: BlockStore> PushSenderSession<'a, B> { - pub fn new(root: Cid, store: &'a B) -> Self { +impl Default for PushConfig { + fn default() -> Self { Self { - last_response: PushResponse { - subgraph_roots: vec![root], - // Just putting an empty bloom here initially - bloom_k: 3, - bloom: Vec::new(), - }, - send_limit: 256 * 1024, // 256KiB - store, + send_minimum: 128 * 1024, // 128KiB + receive_maximum: 512 * 1024, // 512KiB + max_roots_per_round: 1000, // max. ~41KB of CIDs } } +} + +pub async fn client_initiate_push( + root: Cid, + config: &PushConfig, + store: &impl BlockStore, +) -> Result { + let fake_response = PushResponse { + subgraph_roots: vec![root], + // Just putting an empty bloom here + bloom_k: 3, + bloom: Vec::new(), + }; + client_push(root, &fake_response, config, store).await +} - pub fn handle_response(&mut self, response: PushResponse) -> bool { - self.last_response = response; - self.last_response.subgraph_roots.is_empty() +pub async fn client_push( + root: Cid, + last_response: &PushResponse, + config: &PushConfig, + store: &impl BlockStore, +) -> Result { + // Verify that all subgraph roots are in the relevant DAG: + let subgraph_roots: Vec = DagWalk::breadth_first([root]) + .stream(store) + .try_filter_map(|(cid, _)| async move { + Ok(last_response.subgraph_roots.contains(&cid).then_some(cid)) + }) + .try_collect() + .await?; + + let mut writer = CarWriter::new( + CarHeader::new_v1( + // TODO(matheus23): This is stupid + // CAR files *must* have at least one CID in them, and all of them + // need to appear as a block in the payload. 
+ // It would probably make most sense to just write all subgraph roots into this, + // but we don't know how many of the subgraph roots fit into this round yet, + // so we're simply writing the first one in here, since we know + // at least one block will be written (and it'll be that one). + subgraph_roots.iter().take(1).cloned().collect(), + ), + Vec::new(), + ); + + writer.write_header().await?; + + let mut block_bytes = 0; + let mut dag_walk = DagWalk::breadth_first(subgraph_roots); + while let Some((cid, block)) = dag_walk.next(store).await? { + writer.write(cid, &block).await?; + println!("Sending {cid}"); + + // TODO(matheus23): Count the actual bytes sent? + block_bytes += block.len(); + if block_bytes > config.send_minimum { + break; + } } - pub async fn next_request(&mut self) -> Result { - let mut writer = CarWriter::new( - CarHeader::new_v1( - // TODO(matheus23): This is stupid - // CAR files *must* have at least one CID in them, and all of them - // need to appear as a block in the payload. - // It would probably make most sense to just write all subgraph roots into this, - // but we don't know how many of the subgraph roots fit into this round yet, - // so we're simply writing the first one in here, since we know - // at least one block will be written (and it'll be that one). - self.last_response - .subgraph_roots - .iter() - .take(1) - .cloned() - .collect(), - ), - Vec::new(), - ); - writer.write_header().await?; - - let mut block_bytes = 0; - let mut stream = - walk_dag_in_order_breadth_first(self.last_response.subgraph_roots.clone(), self.store); - while let Some((cid, block)) = stream.try_next().await? { - // TODO Eventually we'll need to turn the `LocalBoxStream` into a more configurable - // "external iterator", and then this will be the point where we prune parts of the DAG - // that the recipient already has. - - // TODO(matheus23): Count the actual bytes sent? 
- block_bytes += block.len(); - if block_bytes > self.send_limit { - break; - } + Ok(writer.finish().await?.into()) +} + +pub async fn server_push_response( + root: Cid, + request: Bytes, + config: &PushConfig, + store: &impl BlockStore, +) -> Result { + let mut dag_verification = IncrementalDagVerification::new([root], store).await?; - writer.write(cid, &block).await?; + let mut reader = CarReader::new(Cursor::new(request)).await?; + let mut block_bytes = 0; + + while let Some((cid, vec)) = reader.next_block().await? { + let block = Bytes::from(vec); + println!("Received {cid}"); + + block_bytes += block.len(); + if block_bytes > config.receive_maximum { + bail!( + "Received more than {} bytes ({block_bytes}), aborting request.", + config.receive_maximum + ); } - Ok(writer.finish().await?.into()) + dag_verification + .verify_and_store_block((cid, block), store) + .await?; } + + let subgraph_roots = dag_verification + .want_cids + .iter() + .take(config.max_roots_per_round) + .cloned() + .collect(); + + Ok(PushResponse { + subgraph_roots, + // We ignore blooms for now + bloom_k: 3, + bloom: Vec::new(), + }) } -pub struct PushReceiverSession<'a, B: BlockStore> { - accepted_roots: Vec, - receive_limit: usize, - store: &'a B, +pub struct DagWalk { + pub frontier: VecDeque, + pub visited: HashSet, + pub breadth_first: bool, } -impl<'a, B: BlockStore> PushReceiverSession<'a, B> { - pub fn new(root: Cid, store: &'a B) -> Self { - Self { - accepted_roots: vec![root], - receive_limit: 256 * 1024, // 256KiB - store, - } +impl DagWalk { + pub fn breadth_first(roots: impl IntoIterator) -> Self { + Self::new(roots, true) } - pub async fn handle_request(&mut self, request: Bytes) -> Result { - let mut reader = CarReader::new(Cursor::new(request)).await?; - let mut stream = read_in_order_dag_from_car(self.accepted_roots.clone(), &mut reader); + pub fn depth_first(roots: impl IntoIterator) -> Self { + Self::new(roots, false) + } - let mut missing_subgraphs: HashSet<_> = 
self.accepted_roots.iter().cloned().collect(); + pub fn new(roots: impl IntoIterator, breadth_first: bool) -> Self { + let frontier = roots.into_iter().collect(); + let visited = HashSet::new(); + Self { + frontier, + visited, + breadth_first, + } + } - let mut block_bytes = 0; - while let Some((cid, block)) = stream.try_next().await? { - block_bytes += block.len(); - if block_bytes > self.receive_limit { - bail!( - "Received more than {} bytes ({block_bytes}), aborting request.", - self.receive_limit - ); + pub async fn next(&mut self, store: &impl BlockStore) -> Result> { + let cid = loop { + let popped = if self.breadth_first { + self.frontier.pop_front() + } else { + self.frontier.pop_back() + }; + + let Some(cid) = popped else { + return Ok(None); + }; + + // We loop until we find an unvisited block + if self.visited.insert(cid) { + break cid; } + }; - let codec: IpldCodec = cid - .codec() - .try_into() - .map_err(|_| anyhow!("Unsupported codec in Cid: {cid}"))?; - - missing_subgraphs.remove(&cid); - missing_subgraphs.extend(references(codec, &block)?); - - self.store.put_block(block, cid.codec()).await?; + let block = store.get_block(&cid).await?; + let codec = IpldCodec::try_from(cid.codec())?; + for ref_cid in references(codec, &block)? 
{ + if !self.visited.contains(&ref_cid) { + self.frontier.push_back(ref_cid); + } } - let subgraph_roots: Vec<_> = missing_subgraphs.into_iter().collect(); - - self.accepted_roots = subgraph_roots.clone(); + Ok(Some((cid, block))) + } - Ok(PushResponse { - subgraph_roots, - // We ignore blooms for now - bloom_k: 3, - bloom: Vec::new(), - }) + pub fn stream( + self, + store: &impl BlockStore, + ) -> impl Stream> + Unpin + '_ { + Box::pin(try_unfold(self, move |mut this| async move { + let maybe_block = this.next(store).await?; + Ok(maybe_block.map(|b| (b, this))) + })) } -} -/// walks a DAG from given root breadth-first along IPLD links -pub fn walk_dag_in_order_breadth_first( - roots: impl IntoIterator, - store: &impl BlockStore, -) -> LocalBoxStream<'_, Result<(Cid, Bytes)>> { - let mut frontier: VecDeque<_> = roots.into_iter().collect(); - - Box::pin(try_stream! { - let mut visited = HashSet::new(); - while let Some(cid) = frontier.pop_front() { - if visited.contains(&cid) { - continue; - } - visited.insert(cid); - let block = store.get_block(&cid).await?; - let codec = IpldCodec::try_from(cid.codec())?; - frontier.extend(references(codec, &block)?); - yield (cid, block); - } - }) + pub fn is_finished(&self) -> bool { + // We're finished if the frontier does not contain any CIDs that we have not visited yet. + // Put differently: + // We're not finished if there exist unvisited CIDs in the frontier. + !self + .frontier + .iter() + .any(|frontier_cid| !self.visited.contains(frontier_cid)) + } } /// Writes a stream of blocks into a car file @@ -186,47 +233,84 @@ pub async fn stream_into_car( Ok(()) } -/// Read a directed acyclic graph from a CAR file, making sure it's read in-order and -/// only blocks reachable from the root are included. 
-pub fn read_in_order_dag_from_car( - roots: impl IntoIterator, - reader: &mut CarReader, -) -> LocalBoxStream<'_, Result<(Cid, Bytes)>> { - let mut reachable_from_root: HashSet<_> = roots.into_iter().collect(); - Box::pin(try_stream! { - while let Some((cid, vec)) = reader.next_block().await.map_err(|e| anyhow!(e))? { - let block = Bytes::from(vec); - - let code: Code = cid - .hash() - .code() - .try_into() - .map_err(|_| anyhow!("Unsupported hash code in Cid: {cid}"))?; - - let codec: IpldCodec = cid - .codec() - .try_into() - .map_err(|_| anyhow!("Unsupported codec in Cid: {cid}"))?; - - let digest = code.digest(&block); - - if cid.hash() != &digest { - Err(anyhow!( - "Digest mismatch in CAR file: expected {:?}, got {:?}", - digest, - cid.hash() - ))?; +pub struct IncrementalDagVerification { + pub want_cids: HashSet, + pub have_cids: HashSet, +} + +impl IncrementalDagVerification { + pub async fn new( + roots: impl IntoIterator, + store: &impl BlockStore, + ) -> Result { + let mut want_cids = HashSet::new(); + let mut have_cids = HashSet::new(); + let mut dag_walk = DagWalk::breadth_first(roots); + + loop { + match dag_walk.next(store).await { + Err(e) => { + if let Some(BlockStoreError::CIDNotFound(not_found)) = + e.downcast_ref::() + { + want_cids.insert(*not_found); + } else { + bail!(e); + } + } + Ok(Some((cid, _))) => { + have_cids.insert(cid); + } + Ok(None) => { + break; + } } + } - if !reachable_from_root.contains(&cid) { - Err(anyhow!("Unexpected block or block out of order: {cid}"))?; + Ok(Self { + want_cids, + have_cids, + }) + } + + pub async fn verify_and_store_block( + &mut self, + block: (Cid, Bytes), + store: &impl BlockStore, + ) -> Result<()> { + let (cid, bytes) = block; + + let codec: IpldCodec = cid + .codec() + .try_into() + .map_err(|_| anyhow!("Unsupported codec in Cid: {cid}"))?; + + if !self.want_cids.contains(&cid) { + if self.have_cids.contains(&cid) { + eprintln!("Warn: Received {cid}, even though we already have it"); + } else { + 
bail!("Unexpected block or block out of order: {cid}"); } + } - reachable_from_root.extend(references(codec, &block)?); + let refs = references(codec, &bytes)?; + let result_cid = store.put_block(bytes, codec.into()).await?; - yield (cid, block); + if result_cid != cid { + bail!("Digest mismatch in CAR file: expected {cid}, got {result_cid}"); } - }) + + for ref_cid in refs { + if !self.have_cids.contains(&ref_cid) { + self.want_cids.insert(ref_cid); + } + } + + self.want_cids.remove(&cid); + self.have_cids.insert(cid); + + Ok(()) + } } fn references(codec: IpldCodec, block: impl AsRef<[u8]>) -> Result> { @@ -237,56 +321,13 @@ fn references(codec: IpldCodec, block: impl AsRef<[u8]>) -> Result> { #[cfg(test)] mod tests { - use std::collections::BTreeMap; - - use crate::test_utils::{encode, generate_dag, Rvg}; - use super::*; - use async_std::fs::File; - use futures::{future, TryStreamExt}; - use iroh_car::CarHeader; + use crate::test_utils::{encode, generate_dag, Rvg}; + use futures::TryStreamExt; use libipld_core::multihash::{Code, MultihashDigest}; - use tokio_util::compat::{FuturesAsyncReadCompatExt, FuturesAsyncWriteCompatExt}; + use std::collections::BTreeMap; use wnfs_common::MemoryBlockStore; - #[async_std::test] - async fn test_write_into_car() -> Result<()> { - let (blocks, root) = Rvg::new().sample(&generate_dag(256, |cids| { - let ipld = Ipld::List(cids.into_iter().map(Ipld::Link).collect()); - let bytes = encode(&ipld); - let cid = Cid::new_v1(IpldCodec::DagCbor.into(), Code::Blake3_256.digest(&bytes)); - (cid, bytes) - })); - - let store = &MemoryBlockStore::new(); - for (cid, bytes) in blocks.iter() { - let cid_store = store - .put_block(bytes.clone(), IpldCodec::DagCbor.into()) - .await?; - assert_eq!(*cid, cid_store); - } - - let filename = "./my-car.car"; - - let file = File::create(filename).await?; - let mut writer = CarWriter::new(CarHeader::new_v1(vec![root]), file.compat_write()); - writer.write_header().await?; - let block_stream = 
walk_dag_in_order_breadth_first([root], store); - stream_into_car(block_stream, &mut writer).await?; - writer.finish().await?; - - let mut reader = CarReader::new(File::open(filename).await?.compat()).await?; - - read_in_order_dag_from_car([root], &mut reader) - .try_for_each(|(cid, _)| { - println!("Got {cid}"); - future::ready(Ok(())) - }) - .await?; - - Ok(()) - } - #[async_std::test] async fn test_transfer() -> Result<()> { let (blocks, root) = Rvg::new().sample(&generate_dag(256, |cids| { @@ -311,26 +352,26 @@ mod tests { } let receiver_store = &MemoryBlockStore::new(); - - let mut sender = PushSenderSession::new(root, sender_store); - let mut receiver = PushReceiverSession::new(root, receiver_store); - + let config = &PushConfig::default(); + let mut request = client_initiate_push(root, config, sender_store).await?; loop { - let request = sender.next_request().await?; println!("Sending request {} bytes", request.len()); - let response = receiver.handle_request(request).await?; - if sender.handle_response(response) { - // Should be done + let response = server_push_response(root, request, config, receiver_store).await?; + println!("Response: {:?}", response.subgraph_roots); + if response.indicates_finished() { break; } + request = client_push(root, &response, config, sender_store).await?; } // receiver should have all data - let sender_cids = walk_dag_in_order_breadth_first([root], sender_store) + let sender_cids = DagWalk::breadth_first([root]) + .stream(sender_store) .map_ok(|(cid, _)| cid) .try_collect::>() .await?; - let receiver_cids = walk_dag_in_order_breadth_first([root], receiver_store) + let receiver_cids = DagWalk::breadth_first([root]) + .stream(receiver_store) .map_ok(|(cid, _)| cid) .try_collect::>() .await?; @@ -360,7 +401,8 @@ mod tests { ])) .await?; - let cids = walk_dag_in_order_breadth_first([cid_root], store) + let cids = DagWalk::breadth_first([cid_root]) + .stream(store) .try_collect::>() .await? 
.into_iter() @@ -377,7 +419,7 @@ mod tests { mod proptests { use crate::{ test_utils::{encode, generate_dag}, - walk_dag_in_order_breadth_first, + DagWalk, }; use futures::TryStreamExt; use libipld::{ @@ -413,7 +455,8 @@ mod proptests { assert_eq!(*cid, cid_store); } - let mut cids = walk_dag_in_order_breadth_first([root], store) + let mut cids = DagWalk::breadth_first([root]) + .stream(store) .map_ok(|(cid, _)| cid) .try_collect::>() .await diff --git a/car-mirror/src/messages.rs b/car-mirror/src/messages.rs index 815fe45..c5a778f 100644 --- a/car-mirror/src/messages.rs +++ b/car-mirror/src/messages.rs @@ -58,3 +58,9 @@ pub struct PushResponse { #[serde(rename = "bb")] pub bloom: Vec, } + +impl PushResponse { + pub fn indicates_finished(&self) -> bool { + self.subgraph_roots.is_empty() + } +} From 7535cf84a2b7b159dac0cf55e92810cab55364cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= Date: Fri, 18 Aug 2023 11:58:40 +0200 Subject: [PATCH 08/35] Implement bloom filters --- Cargo.lock | 90 +++++++++++++++++++++++++++++++++++++++++++ car-mirror/Cargo.toml | 1 + car-mirror/src/lib.rs | 87 +++++++++++++++++++++++++++++++---------- 3 files changed, 158 insertions(+), 20 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 15f9f73..952b6fb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -272,6 +272,19 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" +[[package]] +name = "bitvec" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bc2832c24239b0141d5674bb9174f9d68a8b5b3f2753311927c172ca46f7e9c" +dependencies = [ + "funty", + "radium", + "serde", + "tap", + "wyz", +] + [[package]] name = "blake2b_simd" version = "1.0.1" @@ -368,6 +381,7 @@ dependencies = [ "async-trait", "bytes", "car-mirror", + "deterministic-bloom", "fixedbitset", "futures", "iroh-car", @@ -647,6 +661,20 @@ dependencies = [ "syn 
1.0.109", ] +[[package]] +name = "deterministic-bloom" +version = "0.1.0" +source = "git+https://github.com/wnfs-wg/deterministic-bloom#a8cd85b#a8cd85b1d71da9f79f5058c0a20e53a83a283230" +dependencies = [ + "bitvec", + "miette", + "rand_core", + "serde", + "thiserror", + "tracing", + "xxhash-rust", +] + [[package]] name = "digest" version = "0.10.7" @@ -718,6 +746,12 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" +[[package]] +name = "funty" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c" + [[package]] name = "futures" version = "0.3.28" @@ -1127,6 +1161,29 @@ version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" +[[package]] +name = "miette" +version = "5.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59bb584eaeeab6bd0226ccf3509a69d7936d148cf3d036ad350abe35e8c6856e" +dependencies = [ + "miette-derive", + "once_cell", + "thiserror", + "unicode-width", +] + +[[package]] +name = "miette-derive" +version = "5.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49e7bc1560b95a3c4a25d03de42fe76ca718ab92d1a22a55b9b4cf67b3ae635c" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.28", +] + [[package]] name = "miniz_oxide" version = "0.7.1" @@ -1360,6 +1417,12 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "radium" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc33ff2d4973d518d823d61aa239014831e521c75da58e3df4840d3f47749d09" + [[package]] name = "rand" version = "0.8.5" @@ -1690,6 +1753,12 @@ dependencies = [ "unicode-xid", ] +[[package]] +name = "tap" +version = "1.0.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" + [[package]] name = "tempfile" version = "3.6.0" @@ -1873,6 +1942,12 @@ version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b15811caf2415fb889178633e7724bad2509101cde276048e013b9def5e51fa0" +[[package]] +name = "unicode-width" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b" + [[package]] name = "unicode-xid" version = "0.2.4" @@ -2161,3 +2236,18 @@ dependencies = [ "serde", "thiserror", ] + +[[package]] +name = "wyz" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05f360fc0b24296329c78fda852a1e9ae82de9cf7b27dae4b7f62f118f77b9ed" +dependencies = [ + "tap", +] + +[[package]] +name = "xxhash-rust" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "735a71d46c4d68d71d4b24d03fdc2b98e38cea81730595801db779c04fe80d70" diff --git a/car-mirror/Cargo.toml b/car-mirror/Cargo.toml index a25201f..e5216c2 100644 --- a/car-mirror/Cargo.toml +++ b/car-mirror/Cargo.toml @@ -26,6 +26,7 @@ doc = true anyhow = "1.0" async-stream = "0.3.5" bytes = "1.4.0" +deterministic-bloom = { git = "https://github.com/wnfs-wg/deterministic-bloom#a8cd85b" } fixedbitset = "0.4.2" futures = "0.3.28" iroh-car = "0.3.0" diff --git a/car-mirror/src/lib.rs b/car-mirror/src/lib.rs index ad547c3..7f1871a 100644 --- a/car-mirror/src/lib.rs +++ b/car-mirror/src/lib.rs @@ -6,6 +6,7 @@ use anyhow::{anyhow, bail, Result}; use bytes::Bytes; +use deterministic_bloom::runtime_size::BloomFilter; use futures::{stream::try_unfold, Stream, StreamExt, TryStreamExt}; use iroh_car::{CarHeader, CarReader, CarWriter}; use libipld::{Ipld, IpldCodec}; @@ -30,6 +31,7 @@ pub struct PushConfig { send_minimum: usize, receive_maximum: usize, 
max_roots_per_round: usize, + bloom_fpr: f64, } impl Default for PushConfig { @@ -38,6 +40,7 @@ impl Default for PushConfig { send_minimum: 128 * 1024, // 128KiB receive_maximum: 512 * 1024, // 512KiB max_roots_per_round: 1000, // max. ~41KB of CIDs + bloom_fpr: 1.0 / 1_000.0, // 0.1% } } } @@ -53,24 +56,34 @@ pub async fn client_initiate_push( bloom_k: 3, bloom: Vec::new(), }; - client_push(root, &fake_response, config, store).await + client_push(root, fake_response, config, store).await } pub async fn client_push( root: Cid, - last_response: &PushResponse, + last_response: PushResponse, config: &PushConfig, store: &impl BlockStore, ) -> Result { + let PushResponse { + ref subgraph_roots, + bloom_k, + bloom, + } = last_response; + // Verify that all subgraph roots are in the relevant DAG: let subgraph_roots: Vec = DagWalk::breadth_first([root]) .stream(store) - .try_filter_map(|(cid, _)| async move { - Ok(last_response.subgraph_roots.contains(&cid).then_some(cid)) - }) + .try_filter_map(|(cid, _)| async move { Ok(subgraph_roots.contains(&cid).then_some(cid)) }) .try_collect() .await?; + let bloom = if bloom.is_empty() { + BloomFilter::new_with(1, Box::new([0])) // An empty bloom that contains nothing + } else { + BloomFilter::new_with(bloom_k as usize, bloom.into_boxed_slice()) + }; + let mut writer = CarWriter::new( CarHeader::new_v1( // TODO(matheus23): This is stupid @@ -88,8 +101,21 @@ pub async fn client_push( writer.write_header().await?; let mut block_bytes = 0; - let mut dag_walk = DagWalk::breadth_first(subgraph_roots); + let mut dag_walk = DagWalk::breadth_first(subgraph_roots.clone()); while let Some((cid, block)) = dag_walk.next(store).await? { + if bloom.contains(&cid.to_bytes()) && !subgraph_roots.contains(&cid) { + // TODO(matheus23) I think the spec means to prune the whole subgraph. + // But + // 1. That requires the receiver to check the whole subgraph at that CID to find out whether there's a missing block at the subgraph. + // 2. 
It requires the sender to go through every block under this subgraph down to the leaves to mark all of these CIDs as visited. + // Both of these are *huge* traversals. I'd say likely not worth it. The only case I can image they're worth it, is if the DAG + // is *heavily* using structural sharing and not tree-like. + // Also: This fails completely if the sender is just missing a single leaf. It couldn't add the block to the bloom in that case. + dag_walk.skip_walking((cid, block))?; + println!("Skipped walking {cid} due to bloom"); + break; + } + writer.write(cid, &block).await?; println!("Sending {cid}"); @@ -138,11 +164,19 @@ pub async fn server_push_response( .cloned() .collect(); + let mut bloom = + BloomFilter::new_from_fpr_po2(dag_verification.have_cids.len() as u64, config.bloom_fpr); + + dag_verification + .have_cids + .iter() + .for_each(|cid| bloom.insert(&cid.to_bytes())); + Ok(PushResponse { subgraph_roots, // We ignore blooms for now - bloom_k: 3, - bloom: Vec::new(), + bloom_k: bloom.hash_count() as u32, + bloom: bloom.as_bytes().to_vec(), }) } @@ -190,8 +224,7 @@ impl DagWalk { }; let block = store.get_block(&cid).await?; - let codec = IpldCodec::try_from(cid.codec())?; - for ref_cid in references(codec, &block)? { + for ref_cid in references(cid, &block)? 
{ if !self.visited.contains(&ref_cid) { self.frontier.push_back(ref_cid); } @@ -219,6 +252,16 @@ impl DagWalk { .iter() .any(|frontier_cid| !self.visited.contains(frontier_cid)) } + + pub fn skip_walking(&mut self, block: (Cid, Bytes)) -> Result<()> { + let (cid, bytes) = block; + let refs = references(cid, bytes)?; + self.visited.insert(cid); + self.frontier + .retain(|frontier_cid| !refs.contains(frontier_cid)); + + Ok(()) + } } /// Writes a stream of blocks into a car file @@ -280,11 +323,6 @@ impl IncrementalDagVerification { ) -> Result<()> { let (cid, bytes) = block; - let codec: IpldCodec = cid - .codec() - .try_into() - .map_err(|_| anyhow!("Unsupported codec in Cid: {cid}"))?; - if !self.want_cids.contains(&cid) { if self.have_cids.contains(&cid) { eprintln!("Warn: Received {cid}, even though we already have it"); @@ -293,8 +331,8 @@ impl IncrementalDagVerification { } } - let refs = references(codec, &bytes)?; - let result_cid = store.put_block(bytes, codec.into()).await?; + let refs = references(cid, &bytes)?; + let result_cid = store.put_block(bytes, cid.codec()).await?; if result_cid != cid { bail!("Digest mismatch in CAR file: expected {cid}, got {result_cid}"); @@ -313,7 +351,12 @@ impl IncrementalDagVerification { } } -fn references(codec: IpldCodec, block: impl AsRef<[u8]>) -> Result> { +fn references(cid: Cid, block: impl AsRef<[u8]>) -> Result> { + let codec: IpldCodec = cid + .codec() + .try_into() + .map_err(|_| anyhow!("Unsupported codec in Cid: {cid}"))?; + let mut refs = Vec::new(); >::references(codec, &mut Cursor::new(block), &mut refs)?; Ok(refs) @@ -357,11 +400,15 @@ mod tests { loop { println!("Sending request {} bytes", request.len()); let response = server_push_response(root, request, config, receiver_store).await?; - println!("Response: {:?}", response.subgraph_roots); + println!( + "Response (bloom bytes: {}): {:?}", + response.bloom.len(), + response.subgraph_roots, + ); if response.indicates_finished() { break; } - request = 
client_push(root, &response, config, sender_store).await?; + request = client_push(root, response, config, sender_store).await?; } // receiver should have all data From 5c4baa72dd950b62da9ee6162a246023edaa9089 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= Date: Fri, 18 Aug 2023 14:26:37 +0200 Subject: [PATCH 09/35] Add some docs --- car-mirror/src/lib.rs | 91 +++++++++++++++++++++-- car-mirror/src/messages.rs | 1 + car-mirror/src/test_utils/dag_strategy.rs | 8 +- 3 files changed, 90 insertions(+), 10 deletions(-) diff --git a/car-mirror/src/lib.rs b/car-mirror/src/lib.rs index 7f1871a..5982219 100644 --- a/car-mirror/src/lib.rs +++ b/car-mirror/src/lib.rs @@ -27,11 +27,20 @@ pub mod test_utils; /// Contains the data types that are sent over-the-wire and relevant serialization code. pub mod messages; +/// Configuration values (such as byte limits) for the CAR mirror push protocol +#[derive(Clone, Debug)] pub struct PushConfig { - send_minimum: usize, - receive_maximum: usize, - max_roots_per_round: usize, - bloom_fpr: f64, + /// A client will try to send at least `send_minimum` bytes of block data + /// in each request, except if close to the end of the protocol (when there's) + /// not that much data left. + pub send_minimum: usize, + /// The maximum number of bytes per request that the server accepts. + pub receive_maximum: usize, + /// The maximum number of roots per request that the server will send to the client, + /// and that the client will consume. + pub max_roots_per_round: usize, + /// The target false positive rate for the bloom filter that the server sends. + pub bloom_fpr: f64, } impl Default for PushConfig { @@ -40,11 +49,17 @@ impl Default for PushConfig { send_minimum: 128 * 1024, // 128KiB receive_maximum: 512 * 1024, // 512KiB max_roots_per_round: 1000, // max. ~41KB of CIDs - bloom_fpr: 1.0 / 1_000.0, // 0.1% + bloom_fpr: 1.0 / 10_000.0, // 0.1% } } } +/// Initiate a car mirror push request. 
+/// +/// The goal is to transfer the DAG below the root CID to +/// the server. +/// +/// The return value is a CAR file. pub async fn client_initiate_push( root: Cid, config: &PushConfig, @@ -59,6 +74,13 @@ pub async fn client_initiate_push( client_push(root, fake_response, config, store).await } +/// Send a subsequent car mirror push request, following up on +/// a response retrieved from an initial `client_initiate_push` request. +/// +/// Make sure to call `response.indicates_finished()` before initiating +/// a follow-up `client_push` request. +/// +/// The return value is another CAR file with more blocks from the DAG below the root. pub async fn client_push( root: Cid, last_response: PushResponse, @@ -129,6 +151,12 @@ pub async fn client_push( Ok(writer.finish().await?.into()) } +/// This handles a car mirror push request on the server side. +/// +/// The root is the root CID of the DAG that is pushed, the request is a CAR file +/// with some blocks from the cold call. +/// +/// Returns a response to answer the client's request with. pub async fn server_push_response( root: Cid, request: Bytes, @@ -180,21 +208,41 @@ pub async fn server_push_response( }) } +/// A struct that represents an ongoing walk through the Dag. +#[derive(Clone, Debug)] pub struct DagWalk { + /// A queue of CIDs to visit next pub frontier: VecDeque, + /// The set of already visited CIDs. This prevents re-visiting. pub visited: HashSet, + /// Whether to do a breadth-first or depth-first traversal. + /// This controls whether newly discovered links are appended or prepended to the frontier. pub breadth_first: bool, } impl DagWalk { + /// Start a breadth-first traversal of given roots. + /// + /// Breadth-first is explained the easiest in the simple case of a tree (which is a DAG): + /// It will visit each node in the tree layer-by-layer. + /// + /// So the first nodes it will visit are going to be all roots in order. 
pub fn breadth_first(roots: impl IntoIterator) -> Self { Self::new(roots, true) } + /// Start a depth-first traversal of given roots. + /// + /// Depth-first will follow links immediately after discovering them, taking the fastest + /// path towards leaves. + /// + /// The very first node is guaranteed to be the first root, but subsequent nodes may not be + /// from the initial roots. pub fn depth_first(roots: impl IntoIterator) -> Self { Self::new(roots, false) } + /// Start a DAG traversal of given roots. See also `breadth_first` and `depth_first`. pub fn new(roots: impl IntoIterator, breadth_first: bool) -> Self { let frontier = roots.into_iter().collect(); let visited = HashSet::new(); @@ -205,12 +253,15 @@ impl DagWalk { } } + /// Return the next node in the traversal. + /// + /// Returns `None` if no nodes are left to be visited. pub async fn next(&mut self, store: &impl BlockStore) -> Result> { let cid = loop { let popped = if self.breadth_first { - self.frontier.pop_front() - } else { self.frontier.pop_back() + } else { + self.frontier.pop_front() }; let Some(cid) = popped else { @@ -226,13 +277,14 @@ impl DagWalk { let block = store.get_block(&cid).await?; for ref_cid in references(cid, &block)? { if !self.visited.contains(&ref_cid) { - self.frontier.push_back(ref_cid); + self.frontier.push_front(ref_cid); } } Ok(Some((cid, block))) } + /// Turn this traversal into a stream pub fn stream( self, store: &impl BlockStore, @@ -243,6 +295,9 @@ impl DagWalk { })) } + /// Find out whether the traversal is finished. + /// + /// The next call to `next` would result in `None` if this returns true. pub fn is_finished(&self) -> bool { // We're finished if the frontier does not contain any CIDs that we have not visited yet. // Put differently: @@ -253,6 +308,7 @@ impl DagWalk { .any(|frontier_cid| !self.visited.contains(frontier_cid)) } + /// Skip a node from the traversal for now. 
pub fn skip_walking(&mut self, block: (Cid, Bytes)) -> Result<()> { let (cid, bytes) = block; let refs = references(cid, bytes)?; @@ -276,12 +332,20 @@ pub async fn stream_into_car( Ok(()) } +/// A data structure that keeps state about incremental DAG verification. +#[derive(Clone, Debug)] pub struct IncrementalDagVerification { + /// All the CIDs that have been discovered to be missing from the DAG. pub want_cids: HashSet, + /// All the CIDs that are available locally. pub have_cids: HashSet, } impl IncrementalDagVerification { + /// Initiate incremental DAG verification of given roots. + /// + /// This will already run a traversal to find missing subgraphs and + /// CIDs that are already present. pub async fn new( roots: impl IntoIterator, store: &impl BlockStore, @@ -316,6 +380,17 @@ impl IncrementalDagVerification { }) } + /// Verify that + /// - the block actually hashes to the hash from given CID and + /// - the block is part of the graph below the roots. + /// + /// And finally stores the block in the blockstore. + /// + /// This *may* fail, even if the block is part of the graph below the roots, + /// if intermediate blocks between the roots and this block are missing. + /// + /// This *may* add the block to the blockstore, but still fail to verify, specifically + /// if the block's bytes don't match the hash in the CID. pub async fn verify_and_store_block( &mut self, block: (Cid, Bytes), diff --git a/car-mirror/src/messages.rs b/car-mirror/src/messages.rs index c5a778f..283d55c 100644 --- a/car-mirror/src/messages.rs +++ b/car-mirror/src/messages.rs @@ -60,6 +60,7 @@ pub struct PushResponse { } impl PushResponse { + /// Whether this response indicates that the protocol is finished. 
pub fn indicates_finished(&self) -> bool { self.subgraph_roots.is_empty() } diff --git a/car-mirror/src/test_utils/dag_strategy.rs b/car-mirror/src/test_utils/dag_strategy.rs index 003c25e..fc7ce00 100644 --- a/car-mirror/src/test_utils/dag_strategy.rs +++ b/car-mirror/src/test_utils/dag_strategy.rs @@ -6,12 +6,16 @@ use libipld_core::codec::Encode; use proptest::strategy::Strategy; use roaring_graphs::{arb_dag, DirectedAcyclicGraph, Vertex}; +/// Encode some IPLD as dag-cbor pub fn encode(ipld: &Ipld) -> Bytes { let mut vec = Vec::new(); ipld.encode(IpldCodec::DagCbor, &mut vec).unwrap(); // TODO(matheus23) unwrap Bytes::from(vec) } +/// A strategy for use with proptest to generate random DAGs (directed acyclic graphs). +/// The strategy generates a list of blocks of type T and their CIDs, as well as +/// the root block's CID. pub fn generate_dag( max_nodes: u16, generate_block: fn(Vec) -> (Cid, T), @@ -19,7 +23,7 @@ pub fn generate_dag( arb_dag(1..max_nodes, 0.5).prop_map(move |dag| dag_to_nodes(&dag, generate_block)) } -pub fn dag_to_nodes( +fn dag_to_nodes( dag: &DirectedAcyclicGraph, generate_node: fn(Vec) -> (Cid, T), ) -> (Vec<(Cid, T)>, Cid) { @@ -30,7 +34,7 @@ pub fn dag_to_nodes( (blocks, cid) } -pub fn dag_to_nodes_helper( +fn dag_to_nodes_helper( dag: &DirectedAcyclicGraph, root: Vertex, generate_node: fn(Vec) -> (Cid, T), From b0da652a43cb677a36d885cb99987137f784c392 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= Date: Fri, 18 Aug 2023 14:39:41 +0200 Subject: [PATCH 10/35] Split into modules --- car-mirror/src/common.rs | 20 + car-mirror/src/dag_walk.rs | 219 ++++++++ car-mirror/src/incremental_verification.rs | 100 ++++ car-mirror/src/lib.rs | 581 +-------------------- car-mirror/src/push.rs | 259 +++++++++ 5 files changed, 607 insertions(+), 572 deletions(-) create mode 100644 car-mirror/src/common.rs create mode 100644 car-mirror/src/dag_walk.rs create mode 100644 car-mirror/src/incremental_verification.rs create mode 100644 
car-mirror/src/push.rs diff --git a/car-mirror/src/common.rs b/car-mirror/src/common.rs new file mode 100644 index 0000000..bbf0cad --- /dev/null +++ b/car-mirror/src/common.rs @@ -0,0 +1,20 @@ +use anyhow::{anyhow, Result}; +use libipld::{Ipld, IpldCodec}; +use libipld_core::{cid::Cid, codec::References}; +use std::io::Cursor; + +/// Find all CIDs that a block references. +/// +/// This will error out if +/// - the codec is not supported +/// - the block can't be parsed. +pub fn references(cid: Cid, block: impl AsRef<[u8]>) -> Result> { + let codec: IpldCodec = cid + .codec() + .try_into() + .map_err(|_| anyhow!("Unsupported codec in Cid: {cid}"))?; + + let mut refs = Vec::new(); + >::references(codec, &mut Cursor::new(block), &mut refs)?; + Ok(refs) +} diff --git a/car-mirror/src/dag_walk.rs b/car-mirror/src/dag_walk.rs new file mode 100644 index 0000000..0f57740 --- /dev/null +++ b/car-mirror/src/dag_walk.rs @@ -0,0 +1,219 @@ +use crate::common::references; +use anyhow::Result; +use bytes::Bytes; +use futures::{stream::try_unfold, Stream}; +use libipld_core::cid::Cid; +use std::collections::{HashSet, VecDeque}; +use wnfs_common::BlockStore; + +/// A struct that represents an ongoing walk through the Dag. +#[derive(Clone, Debug)] +pub struct DagWalk { + /// A queue of CIDs to visit next + pub frontier: VecDeque, + /// The set of already visited CIDs. This prevents re-visiting. + pub visited: HashSet, + /// Whether to do a breadth-first or depth-first traversal. + /// This controls whether newly discovered links are appended or prepended to the frontier. + pub breadth_first: bool, +} + +impl DagWalk { + /// Start a breadth-first traversal of given roots. + /// + /// Breadth-first is explained the easiest in the simple case of a tree (which is a DAG): + /// It will visit each node in the tree layer-by-layer. + /// + /// So the first nodes it will visit are going to be all roots in order. 
+ pub fn breadth_first(roots: impl IntoIterator) -> Self { + Self::new(roots, true) + } + + /// Start a depth-first traversal of given roots. + /// + /// Depth-first will follow links immediately after discovering them, taking the fastest + /// path towards leaves. + /// + /// The very first node is guaranteed to be the first root, but subsequent nodes may not be + /// from the initial roots. + pub fn depth_first(roots: impl IntoIterator) -> Self { + Self::new(roots, false) + } + + /// Start a DAG traversal of given roots. See also `breadth_first` and `depth_first`. + pub fn new(roots: impl IntoIterator, breadth_first: bool) -> Self { + let frontier = roots.into_iter().collect(); + let visited = HashSet::new(); + Self { + frontier, + visited, + breadth_first, + } + } + + /// Return the next node in the traversal. + /// + /// Returns `None` if no nodes are left to be visited. + pub async fn next(&mut self, store: &impl BlockStore) -> Result> { + let cid = loop { + let popped = if self.breadth_first { + self.frontier.pop_back() + } else { + self.frontier.pop_front() + }; + + let Some(cid) = popped else { + return Ok(None); + }; + + // We loop until we find an unvisited block + if self.visited.insert(cid) { + break cid; + } + }; + + let block = store.get_block(&cid).await?; + for ref_cid in references(cid, &block)? { + if !self.visited.contains(&ref_cid) { + self.frontier.push_front(ref_cid); + } + } + + Ok(Some((cid, block))) + } + + /// Turn this traversal into a stream + pub fn stream( + self, + store: &impl BlockStore, + ) -> impl Stream> + Unpin + '_ { + Box::pin(try_unfold(self, move |mut this| async move { + let maybe_block = this.next(store).await?; + Ok(maybe_block.map(|b| (b, this))) + })) + } + + /// Find out whether the traversal is finished. + /// + /// The next call to `next` would result in `None` if this returns true. + pub fn is_finished(&self) -> bool { + // We're finished if the frontier does not contain any CIDs that we have not visited yet. 
+ // Put differently: + // We're not finished if there exist unvisited CIDs in the frontier. + !self + .frontier + .iter() + .any(|frontier_cid| !self.visited.contains(frontier_cid)) + } + + /// Skip a node from the traversal for now. + pub fn skip_walking(&mut self, block: (Cid, Bytes)) -> Result<()> { + let (cid, bytes) = block; + let refs = references(cid, bytes)?; + self.visited.insert(cid); + self.frontier + .retain(|frontier_cid| !refs.contains(frontier_cid)); + + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use futures::TryStreamExt; + use libipld::Ipld; + use wnfs_common::MemoryBlockStore; + + #[async_std::test] + async fn test_walk_dag_breadth_first() -> Result<()> { + let store = &MemoryBlockStore::new(); + + let cid_1 = store.put_serializable(&Ipld::String("1".into())).await?; + let cid_2 = store.put_serializable(&Ipld::String("2".into())).await?; + let cid_3 = store.put_serializable(&Ipld::String("3".into())).await?; + + let cid_1_wrap = store + .put_serializable(&Ipld::List(vec![Ipld::Link(cid_1)])) + .await?; + + let cid_root = store + .put_serializable(&Ipld::List(vec![ + Ipld::Link(cid_1_wrap), + Ipld::Link(cid_2), + Ipld::Link(cid_3), + ])) + .await?; + + let cids = DagWalk::breadth_first([cid_root]) + .stream(store) + .try_collect::>() + .await? 
+ .into_iter() + .map(|(cid, _block)| cid) + .collect::>(); + + assert_eq!(cids, vec![cid_root, cid_1_wrap, cid_2, cid_3, cid_1]); + + Ok(()) + } +} + +#[cfg(test)] +mod proptests { + use super::*; + use crate::test_utils::{encode, generate_dag}; + use futures::TryStreamExt; + use libipld::{ + multihash::{Code, MultihashDigest}, + Cid, Ipld, IpldCodec, + }; + use proptest::strategy::Strategy; + use std::collections::BTreeSet; + use test_strategy::proptest; + use wnfs_common::{BlockStore, MemoryBlockStore}; + + fn ipld_dags() -> impl Strategy, Cid)> { + generate_dag(256, |cids| { + let ipld = Ipld::List(cids.into_iter().map(Ipld::Link).collect()); + let cid = Cid::new_v1( + IpldCodec::DagCbor.into(), + Code::Blake3_256.digest(&encode(&ipld)), + ); + (cid, ipld) + }) + } + + #[proptest(max_shrink_iters = 100_000)] + fn walk_dag_never_iterates_block_twice(#[strategy(ipld_dags())] dag: (Vec<(Cid, Ipld)>, Cid)) { + async_std::task::block_on(async { + let (dag, root) = dag; + let store = &MemoryBlockStore::new(); + for (cid, ipld) in dag.iter() { + let cid_store = store + .put_block(encode(ipld), IpldCodec::DagCbor.into()) + .await + .unwrap(); + assert_eq!(*cid, cid_store); + } + + let mut cids = DagWalk::breadth_first([root]) + .stream(store) + .map_ok(|(cid, _)| cid) + .try_collect::>() + .await + .unwrap(); + + cids.sort(); + + let unique_cids = cids + .iter() + .cloned() + .collect::>() + .into_iter() + .collect::>(); + + assert_eq!(cids, unique_cids); + }); + } +} diff --git a/car-mirror/src/incremental_verification.rs b/car-mirror/src/incremental_verification.rs new file mode 100644 index 0000000..24edb3b --- /dev/null +++ b/car-mirror/src/incremental_verification.rs @@ -0,0 +1,100 @@ +use crate::{common::references, dag_walk::DagWalk}; +use anyhow::{bail, Result}; +use bytes::Bytes; +use libipld_core::cid::Cid; +use std::{collections::HashSet, eprintln}; +use wnfs_common::{BlockStore, BlockStoreError}; + +/// A data structure that keeps state about incremental 
DAG verification. +#[derive(Clone, Debug)] +pub struct IncrementalDagVerification { + /// All the CIDs that have been discovered to be missing from the DAG. + pub want_cids: HashSet, + /// All the CIDs that are available locally. + pub have_cids: HashSet, +} + +impl IncrementalDagVerification { + /// Initiate incremental DAG verification of given roots. + /// + /// This will already run a traversal to find missing subgraphs and + /// CIDs that are already present. + pub async fn new( + roots: impl IntoIterator, + store: &impl BlockStore, + ) -> Result { + let mut want_cids = HashSet::new(); + let mut have_cids = HashSet::new(); + let mut dag_walk = DagWalk::breadth_first(roots); + + loop { + match dag_walk.next(store).await { + Err(e) => { + if let Some(BlockStoreError::CIDNotFound(not_found)) = + e.downcast_ref::() + { + want_cids.insert(*not_found); + } else { + bail!(e); + } + } + Ok(Some((cid, _))) => { + have_cids.insert(cid); + } + Ok(None) => { + break; + } + } + } + + Ok(Self { + want_cids, + have_cids, + }) + } + + /// Verify that + /// - the block actually hashes to the hash from given CID and + /// - the block is part of the graph below the roots. + /// + /// And finally stores the block in the blockstore. + /// + /// This *may* fail, even if the block is part of the graph below the roots, + /// if intermediate blocks between the roots and this block are missing. + /// + /// This *may* add the block to the blockstore, but still fail to verify, specifically + /// if the block's bytes don't match the hash in the CID. 
+ pub async fn verify_and_store_block( + &mut self, + block: (Cid, Bytes), + store: &impl BlockStore, + ) -> Result<()> { + let (cid, bytes) = block; + + if !self.want_cids.contains(&cid) { + if self.have_cids.contains(&cid) { + eprintln!("Warn: Received {cid}, even though we already have it"); + } else { + bail!("Unexpected block or block out of order: {cid}"); + } + } + + let refs = references(cid, &bytes)?; + let result_cid = store.put_block(bytes, cid.codec()).await?; + + if result_cid != cid { + bail!("Digest mismatch in CAR file: expected {cid}, got {result_cid}"); + } + + for ref_cid in refs { + if !self.have_cids.contains(&ref_cid) { + self.want_cids.insert(ref_cid); + } + } + + self.want_cids.remove(&cid); + self.have_cids.insert(cid); + + Ok(()) + } +} diff --git a/car-mirror/src/lib.rs b/car-mirror/src/lib.rs index 5982219..2ed447b 100644 --- a/car-mirror/src/lib.rs +++ b/car-mirror/src/lib.rs @@ -24,576 +24,13 @@ use wnfs_common::{BlockStore, BlockStoreError}; #[cfg_attr(docsrs, doc(cfg(feature = "test_utils")))] pub mod test_utils; -/// Contains the data types that are sent over-the-wire and relevant serialization code. +/// Common utilities +pub mod common; +/// Algorithms for walking IPLD directed acyclic graphs +pub mod dag_walk; +/// Algorithms for doing incremental verification of IPLD DAGs on the receiving end. +pub mod incremental_verification; +/// Data types that are sent over-the-wire and relevant serialization code. pub mod messages; - -/// Configuration values (such as byte limits) for the CAR mirror push protocol -#[derive(Clone, Debug)] -pub struct PushConfig { - /// A client will try to send at least `send_minimum` bytes of block data - /// in each request, except if close to the end of the protocol (when there's) - /// not that much data left. - pub send_minimum: usize, - /// The maximum number of bytes per request that the server accepts. 
- pub receive_maximum: usize, - /// The maximum number of roots per request that the server will send to the client, - /// and that the client will consume. - pub max_roots_per_round: usize, - /// The target false positive rate for the bloom filter that the server sends. - pub bloom_fpr: f64, -} - -impl Default for PushConfig { - fn default() -> Self { - Self { - send_minimum: 128 * 1024, // 128KiB - receive_maximum: 512 * 1024, // 512KiB - max_roots_per_round: 1000, // max. ~41KB of CIDs - bloom_fpr: 1.0 / 10_000.0, // 0.1% - } - } -} - -/// Initiate a car mirror push request. -/// -/// The goal is to transfer the DAG below the root CID to -/// the server. -/// -/// The return value is a CAR file. -pub async fn client_initiate_push( - root: Cid, - config: &PushConfig, - store: &impl BlockStore, -) -> Result { - let fake_response = PushResponse { - subgraph_roots: vec![root], - // Just putting an empty bloom here - bloom_k: 3, - bloom: Vec::new(), - }; - client_push(root, fake_response, config, store).await -} - -/// Send a subsequent car mirror push request, following up on -/// a response retrieved from an initial `client_initiate_push` request. -/// -/// Make sure to call `response.indicates_finished()` before initiating -/// a follow-up `client_push` request. -/// -/// The return value is another CAR file with more blocks from the DAG below the root. 
-pub async fn client_push( - root: Cid, - last_response: PushResponse, - config: &PushConfig, - store: &impl BlockStore, -) -> Result { - let PushResponse { - ref subgraph_roots, - bloom_k, - bloom, - } = last_response; - - // Verify that all subgraph roots are in the relevant DAG: - let subgraph_roots: Vec = DagWalk::breadth_first([root]) - .stream(store) - .try_filter_map(|(cid, _)| async move { Ok(subgraph_roots.contains(&cid).then_some(cid)) }) - .try_collect() - .await?; - - let bloom = if bloom.is_empty() { - BloomFilter::new_with(1, Box::new([0])) // An empty bloom that contains nothing - } else { - BloomFilter::new_with(bloom_k as usize, bloom.into_boxed_slice()) - }; - - let mut writer = CarWriter::new( - CarHeader::new_v1( - // TODO(matheus23): This is stupid - // CAR files *must* have at least one CID in them, and all of them - // need to appear as a block in the payload. - // It would probably make most sense to just write all subgraph roots into this, - // but we don't know how many of the subgraph roots fit into this round yet, - // so we're simply writing the first one in here, since we know - // at least one block will be written (and it'll be that one). - subgraph_roots.iter().take(1).cloned().collect(), - ), - Vec::new(), - ); - - writer.write_header().await?; - - let mut block_bytes = 0; - let mut dag_walk = DagWalk::breadth_first(subgraph_roots.clone()); - while let Some((cid, block)) = dag_walk.next(store).await? { - if bloom.contains(&cid.to_bytes()) && !subgraph_roots.contains(&cid) { - // TODO(matheus23) I think the spec means to prune the whole subgraph. - // But - // 1. That requires the receiver to check the whole subgraph at that CID to find out whether there's a missing block at the subgraph. - // 2. It requires the sender to go through every block under this subgraph down to the leaves to mark all of these CIDs as visited. - // Both of these are *huge* traversals. I'd say likely not worth it. 
The only case I can image they're worth it, is if the DAG - // is *heavily* using structural sharing and not tree-like. - // Also: This fails completely if the sender is just missing a single leaf. It couldn't add the block to the bloom in that case. - dag_walk.skip_walking((cid, block))?; - println!("Skipped walking {cid} due to bloom"); - break; - } - - writer.write(cid, &block).await?; - println!("Sending {cid}"); - - // TODO(matheus23): Count the actual bytes sent? - block_bytes += block.len(); - if block_bytes > config.send_minimum { - break; - } - } - - Ok(writer.finish().await?.into()) -} - -/// This handles a car mirror push request on the server side. -/// -/// The root is the root CID of the DAG that is pushed, the request is a CAR file -/// with some blocks from the cold call. -/// -/// Returns a response to answer the client's request with. -pub async fn server_push_response( - root: Cid, - request: Bytes, - config: &PushConfig, - store: &impl BlockStore, -) -> Result { - let mut dag_verification = IncrementalDagVerification::new([root], store).await?; - - let mut reader = CarReader::new(Cursor::new(request)).await?; - let mut block_bytes = 0; - - while let Some((cid, vec)) = reader.next_block().await? 
{ - let block = Bytes::from(vec); - println!("Received {cid}"); - - block_bytes += block.len(); - if block_bytes > config.receive_maximum { - bail!( - "Received more than {} bytes ({block_bytes}), aborting request.", - config.receive_maximum - ); - } - - dag_verification - .verify_and_store_block((cid, block), store) - .await?; - } - - let subgraph_roots = dag_verification - .want_cids - .iter() - .take(config.max_roots_per_round) - .cloned() - .collect(); - - let mut bloom = - BloomFilter::new_from_fpr_po2(dag_verification.have_cids.len() as u64, config.bloom_fpr); - - dag_verification - .have_cids - .iter() - .for_each(|cid| bloom.insert(&cid.to_bytes())); - - Ok(PushResponse { - subgraph_roots, - // We ignore blooms for now - bloom_k: bloom.hash_count() as u32, - bloom: bloom.as_bytes().to_vec(), - }) -} - -/// A struct that represents an ongoing walk through the Dag. -#[derive(Clone, Debug)] -pub struct DagWalk { - /// A queue of CIDs to visit next - pub frontier: VecDeque, - /// The set of already visited CIDs. This prevents re-visiting. - pub visited: HashSet, - /// Whether to do a breadth-first or depth-first traversal. - /// This controls whether newly discovered links are appended or prepended to the frontier. - pub breadth_first: bool, -} - -impl DagWalk { - /// Start a breadth-first traversal of given roots. - /// - /// Breadth-first is explained the easiest in the simple case of a tree (which is a DAG): - /// It will visit each node in the tree layer-by-layer. - /// - /// So the first nodes it will visit are going to be all roots in order. - pub fn breadth_first(roots: impl IntoIterator) -> Self { - Self::new(roots, true) - } - - /// Start a depth-first traversal of given roots. - /// - /// Depth-first will follow links immediately after discovering them, taking the fastest - /// path towards leaves. - /// - /// The very first node is guaranteed to be the first root, but subsequent nodes may not be - /// from the initial roots. 
- pub fn depth_first(roots: impl IntoIterator) -> Self { - Self::new(roots, false) - } - - /// Start a DAG traversal of given roots. See also `breadth_first` and `depth_first`. - pub fn new(roots: impl IntoIterator, breadth_first: bool) -> Self { - let frontier = roots.into_iter().collect(); - let visited = HashSet::new(); - Self { - frontier, - visited, - breadth_first, - } - } - - /// Return the next node in the traversal. - /// - /// Returns `None` if no nodes are left to be visited. - pub async fn next(&mut self, store: &impl BlockStore) -> Result> { - let cid = loop { - let popped = if self.breadth_first { - self.frontier.pop_back() - } else { - self.frontier.pop_front() - }; - - let Some(cid) = popped else { - return Ok(None); - }; - - // We loop until we find an unvisited block - if self.visited.insert(cid) { - break cid; - } - }; - - let block = store.get_block(&cid).await?; - for ref_cid in references(cid, &block)? { - if !self.visited.contains(&ref_cid) { - self.frontier.push_front(ref_cid); - } - } - - Ok(Some((cid, block))) - } - - /// Turn this traversal into a stream - pub fn stream( - self, - store: &impl BlockStore, - ) -> impl Stream> + Unpin + '_ { - Box::pin(try_unfold(self, move |mut this| async move { - let maybe_block = this.next(store).await?; - Ok(maybe_block.map(|b| (b, this))) - })) - } - - /// Find out whether the traversal is finished. - /// - /// The next call to `next` would result in `None` if this returns true. - pub fn is_finished(&self) -> bool { - // We're finished if the frontier does not contain any CIDs that we have not visited yet. - // Put differently: - // We're not finished if there exist unvisited CIDs in the frontier. - !self - .frontier - .iter() - .any(|frontier_cid| !self.visited.contains(frontier_cid)) - } - - /// Skip a node from the traversal for now. 
- pub fn skip_walking(&mut self, block: (Cid, Bytes)) -> Result<()> { - let (cid, bytes) = block; - let refs = references(cid, bytes)?; - self.visited.insert(cid); - self.frontier - .retain(|frontier_cid| !refs.contains(frontier_cid)); - - Ok(()) - } -} - -/// Writes a stream of blocks into a car file -pub async fn stream_into_car( - mut blocks: impl Stream> + Unpin, - writer: &mut CarWriter, -) -> Result<()> { - while let Some(result) = blocks.next().await { - let (cid, bytes) = result?; - writer.write(cid, bytes).await?; - } - Ok(()) -} - -/// A data structure that keeps state about incremental DAG verification. -#[derive(Clone, Debug)] -pub struct IncrementalDagVerification { - /// All the CIDs that have been discovered to be missing from the DAG. - pub want_cids: HashSet, - /// All the CIDs that are available locally. - pub have_cids: HashSet, -} - -impl IncrementalDagVerification { - /// Initiate incremental DAG verification of given roots. - /// - /// This will already run a traversal to find missing subgraphs and - /// CIDs that are already present. - pub async fn new( - roots: impl IntoIterator, - store: &impl BlockStore, - ) -> Result { - let mut want_cids = HashSet::new(); - let mut have_cids = HashSet::new(); - let mut dag_walk = DagWalk::breadth_first(roots); - - loop { - match dag_walk.next(store).await { - Err(e) => { - if let Some(BlockStoreError::CIDNotFound(not_found)) = - e.downcast_ref::() - { - want_cids.insert(*not_found); - } else { - bail!(e); - } - } - Ok(Some((cid, _))) => { - have_cids.insert(cid); - } - Ok(None) => { - break; - } - } - } - - Ok(Self { - want_cids, - have_cids, - }) - } - - /// Verify that - /// - the block actually hashes to the hash from given CID and - /// - the block is part of the graph below the roots. - /// - /// And finally stores the block in the blockstore. 
- /// - /// This *may* fail, even if the block is part of the graph below the roots, - /// if intermediate blocks between the roots and this block are missing. - /// - /// This *may* add the block to the blockstore, but still fail to verify, specifically - /// if the block's bytes don't match the hash in the CID. - pub async fn verify_and_store_block( - &mut self, - block: (Cid, Bytes), - store: &impl BlockStore, - ) -> Result<()> { - let (cid, bytes) = block; - - if !self.want_cids.contains(&cid) { - if self.have_cids.contains(&cid) { - eprintln!("Warn: Received {cid}, even though we already have it"); - } else { - bail!("Unexpected block or block out of order: {cid}"); - } - } - - let refs = references(cid, &bytes)?; - let result_cid = store.put_block(bytes, cid.codec()).await?; - - if result_cid != cid { - bail!("Digest mismatch in CAR file: expected {cid}, got {result_cid}"); - } - - for ref_cid in refs { - if !self.have_cids.contains(&ref_cid) { - self.want_cids.insert(ref_cid); - } - } - - self.want_cids.remove(&cid); - self.have_cids.insert(cid); - - Ok(()) - } -} - -fn references(cid: Cid, block: impl AsRef<[u8]>) -> Result> { - let codec: IpldCodec = cid - .codec() - .try_into() - .map_err(|_| anyhow!("Unsupported codec in Cid: {cid}"))?; - - let mut refs = Vec::new(); - >::references(codec, &mut Cursor::new(block), &mut refs)?; - Ok(refs) -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::test_utils::{encode, generate_dag, Rvg}; - use futures::TryStreamExt; - use libipld_core::multihash::{Code, MultihashDigest}; - use std::collections::BTreeMap; - use wnfs_common::MemoryBlockStore; - - #[async_std::test] - async fn test_transfer() -> Result<()> { - let (blocks, root) = Rvg::new().sample(&generate_dag(256, |cids| { - let ipld = Ipld::Map(BTreeMap::from([ - ("data".into(), Ipld::Bytes(vec![0u8; 10 * 1024])), - ( - "links".into(), - Ipld::List(cids.into_iter().map(Ipld::Link).collect()), - ), - ])); - let bytes = encode(&ipld); - let cid = 
Cid::new_v1(IpldCodec::DagCbor.into(), Code::Blake3_256.digest(&bytes)); - (cid, bytes) - })); - - let sender_store = &MemoryBlockStore::new(); - for (cid, bytes) in blocks.iter() { - let cid_store = sender_store - .put_block(bytes.clone(), IpldCodec::DagCbor.into()) - .await?; - assert_eq!(*cid, cid_store); - } - - let receiver_store = &MemoryBlockStore::new(); - let config = &PushConfig::default(); - let mut request = client_initiate_push(root, config, sender_store).await?; - loop { - println!("Sending request {} bytes", request.len()); - let response = server_push_response(root, request, config, receiver_store).await?; - println!( - "Response (bloom bytes: {}): {:?}", - response.bloom.len(), - response.subgraph_roots, - ); - if response.indicates_finished() { - break; - } - request = client_push(root, response, config, sender_store).await?; - } - - // receiver should have all data - let sender_cids = DagWalk::breadth_first([root]) - .stream(sender_store) - .map_ok(|(cid, _)| cid) - .try_collect::>() - .await?; - let receiver_cids = DagWalk::breadth_first([root]) - .stream(receiver_store) - .map_ok(|(cid, _)| cid) - .try_collect::>() - .await?; - - assert_eq!(sender_cids, receiver_cids); - - Ok(()) - } - - #[async_std::test] - async fn test_walk_dag_breadth_first() -> Result<()> { - let store = &MemoryBlockStore::new(); - - let cid_1 = store.put_serializable(&Ipld::String("1".into())).await?; - let cid_2 = store.put_serializable(&Ipld::String("2".into())).await?; - let cid_3 = store.put_serializable(&Ipld::String("3".into())).await?; - - let cid_1_wrap = store - .put_serializable(&Ipld::List(vec![Ipld::Link(cid_1)])) - .await?; - - let cid_root = store - .put_serializable(&Ipld::List(vec![ - Ipld::Link(cid_1_wrap), - Ipld::Link(cid_2), - Ipld::Link(cid_3), - ])) - .await?; - - let cids = DagWalk::breadth_first([cid_root]) - .stream(store) - .try_collect::>() - .await? 
- .into_iter() - .map(|(cid, _block)| cid) - .collect::>(); - - assert_eq!(cids, vec![cid_root, cid_1_wrap, cid_2, cid_3, cid_1]); - - Ok(()) - } -} - -#[cfg(test)] -mod proptests { - use crate::{ - test_utils::{encode, generate_dag}, - DagWalk, - }; - use futures::TryStreamExt; - use libipld::{ - multihash::{Code, MultihashDigest}, - Cid, Ipld, IpldCodec, - }; - use proptest::strategy::Strategy; - use std::collections::BTreeSet; - use test_strategy::proptest; - use wnfs_common::{BlockStore, MemoryBlockStore}; - - fn ipld_dags() -> impl Strategy, Cid)> { - generate_dag(256, |cids| { - let ipld = Ipld::List(cids.into_iter().map(Ipld::Link).collect()); - let cid = Cid::new_v1( - IpldCodec::DagCbor.into(), - Code::Blake3_256.digest(&encode(&ipld)), - ); - (cid, ipld) - }) - } - - #[proptest(max_shrink_iters = 100_000)] - fn walk_dag_never_iterates_block_twice(#[strategy(ipld_dags())] dag: (Vec<(Cid, Ipld)>, Cid)) { - async_std::task::block_on(async { - let (dag, root) = dag; - let store = &MemoryBlockStore::new(); - for (cid, ipld) in dag.iter() { - let cid_store = store - .put_block(encode(ipld), IpldCodec::DagCbor.into()) - .await - .unwrap(); - assert_eq!(*cid, cid_store); - } - - let mut cids = DagWalk::breadth_first([root]) - .stream(store) - .map_ok(|(cid, _)| cid) - .try_collect::>() - .await - .unwrap(); - - cids.sort(); - - let unique_cids = cids - .iter() - .cloned() - .collect::>() - .into_iter() - .collect::>(); - - assert_eq!(cids, unique_cids); - }); - } -} +/// The CAR mirror push protocol +pub mod push; diff --git a/car-mirror/src/push.rs b/car-mirror/src/push.rs new file mode 100644 index 0000000..f7819f8 --- /dev/null +++ b/car-mirror/src/push.rs @@ -0,0 +1,259 @@ +use crate::{ + dag_walk::DagWalk, incremental_verification::IncrementalDagVerification, messages::PushResponse, +}; +use anyhow::{bail, Result}; +use bytes::Bytes; +use deterministic_bloom::runtime_size::BloomFilter; +use futures::TryStreamExt; +use iroh_car::{CarHeader, CarReader, 
CarWriter}; +use libipld_core::cid::Cid; +use std::io::Cursor; +use wnfs_common::BlockStore; + +/// Configuration values (such as byte limits) for the CAR mirror push protocol +#[derive(Clone, Debug)] +pub struct PushConfig { + /// A client will try to send at least `send_minimum` bytes of block data + /// in each request, except if close to the end of the protocol (when there's) + /// not that much data left. + pub send_minimum: usize, + /// The maximum number of bytes per request that the server accepts. + pub receive_maximum: usize, + /// The maximum number of roots per request that the server will send to the client, + /// and that the client will consume. + pub max_roots_per_round: usize, + /// The target false positive rate for the bloom filter that the server sends. + pub bloom_fpr: f64, +} + +impl Default for PushConfig { + fn default() -> Self { + Self { + send_minimum: 128 * 1024, // 128KiB + receive_maximum: 512 * 1024, // 512KiB + max_roots_per_round: 1000, // max. ~41KB of CIDs + bloom_fpr: 1.0 / 10_000.0, // 0.1% + } + } +} + +/// Initiate a car mirror push request. +/// +/// The goal is to transfer the DAG below the root CID to +/// the server. +/// +/// The return value is a CAR file. +pub async fn client_initiate_push( + root: Cid, + config: &PushConfig, + store: &impl BlockStore, +) -> Result { + let fake_response = PushResponse { + subgraph_roots: vec![root], + // Just putting an empty bloom here + bloom_k: 3, + bloom: Vec::new(), + }; + client_push(root, fake_response, config, store).await +} + +/// Send a subsequent car mirror push request, following up on +/// a response retrieved from an initial `client_initiate_push` request. +/// +/// Make sure to call `response.indicates_finished()` before initiating +/// a follow-up `client_push` request. +/// +/// The return value is another CAR file with more blocks from the DAG below the root. 
+pub async fn client_push( + root: Cid, + last_response: PushResponse, + config: &PushConfig, + store: &impl BlockStore, +) -> Result { + let PushResponse { + ref subgraph_roots, + bloom_k, + bloom, + } = last_response; + + // Verify that all subgraph roots are in the relevant DAG: + let subgraph_roots: Vec = DagWalk::breadth_first([root]) + .stream(store) + .try_filter_map(|(cid, _)| async move { Ok(subgraph_roots.contains(&cid).then_some(cid)) }) + .try_collect() + .await?; + + let bloom = if bloom.is_empty() { + BloomFilter::new_with(1, Box::new([0])) // An empty bloom that contains nothing + } else { + BloomFilter::new_with(bloom_k as usize, bloom.into_boxed_slice()) + }; + + let mut writer = CarWriter::new( + CarHeader::new_v1( + // TODO(matheus23): This is stupid + // CAR files *must* have at least one CID in them, and all of them + // need to appear as a block in the payload. + // It would probably make most sense to just write all subgraph roots into this, + // but we don't know how many of the subgraph roots fit into this round yet, + // so we're simply writing the first one in here, since we know + // at least one block will be written (and it'll be that one). + subgraph_roots.iter().take(1).cloned().collect(), + ), + Vec::new(), + ); + + writer.write_header().await?; + + let mut block_bytes = 0; + let mut dag_walk = DagWalk::breadth_first(subgraph_roots.clone()); + while let Some((cid, block)) = dag_walk.next(store).await? { + if bloom.contains(&cid.to_bytes()) && !subgraph_roots.contains(&cid) { + // TODO(matheus23) I think the spec means to prune the whole subgraph. + // But + // 1. That requires the receiver to check the whole subgraph at that CID to find out whether there's a missing block at the subgraph. + // 2. It requires the sender to go through every block under this subgraph down to the leaves to mark all of these CIDs as visited. + // Both of these are *huge* traversals. I'd say likely not worth it. 
The only case I can image they're worth it, is if the DAG + // is *heavily* using structural sharing and not tree-like. + // Also: This fails completely if the sender is just missing a single leaf. It couldn't add the block to the bloom in that case. + dag_walk.skip_walking((cid, block))?; + println!("Skipped walking {cid} due to bloom"); + break; + } + + writer.write(cid, &block).await?; + println!("Sending {cid}"); + + // TODO(matheus23): Count the actual bytes sent? + block_bytes += block.len(); + if block_bytes > config.send_minimum { + break; + } + } + + Ok(writer.finish().await?.into()) +} + +/// This handles a car mirror push request on the server side. +/// +/// The root is the root CID of the DAG that is pushed, the request is a CAR file +/// with some blocks from the cold call. +/// +/// Returns a response to answer the client's request with. +pub async fn server_push_response( + root: Cid, + request: Bytes, + config: &PushConfig, + store: &impl BlockStore, +) -> Result { + let mut dag_verification = IncrementalDagVerification::new([root], store).await?; + + let mut reader = CarReader::new(Cursor::new(request)).await?; + let mut block_bytes = 0; + + while let Some((cid, vec)) = reader.next_block().await? 
{ + let block = Bytes::from(vec); + println!("Received {cid}"); + + block_bytes += block.len(); + if block_bytes > config.receive_maximum { + bail!( + "Received more than {} bytes ({block_bytes}), aborting request.", + config.receive_maximum + ); + } + + dag_verification + .verify_and_store_block((cid, block), store) + .await?; + } + + let subgraph_roots = dag_verification + .want_cids + .iter() + .take(config.max_roots_per_round) + .cloned() + .collect(); + + let mut bloom = + BloomFilter::new_from_fpr_po2(dag_verification.have_cids.len() as u64, config.bloom_fpr); + + dag_verification + .have_cids + .iter() + .for_each(|cid| bloom.insert(&cid.to_bytes())); + + Ok(PushResponse { + subgraph_roots, + // We ignore blooms for now + bloom_k: bloom.hash_count() as u32, + bloom: bloom.as_bytes().to_vec(), + }) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::test_utils::{encode, generate_dag, Rvg}; + use libipld::{Ipld, IpldCodec}; + use libipld_core::multihash::{Code, MultihashDigest}; + use std::collections::BTreeMap; + use wnfs_common::MemoryBlockStore; + + #[async_std::test] + async fn test_transfer() -> Result<()> { + let (blocks, root) = Rvg::new().sample(&generate_dag(256, |cids| { + let ipld = Ipld::Map(BTreeMap::from([ + ("data".into(), Ipld::Bytes(vec![0u8; 10 * 1024])), + ( + "links".into(), + Ipld::List(cids.into_iter().map(Ipld::Link).collect()), + ), + ])); + let bytes = encode(&ipld); + let cid = Cid::new_v1(IpldCodec::DagCbor.into(), Code::Blake3_256.digest(&bytes)); + (cid, bytes) + })); + + let sender_store = &MemoryBlockStore::new(); + for (cid, bytes) in blocks.iter() { + let cid_store = sender_store + .put_block(bytes.clone(), IpldCodec::DagCbor.into()) + .await?; + assert_eq!(*cid, cid_store); + } + + let receiver_store = &MemoryBlockStore::new(); + let config = &PushConfig::default(); + let mut request = client_initiate_push(root, config, sender_store).await?; + loop { + println!("Sending request {} bytes", request.len()); + let 
response = server_push_response(root, request, config, receiver_store).await?; + println!( + "Response (bloom bytes: {}): {:?}", + response.bloom.len(), + response.subgraph_roots, + ); + if response.indicates_finished() { + break; + } + request = client_push(root, response, config, sender_store).await?; + } + + // receiver should have all data + let sender_cids = DagWalk::breadth_first([root]) + .stream(sender_store) + .map_ok(|(cid, _)| cid) + .try_collect::>() + .await?; + let receiver_cids = DagWalk::breadth_first([root]) + .stream(receiver_store) + .map_ok(|(cid, _)| cid) + .try_collect::>() + .await?; + + assert_eq!(sender_cids, receiver_cids); + + Ok(()) + } +} From 51c7dc96b3b815a8bd7cdd9118b9e91c12f2cdc4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= Date: Fri, 18 Aug 2023 15:12:22 +0200 Subject: [PATCH 11/35] Remove `println`s, add rough test for round trips --- car-mirror/src/lib.rs | 15 ----- car-mirror/src/push.rs | 67 ++++++++++++++++------- car-mirror/src/test_utils/dag_strategy.rs | 4 +- 3 files changed, 49 insertions(+), 37 deletions(-) diff --git a/car-mirror/src/lib.rs b/car-mirror/src/lib.rs index 2ed447b..5b7779b 100644 --- a/car-mirror/src/lib.rs +++ b/car-mirror/src/lib.rs @@ -4,21 +4,6 @@ //! car-mirror -use anyhow::{anyhow, bail, Result}; -use bytes::Bytes; -use deterministic_bloom::runtime_size::BloomFilter; -use futures::{stream::try_unfold, Stream, StreamExt, TryStreamExt}; -use iroh_car::{CarHeader, CarReader, CarWriter}; -use libipld::{Ipld, IpldCodec}; -use libipld_core::{cid::Cid, codec::References}; -use messages::PushResponse; -use std::{ - collections::{HashSet, VecDeque}, - eprintln, - io::Cursor, -}; -use wnfs_common::{BlockStore, BlockStoreError}; - /// Test utilities. 
#[cfg(any(test, feature = "test_utils"))] #[cfg_attr(docsrs, doc(cfg(feature = "test_utils")))] diff --git a/car-mirror/src/push.rs b/car-mirror/src/push.rs index f7819f8..13d95f6 100644 --- a/car-mirror/src/push.rs +++ b/car-mirror/src/push.rs @@ -116,13 +116,10 @@ pub async fn client_push( // Both of these are *huge* traversals. I'd say likely not worth it. The only case I can image they're worth it, is if the DAG // is *heavily* using structural sharing and not tree-like. // Also: This fails completely if the sender is just missing a single leaf. It couldn't add the block to the bloom in that case. - dag_walk.skip_walking((cid, block))?; - println!("Skipped walking {cid} due to bloom"); break; } writer.write(cid, &block).await?; - println!("Sending {cid}"); // TODO(matheus23): Count the actual bytes sent? block_bytes += block.len(); @@ -153,7 +150,6 @@ pub async fn server_push_response( while let Some((cid, vec)) = reader.next_block().await? { let block = Bytes::from(vec); - println!("Received {cid}"); block_bytes += block.len(); if block_bytes > config.receive_maximum { @@ -185,7 +181,6 @@ pub async fn server_push_response( Ok(PushResponse { subgraph_roots, - // We ignore blooms for now bloom_k: bloom.hash_count() as u32, bloom: bloom.as_bytes().to_vec(), }) @@ -200,11 +195,12 @@ mod tests { use std::collections::BTreeMap; use wnfs_common::MemoryBlockStore; - #[async_std::test] - async fn test_transfer() -> Result<()> { - let (blocks, root) = Rvg::new().sample(&generate_dag(256, |cids| { + async fn setup_random_dag( + dag_size: u16, + ) -> Result<(Cid, MemoryBlockStore)> { + let (blocks, root) = Rvg::new().sample(&generate_dag(dag_size, |cids| { let ipld = Ipld::Map(BTreeMap::from([ - ("data".into(), Ipld::Bytes(vec![0u8; 10 * 1024])), + ("data".into(), Ipld::Bytes(vec![0u8; BLOCK_PADDING])), ( "links".into(), Ipld::List(cids.into_iter().map(Ipld::Link).collect()), @@ -215,25 +211,24 @@ mod tests { (cid, bytes) })); - let sender_store = 
&MemoryBlockStore::new(); - for (cid, bytes) in blocks.iter() { - let cid_store = sender_store - .put_block(bytes.clone(), IpldCodec::DagCbor.into()) - .await?; - assert_eq!(*cid, cid_store); + let store = MemoryBlockStore::new(); + for (cid, bytes) in blocks.into_iter() { + let cid_store = store.put_block(bytes, IpldCodec::DagCbor.into()).await?; + assert_eq!(cid, cid_store); } + Ok((root, store)) + } + + #[async_std::test] + async fn test_transfer() -> Result<()> { + const BLOCK_PADDING: usize = 10 * 1024; + let (root, ref sender_store) = setup_random_dag::(256).await?; let receiver_store = &MemoryBlockStore::new(); let config = &PushConfig::default(); let mut request = client_initiate_push(root, config, sender_store).await?; loop { - println!("Sending request {} bytes", request.len()); let response = server_push_response(root, request, config, receiver_store).await?; - println!( - "Response (bloom bytes: {}): {:?}", - response.bloom.len(), - response.subgraph_roots, - ); if response.indicates_finished() { break; } @@ -256,4 +251,34 @@ mod tests { Ok(()) } + + #[async_std::test] + async fn print_average_number_of_rounds() -> Result<()> { + const TESTS: usize = 200; + const DAG_SIZE: u16 = 256; + const BLOCK_PADDING: usize = 10 * 1024; + + let mut total_rounds = 0; + for _ in 0..TESTS { + let (root, ref sender_store) = setup_random_dag::(DAG_SIZE).await?; + let receiver_store = &MemoryBlockStore::new(); + let config = &PushConfig::default(); + let mut request = client_initiate_push(root, config, sender_store).await?; + loop { + let response = server_push_response(root, request, config, receiver_store).await?; + total_rounds += 1; + if response.indicates_finished() { + break; + } + request = client_push(root, response, config, sender_store).await?; + } + } + + println!( + "Average # of rounds: {}", + total_rounds as f64 / TESTS as f64 + ); + + Ok(()) + } } diff --git a/car-mirror/src/test_utils/dag_strategy.rs b/car-mirror/src/test_utils/dag_strategy.rs index 
fc7ce00..7e0ac52 100644 --- a/car-mirror/src/test_utils/dag_strategy.rs +++ b/car-mirror/src/test_utils/dag_strategy.rs @@ -23,7 +23,9 @@ pub fn generate_dag( arb_dag(1..max_nodes, 0.5).prop_map(move |dag| dag_to_nodes(&dag, generate_block)) } -fn dag_to_nodes( +/// Turn a directed acyclic graph into a list of nodes (with their CID) and a root CID. +/// This will select only the DAG that's reachable from the root. +pub fn dag_to_nodes( dag: &DirectedAcyclicGraph, generate_node: fn(Vec) -> (Cid, T), ) -> (Vec<(Cid, T)>, Cid) { From d4c7729faaaadd6d62e69f0f34f98cfd8085ba44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= Date: Fri, 18 Aug 2023 17:21:06 +0200 Subject: [PATCH 12/35] Generate bigger random DAGs --- Cargo.toml | 1 + car-mirror/src/dag_walk.rs | 2 +- car-mirror/src/push.rs | 117 ++++++++++++++++------ car-mirror/src/test_utils/dag_strategy.rs | 26 +++-- 4 files changed, 108 insertions(+), 38 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 408bb4f..96fa3b8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,3 +23,4 @@ opt-level = "s" # or 'z' to optimize "aggressively" for size # See https://blog.rust-lang.org/2021/03/25/Rust-1.51.0.html#splitting-debug-information [profile.dev] split-debuginfo = "unpacked" +opt-level = 3 \ No newline at end of file diff --git a/car-mirror/src/dag_walk.rs b/car-mirror/src/dag_walk.rs index 0f57740..6210272 100644 --- a/car-mirror/src/dag_walk.rs +++ b/car-mirror/src/dag_walk.rs @@ -174,7 +174,7 @@ mod proptests { use wnfs_common::{BlockStore, MemoryBlockStore}; fn ipld_dags() -> impl Strategy, Cid)> { - generate_dag(256, |cids| { + generate_dag(256, |cids, _| { let ipld = Ipld::List(cids.into_iter().map(Ipld::Link).collect()); let cid = Cid::new_v1( IpldCodec::DagCbor.into(), diff --git a/car-mirror/src/push.rs b/car-mirror/src/push.rs index 13d95f6..e1e2311 100644 --- a/car-mirror/src/push.rs +++ b/car-mirror/src/push.rs @@ -192,15 +192,25 @@ mod tests { use crate::test_utils::{encode, generate_dag, 
Rvg}; use libipld::{Ipld, IpldCodec}; use libipld_core::multihash::{Code, MultihashDigest}; + use proptest::prelude::Rng; use std::collections::BTreeMap; use wnfs_common::MemoryBlockStore; + #[derive(Clone, Debug)] + struct Metrics { + request_bytes: usize, + response_bytes: usize, + } + async fn setup_random_dag( dag_size: u16, ) -> Result<(Cid, MemoryBlockStore)> { - let (blocks, root) = Rvg::new().sample(&generate_dag(dag_size, |cids| { + let (blocks, root) = Rvg::new().sample(&generate_dag(dag_size, |cids, rng| { let ipld = Ipld::Map(BTreeMap::from([ - ("data".into(), Ipld::Bytes(vec![0u8; BLOCK_PADDING])), + ( + "data".into(), + Ipld::Bytes((0..BLOCK_PADDING).map(|_| rng.gen::()).collect()), + ), ( "links".into(), Ipld::List(cids.into_iter().map(Ipld::Link).collect()), @@ -220,64 +230,113 @@ mod tests { Ok((root, store)) } - #[async_std::test] - async fn test_transfer() -> Result<()> { - const BLOCK_PADDING: usize = 10 * 1024; - let (root, ref sender_store) = setup_random_dag::(256).await?; - let receiver_store = &MemoryBlockStore::new(); - let config = &PushConfig::default(); - let mut request = client_initiate_push(root, config, sender_store).await?; + async fn simulate_protocol( + root: Cid, + config: &PushConfig, + client_store: &MemoryBlockStore, + server_store: &MemoryBlockStore, + ) -> Result> { + let mut metrics = Vec::new(); + let mut request = client_initiate_push(root, config, client_store).await?; loop { - let response = server_push_response(root, request, config, receiver_store).await?; + let request_bytes = request.len(); + let response = server_push_response(root, request, config, server_store).await?; + let response_bytes = serde_ipld_dagcbor::to_vec(&response)?.len(); + + metrics.push(Metrics { + request_bytes, + response_bytes, + }); + if response.indicates_finished() { break; } - request = client_push(root, response, config, sender_store).await?; + request = client_push(root, response, config, client_store).await?; } + Ok(metrics) + } + + 
async fn total_dag_bytes(root: Cid, store: &impl BlockStore) -> Result { + Ok(DagWalk::breadth_first([root]) + .stream(store) + .map_ok(|(_, block)| block.len()) + .try_collect::>() + .await? + .into_iter() + .sum::()) + } + + async fn total_dag_blocks(root: Cid, store: &impl BlockStore) -> Result { + Ok(DagWalk::breadth_first([root]) + .stream(store) + .map_ok(|(_, block)| block.len()) + .try_collect::>() + .await? + .len()) + } + + #[async_std::test] + async fn test_transfer() -> Result<()> { + const BLOCK_PADDING: usize = 10 * 1024; + let (root, ref client_store) = setup_random_dag::(256).await?; + let server_store = &MemoryBlockStore::new(); + simulate_protocol(root, &PushConfig::default(), client_store, server_store).await?; + // receiver should have all data - let sender_cids = DagWalk::breadth_first([root]) - .stream(sender_store) + let client_cids = DagWalk::breadth_first([root]) + .stream(client_store) .map_ok(|(cid, _)| cid) .try_collect::>() .await?; - let receiver_cids = DagWalk::breadth_first([root]) - .stream(receiver_store) + let server_cids = DagWalk::breadth_first([root]) + .stream(server_store) .map_ok(|(cid, _)| cid) .try_collect::>() .await?; - assert_eq!(sender_cids, receiver_cids); + assert_eq!(client_cids, server_cids); Ok(()) } #[async_std::test] - async fn print_average_number_of_rounds() -> Result<()> { + async fn print_metrics() -> Result<()> { const TESTS: usize = 200; const DAG_SIZE: u16 = 256; const BLOCK_PADDING: usize = 10 * 1024; let mut total_rounds = 0; + let mut total_blocks = 0; + let mut total_block_bytes = 0; + let mut total_network_bytes = 0; for _ in 0..TESTS { - let (root, ref sender_store) = setup_random_dag::(DAG_SIZE).await?; - let receiver_store = &MemoryBlockStore::new(); - let config = &PushConfig::default(); - let mut request = client_initiate_push(root, config, sender_store).await?; - loop { - let response = server_push_response(root, request, config, receiver_store).await?; - total_rounds += 1; - if 
response.indicates_finished() { - break; - } - request = client_push(root, response, config, sender_store).await?; - } + let (root, ref client_store) = setup_random_dag::(DAG_SIZE).await?; + let server_store = &MemoryBlockStore::new(); + let metrics = + simulate_protocol(root, &PushConfig::default(), client_store, server_store).await?; + + total_rounds += metrics.len(); + total_blocks += total_dag_blocks(root, client_store).await?; + total_block_bytes += total_dag_bytes(root, client_store).await?; + total_network_bytes += metrics + .iter() + .map(|metric| metric.request_bytes + metric.response_bytes) + .sum::(); } println!( "Average # of rounds: {}", total_rounds as f64 / TESTS as f64 ); + println!( + "Average # of blocks: {}", + total_blocks as f64 / TESTS as f64 + ); + println!( + "Average network overhead: {}%", + (total_network_bytes as f64 / total_block_bytes as f64 - 1.0) * 100.0 + ); Ok(()) } diff --git a/car-mirror/src/test_utils/dag_strategy.rs b/car-mirror/src/test_utils/dag_strategy.rs index 7e0ac52..9e91b0b 100644 --- a/car-mirror/src/test_utils/dag_strategy.rs +++ b/car-mirror/src/test_utils/dag_strategy.rs @@ -3,7 +3,7 @@ use std::{collections::HashSet, fmt::Debug}; use bytes::Bytes; use libipld::{Cid, Ipld, IpldCodec}; use libipld_core::codec::Encode; -use proptest::strategy::Strategy; +use proptest::{strategy::Strategy, test_runner::TestRng}; use roaring_graphs::{arb_dag, DirectedAcyclicGraph, Vertex}; /// Encode some IPLD as dag-cbor @@ -18,20 +18,22 @@ pub fn encode(ipld: &Ipld) -> Bytes { /// the root block's CID. 
pub fn generate_dag( max_nodes: u16, - generate_block: fn(Vec) -> (Cid, T), + generate_block: fn(Vec, rng: &mut TestRng) -> (Cid, T), ) -> impl Strategy, Cid)> { - arb_dag(1..max_nodes, 0.5).prop_map(move |dag| dag_to_nodes(&dag, generate_block)) + arb_dag(1..max_nodes, 0.5) + .prop_perturb(move |dag, mut rng| dag_to_nodes(&dag, &mut rng, generate_block)) } /// Turn a directed acyclic graph into a list of nodes (with their CID) and a root CID. /// This will select only the DAG that's reachable from the root. pub fn dag_to_nodes( dag: &DirectedAcyclicGraph, - generate_node: fn(Vec) -> (Cid, T), + rng: &mut TestRng, + generate_node: fn(Vec, &mut TestRng) -> (Cid, T), ) -> (Vec<(Cid, T)>, Cid) { let mut blocks = Vec::new(); let mut visited = HashSet::new(); - let (cid, block) = dag_to_nodes_helper(dag, 0, generate_node, &mut blocks, &mut visited); + let (cid, block) = dag_to_nodes_helper(dag, 0, rng, generate_node, &mut blocks, &mut visited); blocks.push((cid, block)); (blocks, cid) } @@ -39,7 +41,8 @@ pub fn dag_to_nodes( fn dag_to_nodes_helper( dag: &DirectedAcyclicGraph, root: Vertex, - generate_node: fn(Vec) -> (Cid, T), + rng: &mut TestRng, + generate_node: fn(Vec, &mut TestRng) -> (Cid, T), arr: &mut Vec<(Cid, T)>, visited: &mut HashSet, ) -> (Cid, T) { @@ -49,9 +52,16 @@ fn dag_to_nodes_helper( continue; } visited.insert(child); - child_blocks.push(dag_to_nodes_helper(dag, child, generate_node, arr, visited)); + child_blocks.push(dag_to_nodes_helper( + dag, + child, + rng, + generate_node, + arr, + visited, + )); } - let result = generate_node(child_blocks.iter().map(|(cid, _)| *cid).collect()); + let result = generate_node(child_blocks.iter().map(|(cid, _)| *cid).collect(), rng); arr.extend(child_blocks); result } From 57a0d9f5294cd372ca0656b7f4f4841c3d31c6ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= Date: Fri, 18 Aug 2023 17:30:14 +0200 Subject: [PATCH 13/35] Use recommended FPR computation from the spec --- car-mirror/src/push.rs | 8 
+++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/car-mirror/src/push.rs b/car-mirror/src/push.rs index e1e2311..d3da5a7 100644 --- a/car-mirror/src/push.rs +++ b/car-mirror/src/push.rs @@ -23,7 +23,7 @@ pub struct PushConfig { /// and that the client will consume. pub max_roots_per_round: usize, /// The target false positive rate for the bloom filter that the server sends. - pub bloom_fpr: f64, + pub bloom_fpr: fn(u64) -> f64, } impl Default for PushConfig { @@ -32,7 +32,7 @@ impl Default for PushConfig { send_minimum: 128 * 1024, // 128KiB receive_maximum: 512 * 1024, // 512KiB max_roots_per_round: 1000, // max. ~41KB of CIDs - bloom_fpr: 1.0 / 10_000.0, // 0.1% + bloom_fpr: |num_of_elems| 0.1 / num_of_elems as f64, } } } @@ -171,8 +171,10 @@ pub async fn server_push_response( .cloned() .collect(); + let bloom_capacity = dag_verification.have_cids.len() as u64; + let mut bloom = - BloomFilter::new_from_fpr_po2(dag_verification.have_cids.len() as u64, config.bloom_fpr); + BloomFilter::new_from_fpr_po2(bloom_capacity, (config.bloom_fpr)(bloom_capacity)); dag_verification .have_cids From 61cda9556b72ec002d36b95c983578399dc60dc2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= Date: Fri, 18 Aug 2023 17:56:43 +0200 Subject: [PATCH 14/35] Test case for deduplicating transfer --- car-mirror/src/push.rs | 34 ++++++++++++++++++++-- car-mirror/src/test_utils/dag_strategy.rs | 11 +------ car-mirror/src/test_utils/mod.rs | 35 +++++++++++++++++++++++ 3 files changed, 68 insertions(+), 12 deletions(-) diff --git a/car-mirror/src/push.rs b/car-mirror/src/push.rs index d3da5a7..8b0e2e9 100644 --- a/car-mirror/src/push.rs +++ b/car-mirror/src/push.rs @@ -191,10 +191,10 @@ pub async fn server_push_response( #[cfg(test)] mod tests { use super::*; - use crate::test_utils::{encode, generate_dag, Rvg}; + use crate::test_utils::{encode, generate_dag, get_cid_at_approx_path, Rvg}; use libipld::{Ipld, IpldCodec}; use libipld_core::multihash::{Code, 
MultihashDigest}; - use proptest::prelude::Rng; + use proptest::{collection::vec, prelude::Rng}; use std::collections::BTreeMap; use wnfs_common::MemoryBlockStore; @@ -302,6 +302,31 @@ mod tests { Ok(()) } + #[async_std::test] + async fn test_deduplicating_transfer() -> Result<()> { + const BLOCK_PADDING: usize = 10 * 1024; + let (root, ref client_store) = setup_random_dag::(256).await?; + let total_bytes = total_dag_bytes(root, client_store).await?; + let path = Rvg::new().sample(&vec(0usize..128, 0..64)); + let second_root = get_cid_at_approx_path(path, root, client_store).await?; + + let server_store = &MemoryBlockStore::new(); + let config = &PushConfig::default(); + let metrics1 = simulate_protocol(second_root, config, client_store, server_store).await?; + let metrics2 = simulate_protocol(root, config, client_store, server_store).await?; + + let total_network_bytes = metrics1 + .into_iter() + .chain(metrics2.into_iter()) + .map(|metric| metric.request_bytes + metric.response_bytes) + .sum::(); + + println!("Total DAG bytes: {total_bytes}"); + println!("Total network bytes: {total_network_bytes}"); + + Ok(()) + } + #[async_std::test] async fn print_metrics() -> Result<()> { const TESTS: usize = 200; @@ -343,3 +368,8 @@ mod tests { Ok(()) } } + +#[cfg(test)] +mod proptests { + use super::*; +} diff --git a/car-mirror/src/test_utils/dag_strategy.rs b/car-mirror/src/test_utils/dag_strategy.rs index 9e91b0b..144a927 100644 --- a/car-mirror/src/test_utils/dag_strategy.rs +++ b/car-mirror/src/test_utils/dag_strategy.rs @@ -1,18 +1,9 @@ use std::{collections::HashSet, fmt::Debug}; -use bytes::Bytes; -use libipld::{Cid, Ipld, IpldCodec}; -use libipld_core::codec::Encode; +use libipld::Cid; use proptest::{strategy::Strategy, test_runner::TestRng}; use roaring_graphs::{arb_dag, DirectedAcyclicGraph, Vertex}; -/// Encode some IPLD as dag-cbor -pub fn encode(ipld: &Ipld) -> Bytes { - let mut vec = Vec::new(); - ipld.encode(IpldCodec::DagCbor, &mut vec).unwrap(); // 
TODO(matheus23) unwrap - Bytes::from(vec) -} - /// A strategy for use with proptest to generate random DAGs (directed acyclic graphs). /// The strategy generates a list of blocks of type T and their CIDs, as well as /// the root block's CID. diff --git a/car-mirror/src/test_utils/mod.rs b/car-mirror/src/test_utils/mod.rs index 890a5ad..8442d38 100644 --- a/car-mirror/src/test_utils/mod.rs +++ b/car-mirror/src/test_utils/mod.rs @@ -1,3 +1,10 @@ +use crate::common::references; +use anyhow::Result; +use bytes::Bytes; +use libipld::{Cid, Ipld, IpldCodec}; +use libipld_core::codec::Encode; +use wnfs_common::BlockStore; + #[cfg(feature = "test_utils")] mod dag_strategy; /// Random value generator for sampling data. @@ -7,3 +14,31 @@ mod rvg; pub use dag_strategy::*; #[cfg(feature = "test_utils")] pub use rvg::*; + +/// Encode some IPLD as dag-cbor +pub fn encode(ipld: &Ipld) -> Bytes { + let mut vec = Vec::new(); + ipld.encode(IpldCodec::DagCbor, &mut vec).unwrap(); // TODO(matheus23) unwrap + Bytes::from(vec) +} + +/// Walk a root DAG along some path. +/// At each node, take the `n % numlinks`th link, +/// and only walk the path as long as there are further links. 
+pub async fn get_cid_at_approx_path( + path: Vec, + root: Cid, + store: &impl BlockStore, +) -> Result { + let mut working_cid = root; + for nth in path { + let block = store.get_block(&working_cid).await?; + let refs = references(working_cid, block)?; + if refs.is_empty() { + break; + } + + working_cid = refs[nth % refs.len()]; + } + Ok(working_cid) +} From eb53a4460acdf001d2c3f6a67b80497eca124291 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= Date: Mon, 21 Aug 2023 10:44:22 +0200 Subject: [PATCH 15/35] Delete irrelevant TODO --- car-mirror/src/push.rs | 7 ------- 1 file changed, 7 deletions(-) diff --git a/car-mirror/src/push.rs b/car-mirror/src/push.rs index 8b0e2e9..c3679f3 100644 --- a/car-mirror/src/push.rs +++ b/car-mirror/src/push.rs @@ -109,13 +109,6 @@ pub async fn client_push( let mut dag_walk = DagWalk::breadth_first(subgraph_roots.clone()); while let Some((cid, block)) = dag_walk.next(store).await? { if bloom.contains(&cid.to_bytes()) && !subgraph_roots.contains(&cid) { - // TODO(matheus23) I think the spec means to prune the whole subgraph. - // But - // 1. That requires the receiver to check the whole subgraph at that CID to find out whether there's a missing block at the subgraph. - // 2. It requires the sender to go through every block under this subgraph down to the leaves to mark all of these CIDs as visited. - // Both of these are *huge* traversals. I'd say likely not worth it. The only case I can image they're worth it, is if the DAG - // is *heavily* using structural sharing and not tree-like. - // Also: This fails completely if the sender is just missing a single leaf. It couldn't add the block to the bloom in that case. break; } From b926d54e2828dba211410dfb78c98cb762cd83c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= Date: Mon, 21 Aug 2023 11:54:51 +0200 Subject: [PATCH 16/35] Implement pull protocol. 
Also: - Put `CarFile` bytes into a newtype - Abstract out push&pull protocol parts into `common` - Abstract out test utilities --- car-mirror/src/common.rs | 261 ++++++++++++++++++++++++++++++- car-mirror/src/lib.rs | 4 +- car-mirror/src/messages.rs | 7 + car-mirror/src/pull.rs | 93 +++++++++++ car-mirror/src/push.rs | 260 +++++------------------------- car-mirror/src/test_utils/mod.rs | 70 ++++++++- 6 files changed, 466 insertions(+), 229 deletions(-) create mode 100644 car-mirror/src/pull.rs diff --git a/car-mirror/src/common.rs b/car-mirror/src/common.rs index bbf0cad..cd3229a 100644 --- a/car-mirror/src/common.rs +++ b/car-mirror/src/common.rs @@ -1,7 +1,175 @@ -use anyhow::{anyhow, Result}; +use anyhow::{anyhow, bail, Result}; +use bytes::Bytes; +use deterministic_bloom::runtime_size::BloomFilter; +use futures::TryStreamExt; +use iroh_car::{CarHeader, CarReader, CarWriter}; use libipld::{Ipld, IpldCodec}; use libipld_core::{cid::Cid, codec::References}; use std::io::Cursor; +use wnfs_common::BlockStore; + +use crate::{ + dag_walk::DagWalk, + incremental_verification::IncrementalDagVerification, + messages::{PullRequest, PushResponse}, +}; + +//-------------------------------------------------------------------------------------------------- +// Types +//-------------------------------------------------------------------------------------------------- + +/// Configuration values (such as byte limits) for the CAR mirror protocol +#[derive(Clone, Debug)] +pub struct Config { + /// A client will try to send at least `send_minimum` bytes of block data + /// in each request, except if close to the end of the protocol (when there's) + /// not that much data left. + pub send_minimum: usize, + /// The maximum number of bytes per request that the server accepts. + pub receive_maximum: usize, + /// The maximum number of roots per request that the server will send to the client, + /// and that the client will consume. 
+ pub max_roots_per_round: usize, + /// The target false positive rate for the bloom filter that the server sends. + pub bloom_fpr: fn(u64) -> f64, +} + +#[derive(Debug, Clone)] +pub struct ReceiverState { + pub missing_subgraph_roots: Vec, + pub have_cids_bloom: Option, +} + +/// Newtype around bytes that are supposed to represent a CAR file +#[derive(Debug, Clone)] +pub struct CarFile { + pub bytes: Bytes, +} + +//-------------------------------------------------------------------------------------------------- +// Functions +//-------------------------------------------------------------------------------------------------- + +pub async fn block_send( + root: Cid, + last_state: Option, + config: &Config, + store: &impl BlockStore, +) -> Result { + let ReceiverState { + ref missing_subgraph_roots, + have_cids_bloom, + } = last_state.unwrap_or(ReceiverState { + missing_subgraph_roots: vec![root], + have_cids_bloom: None, + }); + + // Verify that all missing subgraph roots are in the relevant DAG: + let subgraph_roots: Vec = DagWalk::breadth_first([root]) + .stream(store) + .try_filter_map(|(cid, _)| async move { + Ok(missing_subgraph_roots.contains(&cid).then_some(cid)) + }) + .try_collect() + .await?; + + let bloom = have_cids_bloom.unwrap_or(BloomFilter::new_with(1, Box::new([0]))); // An empty bloom that contains nothing + + let mut writer = CarWriter::new( + CarHeader::new_v1( + // TODO(matheus23): This is stupid + // CAR files *must* have at least one CID in them, and all of them + // need to appear as a block in the payload. + // It would probably make most sense to just write all subgraph roots into this, + // but we don't know how many of the subgraph roots fit into this round yet, + // so we're simply writing the first one in here, since we know + // at least one block will be written (and it'll be that one). 
+ subgraph_roots.iter().take(1).cloned().collect(), + ), + Vec::new(), + ); + + writer.write_header().await?; + + let mut block_bytes = 0; + let mut dag_walk = DagWalk::breadth_first(subgraph_roots.clone()); + while let Some((cid, block)) = dag_walk.next(store).await? { + if bloom.contains(&cid.to_bytes()) && !subgraph_roots.contains(&cid) { + break; + } + + writer.write(cid, &block).await?; + + // TODO(matheus23): Count the actual bytes sent? + block_bytes += block.len(); + if block_bytes > config.send_minimum { + break; + } + } + + Ok(CarFile { + bytes: writer.finish().await?.into(), + }) +} + +pub async fn block_receive( + root: Cid, + last_car: Option, + config: &Config, + store: &impl BlockStore, +) -> Result { + let mut dag_verification = IncrementalDagVerification::new([root], store).await?; + + if let Some(car) = last_car { + let mut reader = CarReader::new(Cursor::new(car.bytes)).await?; + let mut block_bytes = 0; + + while let Some((cid, vec)) = reader.next_block().await? { + let block = Bytes::from(vec); + + block_bytes += block.len(); + if block_bytes > config.receive_maximum { + bail!( + "Received more than {} bytes ({block_bytes}), aborting request.", + config.receive_maximum + ); + } + + dag_verification + .verify_and_store_block((cid, block), store) + .await?; + } + } + + let missing_subgraph_roots = dag_verification + .want_cids + .iter() + .take(config.max_roots_per_round) + .cloned() + .collect(); + + let bloom_capacity = dag_verification.have_cids.len() as u64; + + if bloom_capacity == 0 { + return Ok(ReceiverState { + missing_subgraph_roots, + have_cids_bloom: None, + }); + } + + let mut bloom = + BloomFilter::new_from_fpr_po2(bloom_capacity, (config.bloom_fpr)(bloom_capacity)); + + dag_verification + .have_cids + .iter() + .for_each(|cid| bloom.insert(&cid.to_bytes())); + + Ok(ReceiverState { + missing_subgraph_roots, + have_cids_bloom: Some(bloom), + }) +} /// Find all CIDs that a block references. 
/// @@ -18,3 +186,94 @@ pub fn references(cid: Cid, block: impl AsRef<[u8]>) -> Result> { >::references(codec, &mut Cursor::new(block), &mut refs)?; Ok(refs) } + +//-------------------------------------------------------------------------------------------------- +// Implementations +//-------------------------------------------------------------------------------------------------- + +impl ReceiverState { + pub fn from_push_response(push: PushResponse) -> Self { + let PushResponse { + subgraph_roots, + bloom_k, + bloom, + } = push; + + Self { + missing_subgraph_roots: subgraph_roots, + have_cids_bloom: Self::bloom_deserialize(bloom_k, bloom), + } + } + + pub fn from_pull_request(pull: PullRequest) -> Self { + let PullRequest { + resources, + bloom_k, + bloom, + } = pull; + + Self { + missing_subgraph_roots: resources, + have_cids_bloom: Self::bloom_deserialize(bloom_k, bloom), + } + } + + pub fn into_push_response(self) -> PushResponse { + let ReceiverState { + missing_subgraph_roots, + have_cids_bloom, + } = self; + + let (bloom_k, bloom) = Self::bloom_serialize(have_cids_bloom); + + PushResponse { + subgraph_roots: missing_subgraph_roots, + bloom_k, + bloom, + } + } + + pub fn into_pull_request(self) -> PullRequest { + let ReceiverState { + missing_subgraph_roots, + have_cids_bloom, + } = self; + + let (bloom_k, bloom) = Self::bloom_serialize(have_cids_bloom); + + PullRequest { + resources: missing_subgraph_roots, + bloom_k, + bloom, + } + } + + fn bloom_serialize(bloom: Option) -> (u32, Vec) { + match bloom { + Some(bloom) => (bloom.hash_count() as u32, bloom.as_bytes().to_vec()), + None => (3, Vec::new()), + } + } + + fn bloom_deserialize(bloom_k: u32, bloom: Vec) -> Option { + if bloom.is_empty() { + None + } else { + Some(BloomFilter::new_with( + bloom_k as usize, + bloom.into_boxed_slice(), + )) + } + } +} + +impl Default for Config { + fn default() -> Self { + Self { + send_minimum: 128 * 1024, // 128KiB + receive_maximum: 512 * 1024, // 512KiB + 
max_roots_per_round: 1000, // max. ~41KB of CIDs + bloom_fpr: |num_of_elems| 0.1 / num_of_elems as f64, + } + } +} diff --git a/car-mirror/src/lib.rs b/car-mirror/src/lib.rs index 5b7779b..04ef44a 100644 --- a/car-mirror/src/lib.rs +++ b/car-mirror/src/lib.rs @@ -17,5 +17,7 @@ pub mod dag_walk; pub mod incremental_verification; /// Data types that are sent over-the-wire and relevant serialization code. pub mod messages; -/// The CAR mirror push protocol +/// The CAR mirror pull protocol. Meant to be used qualified, i.e. `pull::request` and `pull::response` +pub mod pull; +/// The CAR mirror push protocol. Meant to be used qualified, i.e. `push::request` and `push::response` pub mod push; diff --git a/car-mirror/src/messages.rs b/car-mirror/src/messages.rs index 283d55c..e1471c1 100644 --- a/car-mirror/src/messages.rs +++ b/car-mirror/src/messages.rs @@ -65,3 +65,10 @@ impl PushResponse { self.subgraph_roots.is_empty() } } + +impl PullRequest { + /// Whether you need to actually send the request or not. If true, this indicates that the protocol is finished. + pub fn indicates_finished(&self) -> bool { + self.resources.is_empty() + } +} diff --git a/car-mirror/src/pull.rs b/car-mirror/src/pull.rs new file mode 100644 index 0000000..20d6d1b --- /dev/null +++ b/car-mirror/src/pull.rs @@ -0,0 +1,93 @@ +use crate::{ + common::{block_receive, block_send, CarFile, Config, ReceiverState}, + messages::PullRequest, +}; +use anyhow::Result; +use libipld::Cid; +use wnfs_common::BlockStore; + +pub async fn request( + root: Cid, + last_response: Option, + config: &Config, + store: &impl BlockStore, +) -> Result { + Ok(block_receive(root, last_response, config, store) + .await? 
+ .into_pull_request()) +} + +pub async fn response( + root: Cid, + request: PullRequest, + config: &Config, + store: &impl BlockStore, +) -> Result { + let receiver_state = Some(ReceiverState::from_pull_request(request)); + block_send(root, receiver_state, config, store).await +} + +#[cfg(test)] +mod tests { + use crate::{ + common::Config, + dag_walk::DagWalk, + test_utils::{setup_random_dag, Metrics}, + }; + use anyhow::Result; + use futures::TryStreamExt; + use libipld::Cid; + use wnfs_common::MemoryBlockStore; + + async fn simulate_protocol( + root: Cid, + config: &Config, + client_store: &MemoryBlockStore, + server_store: &MemoryBlockStore, + ) -> Result> { + let mut metrics = Vec::new(); + let mut request = crate::pull::request(root, None, config, client_store).await?; + loop { + let request_bytes = serde_ipld_dagcbor::to_vec(&request)?.len(); + let response = crate::pull::response(root, request, config, server_store).await?; + let response_bytes = response.bytes.len(); + + metrics.push(Metrics { + request_bytes, + response_bytes, + }); + + request = crate::pull::request(root, Some(response), config, client_store).await?; + if request.indicates_finished() { + break; + } + } + + Ok(metrics) + } + + #[async_std::test] + async fn test_transfer() -> Result<()> { + const BLOCK_PADDING: usize = 10 * 1024; // 10KiB + let client_store = &MemoryBlockStore::new(); + let (root, ref server_store) = setup_random_dag::(256).await?; + + simulate_protocol(root, &Config::default(), client_store, server_store).await?; + + // client should have all data + let client_cids = DagWalk::breadth_first([root]) + .stream(client_store) + .map_ok(|(cid, _)| cid) + .try_collect::>() + .await?; + let server_cids = DagWalk::breadth_first([root]) + .stream(server_store) + .map_ok(|(cid, _)| cid) + .try_collect::>() + .await?; + + assert_eq!(client_cids, server_cids); + + Ok(()) + } +} diff --git a/car-mirror/src/push.rs b/car-mirror/src/push.rs index c3679f3..3d37940 100644 --- 
a/car-mirror/src/push.rs +++ b/car-mirror/src/push.rs @@ -1,62 +1,13 @@ use crate::{ - dag_walk::DagWalk, incremental_verification::IncrementalDagVerification, messages::PushResponse, + common::{block_receive, block_send, CarFile, Config, ReceiverState}, + messages::PushResponse, }; -use anyhow::{bail, Result}; -use bytes::Bytes; -use deterministic_bloom::runtime_size::BloomFilter; -use futures::TryStreamExt; -use iroh_car::{CarHeader, CarReader, CarWriter}; +use anyhow::Result; use libipld_core::cid::Cid; -use std::io::Cursor; use wnfs_common::BlockStore; -/// Configuration values (such as byte limits) for the CAR mirror push protocol -#[derive(Clone, Debug)] -pub struct PushConfig { - /// A client will try to send at least `send_minimum` bytes of block data - /// in each request, except if close to the end of the protocol (when there's) - /// not that much data left. - pub send_minimum: usize, - /// The maximum number of bytes per request that the server accepts. - pub receive_maximum: usize, - /// The maximum number of roots per request that the server will send to the client, - /// and that the client will consume. - pub max_roots_per_round: usize, - /// The target false positive rate for the bloom filter that the server sends. - pub bloom_fpr: fn(u64) -> f64, -} - -impl Default for PushConfig { - fn default() -> Self { - Self { - send_minimum: 128 * 1024, // 128KiB - receive_maximum: 512 * 1024, // 512KiB - max_roots_per_round: 1000, // max. ~41KB of CIDs - bloom_fpr: |num_of_elems| 0.1 / num_of_elems as f64, - } - } -} - -/// Initiate a car mirror push request. -/// -/// The goal is to transfer the DAG below the root CID to -/// the server. +/// TODO(matheus23) update docs /// -/// The return value is a CAR file. 
-pub async fn client_initiate_push( - root: Cid, - config: &PushConfig, - store: &impl BlockStore, -) -> Result { - let fake_response = PushResponse { - subgraph_roots: vec![root], - // Just putting an empty bloom here - bloom_k: 3, - bloom: Vec::new(), - }; - client_push(root, fake_response, config, store).await -} - /// Send a subsequent car mirror push request, following up on /// a response retrieved from an initial `client_initiate_push` request. /// @@ -64,178 +15,62 @@ pub async fn client_initiate_push( /// a follow-up `client_push` request. /// /// The return value is another CAR file with more blocks from the DAG below the root. -pub async fn client_push( +pub async fn request( root: Cid, - last_response: PushResponse, - config: &PushConfig, + last_response: Option, + config: &Config, store: &impl BlockStore, -) -> Result { - let PushResponse { - ref subgraph_roots, - bloom_k, - bloom, - } = last_response; - - // Verify that all subgraph roots are in the relevant DAG: - let subgraph_roots: Vec = DagWalk::breadth_first([root]) - .stream(store) - .try_filter_map(|(cid, _)| async move { Ok(subgraph_roots.contains(&cid).then_some(cid)) }) - .try_collect() - .await?; - - let bloom = if bloom.is_empty() { - BloomFilter::new_with(1, Box::new([0])) // An empty bloom that contains nothing - } else { - BloomFilter::new_with(bloom_k as usize, bloom.into_boxed_slice()) - }; - - let mut writer = CarWriter::new( - CarHeader::new_v1( - // TODO(matheus23): This is stupid - // CAR files *must* have at least one CID in them, and all of them - // need to appear as a block in the payload. - // It would probably make most sense to just write all subgraph roots into this, - // but we don't know how many of the subgraph roots fit into this round yet, - // so we're simply writing the first one in here, since we know - // at least one block will be written (and it'll be that one). 
- subgraph_roots.iter().take(1).cloned().collect(), - ), - Vec::new(), - ); - - writer.write_header().await?; - - let mut block_bytes = 0; - let mut dag_walk = DagWalk::breadth_first(subgraph_roots.clone()); - while let Some((cid, block)) = dag_walk.next(store).await? { - if bloom.contains(&cid.to_bytes()) && !subgraph_roots.contains(&cid) { - break; - } - - writer.write(cid, &block).await?; - - // TODO(matheus23): Count the actual bytes sent? - block_bytes += block.len(); - if block_bytes > config.send_minimum { - break; - } - } - - Ok(writer.finish().await?.into()) +) -> Result { + let receiver_state = last_response.map(ReceiverState::from_push_response); + block_send(root, receiver_state, config, store).await } +/// TODO(matheus23) update docs +/// /// This handles a car mirror push request on the server side. /// /// The root is the root CID of the DAG that is pushed, the request is a CAR file /// with some blocks from the cold call. /// /// Returns a response to answer the client's request with. -pub async fn server_push_response( +pub async fn response( root: Cid, - request: Bytes, - config: &PushConfig, + request: CarFile, + config: &Config, store: &impl BlockStore, ) -> Result { - let mut dag_verification = IncrementalDagVerification::new([root], store).await?; - - let mut reader = CarReader::new(Cursor::new(request)).await?; - let mut block_bytes = 0; - - while let Some((cid, vec)) = reader.next_block().await? 
{ - let block = Bytes::from(vec); - - block_bytes += block.len(); - if block_bytes > config.receive_maximum { - bail!( - "Received more than {} bytes ({block_bytes}), aborting request.", - config.receive_maximum - ); - } - - dag_verification - .verify_and_store_block((cid, block), store) - .await?; - } - - let subgraph_roots = dag_verification - .want_cids - .iter() - .take(config.max_roots_per_round) - .cloned() - .collect(); - - let bloom_capacity = dag_verification.have_cids.len() as u64; - - let mut bloom = - BloomFilter::new_from_fpr_po2(bloom_capacity, (config.bloom_fpr)(bloom_capacity)); - - dag_verification - .have_cids - .iter() - .for_each(|cid| bloom.insert(&cid.to_bytes())); - - Ok(PushResponse { - subgraph_roots, - bloom_k: bloom.hash_count() as u32, - bloom: bloom.as_bytes().to_vec(), - }) + Ok(block_receive(root, Some(request), config, store) + .await? + .into_push_response()) } #[cfg(test)] mod tests { - use super::*; - use crate::test_utils::{encode, generate_dag, get_cid_at_approx_path, Rvg}; - use libipld::{Ipld, IpldCodec}; - use libipld_core::multihash::{Code, MultihashDigest}; - use proptest::{collection::vec, prelude::Rng}; - use std::collections::BTreeMap; + use crate::{ + common::Config, + dag_walk::DagWalk, + test_utils::{ + get_cid_at_approx_path, setup_random_dag, total_dag_blocks, total_dag_bytes, Metrics, + Rvg, + }, + }; + use anyhow::Result; + use futures::TryStreamExt; + use libipld::Cid; + use proptest::collection::vec; use wnfs_common::MemoryBlockStore; - #[derive(Clone, Debug)] - struct Metrics { - request_bytes: usize, - response_bytes: usize, - } - - async fn setup_random_dag( - dag_size: u16, - ) -> Result<(Cid, MemoryBlockStore)> { - let (blocks, root) = Rvg::new().sample(&generate_dag(dag_size, |cids, rng| { - let ipld = Ipld::Map(BTreeMap::from([ - ( - "data".into(), - Ipld::Bytes((0..BLOCK_PADDING).map(|_| rng.gen::()).collect()), - ), - ( - "links".into(), - Ipld::List(cids.into_iter().map(Ipld::Link).collect()), - ), - 
])); - let bytes = encode(&ipld); - let cid = Cid::new_v1(IpldCodec::DagCbor.into(), Code::Blake3_256.digest(&bytes)); - (cid, bytes) - })); - - let store = MemoryBlockStore::new(); - for (cid, bytes) in blocks.into_iter() { - let cid_store = store.put_block(bytes, IpldCodec::DagCbor.into()).await?; - assert_eq!(cid, cid_store); - } - - Ok((root, store)) - } - async fn simulate_protocol( root: Cid, - config: &PushConfig, + config: &Config, client_store: &MemoryBlockStore, server_store: &MemoryBlockStore, ) -> Result> { let mut metrics = Vec::new(); - let mut request = client_initiate_push(root, config, client_store).await?; + let mut request = crate::push::request(root, None, config, client_store).await?; loop { - let request_bytes = request.len(); - let response = server_push_response(root, request, config, server_store).await?; + let request_bytes = request.bytes.len(); + let response = crate::push::response(root, request, config, server_store).await?; let response_bytes = serde_ipld_dagcbor::to_vec(&response)?.len(); metrics.push(Metrics { @@ -246,37 +81,18 @@ mod tests { if response.indicates_finished() { break; } - request = client_push(root, response, config, client_store).await?; + request = crate::push::request(root, Some(response), config, client_store).await?; } Ok(metrics) } - async fn total_dag_bytes(root: Cid, store: &impl BlockStore) -> Result { - Ok(DagWalk::breadth_first([root]) - .stream(store) - .map_ok(|(_, block)| block.len()) - .try_collect::>() - .await? - .into_iter() - .sum::()) - } - - async fn total_dag_blocks(root: Cid, store: &impl BlockStore) -> Result { - Ok(DagWalk::breadth_first([root]) - .stream(store) - .map_ok(|(_, block)| block.len()) - .try_collect::>() - .await? 
- .len()) - } - #[async_std::test] async fn test_transfer() -> Result<()> { const BLOCK_PADDING: usize = 10 * 1024; let (root, ref client_store) = setup_random_dag::(256).await?; let server_store = &MemoryBlockStore::new(); - simulate_protocol(root, &PushConfig::default(), client_store, server_store).await?; + simulate_protocol(root, &Config::default(), client_store, server_store).await?; // receiver should have all data let client_cids = DagWalk::breadth_first([root]) @@ -304,7 +120,7 @@ mod tests { let second_root = get_cid_at_approx_path(path, root, client_store).await?; let server_store = &MemoryBlockStore::new(); - let config = &PushConfig::default(); + let config = &Config::default(); let metrics1 = simulate_protocol(second_root, config, client_store, server_store).await?; let metrics2 = simulate_protocol(root, config, client_store, server_store).await?; @@ -334,7 +150,7 @@ mod tests { let (root, ref client_store) = setup_random_dag::(DAG_SIZE).await?; let server_store = &MemoryBlockStore::new(); let metrics = - simulate_protocol(root, &PushConfig::default(), client_store, server_store).await?; + simulate_protocol(root, &Config::default(), client_store, server_store).await?; total_rounds += metrics.len(); total_blocks += total_dag_blocks(root, client_store).await?; diff --git a/car-mirror/src/test_utils/mod.rs b/car-mirror/src/test_utils/mod.rs index 8442d38..92bb0a0 100644 --- a/car-mirror/src/test_utils/mod.rs +++ b/car-mirror/src/test_utils/mod.rs @@ -1,9 +1,16 @@ -use crate::common::references; +use std::collections::BTreeMap; + +use crate::{common::references, dag_walk::DagWalk}; use anyhow::Result; use bytes::Bytes; +use futures::TryStreamExt; use libipld::{Cid, Ipld, IpldCodec}; -use libipld_core::codec::Encode; -use wnfs_common::BlockStore; +use libipld_core::{ + codec::Encode, + multihash::{Code, MultihashDigest}, +}; +use proptest::prelude::Rng; +use wnfs_common::{BlockStore, MemoryBlockStore}; #[cfg(feature = "test_utils")] mod dag_strategy; @@ 
-15,8 +22,14 @@ pub use dag_strategy::*; #[cfg(feature = "test_utils")] pub use rvg::*; +#[derive(Clone, Debug)] +pub(crate) struct Metrics { + pub(crate) request_bytes: usize, + pub(crate) response_bytes: usize, +} + /// Encode some IPLD as dag-cbor -pub fn encode(ipld: &Ipld) -> Bytes { +pub(crate) fn encode(ipld: &Ipld) -> Bytes { let mut vec = Vec::new(); ipld.encode(IpldCodec::DagCbor, &mut vec).unwrap(); // TODO(matheus23) unwrap Bytes::from(vec) @@ -25,7 +38,7 @@ pub fn encode(ipld: &Ipld) -> Bytes { /// Walk a root DAG along some path. /// At each node, take the `n % numlinks`th link, /// and only walk the path as long as there are further links. -pub async fn get_cid_at_approx_path( +pub(crate) async fn get_cid_at_approx_path( path: Vec, root: Cid, store: &impl BlockStore, @@ -42,3 +55,50 @@ pub async fn get_cid_at_approx_path( } Ok(working_cid) } + +pub(crate) async fn setup_random_dag( + dag_size: u16, +) -> Result<(Cid, MemoryBlockStore)> { + let (blocks, root) = Rvg::new().sample(&generate_dag(dag_size, |cids, rng| { + let ipld = Ipld::Map(BTreeMap::from([ + ( + "data".into(), + Ipld::Bytes((0..BLOCK_PADDING).map(|_| rng.gen::()).collect()), + ), + ( + "links".into(), + Ipld::List(cids.into_iter().map(Ipld::Link).collect()), + ), + ])); + let bytes = encode(&ipld); + let cid = Cid::new_v1(IpldCodec::DagCbor.into(), Code::Blake3_256.digest(&bytes)); + (cid, bytes) + })); + + let store = MemoryBlockStore::new(); + for (cid, bytes) in blocks.into_iter() { + let cid_store = store.put_block(bytes, IpldCodec::DagCbor.into()).await?; + assert_eq!(cid, cid_store); + } + + Ok((root, store)) +} + +pub(crate) async fn total_dag_bytes(root: Cid, store: &impl BlockStore) -> Result { + Ok(DagWalk::breadth_first([root]) + .stream(store) + .map_ok(|(_, block)| block.len()) + .try_collect::>() + .await? 
+ .into_iter() + .sum::()) +} + +pub(crate) async fn total_dag_blocks(root: Cid, store: &impl BlockStore) -> Result { + Ok(DagWalk::breadth_first([root]) + .stream(store) + .map_ok(|(_, block)| block.len()) + .try_collect::>() + .await? + .len()) +} From 2c1b7035e4a835c45101cfb30319adfe5352eeec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= Date: Mon, 21 Aug 2023 12:03:45 +0200 Subject: [PATCH 17/35] Make `generate_dag`'s function be able to capture --- car-mirror/src/dag_walk.rs | 2 +- car-mirror/src/pull.rs | 3 +-- car-mirror/src/push.rs | 8 +++----- car-mirror/src/test_utils/dag_strategy.rs | 8 ++++---- car-mirror/src/test_utils/mod.rs | 15 +++++++++------ 5 files changed, 18 insertions(+), 18 deletions(-) diff --git a/car-mirror/src/dag_walk.rs b/car-mirror/src/dag_walk.rs index 6210272..1ec03ba 100644 --- a/car-mirror/src/dag_walk.rs +++ b/car-mirror/src/dag_walk.rs @@ -174,7 +174,7 @@ mod proptests { use wnfs_common::{BlockStore, MemoryBlockStore}; fn ipld_dags() -> impl Strategy, Cid)> { - generate_dag(256, |cids, _| { + generate_dag(256, &|cids, _| { let ipld = Ipld::List(cids.into_iter().map(Ipld::Link).collect()); let cid = Cid::new_v1( IpldCodec::DagCbor.into(), diff --git a/car-mirror/src/pull.rs b/car-mirror/src/pull.rs index 20d6d1b..4db2365 100644 --- a/car-mirror/src/pull.rs +++ b/car-mirror/src/pull.rs @@ -68,9 +68,8 @@ mod tests { #[async_std::test] async fn test_transfer() -> Result<()> { - const BLOCK_PADDING: usize = 10 * 1024; // 10KiB let client_store = &MemoryBlockStore::new(); - let (root, ref server_store) = setup_random_dag::(256).await?; + let (root, ref server_store) = setup_random_dag(256, 10 * 1024 /* 10 KiB */).await?; simulate_protocol(root, &Config::default(), client_store, server_store).await?; diff --git a/car-mirror/src/push.rs b/car-mirror/src/push.rs index 3d37940..8e652a2 100644 --- a/car-mirror/src/push.rs +++ b/car-mirror/src/push.rs @@ -89,8 +89,7 @@ mod tests { #[async_std::test] async fn test_transfer() 
-> Result<()> { - const BLOCK_PADDING: usize = 10 * 1024; - let (root, ref client_store) = setup_random_dag::(256).await?; + let (root, ref client_store) = setup_random_dag(256, 10 * 1024 /* 10 KiB */).await?; let server_store = &MemoryBlockStore::new(); simulate_protocol(root, &Config::default(), client_store, server_store).await?; @@ -113,8 +112,7 @@ mod tests { #[async_std::test] async fn test_deduplicating_transfer() -> Result<()> { - const BLOCK_PADDING: usize = 10 * 1024; - let (root, ref client_store) = setup_random_dag::(256).await?; + let (root, ref client_store) = setup_random_dag(256, 10 * 1024 /* 10 KiB */).await?; let total_bytes = total_dag_bytes(root, client_store).await?; let path = Rvg::new().sample(&vec(0usize..128, 0..64)); let second_root = get_cid_at_approx_path(path, root, client_store).await?; @@ -147,7 +145,7 @@ mod tests { let mut total_block_bytes = 0; let mut total_network_bytes = 0; for _ in 0..TESTS { - let (root, ref client_store) = setup_random_dag::(DAG_SIZE).await?; + let (root, ref client_store) = setup_random_dag(DAG_SIZE, BLOCK_PADDING).await?; let server_store = &MemoryBlockStore::new(); let metrics = simulate_protocol(root, &Config::default(), client_store, server_store).await?; diff --git a/car-mirror/src/test_utils/dag_strategy.rs b/car-mirror/src/test_utils/dag_strategy.rs index 144a927..60e1ab6 100644 --- a/car-mirror/src/test_utils/dag_strategy.rs +++ b/car-mirror/src/test_utils/dag_strategy.rs @@ -9,8 +9,8 @@ use roaring_graphs::{arb_dag, DirectedAcyclicGraph, Vertex}; /// the root block's CID. 
pub fn generate_dag( max_nodes: u16, - generate_block: fn(Vec, rng: &mut TestRng) -> (Cid, T), -) -> impl Strategy, Cid)> { + generate_block: &impl Fn(Vec, &mut TestRng) -> (Cid, T), +) -> impl Strategy, Cid)> + '_ { arb_dag(1..max_nodes, 0.5) .prop_perturb(move |dag, mut rng| dag_to_nodes(&dag, &mut rng, generate_block)) } @@ -20,7 +20,7 @@ pub fn generate_dag( pub fn dag_to_nodes( dag: &DirectedAcyclicGraph, rng: &mut TestRng, - generate_node: fn(Vec, &mut TestRng) -> (Cid, T), + generate_node: &impl Fn(Vec, &mut TestRng) -> (Cid, T), ) -> (Vec<(Cid, T)>, Cid) { let mut blocks = Vec::new(); let mut visited = HashSet::new(); @@ -33,7 +33,7 @@ fn dag_to_nodes_helper( dag: &DirectedAcyclicGraph, root: Vertex, rng: &mut TestRng, - generate_node: fn(Vec, &mut TestRng) -> (Cid, T), + generate_node: &impl Fn(Vec, &mut TestRng) -> (Cid, T), arr: &mut Vec<(Cid, T)>, visited: &mut HashSet, ) -> (Cid, T) { diff --git a/car-mirror/src/test_utils/mod.rs b/car-mirror/src/test_utils/mod.rs index 92bb0a0..ed74678 100644 --- a/car-mirror/src/test_utils/mod.rs +++ b/car-mirror/src/test_utils/mod.rs @@ -56,15 +56,18 @@ pub(crate) async fn get_cid_at_approx_path( Ok(working_cid) } -pub(crate) async fn setup_random_dag( +pub(crate) async fn setup_random_dag( dag_size: u16, + block_padding: usize, ) -> Result<(Cid, MemoryBlockStore)> { - let (blocks, root) = Rvg::new().sample(&generate_dag(dag_size, |cids, rng| { + let (blocks, root) = Rvg::new().sample(&generate_dag(dag_size, &|cids, rng| { + let mut padding = Vec::with_capacity(block_padding); + for _ in 0..block_padding { + padding.push(rng.gen::()); + } + let ipld = Ipld::Map(BTreeMap::from([ - ( - "data".into(), - Ipld::Bytes((0..BLOCK_PADDING).map(|_| rng.gen::()).collect()), - ), + ("data".into(), Ipld::Bytes(padding)), ( "links".into(), Ipld::List(cids.into_iter().map(Ipld::Link).collect()), From 5a7ba80cd8ac6ac44b590d98e44b917f014cd821 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= Date: Mon, 21 Aug 2023 
18:15:56 +0200 Subject: [PATCH 18/35] Handle TODOs, implement proptests --- car-mirror/src/common.rs | 42 ++++++- car-mirror/src/dag_walk.rs | 2 +- car-mirror/src/pull.rs | 65 ++++++++++- car-mirror/src/push.rs | 75 ++++++++++--- car-mirror/src/test_utils/dag_strategy.rs | 15 ++- car-mirror/src/test_utils/local_utils.rs | 128 ++++++++++++++++++++++ car-mirror/src/test_utils/mod.rs | 101 +---------------- 7 files changed, 299 insertions(+), 129 deletions(-) create mode 100644 car-mirror/src/test_utils/local_utils.rs diff --git a/car-mirror/src/common.rs b/car-mirror/src/common.rs index cd3229a..ef48acc 100644 --- a/car-mirror/src/common.rs +++ b/car-mirror/src/common.rs @@ -34,15 +34,21 @@ pub struct Config { pub bloom_fpr: fn(u64) -> f64, } +/// Some information that the block receiving end provides the block sending end +/// in order to deduplicate block transfers. #[derive(Debug, Clone)] pub struct ReceiverState { + /// At least *some* of the subgraph roots that are missing for sure on the receiving end. pub missing_subgraph_roots: Vec, + /// An optional bloom filter of all CIDs below the root that the receiving end has. pub have_cids_bloom: Option, } /// Newtype around bytes that are supposed to represent a CAR file #[derive(Debug, Clone)] pub struct CarFile { + /// The car file contents as bytes. + /// (`CarFile` is cheap to clone, since `Bytes` is an `Arc` wrapper around a byte buffer.) pub bytes: Bytes, } @@ -50,6 +56,13 @@ pub struct CarFile { // Functions //-------------------------------------------------------------------------------------------------- +/// This function is run on the block sending side of the protocol. +/// +/// It's used on the client during the push protocol, or on the server +/// during the pull protocol. +/// +/// It returns a `CarFile` of (a subset) of all blocks below `root`, that +/// are thought to be missing on the receiving end. 
pub async fn block_send( root: Cid, last_state: Option, @@ -77,7 +90,7 @@ pub async fn block_send( let mut writer = CarWriter::new( CarHeader::new_v1( - // TODO(matheus23): This is stupid + // https://github.com/wnfs-wg/car-mirror-spec/issues/6 // CAR files *must* have at least one CID in them, and all of them // need to appear as a block in the payload. // It would probably make most sense to just write all subgraph roots into this, @@ -101,6 +114,7 @@ pub async fn block_send( writer.write(cid, &block).await?; // TODO(matheus23): Count the actual bytes sent? + // At the moment, this is a rough estimate. iroh-car could be improved to return the written bytes. block_bytes += block.len(); if block_bytes > config.send_minimum { break; @@ -112,6 +126,14 @@ pub async fn block_send( }) } +/// This function is run on the block receiving end of the protocol. +/// +/// It's used on the client during the pull protocol and on the server +/// during the push protocol. +/// +/// It takes a `CarFile`, verifies that its contents are related to the +/// `root` and returns some information to help the block sending side +/// figure out what blocks to send next. 
pub async fn block_receive( root: Cid, last_car: Option, @@ -191,8 +213,8 @@ pub fn references(cid: Cid, block: impl AsRef<[u8]>) -> Result> { // Implementations //-------------------------------------------------------------------------------------------------- -impl ReceiverState { - pub fn from_push_response(push: PushResponse) -> Self { +impl From for ReceiverState { + fn from(push: PushResponse) -> Self { let PushResponse { subgraph_roots, bloom_k, @@ -204,8 +226,10 @@ impl ReceiverState { have_cids_bloom: Self::bloom_deserialize(bloom_k, bloom), } } +} - pub fn from_pull_request(pull: PullRequest) -> Self { +impl From for ReceiverState { + fn from(pull: PullRequest) -> Self { let PullRequest { resources, bloom_k, @@ -217,8 +241,10 @@ impl ReceiverState { have_cids_bloom: Self::bloom_deserialize(bloom_k, bloom), } } +} - pub fn into_push_response(self) -> PushResponse { +impl Into for ReceiverState { + fn into(self) -> PushResponse { let ReceiverState { missing_subgraph_roots, have_cids_bloom, @@ -232,8 +258,10 @@ impl ReceiverState { bloom, } } +} - pub fn into_pull_request(self) -> PullRequest { +impl Into for ReceiverState { + fn into(self) -> PullRequest { let ReceiverState { missing_subgraph_roots, have_cids_bloom, @@ -247,7 +275,9 @@ impl ReceiverState { bloom, } } +} +impl ReceiverState { fn bloom_serialize(bloom: Option) -> (u32, Vec) { match bloom { Some(bloom) => (bloom.hash_count() as u32, bloom.as_bytes().to_vec()), diff --git a/car-mirror/src/dag_walk.rs b/car-mirror/src/dag_walk.rs index 1ec03ba..6210272 100644 --- a/car-mirror/src/dag_walk.rs +++ b/car-mirror/src/dag_walk.rs @@ -174,7 +174,7 @@ mod proptests { use wnfs_common::{BlockStore, MemoryBlockStore}; fn ipld_dags() -> impl Strategy, Cid)> { - generate_dag(256, &|cids, _| { + generate_dag(256, |cids, _| { let ipld = Ipld::List(cids.into_iter().map(Ipld::Link).collect()); let cid = Cid::new_v1( IpldCodec::DagCbor.into(), diff --git a/car-mirror/src/pull.rs b/car-mirror/src/pull.rs index 
4db2365..8dc1292 100644 --- a/car-mirror/src/pull.rs +++ b/car-mirror/src/pull.rs @@ -6,6 +6,17 @@ use anyhow::Result; use libipld::Cid; use wnfs_common::BlockStore; +/// Create a CAR mirror pull request. +/// +/// If this is the first request that's sent for this +/// particular root CID, then set `last_response` to `None`. +/// +/// On subsequent requests, set `last_response` to the +/// last successfully received response. +/// +/// Before actually sending the request over the network, +/// make sure to check the `request.indicates_finished()`. +/// If true, the client already has all data. pub async fn request( root: Cid, last_response: Option, @@ -14,16 +25,17 @@ pub async fn request( ) -> Result { Ok(block_receive(root, last_response, config, store) .await? - .into_pull_request()) + .into()) } +/// Respond to a CAR mirror pull request. pub async fn response( root: Cid, request: PullRequest, config: &Config, store: &impl BlockStore, ) -> Result { - let receiver_state = Some(ReceiverState::from_pull_request(request)); + let receiver_state = Some(ReceiverState::from(request)); block_send(root, receiver_state, config, store).await } @@ -39,7 +51,7 @@ mod tests { use libipld::Cid; use wnfs_common::MemoryBlockStore; - async fn simulate_protocol( + pub(crate) async fn simulate_protocol( root: Cid, config: &Config, client_store: &MemoryBlockStore, @@ -90,3 +102,50 @@ mod tests { Ok(()) } } + +#[cfg(test)] +mod proptests { + use crate::{ + common::Config, + dag_walk::DagWalk, + test_utils::{setup_blockstore, variable_blocksize_dag}, + }; + use futures::TryStreamExt; + use libipld::{Cid, Ipld}; + use test_strategy::proptest; + use wnfs_common::MemoryBlockStore; + + #[proptest] + fn cold_transfer_completes(#[strategy(variable_blocksize_dag())] dag: (Vec<(Cid, Ipld)>, Cid)) { + let (blocks, root) = dag; + async_std::task::block_on(async { + let server_store = &setup_blockstore(blocks).await.unwrap(); + let client_store = &MemoryBlockStore::new(); + + 
crate::pull::tests::simulate_protocol( + root, + &Config::default(), + client_store, + server_store, + ) + .await + .unwrap(); + + // client should have all data + let client_cids = DagWalk::breadth_first([root]) + .stream(client_store) + .map_ok(|(cid, _)| cid) + .try_collect::>() + .await + .unwrap(); + let server_cids = DagWalk::breadth_first([root]) + .stream(server_store) + .map_ok(|(cid, _)| cid) + .try_collect::>() + .await + .unwrap(); + + assert_eq!(client_cids, server_cids); + }) + } +} diff --git a/car-mirror/src/push.rs b/car-mirror/src/push.rs index 8e652a2..bf37d7a 100644 --- a/car-mirror/src/push.rs +++ b/car-mirror/src/push.rs @@ -6,33 +6,34 @@ use anyhow::Result; use libipld_core::cid::Cid; use wnfs_common::BlockStore; -/// TODO(matheus23) update docs +/// Create a CAR mirror push request. /// -/// Send a subsequent car mirror push request, following up on -/// a response retrieved from an initial `client_initiate_push` request. +/// On the first request for a particular `root`, set +/// `last_response` to `None`. /// -/// Make sure to call `response.indicates_finished()` before initiating -/// a follow-up `client_push` request. +/// For subsequent requests, set it to the last successful +/// response from a request with the same `root`. /// -/// The return value is another CAR file with more blocks from the DAG below the root. +/// The returned request body is a CAR file from some of the first +/// blocks below the root. pub async fn request( root: Cid, last_response: Option, config: &Config, store: &impl BlockStore, ) -> Result { - let receiver_state = last_response.map(ReceiverState::from_push_response); + let receiver_state = last_response.map(ReceiverState::from); block_send(root, receiver_state, config, store).await } -/// TODO(matheus23) update docs +/// Create a response for a CAR mirror push request. /// -/// This handles a car mirror push request on the server side. 
+/// This takes in the CAR file from the request body and stores its blocks +/// in the given `store`, if the blocks can be shown to relate +/// to the `root` CID. /// -/// The root is the root CID of the DAG that is pushed, the request is a CAR file -/// with some blocks from the cold call. -/// -/// Returns a response to answer the client's request with. +/// Returns a response that gives the client information about what +/// other data remains to be fetched. pub async fn response( root: Cid, request: CarFile, @@ -41,7 +42,7 @@ pub async fn response( ) -> Result { Ok(block_receive(root, Some(request), config, store) .await? - .into_push_response()) + .into()) } #[cfg(test)] @@ -60,7 +61,7 @@ mod tests { use proptest::collection::vec; use wnfs_common::MemoryBlockStore; - async fn simulate_protocol( + pub(crate) async fn simulate_protocol( root: Cid, config: &Config, client_store: &MemoryBlockStore, @@ -178,5 +179,47 @@ mod tests { #[cfg(test)] mod proptests { - use super::*; + use crate::{ + common::Config, + dag_walk::DagWalk, + test_utils::{setup_blockstore, variable_blocksize_dag}, + }; + use futures::TryStreamExt; + use libipld::{Cid, Ipld}; + use test_strategy::proptest; + use wnfs_common::MemoryBlockStore; + + #[proptest] + fn cold_transfer_completes(#[strategy(variable_blocksize_dag())] dag: (Vec<(Cid, Ipld)>, Cid)) { + let (blocks, root) = dag; + async_std::task::block_on(async { + let client_store = &setup_blockstore(blocks).await.unwrap(); + let server_store = &MemoryBlockStore::new(); + + crate::push::tests::simulate_protocol( + root, + &Config::default(), + client_store, + server_store, + ) + .await + .unwrap(); + + // client should have all data + let client_cids = DagWalk::breadth_first([root]) + .stream(client_store) + .map_ok(|(cid, _)| cid) + .try_collect::>() + .await + .unwrap(); + let server_cids = DagWalk::breadth_first([root]) + .stream(server_store) + .map_ok(|(cid, _)| cid) + .try_collect::>() + .await + .unwrap(); + + 
assert_eq!(client_cids, server_cids); + }) + } } diff --git a/car-mirror/src/test_utils/dag_strategy.rs b/car-mirror/src/test_utils/dag_strategy.rs index 60e1ab6..e8fa931 100644 --- a/car-mirror/src/test_utils/dag_strategy.rs +++ b/car-mirror/src/test_utils/dag_strategy.rs @@ -9,10 +9,10 @@ use roaring_graphs::{arb_dag, DirectedAcyclicGraph, Vertex}; /// the root block's CID. pub fn generate_dag( max_nodes: u16, - generate_block: &impl Fn(Vec, &mut TestRng) -> (Cid, T), -) -> impl Strategy, Cid)> + '_ { + generate_block: impl Fn(Vec, &mut TestRng) -> (Cid, T) + Clone, +) -> impl Strategy, Cid)> { arb_dag(1..max_nodes, 0.5) - .prop_perturb(move |dag, mut rng| dag_to_nodes(&dag, &mut rng, generate_block)) + .prop_perturb(move |dag, mut rng| dag_to_nodes(&dag, &mut rng, generate_block.clone())) } /// Turn a directed acyclic graph into a list of nodes (with their CID) and a root CID. @@ -20,7 +20,7 @@ pub fn generate_dag( pub fn dag_to_nodes( dag: &DirectedAcyclicGraph, rng: &mut TestRng, - generate_node: &impl Fn(Vec, &mut TestRng) -> (Cid, T), + generate_node: impl Fn(Vec, &mut TestRng) -> (Cid, T) + Clone, ) -> (Vec<(Cid, T)>, Cid) { let mut blocks = Vec::new(); let mut visited = HashSet::new(); @@ -33,11 +33,14 @@ fn dag_to_nodes_helper( dag: &DirectedAcyclicGraph, root: Vertex, rng: &mut TestRng, - generate_node: &impl Fn(Vec, &mut TestRng) -> (Cid, T), + generate_node: impl Fn(Vec, &mut TestRng) -> (Cid, T) + Clone, arr: &mut Vec<(Cid, T)>, visited: &mut HashSet, ) -> (Cid, T) { let mut child_blocks = Vec::new(); + if root >= dag.get_vertex_count() { + println!("{root}, {}", dag.get_vertex_count()); + } for child in dag.iter_children(root) { if visited.contains(&child) { continue; @@ -47,7 +50,7 @@ fn dag_to_nodes_helper( dag, child, rng, - generate_node, + generate_node.clone(), arr, visited, )); diff --git a/car-mirror/src/test_utils/local_utils.rs b/car-mirror/src/test_utils/local_utils.rs new file mode 100644 index 0000000..becf29f --- /dev/null +++ 
b/car-mirror/src/test_utils/local_utils.rs @@ -0,0 +1,128 @@ +///! Crate-local test utilities +use super::{generate_dag, Rvg}; +use crate::{common::references, dag_walk::DagWalk}; +use anyhow::Result; +use bytes::Bytes; +use futures::TryStreamExt; +use libipld::{Cid, Ipld, IpldCodec}; +use libipld_core::{ + codec::Encode, + multihash::{Code, MultihashDigest}, +}; +use proptest::{prelude::Rng, strategy::Strategy}; +use std::collections::BTreeMap; +use wnfs_common::{BlockStore, MemoryBlockStore}; + +#[derive(Clone, Debug)] +pub(crate) struct Metrics { + pub(crate) request_bytes: usize, + pub(crate) response_bytes: usize, +} + +/// Walk a root DAG along some path. +/// At each node, take the `n % numlinks`th link, +/// and only walk the path as long as there are further links. +pub(crate) async fn get_cid_at_approx_path( + path: Vec, + root: Cid, + store: &impl BlockStore, +) -> Result { + let mut working_cid = root; + for nth in path { + let block = store.get_block(&working_cid).await?; + let refs = references(working_cid, block)?; + if refs.is_empty() { + break; + } + + working_cid = refs[nth % refs.len()]; + } + Ok(working_cid) +} + +pub(crate) fn padded_dag_strategy( + dag_size: u16, + block_padding: usize, +) -> impl Strategy, Cid)> { + generate_dag(dag_size, move |cids, rng| { + let mut padding = Vec::with_capacity(block_padding); + for _ in 0..block_padding { + padding.push(rng.gen::()); + } + + let ipld = Ipld::Map(BTreeMap::from([ + ("data".into(), Ipld::Bytes(padding)), + ( + "links".into(), + Ipld::List(cids.into_iter().map(Ipld::Link).collect()), + ), + ])); + let bytes = encode(&ipld); + let cid = Cid::new_v1(IpldCodec::DagCbor.into(), Code::Blake3_256.digest(&bytes)); + (cid, ipld) + }) +} + +pub(crate) fn variable_blocksize_dag() -> impl Strategy, Cid)> { + const MAX_DAG_NODES: u16 = 128; // with this proptests run ~15 sec for me + const MAX_LINK_BYTES: usize = MAX_DAG_NODES as usize * 42; // 1 byte cbor CID tag, 1 byte multibase indicator, 40 bytes CID 
+ + // 1 byte cbor tag for whole object, + // 1 byte cbor tag for block padding bytes + // up to ~3 bytes for block padding size + // 1 bytes cbor tag for list (of cids) + // up to ~2 bytes for list size + const EST_OVERHEAD: usize = 1 + 1 + 3 + 1 + 2; + const MAX_BLOCK_SIZE: usize = 256 * 1024; + const MAX_BLOCK_PADDING: usize = MAX_BLOCK_SIZE - EST_OVERHEAD - MAX_LINK_BYTES; + + (32..MAX_BLOCK_PADDING) + .prop_ind_flat_map(move |block_padding| padded_dag_strategy(MAX_DAG_NODES, block_padding)) +} + +pub(crate) async fn setup_blockstore(blocks: Vec<(Cid, Ipld)>) -> Result { + let store = MemoryBlockStore::new(); + for (cid, ipld) in blocks.into_iter() { + let cid_store = store + .put_block(encode(&ipld), IpldCodec::DagCbor.into()) + .await?; + debug_assert_eq!(cid, cid_store); + } + + Ok(store) +} + +pub(crate) async fn setup_random_dag( + dag_size: u16, + block_padding: usize, +) -> Result<(Cid, MemoryBlockStore)> { + let (blocks, root) = Rvg::new().sample(&padded_dag_strategy(dag_size, block_padding)); + let store = setup_blockstore(blocks).await?; + Ok((root, store)) +} + +pub(crate) async fn total_dag_bytes(root: Cid, store: &impl BlockStore) -> Result { + Ok(DagWalk::breadth_first([root]) + .stream(store) + .map_ok(|(_, block)| block.len()) + .try_collect::>() + .await? + .into_iter() + .sum::()) +} + +pub(crate) async fn total_dag_blocks(root: Cid, store: &impl BlockStore) -> Result { + Ok(DagWalk::breadth_first([root]) + .stream(store) + .map_ok(|(_, block)| block.len()) + .try_collect::>() + .await? 
+ .len()) +} + +/// Encode some IPLD as dag-cbor +pub(crate) fn encode(ipld: &Ipld) -> Bytes { + let mut vec = Vec::new(); + ipld.encode(IpldCodec::DagCbor, &mut vec).unwrap(); + Bytes::from(vec) +} diff --git a/car-mirror/src/test_utils/mod.rs b/car-mirror/src/test_utils/mod.rs index ed74678..aa4d5d3 100644 --- a/car-mirror/src/test_utils/mod.rs +++ b/car-mirror/src/test_utils/mod.rs @@ -1,17 +1,3 @@ -use std::collections::BTreeMap; - -use crate::{common::references, dag_walk::DagWalk}; -use anyhow::Result; -use bytes::Bytes; -use futures::TryStreamExt; -use libipld::{Cid, Ipld, IpldCodec}; -use libipld_core::{ - codec::Encode, - multihash::{Code, MultihashDigest}, -}; -use proptest::prelude::Rng; -use wnfs_common::{BlockStore, MemoryBlockStore}; - #[cfg(feature = "test_utils")] mod dag_strategy; /// Random value generator for sampling data. @@ -22,86 +8,7 @@ pub use dag_strategy::*; #[cfg(feature = "test_utils")] pub use rvg::*; -#[derive(Clone, Debug)] -pub(crate) struct Metrics { - pub(crate) request_bytes: usize, - pub(crate) response_bytes: usize, -} - -/// Encode some IPLD as dag-cbor -pub(crate) fn encode(ipld: &Ipld) -> Bytes { - let mut vec = Vec::new(); - ipld.encode(IpldCodec::DagCbor, &mut vec).unwrap(); // TODO(matheus23) unwrap - Bytes::from(vec) -} - -/// Walk a root DAG along some path. -/// At each node, take the `n % numlinks`th link, -/// and only walk the path as long as there are further links. 
-pub(crate) async fn get_cid_at_approx_path( - path: Vec, - root: Cid, - store: &impl BlockStore, -) -> Result { - let mut working_cid = root; - for nth in path { - let block = store.get_block(&working_cid).await?; - let refs = references(working_cid, block)?; - if refs.is_empty() { - break; - } - - working_cid = refs[nth % refs.len()]; - } - Ok(working_cid) -} - -pub(crate) async fn setup_random_dag( - dag_size: u16, - block_padding: usize, -) -> Result<(Cid, MemoryBlockStore)> { - let (blocks, root) = Rvg::new().sample(&generate_dag(dag_size, &|cids, rng| { - let mut padding = Vec::with_capacity(block_padding); - for _ in 0..block_padding { - padding.push(rng.gen::()); - } - - let ipld = Ipld::Map(BTreeMap::from([ - ("data".into(), Ipld::Bytes(padding)), - ( - "links".into(), - Ipld::List(cids.into_iter().map(Ipld::Link).collect()), - ), - ])); - let bytes = encode(&ipld); - let cid = Cid::new_v1(IpldCodec::DagCbor.into(), Code::Blake3_256.digest(&bytes)); - (cid, bytes) - })); - - let store = MemoryBlockStore::new(); - for (cid, bytes) in blocks.into_iter() { - let cid_store = store.put_block(bytes, IpldCodec::DagCbor.into()).await?; - assert_eq!(cid, cid_store); - } - - Ok((root, store)) -} - -pub(crate) async fn total_dag_bytes(root: Cid, store: &impl BlockStore) -> Result { - Ok(DagWalk::breadth_first([root]) - .stream(store) - .map_ok(|(_, block)| block.len()) - .try_collect::>() - .await? - .into_iter() - .sum::()) -} - -pub(crate) async fn total_dag_blocks(root: Cid, store: &impl BlockStore) -> Result { - Ok(DagWalk::breadth_first([root]) - .stream(store) - .map_ok(|(_, block)| block.len()) - .try_collect::>() - .await? 
- .len()) -} +#[cfg(test)] +mod local_utils; +#[cfg(test)] +pub(crate) use local_utils::*; From 0c6388b06ffadf8ab0f27357d571830bf51d36af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= Date: Mon, 21 Aug 2023 18:17:45 +0200 Subject: [PATCH 19/35] Use `HashSet` for comparing CID sets --- car-mirror/src/pull.rs | 10 ++++++---- car-mirror/src/push.rs | 10 ++++++---- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/car-mirror/src/pull.rs b/car-mirror/src/pull.rs index 8dc1292..8bfdbb8 100644 --- a/car-mirror/src/pull.rs +++ b/car-mirror/src/pull.rs @@ -49,6 +49,7 @@ mod tests { use anyhow::Result; use futures::TryStreamExt; use libipld::Cid; + use std::collections::HashSet; use wnfs_common::MemoryBlockStore; pub(crate) async fn simulate_protocol( @@ -89,12 +90,12 @@ mod tests { let client_cids = DagWalk::breadth_first([root]) .stream(client_store) .map_ok(|(cid, _)| cid) - .try_collect::>() + .try_collect::>() .await?; let server_cids = DagWalk::breadth_first([root]) .stream(server_store) .map_ok(|(cid, _)| cid) - .try_collect::>() + .try_collect::>() .await?; assert_eq!(client_cids, server_cids); @@ -112,6 +113,7 @@ mod proptests { }; use futures::TryStreamExt; use libipld::{Cid, Ipld}; + use std::collections::HashSet; use test_strategy::proptest; use wnfs_common::MemoryBlockStore; @@ -135,13 +137,13 @@ mod proptests { let client_cids = DagWalk::breadth_first([root]) .stream(client_store) .map_ok(|(cid, _)| cid) - .try_collect::>() + .try_collect::>() .await .unwrap(); let server_cids = DagWalk::breadth_first([root]) .stream(server_store) .map_ok(|(cid, _)| cid) - .try_collect::>() + .try_collect::>() .await .unwrap(); diff --git a/car-mirror/src/push.rs b/car-mirror/src/push.rs index bf37d7a..03f54fc 100644 --- a/car-mirror/src/push.rs +++ b/car-mirror/src/push.rs @@ -59,6 +59,7 @@ mod tests { use futures::TryStreamExt; use libipld::Cid; use proptest::collection::vec; + use std::collections::HashSet; use wnfs_common::MemoryBlockStore; 
pub(crate) async fn simulate_protocol( @@ -98,12 +99,12 @@ mod tests { let client_cids = DagWalk::breadth_first([root]) .stream(client_store) .map_ok(|(cid, _)| cid) - .try_collect::>() + .try_collect::>() .await?; let server_cids = DagWalk::breadth_first([root]) .stream(server_store) .map_ok(|(cid, _)| cid) - .try_collect::>() + .try_collect::>() .await?; assert_eq!(client_cids, server_cids); @@ -186,6 +187,7 @@ mod proptests { }; use futures::TryStreamExt; use libipld::{Cid, Ipld}; + use std::collections::HashSet; use test_strategy::proptest; use wnfs_common::MemoryBlockStore; @@ -209,13 +211,13 @@ mod proptests { let client_cids = DagWalk::breadth_first([root]) .stream(client_store) .map_ok(|(cid, _)| cid) - .try_collect::>() + .try_collect::>() .await .unwrap(); let server_cids = DagWalk::breadth_first([root]) .stream(server_store) .map_ok(|(cid, _)| cid) - .try_collect::>() + .try_collect::>() .await .unwrap(); From 6bf868aee04b4a790af83c3ce0d1b6bd725a6dd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= Date: Mon, 21 Aug 2023 18:57:38 +0200 Subject: [PATCH 20/35] Implement benchmarks --- Cargo.lock | 3 + car-mirror-benches/Cargo.toml | 5 +- car-mirror-benches/benches/a_benchmark.rs | 15 ---- car-mirror-benches/benches/in_memory.rs | 88 +++++++++++++++++++ car-mirror/src/dag_walk.rs | 4 +- car-mirror/src/test_utils/blockstore_utils.rs | 26 ++++++ car-mirror/src/test_utils/dag_strategy.rs | 65 ++++++++++++-- car-mirror/src/test_utils/local_utils.rs | 54 ++---------- car-mirror/src/test_utils/mod.rs | 4 + car-mirror/tests/integration_test.rs | 4 - 10 files changed, 192 insertions(+), 76 deletions(-) delete mode 100644 car-mirror-benches/benches/a_benchmark.rs create mode 100644 car-mirror-benches/benches/in_memory.rs create mode 100644 car-mirror/src/test_utils/blockstore_utils.rs delete mode 100644 car-mirror/tests/integration_test.rs diff --git a/Cargo.lock b/Cargo.lock index 952b6fb..54cf3d0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -403,8 
+403,11 @@ dependencies = [ name = "car-mirror-benches" version = "0.1.0" dependencies = [ + "anyhow", + "async-std", "car-mirror", "criterion", + "wnfs-common", ] [[package]] diff --git a/car-mirror-benches/Cargo.toml b/car-mirror-benches/Cargo.toml index 4d5161e..7b2df9e 100644 --- a/car-mirror-benches/Cargo.toml +++ b/car-mirror-benches/Cargo.toml @@ -7,10 +7,13 @@ authors = ["Stephen Akinyemi "] [dependencies] car-mirror = { path = "../car-mirror", version = "0.1", features = ["test_utils"] } +wnfs-common = "0.1.23" +async-std = { version = "1.11", features = ["attributes"] } +anyhow = "1.0" [dev-dependencies] criterion = { version = "0.4", default-features = false } [[bench]] -name = "a_benchmark" +name = "in_memory" harness = false diff --git a/car-mirror-benches/benches/a_benchmark.rs b/car-mirror-benches/benches/a_benchmark.rs deleted file mode 100644 index 6650d1a..0000000 --- a/car-mirror-benches/benches/a_benchmark.rs +++ /dev/null @@ -1,15 +0,0 @@ -use criterion::{criterion_group, criterion_main, Criterion}; - -pub fn add_benchmark(c: &mut Criterion) { - let mut rvg = car_mirror::test_utils::Rvg::deterministic(); - let int_val_1 = rvg.sample(&(0..100i32)); - let int_val_2 = rvg.sample(&(0..100i32)); - - c.bench_function("add", |b| { - b.iter(|| { - car_mirror::add(int_val_1, int_val_2); - }) - }); -} -criterion_group!(benches, add_benchmark); -criterion_main!(benches); diff --git a/car-mirror-benches/benches/in_memory.rs b/car-mirror-benches/benches/in_memory.rs new file mode 100644 index 0000000..84b4291 --- /dev/null +++ b/car-mirror-benches/benches/in_memory.rs @@ -0,0 +1,88 @@ +use car_mirror::{ + common::Config, + pull, push, + test_utils::{arb_ipld_dag, links_to_padded_ipld, setup_blockstore}, +}; +use criterion::{criterion_group, criterion_main, BatchSize, Criterion}; +use wnfs_common::MemoryBlockStore; + +pub fn push(c: &mut Criterion) { + let mut rvg = car_mirror::test_utils::Rvg::deterministic(); + + c.bench_function("push cold", |b| { + 
b.iter_batched( + || { + let (blocks, root) = rvg.sample(&arb_ipld_dag( + 250..256, + 0.9, // Very highly connected + links_to_padded_ipld(10 * 1024), + )); + let store = async_std::task::block_on(setup_blockstore(blocks)).unwrap(); + (store, root) + }, + |(ref client_store, root)| { + let server_store = &MemoryBlockStore::new(); + let config = &Config::default(); + + // Simulate a multi-round protocol run in-memory + async_std::task::block_on(async move { + let mut request = push::request(root, None, config, client_store).await?; + loop { + let response = push::response(root, request, config, server_store).await?; + + if response.indicates_finished() { + break; + } + request = push::request(root, Some(response), config, client_store).await?; + } + + Ok::<(), anyhow::Error>(()) + }) + .unwrap(); + }, + BatchSize::LargeInput, + ) + }); +} + +pub fn pull(c: &mut Criterion) { + let mut rvg = car_mirror::test_utils::Rvg::deterministic(); + + c.bench_function("pull cold", |b| { + b.iter_batched( + || { + let (blocks, root) = rvg.sample(&arb_ipld_dag( + 250..256, + 0.9, // Very highly connected + links_to_padded_ipld(10 * 1024), // 10KiB random data per block + )); + let store = async_std::task::block_on(setup_blockstore(blocks)).unwrap(); + (store, root) + }, + |(ref server_store, root)| { + let client_store = &MemoryBlockStore::new(); + let config = &Config::default(); + + // Simulate a multi-round protocol run in-memory + async_std::task::block_on(async move { + let mut request = pull::request(root, None, config, client_store).await?; + loop { + let response = pull::response(root, request, config, server_store).await?; + request = pull::request(root, Some(response), config, client_store).await?; + + if request.indicates_finished() { + break; + } + } + + Ok::<(), anyhow::Error>(()) + }) + .unwrap(); + }, + BatchSize::LargeInput, + ) + }); +} + +criterion_group!(benches, push, pull); +criterion_main!(benches); diff --git a/car-mirror/src/dag_walk.rs 
b/car-mirror/src/dag_walk.rs index 6210272..707c617 100644 --- a/car-mirror/src/dag_walk.rs +++ b/car-mirror/src/dag_walk.rs @@ -162,7 +162,7 @@ mod tests { #[cfg(test)] mod proptests { use super::*; - use crate::test_utils::{encode, generate_dag}; + use crate::test_utils::{arb_ipld_dag, encode}; use futures::TryStreamExt; use libipld::{ multihash::{Code, MultihashDigest}, @@ -174,7 +174,7 @@ mod proptests { use wnfs_common::{BlockStore, MemoryBlockStore}; fn ipld_dags() -> impl Strategy, Cid)> { - generate_dag(256, |cids, _| { + arb_ipld_dag(1..256, 0.5, |cids, _| { let ipld = Ipld::List(cids.into_iter().map(Ipld::Link).collect()); let cid = Cid::new_v1( IpldCodec::DagCbor.into(), diff --git a/car-mirror/src/test_utils/blockstore_utils.rs b/car-mirror/src/test_utils/blockstore_utils.rs new file mode 100644 index 0000000..4bf9b4c --- /dev/null +++ b/car-mirror/src/test_utils/blockstore_utils.rs @@ -0,0 +1,26 @@ +use anyhow::Result; +use bytes::Bytes; +use libipld::{Cid, Ipld, IpldCodec}; +use libipld_core::codec::Encode; +use wnfs_common::{BlockStore, MemoryBlockStore}; + +/// Take a list of dag-cbor IPLD blocks and store all of them as dag-cbor in a +/// MemoryBlockStore & return it. +pub async fn setup_blockstore(blocks: Vec<(Cid, Ipld)>) -> Result { + let store = MemoryBlockStore::new(); + for (cid, ipld) in blocks.into_iter() { + let cid_store = store + .put_block(encode(&ipld), IpldCodec::DagCbor.into()) + .await?; + debug_assert_eq!(cid, cid_store); + } + + Ok(store) +} + +/// Encode some IPLD as dag-cbor. 
+pub fn encode(ipld: &Ipld) -> Bytes { + let mut vec = Vec::new(); + ipld.encode(IpldCodec::DagCbor, &mut vec).unwrap(); + Bytes::from(vec) +} diff --git a/car-mirror/src/test_utils/dag_strategy.rs b/car-mirror/src/test_utils/dag_strategy.rs index e8fa931..e751f55 100644 --- a/car-mirror/src/test_utils/dag_strategy.rs +++ b/car-mirror/src/test_utils/dag_strategy.rs @@ -1,20 +1,71 @@ -use std::{collections::HashSet, fmt::Debug}; - -use libipld::Cid; -use proptest::{strategy::Strategy, test_runner::TestRng}; +use super::encode; +use bytes::Bytes; +use libipld::{Cid, Ipld, IpldCodec}; +use libipld_core::multihash::{Code, MultihashDigest}; +use proptest::{prelude::Rng, strategy::Strategy, test_runner::TestRng}; use roaring_graphs::{arb_dag, DirectedAcyclicGraph, Vertex}; +use std::{ + collections::{BTreeMap, HashSet}, + fmt::Debug, + ops::Range, +}; /// A strategy for use with proptest to generate random DAGs (directed acyclic graphs). /// The strategy generates a list of blocks of type T and their CIDs, as well as /// the root block's CID. -pub fn generate_dag( - max_nodes: u16, +pub fn arb_ipld_dag( + vertex_count: impl Into>, + edge_probability: f64, generate_block: impl Fn(Vec, &mut TestRng) -> (Cid, T) + Clone, ) -> impl Strategy, Cid)> { - arb_dag(1..max_nodes, 0.5) + arb_dag(vertex_count, edge_probability) .prop_perturb(move |dag, mut rng| dag_to_nodes(&dag, &mut rng, generate_block.clone())) } +/// A block-generating function for use with `arb_ipld_dag`. +pub fn links_to_ipld(cids: Vec, _: &mut TestRng) -> (Cid, Ipld) { + let ipld = Ipld::List(cids.into_iter().map(Ipld::Link).collect()); + let cid = Cid::new_v1( + IpldCodec::DagCbor.into(), + Code::Blake3_256.digest(&encode(&ipld)), + ); + (cid, ipld) +} + +/// A block-generating function for use with `arb_ipld_dag`. 
+pub fn links_to_dag_cbor(cids: Vec, _: &mut TestRng) -> (Cid, Bytes) { + let ipld = Ipld::List(cids.into_iter().map(Ipld::Link).collect()); + let bytes = encode(&ipld); + let cid = Cid::new_v1(IpldCodec::DagCbor.into(), Code::Blake3_256.digest(&bytes)); + (cid, bytes) +} + +/// A block-generating function for use with `arb_ipld_dag`. +/// +/// Creates (a function that creates) an IPLD block with given links & some +/// random `padding_bytes` bytes attached. +pub fn links_to_padded_ipld( + padding_bytes: usize, +) -> impl Fn(Vec, &mut TestRng) -> (Cid, Ipld) + Clone { + move |cids, rng| { + let mut padding = Vec::with_capacity(padding_bytes); + for _ in 0..padding_bytes { + padding.push(rng.gen::()); + } + + let ipld = Ipld::Map(BTreeMap::from([ + ("data".into(), Ipld::Bytes(padding)), + ( + "links".into(), + Ipld::List(cids.into_iter().map(Ipld::Link).collect()), + ), + ])); + let bytes = encode(&ipld); + let cid = Cid::new_v1(IpldCodec::DagCbor.into(), Code::Blake3_256.digest(&bytes)); + (cid, ipld) + } +} + /// Turn a directed acyclic graph into a list of nodes (with their CID) and a root CID. /// This will select only the DAG that's reachable from the root. pub fn dag_to_nodes( diff --git a/car-mirror/src/test_utils/local_utils.rs b/car-mirror/src/test_utils/local_utils.rs index becf29f..5c7323f 100644 --- a/car-mirror/src/test_utils/local_utils.rs +++ b/car-mirror/src/test_utils/local_utils.rs @@ -1,16 +1,10 @@ ///! 
Crate-local test utilities -use super::{generate_dag, Rvg}; +use super::{arb_ipld_dag, links_to_padded_ipld, setup_blockstore, Rvg}; use crate::{common::references, dag_walk::DagWalk}; use anyhow::Result; -use bytes::Bytes; use futures::TryStreamExt; -use libipld::{Cid, Ipld, IpldCodec}; -use libipld_core::{ - codec::Encode, - multihash::{Code, MultihashDigest}, -}; -use proptest::{prelude::Rng, strategy::Strategy}; -use std::collections::BTreeMap; +use libipld::{Cid, Ipld}; +use proptest::strategy::Strategy; use wnfs_common::{BlockStore, MemoryBlockStore}; #[derive(Clone, Debug)] @@ -44,23 +38,7 @@ pub(crate) fn padded_dag_strategy( dag_size: u16, block_padding: usize, ) -> impl Strategy, Cid)> { - generate_dag(dag_size, move |cids, rng| { - let mut padding = Vec::with_capacity(block_padding); - for _ in 0..block_padding { - padding.push(rng.gen::()); - } - - let ipld = Ipld::Map(BTreeMap::from([ - ("data".into(), Ipld::Bytes(padding)), - ( - "links".into(), - Ipld::List(cids.into_iter().map(Ipld::Link).collect()), - ), - ])); - let bytes = encode(&ipld); - let cid = Cid::new_v1(IpldCodec::DagCbor.into(), Code::Blake3_256.digest(&bytes)); - (cid, ipld) - }) + arb_ipld_dag(1..dag_size, 0.5, links_to_padded_ipld(block_padding)) } pub(crate) fn variable_blocksize_dag() -> impl Strategy, Cid)> { @@ -76,20 +54,9 @@ pub(crate) fn variable_blocksize_dag() -> impl Strategy) -> Result { - let store = MemoryBlockStore::new(); - for (cid, ipld) in blocks.into_iter() { - let cid_store = store - .put_block(encode(&ipld), IpldCodec::DagCbor.into()) - .await?; - debug_assert_eq!(cid, cid_store); - } - - Ok(store) + (32..MAX_BLOCK_PADDING).prop_ind_flat_map(move |block_padding| { + arb_ipld_dag(1..MAX_DAG_NODES, 0.5, links_to_padded_ipld(block_padding)) + }) } pub(crate) async fn setup_random_dag( @@ -119,10 +86,3 @@ pub(crate) async fn total_dag_blocks(root: Cid, store: &impl BlockStore) -> Resu .await? 
.len()) } - -/// Encode some IPLD as dag-cbor -pub(crate) fn encode(ipld: &Ipld) -> Bytes { - let mut vec = Vec::new(); - ipld.encode(IpldCodec::DagCbor, &mut vec).unwrap(); - Bytes::from(vec) -} diff --git a/car-mirror/src/test_utils/mod.rs b/car-mirror/src/test_utils/mod.rs index aa4d5d3..11cc566 100644 --- a/car-mirror/src/test_utils/mod.rs +++ b/car-mirror/src/test_utils/mod.rs @@ -7,6 +7,10 @@ mod rvg; pub use dag_strategy::*; #[cfg(feature = "test_utils")] pub use rvg::*; +#[cfg(feature = "test_utils")] +mod blockstore_utils; +#[cfg(feature = "test_utils")] +pub use blockstore_utils::*; #[cfg(test)] mod local_utils; diff --git a/car-mirror/tests/integration_test.rs b/car-mirror/tests/integration_test.rs deleted file mode 100644 index d60e3f3..0000000 --- a/car-mirror/tests/integration_test.rs +++ /dev/null @@ -1,4 +0,0 @@ -#[test] -fn test_add() { - assert_eq!(car_mirror::add(3, 2), 5); -} From e3358fd3e97fa23c4c73583d1e4f6bb7cfda890e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= Date: Mon, 21 Aug 2023 19:28:41 +0200 Subject: [PATCH 21/35] Create benchmarks with throttled `get_block`s --- Cargo.lock | 3 + car-mirror-benches/Cargo.toml | 11 +- .../benches/artificially_slow_blockstore.rs | 117 ++++++++++++++++++ car-mirror/src/test_utils/blockstore_utils.rs | 12 +- 4 files changed, 140 insertions(+), 3 deletions(-) create mode 100644 car-mirror-benches/benches/artificially_slow_blockstore.rs diff --git a/Cargo.lock b/Cargo.lock index 54cf3d0..d80f0db 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -405,8 +405,11 @@ version = "0.1.0" dependencies = [ "anyhow", "async-std", + "async-trait", + "bytes", "car-mirror", "criterion", + "libipld", "wnfs-common", ] diff --git a/car-mirror-benches/Cargo.toml b/car-mirror-benches/Cargo.toml index 7b2df9e..ba1c467 100644 --- a/car-mirror-benches/Cargo.toml +++ b/car-mirror-benches/Cargo.toml @@ -6,10 +6,13 @@ edition = "2021" authors = ["Stephen Akinyemi "] [dependencies] +anyhow = "1.0" +async-std = { 
version = "1.11", features = ["attributes"] } +async-trait = "0.1" +bytes = "1.4.0" car-mirror = { path = "../car-mirror", version = "0.1", features = ["test_utils"] } +libipld = "0.16.0" wnfs-common = "0.1.23" -async-std = { version = "1.11", features = ["attributes"] } -anyhow = "1.0" [dev-dependencies] criterion = { version = "0.4", default-features = false } @@ -17,3 +20,7 @@ criterion = { version = "0.4", default-features = false } [[bench]] name = "in_memory" harness = false + +[[bench]] +name = "artificially_slow_blockstore" +harness = false diff --git a/car-mirror-benches/benches/artificially_slow_blockstore.rs b/car-mirror-benches/benches/artificially_slow_blockstore.rs new file mode 100644 index 0000000..f74ec72 --- /dev/null +++ b/car-mirror-benches/benches/artificially_slow_blockstore.rs @@ -0,0 +1,117 @@ +use anyhow::Result; +use async_trait::async_trait; +use bytes::Bytes; +use car_mirror::{ + common::Config, + pull, push, + test_utils::{arb_ipld_dag, links_to_padded_ipld, setup_blockstore}, +}; +use criterion::{criterion_group, criterion_main, BatchSize, Criterion}; +use libipld::Cid; +use std::time::Duration; +use wnfs_common::{BlockStore, MemoryBlockStore}; + +pub fn push_throttled(c: &mut Criterion) { + let mut rvg = car_mirror::test_utils::Rvg::deterministic(); + + c.bench_function("push cold, get_block throttled", |b| { + b.iter_batched( + || { + let (blocks, root) = rvg.sample(&arb_ipld_dag( + 60..64, + 0.9, // Very highly connected + links_to_padded_ipld(10 * 1024), + )); + let store = async_std::task::block_on(setup_blockstore(blocks)).unwrap(); + (store, root) + }, + |(client_store, root)| { + let client_store = &ThrottledBlockStore(client_store); + let server_store = &ThrottledBlockStore::new(); + let config = &Config::default(); + + // Simulate a multi-round protocol run in-memory + async_std::task::block_on(async move { + let mut request = push::request(root, None, config, client_store).await?; + loop { + let response = 
push::response(root, request, config, server_store).await?; + + if response.indicates_finished() { + break; + } + request = push::request(root, Some(response), config, client_store).await?; + } + + Ok::<(), anyhow::Error>(()) + }) + .unwrap(); + }, + BatchSize::LargeInput, + ) + }); +} + +pub fn pull_throttled(c: &mut Criterion) { + let mut rvg = car_mirror::test_utils::Rvg::deterministic(); + + c.bench_function("pull cold, get_block throttled", |b| { + b.iter_batched( + || { + let (blocks, root) = rvg.sample(&arb_ipld_dag( + 60..64, + 0.9, // Very highly connected + links_to_padded_ipld(10 * 1024), // 10KiB random data added + )); + let store = async_std::task::block_on(setup_blockstore(blocks)).unwrap(); + (store, root) + }, + |(server_store, root)| { + let server_store = &ThrottledBlockStore(server_store); + let client_store = &ThrottledBlockStore::new(); + let config = &Config::default(); + + // Simulate a multi-round protocol run in-memory + async_std::task::block_on(async move { + let mut request = pull::request(root, None, config, client_store).await?; + loop { + let response = pull::response(root, request, config, server_store).await?; + request = pull::request(root, Some(response), config, client_store).await?; + + if request.indicates_finished() { + break; + } + } + + Ok::<(), anyhow::Error>(()) + }) + .unwrap(); + }, + BatchSize::LargeInput, + ) + }); +} + +#[derive(Debug, Clone)] +struct ThrottledBlockStore(MemoryBlockStore); + +#[async_trait(?Send)] +impl BlockStore for ThrottledBlockStore { + async fn get_block(&self, cid: &Cid) -> Result { + let bytes = self.0.get_block(cid).await?; + async_std::task::sleep(Duration::from_micros(50)).await; // Block fetching is artifically slowed by 50 microseconds + Ok(bytes) + } + + async fn put_block(&self, bytes: impl Into, codec: u64) -> Result { + self.0.put_block(bytes, codec).await + } +} + +impl ThrottledBlockStore { + pub fn new() -> Self { + Self(MemoryBlockStore::new()) + } +} + +criterion_group!(benches, 
push_throttled, pull_throttled); +criterion_main!(benches); diff --git a/car-mirror/src/test_utils/blockstore_utils.rs b/car-mirror/src/test_utils/blockstore_utils.rs index 4bf9b4c..c3ab2c2 100644 --- a/car-mirror/src/test_utils/blockstore_utils.rs +++ b/car-mirror/src/test_utils/blockstore_utils.rs @@ -8,6 +8,16 @@ use wnfs_common::{BlockStore, MemoryBlockStore}; /// MemoryBlockStore & return it. pub async fn setup_blockstore(blocks: Vec<(Cid, Ipld)>) -> Result { let store = MemoryBlockStore::new(); + setup_existing_blockstore(blocks, &store).await?; + Ok(store) +} + +/// Take a list of dag-cbor IPLD blocks and store all of them as dag-cbor in +/// the given `BlockStore`. +pub async fn setup_existing_blockstore( + blocks: Vec<(Cid, Ipld)>, + store: &impl BlockStore, +) -> Result<()> { for (cid, ipld) in blocks.into_iter() { let cid_store = store .put_block(encode(&ipld), IpldCodec::DagCbor.into()) @@ -15,7 +25,7 @@ pub async fn setup_blockstore(blocks: Vec<(Cid, Ipld)>) -> Result Date: Mon, 21 Aug 2023 19:37:28 +0200 Subject: [PATCH 22/35] Add some perf improvment idea comments --- car-mirror/src/dag_walk.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/car-mirror/src/dag_walk.rs b/car-mirror/src/dag_walk.rs index 707c617..2f9b3f0 100644 --- a/car-mirror/src/dag_walk.rs +++ b/car-mirror/src/dag_walk.rs @@ -72,6 +72,9 @@ impl DagWalk { } }; + // TODO: Two opportunities for performance improvement: + // - skip Raw CIDs. They can't have further links (but needs adjustment to this function's return type) + // - run multiple `get_block` calls concurrently let block = store.get_block(&cid).await?; for ref_cid in references(cid, &block)? 
{ if !self.visited.contains(&ref_cid) { From 8b3d6494e536210148c76666dfed4dc76d147449 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= Date: Mon, 21 Aug 2023 19:40:35 +0200 Subject: [PATCH 23/35] Fix lints --- car-mirror/src/common.rs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/car-mirror/src/common.rs b/car-mirror/src/common.rs index ef48acc..6ede3e5 100644 --- a/car-mirror/src/common.rs +++ b/car-mirror/src/common.rs @@ -243,14 +243,14 @@ impl From for ReceiverState { } } -impl Into for ReceiverState { - fn into(self) -> PushResponse { +impl From for PushResponse { + fn from(receiver_state: ReceiverState) -> PushResponse { let ReceiverState { missing_subgraph_roots, have_cids_bloom, - } = self; + } = receiver_state; - let (bloom_k, bloom) = Self::bloom_serialize(have_cids_bloom); + let (bloom_k, bloom) = ReceiverState::bloom_serialize(have_cids_bloom); PushResponse { subgraph_roots: missing_subgraph_roots, @@ -260,14 +260,14 @@ impl Into for ReceiverState { } } -impl Into for ReceiverState { - fn into(self) -> PullRequest { +impl From for PullRequest { + fn from(receiver_state: ReceiverState) -> PullRequest { let ReceiverState { missing_subgraph_roots, have_cids_bloom, - } = self; + } = receiver_state; - let (bloom_k, bloom) = Self::bloom_serialize(have_cids_bloom); + let (bloom_k, bloom) = ReceiverState::bloom_serialize(have_cids_bloom); PullRequest { resources: missing_subgraph_roots, From 10e638aee948381c81c2e35e1005e7ea8d793f96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= Date: Mon, 21 Aug 2023 19:47:28 +0200 Subject: [PATCH 24/35] set MSRV to 1.66 --- .github/workflows/tests_and_checks.yml | 2 +- Cargo.toml | 6 +++--- car-mirror-wasm/Cargo.toml | 2 +- car-mirror/Cargo.toml | 10 +++++----- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/workflows/tests_and_checks.yml b/.github/workflows/tests_and_checks.yml index b728ad7..8acc4f8 100644 --- 
a/.github/workflows/tests_and_checks.yml +++ b/.github/workflows/tests_and_checks.yml @@ -21,7 +21,7 @@ jobs: - stable - nightly # minimum version - - 1.64 + - 1.66 steps: - name: Checkout Repository uses: actions/checkout@v3 diff --git a/Cargo.toml b/Cargo.toml index 96fa3b8..aa734b0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,10 +1,10 @@ [workspace] members = [ - "examples", "car-mirror", "car-mirror-benches", "car-mirror-wasm" -] +, + "examples"] # See https://doc.rust-lang.org/cargo/reference/profiles.html for more info. [profile.release.package.car-mirror-wasm] @@ -23,4 +23,4 @@ opt-level = "s" # or 'z' to optimize "aggressively" for size # See https://blog.rust-lang.org/2021/03/25/Rust-1.51.0.html#splitting-debug-information [profile.dev] split-debuginfo = "unpacked" -opt-level = 3 \ No newline at end of file +opt-level = 3 diff --git a/car-mirror-wasm/Cargo.toml b/car-mirror-wasm/Cargo.toml index 97e75a3..0dcdd3a 100644 --- a/car-mirror-wasm/Cargo.toml +++ b/car-mirror-wasm/Cargo.toml @@ -8,7 +8,7 @@ include = ["/src", "README.md", "LICENSE-APACHE", "LICENSE-MIT"] license = "Apache-2.0 or MIT" readme = "README.md" edition = "2021" -rust-version = "1.64" +rust-version = "1.66" documentation = "https://docs.rs/car-mirror-wasm" repository = "https://github.com/fission-codes/rs-car-mirror/tree/main/car-mirror-wasm" authors = ["Stephen Akinyemi "] diff --git a/car-mirror/Cargo.toml b/car-mirror/Cargo.toml index e5216c2..8771faf 100644 --- a/car-mirror/Cargo.toml +++ b/car-mirror/Cargo.toml @@ -8,7 +8,7 @@ include = ["/src", "README.md", "LICENSE-APACHE", "LICENSE-MIT"] license = "Apache-2.0 or MIT" readme = "README.md" edition = "2021" -rust-version = "1.64" +rust-version = "1.66" documentation = "https://docs.rs/car-mirror" repository = "https://github.com/fission-codes/rs-car-mirror/tree/main/car-mirror" authors = ["Stephen Akinyemi "] @@ -25,6 +25,7 @@ doc = true [dependencies] anyhow = "1.0" async-stream = "0.3.5" +async-trait = "0.1.73" bytes = "1.4.0" 
deterministic-bloom = { git = "https://github.com/wnfs-wg/deterministic-bloom#a8cd85b" } fixedbitset = "0.4.2" @@ -34,14 +35,13 @@ libipld = "0.16.0" libipld-core = "0.16.0" proptest = { version = "1.1", optional = true } roaring-graphs = "0.12" -tokio-util = { version = "0.7.8", features = ["compat"] } +serde = "1.0.183" +serde_ipld_dagcbor = "0.4.0" tokio = { version = "^1", features = ["io-util"] } +tokio-util = { version = "0.7.8", features = ["compat"] } tracing = "0.1" tracing-subscriber = "0.3" wnfs-common = "0.1.23" -async-trait = "0.1.73" -serde_ipld_dagcbor = "0.4.0" -serde = "1.0.183" [dev-dependencies] async-std = { version = "1.11", features = ["attributes"] } From 2bce61a12035f3dc8226a8cefc9d9545b7c7f16b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= Date: Mon, 21 Aug 2023 19:55:03 +0200 Subject: [PATCH 25/35] Specifically allow the BSL license --- deny.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/deny.toml b/deny.toml index f532111..a6c5ca1 100644 --- a/deny.toml +++ b/deny.toml @@ -76,7 +76,8 @@ allow = [ "BSD-2-Clause", "BSD-3-Clause", "ISC", - "Zlib" + "Zlib", + "BSL-1.0" ] # List of explicitly disallowed licenses # See https://spdx.org/licenses/ for list of possible licenses From 208519a0ad0cb3f713c256b7046c530987ec603e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= Date: Mon, 21 Aug 2023 20:14:52 +0200 Subject: [PATCH 26/35] Depend on published `deterministic-bloom` --- Cargo.lock | 3 ++- car-mirror/Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d80f0db..6278cd4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -670,7 +670,8 @@ dependencies = [ [[package]] name = "deterministic-bloom" version = "0.1.0" -source = "git+https://github.com/wnfs-wg/deterministic-bloom#a8cd85b#a8cd85b1d71da9f79f5058c0a20e53a83a283230" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"12a3873e91e360aee2403cbafd2beb42f02ace06da9b053574518f003aa2490d" dependencies = [ "bitvec", "miette", diff --git a/car-mirror/Cargo.toml b/car-mirror/Cargo.toml index 8771faf..77855f8 100644 --- a/car-mirror/Cargo.toml +++ b/car-mirror/Cargo.toml @@ -27,7 +27,7 @@ anyhow = "1.0" async-stream = "0.3.5" async-trait = "0.1.73" bytes = "1.4.0" -deterministic-bloom = { git = "https://github.com/wnfs-wg/deterministic-bloom#a8cd85b" } +deterministic-bloom = "0.1" fixedbitset = "0.4.2" futures = "0.3.28" iroh-car = "0.3.0" From cbe9246cba3b6bef30b98964c8fdf3c155dbd532 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= Date: Mon, 21 Aug 2023 20:15:21 +0200 Subject: [PATCH 27/35] Lint --- car-mirror/src/common.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/car-mirror/src/common.rs b/car-mirror/src/common.rs index 6ede3e5..bdc91b7 100644 --- a/car-mirror/src/common.rs +++ b/car-mirror/src/common.rs @@ -86,7 +86,7 @@ pub async fn block_send( .try_collect() .await?; - let bloom = have_cids_bloom.unwrap_or(BloomFilter::new_with(1, Box::new([0]))); // An empty bloom that contains nothing + let bloom = have_cids_bloom.unwrap_or_else(|| BloomFilter::new_with(1, Box::new([0]))); // An empty bloom that contains nothing let mut writer = CarWriter::new( CarHeader::new_v1( From d48e3828707384c0bdb160323993f94e87f7aa97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= Date: Mon, 21 Aug 2023 20:17:57 +0200 Subject: [PATCH 28/35] Some helpful doc --- car-mirror/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/car-mirror/src/lib.rs b/car-mirror/src/lib.rs index 04ef44a..b40d1a0 100644 --- a/car-mirror/src/lib.rs +++ b/car-mirror/src/lib.rs @@ -4,7 +4,7 @@ //! car-mirror -/// Test utilities. +/// Test utilities. Enabled with the `test_utils` feature flag. 
#[cfg(any(test, feature = "test_utils"))] #[cfg_attr(docsrs, doc(cfg(feature = "test_utils")))] pub mod test_utils; From a257f6e9210cf82e0849a4008c711d18804d34e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= Date: Fri, 25 Aug 2023 11:40:44 +0200 Subject: [PATCH 29/35] Updates from feedback --- Cargo.lock | 23 ------ car-mirror/Cargo.toml | 8 +- car-mirror/src/common.rs | 19 +++-- car-mirror/src/dag_walk.rs | 9 ++- car-mirror/src/incremental_verification.rs | 79 ++++++++++++------- car-mirror/src/test_utils/blockstore_utils.rs | 48 ++++++++--- car-mirror/src/test_utils/dag_strategy.rs | 12 ++- 7 files changed, 117 insertions(+), 81 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6278cd4..32f2802 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -382,7 +382,6 @@ dependencies = [ "bytes", "car-mirror", "deterministic-bloom", - "fixedbitset", "futures", "iroh-car", "libipld", @@ -392,8 +391,6 @@ dependencies = [ "serde", "serde_ipld_dagcbor", "test-strategy", - "tokio", - "tokio-util", "tracing", "tracing-subscriber", "wnfs-common", @@ -741,12 +738,6 @@ dependencies = [ "instant", ] -[[package]] -name = "fixedbitset" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" - [[package]] name = "fnv" version = "1.0.7" @@ -1850,20 +1841,6 @@ dependencies = [ "pin-project-lite", ] -[[package]] -name = "tokio-util" -version = "0.7.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "806fe8c2c87eccc8b3267cbae29ed3ab2d0bd37fca70ab622e46aaa9375ddb7d" -dependencies = [ - "bytes", - "futures-core", - "futures-io", - "futures-sink", - "pin-project-lite", - "tokio", -] - [[package]] name = "toml" version = "0.5.11" diff --git a/car-mirror/Cargo.toml b/car-mirror/Cargo.toml index 77855f8..3a59ad5 100644 --- a/car-mirror/Cargo.toml +++ b/car-mirror/Cargo.toml @@ -28,17 +28,14 @@ async-stream = "0.3.5" async-trait = "0.1.73" 
bytes = "1.4.0" deterministic-bloom = "0.1" -fixedbitset = "0.4.2" futures = "0.3.28" iroh-car = "0.3.0" libipld = "0.16.0" libipld-core = "0.16.0" proptest = { version = "1.1", optional = true } -roaring-graphs = "0.12" +roaring-graphs = { version = "0.12", optional = true } serde = "1.0.183" serde_ipld_dagcbor = "0.4.0" -tokio = { version = "^1", features = ["io-util"] } -tokio-util = { version = "0.7.8", features = ["compat"] } tracing = "0.1" tracing-subscriber = "0.3" wnfs-common = "0.1.23" @@ -47,11 +44,12 @@ wnfs-common = "0.1.23" async-std = { version = "1.11", features = ["attributes"] } car-mirror = { path = ".", features = ["test_utils"] } proptest = "1.1" +roaring-graphs = "0.12" test-strategy = "0.3" [features] default = [] -test_utils = ["proptest"] +test_utils = ["proptest", "roaring-graphs"] [package.metadata.docs.rs] all-features = true diff --git a/car-mirror/src/common.rs b/car-mirror/src/common.rs index bdc91b7..cc26d8c 100644 --- a/car-mirror/src/common.rs +++ b/car-mirror/src/common.rs @@ -10,7 +10,7 @@ use wnfs_common::BlockStore; use crate::{ dag_walk::DagWalk, - incremental_verification::IncrementalDagVerification, + incremental_verification::{BlockState, IncrementalDagVerification}, messages::{PullRequest, PushResponse}, }; @@ -108,7 +108,7 @@ pub async fn block_send( let mut dag_walk = DagWalk::breadth_first(subgraph_roots.clone()); while let Some((cid, block)) = dag_walk.next(store).await? 
{ if bloom.contains(&cid.to_bytes()) && !subgraph_roots.contains(&cid) { - break; + continue; } writer.write(cid, &block).await?; @@ -157,9 +157,18 @@ pub async fn block_receive( ); } - dag_verification - .verify_and_store_block((cid, block), store) - .await?; + match dag_verification.block_state(cid) { + BlockState::Have => continue, + BlockState::Unexpected => { + eprintln!("Warn: Received block {cid} out of order, may be due to bloom false positive."); + break; + } + BlockState::Want => { + dag_verification + .verify_and_store_block((cid, block), store) + .await?; + } + } } } diff --git a/car-mirror/src/dag_walk.rs b/car-mirror/src/dag_walk.rs index 2f9b3f0..9050648 100644 --- a/car-mirror/src/dag_walk.rs +++ b/car-mirror/src/dag_walk.rs @@ -165,7 +165,7 @@ mod tests { #[cfg(test)] mod proptests { use super::*; - use crate::test_utils::{arb_ipld_dag, encode}; + use crate::test_utils::arb_ipld_dag; use futures::TryStreamExt; use libipld::{ multihash::{Code, MultihashDigest}, @@ -174,14 +174,14 @@ mod proptests { use proptest::strategy::Strategy; use std::collections::BTreeSet; use test_strategy::proptest; - use wnfs_common::{BlockStore, MemoryBlockStore}; + use wnfs_common::{dagcbor::encode, BlockStore, MemoryBlockStore}; fn ipld_dags() -> impl Strategy, Cid)> { arb_ipld_dag(1..256, 0.5, |cids, _| { let ipld = Ipld::List(cids.into_iter().map(Ipld::Link).collect()); let cid = Cid::new_v1( IpldCodec::DagCbor.into(), - Code::Blake3_256.digest(&encode(&ipld)), + Code::Blake3_256.digest(&encode(&ipld).unwrap()), ); (cid, ipld) }) @@ -193,8 +193,9 @@ mod proptests { let (dag, root) = dag; let store = &MemoryBlockStore::new(); for (cid, ipld) in dag.iter() { + let block: Bytes = encode(ipld).unwrap().into(); let cid_store = store - .put_block(encode(ipld), IpldCodec::DagCbor.into()) + .put_block(block, IpldCodec::DagCbor.into()) .await .unwrap(); assert_eq!(*cid, cid_store); diff --git a/car-mirror/src/incremental_verification.rs 
b/car-mirror/src/incremental_verification.rs index 24edb3b..506c0c7 100644 --- a/car-mirror/src/incremental_verification.rs +++ b/car-mirror/src/incremental_verification.rs @@ -1,8 +1,8 @@ -use crate::{common::references, dag_walk::DagWalk}; +use crate::dag_walk::DagWalk; use anyhow::{bail, Result}; use bytes::Bytes; use libipld_core::cid::Cid; -use std::{collections::HashSet, eprintln}; +use std::{collections::HashSet, matches}; use wnfs_common::{BlockStore, BlockStoreError}; /// A data structure that keeps state about incremental DAG verification. @@ -14,6 +14,17 @@ pub struct IncrementalDagVerification { pub have_cids: HashSet, } +/// The state of a block retrieval +#[derive(Clone, Debug, PartialEq, Eq)] +pub enum BlockState { + /// The block was already received/is already stored + Have, + /// We know we will need this block + Want, + /// We don't know whether we'll need this block + Unexpected, +} + impl IncrementalDagVerification { /// Initiate incremental DAG verification of given roots. 
/// @@ -23,9 +34,18 @@ impl IncrementalDagVerification { roots: impl IntoIterator, store: &impl BlockStore, ) -> Result { - let mut want_cids = HashSet::new(); - let mut have_cids = HashSet::new(); - let mut dag_walk = DagWalk::breadth_first(roots); + let mut this = Self { + want_cids: roots.into_iter().collect(), + have_cids: HashSet::new(), + }; + + this.update_have_cids(store).await?; + + Ok(this) + } + + async fn update_have_cids(&mut self, store: &impl BlockStore) -> Result<()> { + let mut dag_walk = DagWalk::breadth_first(self.want_cids.iter().cloned()); loop { match dag_walk.next(store).await { @@ -33,13 +53,14 @@ impl IncrementalDagVerification { if let Some(BlockStoreError::CIDNotFound(not_found)) = e.downcast_ref::() { - want_cids.insert(*not_found); + self.want_cids.insert(*not_found); } else { bail!(e); } } Ok(Some((cid, _))) => { - have_cids.insert(cid); + self.want_cids.remove(&cid); + self.have_cids.insert(cid); } Ok(None) => { break; @@ -47,15 +68,27 @@ impl IncrementalDagVerification { } } - Ok(Self { - want_cids, - have_cids, - }) + Ok(()) + } + + /// Check the state of a CID to find out whether + /// - we expect it as one of the next possible blocks to receive (Want) + /// - we have already stored it (Have) + /// - we don't know whether we need it (Unexpected) + pub fn block_state(&self, cid: Cid) -> BlockState { + if self.want_cids.contains(&cid) { + BlockState::Want + } else if self.have_cids.contains(&cid) { + BlockState::Have + } else { + BlockState::Unexpected + } } /// Verify that - /// - the block actually hashes to the hash from given CID and /// - the block is part of the graph below the roots. + /// - the block hasn't been received before + /// - the block actually hashes to the hash from given CID and /// /// And finally stores the block in the blockstore. 
/// @@ -71,29 +104,21 @@ impl IncrementalDagVerification { ) -> Result<()> { let (cid, bytes) = block; - if !self.want_cids.contains(&cid) { - if self.have_cids.contains(&cid) { - eprintln!("Warn: Received {cid}, even though we already have it"); - } else { - bail!("Unexpected block or block out of order: {cid}"); - } + let block_state = self.block_state(cid); + if !matches!(block_state, BlockState::Want) { + bail!("Incremental verification failed. Block state is: {block_state:?}, expected BlockState::Want"); } - let refs = references(cid, &bytes)?; + // TODO(matheus23): Verify hash before putting it into the blockstore. let result_cid = store.put_block(bytes, cid.codec()).await?; + // TODO(matheus23): The BlockStore chooses the hashing function, + // so it may choose a different hashing function, causing a mismatch if result_cid != cid { bail!("Digest mismatch in CAR file: expected {cid}, got {result_cid}"); } - for ref_cid in refs { - if !self.have_cids.contains(&ref_cid) { - self.want_cids.insert(ref_cid); - } - } - - self.want_cids.remove(&cid); - self.have_cids.insert(cid); + self.update_have_cids(store).await?; Ok(()) } diff --git a/car-mirror/src/test_utils/blockstore_utils.rs b/car-mirror/src/test_utils/blockstore_utils.rs index c3ab2c2..4bce5aa 100644 --- a/car-mirror/src/test_utils/blockstore_utils.rs +++ b/car-mirror/src/test_utils/blockstore_utils.rs @@ -1,8 +1,9 @@ +use crate::common::references; use anyhow::Result; use bytes::Bytes; use libipld::{Cid, Ipld, IpldCodec}; -use libipld_core::codec::Encode; -use wnfs_common::{BlockStore, MemoryBlockStore}; +use std::io::Write; +use wnfs_common::{dagcbor::encode, BlockStore, MemoryBlockStore}; /// Take a list of dag-cbor IPLD blocks and store all of them as dag-cbor in a /// MemoryBlockStore & return it. 
@@ -19,18 +20,45 @@ pub async fn setup_existing_blockstore( store: &impl BlockStore, ) -> Result<()> { for (cid, ipld) in blocks.into_iter() { - let cid_store = store - .put_block(encode(&ipld), IpldCodec::DagCbor.into()) - .await?; + let block: Bytes = encode(&ipld)?.into(); + let cid_store = store.put_block(block, IpldCodec::DagCbor.into()).await?; debug_assert_eq!(cid, cid_store); } Ok(()) } -/// Encode some IPLD as dag-cbor. -pub fn encode(ipld: &Ipld) -> Bytes { - let mut vec = Vec::new(); - ipld.encode(IpldCodec::DagCbor, &mut vec).unwrap(); - Bytes::from(vec) +/// Print a DAG as a dot file with truncated CIDs +pub fn dag_to_dot( + writer: &mut impl Write, + blocks: impl IntoIterator, +) -> Result<()> { + writeln!(writer, "digraph {{")?; + + for (cid, ipld) in blocks { + let bytes = encode(&ipld)?; + let refs = references(cid, &bytes)?; + for to_cid in refs { + print_truncated_string(writer, cid.to_string())?; + write!(writer, " -> ")?; + print_truncated_string(writer, to_cid.to_string())?; + writeln!(writer)?; + } + } + + writeln!(writer, "}}")?; + + Ok(()) +} + +fn print_truncated_string(writer: &mut impl Write, mut string: String) -> Result<()> { + if string.len() > 20 { + let mut string_rest = string.split_off(10); + let string_end = string_rest.split_off(std::cmp::max(string_rest.len(), 10) - 10); + write!(writer, "\"{string}...{string_end}\"")?; + } else { + write!(writer, "\"{string}\"")?; + } + + Ok(()) } diff --git a/car-mirror/src/test_utils/dag_strategy.rs b/car-mirror/src/test_utils/dag_strategy.rs index e751f55..571d376 100644 --- a/car-mirror/src/test_utils/dag_strategy.rs +++ b/car-mirror/src/test_utils/dag_strategy.rs @@ -1,4 +1,3 @@ -use super::encode; use bytes::Bytes; use libipld::{Cid, Ipld, IpldCodec}; use libipld_core::multihash::{Code, MultihashDigest}; @@ -9,6 +8,7 @@ use std::{ fmt::Debug, ops::Range, }; +use wnfs_common::dagcbor::encode; /// A strategy for use with proptest to generate random DAGs (directed acyclic graphs). 
/// The strategy generates a list of blocks of type T and their CIDs, as well as @@ -25,17 +25,15 @@ pub fn arb_ipld_dag( /// A block-generating function for use with `arb_ipld_dag`. pub fn links_to_ipld(cids: Vec, _: &mut TestRng) -> (Cid, Ipld) { let ipld = Ipld::List(cids.into_iter().map(Ipld::Link).collect()); - let cid = Cid::new_v1( - IpldCodec::DagCbor.into(), - Code::Blake3_256.digest(&encode(&ipld)), - ); + let bytes = encode(&ipld).unwrap(); + let cid = Cid::new_v1(IpldCodec::DagCbor.into(), Code::Blake3_256.digest(&bytes)); (cid, ipld) } /// A block-generating function for use with `arb_ipld_dag`. pub fn links_to_dag_cbor(cids: Vec, _: &mut TestRng) -> (Cid, Bytes) { let ipld = Ipld::List(cids.into_iter().map(Ipld::Link).collect()); - let bytes = encode(&ipld); + let bytes: Bytes = encode(&ipld).unwrap().into(); let cid = Cid::new_v1(IpldCodec::DagCbor.into(), Code::Blake3_256.digest(&bytes)); (cid, bytes) } @@ -60,7 +58,7 @@ pub fn links_to_padded_ipld( Ipld::List(cids.into_iter().map(Ipld::Link).collect()), ), ])); - let bytes = encode(&ipld); + let bytes = encode(&ipld).unwrap(); let cid = Cid::new_v1(IpldCodec::DagCbor.into(), Code::Blake3_256.digest(&bytes)); (cid, ipld) } From 9d55be9f17c31620b3774e5441a0f60b969d5005 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= Date: Fri, 25 Aug 2023 11:43:23 +0200 Subject: [PATCH 30/35] Fix typo --- car-mirror/src/push.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/car-mirror/src/push.rs b/car-mirror/src/push.rs index 03f54fc..0fb229d 100644 --- a/car-mirror/src/push.rs +++ b/car-mirror/src/push.rs @@ -32,7 +32,7 @@ pub async fn request( /// in the given `store`, if the blocks can be shown to relate /// to the `root` CID. /// -/// Returnes a response that gives the client information about what +/// Returns a response that gives the client information about what /// other data remains to be fetched. 
pub async fn response( root: Cid, From 1cd43c7ed4e5ce815ba26ffd0315316d7ce12d18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= Date: Fri, 25 Aug 2023 11:48:51 +0200 Subject: [PATCH 31/35] Check the block hash prior to storing the block --- car-mirror/src/incremental_verification.rs | 23 ++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/car-mirror/src/incremental_verification.rs b/car-mirror/src/incremental_verification.rs index 506c0c7..0960699 100644 --- a/car-mirror/src/incremental_verification.rs +++ b/car-mirror/src/incremental_verification.rs @@ -1,7 +1,10 @@ use crate::dag_walk::DagWalk; -use anyhow::{bail, Result}; +use anyhow::{anyhow, bail, Result}; use bytes::Bytes; -use libipld_core::cid::Cid; +use libipld_core::{ + cid::Cid, + multihash::{Code, MultihashDigest}, +}; use std::{collections::HashSet, matches}; use wnfs_common::{BlockStore, BlockStoreError}; @@ -109,13 +112,25 @@ impl IncrementalDagVerification { bail!("Incremental verification failed. Block state is: {block_state:?}, expected BlockState::Want"); } - // TODO(matheus23): Verify hash before putting it into the blockstore. 
+ let hash_func: Code = cid + .hash() + .code() + .try_into() + .map_err(|_| anyhow!("Unsupported hash code in CID {cid}"))?; + + let hash = hash_func.digest(bytes.as_ref()); + + if &hash != cid.hash() { + let result_cid = Cid::new_v1(cid.codec(), hash); + bail!("Digest mismatch in CAR file: expected {cid}, got {result_cid}"); + } + let result_cid = store.put_block(bytes, cid.codec()).await?; // TODO(matheus23): The BlockStore chooses the hashing function, // so it may choose a different hashing function, causing a mismatch if result_cid != cid { - bail!("Digest mismatch in CAR file: expected {cid}, got {result_cid}"); + bail!("BlockStore uses an incompatible hashing function: CID mismatched, expected {cid}, got {result_cid}"); } self.update_have_cids(store).await?; From a6d33ebaee81e4baa85801a495abf24e91c7ad3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= Date: Fri, 25 Aug 2023 11:59:39 +0200 Subject: [PATCH 32/35] Use `#[serde(flatten)]` to clean up serialization --- car-mirror/src/common.rs | 39 ++++++++++++++++++------------------- car-mirror/src/messages.rs | 40 +++++++++++++------------------------- 2 files changed, 32 insertions(+), 47 deletions(-) diff --git a/car-mirror/src/common.rs b/car-mirror/src/common.rs index cc26d8c..acbd0bf 100644 --- a/car-mirror/src/common.rs +++ b/car-mirror/src/common.rs @@ -11,7 +11,7 @@ use wnfs_common::BlockStore; use crate::{ dag_walk::DagWalk, incremental_verification::{BlockState, IncrementalDagVerification}, - messages::{PullRequest, PushResponse}, + messages::{Bloom, PullRequest, PushResponse}, }; //-------------------------------------------------------------------------------------------------- @@ -226,28 +226,23 @@ impl From for ReceiverState { fn from(push: PushResponse) -> Self { let PushResponse { subgraph_roots, - bloom_k, bloom, } = push; Self { missing_subgraph_roots: subgraph_roots, - have_cids_bloom: Self::bloom_deserialize(bloom_k, bloom), + have_cids_bloom: 
Self::bloom_deserialize(bloom), } } } impl From for ReceiverState { fn from(pull: PullRequest) -> Self { - let PullRequest { - resources, - bloom_k, - bloom, - } = pull; + let PullRequest { resources, bloom } = pull; Self { missing_subgraph_roots: resources, - have_cids_bloom: Self::bloom_deserialize(bloom_k, bloom), + have_cids_bloom: Self::bloom_deserialize(bloom), } } } @@ -259,11 +254,10 @@ impl From for PushResponse { have_cids_bloom, } = receiver_state; - let (bloom_k, bloom) = ReceiverState::bloom_serialize(have_cids_bloom); + let bloom = ReceiverState::bloom_serialize(have_cids_bloom); PushResponse { subgraph_roots: missing_subgraph_roots, - bloom_k, bloom, } } @@ -276,31 +270,36 @@ impl From for PullRequest { have_cids_bloom, } = receiver_state; - let (bloom_k, bloom) = ReceiverState::bloom_serialize(have_cids_bloom); + let bloom = ReceiverState::bloom_serialize(have_cids_bloom); PullRequest { resources: missing_subgraph_roots, - bloom_k, bloom, } } } impl ReceiverState { - fn bloom_serialize(bloom: Option) -> (u32, Vec) { + fn bloom_serialize(bloom: Option) -> Bloom { match bloom { - Some(bloom) => (bloom.hash_count() as u32, bloom.as_bytes().to_vec()), - None => (3, Vec::new()), + Some(bloom) => Bloom { + hash_count: bloom.hash_count() as u32, + bytes: bloom.as_bytes().to_vec(), + }, + None => Bloom { + hash_count: 3, + bytes: Vec::new(), + }, } } - fn bloom_deserialize(bloom_k: u32, bloom: Vec) -> Option { - if bloom.is_empty() { + fn bloom_deserialize(bloom: Bloom) -> Option { + if bloom.bytes.is_empty() { None } else { Some(BloomFilter::new_with( - bloom_k as usize, - bloom.into_boxed_slice(), + bloom.hash_count as usize, + bloom.bytes.into_boxed_slice(), )) } } diff --git a/car-mirror/src/messages.rs b/car-mirror/src/messages.rs index e1471c1..85b1640 100644 --- a/car-mirror/src/messages.rs +++ b/car-mirror/src/messages.rs @@ -12,31 +12,9 @@ pub struct PullRequest { #[serde(rename = "rs")] pub resources: Vec, - /// Bloom filter hash count - 
#[serde(rename = "bk")] - pub bloom_k: u32, - - /// Bloom filter Binary - #[serde(rename = "bb")] - pub bloom: Vec, -} - -/// Part of the initial message for push requests. -/// The other part is simply tupled together with the actual initial -/// CAR file. -/// -/// Wire data type from the [specification]. -/// -/// [specification]: https://github.com/fission-codes/spec/blob/86fcfb07d507f1df4fdaaf49088abecbb1dda76a/car-pool/car-mirror/http.md#22-requestor-payload -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct PushRequestHeader { - /// Bloom filter hash count - #[serde(rename = "bk")] - pub bloom_k: u32, - - /// Bloom filter Binary - #[serde(rename = "bb")] - pub bloom: Vec, + /// A bloom containing already stored blocks + #[serde(flatten)] + pub bloom: Bloom, } /// The response sent after the initial and subsequent push requests. @@ -50,13 +28,21 @@ pub struct PushResponse { #[serde(rename = "sr")] pub subgraph_roots: Vec, + /// A bloom containing already stored blocks + #[serde(flatten)] + pub bloom: Bloom, +} + +/// The serialization format for bloom filters in CAR mirror +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct Bloom { /// Bloom filter hash count #[serde(rename = "bk")] - pub bloom_k: u32, + pub hash_count: u32, /// Bloom filter Binary #[serde(rename = "bb")] - pub bloom: Vec, + pub bytes: Vec, } impl PushResponse { From cad84c80b5dda08e617db20363b81da5df0e0914 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= Date: Fri, 25 Aug 2023 12:00:42 +0200 Subject: [PATCH 33/35] Lint --- car-mirror/src/test_utils/blockstore_utils.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/car-mirror/src/test_utils/blockstore_utils.rs b/car-mirror/src/test_utils/blockstore_utils.rs index 4bce5aa..1bcd9e5 100644 --- a/car-mirror/src/test_utils/blockstore_utils.rs +++ b/car-mirror/src/test_utils/blockstore_utils.rs @@ -37,7 +37,7 @@ pub fn dag_to_dot( for (cid, ipld) in blocks { let 
bytes = encode(&ipld)?; - let refs = references(cid, &bytes)?; + let refs = references(cid, bytes)?; for to_cid in refs { print_truncated_string(writer, cid.to_string())?; write!(writer, " -> ")?; From b3c51ae923c098e4fad991f10d2612e6d1ac0148 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= Date: Fri, 25 Aug 2023 12:04:09 +0200 Subject: [PATCH 34/35] Choose appropriate datastructures for `references` --- car-mirror/src/common.rs | 3 +-- car-mirror/src/dag_walk.rs | 4 ++-- car-mirror/src/test_utils/blockstore_utils.rs | 2 +- car-mirror/src/test_utils/local_utils.rs | 2 +- 4 files changed, 5 insertions(+), 6 deletions(-) diff --git a/car-mirror/src/common.rs b/car-mirror/src/common.rs index acbd0bf..900aa32 100644 --- a/car-mirror/src/common.rs +++ b/car-mirror/src/common.rs @@ -207,13 +207,12 @@ pub async fn block_receive( /// This will error out if /// - the codec is not supported /// - the block can't be parsed. -pub fn references(cid: Cid, block: impl AsRef<[u8]>) -> Result> { +pub fn references>(cid: Cid, block: impl AsRef<[u8]>, mut refs: E) -> Result { let codec: IpldCodec = cid .codec() .try_into() .map_err(|_| anyhow!("Unsupported codec in Cid: {cid}"))?; - let mut refs = Vec::new(); >::references(codec, &mut Cursor::new(block), &mut refs)?; Ok(refs) } diff --git a/car-mirror/src/dag_walk.rs b/car-mirror/src/dag_walk.rs index 9050648..3b45c69 100644 --- a/car-mirror/src/dag_walk.rs +++ b/car-mirror/src/dag_walk.rs @@ -76,7 +76,7 @@ impl DagWalk { // - skip Raw CIDs. They can't have further links (but needs adjustment to this function's return type) // - run multiple `get_block` calls concurrently let block = store.get_block(&cid).await?; - for ref_cid in references(cid, &block)? { + for ref_cid in references(cid, &block, HashSet::new())? { if !self.visited.contains(&ref_cid) { self.frontier.push_front(ref_cid); } @@ -112,7 +112,7 @@ impl DagWalk { /// Skip a node from the traversal for now. 
pub fn skip_walking(&mut self, block: (Cid, Bytes)) -> Result<()> { let (cid, bytes) = block; - let refs = references(cid, bytes)?; + let refs = references(cid, bytes, HashSet::new())?; self.visited.insert(cid); self.frontier .retain(|frontier_cid| !refs.contains(frontier_cid)); diff --git a/car-mirror/src/test_utils/blockstore_utils.rs b/car-mirror/src/test_utils/blockstore_utils.rs index 1bcd9e5..3394271 100644 --- a/car-mirror/src/test_utils/blockstore_utils.rs +++ b/car-mirror/src/test_utils/blockstore_utils.rs @@ -37,7 +37,7 @@ pub fn dag_to_dot( for (cid, ipld) in blocks { let bytes = encode(&ipld)?; - let refs = references(cid, bytes)?; + let refs = references(cid, bytes, Vec::new())?; for to_cid in refs { print_truncated_string(writer, cid.to_string())?; write!(writer, " -> ")?; diff --git a/car-mirror/src/test_utils/local_utils.rs b/car-mirror/src/test_utils/local_utils.rs index 5c7323f..51f021d 100644 --- a/car-mirror/src/test_utils/local_utils.rs +++ b/car-mirror/src/test_utils/local_utils.rs @@ -24,7 +24,7 @@ pub(crate) async fn get_cid_at_approx_path( let mut working_cid = root; for nth in path { let block = store.get_block(&working_cid).await?; - let refs = references(working_cid, block)?; + let refs = references(working_cid, block, Vec::new())?; if refs.is_empty() { break; } From 28b3c2672b8fcedc1ddf4a49f78cfdf591d058ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= Date: Fri, 25 Aug 2023 12:12:56 +0200 Subject: [PATCH 35/35] Make `DagWalk` deterministic again --- car-mirror/src/dag_walk.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/car-mirror/src/dag_walk.rs b/car-mirror/src/dag_walk.rs index 3b45c69..3f27e7e 100644 --- a/car-mirror/src/dag_walk.rs +++ b/car-mirror/src/dag_walk.rs @@ -76,7 +76,7 @@ impl DagWalk { // - skip Raw CIDs. 
They can't have further links (but needs adjustment to this function's return type) // - run multiple `get_block` calls concurrently let block = store.get_block(&cid).await?; - for ref_cid in references(cid, &block, HashSet::new())? { + for ref_cid in references(cid, &block, Vec::new())? { if !self.visited.contains(&ref_cid) { self.frontier.push_front(ref_cid); } @@ -132,6 +132,10 @@ mod tests { async fn test_walk_dag_breadth_first() -> Result<()> { let store = &MemoryBlockStore::new(); + // cid_root ---> cid_1_wrap ---> cid_1 + // -> cid_2 + // -> cid_3 + let cid_1 = store.put_serializable(&Ipld::String("1".into())).await?; let cid_2 = store.put_serializable(&Ipld::String("2".into())).await?; let cid_3 = store.put_serializable(&Ipld::String("3".into())).await?;