From 78521a0ecae9bd34baecb5163f56ee2c7675acc5 Mon Sep 17 00:00:00 2001 From: aumetra Date: Fri, 27 Sep 2024 21:05:40 +0200 Subject: [PATCH 1/3] Add small custom scraper library --- Cargo.lock | 333 +++++++------------------- Cargo.toml | 2 + crates/kitsune-derive/impl/Cargo.toml | 2 +- crates/kitsune-embed/Cargo.toml | 2 +- crates/kitsune-embed/src/lib.rs | 23 +- lib/schaber/Cargo.toml | 13 + lib/schaber/LICENSE-APACHE-2.0 | 1 + lib/schaber/LICENSE-MIT | 1 + lib/schaber/src/lib.rs | 71 ++++++ lib/schaber/tests/basic.rs | 23 ++ 10 files changed, 213 insertions(+), 258 deletions(-) create mode 100644 lib/schaber/Cargo.toml create mode 120000 lib/schaber/LICENSE-APACHE-2.0 create mode 120000 lib/schaber/LICENSE-MIT create mode 100644 lib/schaber/src/lib.rs create mode 100644 lib/schaber/tests/basic.rs diff --git a/Cargo.lock b/Cargo.lock index 85095bf7f..1700f9910 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -81,7 +81,7 @@ checksum = "d1eb7c4fcde1858a6796c18a729b661346d38e05a207e2d9028bce822fc20283" dependencies = [ "proc-macro2", "quote", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -212,7 +212,7 @@ dependencies = [ "argh_shared", "proc-macro2", "quote", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -288,7 +288,7 @@ dependencies = [ "proc-macro2", "quote", "serde", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -398,7 +398,7 @@ dependencies = [ "proc-macro2", "quote", "strum", - "syn 2.0.78", + "syn 2.0.79", "thiserror", ] @@ -434,7 +434,7 @@ checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11" dependencies = [ "proc-macro2", "quote", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -456,7 +456,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" dependencies = [ "proc-macro2", "quote", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -467,7 +467,7 @@ checksum = "721cae7de5c34fbb2acd27e21e6d2cf7b886dce0c27388d46c4e6c47ea4318dd" dependencies = [ "proc-macro2", "quote", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -628,7 +628,7 @@ checksum = "57d123550fa8d071b7255cb0cc04dc302baa6c8c4a79f55701552684d8399bce" dependencies = [ "proc-macro2", "quote", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -1065,7 +1065,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -1528,19 +1528,6 @@ dependencies = [ "syn 1.0.109", ] -[[package]] -name = "cssparser" -version = "0.31.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b3df4f93e5fbbe73ec01ec8d3f68bba73107993a5b1e7519273c32db9b0d5be" -dependencies = [ - "cssparser-macros", - "dtoa-short", - "itoa 1.0.11", - "phf 0.11.2", - "smallvec", -] - [[package]] name = "cssparser-macros" version = "0.6.1" @@ -1548,7 +1535,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331" dependencies = [ "quote", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -1594,7 +1581,7 @@ checksum = "f46882e17999c6cc590af592290432be3bce0428cb0d5f8b6715e4dc7b383eb3" dependencies = [ "proc-macro2", "quote", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -1618,7 +1605,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -1629,7 +1616,7 @@ checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806" dependencies = [ "darling_core", "quote", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -1659,7 +1646,7 @@ checksum = "8034092389675178f570469e6c3b0465d3d30b4505c294a6550db47f3c17ad18" dependencies = [ "proc-macro2", "quote", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -1690,7 +1677,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -1700,7 +1687,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4abae7035bf79b9877b779505d8cf3749285b80c43941eda66604841889451dc" dependencies = [ "derive_builder_core", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -1713,7 +1700,7 @@ dependencies = [ "proc-macro2", "quote", "rustc_version", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -1733,7 +1720,7 @@ checksum = "cb7330aeadfbe296029522e6c40f315320aba36fc43a5b3632f3795348f3bd22" dependencies = [ "proc-macro2", "quote", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -1775,7 +1762,7 @@ dependencies = [ "dsl_auto_type", "proc-macro2", "quote", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -1804,7 +1791,7 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "209c735641a413bc68c4923a9d6ad4bcb3ca306b794edaa7eb0b3228a99ffb25" dependencies = [ - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -1833,7 +1820,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -1858,7 +1845,7 @@ checksum = "27540baf49be0d484d8f0130d7d8da3011c32a44d4fc873368154f1510e574a2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -1878,7 +1865,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -1940,12 +1927,6 @@ dependencies = [ "zeroize", ] -[[package]] -name = "ego-tree" -version = "0.6.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12a0bb14ac04a9fcf170d0bbbef949b44cc492f4452bd20c095636956f653642" - [[package]] name = "either" version = "1.13.0" @@ -2028,7 +2009,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -2039,7 +2020,7 @@ checksum = "ba7795da175654fe16979af73f81f26a8ea27638d8d9823d317016888a63dc4c" dependencies = [ "num-traits", "quote", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -2051,7 +2032,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -2271,7 +2252,7 @@ checksum = "1458c6e22d36d61507034d5afecc64f105c1d39712b7ac6ec3b352c423f715cc" dependencies = [ "proc-macro2", "quote", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -2291,16 +2272,6 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" -[[package]] -name = "futf" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843" -dependencies = [ - "mac", - "new_debug_unreachable", -] - [[package]] name = "futures" version = "0.3.30" @@ -2357,7 +2328,7 @@ checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -2446,7 +2417,7 @@ dependencies = [ "proc-macro2", "quote", "regex", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -2547,7 +2518,7 @@ checksum = "53010ccb100b96a67bc32c0175f0ed1426b31b655d562898e57325f81c023ac0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -2804,20 +2775,6 @@ dependencies = [ "windows", ] -[[package]] -name = "html5ever" -version = "0.27.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c13771afe0e6e846f1e67d038d4cb29998a6779f93c809212e4e9c32efd244d4" -dependencies = [ - "log", - "mac", - "markup5ever", - "proc-macro2", - "quote", - "syn 2.0.78", -] - [[package]] name = "http" version = "1.1.0" @@ -3132,7 +3089,7 @@ checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -3661,7 +3618,7 @@ version = "0.0.1-pre.6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -3696,7 +3653,7 @@ dependencies = [ "kitsune-derive", "kitsune-error", "kitsune-http-client", - "scraper", + "schaber", "smol_str", ] @@ -4285,7 +4242,7 @@ dependencies = [ "proc-macro2", "quote", "regex-syntax", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -4305,14 +4262,14 @@ checksum = "964b47c14635e111f7efddcd8f1f8794195f66225fef19822fa942b217a859cf" dependencies = [ "bitflags 2.6.0", "cfg-if", - "cssparser 0.27.2", + "cssparser", "encoding_rs", "hashbrown 0.13.2", "lazy_static", "lazycell", "memchr", "mime", - "selectors 0.22.0", + "selectors", "thiserror", ] @@ -4325,12 +4282,6 @@ dependencies = [ "linked-hash-map", ] -[[package]] -name = "mac" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" - [[package]] name = "mach2" version = "0.4.2" @@ -4340,20 +4291,6 @@ dependencies = [ "libc", ] -[[package]] -name = "markup5ever" -version = "0.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16ce3abbeba692c8b8441d036ef91aea6df8da2c6b6e21c7e14d3c18e526be45" -dependencies = [ - "log", - "phf 0.11.2", - "phf_codegen 0.11.2", - "string_cache", - "string_cache_codegen", - "tendril", -] - [[package]] name = "masto-id-convert" version = "0.0.1-pre.6" @@ -4409,7 +4346,7 @@ dependencies = [ "proc-macro2", "quote", "structmeta", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -4479,7 +4416,7 @@ checksum = "dcf09caffaac8068c346b6df2a7fc27a177fd20b39421a39ce0a211bde679a6c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -4659,12 +4596,6 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6a51313c5820b0b02bd422f4b44776fbf47961755c74ce64afc73bfad10226c3" -[[package]] -name = "new_debug_unreachable" -version = "1.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086" - [[package]] name = "nodrop" version = "0.1.14" @@ -4722,7 +4653,7 @@ checksum = "ed3955f1a9c7c0c15e092f9c887db08b1fc683305fdf6eb6684f22555355e202" dependencies = [ "proc-macro2", "quote", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -5129,7 +5060,7 @@ dependencies = [ "pest_meta", "proc-macro2", "quote", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -5185,26 +5116,6 @@ dependencies = [ "phf_shared 0.8.0", ] -[[package]] -name = "phf_codegen" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fb1c3a8bc4dd4e5cfce29b44ffc14bedd2ee294559a294e2a4d4c9e9a6a13cd" -dependencies = [ - "phf_generator 0.10.0", - "phf_shared 0.10.0", -] - -[[package]] -name = "phf_codegen" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8d39688d359e6b34654d328e262234662d16cc0f60ec8dcbe5e718709342a5a" -dependencies = [ - "phf_generator 0.11.2", - "phf_shared 0.11.2", -] - [[package]] name = "phf_generator" version = "0.8.0" @@ -5273,7 +5184,7 @@ dependencies = [ "phf_shared 0.11.2", "proc-macro2", "quote", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -5320,7 +5231,7 @@ checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" dependencies = [ "proc-macro2", "quote", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -5446,7 +5357,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "479cf940fbbb3426c32c5d5176f62ad57549a0bb84773423ba8be9d089f5faba" dependencies = [ "proc-macro2", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -5506,7 +5417,7 @@ checksum = "6ff7ff745a347b87471d859a377a9a404361e7efc2a971d73424a6d183c0fc77" dependencies = [ "proc-macro2", "quote", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -5529,7 +5440,7 @@ dependencies = [ "itertools 0.13.0", "proc-macro2", "quote", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -5930,7 +5841,7 @@ dependencies = [ "regex", "relative-path", "rustc_version", - "syn 2.0.78", + "syn 2.0.79", "unicode-ident", ] @@ -5965,7 +5876,7 @@ dependencies = [ "proc-macro2", "quote", "rust-embed-utils", - "syn 2.0.78", + "syn 2.0.79", "walkdir", ] @@ -6124,6 +6035,14 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "schaber" +version = "0.0.1-pre.6" +dependencies = [ + "lol_html", + "thiserror", +] + [[package]] name = "schannel" version = "0.1.24" @@ -6155,7 +6074,7 @@ dependencies = [ "proc-macro2", "quote", "serde_derive_internals", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -6174,21 +6093,6 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" -[[package]] -name = "scraper" -version = "0.20.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b90460b31bfe1fc07be8262e42c665ad97118d4585869de9345a84d501a9eaf0" -dependencies = [ - "ahash", - "cssparser 0.31.2", - "ego-tree", - "html5ever", - "once_cell", - "selectors 0.25.0", - "tendril", -] - [[package]] name = "sec1" version = "0.7.3" @@ -6233,38 +6137,19 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "df320f1889ac4ba6bc0cdc9c9af7af4bd64bb927bccdf32d81140dc1f9be12fe" dependencies = [ "bitflags 1.3.2", - "cssparser 0.27.2", + "cssparser", "derive_more 0.99.18", "fxhash", "log", "matches", "phf 0.8.0", - "phf_codegen 0.8.0", + "phf_codegen", "precomputed-hash", - "servo_arc 0.1.1", + "servo_arc", "smallvec", "thin-slice", ] -[[package]] -name = "selectors" -version = "0.25.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4eb30575f3638fc8f6815f448d50cb1a2e255b0897985c8c59f4d37b72a07b06" -dependencies = [ - "bitflags 2.6.0", - "cssparser 0.31.2", - "derive_more 0.99.18", - "fxhash", - "log", - "new_debug_unreachable", - "phf 0.10.1", - "phf_codegen 0.10.0", - "precomputed-hash", - "servo_arc 0.3.0", - "smallvec", -] - [[package]] name = "semver" version = "1.0.23" @@ -6301,7 +6186,7 @@ checksum = "243902eda00fad750862fc144cea25caca5e20d615af0a81bee94ca738f1df1f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -6312,7 +6197,7 @@ checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711" dependencies = [ "proc-macro2", "quote", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -6367,7 +6252,7 @@ checksum = "6c64451ba24fc7a6a2d60fc75dd9c83c90903b19028d4eff35e88fc1e86564e9" dependencies = [ "proc-macro2", "quote", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -6427,7 +6312,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -6440,15 +6325,6 @@ dependencies = [ "stable_deref_trait", ] -[[package]] -name = "servo_arc" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d036d71a959e00c77a63538b90a6c2390969f9772b096ea837205c6bd0491a44" -dependencies = [ - "stable_deref_trait", -] - [[package]] name = "sha-1" version = "0.10.1" @@ -6699,32 +6575,6 @@ version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d7beae5182595e9a8b683fa98c4317f956c9a2dec3b9716990d20023cc60c766" -[[package]] -name = "string_cache" -version = "0.8.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f91138e76242f575eb1d3b38b4f1362f10d3a43f47d182a5b359af488a02293b" -dependencies = [ - "new_debug_unreachable", - "once_cell", - "parking_lot", - "phf_shared 0.10.0", - "precomputed-hash", - "serde", -] - -[[package]] -name = "string_cache_codegen" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6bb30289b722be4ff74a408c3cc27edeaad656e06cb1fe8fa9231fa59c728988" -dependencies = [ - "phf_generator 0.10.0", - "phf_shared 0.10.0", - "proc-macro2", - "quote", -] - [[package]] name = "stringprep" version = "0.1.5" @@ -6751,7 +6601,7 @@ dependencies = [ "proc-macro2", "quote", "structmeta-derive", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -6762,7 +6612,7 @@ checksum = "152a0b65a590ff6c3da95cabe2353ee04e6167c896b28e3b14478c2636c922fc" dependencies = [ "proc-macro2", "quote", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -6785,7 +6635,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -6838,9 +6688,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.78" +version = "2.0.79" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81b9b4733a9c8b8aaa20634df36eeb68cc0c0669f2e18fb287006b496a14195d" +checksum = "89132cd0bf050864e1d38dc3bbc07a0eb8e7530af26344d3d2bbbef83499f590" dependencies = [ "proc-macro2", "quote", @@ -6867,7 +6717,7 @@ checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" dependencies = [ "proc-macro2", "quote", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -6911,17 +6761,6 @@ dependencies = [ "windows-sys 0.59.0", ] -[[package]] -name = "tendril" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0" -dependencies = [ - "futf", - "mac", - "utf-8", -] - [[package]] name = "termcolor" version = "1.4.1" @@ -6984,7 +6823,7 @@ checksum = "08904e7672f5eb876eaaf87e0ce17857500934f4981c4a0ab2b4aa98baac7fc3" dependencies = [ "proc-macro2", "quote", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -7087,7 +6926,7 @@ checksum = "8d9ef545650e79f30233c0003bcc2504d7efac6dad25fca40744de773fe2049c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -7117,7 +6956,7 @@ checksum = "5f5ae998a069d4b5aba8ee9dad856af7d520c3699e6159b185c2acd48155d39a" dependencies = [ "proc-macro2", "quote", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -7423,7 +7262,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -7558,7 +7397,7 @@ checksum = "560b82d656506509d43abe30e0ba64c56b1953ab3d4fe7ba5902747a7a3cedd5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -7594,7 +7433,7 @@ checksum = "70b20a22c42c8f1cd23ce5e34f165d4d37038f5b663ad20fb6adbdf029172483" dependencies = [ "proc-macro2", "quote", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -7842,7 +7681,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.78", + "syn 2.0.79", "wasm-bindgen-shared", ] @@ -7876,7 +7715,7 @@ checksum = "afc340c74d9005395cf9dd098506f7f44e38f2b4a21c6aaacf9a105ea5e1e836" dependencies = [ "proc-macro2", "quote", - "syn 2.0.78", + "syn 2.0.79", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -8005,7 +7844,7 @@ dependencies = [ "anyhow", "proc-macro2", "quote", - "syn 2.0.78", + "syn 2.0.79", "wasmtime-component-util", "wasmtime-wit-bindgen", "wit-parser", @@ -8122,7 +7961,7 @@ checksum = "e9bb1f01efb8b542eadfda511e8ea1cc54309451aba97b69969e5b1a59cb7ded" dependencies = [ "proc-macro2", "quote", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -8552,7 +8391,7 @@ dependencies = [ "heck 0.5.0", "indexmap 2.5.0", "prettyplease", - "syn 2.0.78", + "syn 2.0.79", "wasm-metadata", "wit-bindgen-core", "wit-component", @@ -8568,7 +8407,7 @@ dependencies = [ "prettyplease", "proc-macro2", "quote", - "syn 2.0.78", + "syn 2.0.79", "wit-bindgen-core", "wit-bindgen-rust", ] @@ -8689,7 +8528,7 @@ checksum = "28cc31741b18cb6f1d5ff12f5b7523e3d6eb0852bbbad19d73905511d9849b95" dependencies = [ "proc-macro2", "quote", - "syn 2.0.78", + "syn 2.0.79", "synstructure", ] @@ -8711,7 +8550,7 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -8731,7 +8570,7 @@ checksum = "0ea7b4a3637ea8669cedf0f1fd5c286a17f3de97b8dd5a70a6c167a1730e63a5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.78", + "syn 2.0.79", "synstructure", ] @@ -8752,7 +8591,7 @@ checksum = "ce36e65b0d2999d2aafac989fb249189a141aee1f53c612c1f37d72631959f69" dependencies = [ "proc-macro2", "quote", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] @@ -8774,7 +8613,7 @@ checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.78", + "syn 2.0.79", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 53c438769..128bd62b1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -66,6 +66,7 @@ members = [ "lib/mrf-manifest", "lib/mrf-tool", "lib/post-process", + "lib/schaber", "lib/speedy-uuid", "lib/tick-tock-mock", "lib/tower-http-digest", @@ -162,6 +163,7 @@ just-retry = { path = "lib/just-retry" } masto-id-convert = { path = "lib/masto-id-convert" } mrf-manifest = { path = "lib/mrf-manifest" } post-process = { path = "lib/post-process" } +schaber = { path = "lib/schaber" } speedy-uuid = { path = "lib/speedy-uuid", features = ["serde"] } tick-tock-mock = { path = "lib/tick-tock-mock" } tower-http-digest = { path = "lib/tower-http-digest" } diff --git a/crates/kitsune-derive/impl/Cargo.toml b/crates/kitsune-derive/impl/Cargo.toml index 397d5c045..1cf2afebe 100644 --- a/crates/kitsune-derive/impl/Cargo.toml +++ b/crates/kitsune-derive/impl/Cargo.toml @@ -11,7 +11,7 @@ proc-macro = true [dependencies] proc-macro2 = "1.0.86" quote = "1.0.37" -syn = { version = "2.0.78", features = ["full"] } +syn = { version = "2.0.79", features = ["full"] } [lints] workspace = true diff --git a/crates/kitsune-embed/Cargo.toml b/crates/kitsune-embed/Cargo.toml index 98d754977..b2261ae53 100644 --- a/crates/kitsune-embed/Cargo.toml +++ b/crates/kitsune-embed/Cargo.toml @@ -15,7 +15,7 @@ kitsune-derive = { workspace = true } kitsune-error = { workspace = true } kitsune-http-client = { workspace = true } lantern-client-sdk = { package = "client-sdk", git = "https://github.com/Lantern-chat/client-sdk-rs.git", rev = "efb4288d9b107b48609802193d57b29f7ae395a1", default-features = false } -scraper = { version = "0.20.0", default-features = false } +schaber = { workspace = true } smol_str = "0.3.1" [lints] diff --git a/crates/kitsune-embed/src/lib.rs b/crates/kitsune-embed/src/lib.rs index b3282fdb1..1544ef5a7 100644 --- a/crates/kitsune-embed/src/lib.rs +++ b/crates/kitsune-embed/src/lib.rs @@ -12,24 +12,29 @@ use kitsune_derive::kitsune_service; use kitsune_error::Result; use kitsune_http_client::Client as HttpClient; use lantern_client_sdk::models::EmbedWithExpire; -use scraper::{Html, Selector}; +use schaber::Scraper; use smol_str::SmolStr; use std::sync::LazyLock; pub use lantern_client_sdk::models::{Embed, EmbedType}; -static LINK_SELECTOR: LazyLock = LazyLock::new(|| { - Selector::parse("a:not(.mention, .hashtag)").expect("[Bug] Failed to parse link HTML selector") +static LINK_SCRAPER: LazyLock = LazyLock::new(|| { + Scraper::new("a:not(.mention, .hashtag)").expect("[Bug] Failed to parse link HTML selector") }); fn first_link_from_fragment(fragment: &str) -> Option { - let parsed_fragment = Html::parse_fragment(fragment); + let mut link = None; + LINK_SCRAPER + .process(fragment, |element| { + if link.is_some() { + return; + } + + link = element.get_attribute("href"); + }) + .unwrap(); - parsed_fragment - .select(&LINK_SELECTOR) - .next() - .and_then(|element| element.value().attr("href")) - .map(ToString::to_string) + link } #[kitsune_service] diff --git a/lib/schaber/Cargo.toml b/lib/schaber/Cargo.toml new file mode 100644 index 000000000..17b26ba52 --- /dev/null +++ b/lib/schaber/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "schaber" +authors.workspace = true +edition.workspace = true +version.workspace = true +license = "MIT OR Apache-2.0" + +[dependencies] +lol_html = "2.0.0" +thiserror = "1.0.64" + +[lints] +workspace = true diff --git a/lib/schaber/LICENSE-APACHE-2.0 b/lib/schaber/LICENSE-APACHE-2.0 new file mode 120000 index 000000000..bffaaba2f --- /dev/null +++ b/lib/schaber/LICENSE-APACHE-2.0 @@ -0,0 +1 @@ +../../LICENSE-APACHE-2.0 \ No newline at end of file diff --git a/lib/schaber/LICENSE-MIT b/lib/schaber/LICENSE-MIT new file mode 120000 index 000000000..b2cfbdc7b --- /dev/null +++ b/lib/schaber/LICENSE-MIT @@ -0,0 +1 @@ +../../LICENSE-MIT \ No newline at end of file diff --git a/lib/schaber/src/lib.rs b/lib/schaber/src/lib.rs new file mode 100644 index 000000000..b72d8d0fd --- /dev/null +++ b/lib/schaber/src/lib.rs @@ -0,0 +1,71 @@ +use lol_html::{ + errors::{RewritingError, SelectorError}, + html_content::Element, + ElementContentHandlers, HandlerResult, HtmlRewriter, Selector, Settings, +}; +use std::{borrow::Cow, str::FromStr}; +use thiserror::Error; + +type Result = std::result::Result; + +#[derive(Debug, Error)] +pub enum Error { + #[error(transparent)] + InvalidSelector(#[from] SelectorError), + + #[error(transparent)] + RewriteError(#[from] RewritingError), +} + +pub struct Scraper { + element_selector: Selector, +} + +impl Scraper { + pub fn new(selector: &str) -> Result { + Ok(Self { + element_selector: Selector::from_str(selector)?, + }) + } + + pub fn process(&self, input: I, mut handler: H) -> Result<()> + where + I: AsRef<[u8]>, + H: FnMut(&Element<'_, '_>), + { + #[inline(always)] + fn handler_assert(uwu: F) -> F + where + F: FnMut(&mut Element<'_, '_>) -> HandlerResult, + { + uwu + } + + #[inline(always)] + fn sink_assert(uwu: F) -> F + where + F: FnMut(&[u8]), + { + uwu + } + + let mut rewriter = HtmlRewriter::new( + Settings { + element_content_handlers: vec![( + Cow::Borrowed(&self.element_selector), + ElementContentHandlers::default().element(handler_assert(|el| { + handler(el); + Ok(()) + })), + )], + ..Settings::new() + }, + sink_assert(|_| {}), + ); + + rewriter.write(input.as_ref())?; + rewriter.end()?; + + Ok(()) + } +} diff --git a/lib/schaber/tests/basic.rs b/lib/schaber/tests/basic.rs new file mode 100644 index 000000000..66f3bea52 --- /dev/null +++ b/lib/schaber/tests/basic.rs @@ -0,0 +1,23 @@ +use schaber::Scraper; + +#[test] +fn select_link() { + let html = r#" + + "#; + + let mut link_url = None; + let scraper = Scraper::new("a").unwrap(); + + scraper + .process(html, |element| { + link_url = element.get_attribute("href"); + }) + .unwrap(); + + assert_eq!(link_url.as_deref(), Some("http://druckbrudi.lab")); +} From 32ba801353dc65679d55111d1302d113493ab7aa Mon Sep 17 00:00:00 2001 From: aumetra Date: Fri, 27 Sep 2024 21:19:46 +0200 Subject: [PATCH 2/3] add control flow capabilities --- crates/kitsune-embed/src/lib.rs | 7 ++-- lib/schaber/src/lib.rs | 30 +++++++++++++---- lib/schaber/tests/basic.rs | 2 ++ lib/schaber/tests/control_flow.rs | 56 +++++++++++++++++++++++++++++++ 4 files changed, 84 insertions(+), 11 deletions(-) create mode 100644 lib/schaber/tests/control_flow.rs diff --git a/crates/kitsune-embed/src/lib.rs b/crates/kitsune-embed/src/lib.rs index 1544ef5a7..e484f0d24 100644 --- a/crates/kitsune-embed/src/lib.rs +++ b/crates/kitsune-embed/src/lib.rs @@ -14,7 +14,7 @@ use kitsune_http_client::Client as HttpClient; use lantern_client_sdk::models::EmbedWithExpire; use schaber::Scraper; use smol_str::SmolStr; -use std::sync::LazyLock; +use std::{ops::ControlFlow, sync::LazyLock}; pub use lantern_client_sdk::models::{Embed, EmbedType}; @@ -26,11 +26,8 @@ fn first_link_from_fragment(fragment: &str) -> Option { let mut link = None; LINK_SCRAPER .process(fragment, |element| { - if link.is_some() { - return; - } - link = element.get_attribute("href"); + ControlFlow::Break(()) }) .unwrap(); diff --git a/lib/schaber/src/lib.rs b/lib/schaber/src/lib.rs index b72d8d0fd..0163fd3dc 100644 --- a/lib/schaber/src/lib.rs +++ b/lib/schaber/src/lib.rs @@ -3,11 +3,26 @@ use lol_html::{ html_content::Element, ElementContentHandlers, HandlerResult, HtmlRewriter, Selector, Settings, }; -use std::{borrow::Cow, str::FromStr}; +use std::{borrow::Cow, ops::ControlFlow, str::FromStr}; use thiserror::Error; type Result = std::result::Result; +/// Ignore any content handler "errors", since we use these errors +/// as our means of communicating control flow +macro_rules! handle_error { + ($error_expr:expr) => {{ + match { $error_expr } { + Err(::lol_html::errors::RewritingError::ContentHandlerError(..)) => return Ok(()), + other => other, + } + }}; +} + +#[derive(Debug, Error)] +#[error("small sacrifice for the lol_html gods")] +struct Sacrifice; + #[derive(Debug, Error)] pub enum Error { #[error(transparent)] @@ -31,7 +46,7 @@ impl Scraper { pub fn process(&self, input: I, mut handler: H) -> Result<()> where I: AsRef<[u8]>, - H: FnMut(&Element<'_, '_>), + H: FnMut(&Element<'_, '_>) -> ControlFlow<()>, { #[inline(always)] fn handler_assert(uwu: F) -> F @@ -54,8 +69,11 @@ impl Scraper { element_content_handlers: vec![( Cow::Borrowed(&self.element_selector), ElementContentHandlers::default().element(handler_assert(|el| { - handler(el); - Ok(()) + if handler(el).is_continue() { + Ok(()) + } else { + Err(Box::new(Sacrifice)) + } })), )], ..Settings::new() @@ -63,8 +81,8 @@ impl Scraper { sink_assert(|_| {}), ); - rewriter.write(input.as_ref())?; - rewriter.end()?; + handle_error!(rewriter.write(input.as_ref()))?; + handle_error!(rewriter.end())?; Ok(()) } diff --git a/lib/schaber/tests/basic.rs b/lib/schaber/tests/basic.rs index 66f3bea52..0afbe44c7 100644 --- a/lib/schaber/tests/basic.rs +++ b/lib/schaber/tests/basic.rs @@ -1,4 +1,5 @@ use schaber::Scraper; +use std::ops::ControlFlow; #[test] fn select_link() { @@ -16,6 +17,7 @@ fn select_link() { scraper .process(html, |element| { link_url = element.get_attribute("href"); + ControlFlow::Break(()) }) .unwrap(); diff --git a/lib/schaber/tests/control_flow.rs b/lib/schaber/tests/control_flow.rs new file mode 100644 index 000000000..4199b68d5 --- /dev/null +++ b/lib/schaber/tests/control_flow.rs @@ -0,0 +1,56 @@ +use schaber::Scraper; +use std::ops::ControlFlow; + +#[test] +fn ends_after_break() { + let html = r#" + + "#; + + let mut link_url = None; + let scraper = Scraper::new("a").unwrap(); + + scraper + .process(html, |element| { + link_url = element.get_attribute("href"); + ControlFlow::Break(()) + }) + .unwrap(); + + assert_eq!(link_url.as_deref(), Some("http://druckbrudi.lab")); +} + +#[test] +fn continues_after_continue() { + let html = r#" + + "#; + + let mut link_url = None; + let scraper = Scraper::new("a").unwrap(); + + scraper + .process(html, |element| { + link_url = element.get_attribute("href"); + ControlFlow::Continue(()) + }) + .unwrap(); + + assert_eq!(link_url.as_deref(), Some("https://good.org")); +} From 026e096f47c2217d3e3e4ce5eb17b104fb591a16 Mon Sep 17 00:00:00 2001 From: aumetra Date: Fri, 27 Sep 2024 21:22:04 +0200 Subject: [PATCH 3/3] fix lint --- lib/schaber/src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/schaber/src/lib.rs b/lib/schaber/src/lib.rs index 0163fd3dc..c49f92127 100644 --- a/lib/schaber/src/lib.rs +++ b/lib/schaber/src/lib.rs @@ -48,7 +48,7 @@ impl Scraper { I: AsRef<[u8]>, H: FnMut(&Element<'_, '_>) -> ControlFlow<()>, { - #[inline(always)] + #[inline] fn handler_assert(uwu: F) -> F where F: FnMut(&mut Element<'_, '_>) -> HandlerResult, @@ -56,7 +56,7 @@ impl Scraper { uwu } - #[inline(always)] + #[inline] fn sink_assert(uwu: F) -> F where F: FnMut(&[u8]),