Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace scraper with a small custom library #593

Merged
merged 3 commits into from
Sep 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
333 changes: 86 additions & 247 deletions Cargo.lock

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ members = [
"lib/mrf-manifest",
"lib/mrf-tool",
"lib/post-process",
"lib/schaber",
"lib/speedy-uuid",
"lib/tick-tock-mock",
"lib/tower-http-digest",
Expand Down Expand Up @@ -162,6 +163,7 @@ just-retry = { path = "lib/just-retry" }
masto-id-convert = { path = "lib/masto-id-convert" }
mrf-manifest = { path = "lib/mrf-manifest" }
post-process = { path = "lib/post-process" }
schaber = { path = "lib/schaber" }
speedy-uuid = { path = "lib/speedy-uuid", features = ["serde"] }
tick-tock-mock = { path = "lib/tick-tock-mock" }
tower-http-digest = { path = "lib/tower-http-digest" }
Expand Down
2 changes: 1 addition & 1 deletion crates/kitsune-derive/impl/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ proc-macro = true
[dependencies]
proc-macro2 = "1.0.86"
quote = "1.0.37"
syn = { version = "2.0.78", features = ["full"] }
syn = { version = "2.0.79", features = ["full"] }

[lints]
workspace = true
2 changes: 1 addition & 1 deletion crates/kitsune-embed/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ kitsune-derive = { workspace = true }
kitsune-error = { workspace = true }
kitsune-http-client = { workspace = true }
lantern-client-sdk = { package = "client-sdk", git = "https://github.com/Lantern-chat/client-sdk-rs.git", rev = "efb4288d9b107b48609802193d57b29f7ae395a1", default-features = false }
scraper = { version = "0.20.0", default-features = false }
schaber = { workspace = true }
smol_str = "0.3.1"

[lints]
Expand Down
22 changes: 12 additions & 10 deletions crates/kitsune-embed/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,24 +12,26 @@
use kitsune_error::Result;
use kitsune_http_client::Client as HttpClient;
use lantern_client_sdk::models::EmbedWithExpire;
use scraper::{Html, Selector};
use schaber::Scraper;
use smol_str::SmolStr;
use std::sync::LazyLock;
use std::{ops::ControlFlow, sync::LazyLock};

pub use lantern_client_sdk::models::{Embed, EmbedType};

static LINK_SELECTOR: LazyLock<Selector> = LazyLock::new(|| {
Selector::parse("a:not(.mention, .hashtag)").expect("[Bug] Failed to parse link HTML selector")
static LINK_SCRAPER: LazyLock<Scraper> = LazyLock::new(|| {
Scraper::new("a:not(.mention, .hashtag)").expect("[Bug] Failed to parse link HTML selector")

Check warning on line 22 in crates/kitsune-embed/src/lib.rs

View check run for this annotation

Codecov / codecov/patch

crates/kitsune-embed/src/lib.rs#L21-L22

Added lines #L21 - L22 were not covered by tests
});

fn first_link_from_fragment(fragment: &str) -> Option<String> {
let parsed_fragment = Html::parse_fragment(fragment);
let mut link = None;
LINK_SCRAPER
.process(fragment, |element| {
link = element.get_attribute("href");
ControlFlow::Break(())
})
.unwrap();

Check warning on line 32 in crates/kitsune-embed/src/lib.rs

View check run for this annotation

Codecov / codecov/patch

crates/kitsune-embed/src/lib.rs#L26-L32

Added lines #L26 - L32 were not covered by tests

parsed_fragment
.select(&LINK_SELECTOR)
.next()
.and_then(|element| element.value().attr("href"))
.map(ToString::to_string)
link

Check warning on line 34 in crates/kitsune-embed/src/lib.rs

View check run for this annotation

Codecov / codecov/patch

crates/kitsune-embed/src/lib.rs#L34

Added line #L34 was not covered by tests
}

#[kitsune_service]
Expand Down
13 changes: 13 additions & 0 deletions lib/schaber/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
[package]
name = "schaber"
authors.workspace = true
edition.workspace = true
version.workspace = true
license = "MIT OR Apache-2.0"

[dependencies]
lol_html = "2.0.0"
thiserror = "1.0.64"

[lints]
workspace = true
1 change: 1 addition & 0 deletions lib/schaber/LICENSE-APACHE-2.0
1 change: 1 addition & 0 deletions lib/schaber/LICENSE-MIT
89 changes: 89 additions & 0 deletions lib/schaber/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
use lol_html::{
errors::{RewritingError, SelectorError},
html_content::Element,
ElementContentHandlers, HandlerResult, HtmlRewriter, Selector, Settings,
};
use std::{borrow::Cow, ops::ControlFlow, str::FromStr};
use thiserror::Error;

/// Crate-local result alias; defaults the error type to this crate's [`Error`].
type Result<T, E = Error> = std::result::Result<T, E>;

/// Ignore any content handler "errors", since we use these errors
/// as our means of communicating control flow
///
/// `Scraper::process` signals an early stop by returning an error
/// (`Sacrifice`) from the element content handler, which lol_html surfaces
/// as `RewritingError::ContentHandlerError`. That variant therefore means
/// "stop requested", not failure: it is swallowed by `return`ing `Ok(())`
/// from the *enclosing function*. Every other result is passed through
/// unchanged so real rewriting failures still propagate.
macro_rules! handle_error {
    ($error_expr:expr) => {{
        match { $error_expr } {
            Err(::lol_html::errors::RewritingError::ContentHandlerError(..)) => return Ok(()),
            other => other,
        }
    }};
}

/// Sentinel error thrown from the lol_html content handler to abort the
/// rewrite when the user's callback returns `ControlFlow::Break`. It never
/// escapes this crate: `handle_error!` filters the resulting
/// `ContentHandlerError` back out and treats it as a clean stop.
#[derive(Debug, Error)]
#[error("small sacrifice for the lol_html gods")]
struct Sacrifice;

/// Errors returned by [`Scraper`] operations.
#[derive(Debug, Error)]
pub enum Error {
    /// The CSS selector passed to [`Scraper::new`] failed to parse.
    #[error(transparent)]
    InvalidSelector(#[from] SelectorError),

    /// The underlying lol_html rewriter failed while processing input.
    #[error(transparent)]
    RewriteError(#[from] RewritingError),
}

/// A minimal streaming HTML scraper: matches elements against a single CSS
/// selector and hands each match to a user callback (see [`Scraper::process`]).
pub struct Scraper {
    // Pre-parsed selector, compiled once in `new` and reused per `process` call.
    element_selector: Selector,
}

impl Scraper {
    /// Creates a scraper that matches elements against the given CSS
    /// selector (e.g. `"a:not(.mention, .hashtag)"`).
    ///
    /// # Errors
    ///
    /// Returns [`Error::InvalidSelector`] when the selector string fails to
    /// parse.
    pub fn new(selector: &str) -> Result<Self> {
        Ok(Self {
            element_selector: Selector::from_str(selector)?,
        })
    }

    /// Streams `input` through lol_html's rewriter and invokes `handler`
    /// once for each element matching this scraper's selector.
    ///
    /// The handler's return value steers iteration: [`ControlFlow::Continue`]
    /// moves on to the next match, while [`ControlFlow::Break`] stops
    /// processing immediately (the remainder of the input is not parsed).
    ///
    /// # Errors
    ///
    /// Returns [`Error::RewriteError`] if the rewriter itself fails; an
    /// early stop requested by the handler is *not* reported as an error.
    pub fn process<I, H>(&self, input: I, mut handler: H) -> Result<()>
    where
        I: AsRef<[u8]>,
        H: FnMut(&Element<'_, '_>) -> ControlFlow<()>,
    {
        // Identity helpers whose only job is to pin the closures below to the
        // exact `FnMut` signatures lol_html expects; without them the
        // compiler's closure type inference fails to unify the types.
        #[inline]
        fn handler_assert<F>(uwu: F) -> F
        where
            F: FnMut(&mut Element<'_, '_>) -> HandlerResult,
        {
            uwu
        }

        #[inline]
        fn sink_assert<F>(uwu: F) -> F
        where
            F: FnMut(&[u8]),
        {
            uwu
        }

        let mut rewriter = HtmlRewriter::new(
            Settings {
                element_content_handlers: vec![(
                    Cow::Borrowed(&self.element_selector),
                    // A `Break` from the user handler is translated into the
                    // `Sacrifice` error, which aborts the rewrite; the
                    // resulting `ContentHandlerError` is filtered back out
                    // below by `handle_error!`.
                    ElementContentHandlers::default().element(handler_assert(|el| {
                        if handler(el).is_continue() {
                            Ok(())
                        } else {
                            Err(Box::new(Sacrifice))
                        }
                    })),
                )],
                ..Settings::new()
            },
            // We only inspect elements, never rewrite, so the output is
            // discarded.
            sink_assert(|_| {}),
        );

        // `handle_error!` early-returns `Ok(())` on a handler-requested stop
        // (in that case `end()` is intentionally never reached); genuine
        // rewriting failures are propagated via `?`.
        handle_error!(rewriter.write(input.as_ref()))?;
        handle_error!(rewriter.end())?;

        Ok(())
    }
}
25 changes: 25 additions & 0 deletions lib/schaber/tests/basic.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
use schaber::Scraper;
use std::ops::ControlFlow;

/// The scraper hands the first selector match to the callback, and an
/// attribute can be read off it.
#[test]
fn select_link() {
    let html = r#"
        <div id="hello">
            <a href="http://druckbrudi.lab">
                PRINT MORE BLÅHAJ CATEARS!
            </a>
        </div>
    "#;

    let scraper = Scraper::new("a").unwrap();
    let mut captured = None;

    let result = scraper.process(html, |anchor| {
        captured = anchor.get_attribute("href");
        ControlFlow::Break(())
    });
    result.unwrap();

    assert_eq!(captured.as_deref(), Some("http://druckbrudi.lab"));
}
56 changes: 56 additions & 0 deletions lib/schaber/tests/control_flow.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
use schaber::Scraper;
use std::ops::ControlFlow;

/// Returning `ControlFlow::Break` from the callback stops processing, so
/// later matches in the document are never visited.
#[test]
fn ends_after_break() {
    let html = r#"
        <div id="hello">
            <a href="http://druckbrudi.lab">
                PRINT MORE BLÅHAJ CATEARS!
            </a>

            <a href="http://evil.com">
                This link shall not be seen!
            </a>
        </div>
    "#;

    let scraper = Scraper::new("a").unwrap();
    let mut first_seen = None;

    let outcome = scraper.process(html, |anchor| {
        first_seen = anchor.get_attribute("href");
        ControlFlow::Break(())
    });
    outcome.unwrap();

    // Only the first anchor was visited; the second one never overwrote it.
    assert_eq!(first_seen.as_deref(), Some("http://druckbrudi.lab"));
}

/// Returning `ControlFlow::Continue` keeps iterating, so the callback sees
/// every match and the last one wins.
#[test]
fn continues_after_continue() {
    let html = r#"
        <div id="hello">
            <a href="http://druckbrudi.lab">
                PRINT MORE BLÅHAJ CATEARS!
            </a>

            <a href="https://good.org">
                This link shall be seen!
            </a>
        </div>
    "#;

    let scraper = Scraper::new("a").unwrap();
    let mut last_seen = None;

    let outcome = scraper.process(html, |anchor| {
        last_seen = anchor.get_attribute("href");
        ControlFlow::Continue(())
    });
    outcome.unwrap();

    // Both anchors were visited; the second overwrote the first.
    assert_eq!(last_seen.as_deref(), Some("https://good.org"));
}
Loading