Skip to content

Commit

Permalink
support relative urls
Browse files Browse the repository at this point in the history
  • Loading branch information
suaviloquence committed Jul 27, 2024
1 parent a4cb7ae commit 9d690b3
Show file tree
Hide file tree
Showing 7 changed files with 71 additions and 20 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ filter-proc-macro = { path = "./filter-proc-macro" }
serde_json = "1.0"
serde = { version = "1.0", features = ["derive", "rc"] }
futures = "0.3"
url = "2.5"

[workspace]
members = [".", "filter-proc-macro"]
7 changes: 7 additions & 0 deletions examples/inputs/relative.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
<!doctype html>

<html>
<body>
<a href="./relative2.html">My linky!</a>
</body>
</html>
7 changes: 7 additions & 0 deletions examples/inputs/relative2.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
<!doctype html>

<html>
<body>
<p id="success">You found me!</p>
</body>
</html>
4 changes: 4 additions & 0 deletions examples/outputs/relative.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"href": "./relative2.html",
"relative": "You found me!"
}
7 changes: 7 additions & 0 deletions examples/scrps/relative.scrp
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
href: a {
href: $element | attrs() | take(key: "href");
} | take(key: "href");

relative: <$href> #success {
x: $text;
} | take(key: "x");
64 changes: 44 additions & 20 deletions src/interpreter/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ pub struct ElementContext<'ast, 'ctx> {
element: scraper::ElementRef<'ctx>,
text: OnceCell<Arc<str>>,
parent: Option<&'ctx ElementContext<'ast, 'ctx>>,
url: Url,
}

#[derive(Debug)]
Expand Down Expand Up @@ -122,15 +123,16 @@ impl<'ast> Interpreter<'ast> {
root_url: Url,
head: Option<AstRef<'ast, StatementList<'ast>>>,
) -> Result<DataVariables<'ast>> {
let html = self.get_html(root_url).await?;
self.interpret_block(html.root_element(), head, None).await
let html = self.get_html(&root_url).await?;
self.interpret_block(html.root_element(), head, None, root_url)
.await
}

async fn get_html(&self, url: Url) -> Result<scraper::Html> {
async fn get_html(&self, url: &Url) -> Result<scraper::Html> {
let text = match url.scheme() {
"http" | "https" => self
.client
.get(url)
.get(url.clone())
.send()
.await
.context("Error sending HTTP request")?
Expand All @@ -151,12 +153,14 @@ impl<'ast> Interpreter<'ast> {
element: scraper::ElementRef<'_>,
statements: Option<AstRef<'ast, StatementList<'ast>>>,
parent: Option<&ElementContext<'ast, '_>>,
url: Url,
) -> Result<DataVariables<'ast>> {
let mut ctx = ElementContext {
element,
parent,
variables: Variables::default(),
text: OnceCell::new(),
url,
};

for statement in self.ast.flatten(statements) {
Expand Down Expand Up @@ -190,17 +194,20 @@ impl<'ast> Interpreter<'ast> {
) -> anyhow::Result<Value> {
let html;

let root_element = if let Some(url) = &element.url {
let (root_element, url) = if let Some(url) = &element.url {
let url: Arc<str> = self.eval_inline(url, ctx)?.try_unwrap()?;
html = self
.get_html(
url.parse()
.with_context(|| format!("`{url}` is not a valid URL"))?,
)
.await?;
html.root_element()
let url: Url = match url.parse() {
Ok(url) => url,
Err(url::ParseError::RelativeUrlWithoutBase) => ctx
.url
.join(&url)
.with_context(|| format!("`{url}` is not a valid relative URL"))?,
Err(e) => anyhow::bail!("`{url}` is not a valid URL: {e}"),
};
html = self.get_html(&url).await?;
(html.root_element(), url)
} else {
ctx.element
(ctx.element, ctx.url.clone())
};

let selector_str = element.to_selector_str(self.ast);
Expand All @@ -215,11 +222,10 @@ impl<'ast> Interpreter<'ast> {

let element_refs = ExecutionMode::hinted_from_iter(element.qualifier, selection)?;

let values =
futures::future::try_join_all(element_refs.into_iter().map(|element_ref| {
self.interpret_block(element_ref, element.statements, Some(ctx))
}))
.await?;
let values = futures::future::try_join_all(element_refs.into_iter().map(|element_ref| {
self.interpret_block(element_ref, element.statements, Some(ctx), url.clone())
}))
.await?;

Ok(
ExecutionMode::hinted_from_iter(
Expand Down Expand Up @@ -323,7 +329,13 @@ pub async fn interpret_string_harness(
let html = scraper::Html::parse_document(html);
let interpreter = Interpreter::new(Box::leak(Box::new(ast)));
interpreter
.interpret_block(html.root_element(), head, None)
// TODO: url hack
.interpret_block(
html.root_element(),
head,
None,
"file:///tmp/inmemory.html".parse().expect("URL parse"),
)
.await
}

Expand All @@ -345,7 +357,18 @@ mod tests {
let html = scraper::Html::parse_document(&input);

let result = super::Interpreter::new(&ast)
.interpret_block(html.root_element(), head, None)
.interpret_block(
html.root_element(),
head,
None,
format!(
"file://{}/examples/inputs/{}",
std::env::current_dir().expect("get current dir").display(),
filename,
)
.parse()
.expect("parse URL failed"),
)
.await?;
let result = serde_json::to_value(result.0)?;
assert_eq!(output, result);
Expand Down Expand Up @@ -430,5 +453,6 @@ mod tests {
abc,
attr,
qualifiers,
relative,
}
}

0 comments on commit 9d690b3

Please sign in to comment.