diff --git a/Cargo.lock b/Cargo.lock index db24e07..6a957a4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -270,6 +270,21 @@ dependencies = [ "new_debug_unreachable", ] +[[package]] +name = "futures" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "645c6916888f6cb6350d2550b80fb63e734897a8498abe35cfb732b6487804b0" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + [[package]] name = "futures-channel" version = "0.3.30" @@ -277,6 +292,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78" dependencies = [ "futures-core", + "futures-sink", ] [[package]] @@ -285,6 +301,34 @@ version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d" +[[package]] +name = "futures-executor" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a576fc72ae164fca6b9db127eaa9a9dda0d61316034f33a0a0d4eda41f02b01d" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-io" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1" + +[[package]] +name = "futures-macro" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.64", +] + [[package]] name = "futures-sink" version = "0.3.30" @@ -303,10 +347,16 @@ version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48" dependencies = [ + "futures-channel", "futures-core", + "futures-io", + "futures-macro", + "futures-sink", "futures-task", + "memchr", "pin-project-lite", "pin-utils", + "slab", ] [[package]] @@ -1104,6 +1154,7 @@ version = "0.2.0" dependencies = [ "anyhow", "filter-proc-macro", + "futures", "regex", "reqwest", "scraper", diff --git a/Cargo.toml b/Cargo.toml index 844a195..359e80f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,6 +12,7 @@ tokio = { version = "1.38.0", features = ["full"] } filter-proc-macro = { path = "./filter-proc-macro" } serde_json = "1.0" serde = { version = "1.0", features = ["derive", "rc"] } +futures = "0.3" [workspace] members = [".", "filter-proc-macro"] diff --git a/grammar.txt b/grammar.txt index 53f1992..37b8472 100644 --- a/grammar.txt +++ b/grammar.txt @@ -10,8 +10,9 @@ rvalue -> element element -> maybe_url selector_list selector_ops `{` statement_list `}` -maybe_url -> variable - | STRING +inline -> `<` leaf filter_list `>` + +maybe_url -> inline | "" selector_list -> selector selector2 diff --git a/rust-toolchain.toml b/rust-toolchain.toml new file mode 100644 index 0000000..ce4bff1 --- /dev/null +++ b/rust-toolchain.toml @@ -0,0 +1,2 @@ +[toolchain] +channel = "beta" diff --git a/src/frontend/ast.rs b/src/frontend/ast.rs index 3f53296..090efaa 100644 --- a/src/frontend/ast.rs +++ b/src/frontend/ast.rs @@ -202,10 +202,9 @@ pub enum RValue<'a> { } #[derive(Debug, Clone)] -pub enum Url<'a> { - Parent, - String(Cow<'a, str>), - Var(&'a str), +pub struct Inline<'a> { + pub value: Leaf<'a>, + pub filters: Option>>, } #[derive(Debug, Clone)] @@ -217,7 +216,7 @@ pub struct Statement<'a> { #[derive(Debug, Clone)] pub struct Element<'a> { - pub url: Url<'a>, + pub url: Option>, pub selector_head: Selector<'a>, pub selectors: Option>>, pub ops: SelectorOpts, diff --git a/src/frontend/parser.rs b/src/frontend/parser.rs index f678d19..4b20d24 100644 --- a/src/frontend/parser.rs +++ b/src/frontend/parser.rs @@ -4,8 +4,8 @@ use std::{borrow::Cow, ops::Not}; use super::{ arena::Arena, ast::{ - ArgList, Ast, AstRef, Element, FilterList, Leaf, RValue, Selector, SelectorCombinator, - SelectorList, SelectorOpts, Statement, StatementList, Url, + ArgList, Ast, AstRef, Element, FilterList, Inline, Leaf, RValue, Selector, + SelectorCombinator, SelectorList, SelectorOpts, Statement, StatementList, }, scanner::{Lexeme, Scanner, Token}, }; @@ -88,7 +88,7 @@ impl<'a> Parser<'a> { let lx = self.scanner.peek_non_whitespace(); match lx.token { - Token::Id => self.parse_element().map(RValue::Element), + Token::Id | Token::Less => self.parse_element().map(RValue::Element), _ => self.parse_leaf().map(RValue::Leaf), } } @@ -129,7 +129,7 @@ impl<'a> Parser<'a> { } fn parse_element(&mut self) -> Result<'a, Element<'a>> { - let url = self.parse_url()?; + let url = self.parse_maybe_url()?; let selector_head = self.parse_selector()?; let selectors = self.parse_selector_list()?; let lx = self.scanner.peek_non_whitespace(); @@ -161,19 +161,28 @@ impl<'a> Parser<'a> { }) } - fn parse_url(&mut self) -> Result<'a, Url<'a>> { + fn parse_maybe_url(&mut self) -> Result<'a, Option>> { let lx = self.scanner.peek_non_whitespace(); - match lx.token { - Token::Dollar => { - self.scanner.eat_token(); - let id = self.try_eat(Token::Id)?.value; - Ok(Url::Var(id)) - } - Token::String => { - self.scanner.eat_token(); - Ok(Url::String(parse_string_literal(lx.value))) - } - _ => Ok(Url::Parent), + if lx.token == Token::Less { + self.parse_inline().map(Some) + } else { + Ok(None) + } + } + + fn parse_inline(&mut self) -> Result<'a, Inline<'a>> { + let lx = self.scanner.peek_non_whitespace(); + if lx.token == Token::Less { + self.scanner.eat_token(); + let value = self.parse_leaf()?; + let filters = self.parse_filter_list()?; + self.try_eat(Token::Greater)?; + Ok(Inline { value, filters }) + } else { + Err(ParseError::UnexpectedToken { + expected: vec![Token::Less], + got: lx, + }) } } diff --git a/src/frontend/scanner.rs b/src/frontend/scanner.rs index 2b1a429..9e37b37 100644 --- a/src/frontend/scanner.rs +++ b/src/frontend/scanner.rs @@ -16,7 +16,7 @@ pub enum Token { Star, /// the selector combinator `+` to indicate the next sibling Plus, - /// the selector combinator `>` to indicate a direct child + /// the selector combinator `>` to indicate a direct child or the end of an inline expansion Greater, /// the selector combinator `~` to indicate a subsequent sibling Tilde, @@ -44,6 +44,8 @@ pub enum Token { Colon, /// a semicolon `;` to indicate the end of a statement Semi, + /// a less than sign `<` to indicate the start of an inline expansion + Less, /// special token to indicate the end of the file Eof, /// special token to indicate unknown token @@ -116,6 +118,7 @@ mod statics { Comma <- "," Colon <- ":" Semi <- ";" + Less <- "<" }; } } diff --git a/src/interpreter/execution_mode.rs b/src/interpreter/execution_mode.rs new file mode 100644 index 0000000..5511feb --- /dev/null +++ b/src/interpreter/execution_mode.rs @@ -0,0 +1,114 @@ +use std::{future::Future, iter, option, vec}; + +use crate::frontend::ast::SelectorOpts; + +use super::DataValue; +use anyhow::Context; +use ExecutionMode::{Collection, One, Optional}; + +/// Whether we are matching a list, singular item, or optional item +/// as specified by the user +#[derive(Debug, Clone)] +pub enum ExecutionMode { + One(T), + Optional(Option), + Collection(Vec), +} + +#[derive(Debug)] +pub enum IntoIter { + One(iter::Once), + Optional(option::IntoIter), + Collection(vec::IntoIter), +} + +impl Iterator for IntoIter { + type Item = T; + + #[inline] + fn next(&mut self) -> Option { + match self { + Self::One(x) => x.next(), + Self::Optional(x) => x.next(), + Self::Collection(x) => x.next(), + } + } +} + +impl IntoIterator for ExecutionMode { + type Item = T; + + type IntoIter = IntoIter; + + #[inline] + fn into_iter(self) -> Self::IntoIter { + match self { + One(x) => IntoIter::One(iter::once(x)), + Optional(x) => IntoIter::Optional(x.into_iter()), + Collection(x) => IntoIter::Collection(x.into_iter()), + } + } +} + +impl ExecutionMode { + pub fn map U>(self, mut f: F) -> ExecutionMode { + match self { + One(x) => One(f(x)), + Optional(Some(x)) => Optional(Some(f(x))), + Optional(None) => Optional(None), + Collection(l) => Collection(l.into_iter().map(f).collect()), + } + } + + #[inline] + pub async fn async_map, Fn: FnMut(T) -> Fut>( + self, + f: Fn, + ) -> ExecutionMode { + self.map(f).transpose_fut().await + } + + pub fn hinted_from_iter>( + ops: SelectorOpts, + mut iter: I, + ) -> anyhow::Result { + Ok(match ops { + // TODO: take the first, or fail if there are > 1? + SelectorOpts::One => One(iter.next().context("Expected exactly one value")?), + SelectorOpts::Optional => Optional(iter.next()), + SelectorOpts::Collection => Collection(iter.collect()), + }) + } +} + +impl ExecutionMode { + pub fn into_data_value(self) -> DataValue { + match self { + One(x) | Optional(Some(x)) => x, + Optional(None) => DataValue::Null, + Collection(l) => DataValue::List(l), + } + } +} + +impl ExecutionMode> { + pub fn transpose_res(self) -> Result, E> { + Ok(match self { + One(x) => One(x?), + Optional(Some(x)) => Optional(Some(x?)), + Optional(None) => Optional(None), + Collection(l) => Collection(l.into_iter().collect::>()?), + }) + } +} + +impl> ExecutionMode { + pub async fn transpose_fut(self) -> ExecutionMode { + match self { + One(f) => One(f.await), + Optional(Some(f)) => Optional(Some(f.await)), + Optional(None) => Optional(None), + Collection(l) => Collection(futures::future::join_all(l).await), + } + } +} diff --git a/src/interpreter/mod.rs b/src/interpreter/mod.rs index 5fbd9fd..d9ed8b5 100644 --- a/src/interpreter/mod.rs +++ b/src/interpreter/mod.rs @@ -1,42 +1,23 @@ use std::{borrow::Cow, cell::OnceCell, collections::BTreeMap, sync::Arc}; use anyhow::Context; +use execution_mode::ExecutionMode; +use reqwest::IntoUrl; use crate::frontend::{ - ast::{AstRef, Element, FilterList, Leaf, RValue, SelectorOpts, Statement, StatementList}, + ast::{AstRef, Element, FilterList, Inline, Leaf, RValue, Statement, StatementList}, AstArena, }; +mod execution_mode; mod filter; mod value; pub use filter::Filter; pub use value::{DataValue, TryFromValue, Value}; -/// Whether we are matching a list, singular item, or optional item -/// as specified by the user -#[derive(Debug)] -enum ExecutionMode { - One(T), - Optional(Option), - Collection(Vec), -} - -impl ExecutionMode { - fn try_map(self, mut f: F) -> Result, E> - where - F: FnMut(T) -> Result, - { - use ExecutionMode::{Collection, One, Optional}; - - Ok(match self { - One(t) => One(f(t)?), - Optional(Some(t)) => Optional(Some(f(t)?)), - Optional(None) => Optional(None), - Collection(vec) => Collection(vec.into_iter().map(f).collect::>()?), - }) - } -} +type Error = anyhow::Error; +type Result = core::result::Result; impl<'ast> Element<'ast> { #[must_use] @@ -78,6 +59,18 @@ impl<'a, 'b> From> for Variables<'a, 'b> { } } +impl<'ast> From> for DataValue { + fn from(value: DataVariables<'ast>) -> Self { + Self::Structure( + value + .0 + .into_iter() + .map(|(k, v)| (Arc::from(&*k), v)) + .collect(), + ) + } +} + #[derive(Debug)] pub struct ElementContext<'ast, 'ctx> { variables: Variables<'ast, 'ctx>, @@ -86,102 +79,163 @@ pub struct ElementContext<'ast, 'ctx> { parent: Option<&'ctx ElementContext<'ast, 'ctx>>, } -pub fn interpret<'ast>( - html: scraper::Html, +#[derive(Debug)] +pub struct Interpreter<'ast> { + client: reqwest::Client, ast: &'ast AstArena<'ast>, - head: Option>>, -) -> anyhow::Result> { - let mut ctx = ElementContext { - element: html.root_element(), - variables: Variables::default(), - text: OnceCell::new(), - parent: None, - }; - - for statement in ast.flatten(head) { - interpret_statement(ast, &statement.value, &mut ctx)?; - } - - Ok(ctx.variables.into()) } -fn interpret_statement<'ast, 'stmt, 'ctx: 'stmt>( - ast: &'ast AstArena<'ast>, - statement: &Statement<'ast>, - ctx: &'stmt mut ElementContext<'ast, 'ctx>, -) -> anyhow::Result<()> { - match statement.id { - immutable @ ("document" | "text") => { - anyhow::bail!("can't assign to immutable variable `{immutable}`") - } - id => { - let value = match &statement.value { - RValue::Leaf(l) => ctx.leaf_to_value(l)?, - RValue::Element(e) => interpret_element(ctx, ctx.element, ast, e)?.into(), - }; +impl<'ast> Interpreter<'ast> { + #[must_use] + #[inline] + pub fn new(ast: &'ast AstArena<'ast>) -> Self { + Self::with_client( + ast, + reqwest::Client::builder() + .user_agent(concat!( + env!("CARGO_PKG_NAME"), + " v", + env!("CARGO_PKG_VERSION") + )) + .build() + .expect("Default client is invalid"), + ) + } - let value = apply_filters(ast.flatten(statement.filters).into_iter(), value, ctx, ast)?; - ctx.variables.0.insert(Cow::Borrowed(id), value); - } + #[must_use] + #[inline] + pub const fn with_client(ast: &'ast AstArena<'ast>, client: reqwest::Client) -> Self { + Self { ast, client } } - Ok(()) -} + #[inline] + pub async fn interpret( + &self, + root_url: U, + head: Option>>, + ) -> Result> { + let html = self.get_html(root_url).await?; + self.interpret_block(html.root_element(), head, None).await + } -fn interpret_element<'ast, 'ctx>( - parent: &'ctx ElementContext<'ast, 'ctx>, - root: scraper::ElementRef<'ctx>, - ast: &'ast AstArena<'ast>, - element: &Element<'ast>, -) -> anyhow::Result { - let selector_str = element.to_selector_str(ast); + async fn get_html(&self, url: U) -> Result { + let text = self + .client + .get(url) + .send() + .await + .context("Error sending HTTP request")? + .text() + .await + .context("Error getting HTTP body text")?; + Ok(scraper::Html::parse_document(&text)) + } - let selector = scraper::Selector::parse(&selector_str).map_err(|e| { - anyhow::anyhow!( - "Selector parse failed: {e}. This is a program error. Selector is `{selector_str}`", - ) - })?; - - let mut selection = root.select(&selector); - - let element_refs = match element.ops { - // TODO: take the first, or fail if there are > 1? - SelectorOpts::One => ExecutionMode::One( - selection - .next() - .with_context(|| format!("Expected exactly one `{selector_str}`"))?, - ), - SelectorOpts::Optional => ExecutionMode::Optional(selection.next()), - SelectorOpts::Collection => ExecutionMode::Collection(selection.collect()), - }; - - let values = element_refs.try_map(|element_ref| { + async fn interpret_block( + &self, + element: scraper::ElementRef<'_>, + statements: Option>>, + parent: Option<&ElementContext<'ast, '_>>, + ) -> Result> { let mut ctx = ElementContext { - element: element_ref, + element, + parent, variables: Variables::default(), text: OnceCell::new(), - parent: Some(parent), }; - for statement in ast.flatten(element.statements) { - let statement = &statement.value; - interpret_statement(ast, statement, &mut ctx)?; + for statement in self.ast.flatten(statements) { + self.interpret_statement(&statement.value, &mut ctx).await?; } - Ok::<_, anyhow::Error>(DataValue::Structure( - ctx.variables - .0 + Ok(ctx.variables.into()) + } + + async fn interpret_statement( + &self, + statement: &Statement<'ast>, + ctx: &mut ElementContext<'ast, '_>, + ) -> Result<()> { + let value = match &statement.value { + RValue::Leaf(l) => ctx.leaf_to_value(l)?, + RValue::Element(e) => self.interpret_element(e, ctx).await?.into(), + }; + + let value = + self.apply_filters(value, self.ast.flatten(statement.filters).into_iter(), ctx)?; + ctx.set_var(Cow::Borrowed(statement.id), value)?; + + Ok(()) + } + + async fn interpret_element( + &self, + element: &Element<'ast>, + ctx: &mut ElementContext<'ast, '_>, + ) -> anyhow::Result { + let html; + + let root_element = if let Some(url) = &element.url { + let url: Arc = self.eval_inline(url, ctx)?.try_into()?; + html = self.get_html(&*url).await?; + html.root_element() + } else { + ctx.element + }; + + let selector_str = element.to_selector_str(self.ast); + + let selector = scraper::Selector::parse(&selector_str).map_err(|e| { + anyhow::anyhow!( + "Selector parse failed: {e}. This is a program error. Selector is `{selector_str}`", + ) + })?; + + let selection = root_element.select(&selector); + + let element_refs = ExecutionMode::hinted_from_iter(element.ops, selection)?; + + let values = + futures::future::try_join_all(element_refs.into_iter().map(|element_ref| { + self.interpret_block(element_ref, element.statements, Some(ctx)) + })) + .await?; + + Ok( + ExecutionMode::hinted_from_iter(element.ops, values.into_iter().map(DataValue::from))? + .into_data_value(), + ) + } + + fn apply_filters<'ctx>( + &self, + value: Value<'ctx>, + mut filters: impl Iterator>, + ctx: &mut ElementContext<'ast, 'ctx>, + ) -> Result> { + filters.try_fold(value, |value, filter| { + let args = self + .ast + .flatten(filter.args) .into_iter() - .filter_map(|(k, v)| DataValue::try_from(v).ok().map(|v| (Arc::from(&*k), v))) - .collect(), - )) - })?; - - Ok(match values { - ExecutionMode::One(x) | ExecutionMode::Optional(Some(x)) => x, - ExecutionMode::Optional(None) => DataValue::Null, - ExecutionMode::Collection(l) => DataValue::List(l), - }) + .map(|arg| Ok((arg.id, ctx.leaf_to_value(&arg.value)?))) + .collect::>>()?; + + filter::dispatch_filter(filter.id, value, args, ctx) + }) + } + + fn eval_inline<'ctx>( + &self, + inline: &Inline<'ast>, + ctx: &mut ElementContext<'ast, 'ctx>, + ) -> Result> { + self.apply_filters( + ctx.leaf_to_value(&inline.value)?, + self.ast.flatten(inline.filters).into_iter(), + ctx, + ) + } } impl<'ast, 'ctx> ElementContext<'ast, 'ctx> { @@ -226,49 +280,24 @@ impl<'ast, 'ctx> ElementContext<'ast, 'ctx> { } } -fn apply_filters<'ast, 'ctx>( - filters: impl Iterator>, - value: Value<'ctx>, - ctx: &mut ElementContext<'ast, 'ctx>, - ast: &AstArena<'ast>, -) -> anyhow::Result> { - let mut value = value; - for f in filters { - let args: BTreeMap<_, _> = ast - .flatten(f.args) - .into_iter() - .map(|a| { - let value = match &a.value { - Leaf::Var(id) => ctx.get_var(id)?, - Leaf::Int(n) => Value::Int(*n), - Leaf::String(c) => Value::String(Arc::from(&**c)), - Leaf::Float(f) => Value::Float(*f), - }; - - Ok((a.id, value)) - }) - .collect::>()?; - - value = filter::dispatch_filter(f.id, value, args, ctx)?; - } - Ok(value) -} - #[cfg(test)] -pub fn interpret_string_harness( +pub async fn interpret_string_harness( program: &'static str, html: &'static str, -) -> anyhow::Result> { +) -> Result> { let (ast, head) = crate::frontend::Parser::new(program).parse()?; let html = scraper::Html::parse_document(html); - interpret(html, Box::leak(Box::new(ast)), head) + let interpreter = Interpreter::new(Box::leak(Box::new(ast))); + interpreter + .interpret_block(html.root_element(), head, None) + .await } #[cfg(test)] mod tests { - use super::{interpret, DataValue::*}; + use super::DataValue::*; - fn integration_test(filename: &str) -> anyhow::Result<()> { + async fn integration_test(filename: &str) -> anyhow::Result<()> { let input = std::fs::read_to_string(format!("examples/inputs/{filename}.html"))?; let script = std::fs::read_to_string(format!("examples/scrps/{filename}.scrp"))?; let output: serde_json::Value = serde_json::from_reader(std::fs::File::open(format!( @@ -278,9 +307,12 @@ mod tests { let (ast, head) = crate::frontend::Parser::new(&script) .parse() .expect("parse error"); + let html = scraper::Html::parse_document(&input); - let result = interpret(html, &ast, head)?; + let result = super::Interpreter::new(&ast) + .interpret_block(html.root_element(), head, None) + .await?; let result = serde_json::to_value(result.0)?; assert_eq!(output, result); @@ -292,16 +324,16 @@ mod tests { $($name: ident,)* } => { $( - #[test] - fn $name() -> anyhow::Result<()> { - integration_test(stringify!($name)) + #[tokio::test] + async fn $name() -> anyhow::Result<()> { + integration_test(stringify!($name)).await } )* }; } - #[test] - fn test_basic() { + #[tokio::test] + async fn test_basic() { let output = super::interpret_string_harness( r#" h3: h3 { @@ -320,6 +352,7 @@ mod tests { "#, ) + .await .expect("parsing and interpreting should succeed"); let Some(Structure(d)) = output.0.get("h3") else { diff --git a/src/main.rs b/src/main.rs index fbef0db..227eb25 100644 --- a/src/main.rs +++ b/src/main.rs @@ -2,7 +2,7 @@ use std::{env, sync::Arc}; use anyhow::Context; use frontend::Parser; -use interpreter::DataValue; +use interpreter::{DataValue, Interpreter}; pub mod frontend; pub mod interpreter; @@ -26,40 +26,14 @@ async fn main() -> anyhow::Result<()> { let parser = Parser::new(&pgm); - let client = reqwest::Client::builder() - .user_agent(env::var("USER_AGENT").unwrap_or_else(|_| { - format!("{}v{}", env!("CARGO_PKG_NAME"), env!("CARGO_PKG_VERSION")) - })) - .build() - .context("error building HTTP client")?; - let cloned = client.clone(); - - let parse_fut = async move { - match parser.parse() { - Ok(x) => Ok(x), - Err(e) => anyhow::bail!("Parse error: {e}"), - } - }; - - let fetch_fut = async move { - let client = cloned; - let url = url; - let req = client - .get(&url) - .send() - .await - .with_context(|| format!("Error fetching `{url}`"))?; - - let text = req.text().await.context("Error getting body text")?; - - let html = scraper::Html::parse_document(&text); - - Ok::<_, anyhow::Error>(html) + let (ast, head) = match parser.parse() { + Ok(x) => x, + Err(e) => anyhow::bail!("Parse Error: {e}"), }; - let ((ast, head), html) = tokio::try_join!(parse_fut, fetch_fut)?; + let interpreter = Interpreter::new(&ast); - let results = interpreter::interpret(html, &ast, head)?; + let results = interpreter.interpret(url, head).await?; let results = DataValue::Structure( results