From 8583e9c49a85743ecff9a20d56f47a915914130d Mon Sep 17 00:00:00 2001 From: m Date: Wed, 11 Sep 2024 11:06:48 -0700 Subject: [PATCH 1/3] add repl --- Cargo.lock | 80 ++++++- Cargo.toml | 1 + filter-types/src/context.rs | 68 ++++-- filter-types/src/value.rs | 10 +- src/frontend/ast.rs | 16 ++ src/frontend/parser.rs | 6 +- src/interpreter/mod.rs | 26 +- src/interpreter/repl.rs | 463 ++++++++++++++++++++++++++++++++++++ src/main.rs | 23 +- 9 files changed, 648 insertions(+), 45 deletions(-) create mode 100644 src/interpreter/repl.rs diff --git a/Cargo.lock b/Cargo.lock index 4d13e1c..cc05144 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -39,6 +39,12 @@ dependencies = [ "memchr", ] +[[package]] +name = "aliasable" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "250f629c0161ad8107cf89319e990051fae62832fd343083bea452d93e2205fd" + [[package]] name = "anstream" version = "0.6.15" @@ -197,7 +203,7 @@ version = "4.5.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "501d359d5f3dcaf6ecdeee48833ae73ec6e42723a1e52419c79abf9507eec0a0" dependencies = [ - "heck", + "heck 0.5.0", "proc-macro2", "quote", "syn", @@ -298,6 +304,12 @@ version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3a68a4904193147e0a8dec3314640e6db742afd5f6e634f428a6af230d9b3591" +[[package]] +name = "either" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" + [[package]] name = "encode_unicode" version = "0.3.6" @@ -524,6 +536,12 @@ version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" +[[package]] +name = "heck" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" + 
[[package]] name = "heck" version = "0.5.0" @@ -708,6 +726,15 @@ version = "1.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" +[[package]] +name = "itertools" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.11" @@ -898,6 +925,31 @@ dependencies = [ "vcpkg", ] +[[package]] +name = "ouroboros" +version = "0.18.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "944fa20996a25aded6b4795c6d63f10014a7a83f8be9828a11860b08c5fc4a67" +dependencies = [ + "aliasable", + "ouroboros_macro", + "static_assertions", +] + +[[package]] +name = "ouroboros_macro" +version = "0.18.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39b0deead1528fd0e5947a8546a9642a9777c25f6e1e26f34c97b204bbb465bd" +dependencies = [ + "heck 0.4.1", + "itertools", + "proc-macro2", + "proc-macro2-diagnostics", + "quote", + "syn", +] + [[package]] name = "parking_lot" version = "0.12.3" @@ -1079,6 +1131,19 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "proc-macro2-diagnostics" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af066a9c399a26e020ada66a034357a868728e72cd426f3adcd35f80d88d88c8" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "version_check", + "yansi", +] + [[package]] name = "quote" version = "1.0.36" @@ -1302,6 +1367,7 @@ dependencies = [ "clap", "futures", "insta", + "ouroboros", "regex", "reqwest", "scrapelect-filter-proc-macro", @@ -1500,6 +1566,12 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" +[[package]] +name = "static_assertions" +version = "1.1.0" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + [[package]] name = "string_cache" version = "0.8.7" @@ -2047,6 +2119,12 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "yansi" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfe53a6657fd280eaa890a3bc59152892ffa3e30101319d168b781ed6529b049" + [[package]] name = "zerocopy" version = "0.7.35" diff --git a/Cargo.toml b/Cargo.toml index c31ba64..5ce8a7e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,6 +20,7 @@ serde = { version = "1.0", features = ["derive", "rc"] } futures = "0.3" url = "2.5" clap = { version = "4.5.16", features = ["derive"] } +ouroboros = "0.18.4" [workspace] members = [".", "filter-proc-macro", "filter-types"] diff --git a/filter-types/src/context.rs b/filter-types/src/context.rs index fd6df90..59e9792 100644 --- a/filter-types/src/context.rs +++ b/filter-types/src/context.rs @@ -18,24 +18,56 @@ pub trait ElementContextView<'ast, 'ctx> { /// /// Overwrites a previous value bound to that name, if one is present. /// + /// Implementors should not implement this, instead [`set_inner`](Self::set_inner). + /// /// # Errors /// /// Returns an `Err` if `name` cannot be rebound (e.g., it is `"element"`). - fn set(&mut self, name: Cow<'ast, str>, value: EValue<'ctx>) -> Result<()>; + fn set(&mut self, name: Cow<'ast, str>, value: EValue<'ctx>) -> Result<()> { + match &*name { + immutable @ "element" => { + bail!("assignment to immutable binding `{immutable}`") + } + _ => self.set_inner(name, value), + } + } + + /// Sets the binding with name `name` to `value` in this context. + /// + /// Overwrites a previous value bound to that name, if one is present. + /// + /// Implementors should implement this and not `set`. 
+ fn set_inner(&mut self, name: Cow<'ast, str>, value: EValue<'ctx>) -> Result<()>; /// Gets the binding with name `id`, if it is present. Handles /// retrieving special bindings like `element`. Looks in this /// context and all parent contexts, starting innermost first. /// + /// Implementors should not implement this and instead use [`get_inner`](Self::get_inner). + /// /// # Errors /// /// Returns an `Err` if a binding with name `id` is not found in this /// scope or any parent scopes. - fn get(&self, id: &str) -> Result>; + fn get(&self, id: &str) -> Result> { + match id { + "element" => Ok(self.element().into()), + _ => self.get_inner(id), + } + } + + /// For implementors of [`ElementContext`]. Only needs to get/set safe + /// (non-special) bindings, looking up in parent scopes as necessary. + /// + /// # Errors + /// + /// Returns an `Err` if a binding with name `id` is not found in this + /// scope or any parent scopes. + fn get_inner(&self, id: &str) -> Result>; /// Returns a [reference](ElementRef) to the root element of this block. #[must_use] - fn element(&self) -> ElementRef<'_>; + fn element(&self) -> ElementRef<'ctx>; /// Returns a reference to the URL of the document that this element is in. #[must_use] @@ -167,27 +199,19 @@ impl<'ast, 'ctx> Linked<'ast, 'ctx> { } impl<'ast, 'ctx> ElementContextView<'ast, 'ctx> for Linked<'ast, 'ctx> { - fn get(&self, id: &str) -> Result> { - match id { - "element" => Ok(self.element.into()), - _ => match self.bindings.0.get(id) { - Some(id) => Ok(id.clone()), - None => self - .parent - .with_msg(|| format!("unknown binding `{id}`"))? - .get(id), - }, + fn get_inner(&self, id: &str) -> Result> { + match self.bindings.0.get(id) { + Some(id) => Ok(id.clone()), + None => self + .parent + .with_msg(|| format!("unknown binding `{id}`"))? 
+ .get(id), } } - fn set(&mut self, name: Cow<'ast, str>, value: EValue<'ctx>) -> Result<()> { - match &*name { - immutable @ "element" => { - bail!("assignment to immutable binding `{immutable}`") - } - _ => self.bindings.0.insert(name, value), - }; - + #[inline] + fn set_inner(&mut self, name: Cow<'ast, str>, value: EValue<'ctx>) -> Result<()> { + self.bindings.0.insert(name, value); Ok(()) } @@ -197,7 +221,7 @@ impl<'ast, 'ctx> ElementContextView<'ast, 'ctx> for Linked<'ast, 'ctx> { } #[inline] - fn element(&self) -> ElementRef<'_> { + fn element(&self) -> ElementRef<'ctx> { self.element } } diff --git a/filter-types/src/value.rs b/filter-types/src/value.rs index 6301020..56c1258 100644 --- a/filter-types/src/value.rs +++ b/filter-types/src/value.rs @@ -237,7 +237,15 @@ pub enum Element<'a> { impl fmt::Display for Element<'_> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { - Self::Element(elem) => write!(f, "`{}`", elem.html()), + Self::Element(elem) => { + write!(f, "<{}", elem.value().name())?; + + for (name, value) in elem.value().attrs() { + write!(f, r#"{name}="{value}""#)?; + } + + f.write_str(">") + } } } } diff --git a/src/frontend/ast.rs b/src/frontend/ast.rs index 28ebf97..9b9d6b1 100644 --- a/src/frontend/ast.rs +++ b/src/frontend/ast.rs @@ -221,4 +221,20 @@ mod selector_display { Ok(()) } } + + impl Selector<'_> { + pub fn to_scraper(&self) -> scraper::Selector { + let selector_str = self.to_string(); + // the borrow checker does not like having this inline. + let result = scraper::Selector::parse(&selector_str); + match result { + Ok(s) => s, + Err(e) => unreachable!( + "failed to parse selector `{selector_str}`. + This is a bug in `scrapelect`, please report it. 
+ `selectors` error: {e}" + ), + } + } + } } diff --git a/src/frontend/parser.rs b/src/frontend/parser.rs index 340f0ed..f1fd01f 100644 --- a/src/frontend/parser.rs +++ b/src/frontend/parser.rs @@ -179,7 +179,7 @@ impl<'a> Parser<'a> { Ok(Inline { value, filters }) } - fn parse_value(&mut self) -> Result> { + pub(crate) fn parse_value(&mut self) -> Result> { let (span, lx) = self.scanner.peek_non_whitespace(); match lx.token { Token::Less => self.parse_inline(), @@ -230,7 +230,9 @@ impl<'a> Parser<'a> { let (span, lx) = item; match lx.token { - Token::BraceOpen | Token::ParenOpen => Ok(None), + // Eof is only allowed in a repl context. + // TODO: investigate if this makes it possible to parse an invalid program anyway. + Token::BraceOpen | Token::ParenOpen | Token::Eof => Ok(None), // invariant: peek_next_whitespace is one of Id | Hash | Dot | Star // whitespace is eaten in the above block. Token::Whitespace => Ok(Some(SelectorCombinator::Descendent)), diff --git a/src/interpreter/mod.rs b/src/interpreter/mod.rs index c487682..198d8e0 100644 --- a/src/interpreter/mod.rs +++ b/src/interpreter/mod.rs @@ -3,17 +3,20 @@ use std::{borrow::Cow, collections::BTreeMap, sync::Arc}; use execution_mode::ExecutionMode; use reqwest::Url; use scrapelect_filter_types::{ - bail, other, Bindings, EValue, ElementContext, ElementContextView as _, Linked, ListIter, - PValue, Value, + bail, Bindings, EValue, ElementContext, ElementContextView as _, Linked, ListIter, PValue, + Value, }; use crate::frontend::ast::{self, Element, Inline, Leaf, Qualifier, RValue, Statement}; mod execution_mode; pub mod filter; +mod repl; pub use scrapelect_filter_types::{Error, MessageExt, Result, WrapExt}; +pub use repl::Repl; + #[derive(Debug)] pub struct Interpreter { client: reqwest::Client, @@ -115,7 +118,6 @@ impl Interpreter { element: &Element<'ast>, ctx: &mut E, ) -> Result { - let selector_str = &element.selector.to_string(); let inner = || async move { let html; @@ -135,14 +137,7 @@ impl 
Interpreter { (ctx.element(), None) }; - let selector = scraper::Selector::parse(selector_str).map_err(|e| { - other!( - "failed to parse selector `{selector_str}`. - This is a bug in `scrapelect`, please report it. - `selectors` error: {e}" - ) - })?; - + let selector = element.selector.to_scraper(); let selection = root_element.select(&selector); let element_refs = ExecutionMode::hinted_from_iter(element.qualifier, selection)?; @@ -160,9 +155,12 @@ impl Interpreter { .into_value()) }; - inner() - .await - .wrap_with(|| format!("note: occurred while evaluating element block `{selector_str}`")) + inner().await.wrap_with(|| { + format!( + "note: occurred while evaluating element block `{}`", + element.selector + ) + }) } fn apply_filters<'a, 'ast: 'a, 'ctx, E: ElementContext<'ast, 'ctx>>( diff --git a/src/interpreter/repl.rs b/src/interpreter/repl.rs new file mode 100644 index 0000000..d6a61d1 --- /dev/null +++ b/src/interpreter/repl.rs @@ -0,0 +1,463 @@ +//! The `scrapelect` read-evaluate-print-loop interpreter. +//! +//! To run the loop, see [`Repl::rep`] and [`Repl::repl`]. This module is primarily +//! 
for use in the `scrapelect` binary. + +use std::{ + borrow::Cow, + io::{self, BufRead, StdinLock, StdoutLock, Write}, + sync::Arc, +}; + +use anyhow::Context as _; +use scrapelect_filter_types::{ + other, Bindings, EValue, Element, ElementContext, ElementContextView, Value, +}; +use scraper::{ElementRef, Html}; +use url::Url; + +use crate::frontend::{ast, Parser, Token}; + +use super::Interpreter; + +#[derive(Debug)] +pub enum ElementParent { + Document { + document: Html, + url: Url, + }, + Element { + parent: Arc, + selector: String, + name: Option, + }, +} + +#[ouroboros::self_referencing] +#[derive(Debug)] +pub struct ElementArc { + parent: Arc, + #[covariant] + #[borrows(parent)] + element: ElementRef<'this>, +} + +#[ouroboros::self_referencing] +#[derive(Debug)] +pub struct ContextNode { + pub element: Arc, + #[covariant] + #[borrows(element)] + pub bindings: Bindings<'static, Element<'this>>, +} + +#[derive(Debug)] +struct Context<'a> { + element: ElementRef<'a>, + url: Url, + view: Vec<&'a Bindings<'static, Element<'a>>>, + ledger: Bindings<'static, Element<'a>>, +} + +impl<'a> Context<'a> { + #[must_use] + pub fn new(stack: &'a [ContextNode]) -> Option { + let url = stack + .iter() + .rev() + .find_map(|x| match &**x.borrow_element().borrow_parent() { + ElementParent::Document { url, .. } => Some(url), + _ => None, + })? 
+ .clone(); + + let &element = stack.last()?.borrow_element().borrow_element(); + + let view = stack.iter().map(|x| x.borrow_bindings()).collect(); + + Some(Self { + element, + url, + view, + ledger: Bindings::new(), + }) + } + + #[inline] + fn new_as_error(stack: &'a [ContextNode]) -> anyhow::Result { + Self::new(stack).context( + "You do not have a document open.\n\ + Call `/open ` to load a document from a URL.", + ) + } +} + +impl<'a, 'b> ElementContextView<'b, 'a> for Context<'a> { + #[inline] + fn element(&self) -> ElementRef<'a> { + self.element + } + + #[inline] + fn set_inner( + &mut self, + name: Cow<'b, str>, + value: EValue<'a>, + ) -> scrapelect_filter_types::Result<()> { + self.ledger.0.insert(Cow::Owned(name.into_owned()), value); + Ok(()) + } + + fn get_inner(&self, id: &str) -> scrapelect_filter_types::Result> { + if let Some(item) = self.ledger.0.get(id) { + Ok(item.clone()) + } else if let Some(item) = self.view.iter().rev().find_map(|x| x.0.get(id)) { + Ok(item.clone()) + } else { + Err(other!("Binding `{id}` not found.")) + } + } + + #[inline] + fn url(&self) -> &Url { + &self.url + } +} + +impl<'a, 'b> ElementContext<'b, 'a> for Context<'a> { + fn new(element: ElementRef<'a>, url: Url) -> Self { + Self { + element, + url, + ledger: Bindings::new(), + view: Vec::new(), + } + } + + #[inline] + fn into_bindings(self) -> Bindings<'static> { + self.ledger.into_data() + } + + type Nested<'inner> = Context<'inner> where Self: 'inner; + + fn nest<'inner, 'outer: 'inner>( + &'outer self, + url: Option, + element: ElementRef<'inner>, + ) -> Self::Nested<'inner> { + Context { + element, + url: url.unwrap_or_else(|| self.url.clone()), + ledger: Bindings::new(), + view: self + .view + .iter() + .copied() + .chain(std::iter::once(&self.ledger)) + .collect(), + } + } +} + +#[non_exhaustive] +#[derive(Debug)] +pub struct Repl, W = StdoutLock<'static>> { + client: reqwest::Client, + stack: Vec, + input: R, + output: W, +} + +impl Repl { + /// Creates a new 
empty [`Repl`] instance using stdin and stdout. + #[inline] + #[must_use] + pub fn new() -> Self { + Self { + client: reqwest::Client::builder() + .user_agent(concat!( + env!("CARGO_PKG_NAME"), + " v", + env!("CARGO_PKG_VERSION") + )) + .build() + .expect("Default client is invalid"), + stack: Vec::new(), + input: io::stdin().lock(), + output: io::stdout().lock(), + } + } + + /// Creates a new [`Repl`] with the given url loaded and opened. + pub async fn open(url: Url) -> anyhow::Result { + let mut this = Self::new(); + let interpreter = Interpreter::with_client(this.client.clone()); + let document = interpreter.get_html(&url).await?; + this.stack.push(ContextNode::new( + Arc::new(ElementArc::new( + Arc::new(ElementParent::Document { document, url }), + |e| match &**e { + ElementParent::Document { document, .. } => document.root_element(), + _ => unreachable!("Expected Document variant"), + }, + )), + |_| Bindings::new(), + )); + + Ok(this) + } +} + +macro_rules! output { + ($self: expr, $($tt:tt)*) => { + write!(&mut $self.output, $($tt)*) + }; +} + +macro_rules! outputln { + ($self: expr$(, $($tt:tt)*)?) => { + writeln!(&mut $self.output, $($($tt)*)?) + }; +} + +impl Repl { + /// Read a line from the input into `buf`. Returns `Ok(true)` if + /// a string was added and `Ok(false)` if EOF was reached. + /// + /// # Errors + /// + /// Returns an `Err` when an error occurred while reading the line. + fn get_line(&mut self, buf: &mut String) -> anyhow::Result { + if let Some(top) = self.stack.last() { + match &**top.borrow_element().borrow_parent() { + ElementParent::Document { url, .. } => output!(self, "{url}")?, + ElementParent::Element { selector, .. 
} => output!(self, "{selector}")?, + } + } + output!(self, "> ")?; + self.output.flush()?; + + buf.clear(); + while !buf.ends_with('\n') { + let len = self.input.read_line(buf)?; + if len == 0 { + return Ok(false); + } + } + + // remove newline + buf.pop(); + + Ok(true) + } + + async fn handle_command( + &mut self, + interpreter: &Interpreter, + command: &str, + ) -> anyhow::Result { + let mut parser = Parser::new(command); + + match parser.try_eat(Token::Id)?.value { + "exit" | "quit" => return Ok(false), + "open" => { + let url = Parser::parse_string_literal(parser.try_eat(Token::String)?.value); + let url = url + .parse() + .map_err(|e| anyhow::anyhow!("Invalid URL `{url}`: {e}"))?; + + let document = interpreter.get_html(&url).await?; + + self.stack.push(ContextNode::new( + Arc::new(ElementArc::new( + Arc::new(ElementParent::Document { document, url }), + |doc| { + let ElementParent::Document { document, .. } = &**doc else { + unreachable!( + "expected a document parent at {}:{}:{}", + file!(), + line!(), + column!() + ) + }; + + document.root_element() + }, + )), + |_| Bindings::new(), + )); + } + "leave" => match self.stack.pop() { + Some(mut node) => { + let bindings = + node.with_bindings_mut(|b| std::mem::take(b).into_data().into_value()); + + if let ElementParent::Element { + name: Some(name), .. 
+ } = &**node.borrow_element().borrow_parent() + { + if let Some(last) = self.stack.last_mut() { + outputln!(self, "{name}: {bindings}")?; + last.with_bindings_mut(|b| { + b.0.insert(Cow::Owned(name.clone()), Value::from_data(bindings)) + }); + } + } + } + None => return Ok(false), + }, + "enter" => { + let name = parser.try_eat(Token::Id)?.value; + + let selector = parser.parse_selector()?; + self.select(&selector, Some(name.to_owned()))?; + } + "help" => { + outputln!(self, + "Available commands:\n\n\ + - /help: display this help message\n\ + - /open : open a new web page at `url` and select the root element.\n\ + - /enter : enter a multiline element context block `name: selector {{...}}`\n\ + - /current: print the current URL and selector\n\ + - /leave: leave the current element context\n\ + - /quit | /exit: exit the REPL\n\ + " + )?; + } + "current" => { + for item in &self.stack { + match &**item.borrow_element().borrow_parent() { + ElementParent::Document { url, .. } => output!(self, "\non {url}:")?, + ElementParent::Element { selector, .. } => output!(self, " {selector}")?, + } + } + outputln!(self)?; + } + "eval" => { + let inline = parser.parse_value()?; + let mut ctx = Context::new_as_error(&self.stack)?; + + let value = interpreter.eval_inline(&inline, &mut ctx)?; + outputln!(self, "{value}")?; + let ledger = into_data(ctx); + self.pop_off(ledger)?; + } + unknown => anyhow::bail!( + "Unknown command `/{unknown}`.\n\ + Run `/help` for a list of commands." + ), + } + + Ok(true) + } + + pub async fn repl(mut self) -> anyhow::Result<()> { + let interpreter = Interpreter::with_client(self.client.clone()); + + loop { + let result = self.rep(&interpreter).await; + + match result { + Ok(true) => (), + Ok(false) => break, + Err(e) => eprintln!("Error: {e}"), + } + } + + outputln!(self, "Exiting...")?; + Ok(()) + } + + pub async fn rep(&mut self, interpreter: &Interpreter) -> anyhow::Result { + let mut buf = String::new(); + if !self.get_line(&mut buf)? 
{ + return Ok(false); + } + + let input = buf.trim(); + + if let Some(command) = input.strip_prefix("/") { + self.handle_command(interpreter, command).await + } else { + let mut ctx = Context::new_as_error(&self.stack)?; + let mut parser = Parser::new(input); + let statement = parser.parse_statement()?; + + interpreter + .interpret_statement(&statement, &mut ctx) + .await?; + + let ledger = into_data(ctx); + self.pop_off(ledger)?; + + Ok(true) + } + } + + pub fn select( + &mut self, + selector: &ast::Selector<'_>, + name: Option, + ) -> anyhow::Result<()> { + let ctx = self.stack.last().context( + "You do not have a document open.\n\ + Call `/open ` to load a document from a URL.", + )?; + + let new_element = ElementArc::try_new( + Arc::new(ElementParent::Element { + parent: ctx.borrow_element().clone(), + selector: selector.to_string(), + name, + }), + |e| { + let ElementParent::Element { parent, .. } = &**e else { + unreachable!( + "expected an `Element` variant at {}:{}:{}", + file!(), + line!(), + column!() + ); + }; + parent + .borrow_element() + .select(&selector.to_scraper()) + .next() + .with_context(|| format!("no element found with selector `{selector}`")) + }, + )?; + + self.stack + .push(ContextNode::new(Arc::new(new_element), |_| Bindings::new())); + Ok(()) + } + + fn pop_off(&mut self, ledger: Bindings<'static, Element<'static>>) -> anyhow::Result<()> { + let last_mut = self + .stack + .last_mut() + .context("Expected to be in an element block")?; + + for (name, value) in ledger.0 { + outputln!(self, "{name}: {value}")?; + last_mut.with_bindings_mut(|b| b.0.insert(name, value)); + } + + Ok(()) + } +} + +// TODO: make this not drop elements with lifetimes. 
+fn into_data(ctx: Context<'_>) -> Bindings<'static, Element<'static>> { + for (k, v) in &ctx.ledger.0 { + if let Value::Extra(e) = v { + eprintln!( + "Binding {k}: {e} contains a temporary element reference, which is currently not supported in the REPL.\n\ + It will be 'forgotten' and not available in the subsequent lines." + ); + } + } + + Bindings::from_data(ctx.ledger.into_data()) +} diff --git a/src/main.rs b/src/main.rs index 4d0e218..9c3d4bc 100644 --- a/src/main.rs +++ b/src/main.rs @@ -3,7 +3,10 @@ use std::path::PathBuf; use anyhow::Context; use clap::Parser as _; -use scrapelect::{frontend::Parser, interpreter::Interpreter}; +use scrapelect::{ + frontend::Parser, + interpreter::{Interpreter, Repl}, +}; use url::Url; #[derive(Debug, clap::Parser)] @@ -25,10 +28,16 @@ struct RunArgs { url: Url, } +#[derive(Debug, clap::Args)] +struct ReplArgs { + /// An optional URL to open at the beginning of the REPL. + url: Option, +} + #[derive(Debug, clap::Subcommand)] enum Mode { Run(RunArgs), - Repl, + Repl(ReplArgs), } #[tokio::main] @@ -52,15 +61,19 @@ async fn main() -> anyhow::Result<()> { println!("{}", serde_json::to_string_pretty(&results)?); } + (Some(Mode::Repl(ReplArgs { url: Some(url) })), None) => { + Repl::open(url).await?.repl().await?; + } // TODO: investigate if the (None, None) branch is reachable (I think it isn't) - (Some(Mode::Repl), None) | (None, None) => { - todo!() + (Some(Mode::Repl(ReplArgs { url: None })), None) | (None, None) => { + Repl::new().repl().await?; } (Some(_), Some(_)) => { unreachable!( "This should be impossible to reach with clap's `args_conflicts_with_subcommands`. If you see this error message, please file a GitHub issue with the arguments - you provided to `scrapelect`." 
+ you provided to `scrapelect`:\n{:?}", + std::env::args().collect::>(), ) } } From dd0865a38d0e746996664d2c2bfcb1167620a741 Mon Sep 17 00:00:00 2001 From: m Date: Thu, 12 Sep 2024 20:53:10 -0700 Subject: [PATCH 2/3] doc fixes --- filter-types/src/filter.rs | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/filter-types/src/filter.rs b/filter-types/src/filter.rs index edfeab2..1b40cf9 100644 --- a/filter-types/src/filter.rs +++ b/filter-types/src/filter.rs @@ -7,9 +7,13 @@ use super::{ pub use scrapelect_filter_proc_macro::{filter_fn, Args}; -/// Typed arguments for a [`Filter`]. It can be implemented manually, but you -/// can also use the derive macro [`Args`](scrapelect_filter_proc_macro::Args) -/// if all the fields in the struct implement [`TryFromValue`](TryFromValue). +/// Typed arguments for a [`Filter`]. +/// +/// If all the fields in the struct implement [`TryFromValue: Sized { /// Try to deserialize the typed arguments from the given `args`. /// @@ -59,18 +63,17 @@ pub trait Filter { ) -> Result>; } -#[allow(clippy::doc_lazy_continuation)] /// An object-safe version of [`Filter`]. All `F: Filter` implement this trait, /// so prefer implementing `Filter` unless you must: /// /// 1. Deserialize the input [`PValue`] in a custom way (not using [`TryFromValue`]) /// 2. Use custom arg-deserializing logic (but often you will be able to implement -/// [`Args`] manually instead, and still get the typed guarantees of [`Filter`]) +/// [`Args`] manually instead, and still get the typed guarantees of [`Filter`]) /// 3. Use the `&self` reference. This *can* be used to store state with interior -/// mutability (though note that in `scrapelect`, filters must be `Send + Sync`) to -/// register, but it is often not the best idea to have filter state because filters -/// can be called from anywhere in the program, and you will have to reason out the -/// soundness of having the state. 
+/// mutability (though note that in `scrapelect`, filters must be `Send + Sync`) to +/// register, but it is often not the best idea to have filter state because filters +/// can be called from anywhere in the program, and you will have to reason out the +/// soundness of having the state. pub trait FilterDyn { /// Call this filter with the given `value`, `args`, and `ctx`. /// From 4e265ceb623b1dfd401655ab15d263a074f4676c Mon Sep 17 00:00:00 2001 From: m Date: Mon, 7 Oct 2024 23:33:29 -0700 Subject: [PATCH 3/3] fix --- src/interpreter/repl.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/interpreter/repl.rs b/src/interpreter/repl.rs index d6a61d1..d2bedd9 100644 --- a/src/interpreter/repl.rs +++ b/src/interpreter/repl.rs @@ -169,6 +169,13 @@ pub struct Repl, W = StdoutLock<'static>> { output: W, } +impl Default for Repl { + #[inline] + fn default() -> Self { + Self::new() + } +} + impl Repl { /// Creates a new empty [`Repl`] instance using stdin and stdout. #[inline]