Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add repl #36

Merged
merged 3 commits into from
Oct 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 79 additions & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ serde = { version = "1.0", features = ["derive", "rc"] }
futures = "0.3"
url = "2.5"
clap = { version = "4.5.16", features = ["derive"] }
ouroboros = "0.18.4"

[workspace]
members = [".", "filter-proc-macro", "filter-types"]
Expand Down
68 changes: 46 additions & 22 deletions filter-types/src/context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,24 +18,56 @@
///
/// Overwrites a previous value bound to that name, if one is present.
///
/// Implementors should not override this; implement [`set_inner`](Self::set_inner) instead.
///
/// # Errors
///
/// Returns an `Err` if `name` cannot be rebound (e.g., it is `"element"`).
fn set(&mut self, name: Cow<'ast, str>, value: EValue<'ctx>) -> Result<()>;
fn set(&mut self, name: Cow<'ast, str>, value: EValue<'ctx>) -> Result<()> {
    // Guard clause: special bindings are read-only, so reject them before
    // anything reaches the implementor-provided storage.
    if let immutable @ "element" = &*name {
        bail!("assignment to immutable binding `{immutable}`");
    }

    // Every other name is an ordinary binding and is handled by `set_inner`.
    self.set_inner(name, value)
}

/// Sets the binding with name `name` to `value` in this context.
///
/// Overwrites a previous value bound to that name, if one is present.
///
/// Implementors should implement this and not `set`.
fn set_inner(&mut self, name: Cow<'ast, str>, value: EValue<'ctx>) -> Result<()>;

/// Gets the binding with name `id`, if it is present. Handles
/// retrieving special bindings like `element`. Looks in this
/// context and all parent contexts, starting innermost first.
///
/// Implementors should not override this; implement [`get_inner`](Self::get_inner) instead.
///
/// # Errors
///
/// Returns an `Err` if a binding with name `id` is not found in this
/// scope or any parent scopes.
fn get(&self, id: &str) -> Result<EValue<'ctx>>;
fn get(&self, id: &str) -> Result<EValue<'ctx>> {
    // `element` is a special binding answered directly from the context
    // itself rather than from stored bindings.
    if id == "element" {
        return Ok(self.element().into());
    }

    // Anything else is an ordinary binding resolved by `get_inner`.
    self.get_inner(id)
}

/// For implementors of [`ElementContext`]. Only needs to get/set safe
/// (non-special) bindings, looking up in parent scopes as necessary.
///
/// # Errors
///
/// Returns an `Err` if a binding with name `id` is not found in this
/// scope or any parent scopes.
fn get_inner(&self, id: &str) -> Result<EValue<'ctx>>;

/// Returns a [reference](ElementRef) to the root element of this block.
#[must_use]
fn element(&self) -> ElementRef<'_>;
fn element(&self) -> ElementRef<'ctx>;

/// Returns a reference to the URL of the document that this element is in.
#[must_use]
Expand Down Expand Up @@ -167,27 +199,19 @@
}

impl<'ast, 'ctx> ElementContextView<'ast, 'ctx> for Linked<'ast, 'ctx> {
fn get(&self, id: &str) -> Result<EValue<'ctx>> {
match id {
"element" => Ok(self.element.into()),
_ => match self.bindings.0.get(id) {
Some(id) => Ok(id.clone()),
None => self
.parent
.with_msg(|| format!("unknown binding `{id}`"))?
.get(id),
},
fn get_inner(&self, id: &str) -> Result<EValue<'ctx>> {
    // A binding stored in this scope wins; hand back a clone of its value.
    if let Some(value) = self.bindings.0.get(id) {
        return Ok(value.clone());
    }

    // Not bound here: continue the lookup through `parent`, with the
    // "unknown binding" message attached via `with_msg` for the failure case.
    self.parent
        .with_msg(|| format!("unknown binding `{id}`"))?
        .get(id)
}

fn set(&mut self, name: Cow<'ast, str>, value: EValue<'ctx>) -> Result<()> {
match &*name {
immutable @ "element" => {
bail!("assignment to immutable binding `{immutable}`")
}
_ => self.bindings.0.insert(name, value),
};

#[inline]
fn set_inner(&mut self, name: Cow<'ast, str>, value: EValue<'ctx>) -> Result<()> {
self.bindings.0.insert(name, value);
Ok(())
}

Expand All @@ -197,10 +221,10 @@
}

#[inline]
fn element(&self) -> ElementRef<'_> {
fn element(&self) -> ElementRef<'ctx> {
self.element
}
}

Check warning on line 227 in filter-types/src/context.rs

View workflow job for this annotation

GitHub Actions / verify formatting and lints

Diff in /home/runner/work/scrapelect/scrapelect/filter-types/src/context.rs

impl<'ast, 'ctx> ElementContext<'ast, 'ctx> for Linked<'ast, 'ctx> {
type Nested<'inner> = Linked<'ast, 'inner> where Self: 'inner;
Expand Down
21 changes: 12 additions & 9 deletions filter-types/src/filter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,13 @@ use super::{

pub use scrapelect_filter_proc_macro::{filter_fn, Args};

/// Typed arguments for a [`Filter`]. It can be implemented manually, but you
/// can also use the derive macro [`Args`](scrapelect_filter_proc_macro::Args)
/// if all the fields in the struct implement [`TryFromValue<Element>`](TryFromValue).
/// Typed arguments for a [`Filter`].
///
/// If all the fields in the struct implement [`TryFromValue<Element>`](TryFromValue), it is easier
/// to use the derive macro [`Args`], but this can also be implemented manually if
/// you need to express more custom deserializing logic.
///
/// [`Args`]: scrapelect_filter_proc_macro::Args
pub trait Args<'doc>: Sized {
/// Try to deserialize the typed arguments from the given `args`.
///
Expand Down Expand Up @@ -59,18 +63,17 @@ pub trait Filter {
) -> Result<PValue<'doc>>;
}

#[allow(clippy::doc_lazy_continuation)]
/// An object-safe version of [`Filter`]. All `F: Filter` implement this trait,
/// so prefer implementing `Filter` unless you must:
///
/// 1. Deserialize the input [`PValue`] in a custom way (not using [`TryFromValue`])
/// 2. Use custom arg-deserializing logic (but often you will be able to implement
/// [`Args`] manually instead, and still get the typed guarantees of [`Filter`])
/// [`Args`] manually instead, and still get the typed guarantees of [`Filter`])
/// 3. Use the `&self` reference. This *can* be used to store state with interior
/// mutability (though note that in `scrapelect`, filters must be `Send + Sync`) to
/// register, but it is often not the best idea to have filter state because filters
/// can be called from anywhere in the program, and you will have to reason out the
/// soundness of having the state.
/// mutability (though note that in `scrapelect`, filters must be `Send + Sync`) to
/// register, but it is often not the best idea to have filter state because filters
/// can be called from anywhere in the program, and you will have to reason out the
/// soundness of having the state.
pub trait FilterDyn {
/// Call this filter with the given `value`, `args`, and `ctx`.
///
Expand Down
10 changes: 9 additions & 1 deletion filter-types/src/value.rs
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,15 @@ pub enum Element<'a> {
impl fmt::Display for Element<'_> {
    /// Writes the element as its opening tag only: the tag name followed by
    /// each attribute rendered as `name="value"`, e.g. `<a href="/x" class="y">`.
    ///
    /// Attribute values are written verbatim; no escaping is applied.
    ///
    /// # Errors
    ///
    /// Propagates any error reported by the underlying formatter.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::Element(elem) => {
                write!(f, "<{}", elem.value().name())?;

                // Each attribute carries its own leading space; without it the
                // first attribute runs into the tag name (`<aclass="x">`) and
                // later attributes run into each other.
                for (name, value) in elem.value().attrs() {
                    write!(f, r#" {name}="{value}""#)?;
                }

                f.write_str(">")
            }
        }
    }
}
Expand Down
16 changes: 16 additions & 0 deletions src/frontend/ast.rs
Original file line number Diff line number Diff line change
Expand Up @@ -221,4 +221,20 @@ mod selector_display {
Ok(())
}
}

impl Selector<'_> {
/// Converts this selector into a [`scraper::Selector`] by rendering it with
/// `to_string` and parsing the rendered text with [`scraper::Selector::parse`].
///
/// # Panics
///
/// Panics (through `unreachable!`) if `scraper` fails to parse the rendered
/// selector string; the panic message asks for this to be reported as a bug
/// in `scrapelect`.
pub fn to_scraper(&self) -> scraper::Selector {
// Keep the rendered string in a named local so it can be borrowed by the
// parse call below and quoted in the panic message.
let selector_str = self.to_string();
// the borrow checker does not like having this inline.
let result = scraper::Selector::parse(&selector_str);
match result {
Ok(s) => s,
// NOTE: the message literal spans several source lines, so its line
// breaks are part of the emitted panic text.
Err(e) => unreachable!(
"failed to parse selector `{selector_str}`.
This is a bug in `scrapelect`, please report it.
`selectors` error: {e}"
),
}
}
}
}
6 changes: 4 additions & 2 deletions src/frontend/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ impl<'a> Parser<'a> {
Ok(Inline { value, filters })
}

fn parse_value(&mut self) -> Result<Inline<'a>> {
pub(crate) fn parse_value(&mut self) -> Result<Inline<'a>> {
let (span, lx) = self.scanner.peek_non_whitespace();
match lx.token {
Token::Less => self.parse_inline(),
Expand Down Expand Up @@ -230,7 +230,9 @@ impl<'a> Parser<'a> {
let (span, lx) = item;

match lx.token {
Token::BraceOpen | Token::ParenOpen => Ok(None),
// Eof is only allowed in a repl context.
// TODO: investigate if this makes it possible to parse an invalid program anyway.
Token::BraceOpen | Token::ParenOpen | Token::Eof => Ok(None),
// invariant: peek_next_whitespace is one of Id | Hash | Dot | Star
// whitespace is eaten in the above block.
Token::Whitespace => Ok(Some(SelectorCombinator::Descendent)),
Expand Down
Loading
Loading