diff --git a/.circleci/config.yml b/.circleci/config.yml index b775004..163850f 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -12,6 +12,14 @@ jobs: - run: cargo --version - run: cargo build - run: cargo test + build-css: + docker: + - image: cimg/rust:1.73 + steps: + - checkout + - run: cargo --version + - run: cargo build --features=css + - run: cargo test --features=css build-1-60: docker: - image: cimg/rust:1.60 @@ -54,5 +62,6 @@ workflows: build: jobs: - "build-stable" + - "build-css" - "build-1-60" - "build-windows" diff --git a/CHANGELOG.md b/CHANGELOG.md index 386ecc1..004d71d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,12 @@ Possible log types: - `[fixed]` for any bug fixes. - `[security]` to invite users to upgrade in case of vulnerabilities. +### 0.7.1 + +- [added] Now recognises CSS `display:none`. +- [added] Can now add extra CSS rules via `Config::add_css`. +- [changed] `StyleData::colours` is no longer public. + ### 0.7.0 - [changed] Remove some noisy stderr output when encoutering control chars diff --git a/Cargo.toml b/Cargo.toml index e07d891..4fed751 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "html2text" -version = "0.7.0" +version = "0.7.1" authors = ["Chris Emerson "] description = "Render HTML as plain text." 
repository = "https://github.com/jugglerchris/rust-html2text/" diff --git a/src/css.rs b/src/css.rs index 7f7049d..7f9eb4f 100644 --- a/src/css.rs +++ b/src/css.rs @@ -12,7 +12,8 @@ use crate::{Result, TreeMapResult, markup5ever_rcdom::{Handle, NodeData::{Commen #[derive(Clone, Default, Debug)] pub struct StyleData { /// Map from classes to colours - pub colours: HashMap, + pub(crate) colours: HashMap, + pub(crate) display: HashMap, } impl StyleData { @@ -39,6 +40,19 @@ impl StyleData { } } } + Property::Display(disp) => { + for selector in &style.selectors.0 { + for item in selector.iter() { + use lightningcss::selector::Component; + match item { + Component::Class(c) => { + self.display.insert(c.0.to_string(), disp.clone()); + } + _ => { } + } + } + } + } _ => (), } } @@ -47,6 +61,13 @@ impl StyleData { } } } + + /// Merge style data from other into this one. + /// Data on other takes precedence. + pub fn merge(&mut self, other: Self) { + self.colours.extend(other.colours); + self.display.extend(other.display); + } } fn pending<'a, F>(handle: Handle, f: F) -> TreeMapResult<'a, (), Handle, Vec> diff --git a/src/lib.rs b/src/lib.rs index 3eeb901..d643ee4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -935,13 +935,17 @@ struct HtmlContext { style_data: css::StyleData, } -/// Convert a DOM tree or subtree into a render tree. 
-pub fn dom_to_render_tree(handle: Handle, err_out: &mut T) -> Result> { +fn dom_to_render_tree_with_context( + handle: Handle, + err_out: &mut T, + mut context: HtmlContext) +-> Result> { html_trace!("### dom_to_render_tree: HTML: {:?}", handle); - let mut context = HtmlContext::default(); #[cfg(feature = "css")] { - context.style_data = css::dom_to_stylesheet(handle.clone(), err_out)?; + let mut doc_style_data = css::dom_to_stylesheet(handle.clone(), err_out)?; + doc_style_data.merge(context.style_data); + context.style_data = doc_style_data; } let result = tree_map_reduce(&mut context, handle, |context, handle| { @@ -952,6 +956,11 @@ pub fn dom_to_render_tree(handle: Handle, err_out: &mut T) -> Result(handle: Handle, err_out: &mut T) -> Result> { + dom_to_render_tree_with_context(handle, err_out, Default::default()) +} + fn pending<'a, F>(handle: Handle, f: F) -> TreeMapResult<'a, HtmlContext, Handle, RenderNode> where for<'r> F: Fn(&'r mut HtmlContext, std::vec::Vec) -> Result> + 'static, @@ -1043,6 +1052,33 @@ fn process_dom_node<'a, 'b, 'c, T: Write>( .. 
} => { let mut frag_from_name_attr = false; + + #[cfg(feature = "css")] + let classes = { + let mut classes = Vec::new(); + let borrowed = attrs.borrow(); + for attr in borrowed.iter() { + if &attr.name.local == "class" { + for class in attr.value.split_whitespace() { + classes.push(class.to_string()); + } + } + } + classes + }; + #[cfg(feature = "css")] + for class in &classes { + if let Some(disp) = context.style_data.display.get(class) { + use lightningcss::properties::display; + match disp { + display::Display::Keyword(display::DisplayKeyword::None) => { + // Hide display: none + return Ok(Nothing); + } + _ => {} + } + } + } let result = match name.expanded() { expanded_name!(html "html") | expanded_name!(html "body") => { @@ -1066,15 +1102,6 @@ fn process_dom_node<'a, 'b, 'c, T: Write>( } #[cfg(feature = "css")] { - let mut classes = Vec::new(); - let borrowed = attrs.borrow(); - for attr in borrowed.iter() { - if &attr.name.local == "class" { - for class in attr.value.split_whitespace() { - classes.push(class.to_string()); - } - } - } let mut colour = None; for class in classes { if let Some(c) = context.style_data.colours.get(&class) { @@ -1693,51 +1720,110 @@ pub mod config { //! constructed using one of the functions in this module. use crate::{render::text_renderer::{ - PlainDecorator, RichDecorator, TaggedLine, TextDecorator - }, Result}; - use super::parse; + PlainDecorator, RichDecorator, TaggedLine, TextDecorator, RichAnnotation + }, Result, RenderTree, HtmlContext}; + #[cfg(feature = "css")] + use crate::css::StyleData; /// Configure the HTML processing. pub struct Config { decorator: D, + + #[cfg(feature = "css")] + style: StyleData, } impl Config { + /// Parse with context. + fn do_parse(&mut self, input: R) -> Result { + super::parse_with_context( + input, + HtmlContext { + #[cfg(feature = "css")] + style_data: std::mem::take(&mut self.style), + }) + } + /// Reads HTML from `input`, and returns a `String` with text wrapped to /// `width` columns. 
- pub fn string_from_read(self, input: R, width: usize) -> Result { - Ok(parse(input)?.render(width, self.decorator)?.into_string()?) + pub fn string_from_read(mut self, input: R, width: usize) -> Result { + Ok(self.do_parse(input)?.render(width, self.decorator)?.into_string()?) } - /// Reads HTML from `input`, and returns text wrapped to `width` columns. /// The text is returned as a `Vec>`; the annotations are vectors /// of the provided text decorator's `Annotation`. The "outer" annotation comes first in /// the `Vec`. - pub fn lines_from_read(self, input: R, width: usize) -> Result>>> { - Ok(parse(input)? + pub fn lines_from_read(mut self, input: R, width: usize) -> Result>>> { + Ok(self.do_parse(input)? .render(width, self.decorator)? .into_lines()?) } + + #[cfg(feature = "css")] + /// Add some CSS rules which will be used (if supported) with any + /// HTML processed. + pub fn add_css(mut self, css: &str) -> Self { + self.style.add_css(css); + self + } + } + + impl Config { + /// Return coloured text. `colour_map` is a function which takes + /// a list of `RichAnnotation` and some text, and returns the text + /// with any terminal escapes desired to indicate those annotations + /// (such as colour). + pub fn coloured( + mut self, + input: R, + width: usize, + colour_map: FMap, + ) -> Result + where + R: std::io::Read, + FMap: Fn(&[RichAnnotation], &str) -> String, + { + use std::fmt::Write; + + let lines = self.do_parse(input)? + .render(width, self.decorator)? + .into_lines()?; + + let mut result = String::new(); + for line in lines { + for ts in line.tagged_strings() { + write!(result, "{}", colour_map(&ts.tag, &ts.s))?; + } + result.push('\n'); + } + Ok(result) + } } /// Return a Config initialized with a `RichDecorator`. pub fn rich() -> Config { Config { - decorator: RichDecorator::new() + decorator: RichDecorator::new(), + #[cfg(feature = "css")] + style: Default::default() } } /// Return a Config initialized with a `PlainDecorator`. 
pub fn plain() -> Config { Config { - decorator: PlainDecorator::new() + decorator: PlainDecorator::new(), + #[cfg(feature = "css")] + style: Default::default() } } /// Return a Config initialized with a custom decorator. pub fn with_decorator(decorator: D) -> Config { Config { - decorator + decorator, + #[cfg(feature = "css")] + style: Default::default() } } } @@ -1797,8 +1883,9 @@ impl RenderedText { } } -/// Reads and parses HTML from `input` and prepares a render tree. -pub fn parse(mut input: impl io::Read) -> Result { +fn parse_with_context(mut input: impl io::Read, + context: HtmlContext, + ) -> Result { let opts = ParseOpts { tree_builder: TreeBuilderOpts { drop_doctype: true, @@ -1810,11 +1897,16 @@ pub fn parse(mut input: impl io::Read) -> Result { .from_utf8() .read_from(&mut input) .unwrap(); - let render_tree = dom_to_render_tree(dom.document.clone(), &mut Discard {})? + let render_tree = dom_to_render_tree_with_context(dom.document.clone(), &mut Discard {}, context)? .ok_or(Error::Fail)?; Ok(RenderTree(render_tree)) } +/// Reads and parses HTML from `input` and prepares a render tree. +pub fn parse(input: impl io::Read) -> Result { + parse_with_context(input, Default::default()) +} + /// Reads HTML from `input`, decorates it using `decorator`, and /// returns a `String` with text wrapped to `width` columns. 
pub fn from_read_with_decorator(input: R, width: usize, decorator: D) -> String diff --git a/src/tests.rs b/src/tests.rs index fcca2e7..df1b491 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -28,6 +28,14 @@ fn test_html_err(input: &[u8], expected: Error, width: usize) { } } +#[cfg(feature = "css")] +fn test_html_style(input: &[u8], style: &str, expected: &str, width: usize) { + let result = config::plain() + .add_css(style) + .string_from_read(input, width).unwrap(); + assert_eq_str!(result, expected); +} + fn test_html_decorator(input: &[u8], expected: &str, width: usize, decorator: D) where D: TextDecorator, @@ -1605,3 +1613,30 @@ fn test_issue_93_x() { let d1 = TrivialDecorator::new(); let _local1 = crate::RenderTree::render(_local0, 1, d1); } + +#[cfg(feature = "css")] +#[test] +fn test_disp_none() { + test_html(br#" + +

Hello

+

Ignore

+

There

"#, + r#"Hello + +There +"#, 20); + + // Same as above, but style supplied separately. + test_html_style(br#" +

Hello

+

Ignore

+

There

"#, + " .hide { display: none; }", + r#"Hello + +There +"#, 20); +}