diff --git a/examples/html2text.rs b/examples/html2text.rs index 88bd8fd..8f26668 100644 --- a/examples/html2text.rs +++ b/examples/html2text.rs @@ -126,7 +126,14 @@ where .unwrap(); } } - if literal { + if flags.show_dom { + let conf = config::plain(); + let conf = update_config(conf, &flags); + let dom = conf.parse_html(input).unwrap(); + dom.as_dom_string() + } else if flags.show_render { + todo!() + } else if literal { let conf = config::with_decorator(TrivialDecorator::new()); let conf = update_config(conf, &flags); conf.string_from_read(input, flags.width).unwrap() @@ -148,6 +155,8 @@ struct Flags { ignore_css_colours: bool, #[cfg(feature = "css")] use_only_css: bool, + show_dom: bool, + show_render: bool, } fn main() { @@ -166,6 +175,8 @@ fn main() { ignore_css_colours: false, #[cfg(feature = "css")] use_only_css: false, + show_dom: false, + show_render: false, }; let mut literal: bool = false; @@ -214,6 +225,16 @@ fn main() { StoreTrue, "Don't use default non-CSS colours", ); + ap.refer(&mut flags.show_dom).add_option( + &["--show-dom"], + StoreTrue, + "Show the parsed HTML DOM instead of rendered output", + ); + ap.refer(&mut flags.show_render).add_option( + &["--show-render"], + StoreTrue, + "Show the computed render tree instead of the rendered output", + ); ap.parse_args_or_exit(); } diff --git a/src/css/parser.rs b/src/css/parser.rs index 607c341..0419d6b 100644 --- a/src/css/parser.rs +++ b/src/css/parser.rs @@ -385,7 +385,7 @@ pub(crate) fn parse_color_attribute( text: &str, ) -> Result>> { let (_rest, value) = parse_value(text).map_err(|_| empty_fail())?; - parse_color(&value) + parse_color(&value.tokens) } #[derive(Copy, Clone, PartialEq, Eq, Debug)] @@ -404,11 +404,15 @@ pub fn parse_declaration(text: &str) -> IResult<&str, Option> { ))(text)?; let decl = match prop.0.as_str() { "background-color" => { - let value = parse_color(&value)?; + let value = parse_color(&value.tokens)?; Decl::BackgroundColor { value } } + "background" => match parse_background_color(&value)? { + Some(value) => Decl::BackgroundColor { value }, + _ => Decl::Unknown { name: prop }, + }, "color" => { - let value = parse_color(&value)?; + let value = parse_color(&value.tokens)?; Decl::Color { value } } "height" => { @@ -457,12 +461,12 @@ fn empty_fail() -> nom::Err> { nom::Err::Error(nom::error::Error::new("", ErrorKind::Fail)) } -fn parse_color(value: &RawValue) -> Result>> { +fn parse_color(tokens: &[Token]) -> Result>> { let fail_error = empty_fail(); - if value.tokens.is_empty() { + if tokens.is_empty() { return Err(fail_error); } - match &value.tokens[..] { + match tokens { [Token::Ident(c)] => { let colour = match c.deref() { "aqua" => Colour::Rgb(0, 0xff, 0xff), @@ -492,7 +496,7 @@ fn parse_color(value: &RawValue) -> Result { - let rgb_args = &value.tokens[1..value.tokens.len() - 1]; + let rgb_args = &tokens[1..tokens.len() - 1]; match rgb_args { [Number(r), Comma, Number(g), Comma, Number(b)] => { let r = r.parse().map_err(|_e| empty_fail())?; @@ -527,6 +531,22 @@ fn parse_color(value: &RawValue) -> Result Result, nom::Err>> { + let tokens = if let Some(last) = value.tokens.rsplit(|tok| *tok == Token::Comma).next() { + last + } else { + return Err(empty_fail()); + }; + + match parse_color(tokens) { + Ok(col) => Ok(Some(col)), + Err(_) => Ok(None), + } +} + fn parse_integer(text: &str) -> IResult<&str, f32> { let (rest, digits) = digit1(text)?; Ok((rest, ::from_str(digits).unwrap())) @@ -1000,4 +1020,44 @@ mod test { )) ); } + + #[test] + fn test_background() { + assert_eq!( + super::parse_declaration("background: white"), + Ok(( + "", + Some(Declaration { + data: Decl::BackgroundColor { + value: Colour::Rgb(0xff, 0xff, 0xff) + }, + important: Importance::Default, + }) + )) + ); + assert_eq!( + super::parse_declaration("background: url('blah'), white"), + Ok(( + "", + Some(Declaration { + data: Decl::BackgroundColor { + value: Colour::Rgb(0xff, 0xff, 0xff) + }, + important: Importance::Default, + }) + )) + ); + assert_eq!( + super::parse_declaration("background: url('blah'), foo"), + Ok(( + "", + Some(Declaration { + data: Decl::Unknown { + name: PropertyName("background".into()), + }, + important: Importance::Default, + }) + )) + ); + } } diff --git a/src/markup5ever_rcdom.rs b/src/markup5ever_rcdom.rs index 0714087..56782d4 100644 --- a/src/markup5ever_rcdom.rs +++ b/src/markup5ever_rcdom.rs @@ -216,6 +216,45 @@ pub struct RcDom { pub quirks_mode: Cell, } +impl RcDom { + fn add_node_to_string(s: &mut String, node: &Handle, indent: usize) { + use std::fmt::Write as _; + match &node.data { + NodeData::Document => { + for child in &*node.children.borrow() { + Self::add_node_to_string(s, child, indent); + } + } + NodeData::Doctype { .. } => { + writeln!(s, "{0:indent$}", "", indent = indent).unwrap(); + } + NodeData::Text { contents } => { + let borrowed = contents.borrow(); + let text = borrowed.to_string(); + if !text.trim().is_empty() { + writeln!(s, "{0:indent$}Text:{1}", "", text, indent = indent).unwrap(); + } + } + NodeData::Comment { .. } => (), + NodeData::Element { name, .. } => { + writeln!(s, "{0:indent$}<{1}>", "", name.local, indent = indent).unwrap(); + for child in &*node.children.borrow() { + Self::add_node_to_string(s, child, indent + 1); + } + writeln!(s, "{0:indent$}", "", name.local, indent = indent).unwrap(); + } + NodeData::ProcessingInstruction { .. } => {} + } + } + + /// A low-quality debug DOM rendering. + pub fn as_dom_string(&self) -> String { + let mut s = String::new(); + Self::add_node_to_string(&mut s, &self.document, 0); + s + } +} + impl TreeSink for RcDom { type Output = Self; diff --git a/src/render/text_renderer.rs b/src/render/text_renderer.rs index 0644912..b59c4a7 100644 --- a/src/render/text_renderer.rs +++ b/src/render/text_renderer.rs @@ -352,7 +352,7 @@ impl WrappedBlock { } } - fn flush_word(&mut self, ws_mode: WhiteSpace, main_tag: &T, wrap_tag: &T) -> Result<(), Error> { + fn flush_word(&mut self, ws_mode: WhiteSpace) -> Result<(), Error> { use self::TaggedLineElement::Str; /* Finish the word. */ @@ -362,18 +362,19 @@ impl WrappedBlock { self.line.len ); - let mut tag = if self.pre_wrapped { wrap_tag } else { main_tag }; if !self.word.is_empty() { self.pre_wrapped = false; let space_in_line = self.width - self.line.len; let space_needed = self.wslen + self.wordlen; if space_needed <= space_in_line { html_trace!("Got enough space"); - self.line.push(Str(TaggedString { - s: " ".repeat(self.wslen), - tag: tag.clone(), - })); - self.wslen = 0; + if self.wslen > 0 { + self.line.push(Str(TaggedString { + s: " ".repeat(self.wslen), + tag: self.spacetag.take().unwrap(), + })); + self.wslen = 0; + } self.line.consume(&mut self.word); html_trace!("linelen increased by wordlen to {}", self.line.len); @@ -386,12 +387,14 @@ impl WrappedBlock { if self.wslen >= space_in_line { // Skip the whitespace self.wslen -= space_in_line; - } else { - self.line.push_ws(self.wslen, tag); + } else if self.wslen > 0 { + self.line + .push_ws(self.wslen, &self.spacetag.take().unwrap()); self.wslen = 0; } } else { // We're word-wrapping, so discard any whitespace. + self.spacetag = None; self.wslen = 0; } /* Start a new line */ @@ -399,18 +402,18 @@ impl WrappedBlock { if ws_mode == WhiteSpace::Pre { self.pre_wrapped = true; - tag = wrap_tag; } // Write any remaining whitespace while self.wslen > 0 { let to_copy = self.wslen.min(self.width); - self.line.push_ws(to_copy, tag); + self.line.push_ws(to_copy, self.spacetag.as_ref().unwrap()); if to_copy == self.width { self.flush_line(); } self.wslen -= to_copy; } + self.spacetag = None; // At this point, either: // We're word-wrapping, and at the start of the line or @@ -506,9 +509,7 @@ impl WrappedBlock { } fn flush(&mut self) -> Result<(), Error> { - let tag = self.spacetag.clone().unwrap_or_default(); - - self.flush_word(WhiteSpace::Normal, &tag, &tag)?; + self.flush_word(WhiteSpace::Normal)?; self.flush_line(); Ok(()) } @@ -548,6 +549,7 @@ impl WrappedBlock { // We walk character by character. // 1. First, build up whitespace columns in self.wslen // - In normal mode self.wslen will always be 0 or 1 + // - If wslen > 0, then self.spacetag will always be set. // 2. Next build up a word (non-whitespace). // 2a. If the word gets too long for the line // 2b. If we get to more whitespace, output the first whitespace and the word @@ -563,7 +565,7 @@ impl WrappedBlock { self.line ); if c.is_whitespace() && self.wordlen > 0 { - self.flush_word(ws_mode, main_tag, wrap_tag)?; + self.flush_word(ws_mode)?; } if c.is_whitespace() { @@ -575,6 +577,7 @@ impl WrappedBlock { // the line. self.force_flush_line(); self.wslen = 0; + self.spacetag = None; self.pre_wrapped = false; // Hard new line, so back to main tag. tag = main_tag; @@ -607,6 +610,7 @@ impl WrappedBlock { } else { // Manual wrapping, keep the space. self.wslen += cwidth; + self.spacetag = Some(tag.clone()); self.pre_wrapped = true; } } else { @@ -619,7 +623,7 @@ impl WrappedBlock { } else { // If not preserving whitespace, everything is collapsed, // and the line won't start with whitespace. - if self.line.len > 0 { + if self.line.len > 0 && self.wslen == 0 { self.spacetag = Some(tag.clone()); self.wslen = 1; } diff --git a/src/tests.rs b/src/tests.rs index e3c4744..6513a09 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -2229,6 +2229,23 @@ There ); } + #[test] + fn test_bgcoloured3() { + test_html_coloured( + br##" + +

Test Two words bg

+ "##, + r#"Test Two words bg +"#, + 20, + ); + } + #[test] fn test_coloured_element() { test_html_coloured(