Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add display:none support and add_css(). #97

Merged
merged 4 commits into from
Dec 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,14 @@ jobs:
- run: cargo --version
- run: cargo build
- run: cargo test
build-css:
docker:
- image: cimg/rust:1.73
steps:
- checkout
- run: cargo --version
- run: cargo build --features=css
- run: cargo test --features=css
build-1-60:
docker:
- image: cimg/rust:1.60
Expand Down Expand Up @@ -54,5 +62,6 @@ workflows:
build:
jobs:
- "build-stable"
- "build-css"
- "build-1-60"
- "build-windows"
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,12 @@ Possible log types:
- `[fixed]` for any bug fixes.
- `[security]` to invite users to upgrade in case of vulnerabilities.

### 0.7.1

- [added] Now recognises CSS `display:none`.
- [added] Can now add extra CSS rules via `Config::add_css`.
- [changed] `StyleData::colours` is no longer public.

### 0.7.0

- [changed] Remove some noisy stderr output when encountering control chars
Expand Down
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "html2text"
version = "0.7.0"
version = "0.7.1"
authors = ["Chris Emerson <[email protected]>"]
description = "Render HTML as plain text."
repository = "https://github.com/jugglerchris/rust-html2text/"
Expand Down
23 changes: 22 additions & 1 deletion src/css.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@ use crate::{Result, TreeMapResult, markup5ever_rcdom::{Handle, NodeData::{Commen
#[derive(Clone, Default, Debug)]
pub struct StyleData {
/// Map from classes to colours
pub colours: HashMap<String, CssColor>,
pub(crate) colours: HashMap<String, CssColor>,
pub(crate) display: HashMap<String, lightningcss::properties::display::Display>,
}

impl StyleData {
Expand All @@ -39,6 +40,19 @@ impl StyleData {
}
}
}
Property::Display(disp) => {
for selector in &style.selectors.0 {
for item in selector.iter() {
use lightningcss::selector::Component;
match item {
Component::Class(c) => {
self.display.insert(c.0.to_string(), disp.clone());
}
_ => { }
}
}
}
}
_ => (),
}
}
Expand All @@ -47,6 +61,13 @@ impl StyleData {
}
}
}

/// Merge style data from other into this one.
/// Data on other takes precedence.
pub fn merge(&mut self, other: Self) {
self.colours.extend(other.colours);
self.display.extend(other.display);
}
}

fn pending<'a, F>(handle: Handle, f: F) -> TreeMapResult<'a, (), Handle, Vec<String>>
Expand Down
146 changes: 119 additions & 27 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -935,13 +935,17 @@ struct HtmlContext {
style_data: css::StyleData,
}

/// Convert a DOM tree or subtree into a render tree.
pub fn dom_to_render_tree<T: Write>(handle: Handle, err_out: &mut T) -> Result<Option<RenderNode>> {
fn dom_to_render_tree_with_context<T: Write>(
handle: Handle,
err_out: &mut T,
mut context: HtmlContext)
-> Result<Option<RenderNode>> {
html_trace!("### dom_to_render_tree: HTML: {:?}", handle);
let mut context = HtmlContext::default();
#[cfg(feature = "css")]
{
context.style_data = css::dom_to_stylesheet(handle.clone(), err_out)?;
let mut doc_style_data = css::dom_to_stylesheet(handle.clone(), err_out)?;
doc_style_data.merge(context.style_data);
context.style_data = doc_style_data;
}

let result = tree_map_reduce(&mut context, handle, |context, handle| {
Expand All @@ -952,6 +956,11 @@ pub fn dom_to_render_tree<T: Write>(handle: Handle, err_out: &mut T) -> Result<O
result
}

/// Convert a DOM tree or subtree into a render tree.
pub fn dom_to_render_tree<T: Write>(handle: Handle, err_out: &mut T) -> Result<Option<RenderNode>> {
dom_to_render_tree_with_context(handle, err_out, Default::default())
}

fn pending<'a, F>(handle: Handle, f: F) -> TreeMapResult<'a, HtmlContext, Handle, RenderNode>
where
for<'r> F: Fn(&'r mut HtmlContext, std::vec::Vec<RenderNode>) -> Result<Option<RenderNode>> + 'static,
Expand Down Expand Up @@ -1043,6 +1052,33 @@ fn process_dom_node<'a, 'b, 'c, T: Write>(
..
} => {
let mut frag_from_name_attr = false;

#[cfg(feature = "css")]
let classes = {
let mut classes = Vec::new();
let borrowed = attrs.borrow();
for attr in borrowed.iter() {
if &attr.name.local == "class" {
for class in attr.value.split_whitespace() {
classes.push(class.to_string());
}
}
}
classes
};
#[cfg(feature = "css")]
for class in &classes {
if let Some(disp) = context.style_data.display.get(class) {
use lightningcss::properties::display;
match disp {
display::Display::Keyword(display::DisplayKeyword::None) => {
// Hide display: none
return Ok(Nothing);
}
_ => {}
}
}
}
let result = match name.expanded() {
expanded_name!(html "html")
| expanded_name!(html "body") => {
Expand All @@ -1066,15 +1102,6 @@ fn process_dom_node<'a, 'b, 'c, T: Write>(
}
#[cfg(feature = "css")]
{
let mut classes = Vec::new();
let borrowed = attrs.borrow();
for attr in borrowed.iter() {
if &attr.name.local == "class" {
for class in attr.value.split_whitespace() {
classes.push(class.to_string());
}
}
}
let mut colour = None;
for class in classes {
if let Some(c) = context.style_data.colours.get(&class) {
Expand Down Expand Up @@ -1693,51 +1720,110 @@ pub mod config {
//! constructed using one of the functions in this module.

use crate::{render::text_renderer::{
PlainDecorator, RichDecorator, TaggedLine, TextDecorator
}, Result};
use super::parse;
PlainDecorator, RichDecorator, TaggedLine, TextDecorator, RichAnnotation
}, Result, RenderTree, HtmlContext};
#[cfg(feature = "css")]
use crate::css::StyleData;

/// Configure the HTML processing.
///
/// Instances are obtained from the constructor functions in this module.
pub struct Config<D: TextDecorator> {
/// Decorator handed to `render` when the output is produced.
decorator: D,

/// Style data extended by `add_css`; moved into the `HtmlContext` when parsing.
#[cfg(feature = "css")]
style: StyleData,
}

impl<D: TextDecorator> Config<D> {
/// Parse `input` into a render tree using this config's context.
///
/// With the `css` feature enabled, the accumulated style data is moved out
/// of `self` (leaving a default value behind) and into the `HtmlContext`
/// passed to the parser.
fn do_parse<R: std::io::Read>(&mut self, input: R) -> Result<RenderTree> {
    let context = HtmlContext {
        #[cfg(feature = "css")]
        style_data: std::mem::take(&mut self.style),
    };
    super::parse_with_context(input, context)
}

/// Reads HTML from `input`, and returns a `String` with text wrapped to
/// `width` columns.
pub fn string_from_read<R: std::io::Read>(self, input: R, width: usize) -> Result<String> {
Ok(parse(input)?.render(width, self.decorator)?.into_string()?)
pub fn string_from_read<R: std::io::Read>(mut self, input: R, width: usize) -> Result<String> {
Ok(self.do_parse(input)?.render(width, self.decorator)?.into_string()?)
}

/// Reads HTML from `input`, and returns text wrapped to `width` columns.
/// The text is returned as a `Vec<TaggedLine<_>>`; the annotations are vectors
/// of the provided text decorator's `Annotation`. The "outer" annotation comes first in
/// the `Vec`.
pub fn lines_from_read<R: std::io::Read>(self, input: R, width: usize) -> Result<Vec<TaggedLine<Vec<D::Annotation>>>> {
Ok(parse(input)?
pub fn lines_from_read<R: std::io::Read>(mut self, input: R, width: usize) -> Result<Vec<TaggedLine<Vec<D::Annotation>>>> {
Ok(self.do_parse(input)?
.render(width, self.decorator)?
.into_lines()?)
}

/// Register extra CSS rules with this configuration.
///
/// The rules are added to the config's style data and will be used (if
/// supported) with any HTML processed. The config is returned so that
/// calls can be chained.
#[cfg(feature = "css")]
pub fn add_css(mut self, css: &str) -> Self {
    self.style.add_css(css);
    self
}
}

impl Config<RichDecorator> {
    /// Render `input` as text with caller-supplied colouring.
    ///
    /// The HTML is parsed and rendered to `width` columns. Each tagged
    /// string of each line is passed to `colour_map` together with its list
    /// of `RichAnnotation`s; the strings it returns (for example with
    /// terminal escapes added) are concatenated, and a newline is appended
    /// after every line.
    pub fn coloured<R, FMap>(
        mut self,
        input: R,
        width: usize,
        colour_map: FMap,
    ) -> Result<String>
    where
        R: std::io::Read,
        FMap: Fn(&[RichAnnotation], &str) -> String,
    {
        let tree = self.do_parse(input)?;
        let rendered = tree.render(width, self.decorator)?;

        let mut output = String::new();
        for line in rendered.into_lines()? {
            for piece in line.tagged_strings() {
                // Appending an already-built `String` is infallible.
                output.push_str(&colour_map(&piece.tag, &piece.s));
            }
            output.push('\n');
        }
        Ok(output)
    }
}

/// Return a Config initialized with a `RichDecorator`.
pub fn rich() -> Config<RichDecorator> {
Config {
decorator: RichDecorator::new()
decorator: RichDecorator::new(),
#[cfg(feature = "css")]
style: Default::default()
}
}

/// Return a Config initialized with a `PlainDecorator`.
pub fn plain() -> Config<PlainDecorator> {
Config {
decorator: PlainDecorator::new()
decorator: PlainDecorator::new(),
#[cfg(feature = "css")]
style: Default::default()
}
}

/// Return a Config initialized with a custom decorator.
pub fn with_decorator<D: TextDecorator>(decorator: D) -> Config<D> {
Config {
decorator
decorator,
#[cfg(feature = "css")]
style: Default::default()
}
}
}
Expand Down Expand Up @@ -1797,8 +1883,9 @@ impl<D: TextDecorator> RenderedText<D> {
}
}

/// Reads and parses HTML from `input` and prepares a render tree.
pub fn parse(mut input: impl io::Read) -> Result<RenderTree> {
fn parse_with_context(mut input: impl io::Read,
context: HtmlContext,
) -> Result<RenderTree> {
let opts = ParseOpts {
tree_builder: TreeBuilderOpts {
drop_doctype: true,
Expand All @@ -1810,11 +1897,16 @@ pub fn parse(mut input: impl io::Read) -> Result<RenderTree> {
.from_utf8()
.read_from(&mut input)
.unwrap();
let render_tree = dom_to_render_tree(dom.document.clone(), &mut Discard {})?
let render_tree = dom_to_render_tree_with_context(dom.document.clone(), &mut Discard {}, context)?
.ok_or(Error::Fail)?;
Ok(RenderTree(render_tree))
}

/// Reads and parses HTML from `input` and prepares a render tree.
pub fn parse(input: impl io::Read) -> Result<RenderTree> {
parse_with_context(input, Default::default())
}

/// Reads HTML from `input`, decorates it using `decorator`, and
/// returns a `String` with text wrapped to `width` columns.
pub fn from_read_with_decorator<R, D>(input: R, width: usize, decorator: D) -> String
Expand Down
35 changes: 35 additions & 0 deletions src/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,14 @@ fn test_html_err(input: &[u8], expected: Error, width: usize) {
}
}

/// Render `input` with a plain config that has `style` added as extra CSS,
/// and check the output against `expected`.
#[cfg(feature = "css")]
fn test_html_style(input: &[u8], style: &str, expected: &str, width: usize) {
    let styled_config = config::plain().add_css(style);
    let result = styled_config.string_from_read(input, width).unwrap();
    assert_eq_str!(result, expected);
}

fn test_html_decorator<D>(input: &[u8], expected: &str, width: usize, decorator: D)
where
D: TextDecorator,
Expand Down Expand Up @@ -1605,3 +1613,30 @@ fn test_issue_93_x() {
let d1 = TrivialDecorator::new();
let _local1 = crate::RenderTree::render(_local0, 1, d1);
}

// An element whose class is given `display: none` must be left out of the
// rendered text, whether the rule comes from the document or from the config.
#[cfg(feature = "css")]
#[test]
fn test_disp_none() {
// Style supplied by a <style> element inside the document itself.
test_html(br#"
<style>
.hide { display: none; }
</style>
<p>Hello</p>
<p class="hide">Ignore</p>
<p>There</p>"#,
r#"Hello

There
"#, 20);

// Same as above, but style supplied separately.
test_html_style(br#"
<p>Hello</p>
<p class="hide">Ignore</p>
<p>There</p>"#,
" .hide { display: none; }",
r#"Hello

There
"#, 20);
}
Loading