From 1f3e1c2a0744cff2d3d1602e09695b60ea3ab903 Mon Sep 17 00:00:00 2001
From: Yuya Nishihara
Date: Sun, 22 Sep 2024 23:05:34 +0900
Subject: [PATCH] diff: implement some ignore-space rules

The added comparison functions correspond to --ignore-all-space and
--ignore-space-change. --ignore-space-at-eol can be combined with the
other flags, so it will have to be implemented as a preprocessing
function. --ignore-blank-lines will also require some change in the
tokenizer function.
---
 lib/src/diff.rs | 109 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 109 insertions(+)

diff --git a/lib/src/diff.rs b/lib/src/diff.rs
index 2b05bdbd13..94d38b05fb 100644
--- a/lib/src/diff.rs
+++ b/lib/src/diff.rs
@@ -75,6 +75,38 @@ pub fn find_nonword_ranges(text: &[u8]) -> Vec<Range<usize>> {
         .collect()
 }
 
+fn bytes_ignore_all_whitespace(text: &[u8]) -> impl Iterator<Item = u8> + '_ {
+    text.iter().copied().filter(|b| !b.is_ascii_whitespace())
+}
+
+fn bytes_ignore_whitespace_amount(text: &[u8]) -> impl Iterator<Item = u8> + '_ {
+    let mut prev_was_space = false;
+    text.iter().filter_map(move |&b| {
+        let was_space = prev_was_space;
+        let is_space = b.is_ascii_whitespace();
+        prev_was_space = is_space;
+        match (was_space, is_space) {
+            (_, false) => Some(b),
+            (false, true) => Some(b' '),
+            (true, true) => None,
+        }
+    })
+}
+
+fn hash_with_length_suffix<I, H>(data: I, state: &mut H)
+where
+    I: IntoIterator,
+    I::Item: Hash,
+    H: Hasher,
+{
+    let mut len: usize = 0;
+    for d in data {
+        d.hash(state);
+        len += 1;
+    }
+    state.write_usize(len);
+}
+
 /// Compares byte sequences based on a certain equivalence property.
 ///
 /// This isn't a newtype `Wrapper<'a>(&'a [u8])` but an external comparison
@@ -121,6 +153,34 @@ impl CompareBytes for CompareBytesExactly {
     }
 }
 
+/// Compares byte sequences ignoring any whitespace occurrences.
+#[derive(Clone, Debug, Default)]
+pub struct CompareBytesIgnoreAllWhitespace;
+
+impl CompareBytes for CompareBytesIgnoreAllWhitespace {
+    fn eq(&self, left: &[u8], right: &[u8]) -> bool {
+        bytes_ignore_all_whitespace(left).eq(bytes_ignore_all_whitespace(right))
+    }
+
+    fn hash<H: Hasher>(&self, text: &[u8], state: &mut H) {
+        hash_with_length_suffix(bytes_ignore_all_whitespace(text), state);
+    }
+}
+
+/// Compares byte sequences ignoring changes in whitespace amount.
+#[derive(Clone, Debug, Default)]
+pub struct CompareBytesIgnoreWhitespaceAmount;
+
+impl CompareBytes for CompareBytesIgnoreWhitespaceAmount {
+    fn eq(&self, left: &[u8], right: &[u8]) -> bool {
+        bytes_ignore_whitespace_amount(left).eq(bytes_ignore_whitespace_amount(right))
+    }
+
+    fn hash<H: Hasher>(&self, text: &[u8], state: &mut H) {
+        hash_with_length_suffix(bytes_ignore_whitespace_amount(text), state);
+    }
+}
+
 /// Compares words (or tokens) under a certain hasher configuration.
 #[derive(Clone, Debug, Default)]
 struct WordComparator {
@@ -888,6 +948,55 @@ mod tests {
         );
     }
 
+    #[test]
+    fn test_compare_bytes_ignore_all_whitespace() {
+        let comp = WordComparator::new(CompareBytesIgnoreAllWhitespace);
+        let hash = |data: &[u8]| comp.hash_one(data);
+
+        assert!(comp.eq(b"", b""));
+        assert!(comp.eq(b"", b" "));
+        assert!(comp.eq(b"\t", b"\r"));
+        assert_eq!(hash(b""), hash(b""));
+        assert_eq!(hash(b""), hash(b" "));
+        assert_eq!(hash(b""), hash(b"\t"));
+        assert_eq!(hash(b""), hash(b"\r"));
+
+        assert!(comp.eq(b"ab", b" a b\t"));
+        assert_eq!(hash(b"ab"), hash(b" a b\t"));
+
+        assert!(!comp.eq(b"a", b""));
+        assert!(!comp.eq(b"a", b" "));
+        assert!(!comp.eq(b"a", b"ab"));
+        assert!(!comp.eq(b"ab", b"ba"));
+    }
+
+    #[test]
+    fn test_compare_bytes_ignore_whitespace_amount() {
+        let comp = WordComparator::new(CompareBytesIgnoreWhitespaceAmount);
+        let hash = |data: &[u8]| comp.hash_one(data);
+
+        assert!(comp.eq(b"", b""));
+        assert!(comp.eq(b"\n", b" \n"));
+        assert!(comp.eq(b"\t", b"\r"));
+        assert_eq!(hash(b""), hash(b""));
+        assert_eq!(hash(b" "), hash(b"\n"));
+        assert_eq!(hash(b" "), hash(b" \n"));
+        assert_eq!(hash(b" "), hash(b"\t"));
+        assert_eq!(hash(b" "), hash(b"\r"));
+
+        assert!(comp.eq(b"a b c\n", b"a b\tc\r\n"));
+        assert_eq!(hash(b"a b c\n"), hash(b"a b\tc\r\n"));
+
+        assert!(!comp.eq(b"", b" "));
+        assert!(!comp.eq(b"a", b""));
+        assert!(!comp.eq(b"a", b" "));
+        assert!(!comp.eq(b"a", b"a "));
+        assert!(!comp.eq(b"a", b" a"));
+        assert!(!comp.eq(b"a", b"ab"));
+        assert!(!comp.eq(b"ab", b"ba"));
+        assert!(!comp.eq(b"ab", b"a b"));
+    }
+
     fn unchanged_ranges(
         left: &DiffSource,
         right: &DiffSource,