From 72aec474191bfdc5260ea14bf8f0d1c8cf3cd8df Mon Sep 17 00:00:00 2001 From: Liam Bigelow <40188355+bglw@users.noreply.github.com> Date: Thu, 14 Sep 2023 22:26:30 +1200 Subject: [PATCH] Fixes a bug when indexing some non-breaking spaces in extended mode --- CHANGELOG.md | 2 ++ pagefind/src/fossick/mod.rs | 55 +++++++++++++++++++++++++++++++------ 2 files changed, 49 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 81907cde..1e603502 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,8 @@ ## Unreleased +* Fixes a bug when indexing some non-breaking spaces on ja/zh language pages in extended mode + ## v1.0.1 (September 14, 2023) Hotfix for Pagefind v1.0.0, restoring default-on support for multilingual word segmentation, and helping resolve packaging issues with new dependencies. diff --git a/pagefind/src/fossick/mod.rs b/pagefind/src/fossick/mod.rs index b50f1ba5..90115f32 100644 --- a/pagefind/src/fossick/mod.rs +++ b/pagefind/src/fossick/mod.rs @@ -359,14 +359,18 @@ impl Fossicker { // Only proceed if the word was broken into multiple parts if word_parts.contains(|c: char| c.is_whitespace()) { let part_words: Vec<_> = word_parts.split_whitespace().collect(); - // Index constituents of a compound word as a proportion of the - // weight of the full word. - let per_weight = - (word_weight / part_words.len().try_into().unwrap_or(std::u8::MAX)).max(1); - - // Only index two+ character words - for part_word in part_words.into_iter().filter(|w| w.len() > 1) { - store_word(part_word, word_index, per_weight); + + if !part_words.is_empty() { + // Index constituents of a compound word as a proportion of the + // weight of the full word. + let per_weight = (word_weight + / part_words.len().try_into().unwrap_or(std::u8::MAX)) + .max(1); + + // Only index two+ character words + for part_word in part_words.into_iter().filter(|w| w.len() > 1) { + store_word(part_word, word_index, per_weight); + } } } // Additionally store any special extra characters we are given @@ -774,6 +778,41 @@ mod tests { ); } + #[tokio::test] + async fn parse_significant_whitespace() { + let mut f = test_fossick( + [ + "
", + "Hello \u{a0} \u{a0}World ! .
", + "", + ] + .concat(), + ) + .await; + + let (digest, words, anchors, word_count) = f.parse_digest(); + + assert_eq!( + words, + HashMap::from_iter([ + ( + "hello".to_string(), + vec![FossickedWord { + position: 0, + weight: 1 * 24 + }] + ), + ( + "world".to_string(), + vec![FossickedWord { + position: 1, + weight: 1 * 24 + }] + ) + ]) + ); + } + #[cfg(not(target_os = "windows"))] #[test] fn building_url() {