diff --git a/README.md b/README.md index 3324a79..099912d 100644 --- a/README.md +++ b/README.md @@ -18,22 +18,22 @@ Python version supports more encodings, but a lot of them are old almost unused ## ⚡ Performance -This package offer better performance than Python version (3 times faster, than MYPYC version of charset-normalizer, 6 times faster than usual Python version). -However, in comparison with `chardet` and `chardetng` packages it is slower but more accurate (I guess because it process whole file chunk by chunk). +This package offers better performance than the Python version (4 times faster than the MYPYC version of charset-normalizer, 8 times faster than the usual Python version). +In comparison with the `chardet` and `chardetng` packages, it has approximately the same speed but is more accurate. Here are some numbers. | Package | Accuracy | Mean per file (ms) | File per sec (est) | |---------------------------------------------------------------------------------------------|:----------:|:------------------:|:------------------:| | [chardet](https://crates.io/crates/chardet) | 82.6 % | 2.2 ms | 450 file/sec | | [chardetng](https://crates.io/crates/chardetng) | 90.7 % | 1.6 ms | 625 file/sec | -| charset-normalizer-rs | **97.1 %** | **2.7 ms** | 370 file/sec | +| charset-normalizer-rs | **97.1 %** | **1.8 ms** | 555 file/sec | | [charset-normalizer](https://github.com/Ousret/charset_normalizer) (Python + MYPYC version) | **98 %** | **8 ms** | 125 file/sec | | Package | 99th percentile | 95th percentile | 50th percentile | |---------------------------------------------------------------------------------------------|:---------------:|:---------------:|:---------------:| | [chardet](https://crates.io/crates/chardet) | 8 ms | 2 ms | 0.2 ms | | [chardetng](https://crates.io/crates/chardetng) | 14 ms | 5 ms | 0.5 ms | -| charset-normalizer-rs | 19 ms | 7 ms | 1.2 ms | +| charset-normalizer-rs | 12 ms | 5 ms | 0.7 ms | | 
[charset-normalizer](https://github.com/Ousret/charset_normalizer) (Python + MYPYC version) | 94 ms | 37 ms | 3 ms | Stats are generated using 400+ files using default parameters. These results might change at any time. diff --git a/src/md.rs b/src/md.rs index 5ffeb1c..b10abe6 100644 --- a/src/md.rs +++ b/src/md.rs @@ -1,14 +1,18 @@ #![allow(unused_variables)] #![allow(unused_imports)] + use crate::consts::{COMMON_SAFE_ASCII_CHARACTERS, UTF8_MAXIMAL_ALLOCATION}; -use crate::utils::{is_suspiciously_successive_range, remove_accent, unicode_range}; +use crate::utils::{ + is_accentuated, is_cjk, is_hangul, is_hiragana, is_katakana, is_latin, is_punctuation, + is_separator, is_suspiciously_successive_range, is_thai, remove_accent, unicode_range, +}; use bitflags::{bitflags, Flags}; use cached::proc_macro::cached; use cached::UnboundCache; use log::trace; use ordered_float::OrderedFloat; use unic::char::property::EnumeratedCharProperty; -use unic::ucd::{GeneralCategory, Name}; +use unic::ucd::{is_white_space, GeneralCategory, Name}; // // Mess detection module @@ -64,7 +68,6 @@ impl PartialEq for MessDetectorChar { } impl MessDetectorChar { - pub fn new(character: char) -> Self { new_mess_detector_character(character) } @@ -137,6 +140,7 @@ pub fn new_mess_detector_character(character: char) -> MessDetectorChar { // whitespace if character.is_whitespace() { flags.insert(MessDetectorCharFlags::WHITESPACE); + flags.insert(MessDetectorCharFlags::SEPARATOR); } else { // safe symbols (non-whitespace) if COMMON_SAFE_ASCII_CHARACTERS.contains(character) { @@ -171,18 +175,21 @@ pub fn new_mess_detector_character(character: char) -> MessDetectorChar { // emoticon if MessDetectorChar::in_category(category, range, &[], &[], &["Emoticons"]) { flags.insert(MessDetectorCharFlags::EMOTICON); - } else { - // punctuation - if MessDetectorChar::in_category(category, range, &[], &["P"], &["Punctuation"]) { - flags.insert(MessDetectorCharFlags::PUNCTUATION); - } + } - // separator - if 
MessDetectorChar::in_category(category, range, &["Po", "Pd", "Pc"], &["Z"], &[]) { - flags.insert(MessDetectorCharFlags::SEPARATOR); - } + // separator + if ['|', '+', '<', '>'].contains(&character) + || MessDetectorChar::in_category(category, range, &["Po", "Pd", "Pc"], &["Z"], &[]) + { + flags.insert(MessDetectorCharFlags::SEPARATOR); } } + + // punctuation + if MessDetectorChar::in_category(category, range, &[], &["P"], &["Punctuation"]) { + flags.insert(MessDetectorCharFlags::PUNCTUATION); + } + // symbol if MessDetectorChar::in_category(category, range, &[], &["N", "S"], &["Forms"]) { flags.insert(MessDetectorCharFlags::SYMBOL); diff --git a/src/tests/md.rs b/src/tests/md.rs index f5eadf2..c25ac2b 100644 --- a/src/tests/md.rs +++ b/src/tests/md.rs @@ -33,7 +33,6 @@ fn test_mess_ratio() { #[test] fn test_datasets_mess_ratio() { - env_logger::init(); // TODO remove for (path, encoding) in &get_large_test_datasets().unwrap() { let file = File::open(path); if file.is_err() { @@ -51,7 +50,7 @@ fn test_datasets_mess_ratio() { false, ) { let mr = mess_ratio(decoded_sequence, Some(OrderedFloat(1.0))); - assert!(mr < 0.2, "Mess ration is very high = {} for {}", mr, path); + assert!(mr < 0.2, "Mess ratio is very high = {} for {}", mr, path); } } }