Skip to content

Commit

Permalink
fix bug and change README.md
Browse files Browse the repository at this point in the history
  • Loading branch information
nickspring committed Oct 5, 2023
1 parent 807eb40 commit 082b052
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 18 deletions.
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,22 +18,22 @@ Python version supports more encodings, but a lot of them are old almost unused

## ⚡ Performance

This package offers better performance than the Python version (3 times faster than the MYPYC version of charset-normalizer, 6 times faster than the usual Python version).
However, in comparison with the `chardet` and `chardetng` packages it is slower but more accurate (I guess because it processes the whole file chunk by chunk).
This package offers better performance than the Python version (4 times faster than the MYPYC version of charset-normalizer, 8 times faster than the usual Python version).
In comparison with the `chardet` and `chardetng` packages it has approximately the same speed but is more accurate.
Here are some numbers.

| Package | Accuracy | Mean per file (ms) | File per sec (est) |
|---------------------------------------------------------------------------------------------|:----------:|:------------------:|:------------------:|
| [chardet](https://crates.io/crates/chardet) | 82.6 % | 2.2 ms | 450 file/sec |
| [chardetng](https://crates.io/crates/chardetng) | 90.7 % | 1.6 ms | 625 file/sec |
| charset-normalizer-rs | **97.1 %** | **2.7 ms** | 370 file/sec |
| charset-normalizer-rs | **97.1 %** | **1.8 ms** | 555 file/sec |
| [charset-normalizer](https://github.com/Ousret/charset_normalizer) (Python + MYPYC version) | **98 %** | **8 ms** | 125 file/sec |

| Package | 99th percentile | 95th percentile | 50th percentile |
|---------------------------------------------------------------------------------------------|:---------------:|:---------------:|:---------------:|
| [chardet](https://crates.io/crates/chardet) | 8 ms | 2 ms | 0.2 ms |
| [chardetng](https://crates.io/crates/chardetng) | 14 ms | 5 ms | 0.5 ms |
| charset-normalizer-rs | 19 ms | 7 ms | 1.2 ms |
| charset-normalizer-rs | 12 ms | 5 ms | 0.7 ms |
| [charset-normalizer](https://github.com/Ousret/charset_normalizer) (Python + MYPYC version) | 94 ms | 37 ms | 3 ms |

Stats are generated from 400+ files using default parameters. These results might change at any time.
Expand Down
31 changes: 19 additions & 12 deletions src/md.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,18 @@
#![allow(unused_variables)]
#![allow(unused_imports)]

use crate::consts::{COMMON_SAFE_ASCII_CHARACTERS, UTF8_MAXIMAL_ALLOCATION};
use crate::utils::{is_suspiciously_successive_range, remove_accent, unicode_range};
use crate::utils::{
is_accentuated, is_cjk, is_hangul, is_hiragana, is_katakana, is_latin, is_punctuation,
is_separator, is_suspiciously_successive_range, is_thai, remove_accent, unicode_range,
};
use bitflags::{bitflags, Flags};
use cached::proc_macro::cached;
use cached::UnboundCache;
use log::trace;
use ordered_float::OrderedFloat;
use unic::char::property::EnumeratedCharProperty;
use unic::ucd::{GeneralCategory, Name};
use unic::ucd::{is_white_space, GeneralCategory, Name};

//
// Mess detection module
Expand Down Expand Up @@ -64,7 +68,6 @@ impl PartialEq for MessDetectorChar {
}

impl MessDetectorChar {

/// Creates a `MessDetectorChar` for `character`.
///
/// This is a thin constructor: it forwards directly to
/// `new_mess_detector_character` and returns its result unchanged.
pub fn new(character: char) -> Self {
new_mess_detector_character(character)
}
Expand Down Expand Up @@ -137,6 +140,7 @@ pub fn new_mess_detector_character(character: char) -> MessDetectorChar {
// whitespace
if character.is_whitespace() {
flags.insert(MessDetectorCharFlags::WHITESPACE);
flags.insert(MessDetectorCharFlags::SEPARATOR);
} else {
// safe symbols (non-whitespace)
if COMMON_SAFE_ASCII_CHARACTERS.contains(character) {
Expand Down Expand Up @@ -171,18 +175,21 @@ pub fn new_mess_detector_character(character: char) -> MessDetectorChar {
// emoticon
if MessDetectorChar::in_category(category, range, &[], &[], &["Emoticons"]) {
flags.insert(MessDetectorCharFlags::EMOTICON);
} else {
// punctuation
if MessDetectorChar::in_category(category, range, &[], &["P"], &["Punctuation"]) {
flags.insert(MessDetectorCharFlags::PUNCTUATION);
}
}

// separator
if MessDetectorChar::in_category(category, range, &["Po", "Pd", "Pc"], &["Z"], &[]) {
flags.insert(MessDetectorCharFlags::SEPARATOR);
}
// separator
if ['|', '+', '<', '>'].contains(&character)
|| MessDetectorChar::in_category(category, range, &["Po", "Pd", "Pc"], &["Z"], &[])
{
flags.insert(MessDetectorCharFlags::SEPARATOR);
}
}

// punctuation
if MessDetectorChar::in_category(category, range, &[], &["P"], &["Punctuation"]) {
flags.insert(MessDetectorCharFlags::PUNCTUATION);
}

// symbol
if MessDetectorChar::in_category(category, range, &[], &["N", "S"], &["Forms"]) {
flags.insert(MessDetectorCharFlags::SYMBOL);
Expand Down
3 changes: 1 addition & 2 deletions src/tests/md.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ fn test_mess_ratio() {

#[test]
fn test_datasets_mess_ratio() {
env_logger::init(); // TODO remove
for (path, encoding) in &get_large_test_datasets().unwrap() {
let file = File::open(path);
if file.is_err() {
Expand All @@ -51,7 +50,7 @@ fn test_datasets_mess_ratio() {
false,
) {
let mr = mess_ratio(decoded_sequence, Some(OrderedFloat(1.0)));
assert!(mr < 0.2, "Mess ration is very high = {} for {}", mr, path);
assert!(mr < 0.2, "Mess ratio is very high = {} for {}", mr, path);
}
}
}
Expand Down

0 comments on commit 082b052

Please sign in to comment.