Skip to content

Commit

Permalink
Merge pull request #36 from chris-ha458/oncecell
Browse files Browse the repository at this point in the history
Change lazy_static into once_cell
  • Loading branch information
nickspring authored Dec 2, 2023
2 parents eb0f58a + 8950d77 commit c426e16
Show file tree
Hide file tree
Showing 11 changed files with 844 additions and 551 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ encoding = "0.2.33"
env_logger = "0.10.0"
icu_normalizer = "1.3.2"
icu_properties = "1.3.2"
lazy_static = "1.4.0"
log = "0.4.20"
once_cell = "1.18.0"
ordered-float = "3.9.1"
regex = "1.9.3"
serde = { version = "1.0.188", features = ["derive"] }
Expand Down
2 changes: 1 addition & 1 deletion benches/large_payload.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use criterion::{black_box, criterion_group, criterion_main, Criterion};

pub fn large_payload(c: &mut Criterion) {
let mut payload = b"hello simple ascii "
.repeat(*TOO_BIG_SEQUENCE)
.repeat(TOO_BIG_SEQUENCE)
.as_slice()
.to_vec();
payload.extend("我没有埋怨,磋砣的只是一些时间。 磋砣的只是一些时间。".as_bytes());
Expand Down
121 changes: 61 additions & 60 deletions src/assets.rs
Original file line number Diff line number Diff line change
@@ -1,65 +1,66 @@
use crate::entity::Language;
use ahash::HashMap;
use lazy_static::lazy_static;

lazy_static! {
pub static ref LANGUAGE_SUPPORTED_COUNT: usize = 41;
pub static ref LANGUAGES: [(Language, &'static str, bool, bool);41] = [
// language, alphabet, have_accents, pure_latin
(Language::English, "eationsrhldcmufpgwbyvkjxzq", false, true, ),
(Language::English, "eationsrhldcumfpgwybvkxjzq", false, true, ),
(Language::German, "enirstadhulgocmbfkwzpvüäöj", true, true, ),
(Language::French, "easnitrluodcpmévgfbhqàxèyj", true, true, ),
(Language::Dutch, "enairtodslghvmukcpbwjzfyxë", true, true, ),
(Language::Italian, "eiaonltrscdupmgvfbzhqèàkyò", true, true, ),
(Language::Polish, "aioenrzwsctkydpmuljłgbhąęó", true, true, ),
(Language::Spanish, "eaonsrildtcumpbgvfyóhqíjzá", true, true, ),
(Language::Russian, "оаеинстрвлкмдпугяызбйьчхжц", false, false, ),
(Language::Japanese, "人一大亅丁丨竹笑口日今二彳行十土丶寸寺時乙丿乂气気冂巾亠市目儿見八小凵県月彐門間木東山出本中刀分耳又取最言田心思刂前京尹事生厶云会未来白冫楽灬馬尸尺駅明耂者了阝都高卜占厂广店子申奄亻俺上方冖学衣艮食自", false, false, ),
(Language::Japanese, "ーンス・ルトリイアラックドシレジタフロカテマィグバムプオコデニウメサビナブャエュチキズダパミェョハセベガモツネボソノァヴワポペピケゴギザホゲォヤヒユヨヘゼヌゥゾヶヂヲヅヵヱヰヮヽ゠ヾヷヿヸヹヺ", false, false, ),
(Language::Japanese, "のにるたとはしいをでてがなれからさっりすあもこまうくよきんめおけそつだやえどわちみせじばへびずろほげむべひょゆぶごゃねふぐぎぼゅづざぞぬぜぱぽぷぴぃぁぇぺゞぢぉぅゐゝゑ゛゜ゎゔ゚ゟ゙ゕゖ", false, false, ),
(Language::Portuguese, "aeosirdntmuclpgvbfhãqéçází", true, true, ),
(Language::Swedish, "eanrtsildomkgvhfupäcböåyjx", true, true, ),
(Language::Chinese, "的一是不了在人有我他这个们中来上大为和国地到以说时要就出会可也你对生能而子那得于着下自之年过发后作里用道行所然家种事成方多经么去法学如都同现当没动面起看定天分还进好小部其些主样理心她本前开但因只从想实", false, false, ),
(Language::Ukrainian, "оаніирвтесклудмпзяьбгйчхцї", false, false, ),
(Language::Norwegian, "erntasioldgkmvfpubhåyjøcæw", false, true, ),
(Language::Finnish, "aintesloukämrvjhpydögcbfwz", true, true, ),
(Language::Vietnamese, "nhticgaoumlràđsevpbyưdákộế", true, true, ),
(Language::Czech, "oeantsilvrkdumpíchzáyjběéř", true, true, ),
(Language::Hungarian, "eatlsnkriozáégmbyvdhupjöfc", true, true, ),
(Language::Korean, "이다에의는로하을가고지서한은기으년대사시를리도인스일", false, false, ),
(Language::Indonesian, "aneirtusdkmlgpbohyjcwfvzxq", false, true, ),
(Language::Turkish, "aeinrlıkdtsmyuobüşvgzhcpçğ", true, true, ),
(Language::Romanian, "eiarntulocsdpmăfvîgbșțzhâj", true, true, ),
(Language::Farsi, "ایردنهومتبسلکشزفگعخقجآپحطص", false, false, ),
(Language::Arabic, "اليمونرتبةعدسفهكقأحجشطصىخإ", false, false, ),
(Language::Danish, "erntaisdlogmkfvubhpåyøæcjw", false, true, ),
(Language::Serbian, "аиоенрсуткјвдмплгзбaieonцш", false, false, ),
(Language::Lithuanian, "iasoretnukmlpvdjgėbyųšžcąį", false, true, ),
(Language::Slovene, "eaionrsltjvkdpmuzbghčcšžfy", false, true, ),
(Language::Slovak, "oaenirvtslkdmpuchjbzáyýíčé", true, true, ),
(Language::Hebrew, "יוהלרבתמאשנעםדקחפסכגטצןזך", false, false, ),
(Language::Bulgarian, "аиоентрсвлкдпмзгяъубчцйжщх", false, false, ),
(Language::Croatian, "aioenrjstuklvdmpgzbcčhšžćf", true, true, ),
(Language::Hindi, "करसनतमहपयलवजदगबशटअएथभडचधषइ", false, false, ),
(Language::Estonian, "aiestlunokrdmvgpjhäbõüfcöy", true, true, ),
(Language::Thai, "านรอกเงมยลวดทสตะปบคหแจพชขใ", false, false, ),
(Language::Greek, "ατοιενρσκηπςυμλίόάγέδήωχθύ", false, false, ),
(Language::Tamil, "கதபடரமலனவறயளசநஇணஅஆழஙஎஉஒஸ", false, false, ),
(Language::Kazakh, "аыентрлідсмқкобиуғжңзшйпгө", false, false, ),
];
use once_cell::sync::Lazy;
use std::iter::FromIterator;

/// Per-language alphabet table, built lazily on first access.
///
/// Each of the 41 rows is a `(language, alphabet, have_accents, pure_latin)` tuple.
/// A language may own more than one row: English appears twice and Japanese three
/// times, each with a different alphabet string.
///
/// NOTE(review): the characters in each alphabet string are presumably ordered by
/// descending frequency in the language; confirm against the consumers of this table
/// before reordering anything.
pub(crate) static LANGUAGES: Lazy<[(Language, &'static str, bool, bool); 41]> = Lazy::new(|| {
[
// Column order: language, alphabet, have_accents, pure_latin
(Language::English, "eationsrhldcmufpgwbyvkjxzq", false, true, ),
(Language::English, "eationsrhldcumfpgwybvkxjzq", false, true, ),
(Language::German, "enirstadhulgocmbfkwzpvüäöj", true, true, ),
(Language::French, "easnitrluodcpmévgfbhqàxèyj", true, true, ),
(Language::Dutch, "enairtodslghvmukcpbwjzfyxë", true, true, ),
(Language::Italian, "eiaonltrscdupmgvfbzhqèàkyò", true, true, ),
(Language::Polish, "aioenrzwsctkydpmuljłgbhąęó", true, true, ),
(Language::Spanish, "eaonsrildtcumpbgvfyóhqíjzá", true, true, ),
(Language::Russian, "оаеинстрвлкмдпугяызбйьчхжц", false, false, ),
(Language::Japanese, "人一大亅丁丨竹笑口日今二彳行十土丶寸寺時乙丿乂气気冂巾亠市目儿見八小凵県月彐門間木東山出本中刀分耳又取最言田心思刂前京尹事生厶云会未来白冫楽灬馬尸尺駅明耂者了阝都高卜占厂广店子申奄亻俺上方冖学衣艮食自", false, false, ),
(Language::Japanese, "ーンス・ルトリイアラックドシレジタフロカテマィグバムプオコデニウメサビナブャエュチキズダパミェョハセベガモツネボソノァヴワポペピケゴギザホゲォヤヒユヨヘゼヌゥゾヶヂヲヅヵヱヰヮヽ゠ヾヷヿヸヹヺ", false, false, ),
(Language::Japanese, "のにるたとはしいをでてがなれからさっりすあもこまうくよきんめおけそつだやえどわちみせじばへびずろほげむべひょゆぶごゃねふぐぎぼゅづざぞぬぜぱぽぷぴぃぁぇぺゞぢぉぅゐゝゑ゛゜ゎゔ゚ゟ゙ゕゖ", false, false, ),
(Language::Portuguese, "aeosirdntmuclpgvbfhãqéçází", true, true, ),
(Language::Swedish, "eanrtsildomkgvhfupäcböåyjx", true, true, ),
(Language::Chinese, "的一是不了在人有我他这个们中来上大为和国地到以说时要就出会可也你对生能而子那得于着下自之年过发后作里用道行所然家种事成方多经么去法学如都同现当没动面起看定天分还进好小部其些主样理心她本前开但因只从想实", false, false, ),
(Language::Ukrainian, "оаніирвтесклудмпзяьбгйчхцї", false, false, ),
(Language::Norwegian, "erntasioldgkmvfpubhåyjøcæw", false, true, ),
(Language::Finnish, "aintesloukämrvjhpydögcbfwz", true, true, ),
(Language::Vietnamese, "nhticgaoumlràđsevpbyưdákộế", true, true, ),
(Language::Czech, "oeantsilvrkdumpíchzáyjběéř", true, true, ),
(Language::Hungarian, "eatlsnkriozáégmbyvdhupjöfc", true, true, ),
(Language::Korean, "이다에의는로하을가고지서한은기으년대사시를리도인스일", false, false, ),
(Language::Indonesian, "aneirtusdkmlgpbohyjcwfvzxq", false, true, ),
(Language::Turkish, "aeinrlıkdtsmyuobüşvgzhcpçğ", true, true, ),
(Language::Romanian, "eiarntulocsdpmăfvîgbșțzhâj", true, true, ),
(Language::Farsi, "ایردنهومتبسلکشزفگعخقجآپحطص", false, false, ),
(Language::Arabic, "اليمونرتبةعدسفهكقأحجشطصىخإ", false, false, ),
(Language::Danish, "erntaisdlogmkfvubhpåyøæcjw", false, true, ),
(Language::Serbian, "аиоенрсуткјвдмплгзбaieonцш", false, false, ),
(Language::Lithuanian, "iasoretnukmlpvdjgėbyųšžcąį", false, true, ),
(Language::Slovene, "eaionrsltjvkdpmuzbghčcšžfy", false, true, ),
(Language::Slovak, "oaenirvtslkdmpuchjbzáyýíčé", true, true, ),
(Language::Hebrew, "יוהלרבתמאשנעםדקחפסכגטצןזך", false, false, ),
(Language::Bulgarian, "аиоентрсвлкдпмзгяъубчцйжщх", false, false, ),
(Language::Croatian, "aioenrjstuklvdmpgzbcčhšžćf", true, true, ),
(Language::Hindi, "करसनतमहपयलवजदगबशटअएथभडचधषइ", false, false, ),
(Language::Estonian, "aiestlunokrdmvgpjhäbõüfcöy", true, true, ),
(Language::Thai, "านรอกเงมยลวดทสตะปบคหแจพชขใ", false, false, ),
(Language::Greek, "ατοιενρσκηπςυμλίόάγέδήωχθύ", false, false, ),
(Language::Tamil, "கதபடரமலனவறயளசநஇணஅஆழஙஎஉஒஸ", false, false, ),
(Language::Kazakh, "аыентрлідсмқкобиуғжңзшйпгө", false, false, ),
]
});
/// Number of rows in `LANGUAGES`, computed lazily from `LANGUAGES.len()`.
///
/// This counts table rows, not distinct languages: `LANGUAGES` lists English twice
/// and Japanese three times, so the value (41) is larger than the number of unique
/// `Language` variants present in the table.
pub(crate) static LANGUAGE_SUPPORTED_COUNT: Lazy<usize> = Lazy::new(|| LANGUAGES.len()); // 41


// direct binding encoding to language
pub(crate) static ref ENCODING_TO_LANGUAGE: HashMap<&'static str, Language> = HashMap::from_iter([
("euc-kr", Language::Korean),
("big5", Language::Chinese),
("hz", Language::Chinese),
("gbk", Language::Chinese),
("gb18030", Language::Chinese),
("euc-jp", Language::Japanese),
("iso-2022-jp", Language::Japanese),
("shift_jis", Language::Japanese),
]);
}
/// Direct bindings from an encoding name to the single language it implies.
///
/// The map is built once, on first access, from a fixed list of
/// `(encoding name, language)` pairs covering Korean, Chinese and Japanese encodings.
pub(crate) static ENCODING_TO_LANGUAGE: Lazy<HashMap<&'static str, Language>> = Lazy::new(|| {
    // Keep the pairs grouped by language so additions land next to their siblings.
    let direct_bindings = [
        // Korean
        ("euc-kr", Language::Korean),
        // Chinese
        ("big5", Language::Chinese),
        ("hz", Language::Chinese),
        ("gbk", Language::Chinese),
        ("gb18030", Language::Chinese),
        // Japanese
        ("euc-jp", Language::Japanese),
        ("iso-2022-jp", Language::Japanese),
        ("shift_jis", Language::Japanese),
    ];

    HashMap::from_iter(direct_bindings)
});
2 changes: 1 addition & 1 deletion src/cd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ pub(crate) fn coherence_ratio(
let mut sufficient_match_count: u64 = 0;

for layer in alpha_unicode_split(&decoded_sequence) {
if layer.chars().count() <= *TOO_SMALL_SEQUENCE {
if layer.chars().count() <= TOO_SMALL_SEQUENCE {
continue;
}
let most_common = layer.chars().collect::<Counter<_>>().most_common_ordered();
Expand Down
Loading

0 comments on commit c426e16

Please sign in to comment.