From a0dcb19baf9446cf0e1f56a035d224432e7fbef2 Mon Sep 17 00:00:00 2001 From: chris-ha458 Date: Wed, 22 Nov 2023 23:02:11 +0900 Subject: [PATCH 1/2] change lazy_static into once_cell --- Cargo.lock | 2 +- Cargo.toml | 2 +- benches/large_payload.rs | 2 +- src/assets.rs | 121 +-- src/cd.rs | 2 +- src/consts.rs | 1136 ++++++++++++++++---------- src/entity.rs | 2 +- src/lib.rs | 8 +- src/md/structs.rs | 2 +- src/tests/detection_large_payload.rs | 6 +- src/tests/mod.rs | 111 +-- 11 files changed, 844 insertions(+), 550 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6f90c14..8842707 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -227,8 +227,8 @@ dependencies = [ "env_logger", "icu_normalizer", "icu_properties", - "lazy_static", "log", + "once_cell", "ordered-float", "predicates", "regex", diff --git a/Cargo.toml b/Cargo.toml index df39d7c..39bbf89 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -31,8 +31,8 @@ encoding = "0.2.33" env_logger = "0.10.0" icu_normalizer = "1.3.2" icu_properties = "1.3.2" -lazy_static = "1.4.0" log = "0.4.20" +once_cell = "1.18.0" ordered-float = "3.9.1" regex = "1.9.3" serde = { version = "1.0.188", features = ["derive"] } diff --git a/benches/large_payload.rs b/benches/large_payload.rs index 34febbe..e080b13 100644 --- a/benches/large_payload.rs +++ b/benches/large_payload.rs @@ -5,7 +5,7 @@ use criterion::{black_box, criterion_group, criterion_main, Criterion}; pub fn large_payload(c: &mut Criterion) { let mut payload = b"hello simple ascii " - .repeat(*TOO_BIG_SEQUENCE) + .repeat(TOO_BIG_SEQUENCE) .as_slice() .to_vec(); payload.extend("我没有埋怨,磋砣的只是一些时间。 磋砣的只是一些时间。".as_bytes()); diff --git a/src/assets.rs b/src/assets.rs index fc014b9..0f20934 100644 --- a/src/assets.rs +++ b/src/assets.rs @@ -1,65 +1,66 @@ use crate::entity::Language; use ahash::HashMap; -use lazy_static::lazy_static; -lazy_static! { - pub static ref LANGUAGE_SUPPORTED_COUNT: usize = 41; - pub static ref LANGUAGES: [(Language, &'static str, bool, bool);41] = [ - // language, alphabet, have_accents, pure_latin - (Language::English, "eationsrhldcmufpgwbyvkjxzq", false, true, ), - (Language::English, "eationsrhldcumfpgwybvkxjzq", false, true, ), - (Language::German, "enirstadhulgocmbfkwzpvüäöj", true, true, ), - (Language::French, "easnitrluodcpmévgfbhqàxèyj", true, true, ), - (Language::Dutch, "enairtodslghvmukcpbwjzfyxë", true, true, ), - (Language::Italian, "eiaonltrscdupmgvfbzhqèàkyò", true, true, ), - (Language::Polish, "aioenrzwsctkydpmuljłgbhąęó", true, true, ), - (Language::Spanish, "eaonsrildtcumpbgvfyóhqíjzá", true, true, ), - (Language::Russian, "оаеинстрвлкмдпугяызбйьчхжц", false, false, ), - (Language::Japanese, "人一大亅丁丨竹笑口日今二彳行十土丶寸寺時乙丿乂气気冂巾亠市目儿見八小凵県月彐門間木東山出本中刀分耳又取最言田心思刂前京尹事生厶云会未来白冫楽灬馬尸尺駅明耂者了阝都高卜占厂广店子申奄亻俺上方冖学衣艮食自", false, false, ), - (Language::Japanese, "ーンス・ルトリイアラックドシレジタフロカテマィグバムプオコデニウメサビナブャエュチキズダパミェョハセベガモツネボソノァヴワポペピケゴギザホゲォヤヒユヨヘゼヌゥゾヶヂヲヅヵヱヰヮヽ゠ヾヷヿヸヹヺ", false, false, ), - (Language::Japanese, "のにるたとはしいをでてがなれからさっりすあもこまうくよきんめおけそつだやえどわちみせじばへびずろほげむべひょゆぶごゃねふぐぎぼゅづざぞぬぜぱぽぷぴぃぁぇぺゞぢぉぅゐゝゑ゛゜ゎゔ゚ゟ゙ゕゖ", false, false, ), - (Language::Portuguese, "aeosirdntmuclpgvbfhãqéçází", true, true, ), - (Language::Swedish, "eanrtsildomkgvhfupäcböåyjx", true, true, ), - (Language::Chinese, "的一是不了在人有我他这个们中来上大为和国地到以说时要就出会可也你对生能而子那得于着下自之年过发后作里用道行所然家种事成方多经么去法学如都同现当没动面起看定天分还进好小部其些主样理心她本前开但因只从想实", false, false, ), - (Language::Ukrainian, "оаніирвтесклудмпзяьбгйчхцї", false, false, ), - (Language::Norwegian, "erntasioldgkmvfpubhåyjøcæw", false, true, ), - (Language::Finnish, "aintesloukämrvjhpydögcbfwz", true, true, ), - (Language::Vietnamese, "nhticgaoumlràđsevpbyưdákộế", true, true, ), - (Language::Czech, "oeantsilvrkdumpíchzáyjběéř", true, true, ), - (Language::Hungarian, "eatlsnkriozáégmbyvdhupjöfc", true, true, ), - (Language::Korean, "이다에의는로하을가고지서한은기으년대사시를리도인스일", false, false, ), - (Language::Indonesian, "aneirtusdkmlgpbohyjcwfvzxq", false, true, ), - (Language::Turkish, "aeinrlıkdtsmyuobüşvgzhcpçğ", true, true, ), - (Language::Romanian, "eiarntulocsdpmăfvîgbșțzhâj", true, true, ), - (Language::Farsi, "ایردنهومتبسلکشزفگعخقجآپحطص", false, false, ), - (Language::Arabic, "اليمونرتبةعدسفهكقأحجشطصىخإ", false, false, ), - (Language::Danish, "erntaisdlogmkfvubhpåyøæcjw", false, true, ), - (Language::Serbian, "аиоенрсуткјвдмплгзбaieonцш", false, false, ), - (Language::Lithuanian, "iasoretnukmlpvdjgėbyųšžcąį", false, true, ), - (Language::Slovene, "eaionrsltjvkdpmuzbghčcšžfy", false, true, ), - (Language::Slovak, "oaenirvtslkdmpuchjbzáyýíčé", true, true, ), - (Language::Hebrew, "יוהלרבתמאשנעםדקחפסכגטצןזך", false, false, ), - (Language::Bulgarian, "аиоентрсвлкдпмзгяъубчцйжщх", false, false, ), - (Language::Croatian, "aioenrjstuklvdmpgzbcčhšžćf", true, true, ), - (Language::Hindi, "करसनतमहपयलवजदगबशटअएथभडचधषइ", false, false, ), - (Language::Estonian, "aiestlunokrdmvgpjhäbõüfcöy", true, true, ), - (Language::Thai, "านรอกเงมยลวดทสตะปบคหแจพชขใ", false, false, ), - (Language::Greek, "ατοιενρσκηπςυμλίόάγέδήωχθύ", false, false, ), - (Language::Tamil, "கதபடரமலனவறயளசநஇணஅஆழஙஎஉஒஸ", false, false, ), - (Language::Kazakh, "аыентрлідсмқкобиуғжңзшйпгө", false, false, ), - ]; +use once_cell::sync::Lazy; +use std::iter::FromIterator; +pub(crate) static LANGUAGES: Lazy<[(Language, &'static str, bool, bool); 41]> = Lazy::new(|| { + [ + // language, alphabet, have_accents, pure_latin + (Language::English, "eationsrhldcmufpgwbyvkjxzq", false, true, ), + (Language::English, "eationsrhldcumfpgwybvkxjzq", false, true, ), + (Language::German, "enirstadhulgocmbfkwzpvüäöj", true, true, ), + (Language::French, "easnitrluodcpmévgfbhqàxèyj", true, true, ), + (Language::Dutch, "enairtodslghvmukcpbwjzfyxë", true, true, ), + (Language::Italian, "eiaonltrscdupmgvfbzhqèàkyò", true, true, ), + (Language::Polish, "aioenrzwsctkydpmuljłgbhąęó", true, true, ), + (Language::Spanish, "eaonsrildtcumpbgvfyóhqíjzá", true, true, ), + (Language::Russian, "оаеинстрвлкмдпугяызбйьчхжц", false, false, ), + (Language::Japanese, "人一大亅丁丨竹笑口日今二彳行十土丶寸寺時乙丿乂气気冂巾亠市目儿見八小凵県月彐門間木東山出本中刀分耳又取最言田心思刂前京尹事生厶云会未来白冫楽灬馬尸尺駅明耂者了阝都高卜占厂广店子申奄亻俺上方冖学衣艮食自", false, false, ), + (Language::Japanese, "ーンス・ルトリイアラックドシレジタフロカテマィグバムプオコデニウメサビナブャエュチキズダパミェョハセベガモツネボソノァヴワポペピケゴギザホゲォヤヒユヨヘゼヌゥゾヶヂヲヅヵヱヰヮヽ゠ヾヷヿヸヹヺ", false, false, ), + (Language::Japanese, "のにるたとはしいをでてがなれからさっりすあもこまうくよきんめおけそつだやえどわちみせじばへびずろほげむべひょゆぶごゃねふぐぎぼゅづざぞぬぜぱぽぷぴぃぁぇぺゞぢぉぅゐゝゑ゛゜ゎゔ゚ゟ゙ゕゖ", false, false, ), + (Language::Portuguese, "aeosirdntmuclpgvbfhãqéçází", true, true, ), + (Language::Swedish, "eanrtsildomkgvhfupäcböåyjx", true, true, ), + (Language::Chinese, "的一是不了在人有我他这个们中来上大为和国地到以说时要就出会可也你对生能而子那得于着下自之年过发后作里用道行所然家种事成方多经么去法学如都同现当没动面起看定天分还进好小部其些主样理心她本前开但因只从想实", false, false, ), + (Language::Ukrainian, "оаніирвтесклудмпзяьбгйчхцї", false, false, ), + (Language::Norwegian, "erntasioldgkmvfpubhåyjøcæw", false, true, ), + (Language::Finnish, "aintesloukämrvjhpydögcbfwz", true, true, ), + (Language::Vietnamese, "nhticgaoumlràđsevpbyưdákộế", true, true, ), + (Language::Czech, "oeantsilvrkdumpíchzáyjběéř", true, true, ), + (Language::Hungarian, "eatlsnkriozáégmbyvdhupjöfc", true, true, ), + (Language::Korean, "이다에의는로하을가고지서한은기으년대사시를리도인스일", false, false, ), + (Language::Indonesian, "aneirtusdkmlgpbohyjcwfvzxq", false, true, ), + (Language::Turkish, "aeinrlıkdtsmyuobüşvgzhcpçğ", true, true, ), + (Language::Romanian, "eiarntulocsdpmăfvîgbșțzhâj", true, true, ), + (Language::Farsi, "ایردنهومتبسلکشزفگعخقجآپحطص", false, false, ), + (Language::Arabic, "اليمونرتبةعدسفهكقأحجشطصىخإ", false, false, ), + (Language::Danish, "erntaisdlogmkfvubhpåyøæcjw", false, true, ), + (Language::Serbian, "аиоенрсуткјвдмплгзбaieonцш", false, false, ), + (Language::Lithuanian, "iasoretnukmlpvdjgėbyųšžcąį", false, true, ), + (Language::Slovene, "eaionrsltjvkdpmuzbghčcšžfy", false, true, ), + (Language::Slovak, "oaenirvtslkdmpuchjbzáyýíčé", true, true, ), + (Language::Hebrew, "יוהלרבתמאשנעםדקחפסכגטצןזך", false, false, ), + (Language::Bulgarian, "аиоентрсвлкдпмзгяъубчцйжщх", false, false, ), + (Language::Croatian, "aioenrjstuklvdmpgzbcčhšžćf", true, true, ), + (Language::Hindi, "करसनतमहपयलवजदगबशटअएथभडचधषइ", false, false, ), + (Language::Estonian, "aiestlunokrdmvgpjhäbõüfcöy", true, true, ), + (Language::Thai, "านรอกเงมยลวดทสตะปบคหแจพชขใ", false, false, ), + (Language::Greek, "ατοιενρσκηπςυμλίόάγέδήωχθύ", false, false, ), + (Language::Tamil, "கதபடரமலனவறயளசநஇணஅஆழஙஎஉஒஸ", false, false, ), + (Language::Kazakh, "аыентрлідсмқкобиуғжңзшйпгө", false, false, ), +] +}); +pub(crate) static LANGUAGE_SUPPORTED_COUNT: Lazy = Lazy::new(|| LANGUAGES.len()); // 41 - - // direct binding encoding to language - pub(crate) static ref ENCODING_TO_LANGUAGE: HashMap<&'static str, Language> = HashMap::from_iter([ - ("euc-kr", Language::Korean), - ("big5", Language::Chinese), - ("hz", Language::Chinese), - ("gbk", Language::Chinese), - ("gb18030", Language::Chinese), - ("euc-jp", Language::Japanese), - ("iso-2022-jp", Language::Japanese), - ("shift_jis", Language::Japanese), - ]); -} +pub(crate) static ENCODING_TO_LANGUAGE: Lazy> = Lazy::new(|| { + HashMap::from_iter([ + ("euc-kr", Language::Korean), + ("big5", Language::Chinese), + ("hz", Language::Chinese), + ("gbk", Language::Chinese), + ("gb18030", Language::Chinese), + ("euc-jp", Language::Japanese), + ("iso-2022-jp", Language::Japanese), + ("shift_jis", Language::Japanese), + ]) +}); diff --git a/src/cd.rs b/src/cd.rs index 2665553..7808596 100644 --- a/src/cd.rs +++ b/src/cd.rs @@ -210,7 +210,7 @@ pub(crate) fn coherence_ratio( let mut sufficient_match_count: u64 = 0; for layer in alpha_unicode_split(&decoded_sequence) { - if layer.chars().count() <= *TOO_SMALL_SEQUENCE { + if layer.chars().count() <= TOO_SMALL_SEQUENCE { continue; } let most_common = layer.chars().collect::>().most_common_ordered(); diff --git a/src/consts.rs b/src/consts.rs index beeb7cc..9ca001c 100644 --- a/src/consts.rs +++ b/src/consts.rs @@ -1,307 +1,314 @@ use ahash::{HashMap, HashSet}; use core::ops::RangeInclusive; use encoding::all::encodings; -use lazy_static::lazy_static; +use once_cell::sync::Lazy; use regex::Regex; -lazy_static! { - - // Contain for each eligible encoding a list of/item bytes SIG/BOM - pub(crate) static ref ENCODING_MARKS: HashMap<&'static str, &'static [u8]> = HashMap::from_iter([ +// Contain for each eligible encoding a list of/item bytes SIG/BOM +pub(crate) static ENCODING_MARKS: Lazy> = Lazy::new(|| { + HashMap::from_iter([ ("utf-8", b"\xef\xbb\xbf".as_slice()), ("gb18030", b"\x84\x31\x95\x33".as_slice()), ("utf-16le", b"\xff\xfe".as_slice()), ("utf-16be", b"\xfe\xff".as_slice()), - ]); + ]) +}); - pub static ref MAX_PROCESSED_BYTES: usize = 500_000; - pub static ref TOO_SMALL_SEQUENCE: usize = 32; - pub static ref TOO_BIG_SEQUENCE: usize = 1_000_000; // 10E6 +pub static TOO_BIG_SEQUENCE: usize = 1_000_000; // 10E6 +pub(crate) static MAX_PROCESSED_BYTES: usize = 500_000; +pub(crate) static TOO_SMALL_SEQUENCE: usize = 32; +pub(crate) static UTF8_MAXIMAL_ALLOCATION: usize = 1_112_064; - pub(crate) static ref UTF8_MAXIMAL_ALLOCATION: usize = 1_112_064; - pub(crate) static ref UNICODE_RANGES_COMBINED: [(&'static str, RangeInclusive);279] = [ - ("Control character", 0..=31), - ("Basic Latin", 32..=127), - ("Latin-1 Supplement", 128..=255), - ("Latin Extended-A", 256..=383), - ("Latin Extended-B", 384..=591), - ("IPA Extensions", 592..=687), - ("Spacing Modifier Letters", 688..=767), - ("Combining Diacritical Marks", 768..=879), - ("Greek and Coptic", 880..=1023), - ("Cyrillic", 1024..=1279), - ("Cyrillic Supplement", 1280..=1327), - ("Armenian", 1328..=1423), - ("Hebrew", 1424..=1535), - ("Arabic", 1536..=1791), - ("Syriac", 1792..=1871), - ("Arabic Supplement", 1872..=1919), - ("Thaana", 1920..=1983), - ("NKo", 1984..=2047), - ("Samaritan", 2048..=2111), - ("Mandaic", 2112..=2143), - ("Syriac Supplement", 2144..=2159), - ("Arabic Extended-A", 2208..=2303), - ("Devanagari", 2304..=2431), - ("Bengali", 2432..=2559), - ("Gurmukhi", 2560..=2687), - ("Gujarati", 2688..=2815), - ("Oriya", 2816..=2943), - ("Tamil", 2944..=3071), - ("Telugu", 3072..=3199), - ("Kannada", 3200..=3327), - ("Malayalam", 3328..=3455), - ("Sinhala", 3456..=3583), - ("Thai", 3584..=3711), - ("Lao", 3712..=3839), - ("Tibetan", 3840..=4095), - ("Myanmar", 4096..=4255), - ("Georgian", 4256..=4351), - ("Hangul Jamo", 4352..=4607), - ("Ethiopic", 4608..=4991), - ("Ethiopic Supplement", 4992..=5023), - ("Cherokee", 5024..=5119), - ("Unified Canadian Aboriginal Syllabics", 5120..=5759), - ("Ogham", 5760..=5791), - ("Runic", 5792..=5887), - ("Tagalog", 5888..=5919), - ("Hanunoo", 5920..=5951), - ("Buhid", 5952..=5983), - ("Tagbanwa", 5984..=6015), - ("Khmer", 6016..=6143), - ("Mongolian", 6144..=6319), - ("Unified Canadian Aboriginal Syllabics Extended", 6320..=6399), - ("Limbu", 6400..=6479), - ("Tai Le", 6480..=6527), - ("New Tai Lue", 6528..=6623), - ("Khmer Symbols", 6624..=6655), - ("Buginese", 6656..=6687), - ("Tai Tham", 6688..=6831), - ("Combining Diacritical Marks Extended", 6832..=6911), - ("Balinese", 6912..=7039), - ("Sundanese", 7040..=7103), - ("Batak", 7104..=7167), - ("Lepcha", 7168..=7247), - ("Ol Chiki", 7248..=7295), - ("Cyrillic Extended C", 7296..=7311), - ("Sundanese Supplement", 7360..=7375), - ("Vedic Extensions", 7376..=7423), - ("Phonetic Extensions", 7424..=7551), - ("Phonetic Extensions Supplement", 7552..=7615), - ("Combining Diacritical Marks Supplement", 7616..=7679), - ("Latin Extended Additional", 7680..=7935), - ("Greek Extended", 7936..=8191), - ("General Punctuation", 8192..=8303), - ("Superscripts and Subscripts", 8304..=8351), - ("Currency Symbols", 8352..=8399), - ("Combining Diacritical Marks for Symbols", 8400..=8447), - ("Letterlike Symbols", 8448..=8527), - ("Number Forms", 8528..=8591), - ("Arrows", 8592..=8703), - ("Mathematical Operators", 8704..=8959), - ("Miscellaneous Technical", 8960..=9215), - ("Control Pictures", 9216..=9279), - ("Optical Character Recognition", 9280..=9311), - ("Enclosed Alphanumerics", 9312..=9471), - ("Box Drawing", 9472..=9599), - ("Block Elements", 9600..=9631), - ("Geometric Shapes", 9632..=9727), - ("Miscellaneous Symbols", 9728..=9983), - ("Dingbats", 9984..=10175), - ("Miscellaneous Mathematical Symbols-A", 10176..=10223), - ("Supplemental Arrows-A", 10224..=10239), - ("Braille Patterns", 10240..=10495), - ("Supplemental Arrows-B", 10496..=10623), - ("Miscellaneous Mathematical Symbols-B", 10624..=10751), - ("Supplemental Mathematical Operators", 10752..=11007), - ("Miscellaneous Symbols and Arrows", 11008..=11263), - ("Glagolitic", 11264..=11359), - ("Latin Extended-C", 11360..=11391), - ("Coptic", 11392..=11519), - ("Georgian Supplement", 11520..=11567), - ("Tifinagh", 11568..=11647), - ("Ethiopic Extended", 11648..=11743), - ("Cyrillic Extended-A", 11744..=11775), - ("Supplemental Punctuation", 11776..=11903), - ("CJK Radicals Supplement", 11904..=12031), - ("Kangxi Radicals", 12032..=12255), - ("Ideographic Description Characters", 12272..=12287), - ("CJK Symbols and Punctuation", 12288..=12351), - ("Hiragana", 12352..=12447), - ("Katakana", 12448..=12543), - ("Bopomofo", 12544..=12591), - ("Hangul Compatibility Jamo", 12592..=12687), - ("Kanbun", 12688..=12703), - ("Bopomofo Extended", 12704..=12735), - ("CJK Strokes", 12736..=12783), - ("Katakana Phonetic Extensions", 12784..=12799), - ("Enclosed CJK Letters and Months", 12800..=13055), - ("CJK Compatibility", 13056..=13311), - ("CJK Unified Ideographs Extension A", 13312..=19903), - ("Yijing Hexagram Symbols", 19904..=19967), - ("CJK Unified Ideographs", 19968..=40959), - ("Yi Syllables", 40960..=42127), - ("Yi Radicals", 42128..=42191), - ("Lisu", 42192..=42239), - ("Vai", 42240..=42559), - ("Cyrillic Extended-B", 42560..=42655), - ("Bamum", 42656..=42751), - ("Modifier Tone Letters", 42752..=42783), - ("Latin Extended-D", 42784..=43007), - ("Syloti Nagri", 43008..=43055), - ("Common Indic Number Forms", 43056..=43071), - ("Phags-pa", 43072..=43135), - ("Saurashtra", 43136..=43231), - ("Devanagari Extended", 43232..=43263), - ("Kayah Li", 43264..=43311), - ("Rejang", 43312..=43359), - ("Hangul Jamo Extended-A", 43360..=43391), - ("Javanese", 43392..=43487), - ("Myanmar Extended-B", 43488..=43519), - ("Cham", 43520..=43615), - ("Myanmar Extended-A", 43616..=43647), - ("Tai Viet", 43648..=43743), - ("Meetei Mayek Extensions", 43744..=43775), - ("Ethiopic Extended-A", 43776..=43823), - ("Latin Extended-E", 43824..=43887), - ("Cherokee Supplement", 43888..=43967), - ("Meetei Mayek", 43968..=44031), - ("Hangul Syllables", 44032..=55215), - ("Hangul Jamo Extended-B", 55216..=55295), - ("High Surrogates", 55296..=56191), - ("High Private Use Surrogates", 56192..=56319), - ("Low Surrogates", 56320..=57343), - ("Private Use Area", 57344..=63743), - ("CJK Compatibility Ideographs", 63744..=64255), - ("Alphabetic Presentation Forms", 64256..=64335), - ("Arabic Presentation Forms-A", 64336..=65023), - ("Variation Selectors", 65024..=65039), - ("Vertical Forms", 65040..=65055), - ("Combining Half Marks", 65056..=65071), - ("CJK Compatibility Forms", 65072..=65103), - ("Small Form Variants", 65104..=65135), - ("Arabic Presentation Forms-B", 65136..=65279), - ("Halfwidth and Fullwidth Forms", 65280..=65519), - ("Specials", 65520..=65535), - ("Linear B Syllabary", 65536..=65663), - ("Linear B Ideograms", 65664..=65791), - ("Aegean Numbers", 65792..=65855), - ("Ancient Greek Numbers", 65856..=65935), - ("Ancient Symbols", 65936..=65999), - ("Phaistos Disc", 66000..=66047), - ("Lycian", 66176..=66207), - ("Carian", 66208..=66271), - ("Coptic Epact Numbers", 66272..=66303), - ("Old Italic", 66304..=66351), - ("Gothic", 66352..=66383), - ("Old Permic", 66384..=66431), - ("Ugaritic", 66432..=66463), - ("Old Persian", 66464..=66527), - ("Deseret", 66560..=66639), - ("Shavian", 66640..=66687), - ("Osmanya", 66688..=66735), - ("Osage", 66736..=66815), - ("Elbasan", 66816..=66863), - ("Caucasian Albanian", 66864..=66927), - ("Linear A", 67072..=67455), - ("Cypriot Syllabary", 67584..=67647), - ("Imperial Aramaic", 67648..=67679), - ("Palmyrene", 67680..=67711), - ("Nabataean", 67712..=67759), - ("Hatran", 67808..=67839), - ("Phoenician", 67840..=67871), - ("Lydian", 67872..=67903), - ("Meroitic Hieroglyphs", 67968..=67999), - ("Meroitic Cursive", 68000..=68095), - ("Kharoshthi", 68096..=68191), - ("Old South Arabian", 68192..=68223), - ("Old North Arabian", 68224..=68255), - ("Manichaean", 68288..=68351), - ("Avestan", 68352..=68415), - ("Inscriptional Parthian", 68416..=68447), - ("Inscriptional Pahlavi", 68448..=68479), - ("Psalter Pahlavi", 68480..=68527), - ("Old Turkic", 68608..=68687), - ("Old Hungarian", 68736..=68863), - ("Rumi Numeral Symbols", 69216..=69247), - ("Brahmi", 69632..=69759), - ("Kaithi", 69760..=69839), - ("Sora Sompeng", 69840..=69887), - ("Chakma", 69888..=69967), - ("Mahajani", 69968..=70015), - ("Sharada", 70016..=70111), - ("Sinhala Archaic Numbers", 70112..=70143), - ("Khojki", 70144..=70223), - ("Multani", 70272..=70319), - ("Khudawadi", 70320..=70399), - ("Grantha", 70400..=70527), - ("Newa", 70656..=70783), - ("Tirhuta", 70784..=70879), - ("Siddham", 71040..=71167), - ("Modi", 71168..=71263), - ("Mongolian Supplement", 71264..=71295), - ("Takri", 71296..=71375), - ("Ahom", 71424..=71487), - ("Warang Citi", 71840..=71935), - ("Zanabazar Square", 72192..=72271), - ("Soyombo", 72272..=72367), - ("Pau Cin Hau", 72384..=72447), - ("Bhaiksuki", 72704..=72815), - ("Marchen", 72816..=72895), - ("Masaram Gondi", 72960..=73055), - ("Cuneiform", 73728..=74751), - ("Cuneiform Numbers and Punctuation", 74752..=74879), - ("Early Dynastic Cuneiform", 74880..=75087), - ("Egyptian Hieroglyphs", 77824..=78895), - ("Anatolian Hieroglyphs", 82944..=83583), - ("Bamum Supplement", 92160..=92735), - ("Mro", 92736..=92783), - ("Bassa Vah", 92880..=92927), - ("Pahawh Hmong", 92928..=93071), - ("Miao", 93952..=94111), - ("Ideographic Symbols and Punctuation", 94176..=94207), - ("Tangut", 94208..=100_351), - ("Tangut Components", 100_352..=101_119), - ("Kana Supplement", 110_592..=110_847), - ("Kana Extended-A", 110_848..=110_895), - ("Nushu", 110_960..=111_359), - ("Duployan", 113_664..=113_823), - ("Shorthand Format Controls", 113_824..=113_839), - ("Byzantine Musical Symbols", 118_784..=119_039), - ("Musical Symbols", 119_040..=119_295), - ("Ancient Greek Musical Notation", 119_296..=119_375), - ("Tai Xuan Jing Symbols", 119_552..=119_647), - ("Counting Rod Numerals", 119_648..=119_679), - ("Mathematical Alphanumeric Symbols", 119_808..=120_831), - ("Sutton SignWriting", 120_832..=121_519), - ("Glagolitic Supplement", 122_880..=122_927), - ("Mende Kikakui", 124_928..=125_151), - ("Adlam", 125_184..=125_279), - ("Arabic Mathematical Alphabetic Symbols", 126_464..=126_719), - ("Mahjong Tiles", 126_976..=127_023), - ("Domino Tiles", 127_024..=127_135), - ("Playing Cards", 127_136..=127_231), - ("Enclosed Alphanumeric Supplement", 127_232..=127_487), - ("Enclosed Ideographic Supplement", 127_488..=127_743), - ("Miscellaneous Symbols and Pictographs", 127_744..=128_511), - ("Emoticons range(Emoji)", 128_512..=128_591), - ("Ornamental Dingbats", 128_592..=128_639), - ("Transport and Map Symbols", 128_640..=128_767), - ("Alchemical Symbols", 128_768..=128_895), - ("Geometric Shapes Extended", 128_896..=129_023), - ("Supplemental Arrows-C", 129_024..=129_279), - ("Supplemental Symbols and Pictographs", 129_280..=129_535), - ("CJK Unified Ideographs Extension B", 131_072..=173_791), - ("CJK Unified Ideographs Extension C", 173_824..=177_983), - ("CJK Unified Ideographs Extension D", 177_984..=178_207), - ("CJK Unified Ideographs Extension E", 178_208..=183_983), - ("CJK Unified Ideographs Extension F", 183_984..=191_471), - ("CJK Compatibility Ideographs Supplement", 194_560..=195_103), - ("Tags", 917_504..=917_631), - ("Variation Selectors Supplement", 917_760..=917_999), - ]; +pub(crate) static UNICODE_RANGES_COMBINED: Lazy<[(&'static str, RangeInclusive); 279]> = + Lazy::new(|| { + [ + ("Control character", 0..=31), + ("Basic Latin", 32..=127), + ("Latin-1 Supplement", 128..=255), + ("Latin Extended-A", 256..=383), + ("Latin Extended-B", 384..=591), + ("IPA Extensions", 592..=687), + ("Spacing Modifier Letters", 688..=767), + ("Combining Diacritical Marks", 768..=879), + ("Greek and Coptic", 880..=1023), + ("Cyrillic", 1024..=1279), + ("Cyrillic Supplement", 1280..=1327), + ("Armenian", 1328..=1423), + ("Hebrew", 1424..=1535), + ("Arabic", 1536..=1791), + ("Syriac", 1792..=1871), + ("Arabic Supplement", 1872..=1919), + ("Thaana", 1920..=1983), + ("NKo", 1984..=2047), + ("Samaritan", 2048..=2111), + ("Mandaic", 2112..=2143), + ("Syriac Supplement", 2144..=2159), + ("Arabic Extended-A", 2208..=2303), + ("Devanagari", 2304..=2431), + ("Bengali", 2432..=2559), + ("Gurmukhi", 2560..=2687), + ("Gujarati", 2688..=2815), + ("Oriya", 2816..=2943), + ("Tamil", 2944..=3071), + ("Telugu", 3072..=3199), + ("Kannada", 3200..=3327), + ("Malayalam", 3328..=3455), + ("Sinhala", 3456..=3583), + ("Thai", 3584..=3711), + ("Lao", 3712..=3839), + ("Tibetan", 3840..=4095), + ("Myanmar", 4096..=4255), + ("Georgian", 4256..=4351), + ("Hangul Jamo", 4352..=4607), + ("Ethiopic", 4608..=4991), + ("Ethiopic Supplement", 4992..=5023), + ("Cherokee", 5024..=5119), + ("Unified Canadian Aboriginal Syllabics", 5120..=5759), + ("Ogham", 5760..=5791), + ("Runic", 5792..=5887), + ("Tagalog", 5888..=5919), + ("Hanunoo", 5920..=5951), + ("Buhid", 5952..=5983), + ("Tagbanwa", 5984..=6015), + ("Khmer", 6016..=6143), + ("Mongolian", 6144..=6319), + ( + "Unified Canadian Aboriginal Syllabics Extended", + 6320..=6399, + ), + ("Limbu", 6400..=6479), + ("Tai Le", 6480..=6527), + ("New Tai Lue", 6528..=6623), + ("Khmer Symbols", 6624..=6655), + ("Buginese", 6656..=6687), + ("Tai Tham", 6688..=6831), + ("Combining Diacritical Marks Extended", 6832..=6911), + ("Balinese", 6912..=7039), + ("Sundanese", 7040..=7103), + ("Batak", 7104..=7167), + ("Lepcha", 7168..=7247), + ("Ol Chiki", 7248..=7295), + ("Cyrillic Extended C", 7296..=7311), + ("Sundanese Supplement", 7360..=7375), + ("Vedic Extensions", 7376..=7423), + ("Phonetic Extensions", 7424..=7551), + ("Phonetic Extensions Supplement", 7552..=7615), + ("Combining Diacritical Marks Supplement", 7616..=7679), + ("Latin Extended Additional", 7680..=7935), + ("Greek Extended", 7936..=8191), + ("General Punctuation", 8192..=8303), + ("Superscripts and Subscripts", 8304..=8351), + ("Currency Symbols", 8352..=8399), + ("Combining Diacritical Marks for Symbols", 8400..=8447), + ("Letterlike Symbols", 8448..=8527), + ("Number Forms", 8528..=8591), + ("Arrows", 8592..=8703), + ("Mathematical Operators", 8704..=8959), + ("Miscellaneous Technical", 8960..=9215), + ("Control Pictures", 9216..=9279), + ("Optical Character Recognition", 9280..=9311), + ("Enclosed Alphanumerics", 9312..=9471), + ("Box Drawing", 9472..=9599), + ("Block Elements", 9600..=9631), + ("Geometric Shapes", 9632..=9727), + ("Miscellaneous Symbols", 9728..=9983), + ("Dingbats", 9984..=10175), + ("Miscellaneous Mathematical Symbols-A", 10176..=10223), + ("Supplemental Arrows-A", 10224..=10239), + ("Braille Patterns", 10240..=10495), + ("Supplemental Arrows-B", 10496..=10623), + ("Miscellaneous Mathematical Symbols-B", 10624..=10751), + ("Supplemental Mathematical Operators", 10752..=11007), + ("Miscellaneous Symbols and Arrows", 11008..=11263), + ("Glagolitic", 11264..=11359), + ("Latin Extended-C", 11360..=11391), + ("Coptic", 11392..=11519), + ("Georgian Supplement", 11520..=11567), + ("Tifinagh", 11568..=11647), + ("Ethiopic Extended", 11648..=11743), + ("Cyrillic Extended-A", 11744..=11775), + ("Supplemental Punctuation", 11776..=11903), + ("CJK Radicals Supplement", 11904..=12031), + ("Kangxi Radicals", 12032..=12255), + ("Ideographic Description Characters", 12272..=12287), + ("CJK Symbols and Punctuation", 12288..=12351), + ("Hiragana", 12352..=12447), + ("Katakana", 12448..=12543), + ("Bopomofo", 12544..=12591), + ("Hangul Compatibility Jamo", 12592..=12687), + ("Kanbun", 12688..=12703), + ("Bopomofo Extended", 12704..=12735), + ("CJK Strokes", 12736..=12783), + ("Katakana Phonetic Extensions", 12784..=12799), + ("Enclosed CJK Letters and Months", 12800..=13055), + ("CJK Compatibility", 13056..=13311), + ("CJK Unified Ideographs Extension A", 13312..=19903), + ("Yijing Hexagram Symbols", 19904..=19967), + ("CJK Unified Ideographs", 19968..=40959), + ("Yi Syllables", 40960..=42127), + ("Yi Radicals", 42128..=42191), + ("Lisu", 42192..=42239), + ("Vai", 42240..=42559), + ("Cyrillic Extended-B", 42560..=42655), + ("Bamum", 42656..=42751), + ("Modifier Tone Letters", 42752..=42783), + ("Latin Extended-D", 42784..=43007), + ("Syloti Nagri", 43008..=43055), + ("Common Indic Number Forms", 43056..=43071), + ("Phags-pa", 43072..=43135), + ("Saurashtra", 43136..=43231), + ("Devanagari Extended", 43232..=43263), + ("Kayah Li", 43264..=43311), + ("Rejang", 43312..=43359), + ("Hangul Jamo Extended-A", 43360..=43391), + ("Javanese", 43392..=43487), + ("Myanmar Extended-B", 43488..=43519), + ("Cham", 43520..=43615), + ("Myanmar Extended-A", 43616..=43647), + ("Tai Viet", 43648..=43743), + ("Meetei Mayek Extensions", 43744..=43775), + ("Ethiopic Extended-A", 43776..=43823), + ("Latin Extended-E", 43824..=43887), + ("Cherokee Supplement", 43888..=43967), + ("Meetei Mayek", 43968..=44031), + ("Hangul Syllables", 44032..=55215), + ("Hangul Jamo Extended-B", 55216..=55295), + ("High Surrogates", 55296..=56191), + ("High Private Use Surrogates", 56192..=56319), + ("Low Surrogates", 56320..=57343), + ("Private Use Area", 57344..=63743), + ("CJK Compatibility Ideographs", 63744..=64255), + ("Alphabetic Presentation Forms", 64256..=64335), + ("Arabic Presentation Forms-A", 64336..=65023), + ("Variation Selectors", 65024..=65039), + ("Vertical Forms", 65040..=65055), + ("Combining Half Marks", 65056..=65071), + ("CJK Compatibility Forms", 65072..=65103), + ("Small Form Variants", 65104..=65135), + ("Arabic Presentation Forms-B", 65136..=65279), + ("Halfwidth and Fullwidth Forms", 65280..=65519), + ("Specials", 65520..=65535), + ("Linear B Syllabary", 65536..=65663), + ("Linear B Ideograms", 65664..=65791), + ("Aegean Numbers", 65792..=65855), + ("Ancient Greek Numbers", 65856..=65935), + ("Ancient Symbols", 65936..=65999), + ("Phaistos Disc", 66000..=66047), + ("Lycian", 66176..=66207), + ("Carian", 66208..=66271), + ("Coptic Epact Numbers", 66272..=66303), + ("Old Italic", 66304..=66351), + ("Gothic", 66352..=66383), + ("Old Permic", 66384..=66431), + ("Ugaritic", 66432..=66463), + ("Old Persian", 66464..=66527), + ("Deseret", 66560..=66639), + ("Shavian", 66640..=66687), + ("Osmanya", 66688..=66735), + ("Osage", 66736..=66815), + ("Elbasan", 66816..=66863), + ("Caucasian Albanian", 66864..=66927), + ("Linear A", 67072..=67455), + ("Cypriot Syllabary", 67584..=67647), + ("Imperial Aramaic", 67648..=67679), + ("Palmyrene", 67680..=67711), + ("Nabataean", 67712..=67759), + ("Hatran", 67808..=67839), + ("Phoenician", 67840..=67871), + ("Lydian", 67872..=67903), + ("Meroitic Hieroglyphs", 67968..=67999), + ("Meroitic Cursive", 68000..=68095), + ("Kharoshthi", 68096..=68191), + ("Old South Arabian", 68192..=68223), + ("Old North Arabian", 68224..=68255), + ("Manichaean", 68288..=68351), + ("Avestan", 68352..=68415), + ("Inscriptional Parthian", 68416..=68447), + ("Inscriptional Pahlavi", 68448..=68479), + ("Psalter Pahlavi", 68480..=68527), + ("Old Turkic", 68608..=68687), + ("Old Hungarian", 68736..=68863), + ("Rumi Numeral Symbols", 69216..=69247), + ("Brahmi", 69632..=69759), + ("Kaithi", 69760..=69839), + ("Sora Sompeng", 69840..=69887), + ("Chakma", 69888..=69967), + ("Mahajani", 69968..=70015), + ("Sharada", 70016..=70111), + ("Sinhala Archaic Numbers", 70112..=70143), + ("Khojki", 70144..=70223), + ("Multani", 70272..=70319), + ("Khudawadi", 70320..=70399), + ("Grantha", 70400..=70527), + ("Newa", 70656..=70783), + ("Tirhuta", 70784..=70879), + ("Siddham", 71040..=71167), + ("Modi", 71168..=71263), + ("Mongolian Supplement", 71264..=71295), + ("Takri", 71296..=71375), + ("Ahom", 71424..=71487), + ("Warang Citi", 71840..=71935), + ("Zanabazar Square", 72192..=72271), + ("Soyombo", 72272..=72367), + ("Pau Cin Hau", 72384..=72447), + ("Bhaiksuki", 72704..=72815), + ("Marchen", 72816..=72895), + ("Masaram Gondi", 72960..=73055), + ("Cuneiform", 73728..=74751), + ("Cuneiform Numbers and Punctuation", 74752..=74879), + ("Early Dynastic Cuneiform", 74880..=75087), + ("Egyptian Hieroglyphs", 77824..=78895), + ("Anatolian Hieroglyphs", 82944..=83583), + ("Bamum Supplement", 92160..=92735), + ("Mro", 92736..=92783), + ("Bassa Vah", 92880..=92927), + ("Pahawh Hmong", 92928..=93071), + ("Miao", 93952..=94111), + ("Ideographic Symbols and Punctuation", 94176..=94207), + ("Tangut", 94208..=100_351), + ("Tangut Components", 100_352..=101_119), + ("Kana Supplement", 110_592..=110_847), + ("Kana Extended-A", 110_848..=110_895), + ("Nushu", 110_960..=111_359), + ("Duployan", 113_664..=113_823), + ("Shorthand Format Controls", 113_824..=113_839), + ("Byzantine Musical Symbols", 118_784..=119_039), + ("Musical Symbols", 119_040..=119_295), + ("Ancient Greek Musical Notation", 119_296..=119_375), + ("Tai Xuan Jing Symbols", 119_552..=119_647), + ("Counting Rod Numerals", 119_648..=119_679), + ("Mathematical Alphanumeric Symbols", 119_808..=120_831), + ("Sutton SignWriting", 120_832..=121_519), + ("Glagolitic Supplement", 122_880..=122_927), + ("Mende Kikakui", 124_928..=125_151), + ("Adlam", 125_184..=125_279), + ("Arabic Mathematical Alphabetic Symbols", 126_464..=126_719), + ("Mahjong Tiles", 126_976..=127_023), + ("Domino Tiles", 127_024..=127_135), + ("Playing Cards", 127_136..=127_231), + ("Enclosed Alphanumeric Supplement", 127_232..=127_487), + ("Enclosed Ideographic Supplement", 127_488..=127_743), + ("Miscellaneous Symbols and Pictographs", 127_744..=128_511), + ("Emoticons range(Emoji)", 128_512..=128_591), + ("Ornamental Dingbats", 128_592..=128_639), + ("Transport and Map Symbols", 128_640..=128_767), + ("Alchemical Symbols", 128_768..=128_895), + ("Geometric Shapes Extended", 128_896..=129_023), + ("Supplemental Arrows-C", 129_024..=129_279), + ("Supplemental Symbols and Pictographs", 129_280..=129_535), + ("CJK Unified Ideographs Extension B", 131_072..=173_791), + ("CJK Unified Ideographs Extension C", 173_824..=177_983), + ("CJK Unified Ideographs Extension D", 177_984..=178_207), + ("CJK Unified Ideographs Extension E", 178_208..=183_983), + ("CJK Unified Ideographs Extension F", 183_984..=191_471), + ("CJK Compatibility Ideographs Supplement", 194_560..=195_103), + ("Tags", 917_504..=917_631), + ("Variation Selectors Supplement", 917_760..=917_999), + ] + }); - pub(crate) static ref UNICODE_SECONDARY_RANGE_KEYWORD: HashSet<&'static str> = HashSet::from_iter([ +pub(crate) static UNICODE_SECONDARY_RANGE_KEYWORD: Lazy> = Lazy::new(|| { + HashSet::from_iter([ "Extended", "Extensions", "Modifier", @@ -317,145 +324,430 @@ lazy_static! { "Supplemental", "Supplement", "Tags", - ]); + ]) +}); - pub(crate) static ref COMMON_SAFE_ASCII_CHARACTERS: &'static str = "<>=:/&;{}[],|\"-"; +pub(crate) static COMMON_SAFE_ASCII_CHARACTERS: &'static str = "<>=:/&;{}[],|\"-"; - pub(crate) static ref RE_POSSIBLE_ENCODING_INDICATION: Regex = Regex::new( - r#"(?:(?:encoding)|(?:charset)|(?:coding))(?:[\:= ]{1,10})(?:["']?)([a-zA-Z0-9\-_]+)(?:["']?)"# - ).unwrap(); +pub(crate) static RE_POSSIBLE_ENCODING_INDICATION: Lazy = Lazy::new(|| { + Regex::new( + r#"(?:(?:encoding)|(?:charset)|(?:coding))(?:[\:= ]{1,10})(?:["']?)([a-zA-Z0-9\-_]+)(?:["']?)"# +).unwrap() +}); - pub static ref IANA_SUPPORTED: Vec<&'static str> = encodings() - .iter() - .filter(|&enc| !["error", "encoder-only-utf-8", "pua-mapped-binary"].contains(&enc.name()) ) - .map(|&enc| enc.whatwg_name().unwrap_or(enc.name())) - .collect(); +pub static IANA_SUPPORTED: Lazy> = Lazy::new(|| { + encodings() + .iter() + .filter(|&enc| !["error", "encoder-only-utf-8", "pua-mapped-binary"].contains(&enc.name())) + .map(|&enc| enc.whatwg_name().unwrap_or(enc.name())) + .collect() +}); - pub static ref IANA_SUPPORTED_COUNT: usize = IANA_SUPPORTED.len(); +pub static IANA_SUPPORTED_COUNT: Lazy = Lazy::new(|| IANA_SUPPORTED.len()); - // chardet encoding names (in lowercase!) - pub static ref CHARDET_CORRESPONDENCE: HashMap<&'static str, &'static str> = HashMap::from_iter([ +// chardet encoding names (in lowercase!) +pub static CHARDET_CORRESPONDENCE: Lazy> = Lazy::new(|| { + HashMap::from_iter([ ("tis-620", "windows-874"), ("utf-16", "utf-16le"), ("maccyrillic", "x-mac-cyrillic"), ("gb2312", "gbk"), ("cp949", "euc-kr"), - ]); + ]) +}); - // aliases (labels) are from https://encoding.spec.whatwg.org/#concept-encoding-get -> as is + lowercased - pub static ref IANA_SUPPORTED_ALIASES: HashMap<&'static str, Vec<&'static str>> = HashMap::from_iter([ - ("utf-8", vec!["unicode-1-1-utf-8", "unicode11utf8", "unicode20utf8", "utf-8", "utf8", "x-unicode20utf8"]), - ("ibm866", vec!["866", "cp866", "csibm866", "ibm866"]), - ("iso-8859-2", vec!["csisolatin2", "iso-8859-2", "iso-ir-101", "iso8859-2", "iso88592", "iso_8859-2", "iso_8859-2:1987", "l2", "latin2"]), - ("iso-8859-3", vec!["csisolatin3", "iso-8859-3", "iso-ir-109", "iso8859-3", "iso88593", "iso_8859-3", "iso_8859-3:1988", "l3", "latin3"]), - ("iso-8859-4", vec!["csisolatin4", "iso-8859-4", "iso-ir-110", "iso8859-4", "iso88594", "iso_8859-4", "iso_8859-4:1988", "l4", "latin4"]), - ("iso-8859-5", vec!["csisolatincyrillic", "cyrillic", "iso-8859-5", "iso-ir-144", "iso8859-5", "iso88595", "iso_8859-5", "iso_8859-5:1988"]), - ("iso-8859-6", vec!["arabic", "asmo-708", "csiso88596e", "csiso88596i", "csisolatinarabic", "ecma-114", "iso-8859-6", "iso-8859-6-e", "iso-8859-6-i", "iso-ir-127", "iso8859-6", "iso88596", "iso_8859-6", "iso_8859-6:1987"]), - ("iso-8859-7", vec!["csisolatingreek", "ecma-118", "elot_928", "greek", "greek8", "iso-8859-7", "iso-ir-126", "iso8859-7", "iso88597", "iso_8859-7", "iso_8859-7:1987", "sun_eu_greek"]), - ("iso-8859-8", vec!["csiso88598e", "csisolatinhebrew", "hebrew", "iso-8859-8", "iso-8859-8-e", "iso-ir-138", "iso8859-8", "iso88598", "iso_8859-8", "iso_8859-8:1988", "visual"]), - ("iso-8859-8-i", vec!["csiso88598i", "iso-8859-8-i", "logical"]), - ("iso-8859-10", vec!["csisolatin6", "iso-8859-10", "iso-ir-157", "iso8859-10", "iso885910", "l6", "latin6"]), - ("iso-8859-13", vec!["iso-8859-13", "iso8859-13", "iso885913"]), - ("iso-8859-14", vec!["iso-8859-14", "iso8859-14", "iso885914"]), - ("iso-8859-15", vec!["csisolatin9", "iso-8859-15", "iso8859-15", "iso885915", "iso_8859-15", "l9"]), - ("iso-8859-16", vec!["iso-8859-16"]), - ("koi8-r", vec!["cskoi8r", "koi", "koi8", "koi8-r", "koi8_r"]), - ("koi8-u", vec!["koi8-ru", "koi8-u"]), - ("macintosh", vec!["csmacintosh", "mac", "macintosh", "x-mac-roman"]), - ("windows-874", vec!["dos-874", "iso-8859-11", "iso8859-11", "iso885911", "tis-620", "windows-874"]), - ("windows-1250", vec!["cp1250", "windows-1250", "x-cp1250"]), - ("windows-1251", vec!["cp1251", "windows-1251", "x-cp1251"]), - ("windows-1252", vec!["ansi_x3.4-1968", "ascii", "cp1252", "cp819", "csisolatin1", "ibm819", "iso-8859-1", "iso-ir-100", "iso8859-1", "iso88591", "iso_8859-1", "iso_8859-1:1987", "l1", "latin1", "us-ascii", "windows-1252", "x-cp1252"]), - ("windows-1253", vec!["cp1253", "windows-1253", "x-cp1253"]), - ("windows-1254", vec!["cp1254", "csisolatin5", "iso-8859-9", "iso-ir-148", "iso8859-9", "iso88599", "iso_8859-9", "iso_8859-9:1989", "l5", "latin5", "windows-1254", "x-cp1254"]), - ("windows-1255", vec!["cp1255", "windows-1255", "x-cp1255"]), - ("windows-1256", vec!["cp1256", "windows-1256", "x-cp1256"]), - ("windows-1257", vec!["cp1257", "windows-1257", "x-cp1257"]), - ("windows-1258", vec!["cp1258", "windows-1258", "x-cp1258"]), - ("x-mac-cyrillic", vec!["x-mac-cyrillic", "x-mac-ukrainian"]), - ("gbk", vec!["chinese", "csgb2312", "csiso58gb231280", "gb2312", "gb_2312", "gb_2312-80", "gbk", "iso-ir-58", "x-gbk"]), - ("gb18030", vec!["gb18030"]), - ("big5", vec!["big5", "big5-hkscs", "cn-big5", "csbig5", "x-x-big5"]), - ("euc-jp", vec!["cseucpkdfmtjapanese", "euc-jp", "x-euc-jp"]), - ("iso-2022-jp", vec!["csiso2022jp", "iso-2022-jp"]), - ("shift_jis", vec!["csshiftjis", "ms932", "ms_kanji", "shift-jis", "shift_jis", "sjis", "windows-31j", "x-sjis"]), - ("euc-kr", vec!["cseuckr", "csksc56011987", "euc-kr", "iso-ir-149", "korean", "ks_c_5601-1987", "ks_c_5601-1989", "ksc5601", "ksc_5601", "windows-949"]), - ("replacement", vec!["csiso2022kr", "hz-gb-2312", "iso-2022-cn", "iso-2022-cn-ext", "iso-2022-kr", "replacement"]), - ("utf-16be", vec!["unicodefffe", "utf-16be"]), - ("utf-16le", vec!["csunicode", "iso-10646-ucs-2", "ucs-2", "unicode", "unicodefeff", "utf-16", "utf-16le"]), - ("x-user-defined", vec!["x-user-defined"]), - ]); +// aliases (labels) are from https://encoding.spec.whatwg.org/#concept-encoding-get -> as is + lowercased +pub static IANA_SUPPORTED_ALIASES: Lazy>> = + Lazy::new(|| { + HashMap::from_iter([ + ( + "utf-8", + vec![ + "unicode-1-1-utf-8", + "unicode11utf8", + "unicode20utf8", + "utf-8", + "utf8", + "x-unicode20utf8", + ], + ), + ("ibm866", vec!["866", "cp866", "csibm866", "ibm866"]), + ( + "iso-8859-2", + vec![ + "csisolatin2", + "iso-8859-2", + "iso-ir-101", + "iso8859-2", + "iso88592", + "iso_8859-2", + "iso_8859-2:1987", + "l2", + "latin2", + ], + ), + ( + "iso-8859-3", + vec![ + "csisolatin3", + "iso-8859-3", + "iso-ir-109", + "iso8859-3", + "iso88593", + "iso_8859-3", + "iso_8859-3:1988", + "l3", + "latin3", + ], + ), + ( + "iso-8859-4", + vec![ + "csisolatin4", + "iso-8859-4", + "iso-ir-110", + "iso8859-4", + "iso88594", + "iso_8859-4", + "iso_8859-4:1988", + "l4", + "latin4", + ], + ), + ( + "iso-8859-5", + vec![ + "csisolatincyrillic", + "cyrillic", + "iso-8859-5", + "iso-ir-144", + "iso8859-5", + "iso88595", + "iso_8859-5", + "iso_8859-5:1988", + ], + ), + ( + "iso-8859-6", + vec![ + "arabic", + "asmo-708", + "csiso88596e", + "csiso88596i", + "csisolatinarabic", + "ecma-114", + "iso-8859-6", + "iso-8859-6-e", + "iso-8859-6-i", + "iso-ir-127", + "iso8859-6", + "iso88596", + "iso_8859-6", + "iso_8859-6:1987", + ], + ), + ( + "iso-8859-7", + vec![ + "csisolatingreek", + "ecma-118", + "elot_928", + "greek", + "greek8", + "iso-8859-7", + "iso-ir-126", + "iso8859-7", + "iso88597", + "iso_8859-7", + "iso_8859-7:1987", + "sun_eu_greek", + ], + ), + ( + "iso-8859-8", + vec![ + "csiso88598e", + "csisolatinhebrew", + "hebrew", + "iso-8859-8", + "iso-8859-8-e", + "iso-ir-138", + "iso8859-8", + "iso88598", + "iso_8859-8", + "iso_8859-8:1988", + "visual", + ], + ), + ( + "iso-8859-8-i", + vec!["csiso88598i", "iso-8859-8-i", "logical"], + ), + ( + "iso-8859-10", + vec![ + "csisolatin6", + "iso-8859-10", + "iso-ir-157", + "iso8859-10", + "iso885910", + "l6", + "latin6", + ], + ), + ( + "iso-8859-13", + vec!["iso-8859-13", "iso8859-13", "iso885913"], + ), + ( + "iso-8859-14", + vec!["iso-8859-14", "iso8859-14", "iso885914"], + ), + ( + "iso-8859-15", + vec![ + "csisolatin9", + "iso-8859-15", + "iso8859-15", + "iso885915", + "iso_8859-15", + "l9", + ], + ), + ("iso-8859-16", vec!["iso-8859-16"]), + ("koi8-r", vec!["cskoi8r", "koi", "koi8", "koi8-r", "koi8_r"]), + ("koi8-u", vec!["koi8-ru", "koi8-u"]), + ( + "macintosh", + vec!["csmacintosh", "mac", "macintosh", "x-mac-roman"], + ), + ( + "windows-874", + vec![ + "dos-874", + "iso-8859-11", + "iso8859-11", + "iso885911", + "tis-620", + "windows-874", + ], + ), + ("windows-1250", vec!["cp1250", "windows-1250", "x-cp1250"]), + ("windows-1251", vec!["cp1251", "windows-1251", "x-cp1251"]), + ( + "windows-1252", + vec![ + "ansi_x3.4-1968", + "ascii", + "cp1252", + "cp819", + "csisolatin1", + "ibm819", + "iso-8859-1", + "iso-ir-100", + "iso8859-1", + "iso88591", + "iso_8859-1", + "iso_8859-1:1987", + "l1", + "latin1", + "us-ascii", + "windows-1252", + "x-cp1252", + ], + ), + ("windows-1253", vec!["cp1253", "windows-1253", "x-cp1253"]), + ( + "windows-1254", + vec![ + "cp1254", + "csisolatin5", + "iso-8859-9", + "iso-ir-148", + "iso8859-9", + "iso88599", + "iso_8859-9", + "iso_8859-9:1989", + "l5", + "latin5", + "windows-1254", + "x-cp1254", + ], + ), + ("windows-1255", vec!["cp1255", "windows-1255", "x-cp1255"]), + ("windows-1256", vec!["cp1256", "windows-1256", "x-cp1256"]), + ("windows-1257", vec!["cp1257", "windows-1257", "x-cp1257"]), + ("windows-1258", vec!["cp1258", "windows-1258", "x-cp1258"]), + ("x-mac-cyrillic", vec!["x-mac-cyrillic", "x-mac-ukrainian"]), + ( + "gbk", + vec![ + "chinese", + "csgb2312", + "csiso58gb231280", + "gb2312", + "gb_2312", + "gb_2312-80", + "gbk", + "iso-ir-58", + "x-gbk", + ], + ), + ("gb18030", vec!["gb18030"]), + ( + "big5", + vec!["big5", "big5-hkscs", "cn-big5", "csbig5", "x-x-big5"], + ), + ("euc-jp", vec!["cseucpkdfmtjapanese", "euc-jp", "x-euc-jp"]), + ("iso-2022-jp", vec!["csiso2022jp", "iso-2022-jp"]), + ( + "shift_jis", + vec![ + "csshiftjis", + "ms932", + "ms_kanji", + "shift-jis", + "shift_jis", + "sjis", + "windows-31j", + "x-sjis", + ], + ), + ( + "euc-kr", + vec![ + "cseuckr", + "csksc56011987", + "euc-kr", + "iso-ir-149", + "korean", + "ks_c_5601-1987", + "ks_c_5601-1989", + "ksc5601", + "ksc_5601", + "windows-949", + ], + ), + ( + "replacement", + vec![ + "csiso2022kr", + "hz-gb-2312", + "iso-2022-cn", + "iso-2022-cn-ext", + "iso-2022-kr", + "replacement", + ], + ), + ("utf-16be", vec!["unicodefffe", "utf-16be"]), + ( + "utf-16le", + vec![ + "csunicode", + "iso-10646-ucs-2", + "ucs-2", + "unicode", + "unicodefeff", + "utf-16", + "utf-16le", + ], + ), + ("x-user-defined", vec!["x-user-defined"]), + ]) + }); - pub static ref IANA_SUPPORTED_SIMILAR: HashMap<&'static str, Vec<&'static str>> = HashMap::from_iter([ - ("windows-1252", vec!["iso-8859-15", "windows-1254"]), - ("windows-1253", vec!["iso-8859-7"]), - ("windows-1254", vec!["iso-8859-15", "windows-1252"]), - ("windows-1257", vec!["iso-8859-13"]), - ("iso-8859-10", vec!["iso-8859-14", "iso-8859-15", "iso-8859-4", "windows-1254", "windows-1252"]), - ("iso-8859-13", vec!["windows-1257"]), - ("iso-8859-14", vec![ - "iso-8859-10", - "iso-8859-15", - "iso-8859-16", - "iso-8859-3", - "windows-1254", - "windows-1252", - ]), - ("iso-8859-15", vec![ - "windows-1252", - "windows-1254", - "iso-8859-10", - "iso-8859-14", - "iso-8859-16", - "iso-8859-3", - ]), - ("iso-8859-16", vec![ - "iso-8859-14", - "iso-8859-15", - "iso-8859-2", - "iso-8859-3", - "windows-1254", - "windows-1252", - ]), - ("iso-8859-2", vec![ - "iso-8859-16", - "iso-8859-4", - ]), - ("iso-8859-3", vec![ - "iso-8859-14", - "iso-8859-15", - "iso-8859-16", - "windows-1254", - "windows-1252", - ]), - ("iso-8859-4", vec![ - "iso-8859-10", - "iso-8859-2", - "windows-1254", - "windows-1252", - ]), - ("iso-8859-7", vec![ - "windows-1253" - ]), - ("windows-1254", vec![ - "windows-1252", - "windows-1258", - "iso-8859-10", - "iso-8859-14", - "iso-8859-15", - "iso-8859-16", - "iso-8859-3", - "iso-8859-4", - ]), - ("windows-1252", vec![ - "windows-1254", - "windows-1258", - "iso-8859-10", - "iso-8859-14", - "iso-8859-15", - "iso-8859-16", - "iso-8859-3", - "iso-8859-4", - ]), - ]); -} +pub static IANA_SUPPORTED_SIMILAR: Lazy>> = + Lazy::new(|| { + HashMap::from_iter([ + ("windows-1252", vec!["iso-8859-15", "windows-1254"]), + ("windows-1253", vec!["iso-8859-7"]), + ("windows-1254", vec!["iso-8859-15", "windows-1252"]), + ("windows-1257", vec!["iso-8859-13"]), + ( + "iso-8859-10", + vec![ + "iso-8859-14", + "iso-8859-15", + "iso-8859-4", + "windows-1254", + "windows-1252", + ], + ), + ("iso-8859-13", vec!["windows-1257"]), + ( + "iso-8859-14", + vec![ + "iso-8859-10", + "iso-8859-15", + "iso-8859-16", + "iso-8859-3", + "windows-1254", + "windows-1252", + ], + ), + ( + "iso-8859-15", + vec![ + "windows-1252", + "windows-1254", + "iso-8859-10", + "iso-8859-14", + "iso-8859-16", + "iso-8859-3", + ], + ), + ( + "iso-8859-16", + vec![ + "iso-8859-14", + "iso-8859-15", + "iso-8859-2", + "iso-8859-3", + "windows-1254", + "windows-1252", + ], + ), + ("iso-8859-2", vec!["iso-8859-16", "iso-8859-4"]), + ( + "iso-8859-3", + vec![ + "iso-8859-14", + "iso-8859-15", + "iso-8859-16", + "windows-1254", + "windows-1252", + ], + ), + ( + "iso-8859-4", + vec!["iso-8859-10", "iso-8859-2", "windows-1254", "windows-1252"], + ), + ("iso-8859-7", vec!["windows-1253"]), + ( + "windows-1254", + vec![ + "windows-1252", + "windows-1258", + "iso-8859-10", + "iso-8859-14", + "iso-8859-15", + "iso-8859-16", + "iso-8859-3", + "iso-8859-4", + ], + ), + ( + "windows-1252", + vec![ + "windows-1254", + "windows-1258", + "iso-8859-10", + "iso-8859-14", + "iso-8859-15", + "iso-8859-16", + "iso-8859-3", + "iso-8859-4", + ], + ), + ]) + }); diff --git a/src/entity.rs b/src/entity.rs index f41b6de..252c41d 100644 --- a/src/entity.rs +++ b/src/entity.rs @@ -321,7 +321,7 @@ impl CharsetMatches { pub fn append(&mut self, item: CharsetMatch) { // We should disable the submatch factoring when the input file is too heavy // (conserve RAM usage) - if item.payload.len() <= *TOO_BIG_SEQUENCE { + if item.payload.len() <= TOO_BIG_SEQUENCE { for m in &mut self.items { if m.decoded_payload() == item.decoded_payload() && (m.mean_mess_ratio - item.mean_mess_ratio).abs() < f32::EPSILON diff --git a/src/lib.rs b/src/lib.rs index 45c52c0..2eb99e5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -224,7 +224,7 @@ pub fn from_bytes(bytes: &[u8], settings: Option) -> Charset } // too small length - if bytes_length < *TOO_SMALL_SEQUENCE { + if bytes_length < TOO_SMALL_SEQUENCE { trace!( "Trying to detect encoding from a tiny portion of ({}) byte(s).", bytes_length @@ -232,7 +232,7 @@ pub fn from_bytes(bytes: &[u8], settings: Option) -> Charset } // too big length - let is_too_large_sequence = bytes_length > *TOO_BIG_SEQUENCE; + let is_too_large_sequence = bytes_length > TOO_BIG_SEQUENCE; if is_too_large_sequence { trace!( "Using lazy str decoding because the payload is quite large, ({}) byte(s).", @@ -318,7 +318,7 @@ pub fn from_bytes(bytes: &[u8], settings: Option) -> Charset false => 0, }; let end_idx = match is_too_large_sequence && !is_multi_byte_decoder { - true => *MAX_PROCESSED_BYTES, + true => MAX_PROCESSED_BYTES, false => bytes_length, }; let decoded_payload: Option = if let Ok(payload) = decode( @@ -432,7 +432,7 @@ pub fn from_bytes(bytes: &[u8], settings: Option) -> Charset // Only if initial MD tests passes if !lazy_str_hard_failure && is_too_large_sequence && !is_multi_byte_decoder { let decoded_chunk_result = decode( - &bytes[*MAX_PROCESSED_BYTES..], + &bytes[MAX_PROCESSED_BYTES..], encoding_iana, DecoderTrap::Strict, false, diff --git a/src/md/structs.rs b/src/md/structs.rs index d1afe7b..ea6a3fa 100644 --- a/src/md/structs.rs +++ b/src/md/structs.rs @@ -66,7 +66,7 @@ impl MessDetectorChar { #[cached( type = "UnboundCache", - create = "{ UnboundCache::with_capacity(*UTF8_MAXIMAL_ALLOCATION) }", + create = "{ UnboundCache::with_capacity(UTF8_MAXIMAL_ALLOCATION) }", convert = r#"{ character }"# )] fn new_mess_detector_character(character: char) -> MessDetectorChar { diff --git a/src/tests/detection_large_payload.rs b/src/tests/detection_large_payload.rs index 9c46eb9..6f0c7f0 100644 --- a/src/tests/detection_large_payload.rs +++ b/src/tests/detection_large_payload.rs @@ -4,7 +4,7 @@ use crate::from_bytes; #[test] fn test_large_payload_utf8_sig_basic_entry() { let mut payload = b"\xef\xbb\xbf".as_slice().to_vec(); - payload.extend(b"0".repeat(*TOO_BIG_SEQUENCE + 1).as_slice().to_vec()); + payload.extend(b"0".repeat(TOO_BIG_SEQUENCE + 1).as_slice().to_vec()); let result = from_bytes(&payload, None); let best_guess = result.get_best(); @@ -27,7 +27,7 @@ fn test_large_payload_utf8_sig_basic_entry() { #[test] fn test_large_payload_ascii_sig_basic_entry() { - let payload = b"0".repeat(*TOO_BIG_SEQUENCE + 1).as_slice().to_vec(); + let payload = b"0".repeat(TOO_BIG_SEQUENCE + 1).as_slice().to_vec(); let result = from_bytes(&payload, None); let best_guess = result.get_best(); @@ -54,7 +54,7 @@ fn test_large_payload_ascii_sig_basic_entry() { #[test] fn test_misleading_large_sequence() { let mut payload = b"hello simple ascii " - .repeat(*TOO_BIG_SEQUENCE) + .repeat(TOO_BIG_SEQUENCE) .as_slice() .to_vec(); payload.extend("我没有埋怨,磋砣的只是一些时间。 磋砣的只是一些时间。".as_bytes()); diff --git a/src/tests/mod.rs b/src/tests/mod.rs index df505ec..b630bd8 100644 --- a/src/tests/mod.rs +++ b/src/tests/mod.rs @@ -1,6 +1,6 @@ #![cfg(test)] use crate::entity::Language; -use lazy_static::lazy_static; +use once_cell::sync::Lazy; mod cd; mod detection_base; mod detection_edge_case; @@ -10,57 +10,58 @@ mod entity; mod md; mod utils; -lazy_static! { - pub static ref FILES_SAMPLES: Vec<(&'static str, Vec<&'static str>, &'static Language)> = vec![ - ( - "sample-turkish.txt", - vec!["windows-1254"], - &Language::Turkish - ), - ("sample-chinese.txt", vec!["big5"], &Language::Chinese), - ( - "sample-french-1.txt", - vec!["iso-8859-1", "windows-1252"], - &Language::French - ), - ( - "sample-arabic-1.txt", - vec!["windows-1256"], - &Language::Arabic - ), - ("sample-arabic.txt", vec!["utf-8"], &Language::Arabic), - ( - "sample-greek.txt", - vec!["windows-1253", "iso-8859-7"], - &Language::Greek - ), - ("sample-french.txt", vec!["utf-8"], &Language::French), - ("sample-russian-3.txt", vec!["utf-8"], &Language::Russian), - ( - "sample-greek-2.txt", - vec!["windows-1253", "iso-8859-7"], - &Language::Greek - ), - ( - "sample-hebrew-2.txt", - vec!["windows-1255", "iso-8859-8"], - &Language::Hebrew - ), - ( - "sample-hebrew-3.txt", - vec!["windows-1255", "iso-8859-8"], - &Language::Hebrew - ), - ("sample-bulgarian.txt", vec!["utf-8"], &Language::Bulgarian), - ("sample-english.bom.txt", vec!["utf-8"], &Language::English), - ("sample-spanish.txt", vec!["utf-8"], &Language::Spanish), - ("sample-korean.txt", vec!["euc-kr"], &Language::Korean), - ("sample-russian-2.txt", vec!["utf-8"], &Language::Russian), - ( - "sample-russian.txt", - vec!["x-mac-cyrillic"], - &Language::Russian - ), - ("sample-polish.txt", vec!["utf-8"], &Language::Polish), - ]; -} +pub static FILES_SAMPLES: Lazy, &'static Language)>> = + Lazy::new(|| { + vec![ + ( + "sample-turkish.txt", + vec!["windows-1254"], + &Language::Turkish, + ), + ("sample-chinese.txt", vec!["big5"], &Language::Chinese), + ( + "sample-french-1.txt", + vec!["iso-8859-1", "windows-1252"], + &Language::French, + ), + ( + "sample-arabic-1.txt", + vec!["windows-1256"], + &Language::Arabic, + ), + ("sample-arabic.txt", vec!["utf-8"], &Language::Arabic), + ( + "sample-greek.txt", + vec!["windows-1253", "iso-8859-7"], + &Language::Greek, + ), + ("sample-french.txt", vec!["utf-8"], &Language::French), + ("sample-russian-3.txt", vec!["utf-8"], &Language::Russian), + ( + "sample-greek-2.txt", + vec!["windows-1253", "iso-8859-7"], + &Language::Greek, + ), + ( + "sample-hebrew-2.txt", + vec!["windows-1255", "iso-8859-8"], + &Language::Hebrew, + ), + ( + "sample-hebrew-3.txt", + vec!["windows-1255", "iso-8859-8"], + &Language::Hebrew, + ), + ("sample-bulgarian.txt", vec!["utf-8"], &Language::Bulgarian), + ("sample-english.bom.txt", vec!["utf-8"], &Language::English), + ("sample-spanish.txt", vec!["utf-8"], &Language::Spanish), + ("sample-korean.txt", vec!["euc-kr"], &Language::Korean), + ("sample-russian-2.txt", vec!["utf-8"], &Language::Russian), + ( + "sample-russian.txt", + vec!["x-mac-cyrillic"], + &Language::Russian, + ), + ("sample-polish.txt", vec!["utf-8"], &Language::Polish), + ] + }); From 8950d77c4dba6f4a96524e2647d67b292b4a6c5a Mon Sep 17 00:00:00 2001 From: chris-ha458 Date: Thu, 23 Nov 2023 18:24:42 +0900 Subject: [PATCH 2/2] reorder statics and lazy's --- src/consts.rs | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/consts.rs b/src/consts.rs index 9ca001c..6d56a65 100644 --- a/src/consts.rs +++ b/src/consts.rs @@ -4,6 +4,12 @@ use encoding::all::encodings; use once_cell::sync::Lazy; use regex::Regex; +pub static TOO_BIG_SEQUENCE: usize = 1_000_000; // 10E6 +pub(crate) static MAX_PROCESSED_BYTES: usize = 500_000; +pub(crate) static TOO_SMALL_SEQUENCE: usize = 32; +pub(crate) static UTF8_MAXIMAL_ALLOCATION: usize = 1_112_064; +pub(crate) static COMMON_SAFE_ASCII_CHARACTERS: &'static str = "<>=:/&;{}[],|\"-"; + // Contain for each eligible encoding a list of/item bytes SIG/BOM pub(crate) static ENCODING_MARKS: Lazy> = Lazy::new(|| { HashMap::from_iter([ @@ -14,11 +20,6 @@ pub(crate) static ENCODING_MARKS: Lazy> = L ]) }); -pub static TOO_BIG_SEQUENCE: usize = 1_000_000; // 10E6 -pub(crate) static MAX_PROCESSED_BYTES: usize = 500_000; -pub(crate) static TOO_SMALL_SEQUENCE: usize = 32; -pub(crate) static UTF8_MAXIMAL_ALLOCATION: usize = 1_112_064; - pub(crate) static UNICODE_RANGES_COMBINED: Lazy<[(&'static str, RangeInclusive); 279]> = Lazy::new(|| { [ @@ -327,8 +328,6 @@ pub(crate) static UNICODE_SECONDARY_RANGE_KEYWORD: Lazy> = ]) }); -pub(crate) static COMMON_SAFE_ASCII_CHARACTERS: &'static str = "<>=:/&;{}[],|\"-"; - pub(crate) static RE_POSSIBLE_ENCODING_INDICATION: Lazy = Lazy::new(|| { Regex::new( r#"(?:(?:encoding)|(?:charset)|(?:coding))(?:[\:= ]{1,10})(?:["']?)([a-zA-Z0-9\-_]+)(?:["']?)"#