Skip to content

Commit

Permalink
Merge pull request #35 from chris-ha458/refactors
Browse files: browse the repository at this point in the history
Refactors
  • Loading branch information
nickspring authored Nov 20, 2023
2 parents 7086072 + ecae498 commit eb0f58a
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 62 deletions.
54 changes: 23 additions & 31 deletions src/lib.rs
Original file line number | Diff line number | Diff line change
Expand Up @@ -313,33 +313,29 @@ pub fn from_bytes(bytes: &[u8], settings: Option<NormalizerSettings>) -> Charset
}

// fast pre-check
let start_idx = if bom_or_sig_available {
sig_payload.unwrap().len()
} else {
0
let start_idx = match bom_or_sig_available {
true => sig_payload.unwrap().len(),
false => 0,
};
let end_idx = if is_too_large_sequence && !is_multi_byte_decoder {
*MAX_PROCESSED_BYTES
} else {
bytes_length
let end_idx = match is_too_large_sequence && !is_multi_byte_decoder {
true => *MAX_PROCESSED_BYTES,
false => bytes_length,
};
let decoded_payload: Option<String> = match decode(
let decoded_payload: Option<String> = if let Ok(payload) = decode(
&bytes[start_idx..end_idx],
encoding_iana,
DecoderTrap::Strict,
is_too_large_sequence && !is_multi_byte_decoder,
false,
) {
Ok(payload) if !is_too_large_sequence || is_multi_byte_decoder => Some(payload),
Ok(_) => None,
Err(_) => {
trace!(
"Code page {} does not fit given bytes sequence at ALL.",
encoding_iana,
);
tested_but_hard_failure.push(encoding_iana);
continue 'iana_encodings_loop;
}
(!is_too_large_sequence || is_multi_byte_decoder).then_some(payload)
} else {
trace!(
"Code page {} does not fit given bytes sequence at ALL.",
encoding_iana,
);
tested_but_hard_failure.push(encoding_iana);
continue 'iana_encodings_loop;
};

// soft failed pre-check
Expand Down Expand Up @@ -379,7 +375,7 @@ pub fn from_bytes(bytes: &[u8], settings: Option<NormalizerSettings>) -> Charset
None => bytes_length,
};
let starting_offset = match (bom_or_sig_available, &decoded_payload) {
(true, None) => sig_payload.as_ref().unwrap().len(),
(true, None) => start_idx,
_ => 0,
};
let offsets = (starting_offset..seq_len).step_by((seq_len / settings.steps).max(1));
Expand All @@ -396,17 +392,13 @@ pub fn from_bytes(bytes: &[u8], settings: Option<NormalizerSettings>) -> Charset
.take(settings.chunk_size)
.collect()),
// Bytes processing
None => {
let offset_end = (offset + settings.chunk_size).min(seq_len);
let cut_bytes_vec: &[u8] = &bytes[offset..offset_end];
decode(
cut_bytes_vec,
encoding_iana,
DecoderTrap::Strict,
false,
false,
)
}
None => decode(
&bytes[offset..(offset + settings.chunk_size).min(seq_len)],
encoding_iana,
DecoderTrap::Strict,
false,
false,
),
};

if is_invalid_chunk(&decoded_chunk_result, encoding_iana) {
Expand Down
41 changes: 10 additions & 31 deletions src/md.rs
Original file line number | Diff line number | Diff line change
Expand Up @@ -34,9 +34,8 @@ pub(crate) fn mess_ratio(
Box::<ArchaicUpperLowerPlugin>::default(),
];

let length = decoded_sequence.chars().count();
let mut mean_mess_ratio: Option<f32> = None;
let early_calc_period: usize = match length {
let early_calc_period: usize = match decoded_sequence.chars().count() {
..=510 => 32,
511..=1023 => 64,
_ => 128,
Expand All @@ -53,7 +52,7 @@ pub(crate) fn mess_ratio(
.filter(|detector| detector.eligible(&mess_char))
.for_each(|detector| detector.feed(&mess_char));

if index.rem_euclid(early_calc_period) == early_calc_period - 1 {
if index % early_calc_period == early_calc_period - 1 {
let early_mess_ratio: f32 = detectors.iter().map(|x| x.ratio()).sum();
if early_mess_ratio >= maximum_threshold {
mean_mess_ratio = Some(early_mess_ratio);
Expand All @@ -65,38 +64,18 @@ pub(crate) fn mess_ratio(

if log_enabled!(log::Level::Trace) {
trace!(
"Mess-detector extended-analysis start: \
early_calc_period={}, \
mean_mess_ratio={}, \
maximum_threshold={}",
"Mess-detector extended-analysis start: early_calc_period={}, mean_mess_ratio={}, maximum_threshold={} \
{}",
early_calc_period,
return_ratio,
maximum_threshold,
detectors
.iter()
.filter(|d| d.ratio() > 0.0)
.map(|d| format!("{} produces ratio: {}", d.name(), d.ratio()))
.collect::<Vec<String>>()
.join("===")
);

/*if decoded_sequence.len() > 16 {
trace!(
"Chunk: {} ..... {}",
&decoded_sequence[..decoded_sequence
.char_indices()
.nth(16)
.map(|(i, _)| i)
.unwrap_or(decoded_sequence.chars().count())],
&decoded_sequence[decoded_sequence
.char_indices()
.nth(decoded_sequence.chars().count() - 16)
.map(|(i, _)| i)
.unwrap_or(decoded_sequence.chars().count())..],
);
}
*/

for detector in &detectors {
if detector.ratio() > 0.0 {
trace!("{} produces ratio: {}", detector.name(), detector.ratio());
}
}
trace!("===");
}

return_ratio
Expand Down

0 comments on commit eb0f58a

Please sign in to comment.