Skip to content

Commit

Permalink
Fix possible crash in HTML CSS image extraction
Browse files Browse the repository at this point in the history
When processing UTF-8 HTML code, the image extraction logic may panic if
the string contains a multi-byte grapheme that includes a '(', ')',
whitespace, or one of the other characters used to split the text when
searching for the base64 image content.

The panic is because the `split_at()` method will panic if you try to
split in the middle of a unicode grapheme.

This commit fixes the issue by processing the HTML string one grapheme
at a time instead of one character (byte) at a time.
The `grapheme_indices()` method is used to get the correct position of
the start of each grapheme for splitting the string.
  • Loading branch information
micahsnyder committed Apr 28, 2023
1 parent 7a6fe78 commit 2a21451
Show file tree
Hide file tree
Showing 4 changed files with 36 additions and 27 deletions.
7 changes: 7 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions libclamav_rust/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ transpose = "0.2"
num-traits = "0.2"
base64 = "0.21.0"
sha1 = "0.10.5"
unicode-segmentation = "1.10.1"

[lib]
crate-type = ["staticlib"]
Expand Down
2 changes: 1 addition & 1 deletion libclamav_rust/src/cdiff.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1032,7 +1032,7 @@ fn process_line(ctx: &mut Context, line: &[u8]) -> Result<(), InputError> {
cmd_unlink(ctx, unlink_op)
}
_ => Err(InputError::UnknownCommand(
String::from_utf8_lossy(&cmd).to_string(),
String::from_utf8_lossy(cmd).to_string(),
)),
}
}
Expand Down
53 changes: 27 additions & 26 deletions libclamav_rust/src/css_image_extract.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ use std::{ffi::CStr, mem::ManuallyDrop, os::raw::c_char};
use base64::{engine::general_purpose as base64_engine_standard, Engine as _};
use log::{debug, error, warn};
use thiserror::Error;
use unicode_segmentation::UnicodeSegmentation;

use crate::sys;

Expand Down Expand Up @@ -73,12 +74,12 @@ impl<'a> CssImageExtractor<'a> {
};

// Skip whitespace until we find '('
for (pos, c) in self.remaining.chars().enumerate() {
if c == '(' {
for (pos, c) in self.remaining.grapheme_indices(true) {
if c == "(" {
// Found left-paren.
(_, self.remaining) = self.remaining.split_at(pos + 1);
break;
} else if char::is_whitespace(c) {
} else if c.contains(char::is_whitespace) {
// Skipping whitespace.
continue;
} else {
Expand All @@ -90,11 +91,11 @@ impl<'a> CssImageExtractor<'a> {
// Find closing ')'
let mut depth = 1;
let mut url_parameter: Option<&str> = None;
for (pos, c) in self.remaining.chars().enumerate() {
if c == '(' {
for (pos, c) in self.remaining.grapheme_indices(true) {
if c == "(" {
// Found nested left-paren.
depth += 1;
} else if c == ')' {
} else if c == ")" {
if depth > 1 {
// Found nested right-paren.
depth -= 1;
Expand All @@ -121,8 +122,8 @@ impl<'a> CssImageExtractor<'a> {
// Strip optional whitespace and quotes from front and back.

// Trim off whitespace at beginning
for (pos, c) in url_parameter.chars().enumerate() {
if char::is_whitespace(c) {
for (pos, c) in url_parameter.grapheme_indices(true) {
if c.contains(char::is_whitespace) {
// Skipping whitespace before url contents.
continue;
} else {
Expand All @@ -132,8 +133,8 @@ impl<'a> CssImageExtractor<'a> {
}

// Trim off whitespace at end
for (pos, c) in url_parameter.chars().rev().enumerate() {
if char::is_whitespace(c) {
for (pos, c) in url_parameter.graphemes(true).rev().enumerate() {
if c.contains(char::is_whitespace) {
// Skipping whitespace after url contents.
continue;
} else {
Expand All @@ -143,24 +144,24 @@ impl<'a> CssImageExtractor<'a> {
}

// Trim off " at beginning.
let c = url_parameter.chars().next();
let c = url_parameter.graphemes(true).next();
if let Some(c) = c {
if c == '"' {
if c == "\"" {
(_, url_parameter) = url_parameter.split_at(1);
}
};

// Trim off " at end.
let c = url_parameter.chars().rev().next();
let c = url_parameter.graphemes(true).rev().next();
if let Some(c) = c {
if c == '"' {
if c == "\"" {
(url_parameter, _) = url_parameter.split_at(url_parameter.len() - 1);
}
};

// Trim off whitespace at beginning.
for (pos, c) in url_parameter.chars().enumerate() {
if char::is_whitespace(c) {
for (pos, c) in url_parameter.grapheme_indices(true) {
if c.contains(char::is_whitespace) {
// Skipping whitespace before url contents.
continue;
} else {
Expand All @@ -170,8 +171,8 @@ impl<'a> CssImageExtractor<'a> {
}

// Trim off whitespace at end.
for (pos, c) in url_parameter.chars().rev().enumerate() {
if char::is_whitespace(c) {
for (pos, c) in url_parameter.graphemes(true).rev().enumerate() {
if c.contains(char::is_whitespace) {
// Skipping whitespace after url contents.
continue;
} else {
Expand Down Expand Up @@ -203,12 +204,12 @@ impl<'a> CssImageExtractor<'a> {
};

// Skip whitespace until we find a 'b' (starting "base64")
for (pos, c) in url_parameter.chars().enumerate() {
if c == 'b' {
for (pos, c) in url_parameter.grapheme_indices(true) {
if c == "b" {
// Found 'b'.
(_, url_parameter) = url_parameter.split_at(pos + 1);
break;
} else if char::is_whitespace(c) {
} else if c.contains(char::is_whitespace) {
// Skipping whitespace.
continue;
} else {
Expand All @@ -227,12 +228,12 @@ impl<'a> CssImageExtractor<'a> {
(_, url_parameter) = url_parameter.split_at("ase64".len());

// Skip whitespace until we find ','
for (pos, c) in url_parameter.chars().enumerate() {
if c == ',' {
for (pos, c) in url_parameter.grapheme_indices(true) {
if c == "," {
// Found ','.
(_, url_parameter) = url_parameter.split_at(pos + 1);
break;
} else if char::is_whitespace(c) {
} else if c.contains(char::is_whitespace) {
// Skipping whitespace.
continue;
} else {
Expand All @@ -242,8 +243,8 @@ impl<'a> CssImageExtractor<'a> {
}

// Trim off whitespace at beginning.
for (pos, c) in url_parameter.chars().enumerate() {
if char::is_whitespace(c) {
for (pos, c) in url_parameter.grapheme_indices(true) {
if c.contains(char::is_whitespace) {
// Skipping whitespace before url contents.
continue;
} else {
Expand Down

0 comments on commit 2a21451

Please sign in to comment.