Fix possible crash in HTML CSS image extraction

When processing UTF-8 HTML code, the image extraction logic may panic if the string contains a multi-byte grapheme that includes a '(', ')', whitespace, or one of the other characters used to split the text when searching for the base64 image content. The panic is because the `split_at()` method will panic if you try to split in the middle of a unicode grapheme. This commit fixes the issue by processing the HTML string one grapheme at a time instead of one character (byte) at a time. The `grapheme_indices()` method is used to get the correct position of the start of each grapheme for splitting the string.
Cisco-Talos · Apr 28, 2023 · 2a21451 · 2a21451
1 parent 7a6fe78
commit 2a21451
Show file tree

Hide file tree

Showing 4 changed files with 36 additions and 27 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/libclamav_rust/Cargo.toml b/libclamav_rust/Cargo.toml
@@ -18,6 +18,7 @@ transpose = "0.2"
 num-traits = "0.2"
 base64 = "0.21.0"
 sha1 = "0.10.5"
+unicode-segmentation = "1.10.1"
 
 [lib]
 crate-type = ["staticlib"]

diff --git a/libclamav_rust/src/cdiff.rs b/libclamav_rust/src/cdiff.rs
@@ -1032,7 +1032,7 @@ fn process_line(ctx: &mut Context, line: &[u8]) -> Result<(), InputError> {
             cmd_unlink(ctx, unlink_op)
         }
         _ => Err(InputError::UnknownCommand(
-            String::from_utf8_lossy(&cmd).to_string(),
+            String::from_utf8_lossy(cmd).to_string(),
         )),
     }
 }

diff --git a/libclamav_rust/src/css_image_extract.rs b/libclamav_rust/src/css_image_extract.rs
@@ -25,6 +25,7 @@ use std::{ffi::CStr, mem::ManuallyDrop, os::raw::c_char};
 use base64::{engine::general_purpose as base64_engine_standard, Engine as _};
 use log::{debug, error, warn};
 use thiserror::Error;
+use unicode_segmentation::UnicodeSegmentation;
 
 use crate::sys;
 
@@ -73,12 +74,12 @@ impl<'a> CssImageExtractor<'a> {
             };
 
             // Skip whitespace until we find '('
-            for (pos, c) in self.remaining.chars().enumerate() {
-                if c == '(' {
+            for (pos, c) in self.remaining.grapheme_indices(true) {
+                if c == "(" {
                     // Found left-paren.
                     (_, self.remaining) = self.remaining.split_at(pos + 1);
                     break;
-                } else if char::is_whitespace(c) {
+                } else if c.contains(char::is_whitespace) {
                     // Skipping whitespace.
                     continue;
                 } else {
@@ -90,11 +91,11 @@ impl<'a> CssImageExtractor<'a> {
             // Find closing ')'
             let mut depth = 1;
             let mut url_parameter: Option<&str> = None;
-            for (pos, c) in self.remaining.chars().enumerate() {
-                if c == '(' {
+            for (pos, c) in self.remaining.grapheme_indices(true) {
+                if c == "(" {
                     // Found nested left-paren.
                     depth += 1;
-                } else if c == ')' {
+                } else if c == ")" {
                     if depth > 1 {
                         // Found nested right-paren.
                         depth -= 1;
@@ -121,8 +122,8 @@ impl<'a> CssImageExtractor<'a> {
             // Strip optional whitespace and quotes from front and back.
 
             // Trim off whitespace at beginning
-            for (pos, c) in url_parameter.chars().enumerate() {
-                if char::is_whitespace(c) {
+            for (pos, c) in url_parameter.grapheme_indices(true) {
+                if c.contains(char::is_whitespace) {
                     // Skipping whitespace before url contents.
                     continue;
                 } else {
@@ -132,8 +133,8 @@ impl<'a> CssImageExtractor<'a> {
             }
 
             // Trim off whitespace at end
-            for (pos, c) in url_parameter.chars().rev().enumerate() {
-                if char::is_whitespace(c) {
+            for (pos, c) in url_parameter.graphemes(true).rev().enumerate() {
+                if c.contains(char::is_whitespace) {
                     // Skipping whitespace after url contents.
                     continue;
                 } else {
@@ -143,24 +144,24 @@ impl<'a> CssImageExtractor<'a> {
             }
 
             // Trim off " at beginning.
-            let c = url_parameter.chars().next();
+            let c = url_parameter.graphemes(true).next();
             if let Some(c) = c {
-                if c == '"' {
+                if c == "\"" {
                     (_, url_parameter) = url_parameter.split_at(1);
                 }
             };
 
             // Trim off " at end.
-            let c = url_parameter.chars().rev().next();
+            let c = url_parameter.graphemes(true).rev().next();
             if let Some(c) = c {
-                if c == '"' {
+                if c == "\"" {
                     (url_parameter, _) = url_parameter.split_at(url_parameter.len() - 1);
                 }
             };
 
             // Trim off whitespace at beginning.
-            for (pos, c) in url_parameter.chars().enumerate() {
-                if char::is_whitespace(c) {
+            for (pos, c) in url_parameter.grapheme_indices(true) {
+                if c.contains(char::is_whitespace) {
                     // Skipping whitespace before url contents.
                     continue;
                 } else {
@@ -170,8 +171,8 @@ impl<'a> CssImageExtractor<'a> {
             }
 
             // Trim off whitespace at end.
-            for (pos, c) in url_parameter.chars().rev().enumerate() {
-                if char::is_whitespace(c) {
+            for (pos, c) in url_parameter.graphemes(true).rev().enumerate() {
+                if c.contains(char::is_whitespace) {
                     // Skipping whitespace after url contents.
                     continue;
                 } else {
@@ -203,12 +204,12 @@ impl<'a> CssImageExtractor<'a> {
             };
 
             // Skip whitespace until we find a 'b' (starting "base64")
-            for (pos, c) in url_parameter.chars().enumerate() {
-                if c == 'b' {
+            for (pos, c) in url_parameter.grapheme_indices(true) {
+                if c == "b" {
                     // Found 'b'.
                     (_, url_parameter) = url_parameter.split_at(pos + 1);
                     break;
-                } else if char::is_whitespace(c) {
+                } else if c.contains(char::is_whitespace) {
                     // Skipping whitespace.
                     continue;
                 } else {
@@ -227,12 +228,12 @@ impl<'a> CssImageExtractor<'a> {
             (_, url_parameter) = url_parameter.split_at("ase64".len());
 
             // Skip whitespace until we find ','
-            for (pos, c) in url_parameter.chars().enumerate() {
-                if c == ',' {
+            for (pos, c) in url_parameter.grapheme_indices(true) {
+                if c == "," {
                     // Found ','.
                     (_, url_parameter) = url_parameter.split_at(pos + 1);
                     break;
-                } else if char::is_whitespace(c) {
+                } else if c.contains(char::is_whitespace) {
                     // Skipping whitespace.
                     continue;
                 } else {
@@ -242,8 +243,8 @@ impl<'a> CssImageExtractor<'a> {
             }
 
             // Trim off whitespace at beginning.
-            for (pos, c) in url_parameter.chars().enumerate() {
-                if char::is_whitespace(c) {
+            for (pos, c) in url_parameter.grapheme_indices(true) {
+                if c.contains(char::is_whitespace) {
                     // Skipping whitespace before url contents.
                     continue;
                 } else {