Merge pull request #17 from parmsam/sam-dev

Improve performance with better convert_to_utf16le
parmsam · Apr 21, 2024 · 097faaf · 097faaf
2 parents ab27bfd + 48baee0
commit 097faaf
Show file tree

Hide file tree

Showing 3 changed files with 61 additions and 45 deletions.
diff --git a/R/lzstringr-package.R b/R/lzstringr-package.R
@@ -3,67 +3,72 @@
 ## usethis namespace: end
 NULL
 
+
+# Helper function to convert string to UTF-16LE with BOM
+convert_to_utf16le <- function(string) {
+  string <- enc2utf8(string)
+  string_utf16 <- iconv(string, from = "UTF-8", to = "UTF-16LE", toRaw = TRUE)[[1]]
+  bom_le <- charToRaw("\xFF\xFE")
+  if (!identical(string_utf16[1:2], bom_le)) {
+    string_utf16 <- c(bom_le, string_utf16)
+  }
+  string_utf16
+}
+
 decode_utf16_surrogate <- function(values) {
-  # Initialize an empty character vector to store decoded characters
-  decoded_chars <- character()
+  # Estimate the maximum number of characters (since surrogate pairs condense to one character)
+  max_chars <- length(values)
+  decoded_chars <- character(max_chars)  # Pre-allocate with maximum possible size
+  index <- 1  # Index to keep track of position in decoded_chars
+
   # Function to decode surrogate pairs
   decode_surrogates <- function(high, low) {
-    # Calculate the Unicode code point from surrogate values
-    # Formula: 0x10000 + (high - 0xD800) * 0x400 + (low - 0xDC00)
     code_point <- 0x10000 + (high - 0xD800) * 0x400 + (low - 0xDC00)
-    # Convert the Unicode code point to a character
     intToUtf8(code_point)
   }
+
   i <- 1
   while (i <= length(values)) {
-    if (values[i] < 0xD800 ||
-      values[i] > 0xDBFF) {
+    if (values[i] < 0xD800 || values[i] > 0xDBFF) {
       # Not a high surrogate
-      # Direct conversion for regular characters (like space)
-      decoded_chars <- c(decoded_chars, intToUtf8(values[i]))
+      decoded_chars[index] <- intToUtf8(values[i])
       i <- i + 1
     } else {
       # Decode surrogate pairs
-      decoded_chars <-
-        c(decoded_chars, decode_surrogates(values[i], values[i + 1]))
+      if (i + 1 > length(values)) {
+        stop("Malformed input: Surrogate high without a following low surrogate.")
+      }
+      decoded_chars[index] <- decode_surrogates(values[i], values[i + 1])
       i <- i + 2
     }
+    index <- index + 1
   }
+
+  # Truncate the vector to the actual number of characters decoded
+  decoded_chars <- decoded_chars[1:(index - 1)]
+
   # Combine into a single string
   paste(decoded_chars, collapse = "")
 }
 
+
 safe_compress <- function(string, f) {
-  string <- enc2utf8(string)
-  string_utf16 <-
-    iconv(string,
-      from = "UTF-8",
-      to = "UTF-16LE",
-      toRaw = TRUE
-    )[[1]]
-  bom_le <- charToRaw("\xFF\xFE")
-  if (!identical(string_utf16[1:2], bom_le)) {
-    string_utf16 <- c(bom_le, string_utf16)
-  }
+  string_utf16 <- convert_to_utf16le(string)
   result <- f(string_utf16)
+  if (length(result) == 0) {
+    return("")
+  }
   chr_result <- rawToChar(as.raw(result))
   Encoding(chr_result) <- "UTF-8"
   chr_result
 }
 
 safe_decompress <- function(string, f) {
-  string <- enc2utf8(string)
-  string_utf16 <-
-    iconv(string,
-      from = "UTF-8",
-      to = "UTF-16LE",
-      toRaw = TRUE
-    )[[1]]
-  bom_le <- charToRaw("\xFF\xFE")
-  if (!identical(string_utf16[1:2], bom_le)) {
-    string_utf16 <- c(bom_le, string_utf16)
-  }
+  string_utf16 <- convert_to_utf16le(string)
   result <- f(string_utf16)
+  if (length(result) == 0) {
+    return("")
+  }
   chr_result <- decode_utf16_surrogate(result)
   Encoding(chr_result) <- "UTF-8"
   chr_result

diff --git a/README.Rmd b/README.Rmd
@@ -47,7 +47,7 @@ compressed = lzstring::compressToBase64(message)
 compressed
 
 decompressed = lzstring::decompressFromBase64(compressed)
-decompressed
+cat(decompressed)
 ```
 
 ### JSON data
@@ -61,19 +61,23 @@ compressed = lzstring::compressToBase64(json_string)
 compressed
 
 decompressed = lzstring::decompressFromBase64(compressed)
-decompressed
 identical(json_string, decompressed)
+cat(decompressed)
 ```
 
 ### JS code
 
 ```{r}
-js_code <- "function test() { console.log('Hello, World!'); }"
+js_code <- "
+function test() { 
+  console.log('Hello, World!'); 
+}
+"
 compressed = lzstring::compressToBase64(js_code)
 compressed
 
 decompressed = lzstring::decompressFromBase64(compressed)
-decompressed
+cat(decompressed)
 ```
 
 ### R code

diff --git a/README.md b/README.md
@@ -50,8 +50,8 @@ compressed
 #> [1] "CoCwpgBAjgrglgYwNYQEYCcD2B3AdhAM0wA8IArGAWwAcBnCTANzHQgBdwIAbAQwC8AnhAAmmAOYBCIA"
 
 decompressed = lzstring::decompressFromBase64(compressed)
-decompressed
-#> [1] "The quick brown fox jumps over the lazy dog!"
+cat(decompressed)
+#> The quick brown fox jumps over the lazy dog!
 ```
 
 ### JSON data
@@ -66,23 +66,30 @@ compressed
 #> [1] "N4IgdghgtgpiBcBtEApA9gCzAAgCJrgF0AaECAcziQGYAGEkGKCASwBsFkArTMAOgAmBAAIwAHtAAObGHwDGaKCEIBfIA==="
 
 decompressed = lzstring::decompressFromBase64(compressed)
-decompressed
-#> [1] "{\"name\":[\"John Doe\"],\"age\":[30],\"email\":[\"john.doe@example.com\"]}"
 identical(json_string, decompressed)
 #> [1] FALSE
+cat(decompressed)
+#> {"name":["John Doe"],"age":[30],"email":["john.doe@example.com"]}
 ```
 
 ### JS code
 
 ``` r
-js_code <- "function test() { console.log('Hello, World!'); }"
+js_code <- "
+function test() { 
+  console.log('Hello, World!'); 
+}
+"
 compressed = lzstring::compressToBase64(js_code)
 compressed
-#> [1] "GYVwdgxgLglg9mABFApgZygCgJSIN6IQJpwA2KAdKXAOaYDkAEiqdQDSIDqcATqQCYBCetgDciAL5A=="
+#> [1] "FAMwrgdgxgLglgewgAhgUwM4wBQEpkDeywyyUSGCANmgHRUIDm2A5ABJpUMA0yA6ggBOVACYBCFrgDcxAL7AgA=="
 
 decompressed = lzstring::decompressFromBase64(compressed)
-decompressed
-#> [1] "function test() { console.log('Hello, World!'); }"
+cat(decompressed)
+#> 
+#> function test() { 
+#>   console.log('Hello, World!'); 
+#> }
 ```
 
 ### R code