diff --git a/R/lzstringr-package.R b/R/lzstringr-package.R index baf88a5..dea1e85 100644 --- a/R/lzstringr-package.R +++ b/R/lzstringr-package.R @@ -3,67 +3,72 @@ ## usethis namespace: end NULL + +# Helper function to convert string to UTF-16LE with BOM +convert_to_utf16le <- function(string) { + string <- enc2utf8(string) + string_utf16 <- iconv(string, from = "UTF-8", to = "UTF-16LE", toRaw = TRUE)[[1]] + bom_le <- charToRaw("\xFF\xFE") + if (!identical(string_utf16[1:2], bom_le)) { + string_utf16 <- c(bom_le, string_utf16) + } + string_utf16 +} + decode_utf16_surrogate <- function(values) { - # Initialize an empty character vector to store decoded characters - decoded_chars <- character() + # Estimate the maximum number of characters (since surrogate pairs condense to one character) + max_chars <- length(values) + decoded_chars <- character(max_chars) # Pre-allocate with maximum possible size + index <- 1 # Index to keep track of position in decoded_chars + # Function to decode surrogate pairs decode_surrogates <- function(high, low) { - # Calculate the Unicode code point from surrogate values - # Formula: 0x10000 + (high - 0xD800) * 0x400 + (low - 0xDC00) code_point <- 0x10000 + (high - 0xD800) * 0x400 + (low - 0xDC00) - # Convert the Unicode code point to a character intToUtf8(code_point) } + i <- 1 while (i <= length(values)) { - if (values[i] < 0xD800 || - values[i] > 0xDBFF) { + if (values[i] < 0xD800 || values[i] > 0xDBFF) { # Not a high surrogate - # Direct conversion for regular characters (like space) - decoded_chars <- c(decoded_chars, intToUtf8(values[i])) + decoded_chars[index] <- intToUtf8(values[i]) i <- i + 1 } else { # Decode surrogate pairs - decoded_chars <- - c(decoded_chars, decode_surrogates(values[i], values[i + 1])) + if (i + 1 > length(values)) { + stop("Malformed input: Surrogate high without a following low surrogate.") + } + decoded_chars[index] <- decode_surrogates(values[i], values[i + 1]) i <- i + 2 } + index <- index + 1 } + + # Truncate the vector to the actual number of characters decoded + decoded_chars <- decoded_chars[1:(index - 1)] + # Combine into a single string paste(decoded_chars, collapse = "") } + safe_compress <- function(string, f) { - string <- enc2utf8(string) - string_utf16 <- - iconv(string, - from = "UTF-8", - to = "UTF-16LE", - toRaw = TRUE - )[[1]] - bom_le <- charToRaw("\xFF\xFE") - if (!identical(string_utf16[1:2], bom_le)) { - string_utf16 <- c(bom_le, string_utf16) - } + string_utf16 <- convert_to_utf16le(string) result <- f(string_utf16) + if (length(result) == 0) { + return("") + } chr_result <- rawToChar(as.raw(result)) Encoding(chr_result) <- "UTF-8" chr_result } safe_decompress <- function(string, f) { - string <- enc2utf8(string) - string_utf16 <- - iconv(string, - from = "UTF-8", - to = "UTF-16LE", - toRaw = TRUE - )[[1]] - bom_le <- charToRaw("\xFF\xFE") - if (!identical(string_utf16[1:2], bom_le)) { - string_utf16 <- c(bom_le, string_utf16) - } + string_utf16 <- convert_to_utf16le(string) result <- f(string_utf16) + if (length(result) == 0) { + return("") + } chr_result <- decode_utf16_surrogate(result) Encoding(chr_result) <- "UTF-8" chr_result diff --git a/README.Rmd b/README.Rmd index 6f725d8..1103809 100644 --- a/README.Rmd +++ b/README.Rmd @@ -47,7 +47,7 @@ compressed = lzstring::compressToBase64(message) compressed decompressed = lzstring::decompressFromBase64(compressed) -decompressed +cat(decompressed) ``` ### JSON data @@ -61,19 +61,23 @@ compressed = lzstring::compressToBase64(json_string) compressed decompressed = lzstring::decompressFromBase64(compressed) -decompressed identical(json_string, decompressed) +cat(decompressed) ``` ### JS code ```{r} -js_code <- "function test() { console.log('Hello, World!'); }" +js_code <- " +function test() { + console.log('Hello, World!'); +} +" compressed = lzstring::compressToBase64(js_code) compressed decompressed = lzstring::decompressFromBase64(compressed) -decompressed +cat(decompressed) ``` ### R code diff --git a/README.md b/README.md index 02632a7..e670089 100644 --- a/README.md +++ b/README.md @@ -50,8 +50,8 @@ compressed #> [1] "CoCwpgBAjgrglgYwNYQEYCcD2B3AdhAM0wA8IArGAWwAcBnCTANzHQgBdwIAbAQwC8AnhAAmmAOYBCIA" decompressed = lzstring::decompressFromBase64(compressed) -decompressed -#> [1] "The quick brown fox jumps over the lazy dog!" +cat(decompressed) +#> The quick brown fox jumps over the lazy dog! ``` ### JSON data @@ -66,23 +66,30 @@ compressed #> [1] "N4IgdghgtgpiBcBtEApA9gCzAAgCJrgF0AaECAcziQGYAGEkGKCASwBsFkArTMAOgAmBAAIwAHtAAObGHwDGaKCEIBfIA===" decompressed = lzstring::decompressFromBase64(compressed) -decompressed -#> [1] "{\"name\":[\"John Doe\"],\"age\":[30],\"email\":[\"john.doe@example.com\"]}" identical(json_string, decompressed) #> [1] FALSE +cat(decompressed) +#> {"name":["John Doe"],"age":[30],"email":["john.doe@example.com"]} ``` ### JS code ``` r -js_code <- "function test() { console.log('Hello, World!'); }" +js_code <- " +function test() { + console.log('Hello, World!'); +} +" compressed = lzstring::compressToBase64(js_code) compressed -#> [1] "GYVwdgxgLglg9mABFApgZygCgJSIN6IQJpwA2KAdKXAOaYDkAEiqdQDSIDqcATqQCYBCetgDciAL5A==" +#> [1] "FAMwrgdgxgLglgewgAhgUwM4wBQEpkDeywyyUSGCANmgHRUIDm2A5ABJpUMA0yA6ggBOVACYBCFrgDcxAL7AgA==" decompressed = lzstring::decompressFromBase64(compressed) -decompressed -#> [1] "function test() { console.log('Hello, World!'); }" +cat(decompressed) +#> +#> function test() { +#> console.log('Hello, World!'); +#> } ``` ### R code