## usethis namespace: end
NULL

# Decode a vector of UTF-16 code units into a single UTF-8 string.
#
# `values` is a vector of 16-bit code units (anything `as.integer()` accepts).
# A high surrogate (0xD800-0xDBFF) immediately followed by a low surrogate
# (0xDC00-0xDFFF) is combined into one supplementary code point:
#   0x10000 + (high - 0xD800) * 0x400 + (low - 0xDC00)
# Any surrogate that is not part of such a pair (e.g. a high surrogate at the
# very end of the input, or a stray low surrogate) cannot be represented in
# UTF-8 and is replaced by U+FFFD (REPLACEMENT CHARACTER) instead of leaking
# an `NA` into the output string.
#
# Returns a length-one character vector ("" for empty input). Zero code units
# are dropped by `intToUtf8()` because R strings cannot hold embedded NULs.
# Errors if `values` contains missing values.
decode_utf16_surrogate <- function(values) {
  units <- as.integer(values)
  n <- length(units)
  if (n == 0L) {
    return("")
  }
  if (anyNA(units)) {
    stop("`values` must not contain missing values.", call. = FALSE)
  }

  is_high <- units >= 0xD800L & units <= 0xDBFFL
  is_low <- units >= 0xDC00L & units <= 0xDFFFL

  # A valid pair is a high surrogate directly followed by a low surrogate.
  # The last unit can never lead a pair, hence the trailing FALSE.
  pair_lead <- is_high & c(is_low[-1L], FALSE)
  pair_trail <- c(FALSE, pair_lead[-n])

  code_points <- units
  lead_idx <- which(pair_lead)
  code_points[lead_idx] <- 0x10000L +
    (units[lead_idx] - 0xD800L) * 0x400L +
    (units[lead_idx + 1L] - 0xDC00L)

  # Unpaired surrogates are not valid Unicode scalar values.
  lone <- (is_high | is_low) & !pair_lead & !pair_trail
  code_points[lone] <- 0xFFFDL

  # Drop the trailing half of each pair (already folded into its lead) and
  # convert everything in one vectorised call.
  intToUtf8(code_points[!pair_trail])
}

# Run the compression routine `f` on `string` and return its output as a
# UTF-8 string.
#
# `string` is normalised to UTF-8, converted to raw UTF-16 bytes and handed to
# `f`; the values `f` returns are treated as UTF-16 code units and decoded
# with `decode_utf16_surrogate()`. Only the first element of `string` is used.
# NOTE(review): "UTF-16" without an explicit endianness lets iconv choose the
# byte order and whether to emit a BOM - confirm this matches what `f` expects.
safe_compress <- function(string, f) {
  string <- enc2utf8(string)
  string <- iconv(string, from = "UTF-8", to = "UTF-16", toRaw = TRUE)[[1]]
  result <- f(string)
  chr_result <- decode_utf16_surrogate(result)
  Encoding(chr_result) <- "UTF-8"
  chr_result
}

# Run the decompression routine `f` on `string` and return its output as a
# UTF-8 string.
#
# Mirrors `safe_compress()`: `string` is normalised to UTF-8, converted to raw
# UTF-16 bytes and handed to `f`; the values `f` returns are treated as UTF-16
# code units and decoded with `decode_utf16_surrogate()`. Only the first
# element of `string` is used.
# NOTE(review): "UTF-16" without an explicit endianness lets iconv choose the
# byte order and whether to emit a BOM - confirm this matches what `f` expects.
safe_decompress <- function(string, f) {
  string <- enc2utf8(string)
  string <- iconv(string, from = "UTF-8", to = "UTF-16", toRaw = TRUE)[[1]]
  result <- f(string)
  chr_result <- decode_utf16_surrogate(result)
  Encoding(chr_result) <- "UTF-8"
  chr_result
}