Skip to content

Commit

Permalink
add function to decode UTF-16 surrogate pairs and integrate with exis…
Browse files Browse the repository at this point in the history
…ting logic
  • Loading branch information
parmsam committed Apr 11, 2024
1 parent 9465c34 commit 6f91f50
Showing 1 changed file with 30 additions and 2 deletions.
32 changes: 30 additions & 2 deletions R/lzstringr-package.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,38 @@
## usethis namespace: end
NULL

# Decode a vector of UTF-16 code units into a single UTF-8 string.
#
# Arguments:
#   values  Numeric (or integer) vector of UTF-16 code units. A high surrogate
#           (0xD800-0xDBFF) immediately followed by a low surrogate
#           (0xDC00-0xDFFF) is combined into one supplementary code point.
#
# Returns:
#   A length-one character string. Empty input yields "".
#
# Ill-formed input: a surrogate without a matching partner (a high surrogate
# at the end of the input or not followed by a low surrogate, or a low
# surrogate with no preceding high surrogate) is replaced by U+FFFD
# (REPLACEMENT CHARACTER) rather than leaking the text "NA" or a bogus code
# point into the result. Missing values raise an error.
decode_utf16_surrogate <- function(values) {
  n <- length(values)
  if (n == 0L) {
    return("")
  }
  if (anyNA(values)) {
    stop("`values` must not contain missing code units", call. = FALSE)
  }

  is_high <- values >= 0xD800 & values <= 0xDBFF
  is_low <- values >= 0xDC00 & values <= 0xDFFF

  # A pair starts at every high surrogate directly followed by a low one.
  # A unit cannot be both high and low, so pairs never overlap.
  pair_start <- is_high & c(is_low[-1L], FALSE)
  pair_end <- c(FALSE, pair_start[-n])
  unpaired <- (is_high | is_low) & !pair_start & !pair_end

  code_points <- as.numeric(values)
  starts <- which(pair_start)
  # Standard UTF-16 formula:
  # 0x10000 + (high - 0xD800) * 0x400 + (low - 0xDC00)
  code_points[starts] <- 0x10000 +
    (code_points[starts] - 0xD800) * 0x400 +
    (code_points[starts + 1L] - 0xDC00)
  code_points[unpaired] <- 0xFFFD

  # Drop the low halves that were folded into their pair, then convert all
  # code points in one call instead of growing a vector inside a loop.
  intToUtf8(code_points[!pair_end])
}

# Run the codec function `f` on the UTF-16 bytes of `string` and return the
# result as a UTF-8 string.
#
# Arguments:
#   string  Character input; only the first element is used (`[[1]]` below).
#   f       Function applied to the raw UTF-16 byte vector of `string`.
#           NOTE(review): presumably returns a vector of UTF-16 code units,
#           since its output is passed to decode_utf16_surrogate() -- confirm
#           against the compiled codec.
#
# Returns:
#   A length-one character string marked as UTF-8.
safe_compress <- function(string, f) {
  string <- enc2utf8(string)
  string <- iconv(string, from = "UTF-8", to = "UTF-16", toRaw = TRUE)[[1]]
  result <- f(string)
  # Decode via the surrogate-aware helper only; the former
  # rawToChar(as.raw(result)) step was immediately overwritten and could fail
  # on embedded NULs or values above 255.
  chr_result <- decode_utf16_surrogate(result)
  Encoding(chr_result) <- "UTF-8"
  chr_result
}
# Run the codec function `f` on the UTF-16 bytes of `string` and return the
# result as a UTF-8 string.
#
# Arguments:
#   string  Character input; only the first element is used (`[[1]]` below).
#   f       Function applied to the raw UTF-16 byte vector of `string`.
#           NOTE(review): presumably returns a vector of UTF-16 code units,
#           since its output is passed to decode_utf16_surrogate() -- confirm
#           against the compiled codec.
#
# Returns:
#   A length-one character string marked as UTF-8.
safe_decompress <- function(string, f) {
  string <- enc2utf8(string)
  string <- iconv(string, from = "UTF-8", to = "UTF-16", toRaw = TRUE)[[1]]
  result <- f(string)
  # Surrogate pairs must be combined before conversion, so decoding goes
  # through the helper rather than a plain intToUtf8(result).
  chr_result <- decode_utf16_surrogate(result)
  Encoding(chr_result) <- "UTF-8"
  chr_result
}

Expand Down

0 comments on commit 6f91f50

Please sign in to comment.