Skip to content

Commit

Permalink
Merge pull request #17 from parmsam/sam-dev
Browse files Browse the repository at this point in the history
Improve performance with better convert_to_utf16le
  • Loading branch information
parmsam authored Apr 21, 2024
2 parents ab27bfd + 48baee0 commit 097faaf
Show file tree
Hide file tree
Showing 3 changed files with 61 additions and 45 deletions.
71 changes: 38 additions & 33 deletions R/lzstringr-package.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,67 +3,72 @@
## usethis namespace: end
NULL


# Encode a string as UTF-16LE bytes preceded by a byte-order mark.
#
# The input is normalised to UTF-8 with enc2utf8() and converted with
# iconv(); only the first element of the conversion result is used.
# The little-endian BOM (bytes FF FE) is prepended unless the converted
# bytes already begin with it.
#
# Returns a raw vector.
convert_to_utf16le <- function(string) {
  le_mark <- as.raw(c(0xFF, 0xFE))
  encoded <- iconv(
    enc2utf8(string),
    from = "UTF-8",
    to = "UTF-16LE",
    toRaw = TRUE
  )[[1]]
  # Guard clause: leave the bytes alone when they already start with the BOM.
  if (identical(encoded[1:2], le_mark)) {
    return(encoded)
  }
  c(le_mark, encoded)
}

# Decode a vector of UTF-16 code units into a single string.
#
# values: numeric/integer vector of 16-bit code units. A unit in the high
#         surrogate range (0xD800-0xDBFF) is combined with the unit that
#         follows it into one code point above 0xFFFF; every other unit is
#         converted on its own with intToUtf8().
#
# Returns a length-one character vector; empty input yields "".
# Stops with an error when a high surrogate is the last unit or is followed
# by a unit outside the low surrogate range (0xDC00-0xDFFF).
decode_utf16_surrogate <- function(values) {
  n_units <- length(values)
  # A surrogate pair collapses two units into one character, so the number
  # of units is an upper bound on the number of characters: allocate once
  # instead of growing the vector inside the loop.
  decoded_chars <- character(n_units)
  n_decoded <- 0L

  i <- 1L
  while (i <= n_units) {
    unit <- values[i]
    n_decoded <- n_decoded + 1L
    if (unit < 0xD800 || unit > 0xDBFF) {
      # Not a high surrogate: convert the unit directly.
      decoded_chars[n_decoded] <- intToUtf8(unit)
      i <- i + 1L
    } else {
      # `||` short-circuits, so values[i + 1] is only read when it exists.
      if (i + 1L > n_units ||
        values[i + 1L] < 0xDC00 ||
        values[i + 1L] > 0xDFFF) {
        stop("Malformed input: Surrogate high without a following low surrogate.")
      }
      # Code point = 0x10000 + (high - 0xD800) * 0x400 + (low - 0xDC00)
      code_point <- 0x10000 + (unit - 0xD800) * 0x400 +
        (values[i + 1L] - 0xDC00)
      decoded_chars[n_decoded] <- intToUtf8(code_point)
      i <- i + 2L
    }
  }

  # seq_len() keeps the selection empty when nothing was decoded; 1:0 would
  # select a non-existent element and turn the result into "NA".
  paste(decoded_chars[seq_len(n_decoded)], collapse = "")
}


# Compress a string with the supplied compression routine.
#
# string: text to compress. It is converted to BOM-prefixed UTF-16LE bytes
#         by convert_to_utf16le() before being handed to `f`.
# f:      function called with that raw vector. Its result is passed through
#         as.raw(), so it is presumably a vector of byte values -- confirm
#         against the compressors this wrapper is used with.
#
# Returns "" when `f` yields a zero-length result, otherwise the returned
# bytes as a single string marked as UTF-8.
safe_compress <- function(string, f) {
  # convert_to_utf16le() already performs the enc2utf8()/iconv()/BOM steps,
  # so they are not repeated inline here.
  string_utf16 <- convert_to_utf16le(string)
  result <- f(string_utf16)
  if (length(result) == 0) {
    return("")
  }
  chr_result <- rawToChar(as.raw(result))
  Encoding(chr_result) <- "UTF-8"
  chr_result
}

safe_decompress <- function(string, f) {
string <- enc2utf8(string)
string_utf16 <-
iconv(string,
from = "UTF-8",
to = "UTF-16LE",
toRaw = TRUE
)[[1]]
bom_le <- charToRaw("\xFF\xFE")
if (!identical(string_utf16[1:2], bom_le)) {
string_utf16 <- c(bom_le, string_utf16)
}
string_utf16 <- convert_to_utf16le(string)
result <- f(string_utf16)
if (length(result) == 0) {
return("")
}
chr_result <- decode_utf16_surrogate(result)
Encoding(chr_result) <- "UTF-8"
chr_result
Expand Down
12 changes: 8 additions & 4 deletions README.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ compressed = lzstring::compressToBase64(message)
compressed
decompressed = lzstring::decompressFromBase64(compressed)
decompressed
cat(decompressed)
```

### JSON data
Expand All @@ -61,19 +61,23 @@ compressed = lzstring::compressToBase64(json_string)
compressed
decompressed = lzstring::decompressFromBase64(compressed)
decompressed
identical(json_string, decompressed)
cat(decompressed)
```

### JS code

```{r}
js_code <- "function test() { console.log('Hello, World!'); }"
js_code <- "
function test() {
console.log('Hello, World!');
}
"
compressed = lzstring::compressToBase64(js_code)
compressed
decompressed = lzstring::decompressFromBase64(compressed)
decompressed
cat(decompressed)
```

### R code
Expand Down
23 changes: 15 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,8 @@ compressed
#> [1] "CoCwpgBAjgrglgYwNYQEYCcD2B3AdhAM0wA8IArGAWwAcBnCTANzHQgBdwIAbAQwC8AnhAAmmAOYBCIA"

decompressed = lzstring::decompressFromBase64(compressed)
decompressed
#> [1] "The quick brown fox jumps over the lazy dog!"
cat(decompressed)
#> The quick brown fox jumps over the lazy dog!
```

### JSON data
Expand All @@ -66,23 +66,30 @@ compressed
#> [1] "N4IgdghgtgpiBcBtEApA9gCzAAgCJrgF0AaECAcziQGYAGEkGKCASwBsFkArTMAOgAmBAAIwAHtAAObGHwDGaKCEIBfIA==="

decompressed = lzstring::decompressFromBase64(compressed)
decompressed
#> [1] "{\"name\":[\"John Doe\"],\"age\":[30],\"email\":[\"[email protected]\"]}"
identical(json_string, decompressed)
#> [1] FALSE
cat(decompressed)
#> {"name":["John Doe"],"age":[30],"email":["[email protected]"]}
```

### JS code

``` r
js_code <- "function test() { console.log('Hello, World!'); }"
js_code <- "
function test() {
console.log('Hello, World!');
}
"
compressed = lzstring::compressToBase64(js_code)
compressed
#> [1] "GYVwdgxgLglg9mABFApgZygCgJSIN6IQJpwA2KAdKXAOaYDkAEiqdQDSIDqcATqQCYBCetgDciAL5A=="
#> [1] "FAMwrgdgxgLglgewgAhgUwM4wBQEpkDeywyyUSGCANmgHRUIDm2A5ABJpUMA0yA6ggBOVACYBCFrgDcxAL7AgA=="

decompressed = lzstring::decompressFromBase64(compressed)
decompressed
#> [1] "function test() { console.log('Hello, World!'); }"
cat(decompressed)
#>
#> function test() {
#> console.log('Hello, World!');
#> }
```

### R code
Expand Down

0 comments on commit 097faaf

Please sign in to comment.