From ce1e5f8012695141a8b84da45db2563eadae23a2 Mon Sep 17 00:00:00 2001 From: parmsam Date: Tue, 9 Apr 2024 23:21:18 -0400 Subject: [PATCH] migrate away from std::wstring_convert --- R/cpp11.R | 16 +++++----- R/lzstringr-package.R | 28 ++++++++++++----- src/code.cpp | 70 ++++++++++++++++++++++++++----------------- src/cpp11.cpp | 24 +++++++-------- 4 files changed, 82 insertions(+), 56 deletions(-) diff --git a/R/cpp11.R b/R/cpp11.R index 9555b6f..4acfc7d 100644 --- a/R/cpp11.R +++ b/R/cpp11.R @@ -1,17 +1,17 @@ # Generated by cpp11: do not edit by hand -compressToEncodedURIComponent_ <- function(uncompressed8) { - .Call(`_lzstringr_compressToEncodedURIComponent_`, uncompressed8) +compressToEncodedURIComponent_ <- function(bytes) { + .Call(`_lzstringr_compressToEncodedURIComponent_`, bytes) } -decompressFromEncodedURIComponent_ <- function(compressed8) { - .Call(`_lzstringr_decompressFromEncodedURIComponent_`, compressed8) +decompressFromEncodedURIComponent_ <- function(bytes) { + .Call(`_lzstringr_decompressFromEncodedURIComponent_`, bytes) } -compressToBase64_ <- function(uncompressed8) { - .Call(`_lzstringr_compressToBase64_`, uncompressed8) +compressToBase64_ <- function(bytes) { + .Call(`_lzstringr_compressToBase64_`, bytes) } -decompressFromBase64_ <- function(compressed8) { - .Call(`_lzstringr_decompressFromBase64_`, compressed8) +decompressFromBase64_ <- function(bytes) { + .Call(`_lzstringr_decompressFromBase64_`, bytes) } diff --git a/R/lzstringr-package.R b/R/lzstringr-package.R index 41c2dff..0ce1a8e 100644 --- a/R/lzstringr-package.R +++ b/R/lzstringr-package.R @@ -3,26 +3,38 @@ ## usethis namespace: end NULL +safe_compress <- function(string, f) { + string <- enc2utf8(string) + string <- iconv(string, from="UTF-8", to="UTF-16", toRaw=TRUE)[[1]] + result <- f(string) + chr_result <- rawToChar(as.raw(result)) + chr_result +} + +safe_decompress <- function(string, f) { + string <- enc2utf8(string) + string <- iconv(string, from="UTF-8", to="UTF-16", toRaw=TRUE)[[1]] + result <- f(string) + chr_result <- intToUtf8(result) + chr_result +} + #' @export compressToBase64 compressToBase64 <- function(string) { - string <- enc2utf8(string) - compressToBase64_(string) + safe_compress(string, compressToBase64_) } #' @export decompressFromBase64 decompressFromBase64 <- function(string) { - string <- enc2utf8(string) - decompressFromBase64_(string) + safe_decompress(string, decompressFromBase64_) } #' @export compressToEncodedURIComponent compressToEncodedURIComponent <- function(string) { - string <- enc2utf8(string) - compressToEncodedURIComponent_(string) + safe_compress(string, compressToEncodedURIComponent_) } #' @export decompressFromEncodedURIComponent decompressFromEncodedURIComponent <- function(string) { - string <- enc2utf8(string) - decompressFromEncodedURIComponent_(string) + safe_decompress(string, decompressFromEncodedURIComponent_) } diff --git a/src/code.cpp b/src/code.cpp index 1be310a..4e9c87b 100644 --- a/src/code.cpp +++ b/src/code.cpp @@ -3,56 +3,70 @@ using namespace cpp11; #include #include "lz-string.hpp" #include +#include +#include + +std::u16string createUTF16String(const std::vector& bytes) { + if (bytes.size() < 2) { + throw std::runtime_error("Invalid byte array. Size must be at least 2 bytes."); + } + + // Check byte order mark (BOM) + bool isLittleEndian = (bytes[0] == 0xFF && bytes[1] == 0xFE); + bool isBigEndian = (bytes[0] == 0xFE && bytes[1] == 0xFF); + + if (!isLittleEndian && !isBigEndian) { + throw std::runtime_error("Invalid byte order mark (BOM)."); + } + + std::u16string result; + for (size_t i = 2; i < bytes.size(); i += 2) { + char16_t codeUnit; + if (isLittleEndian) { + codeUnit = static_cast(bytes[i] | (bytes[i + 1] << 8)); + } else { + codeUnit = static_cast((bytes[i] << 8) | bytes[i + 1]); + } + result.push_back(codeUnit); + } + + return result; +} + [[cpp11::register]] -std::string compressToEncodedURIComponent_(std::string uncompressed8) { - std::wstring_convert, char16_t> converter_8_to_16; - std::u16string uncompressed16 = converter_8_to_16.from_bytes(uncompressed8); +std::u16string compressToEncodedURIComponent_(std::vector bytes) { + std::u16string uncompressed16 = createUTF16String(bytes); auto compressed16 = lzstring::compressToEncodedURIComponent(uncompressed16); - std::wstring_convert, char16_t> converter_16_to_8; - std::string compressed8 = converter_16_to_8.to_bytes(compressed16); - - return compressed8; + return compressed16; } [[cpp11::register]] -std::string decompressFromEncodedURIComponent_(std::string compressed8) { - std::wstring_convert, char16_t> converter_8_to_16; - std::u16string compressed16 = converter_8_to_16.from_bytes(compressed8); +std::u16string decompressFromEncodedURIComponent_(std::vector bytes) { + std::u16string compressed16 = createUTF16String(bytes); auto uncompressed16 = lzstring::decompressFromEncodedURIComponent(compressed16); - std::wstring_convert, char16_t> converter_16_to_8; - std::string uncompressed8 = converter_16_to_8.to_bytes(uncompressed16); - - return uncompressed8; + return uncompressed16; } [[cpp11::register]] -std::string compressToBase64_(std::string uncompressed8) { - std::wstring_convert, char16_t> converter_8_to_16; - std::u16string uncompressed16 = converter_8_to_16.from_bytes(uncompressed8); +std::u16string compressToBase64_(std::vector bytes) { + std::u16string uncompressed16 = createUTF16String(bytes); auto compressed16 = lzstring::compressToBase64(uncompressed16); - std::wstring_convert, char16_t> converter_16_to_8; - std::string compressed8 = converter_16_to_8.to_bytes(compressed16); - - return compressed8; + return compressed16; } [[cpp11::register]] -std::string decompressFromBase64_(std::string compressed8) { - std::wstring_convert, char16_t> converter_8_to_16; - std::u16string compressed16 = converter_8_to_16.from_bytes(compressed8); +std::u16string decompressFromBase64_(std::vector bytes) { + std::u16string compressed16 = createUTF16String(bytes); auto uncompressed16 = lzstring::decompressFromBase64(compressed16); - std::wstring_convert, char16_t> converter_16_to_8; - std::string uncompressed8 = converter_16_to_8.to_bytes(uncompressed16); - - return uncompressed8; + return uncompressed16; } diff --git a/src/cpp11.cpp b/src/cpp11.cpp index acd1840..a7e1bdb 100644 --- a/src/cpp11.cpp +++ b/src/cpp11.cpp @@ -6,31 +6,31 @@ #include // code.cpp -std::string compressToEncodedURIComponent_(std::string uncompressed8); -extern "C" SEXP _lzstringr_compressToEncodedURIComponent_(SEXP uncompressed8) { +std::u16string compressToEncodedURIComponent_(std::vector bytes); +extern "C" SEXP _lzstringr_compressToEncodedURIComponent_(SEXP bytes) { BEGIN_CPP11 - return cpp11::as_sexp(compressToEncodedURIComponent_(cpp11::as_cpp>(uncompressed8))); + return cpp11::as_sexp(compressToEncodedURIComponent_(cpp11::as_cpp>>(bytes))); END_CPP11 } // code.cpp -std::string decompressFromEncodedURIComponent_(std::string compressed8); -extern "C" SEXP _lzstringr_decompressFromEncodedURIComponent_(SEXP compressed8) { +std::u16string decompressFromEncodedURIComponent_(std::vector bytes); +extern "C" SEXP _lzstringr_decompressFromEncodedURIComponent_(SEXP bytes) { BEGIN_CPP11 - return cpp11::as_sexp(decompressFromEncodedURIComponent_(cpp11::as_cpp>(compressed8))); + return cpp11::as_sexp(decompressFromEncodedURIComponent_(cpp11::as_cpp>>(bytes))); END_CPP11 } // code.cpp -std::string compressToBase64_(std::string uncompressed8); -extern "C" SEXP _lzstringr_compressToBase64_(SEXP uncompressed8) { +std::u16string compressToBase64_(std::vector bytes); +extern "C" SEXP _lzstringr_compressToBase64_(SEXP bytes) { BEGIN_CPP11 - return cpp11::as_sexp(compressToBase64_(cpp11::as_cpp>(uncompressed8))); + return cpp11::as_sexp(compressToBase64_(cpp11::as_cpp>>(bytes))); END_CPP11 } // code.cpp -std::string decompressFromBase64_(std::string compressed8); -extern "C" SEXP _lzstringr_decompressFromBase64_(SEXP compressed8) { +std::u16string decompressFromBase64_(std::vector bytes); +extern "C" SEXP _lzstringr_decompressFromBase64_(SEXP bytes) { BEGIN_CPP11 - return cpp11::as_sexp(decompressFromBase64_(cpp11::as_cpp>(compressed8))); + return cpp11::as_sexp(decompressFromBase64_(cpp11::as_cpp>>(bytes))); END_CPP11 }