From acd1a8d9ab05b33062b8094816fb23ca70a54704 Mon Sep 17 00:00:00 2001 From: Hans Larsen Date: Fri, 25 Oct 2024 07:52:56 -0700 Subject: [PATCH] Add a display_lossy() to write a JsString lossily (#4023) * Add a display_lossy() to write a JsString lossily * cargo fmt --- core/string/src/display.rs | 48 ++++++++++++++++++++++++++++++++++++++ core/string/src/lib.rs | 12 ++++++++-- core/string/src/str.rs | 8 +++++++ 3 files changed, 66 insertions(+), 2 deletions(-) diff --git a/core/string/src/display.rs b/core/string/src/display.rs index 9124af24e14..6873e406f72 100644 --- a/core/string/src/display.rs +++ b/core/string/src/display.rs @@ -34,6 +34,27 @@ impl<'a> From<JsStr<'a>> for JsStrDisplayEscaped<'a> { } } +/// Display implementation for [`crate::JsString`] that replaces unpaired surrogates with the Unicode replacement character (`U+FFFD`). +#[derive(Debug)] +pub struct JsStrDisplayLossy<'a> { + inner: JsStr<'a>, +} + +impl fmt::Display for JsStrDisplayLossy<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + // No need to optimize latin1. + self.inner + .code_points_lossy() + .try_for_each(|c| f.write_char(c)) + } +} + +impl<'a> From<JsStr<'a>> for JsStrDisplayLossy<'a> { + fn from(inner: JsStr<'a>) -> Self { + Self { inner } + } +} + #[test] fn latin1() { // 0xE9 is `é` in ISO-8859-1 (see https://www.ascii-code.com/ISO-8859-1). @@ -41,4 +62,31 @@ fn latin1() { let rust_str = format!("{}", JsStrDisplayEscaped { inner: s }); assert_eq!(rust_str, "Hello é world!"); + + let rust_str = format!("{}", JsStrDisplayLossy { inner: s }); + assert_eq!(rust_str, "Hello é world!"); +} + +#[test] +fn emoji() { + // 0x1F600 is `😀` (see https://www.fileformat.info/info/unicode/char/1f600/index.htm). 
+ let s = JsStr::utf16(&[0xD83D, 0xDE00]); + + let rust_str = format!("{}", JsStrDisplayEscaped { inner: s }); + assert_eq!(rust_str, "😀"); + + let rust_str = format!("{}", JsStrDisplayLossy { inner: s }); + assert_eq!(rust_str, "😀"); +} + +#[test] +fn unpaired_surrogates() { + // 0xD800 is an unpaired surrogate (see https://www.fileformat.info/info/unicode/char/d800/index.htm). + let s = JsStr::utf16(&[0xD800]); + + let rust_str = format!("{}", JsStrDisplayEscaped { inner: s }); + assert_eq!(rust_str, "\\uD800"); + + let rust_str = format!("{}", JsStrDisplayLossy { inner: s }); + assert_eq!(rust_str, "�"); } diff --git a/core/string/src/lib.rs b/core/string/src/lib.rs index 26922832aea..c4171c7f3df 100644 --- a/core/string/src/lib.rs +++ b/core/string/src/lib.rs @@ -26,7 +26,7 @@ mod tagged; mod tests; use self::{iter::Windows, str::JsSliceIndex}; -use crate::display::JsStrDisplayEscaped; +use crate::display::{JsStrDisplayEscaped, JsStrDisplayLossy}; use crate::tagged::{Tagged, UnwrappedTagged}; #[doc(inline)] pub use crate::{ @@ -960,7 +960,7 @@ impl JsString { } } - /// Gets a displayable escaped string. This may be faster and has less + /// Gets a displayable escaped string. This may be faster and has fewer /// allocations than `format!("{}", str.to_string_escaped())` when /// displaying. #[inline] @@ -968,6 +968,14 @@ impl JsString { pub fn display_escaped(&self) -> JsStrDisplayEscaped<'_> { JsStrDisplayEscaped::from(self.as_str()) } + + /// Gets a displayable lossy string. This may be faster and has fewer + /// allocations than `format!("{}", str.to_string_lossy())` when displaying. 
+ #[inline] + #[must_use] + pub fn display_lossy(&self) -> JsStrDisplayLossy<'_> { + JsStrDisplayLossy::from(self.as_str()) + } } impl Clone for JsString { diff --git a/core/string/src/str.rs b/core/string/src/str.rs index ffbee00034c..c0c49f44f27 100644 --- a/core/string/src/str.rs +++ b/core/string/src/str.rs @@ -235,6 +235,14 @@ impl<'a> JsStr<'a> { m >= n && needle == self.get(m - n..).expect("already checked size") } + /// Gets an iterator of all the Unicode codepoints of a [`JsStr`], replacing + /// unpaired surrogates with the replacement character. This is faster than + /// using [`Self::code_points`]. + #[inline] + pub(crate) fn code_points_lossy(self) -> impl Iterator<Item = char> + 'a { + char::decode_utf16(self.iter()).map(|res| res.unwrap_or('\u{FFFD}')) + } + /// Gets an iterator of all the Unicode codepoints of a [`JsStr`]. /// This is not optimized for Latin1 strings. #[inline]