From 7fac8331ccac26338ee2a5afceb705f51112ebe0 Mon Sep 17 00:00:00 2001 From: AnttiP Date: Wed, 8 Jul 2020 23:09:34 +0300 Subject: [PATCH 1/8] A rough version of the new implementation. --- Cargo.toml | 2 + src/boxed.rs | 158 ++++++++++++++++---- src/casts.rs | 12 +- src/config.rs | 112 +++++++++++++-- src/inline.rs | 117 +++++++++------ src/iter.rs | 157 ++++++-------------- src/lib.rs | 350 ++++++++++++++++++++++++++++----------------- src/marker_byte.rs | 69 --------- 8 files changed, 575 insertions(+), 402 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 43e4ef3..3dbe61e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,7 +20,9 @@ name = "smartstring" harness = false [features] +default = ["lazy_null_pointer_optimizations"] test = ["arbitrary", "arbitrary/derive"] +lazy_null_pointer_optimizations = [] [dependencies] static_assertions = "1.1.0" diff --git a/src/boxed.rs b/src/boxed.rs index 9530502..ac452e5 100644 --- a/src/boxed.rs +++ b/src/boxed.rs @@ -2,55 +2,151 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at http://mozilla.org/MPL/2.0/. -use std::cmp::Ordering; +use std::ops::{Deref, DerefMut}; +use crate::{SmartStringMode, SmartString, inline::InlineString}; -pub trait BoxedString { - fn string(&self) -> &String; - fn string_mut(&mut self) -> &mut String; - fn into_string(self) -> String; +pub trait BoxedString: Deref + DerefMut + Into { + //This is unsafe when null pointer optimizations are used + //Then, it is unsound if the capacity of the string is 0 + unsafe fn from_string_unchecked(string: String) -> Self; + fn capacity(&self) -> usize; +} + +//Just a string, but the fields are in fixed order +#[cfg(target_endian = "big")] +#[repr(C)] +#[derive(Debug)] +pub struct PseudoString { + capacity: usize, + ptr: std::ptr::NonNull, + size: usize, +} - fn cmp_with_str(&self, other: &str) -> Ordering; - fn cmp_with_self(&self, other: &Self) -> Ordering; - fn eq_with_str(&self, other: &str) -> bool; - fn eq_with_self(&self, other: &Self) -> bool; +#[cfg(target_endian = "little")] +#[cfg(not(feature = "lazy_null_pointer_optimizations"))] +#[repr(C)] +//This seems to be the most common arrangement of std::String +//However, with lazy null pointer optimizations, this arrangement does not work +#[derive(Debug)] +pub struct PseudoString { + ptr: std::ptr::NonNull, + capacity: usize, + size: usize, +} - fn len(&self) -> usize { - self.string().len() +#[cfg(target_endian = "little")] +#[cfg(feature = "lazy_null_pointer_optimizations")] +#[repr(C)] +#[derive(Debug)] +pub struct PseudoString { + ptr: std::ptr::NonNull, + size: usize, + capacity: std::num::NonZeroUsize, +} + +impl Deref for PseudoString { + type Target = str; + fn deref(&self) -> &str { + unsafe { + let slice = std::slice::from_raw_parts(self.ptr.as_ptr().cast(), self.size); + std::str::from_utf8_unchecked(slice) + } } } -impl BoxedString for String { - #[inline(always)] - fn string(&self) -> &String { - self +impl DerefMut for PseudoString { + fn deref_mut(&mut self) -> &mut str { + unsafe { + let slice = std::slice::from_raw_parts_mut(self.ptr.as_ptr().cast(), self.size); + std::str::from_utf8_unchecked_mut(slice) + } } +} +impl From for String { #[inline(always)] - fn string_mut(&mut self) -> &mut String { - self + fn from(string: PseudoString) -> Self { + unsafe { + String::from_raw_parts(string.ptr.as_ptr(), string.size, usize::from(string.capacity)) + } } +} + + +#[cfg(feature = "lazy_null_pointer_optimizations")] +unsafe fn to_capacity(size: usize) -> std::num::NonZeroUsize { + std::num::NonZeroUsize::new_unchecked(size) +} + +#[cfg(not(feature = "lazy_null_pointer_optimizations"))] +fn to_capacity(size: usize) -> usize { + size +} - fn into_string(self) -> String { - self +impl BoxedString for PseudoString { + unsafe fn from_string_unchecked(mut string: String) -> Self { + //into_raw_parts is nightly at the time of writing + //In the future the following code should be replaced with + //let (ptr, size, capacity) = string.into_raw_parts(); + let capacity = string.capacity(); + let bytes = string.as_mut_str(); + let ptr = bytes.as_mut_ptr(); + let size = bytes.len(); + std::mem::forget(string); + + Self {ptr: std::ptr::NonNull::new_unchecked(ptr), size, capacity: to_capacity(capacity)} } - #[inline(always)] - fn cmp_with_str(&self, other: &str) -> Ordering { - self.as_str().cmp(other) + fn capacity(&self) -> usize { + usize::from(self.capacity) << 1 >> 1 } +} - #[inline(always)] - fn cmp_with_self(&self, other: &Self) -> Ordering { - self.cmp(other) +#[derive(Debug)] +pub(crate) struct StringReference<'a, Mode: SmartStringMode> { + referrant: &'a mut SmartString, + string: String, +} + +impl<'a, Mode: SmartStringMode> StringReference<'a, Mode> { + //Safety: Discriminant must be boxed + pub(crate) unsafe fn from_smart_unchecked(smart: &'a mut SmartString) -> Self { + debug_assert_eq!(smart.discriminant(), crate::marker_byte::Discriminant::Boxed); + let boxed : Mode::BoxedString = std::mem::transmute_copy(smart); + let string = boxed.into(); + Self { + referrant: smart, + string + } } +} - #[inline(always)] - fn eq_with_str(&self, other: &str) -> bool { - self == other +impl<'a, Mode: SmartStringMode> Drop for StringReference<'a, Mode> { + fn drop(&mut self) { + let string = std::mem::replace(&mut self.string, String::new()); + if (Mode::DEALLOC && string.len() <= Mode::MAX_INLINE) || (!Mode::DEALLOC && cfg!(lazy_null_pointer_optimizations) && string.capacity() == 0) { + let transmuted = (self as *mut Self).cast(); + unsafe { + std::ptr::write(*transmuted, InlineString::::from(string.as_bytes())); + } + } else { + let transmuted = (self as *mut Self).cast(); + unsafe { + std::ptr::write(*transmuted, Mode::BoxedString::from_string_unchecked(string)); + } + } } +} - #[inline(always)] - fn eq_with_self(&self, other: &Self) -> bool { - self == other +impl<'a, Mode: SmartStringMode> Deref for StringReference<'a, Mode> { + type Target = String; + fn deref(&self) -> &String { + &self.string + } +} + +impl<'a, Mode: SmartStringMode> DerefMut for StringReference<'a, Mode> { + fn deref_mut(&mut self) -> &mut String { + &mut self.string } } diff --git a/src/casts.rs b/src/casts.rs index 591360d..7e1ada4 100644 --- a/src/casts.rs +++ b/src/casts.rs @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at http://mozilla.org/MPL/2.0/. -use crate::{inline::InlineString, SmartStringMode}; +use crate::{inline::InlineString, SmartStringMode, boxed::StringReference}; pub(crate) enum StringCast<'a, Mode: SmartStringMode> { Boxed(&'a Mode::BoxedString), @@ -10,7 +10,7 @@ pub(crate) enum StringCast<'a, Mode: SmartStringMode> { } pub(crate) enum StringCastMut<'a, Mode: SmartStringMode> { - Boxed(&'a mut Mode::BoxedString), + Boxed(StringReference<'a, Mode>), Inline(&'a mut InlineString), } @@ -18,3 +18,11 @@ pub(crate) enum StringCastInto { Boxed(Mode::BoxedString), Inline(InlineString), } + +//Same as transmute, except it doesn't check for same size +//This should be replaced when it's possible to constrain the sizes of generic associated types +pub(crate) unsafe fn please_transmute(from: A) -> B { + let ret = std::mem::transmute_copy(&from); + std::mem::forget(from); + ret +} diff --git a/src/config.rs b/src/config.rs index 38705a8..12753cc 100644 --- a/src/config.rs +++ b/src/config.rs @@ -2,9 +2,9 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at http://mozilla.org/MPL/2.0/. -use crate::{boxed::BoxedString, inline::InlineString, SmartString}; +use crate::{boxed::{BoxedString, PseudoString}, inline::InlineString, SmartString}; use static_assertions::{assert_eq_size, const_assert, const_assert_eq}; -use std::mem::{align_of, size_of}; +use std::mem::{align_of, size_of, MaybeUninit}; /// A compact string representation equal to [`String`][String] in size with guaranteed inlining. /// @@ -47,31 +47,115 @@ pub struct LazyCompact; /// [SmartString]: struct.SmartString.html /// [Compact]: struct.Compact.html /// [LazyCompact]: struct.LazyCompact.html -pub trait SmartStringMode { +/// +/// Implementing this trait is extremely unsafe and not recommended +/// The requirements are that: +/// * std::mem::size_of == std::mem::size_of +/// * std::mem::align_of == std::mem::align_of +/// * std::mem::size_of == std::size_of +/// * std::mem::align_of == std::mem::align_of +/// * It should be always safe to transmute from BoxedString into SmartString +/// * The highmost bit of BoxedString must be one +/// * If the highest std::mem::size_of bytes of BoxedString were casted into DiscriminantContainer +/// at any time, even in methods of BoxedString, it must be a valid DiscriminantContainer. +pub unsafe trait SmartStringMode { /// The boxed string type for this layout. - type BoxedString: BoxedString + From; - /// The inline string type for this layout. - type InlineArray: AsRef<[u8]> + AsMut<[u8]> + Clone + Copy; + type BoxedString: BoxedString; /// The maximum capacity of an inline string, in bytes. const MAX_INLINE: usize; /// A constant to decide whether to turn a wrapped string back into an inlined /// string whenever possible (`true`) or leave it as a wrapped string once wrapping /// has occurred (`false`). const DEALLOC: bool; + /// Unfortunately const generics don't exists at the time of writing + /// If DEALLOC == true or cfg!(feature = "lazy_null_pointer_optimizations") == true, this should be std::num::NonZeroUsize, + /// Otherwise it should be PossiblyZeroSize + type DiscriminantContainer: DiscriminantContainer; +} + +/// Contains the discriminant. This is a visible field in the SmartString struct, so the compiler +/// is able to make null pointer optimizations when the type allows them. +pub trait DiscriminantContainer { + /// Returns the full marker byte + fn get_full_marker(&self) -> u8; + /// Return Self with the requirement that the marker is inside + fn new(marker: u8) -> Self; + /// Flip the highest bit of marker + unsafe fn flip_bit(&mut self); +} + +impl DiscriminantContainer for std::num::NonZeroUsize { + fn get_full_marker(&self) -> u8 { + (self.get() >> (std::mem::size_of::() - 1)*8) as u8 + } + fn new(marker: u8) -> Self { + unsafe { + Self::new_unchecked( + ((marker as usize) << (std::mem::size_of::() - 1)*8) + 1 + ) + } + } + unsafe fn flip_bit(&mut self) { + *self = Self::new_unchecked(self.get()); + } +} + +/// A structure that stores a marker and raw data +#[cfg(target_endian = "big")] +#[cfg_attr(target_pointer_width = "64", repr(C, align(8)))] +#[cfg_attr(target_pointer_width = "32", repr(C, align(4)))] +struct PossiblyZeroSize { + marker: u8, + data: [MaybeUninit; std::mem::size_of::() - 1], +} + +/// A structure that stores a marker and raw data +#[cfg(target_endian = "little")] +#[cfg_attr(target_pointer_width = "64", repr(C, align(8)))] +#[cfg_attr(target_pointer_width = "32", repr(C, align(4)))] +#[derive(Debug)] +pub struct PossiblyZeroSize { + data: [MaybeUninit; std::mem::size_of::() - 1], + marker: u8, +} + +impl DiscriminantContainer for PossiblyZeroSize { + fn get_full_marker(&self) -> u8 { + self.marker + } + fn new(marker: u8) -> Self { + Self { + marker, + data: [MaybeUninit::uninit(); std::mem::size_of::() - 1], + } + } + unsafe fn flip_bit(&mut self) { + self.marker^= 128; + } } -impl SmartStringMode for Compact { - type BoxedString = String; - type InlineArray = [u8; size_of::() - 1]; +unsafe impl SmartStringMode for Compact { + type BoxedString = PseudoString; const MAX_INLINE: usize = size_of::() - 1; const DEALLOC: bool = true; + type DiscriminantContainer = std::num::NonZeroUsize; } -impl SmartStringMode for LazyCompact { - type BoxedString = String; - type InlineArray = [u8; size_of::() - 1]; + +#[cfg(not(feature = "lazy_null_pointer_optimizations"))] +unsafe impl SmartStringMode for LazyCompact { + type BoxedString = PseudoString; const MAX_INLINE: usize = size_of::() - 1; const DEALLOC: bool = false; + type DiscriminantContainer = PossiblyZeroSize; +} + +#[cfg(feature = "lazy_null_pointer_optimizations")] +unsafe impl SmartStringMode for LazyCompact { + type BoxedString = PseudoString; + const MAX_INLINE: usize = size_of::() - 1; + const DEALLOC: bool = false; + type DiscriminantContainer = std::num::NonZeroUsize; } // Assert that we're not using more space than we can encode in the header byte, @@ -94,6 +178,10 @@ assert_eq_size!(InlineString, SmartString); assert_eq_size!(String, SmartString); assert_eq_size!(String, SmartString); +assert_eq_size!(SmartString, Option>); +#[cfg(feature = "lazy_null_pointer_optimizations")] +assert_eq_size!(SmartString, Option>); + // Assert that `SmartString` is aligned correctly. const_assert_eq!(align_of::(), align_of::>()); const_assert_eq!(align_of::(), align_of::>()); diff --git a/src/inline.rs b/src/inline.rs index f157165..8a72caf 100644 --- a/src/inline.rs +++ b/src/inline.rs @@ -2,17 +2,29 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at http://mozilla.org/MPL/2.0/. -use crate::{marker_byte::Marker, SmartStringMode}; +use crate::SmartStringMode; use std::{ mem::MaybeUninit, slice::{from_raw_parts, from_raw_parts_mut}, str::{from_utf8_unchecked, from_utf8_unchecked_mut}, }; -#[repr(C)] +#[cfg(target_endian = "big")] +#[cfg_attr(target_pointer_width = "64", repr(C, align(8)))] +#[cfg_attr(target_pointer_width = "32", repr(C, align(4)))] pub(crate) struct InlineString { - pub(crate) marker: Marker, - pub(crate) data: Mode::InlineArray, + pub(crate) marker: u8, + pub(crate) data: [MaybeUninit; 3*std::mem::size_of::() - 1], + phantom: std::marker::PhantomData<*const Mode>, +} + +#[cfg(target_endian = "little")] +#[cfg_attr(target_pointer_width = "64", repr(C, align(8)))] +#[cfg_attr(target_pointer_width = "32", repr(C, align(4)))] +pub(crate) struct InlineString { + pub(crate) data: [MaybeUninit; 3*std::mem::size_of::() - 1], + pub(crate) marker: u8, + phantom: std::marker::PhantomData<*const Mode>, } impl Clone for InlineString { @@ -25,90 +37,107 @@ impl Copy for InlineString {} impl InlineString { pub(crate) fn new() -> Self { - Self { - marker: Marker::new_inline(0), - data: unsafe { MaybeUninit::zeroed().assume_init() }, + let mut ret = Self { + marker: 128, + data: [MaybeUninit::uninit(); 3*std::mem::size_of::() - 1], + phantom: std::marker::PhantomData, + }; + //Are nullptr optimizations on? + if Mode::DEALLOC || cfg!(lazy_null_pointer_optimizations) { + //Initialize the 7 first or last bytes of data + for j in 0..(std::mem::size_of::() - 1) { + #[cfg(target_endian = "little")] + let j = 3*std::mem::size_of::() - 3 - j; + + ret.data[j] = MaybeUninit::zeroed(); + } } + ret } - pub(crate) fn set_size(&mut self, size: usize) { - self.marker.set_data(size as u8); + //len must be less than Mode::MAX_INLINE + //If growing, the newly avaliable bytes should be visible + pub(crate) unsafe fn set_len(&mut self, len: usize) { + debug_assert!(len <= Mode::MAX_INLINE); + self.marker = 128 | len as u8 } pub(crate) fn len(&self) -> usize { - let len = self.marker.data() as usize; - // Panic immediately if inline length is too high, which suggests - // assumptions made about `String`'s memory layout are invalid. - assert!(len <= Mode::MAX_INLINE); + let len = self.marker as usize & 127; + debug_assert!(len <= Mode::MAX_INLINE); len } - pub(crate) fn as_slice(&self) -> &[u8] { - self.data.as_ref() - } - - pub(crate) fn as_mut_slice(&mut self) -> &mut [u8] { + //Caller is responsible for keeping the utf-8 encoded string working + pub(crate) unsafe fn as_mut_slice(&mut self) -> &mut [MaybeUninit] { self.data.as_mut() } pub(crate) fn as_str(&self) -> &str { unsafe { - let data = from_raw_parts(self.data.as_ref().as_ptr(), self.len()); + let data = from_raw_parts(self.data.as_ref().as_ptr().cast(), self.len()); from_utf8_unchecked(data) } } pub(crate) fn as_mut_str(&mut self) -> &mut str { unsafe { - let data = from_raw_parts_mut(self.data.as_mut().as_mut_ptr(), self.len()); + let data = from_raw_parts_mut(self.data.as_mut().as_mut_ptr().cast(), self.len()); from_utf8_unchecked_mut(data) } } - pub(crate) fn insert_bytes(&mut self, index: usize, bytes: &[u8]) { - assert!(self.as_str().is_char_boundary(index)); + //Very unsafe: Caller needs to ensure that the string stays properly encoded + //and that the string doesn't overflow + pub(crate) unsafe fn insert_bytes(&mut self, index: usize, bytes: &[u8]) { + debug_assert!(self.as_str().is_char_boundary(index)); + debug_assert!(bytes.len() + self.len() <= Mode::MAX_INLINE); + debug_assert!(std::str::from_utf8(bytes).is_ok()); + if bytes.is_empty() { return; } let len = self.len(); - unsafe { - let ptr = self.data.as_mut().as_mut_ptr(); - if index != len { - ptr.add(index + bytes.len()) - .copy_from(&self.data.as_ref()[index], len - index); - } - ptr.add(index) - .copy_from_nonoverlapping(bytes.as_ptr(), bytes.len()); + let ptr = self.data.as_mut_ptr(); + if index != len { + ptr.add(index + bytes.len()) + .copy_from(self.data.as_ptr().add(index), len - index); } - self.set_size(len + bytes.len()); + ptr.add(index) + .copy_from_nonoverlapping(bytes.as_ptr().cast(), bytes.len()); + self.set_len(len + bytes.len()); } - pub(crate) fn remove_bytes(&mut self, start: usize, end: usize) { + //Caller needs to ensure that the string stays properly encoded + pub(crate) unsafe fn remove_bytes(&mut self, start: usize, end: usize) { let len = self.len(); - assert!(start <= end); - assert!(end <= len); - assert!(self.as_str().is_char_boundary(start)); - assert!(self.as_str().is_char_boundary(end)); + debug_assert!(start <= end); + debug_assert!(end <= len); + debug_assert!(self.as_str().is_char_boundary(start)); + debug_assert!(self.as_str().is_char_boundary(end)); if start == end { return; } if end < len { - unsafe { - let ptr = self.data.as_mut().as_mut_ptr(); - ptr.add(start).copy_from(ptr.add(end), len - end); - } + let ptr = self.data.as_mut_ptr(); + ptr.add(start).copy_from(ptr.add(end), len - end); } - self.set_size(len - (end - start)); + self.set_len(len - (end - start)); } } impl From<&'_ [u8]> for InlineString { fn from(bytes: &[u8]) -> Self { let len = bytes.len(); - debug_assert!(len <= Mode::MAX_INLINE); + assert!(len <= Mode::MAX_INLINE); + assert!(std::str::from_utf8(bytes).is_ok()); let mut out = Self::new(); - out.marker = Marker::new_inline(len as u8); - out.data.as_mut()[..len].copy_from_slice(bytes); + for i in 0..len { + out.data[i] = MaybeUninit::new(bytes[i]); + } + unsafe { + out.set_len(len); + } out } } diff --git a/src/iter.rs b/src/iter.rs index 3c8304d..96291ba 100644 --- a/src/iter.rs +++ b/src/iter.rs @@ -1,124 +1,48 @@ -use crate::{bounds_for, boxed::BoxedString, inline::InlineString, SmartString, SmartStringMode}; +use crate::{SmartString, SmartStringMode, casts::StringCastMut}; use std::{ fmt::{Debug, Error, Formatter}, iter::FusedIterator, - ops::RangeBounds, - str::Chars, - string::Drain as StringDrain, + ops::{RangeBounds, Bound}, }; /// A draining iterator for a [`SmartString`][SmartString]. /// /// [SmartString]: struct.SmartString.html -pub struct Drain<'a, Mode>(DrainCast<'a, Mode>) -where - Mode: SmartStringMode; - -enum DrainCast<'a, Mode> -where - Mode: SmartStringMode, -{ - Boxed { - string: *mut SmartString, - iter: Option>, - }, - Inline { - string: *mut InlineString, - start: usize, - end: usize, - iter: Chars<'a>, - }, -} - -impl<'a, Mode> Drain<'a, Mode> -where - Mode: SmartStringMode, -{ - pub(crate) fn new(string: &'a mut SmartString, range: R) -> Self - where - R: RangeBounds, - { - let string_ptr: *mut _ = string; - Drain(match string.cast_mut() { - crate::casts::StringCastMut::Boxed(boxed) => DrainCast::Boxed { - string: string_ptr, - iter: Some(boxed.string_mut().drain(range)), - }, - crate::casts::StringCastMut::Inline(inline) => { - let len = inline.len(); - let (start, end) = bounds_for(&range, len); - let string_ptr: *mut _ = inline; - let iter = inline.as_str()[start..end].chars(); - DrainCast::Inline { - string: string_ptr, - start, - end, - iter, - } - } - }) - } +pub struct Drain<'a, Mode: SmartStringMode> { + string: *mut SmartString, + iterator: std::str::Chars<'a>, + start: usize, + end: usize, } -impl<'a, Mode> Drop for Drain<'a, Mode> -where - Mode: SmartStringMode, -{ - fn drop(&mut self) { - match self.0 { - DrainCast::Boxed { - string, - ref mut iter, - } => unsafe { - iter.take(); - (*string).try_demote(); - }, - DrainCast::Inline { - string, start, end, .. - } => { - unsafe { (*string).remove_bytes(start, end) }; - } +impl<'a, Mode: SmartStringMode> Drain<'a, Mode> { + /// Creates a new draining iterator for a [`SmartString`][SmartString]. + /// + /// [SmartString]: struct.SmartString.html + pub fn new>(from: &'a mut SmartString, range: R) -> Self { + let start = match range.start_bound() { + Bound::Included(x) => *x, + Bound::Unbounded => 0, + Bound::Excluded(x) => x.checked_add(1).unwrap(), + }; + let end = match range.end_bound() { + Bound::Excluded(x) => *x, + Bound::Unbounded => from.len(), + Bound::Included(x) => x.checked_add(1).unwrap(), + }; + Self { + string: from, + iterator: from.as_str()[start..end].chars(), + start, + end, } } } -impl<'a, Mode> Iterator for Drain<'a, Mode> -where - Mode: SmartStringMode, -{ +impl<'a, Mode: SmartStringMode> Iterator for Drain<'a, Mode> { type Item = char; - - #[inline] fn next(&mut self) -> Option { - match &mut self.0 { - DrainCast::Boxed { - iter: Some(iter), .. - } => iter.next(), - DrainCast::Boxed { iter: None, .. } => unreachable!(), - DrainCast::Inline { iter, .. } => iter.next(), - } - } - - #[inline] - fn size_hint(&self) -> (usize, Option) { - match &self.0 { - DrainCast::Boxed { - iter: Some(iter), .. - } => iter.size_hint(), - DrainCast::Boxed { iter: None, .. } => unreachable!(), - DrainCast::Inline { iter, .. } => iter.size_hint(), - } - } - - #[inline] - fn last(mut self) -> Option { - match &mut self.0 { - DrainCast::Boxed { - iter: Some(iter), .. - } => iter.next_back(), - DrainCast::Boxed { iter: None, .. } => unreachable!(), - DrainCast::Inline { iter, .. } => iter.next_back(), - } + self.iterator.next() } } @@ -128,13 +52,7 @@ where { #[inline] fn next_back(&mut self) -> Option { - match &mut self.0 { - DrainCast::Boxed { - iter: Some(iter), .. - } => iter.next_back(), - DrainCast::Boxed { iter: None, .. } => unreachable!(), - DrainCast::Inline { iter, .. } => iter.next_back(), - } + self.iterator.next_back() } } @@ -148,3 +66,18 @@ where f.pad("Drain { ... }") } } + +impl<'a, Mode> Drop for Drain<'a, Mode> +where + Mode: SmartStringMode, +{ + fn drop(&mut self) { + //We must first replace the iterator with a dummy one, so it won't dangle + self.iterator = "".chars(); + //Now we can safely clear the string + unsafe { match (*self.string).cast_mut() { + StringCastMut::Boxed(mut string) => {string.drain(self.start..self.end);}, + StringCastMut::Inline(string) => string.remove_bytes(self.start, self.end), + }} + } +} diff --git a/src/lib.rs b/src/lib.rs index 2321b06..765d438 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -42,24 +42,16 @@ //! ## Give Me The Details //! //! [`SmartString`][SmartString] is the same size as [`String`][String] and -//! relies on pointer alignment to be able to store a discriminant bit in its -//! inline form that will never be present in its [`String`][String] form, thus -//! giving us 24 bytes (on 64-bit architectures) minus one bit to encode our +//! relies on restrictions on allocation size to be able to store a discriminant bit in its +//! inline form. More specifically the length and capacity of a string must fit in a isize, +//! which means that the most significant bit of those fields will always be zero. This +//! gives us 24 bytes (on 64-bit architectures) minus one bit to encode our //! inline string. It uses 23 bytes to store the string data and the remaining //! 7 bits to encode the string's length. When the available space is exceeded, //! it swaps itself out with a [`String`][String] containing its previous //! contents. Likewise, if the string's length should drop below its inline //! capacity again, it deallocates the string and moves its contents inline. //! -//! Given that we use the knowledge that a certain bit in the memory layout -//! of [`String`][String] will always be unset as a discriminant, you would be -//! able to call [`std::mem::transmute::()`][transmute] on a boxed -//! smart string and start using it as a normal [`String`][String] immediately - -//! there's no pointer tagging or similar trickery going on here. -//! (But please don't do that, there's an efficient [`Into`][IntoString] -//! implementation that does the exact same thing with no need to go unsafe -//! in your own code.) -//! //! It is aggressive about inlining strings, meaning that if you modify a heap allocated //! string such that it becomes short enough for inlining, it will be inlined immediately //! and the allocated [`String`][String] will be dropped. This may cause multiple @@ -72,7 +64,8 @@ //! it never re-inlines a string that's already been heap allocated, instead //! keeping the allocation around in case it needs it. This makes for less //! cache local strings, but is the best choice if you're more worried about -//! time spent on unnecessary allocations than cache locality. +//! time spent on unnecessary allocations than cache locality. By default +//! LazyCompact //! //! ## Performance //! @@ -83,6 +76,14 @@ //! memory efficient in these cases. There will always be a slight overhead on all //! operations on boxed strings, compared to [`String`][String]. //! +//! ## Null pointer optimizations +//! +//! By default, null pointer optimizations are enabled for SmartString, +//! so size_of == size_of>. +//! It's possible to disable them for SmartString, for a very small performance gain, +//! as it usually means that the bytes of SmartString can be reinterpreted as a String. +//! To do this, disable the feature flag lazy_null_pointer_optimizations +//! //! ## Feature Flags //! //! `smartstring` comes with optional support for the following crates through Cargo @@ -125,20 +126,19 @@ use std::{ fmt::{Debug, Error, Formatter, Write}, hash::{Hash, Hasher}, iter::FromIterator, - mem::{forget, MaybeUninit}, + mem::MaybeUninit, ops::{ Add, Bound, Deref, DerefMut, Index, IndexMut, Range, RangeBounds, RangeFrom, RangeFull, RangeInclusive, RangeTo, RangeToInclusive, }, - ptr::drop_in_place, str::FromStr, }; mod config; -pub use config::{Compact, LazyCompact, SmartStringMode}; +pub use config::{Compact, LazyCompact, SmartStringMode, DiscriminantContainer, PossiblyZeroSize}; mod marker_byte; -use marker_byte::{Discriminant, Marker}; +use marker_byte::Discriminant; mod inline; use inline::InlineString; @@ -147,7 +147,7 @@ mod boxed; use boxed::BoxedString; mod casts; -use casts::{StringCast, StringCastInto, StringCastMut}; +use casts::{StringCast, StringCastInto, StringCastMut, please_transmute}; mod iter; pub use iter::Drain; @@ -201,16 +201,51 @@ pub mod alias { /// [Compact]: struct.Compact.html /// [LazyCompact]: struct.LazyCompact.html /// [String]: https://doc.rust-lang.org/std/string/struct.String.html + +#[cfg(target_endian = "big")] #[cfg_attr(target_pointer_width = "64", repr(C, align(8)))] #[cfg_attr(target_pointer_width = "32", repr(C, align(4)))] pub struct SmartString { - data: MaybeUninit>, + disc: Mode::DiscriminantContainer, + data: [MaybeUninit; 2*std::mem::size_of::()], +} + +//Documentation needs to be duplicated for + +/// A smart string. +/// +/// This wraps one of two string types: an inline string or a boxed string. +/// Conversion between the two happens opportunistically and transparently. +/// +/// It takes a layout as its type argument: one of [`Compact`][Compact] or [`LazyCompact`][LazyCompact]. +/// +/// It mimics the interface of [`String`][String] except where behaviour cannot +/// be guaranteed to stay consistent between its boxed and inline states. This means +/// you still have `capacity()` and `shrink_to_fit()`, relating to state that only +/// really exists in the boxed variant, because the inline variant can still give +/// sensible behaviour for these operations, but `with_capacity()`, `reserve()` etc are +/// absent, because they would have no effect on inline strings and the requested +/// state changes wouldn't carry over if the inline string is promoted to a boxed +/// one - not without also storing that state in the inline representation, which +/// would waste precious bytes for inline string data. +/// +/// [SmartString]: struct.SmartString.html +/// [Compact]: struct.Compact.html +/// [LazyCompact]: struct.LazyCompact.html +/// [String]: https://doc.rust-lang.org/std/string/struct.String.html + +#[cfg(target_endian = "little")] +#[cfg_attr(target_pointer_width = "64", repr(C, align(8)))] +#[cfg_attr(target_pointer_width = "32", repr(C, align(4)))] +pub struct SmartString { + data: [MaybeUninit; 2*std::mem::size_of::()], + disc: Mode::DiscriminantContainer, } impl Drop for SmartString { fn drop(&mut self) { - if let StringCastMut::Boxed(string) = self.cast_mut() { - unsafe { drop_in_place(string) }; + if let StringCastMut::Boxed(mut string) = self.cast_mut() { + string.clear(); } } } @@ -218,14 +253,14 @@ impl Drop for SmartString { impl Clone for SmartString { /// Clone a `SmartString`. /// - /// If the string is inlined, this is a [`Copy`][Copy] operation. Otherwise, + /// If the string is small enough to fit inline, this is a [`Copy`][Copy] operation. Otherwise, /// [`String::clone()`][String::clone] is invoked. /// /// [String::clone]: https://doc.rust-lang.org/std/string/struct.String.html#impl-Clone /// [Copy]: https://doc.rust-lang.org/std/marker/trait.Copy.html fn clone(&self) -> Self { match self.cast() { - StringCast::Boxed(string) => Self::from_boxed(string.string().clone().into()), + StringCast::Boxed(_) => Self::from(self.deref()), StringCast::Inline(string) => Self::from_inline(*string), } } @@ -237,7 +272,8 @@ impl Deref for SmartString { #[inline(always)] fn deref(&self) -> &Self::Target { match self.cast() { - StringCast::Boxed(string) => string.string().as_str(), + StringCast::Boxed(string) => string, + //Todo: implement deref for inline string StringCast::Inline(string) => string.as_str(), } } @@ -246,10 +282,16 @@ impl Deref for SmartString { impl DerefMut for SmartString { #[inline(always)] fn deref_mut(&mut self) -> &mut Self::Target { - match self.cast_mut() { - StringCastMut::Boxed(string) => string.string_mut().as_mut_str(), - StringCastMut::Inline(string) => string.as_mut_str(), - } + unsafe { match self.discriminant() { + Discriminant::Boxed => { + let boxed : &mut Mode::BoxedString = &mut *(self as *mut Self).cast(); + boxed + } + Discriminant::Inline => { + let inline : &mut InlineString = &mut *(self as *mut Self).cast(); + inline.as_mut_str() + } + }} } } @@ -261,60 +303,57 @@ impl SmartString { } fn from_boxed(boxed: Mode::BoxedString) -> Self { - let mut out = Self { - data: MaybeUninit::uninit(), - }; - let data_ptr: *mut Mode::BoxedString = out.data.as_mut_ptr().cast(); - unsafe { data_ptr.write(boxed) }; - out + unsafe { + please_transmute(boxed) + } } fn from_inline(inline: InlineString) -> Self { - let mut out = Self { - data: MaybeUninit::uninit(), + let ret = unsafe { + please_transmute(inline) }; - unsafe { out.data.as_mut_ptr().write(inline) }; - out + ret } fn discriminant(&self) -> Discriminant { - let ptr: *const Marker = self.data.as_ptr().cast(); - unsafe { *ptr }.discriminant() + match self.disc.get_full_marker() & 128 != 0 { + true => Discriminant::Inline, + false => Discriminant::Boxed, + } } fn cast(&self) -> StringCast<'_, Mode> { match self.discriminant() { - Discriminant::Inline => StringCast::Inline(unsafe { &*self.data.as_ptr() }), - Discriminant::Boxed => StringCast::Boxed(unsafe { &*self.data.as_ptr().cast() }), + Discriminant::Inline => StringCast::Inline(unsafe { &*(self as *const Self).cast() }), + Discriminant::Boxed => StringCast::Boxed(unsafe { &*(self as *const Self).cast() }), } } fn cast_mut(&mut self) -> StringCastMut<'_, Mode> { match self.discriminant() { - Discriminant::Inline => StringCastMut::Inline(unsafe { &mut *self.data.as_mut_ptr() }), + Discriminant::Inline => StringCastMut::Inline(unsafe { &mut *(self as *mut Self).cast() }), Discriminant::Boxed => { - StringCastMut::Boxed(unsafe { &mut *self.data.as_mut_ptr().cast() }) + StringCastMut::Boxed( unsafe { + boxed::StringReference::from_smart_unchecked(self) + }) } } } - fn cast_into(mut self) -> StringCastInto { + fn cast_into(self) -> StringCastInto { match self.discriminant() { - Discriminant::Inline => StringCastInto::Inline(unsafe { self.data.assume_init() }), - Discriminant::Boxed => StringCastInto::Boxed(unsafe { - let boxed_ptr: *mut Mode::BoxedString = self.data.as_mut_ptr().cast(); - let string = boxed_ptr.read(); - forget(self); - string - }), + Discriminant::Inline => StringCastInto::Inline(unsafe { please_transmute(self) }), + Discriminant::Boxed => StringCastInto::Boxed(unsafe { please_transmute(self) }), } } - fn promote_from(&mut self, string: String) { + //Unsafe: if nullptr optimizations are on, string must have nonzero size or capacity + //depending on the laziness + unsafe fn promote_from(&mut self, string: String) { debug_assert!(self.discriminant() == Discriminant::Inline); - let string: Mode::BoxedString = string.into(); - let data: *mut Mode::BoxedString = self.data.as_mut_ptr().cast(); - unsafe { data.write(string) }; + let ptr = (self as *mut Self).cast(); + //Use ptr::write to avoid redundant dropping + std::ptr::write(ptr, boxed::PseudoString::from_string_unchecked(string)); } /// Attempt to inline the string if it's currently heap allocated. @@ -330,21 +369,20 @@ impl SmartString { /// Attempt to inline the string regardless of whether `Mode::DEALLOC` is set. fn really_try_demote(&mut self) -> bool { - if let StringCastMut::Boxed(string) = self.cast_mut() { + let inlined = if let StringCastMut::Boxed(string) = self.cast_mut() { if string.len() > Mode::MAX_INLINE { - false + return false; } else { - let inlined = string.string().as_bytes().into(); - unsafe { - drop_in_place(string); - let data = &mut self.data.as_mut_ptr(); - data.write(inlined); - } - true + let deref: &str = string.deref(); + Self::from(deref) } } else { - true - } + return true; + }; + std::mem::forget( + std::mem::replace(self, inlined) + ); + true } /// Return the length in bytes of the string. @@ -379,38 +417,57 @@ impl SmartString { /// Push a character to the end of the string. pub fn push(&mut self, ch: char) { - match self.cast_mut() { - StringCastMut::Boxed(string) => string.string_mut().push(ch), + let promote = match self.cast_mut() { + StringCastMut::Boxed(mut string) => { + string.push(ch); + return; + } StringCastMut::Inline(string) => { let len = string.len(); - if len + ch.len_utf8() > Mode::MAX_INLINE { - let mut string = self.to_string(); + let chlen = ch.len_utf8(); + if len + chlen > Mode::MAX_INLINE { + let mut string = string.as_str().to_string(); string.push(ch); - self.promote_from(string); + string } else { - let written = ch.encode_utf8(&mut string.as_mut_slice()[len..]).len(); - string.set_size(len + written); + for e in &mut string.data[len..(len + chlen)] { + //These have to be initialized, as we are passing u8:s into encode_utf8 + *e = std::mem::MaybeUninit::new(0); + } + let written = ch.encode_utf8( unsafe { + std::mem::transmute(&mut string.data[len..(len + chlen)]) + }).len(); + unsafe { string.set_len(len + written) }; + return; } } - } + }; + unsafe {self.promote_from(promote)}; } /// Copy a string slice onto the end of the string. pub fn push_str(&mut self, string: &str) { let len = self.len(); - match self.cast_mut() { - StringCastMut::Boxed(this) => this.string_mut().push_str(string), + let promote = match self.cast_mut() { + StringCastMut::Boxed(mut this) => { + this.push_str(string); + return; + } StringCastMut::Inline(this) => { if len + string.len() > Mode::MAX_INLINE { - let mut this = self.to_string(); + let mut this = this.as_str().to_string(); this.push_str(string); - self.promote_from(this); + this } else { - this.as_mut_slice()[len..len + string.len()].copy_from_slice(string.as_bytes()); - this.set_size(len + string.len()); + unsafe { + this.as_mut_slice()[len..len + string.len()].copy_from_slice(std::mem::transmute(string.as_bytes())); + this.set_len(len + string.len()); + } + return } } - } + }; + unsafe {self.promote_from(promote)}; } /// Return the currently allocated capacity of the string. @@ -426,7 +483,7 @@ impl SmartString { /// [String::capacity]: https://doc.rust-lang.org/std/string/struct.String.html#method.capacity pub fn capacity(&self) -> usize { if let StringCast::Boxed(string) = self.cast() { - string.string().capacity() + string.capacity() } else { Mode::MAX_INLINE } @@ -445,9 +502,10 @@ impl SmartString { /// [capacity]: struct.SmartString.html#method.capacity /// [len]: struct.SmartString.html#method.len pub fn shrink_to_fit(&mut self) { - if let StringCastMut::Boxed(string) = self.cast_mut() { + if let StringCastMut::Boxed(mut string) = self.cast_mut() { if string.len() > Mode::MAX_INLINE { - string.string_mut().shrink_to_fit(); + string.shrink_to_fit(); + return; } } self.really_try_demote(); @@ -459,11 +517,13 @@ impl SmartString { /// If `new_len` isn't on a UTF-8 character boundary, this method panics. pub fn truncate(&mut self, new_len: usize) { match self.cast_mut() { - StringCastMut::Boxed(string) => string.string_mut().truncate(new_len), + StringCastMut::Boxed(mut string) => string.truncate(new_len), StringCastMut::Inline(string) => { if new_len < string.len() { assert!(string.as_str().is_char_boundary(new_len)); - string.set_size(new_len); + unsafe { + string.set_len(new_len) + }; } return; } @@ -474,10 +534,10 @@ impl SmartString { /// Pop a `char` off the end of the string. pub fn pop(&mut self) -> Option { let result = match self.cast_mut() { - StringCastMut::Boxed(string) => string.string_mut().pop()?, + StringCastMut::Boxed(mut string) => string.pop()?, StringCastMut::Inline(string) => { let ch = string.as_str().chars().rev().next()?; - string.set_size(string.len() - ch.len_utf8()); + unsafe {string.set_len(string.len() - ch.len_utf8());} return Some(ch); } }; @@ -490,7 +550,7 @@ impl SmartString { /// If the index doesn't fall on a UTF-8 character boundary, this method panics. pub fn remove(&mut self, index: usize) -> char { let result = match self.cast_mut() { - StringCastMut::Boxed(string) => string.string_mut().remove(index), + StringCastMut::Boxed(mut string) => string.remove(index), StringCastMut::Inline(string) => { let ch = match string.as_str()[index..].chars().next() { Some(ch) => ch, @@ -499,10 +559,9 @@ impl SmartString { let next = index + ch.len_utf8(); let len = string.len(); unsafe { - (&mut string.as_mut_slice()[index] as *mut u8) - .copy_from(&string.as_slice()[next], len - next); + string.data[index].as_mut_ptr().copy_from(string.data[next].as_ptr(), len - next); + string.set_len(len - (next - index)); } - string.set_size(len - (next - index)); return ch; } }; @@ -514,40 +573,54 @@ impl SmartString { /// /// If the index doesn't fall on a UTF-8 character boundary, this method panics. pub fn insert(&mut self, index: usize, ch: char) { - match self.cast_mut() { - StringCastMut::Boxed(string) => { - string.string_mut().insert(index, ch); + let promote = match self.cast_mut() { + StringCastMut::Boxed(mut string) => { + string.insert(index, ch); + return; } StringCastMut::Inline(string) if string.len() + ch.len_utf8() <= Mode::MAX_INLINE => { + if !string.as_str().is_char_boundary(index) { + panic!(); + } let mut buffer = [0; 4]; let buffer = ch.encode_utf8(&mut buffer).as_bytes(); - string.insert_bytes(index, buffer); + unsafe { + string.insert_bytes(index, buffer); + } + return; } - _ => { - let mut string = self.to_string(); + StringCastMut::Inline(string) => { + let mut string = string.as_str().to_string(); string.insert(index, ch); - self.promote_from(string); + string } - } + }; + unsafe {self.promote_from(promote)}; } /// Insert a string slice into the string at the given index. /// /// If the index doesn't fall on a UTF-8 character boundary, this method panics. pub fn insert_str(&mut self, index: usize, string: &str) { - match self.cast_mut() { - StringCastMut::Boxed(this) => { - this.string_mut().insert_str(index, string); + let promote = match self.cast_mut() { + StringCastMut::Boxed(mut this) => { + this.insert_str(index, string); + return; } StringCastMut::Inline(this) if this.len() + string.len() <= Mode::MAX_INLINE => { - this.insert_bytes(index, string.as_bytes()); - } - _ => { - let mut this = self.to_string(); + if !this.as_str().is_char_boundary(index) { + panic!(); + } + unsafe {this.insert_bytes(index, string.as_bytes())}; + return; + }, + StringCastMut::Inline(this) => { + let mut this = this.as_str().to_string(); this.insert_str(index, string); - self.promote_from(this); + this } - } + }; + unsafe {self.promote_from(promote)}; } /// Split the string into two at the given index. @@ -558,12 +631,14 @@ impl SmartString { /// If the index doesn't fall on a UTF-8 character boundary, this method panics. pub fn split_off(&mut self, index: usize) -> Self { let result = match self.cast_mut() { - StringCastMut::Boxed(string) => string.string_mut().split_off(index), + StringCastMut::Boxed(mut string) => string.split_off(index), StringCastMut::Inline(string) => { let s = string.as_str(); assert!(s.is_char_boundary(index)); let result = s[index..].into(); - string.set_size(index); + unsafe { + string.set_len(index); + } return result; } }; @@ -584,9 +659,7 @@ impl SmartString { F: FnMut(char) -> bool, { match self.cast_mut() { - StringCastMut::Boxed(string) => { - string.string_mut().retain(f); - } + StringCastMut::Boxed(mut string) => string.retain(f), StringCastMut::Inline(string) => { let len = string.len(); let mut del_bytes = 0; @@ -606,13 +679,17 @@ impl SmartString { if !f(ch) { del_bytes += ch_len; } else if del_bytes > 0 { - let ptr = string.as_mut_slice().as_mut_ptr(); - unsafe { ptr.add(index - del_bytes).copy_from(ptr.add(index), ch_len) }; + unsafe { + let ptr = string.as_mut_slice().as_mut_ptr(); + ptr.add(index - del_bytes).copy_from(ptr.add(index), ch_len); + }; } index += ch_len; } if del_bytes > 0 { - string.set_size(len - del_bytes); + unsafe { + string.set_len(len - del_bytes); + } } return; } @@ -636,10 +713,11 @@ impl SmartString { where R: RangeBounds, { - match self.cast_mut() { - StringCastMut::Boxed(string) => { - string.string_mut().replace_range(range, replace_with); - } + let promote = match self.cast_mut() { + StringCastMut::Boxed(mut string) => { + string.replace_range(range, replace_with); + return; + }, StringCastMut::Inline(string) => { let len = string.len(); let (start, end) = bounds_for(&range, len); @@ -652,22 +730,24 @@ impl SmartString { if (len - replaced_len) + replace_len > Mode::MAX_INLINE { let mut string = string.as_str().to_string(); string.replace_range(range, replace_with); - self.promote_from(string); + string } else { let new_end = start + replace_len; let end_size = len - end; - let ptr = string.as_mut_slice().as_mut_ptr(); unsafe { + let ptr = string.as_mut_slice().as_mut_ptr(); ptr.add(end).copy_to(ptr.add(new_end), end_size); ptr.add(start) - .copy_from(replace_with.as_bytes().as_ptr(), replace_len); + .copy_from(replace_with.as_bytes().as_ptr().cast(), replace_len); + string.set_len(start + replace_len + end_size); } - string.set_size(start + replace_len + end_size); + return; } - return; } + }; + unsafe { + self.promote_from(promote); } - self.try_demote(); } } @@ -805,7 +885,9 @@ impl IndexMut> for SmartString From<&'_ str> for SmartString { fn from(string: &'_ str) -> Self { if string.len() > Mode::MAX_INLINE { - Self::from_boxed(string.to_string().into()) + Self::from_boxed(unsafe { + Mode::BoxedString::from_string_unchecked(string.to_string()) + }) } else { Self::from_inline(string.as_bytes().into()) } @@ -815,7 +897,9 @@ impl From<&'_ str> for SmartString { impl From<&'_ String> for SmartString { fn from(string: &'_ String) -> Self { if string.len() > Mode::MAX_INLINE { - Self::from_boxed(string.clone().into()) + Self::from_boxed(unsafe { + Mode::BoxedString::from_string_unchecked(string.clone()) + }) } else { Self::from_inline(string.as_bytes().into()) } @@ -825,7 +909,9 @@ impl From<&'_ String> for SmartString { impl From for SmartString { fn from(string: String) -> Self { if string.len() > Mode::MAX_INLINE { - Self::from_boxed(string.into()) + Self::from_boxed(unsafe { + Mode::BoxedString::from_string_unchecked(string) + }) } else { Self::from_inline(string.as_bytes().into()) } @@ -1009,7 +1095,7 @@ impl Into for SmartString { /// [String]: https://doc.rust-lang.org/std/string/struct.String.html fn into(self) -> String { match self.cast_into() { - StringCastInto::Boxed(string) => string.into_string(), + StringCastInto::Boxed(string) => string.into(), StringCastInto::Inline(string) => string.as_str().to_string(), } } diff --git a/src/marker_byte.rs b/src/marker_byte.rs index 4bc58af..14d15f8 100644 --- a/src/marker_byte.rs +++ b/src/marker_byte.rs @@ -2,77 +2,8 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at http://mozilla.org/MPL/2.0/. -// We need to know the endianness of the platform to know how to deal with pointers. -#[cfg(target_endian = "big")] -const UPSIDE_DOWN_LAND: bool = false; -#[cfg(target_endian = "little")] -const UPSIDE_DOWN_LAND: bool = true; - #[derive(Clone, Copy, PartialEq, Eq, Debug)] pub(crate) enum Discriminant { Boxed, Inline, } - -impl Discriminant { - #[inline(always)] - fn from_bit(bit: bool) -> Self { - if bit { - Self::Inline - } else { - Self::Boxed - } - } - - #[inline(always)] - fn bit(self) -> u8 { - match self { - Self::Boxed => 0, - Self::Inline => 1, - } - } -} - -#[derive(Clone, Copy, Debug)] -pub(crate) struct Marker(u8); - -impl Marker { - #[inline(always)] - fn assemble(discriminant: Discriminant, data: u8) -> u8 { - let data = data; - if UPSIDE_DOWN_LAND { - data << 1 | discriminant.bit() - } else { - discriminant.bit() << 7 | data - } - } - - pub(crate) fn new_inline(data: u8) -> Self { - debug_assert!(data < 0x80); - Self(Self::assemble(Discriminant::Inline, data)) - } - - #[inline(always)] - pub(crate) fn discriminant(self) -> Discriminant { - Discriminant::from_bit(if UPSIDE_DOWN_LAND { - self.0 & 0x01 != 0 - } else { - self.0 & 0x80 != 0 - }) - } - - #[inline(always)] - pub(crate) fn data(self) -> u8 { - if UPSIDE_DOWN_LAND { - self.0 >> 1 - } else { - self.0 & 0x7f - } - } - - #[inline(always)] - pub(crate) fn set_data(&mut self, byte: u8) { - debug_assert!(byte < 0x80); - self.0 = Self::assemble(self.discriminant(), byte); - } -} From 99732c6f43d5d6e367f2ba84a6175df15db50333 Mon Sep 17 00:00:00 2001 From: AnttiP Date: Wed, 8 Jul 2020 23:57:28 +0300 Subject: [PATCH 2/8] Removed redundant try_demote() --- src/boxed.rs | 4 ++-- src/lib.rs | 23 +++++------------------ 2 files changed, 7 insertions(+), 20 deletions(-) diff --git a/src/boxed.rs b/src/boxed.rs index ac452e5..094cb81 100644 --- a/src/boxed.rs +++ b/src/boxed.rs @@ -6,7 +6,7 @@ use std::ops::{Deref, DerefMut}; use crate::{SmartStringMode, SmartString, inline::InlineString}; pub trait BoxedString: Deref + DerefMut + Into { - //This is unsafe when null pointer optimizations are used + //This is unsafe when null pointer optimizations are used with LazyCompact //Then, it is unsound if the capacity of the string is 0 unsafe fn from_string_unchecked(string: String) -> Self; fn capacity(&self) -> usize; @@ -25,9 +25,9 @@ pub struct PseudoString { #[cfg(target_endian = "little")] #[cfg(not(feature = "lazy_null_pointer_optimizations"))] #[repr(C)] +#[derive(Debug)] //This seems to be the most common arrangement of std::String //However, with lazy null pointer optimizations, this arrangement does not work -#[derive(Debug)] pub struct PseudoString { ptr: std::ptr::NonNull, capacity: usize, diff --git a/src/lib.rs b/src/lib.rs index 765d438..7413bfb 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -210,7 +210,7 @@ pub struct SmartString { data: [MaybeUninit; 2*std::mem::size_of::()], } -//Documentation needs to be duplicated for +//Documentation needs to be duplicated /// A smart string. /// @@ -356,17 +356,6 @@ impl SmartString { std::ptr::write(ptr, boxed::PseudoString::from_string_unchecked(string)); } - /// Attempt to inline the string if it's currently heap allocated. - /// - /// Returns the resulting state: `true` if it's inlined, `false` if it's not. - fn try_demote(&mut self) -> bool { - if Mode::DEALLOC { - self.really_try_demote() - } else { - false - } - } - /// Attempt to inline the string regardless of whether `Mode::DEALLOC` is set. fn really_try_demote(&mut self) -> bool { let inlined = if let StringCastMut::Boxed(string) = self.cast_mut() { @@ -508,7 +497,10 @@ impl SmartString { return; } } - self.really_try_demote(); + if !Mode::DEALLOC && !cfg!(lazy_null_pointer_optimizations) { + self.really_try_demote(); + } + //Else: the demotion is already done by StringReference } /// Truncate the string to `new_len` bytes. @@ -528,7 +520,6 @@ impl SmartString { return; } } - self.try_demote(); } /// Pop a `char` off the end of the string. @@ -541,7 +532,6 @@ impl SmartString { return Some(ch); } }; - self.try_demote(); Some(result) } @@ -565,7 +555,6 @@ impl SmartString { return ch; } }; - self.try_demote(); result } @@ -642,7 +631,6 @@ impl SmartString { return result; } }; - self.try_demote(); result.into() } @@ -694,7 +682,6 @@ impl SmartString { return; } } - self.try_demote(); } /// Construct a draining iterator over a given range. From 20be7890c459e47ed72a664df40bf93bb58eae2d Mon Sep 17 00:00:00 2001 From: AnttiP Date: Thu, 9 Jul 2020 10:06:22 +0300 Subject: [PATCH 3/8] Clippy lints, test for Option integrity, InlineString deref --- src/config.rs | 8 +++++-- src/inline.rs | 46 ++++++++++++++++++++---------------- src/lib.rs | 64 ++++++++++++++++++++++++--------------------------- src/test.rs | 5 ++++ 4 files changed, 67 insertions(+), 56 deletions(-) diff --git a/src/config.rs b/src/config.rs index 12753cc..a027625 100644 --- a/src/config.rs +++ b/src/config.rs @@ -81,17 +81,21 @@ pub trait DiscriminantContainer { /// Return Self with the requirement that the marker is inside fn new(marker: u8) -> Self; /// Flip the highest bit of marker + /// + /// # Safety + /// + /// Caller must ensure this doesn't cause UB, for example by turning a Non-zero DiscriminantContainer into a zeroed one unsafe fn flip_bit(&mut self); } impl DiscriminantContainer for std::num::NonZeroUsize { fn get_full_marker(&self) -> u8 { - (self.get() >> (std::mem::size_of::() - 1)*8) as u8 + (self.get() >> ((std::mem::size_of::() - 1)*8)) as u8 } fn new(marker: u8) -> Self { unsafe { Self::new_unchecked( - ((marker as usize) << (std::mem::size_of::() - 1)*8) + 1 + ((marker as usize) << ((std::mem::size_of::() - 1)*8)) + 1 ) } } diff --git a/src/inline.rs b/src/inline.rs index 8a72caf..25ce2c5 100644 --- a/src/inline.rs +++ b/src/inline.rs @@ -7,6 +7,7 @@ use std::{ mem::MaybeUninit, slice::{from_raw_parts, from_raw_parts_mut}, str::{from_utf8_unchecked, from_utf8_unchecked_mut}, + ops::{Deref, DerefMut}, }; #[cfg(target_endian = "big")] @@ -35,6 +36,25 @@ impl Clone for InlineString { impl Copy for InlineString {} +impl Deref for InlineString { + type Target = str; + fn deref(&self) -> &str { + unsafe { + let data = from_raw_parts(self.data.as_ref().as_ptr().cast(), self.len()); + from_utf8_unchecked(data) + } + } +} + +impl DerefMut for InlineString { + fn deref_mut(&mut self) -> &mut str { + unsafe { + let data = from_raw_parts_mut(self.data.as_mut().as_mut_ptr().cast(), self.len()); + from_utf8_unchecked_mut(data) + } + } +} + impl InlineString { pub(crate) fn new() -> Self { let mut ret = Self { @@ -44,7 +64,7 @@ impl InlineString { }; //Are nullptr optimizations on? if Mode::DEALLOC || cfg!(lazy_null_pointer_optimizations) { - //Initialize the 7 first or last bytes of data + //Initialize the 7 highest bytes of data for j in 0..(std::mem::size_of::() - 1) { #[cfg(target_endian = "little")] let j = 3*std::mem::size_of::() - 3 - j; @@ -73,24 +93,10 @@ impl InlineString { self.data.as_mut() } - pub(crate) fn as_str(&self) -> &str { - unsafe { - let data = from_raw_parts(self.data.as_ref().as_ptr().cast(), self.len()); - from_utf8_unchecked(data) - } - } - - pub(crate) fn as_mut_str(&mut self) -> &mut str { - unsafe { - let data = from_raw_parts_mut(self.data.as_mut().as_mut_ptr().cast(), self.len()); - from_utf8_unchecked_mut(data) - } - } - //Very unsafe: Caller needs to ensure that the string stays properly encoded //and that the string doesn't overflow pub(crate) unsafe fn insert_bytes(&mut self, index: usize, bytes: &[u8]) { - debug_assert!(self.as_str().is_char_boundary(index)); + debug_assert!(self.is_char_boundary(index)); debug_assert!(bytes.len() + self.len() <= Mode::MAX_INLINE); debug_assert!(std::str::from_utf8(bytes).is_ok()); @@ -113,8 +119,8 @@ impl InlineString { let len = self.len(); debug_assert!(start <= end); debug_assert!(end <= len); - debug_assert!(self.as_str().is_char_boundary(start)); - debug_assert!(self.as_str().is_char_boundary(end)); + debug_assert!(self.is_char_boundary(start)); + debug_assert!(self.is_char_boundary(end)); if start == end { return; } @@ -132,8 +138,8 @@ impl From<&'_ [u8]> for InlineString { assert!(len <= Mode::MAX_INLINE); assert!(std::str::from_utf8(bytes).is_ok()); let mut out = Self::new(); - for i in 0..len { - out.data[i] = MaybeUninit::new(bytes[i]); + for (i, byte) in out.data.iter_mut().enumerate().take(len) { + *byte = MaybeUninit::new(bytes[i]); } unsafe { out.set_len(len); diff --git a/src/lib.rs b/src/lib.rs index 7413bfb..87309be 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -273,8 +273,7 @@ impl Deref for SmartString { fn deref(&self) -> &Self::Target { match self.cast() { StringCast::Boxed(string) => string, - //Todo: implement deref for inline string - StringCast::Inline(string) => string.as_str(), + StringCast::Inline(string) => string, } } } @@ -289,7 +288,7 @@ impl DerefMut for SmartString { } Discriminant::Inline => { let inline : &mut InlineString = &mut *(self as *mut Self).cast(); - inline.as_mut_str() + inline } }} } @@ -309,16 +308,16 @@ impl SmartString { } fn from_inline(inline: InlineString) -> Self { - let ret = unsafe { + unsafe { please_transmute(inline) - }; - ret + } } fn discriminant(&self) -> Discriminant { - match self.disc.get_full_marker() & 128 != 0 { - true => Discriminant::Inline, - false => Discriminant::Boxed, + if self.disc.get_full_marker() & 128 != 0 { + Discriminant::Inline + } else { + Discriminant::Boxed } } @@ -415,7 +414,7 @@ impl SmartString { let len = string.len(); let chlen = ch.len_utf8(); if len + chlen > Mode::MAX_INLINE { - let mut string = string.as_str().to_string(); + let mut string = string.to_string(); string.push(ch); string } else { @@ -424,7 +423,7 @@ impl SmartString { *e = std::mem::MaybeUninit::new(0); } let written = ch.encode_utf8( unsafe { - std::mem::transmute(&mut string.data[len..(len + chlen)]) + &mut *(&mut string.data[len..(len + chlen)] as *mut [std::mem::MaybeUninit] as *mut [u8]) }).len(); unsafe { string.set_len(len + written) }; return; @@ -444,12 +443,14 @@ impl SmartString { } StringCastMut::Inline(this) => { if len + string.len() > Mode::MAX_INLINE { - let mut this = this.as_str().to_string(); + let mut this = this.to_string(); this.push_str(string); this } else { unsafe { - this.as_mut_slice()[len..len + string.len()].copy_from_slice(std::mem::transmute(string.as_bytes())); + this.as_mut_slice()[len..len + string.len()].copy_from_slice( + &*(string.as_bytes() as *const [u8] as *const [std::mem::MaybeUninit]) + ); this.set_len(len + string.len()); } return @@ -512,12 +513,11 @@ impl SmartString { StringCastMut::Boxed(mut string) => string.truncate(new_len), StringCastMut::Inline(string) => { if new_len < string.len() { - assert!(string.as_str().is_char_boundary(new_len)); + assert!(string.is_char_boundary(new_len)); unsafe { string.set_len(new_len) }; } - return; } } } @@ -527,7 +527,7 @@ impl SmartString { let result = match self.cast_mut() { StringCastMut::Boxed(mut string) => string.pop()?, StringCastMut::Inline(string) => { - let ch = string.as_str().chars().rev().next()?; + let ch = string.chars().rev().next()?; unsafe {string.set_len(string.len() - ch.len_utf8());} return Some(ch); } @@ -539,10 +539,10 @@ impl SmartString { /// /// If the index doesn't fall on a UTF-8 character boundary, this method panics. pub fn remove(&mut self, index: usize) -> char { - let result = match self.cast_mut() { + match self.cast_mut() { StringCastMut::Boxed(mut string) => string.remove(index), StringCastMut::Inline(string) => { - let ch = match string.as_str()[index..].chars().next() { + let ch = match string[index..].chars().next() { Some(ch) => ch, None => panic!("cannot remove a char from the end of a string"), }; @@ -552,10 +552,9 @@ impl SmartString { string.data[index].as_mut_ptr().copy_from(string.data[next].as_ptr(), len - next); string.set_len(len - (next - index)); } - return ch; + ch } - }; - result + } } /// Insert a `char` into the string at the given index. @@ -568,7 +567,7 @@ impl SmartString { return; } StringCastMut::Inline(string) if string.len() + ch.len_utf8() <= Mode::MAX_INLINE => { - if !string.as_str().is_char_boundary(index) { + if !string.is_char_boundary(index) { panic!(); } let mut buffer = [0; 4]; @@ -579,7 +578,7 @@ impl SmartString { return; } StringCastMut::Inline(string) => { - let mut string = string.as_str().to_string(); + let mut string = string.to_string(); string.insert(index, ch); string } @@ -597,14 +596,14 @@ impl SmartString { return; } StringCastMut::Inline(this) if this.len() + string.len() <= Mode::MAX_INLINE => { - if !this.as_str().is_char_boundary(index) { + if !this.is_char_boundary(index) { panic!(); } unsafe {this.insert_bytes(index, string.as_bytes())}; return; }, StringCastMut::Inline(this) => { - let mut this = this.as_str().to_string(); + let mut this = this.to_string(); this.insert_str(index, string); this } @@ -622,9 +621,8 @@ impl SmartString { let result = match self.cast_mut() { StringCastMut::Boxed(mut string) => string.split_off(index), StringCastMut::Inline(string) => { - let s = string.as_str(); - assert!(s.is_char_boundary(index)); - let result = s[index..].into(); + assert!(string.is_char_boundary(index)); + let result = string[index..].into(); unsafe { string.set_len(index); } @@ -656,7 +654,6 @@ impl SmartString { while index < len { let ch = unsafe { string - .as_mut_str() .get_unchecked(index..len) .chars() .next() @@ -679,7 +676,6 @@ impl SmartString { string.set_len(len - del_bytes); } } - return; } } } @@ -710,12 +706,12 @@ impl SmartString { let (start, end) = bounds_for(&range, len); assert!(end >= start); assert!(end <= len); - assert!(string.as_str().is_char_boundary(start)); - assert!(string.as_str().is_char_boundary(end)); + assert!(string.is_char_boundary(start)); + assert!(string.is_char_boundary(end)); let replaced_len = end - start; let replace_len = replace_with.len(); if (len - replaced_len) + replace_len > Mode::MAX_INLINE { - let mut string = string.as_str().to_string(); + let mut string = string.to_string(); string.replace_range(range, replace_with); string } else { @@ -1083,7 +1079,7 @@ impl Into for SmartString { fn into(self) -> String { match self.cast_into() { StringCastInto::Boxed(string) => string.into(), - StringCastInto::Inline(string) => string.as_str().to_string(), + StringCastInto::Inline(string) => string.to_string(), } } } diff --git a/src/test.rs b/src/test.rs index 5573737..248f634 100644 --- a/src/test.rs +++ b/src/test.rs @@ -363,6 +363,11 @@ fn assert_invariants(control: &str, subject: &SmartString ); let control_smart: SmartString = control.into(); assert_eq!(Ordering::Equal, subject.cmp(&control_smart)); + if std::mem::size_of::>() == std::mem::size_of::>>() { + let ptr : *const Option> = (subject as *const SmartString).cast(); + //Todo: should include a black box to prevent rustc from optimizing this + assert!(unsafe {(*ptr).is_some()}); + } } pub fn test_everything(constructor: Constructor, actions: Vec) { From 22e65f68b89ba262eebc400abd6bf4a23eaa4fcb Mon Sep 17 00:00:00 2001 From: AnttiP Date: Thu, 9 Jul 2020 10:51:52 +0300 Subject: [PATCH 4/8] Keeping up with the master branch --- CHANGELOG.md | 11 ++++++++++- Cargo.toml | 2 +- src/lib.rs | 48 +++++++++++++++++++++++++++--------------------- src/proptest.rs | 8 ++++++++ src/test.rs | 7 +++++++ 5 files changed, 53 insertions(+), 23 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 65267f9..3d01478 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,16 +5,25 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). -## [Unreleased] +## [0.2.3] - 2020-07-07 ### ADDED +- `SmartString` now implements `Display`. (#6) - `SmartString` now implements `FromIterator`. - Support for [`serde`](https://serde.rs/) behind the `serde` feature flag. (#2) - Support for [`arbitrary`](https://crates.io/crates/arbitrary) behind the `arbitrary` feature flag. - Support for [`proptest`](https://crates.io/crates/proptest) behind the `proptest` feature flag. +### FIXED + +- `SmartString::push_str` would previously trigger two heap allocations while promoting an inline + string to a boxed string, one of which was unnecessary. It now only makes the one strictly + necessary allocation. (#5) +- Fixed a bug where `SmartString::remove` would panic if you tried to remove the last index in an + inline string. + ## [0.2.2] - 2020-07-05 ### FIXED diff --git a/Cargo.toml b/Cargo.toml index 3dbe61e..c836772 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "smartstring" -version = "0.2.2" +version = "0.2.3" authors = ["Bodil Stokke "] edition = "2018" license = "MPL-2.0+" diff --git a/src/lib.rs b/src/lib.rs index 87309be..052d243 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -123,7 +123,7 @@ use std::{ borrow::{Borrow, BorrowMut}, cmp::Ordering, convert::Infallible, - fmt::{Debug, Error, Formatter, Write}, + fmt::{Debug, Display, Error, Formatter, Write}, hash::{Hash, Hasher}, iter::FromIterator, mem::MaybeUninit, @@ -412,18 +412,19 @@ impl SmartString { } StringCastMut::Inline(string) => { let len = string.len(); - let chlen = ch.len_utf8(); - if len + chlen > Mode::MAX_INLINE { - let mut string = string.to_string(); - string.push(ch); - string + let new_len = len + ch.len_utf8(); + if new_len > Mode::MAX_INLINE { + let mut new_str = String::with_capacity(new_len); + new_str.push_str(string); + new_str.push(ch); + new_str } else { - for e in &mut string.data[len..(len + chlen)] { + for e in &mut string.data[len..new_len] { //These have to be initialized, as we are passing u8:s into encode_utf8 *e = std::mem::MaybeUninit::new(0); } let written = ch.encode_utf8( unsafe { - &mut *(&mut string.data[len..(len + chlen)] as *mut [std::mem::MaybeUninit] as *mut [u8]) + &mut *(&mut string.data[len..new_len] as *mut [std::mem::MaybeUninit] as *mut [u8]) }).len(); unsafe { string.set_len(len + written) }; return; @@ -442,16 +443,18 @@ impl SmartString { return; } StringCastMut::Inline(this) => { - if len + string.len() > Mode::MAX_INLINE { - let mut this = this.to_string(); - this.push_str(string); - this + let new_len = len + string.len(); + if new_len > Mode::MAX_INLINE { + let mut new_str = String::with_capacity(new_len); + new_str.push_str(this); + new_str.push_str(string); + new_str } else { unsafe { - this.as_mut_slice()[len..len + string.len()].copy_from_slice( + this.as_mut_slice()[len..new_len].copy_from_slice( &*(string.as_bytes() as *const [u8] as *const [std::mem::MaybeUninit]) ); - this.set_len(len + string.len()); + this.set_len(new_len); } return } @@ -548,8 +551,11 @@ impl SmartString { }; let next = index + ch.len_utf8(); let len = string.len(); + let tail_len = len - next; unsafe { - string.data[index].as_mut_ptr().copy_from(string.data[next].as_ptr(), len - next); + if tail_len > 0 { + string.data[index].as_mut_ptr().copy_from(string.data[next].as_ptr(), len - next); + } string.set_len(len - (next - index)); } ch @@ -1084,12 +1090,6 @@ impl Into for SmartString { } } -impl ToString for SmartString { - fn to_string(&self) -> String { - self.as_str().to_string() - } -} - impl PartialEq for SmartString { fn eq(&self, other: &str) -> bool { self.as_str() == other @@ -1158,6 +1158,12 @@ impl Debug for SmartString { } } +impl Display for SmartString { + fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), Error> { + Display::fmt(self.as_str(), f) + } +} + impl Write for SmartString { fn write_str(&mut self, string: &str) -> Result<(), Error> { self.push_str(string); diff --git a/src/proptest.rs b/src/proptest.rs index e95bf1f..1379493 100644 --- a/src/proptest.rs +++ b/src/proptest.rs @@ -1,6 +1,7 @@ //! `proptest` strategies (requires the `proptest` feature flag). use crate::{SmartString, SmartStringMode}; +use proptest::proptest; use proptest::strategy::{BoxedStrategy, Strategy}; use proptest::string::Error; @@ -15,3 +16,10 @@ where { proptest::string::string_regex(regex).map(|g| g.prop_map(SmartString::from).boxed()) } + +proptest! { + #[test] + fn strategy(string in string_regex(".+").unwrap()) { + assert!(!SmartString::::is_empty(&string)); + } +} diff --git a/src/test.rs b/src/test.rs index 248f634..b3cbae3 100644 --- a/src/test.rs +++ b/src/test.rs @@ -504,4 +504,11 @@ mod tests { assert_eq!(control_smart, string); assert_eq!(Ordering::Equal, string.cmp(&control_smart)); } + + #[test] + fn dont_panic_on_removing_last_index_from_an_inline_string() { + let mut s = + SmartString::::from("\u{323}\u{323}\u{323}ω\u{323}\u{323}\u{323}㌣\u{e323}㤘"); + s.remove(20); + } } From 50c2f594c78bc33cdc3a78ec6d1f82700d2494a5 Mon Sep 17 00:00:00 2001 From: AnttiP Date: Thu, 9 Jul 2020 11:08:29 +0300 Subject: [PATCH 5/8] Removed something unnecesary, added a test for capacity --- src/boxed.rs | 2 +- src/test.rs | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/src/boxed.rs b/src/boxed.rs index 094cb81..dd95086 100644 --- a/src/boxed.rs +++ b/src/boxed.rs @@ -98,7 +98,7 @@ impl BoxedString for PseudoString { } fn capacity(&self) -> usize { - usize::from(self.capacity) << 1 >> 1 + usize::from(self.capacity) } } diff --git a/src/test.rs b/src/test.rs index b3cbae3..ec000e2 100644 --- a/src/test.rs +++ b/src/test.rs @@ -511,4 +511,12 @@ mod tests { SmartString::::from("\u{323}\u{323}\u{323}ω\u{323}\u{323}\u{323}㌣\u{e323}㤘"); s.remove(20); } + + #[test] + fn return_reasonable_capacity() { + let boxed = SmartString::::from("𝕃𝕠𝕟𝕘𝕖𝕣 𝕥𝕙𝕒𝕟 𝟚𝟛 𝕓𝕪𝕥𝕖𝕤"); + let inlined = SmartString::::from("Shorter"); + assert!(boxed.capacity() <= isize::MAX as usize); + assert!(inlined.capacity() == std::mem::size_of::() - 1); + } } From 5320ddbb3c6876dcef96aa843a21097e35491eca Mon Sep 17 00:00:00 2001 From: AnttiP Date: Thu, 9 Jul 2020 11:24:06 +0300 Subject: [PATCH 6/8] Changelog --- CHANGELOG.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3d01478..5912813 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,17 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). + +## [Unreleased] + +### ADDED +- `SmartString` now supports null pointer optimizations. `Option` is now the same size as `SmartString`. +- A feature flag `lazy_null_pointer_optimizations`, which enables null pointer optimizations for `SmartString`. On by default. + +### FIXED +- `SmartString` now uses the size or capacity field to store the discriminant bit, instead of relying on pointer alignment (#4) +- `SmartString` doesn't rely on the internal layout of `String` (#4) + ## [0.2.3] - 2020-07-07 ### ADDED From 13390e95b3c94b8409896b8512a1e5d6b9668b3e Mon Sep 17 00:00:00 2001 From: AnttiP Date: Thu, 9 Jul 2020 11:43:13 +0300 Subject: [PATCH 7/8] Missing pub on a struct --- src/config.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/config.rs b/src/config.rs index a027625..bb59745 100644 --- a/src/config.rs +++ b/src/config.rs @@ -108,7 +108,7 @@ impl DiscriminantContainer for std::num::NonZeroUsize { #[cfg(target_endian = "big")] #[cfg_attr(target_pointer_width = "64", repr(C, align(8)))] #[cfg_attr(target_pointer_width = "32", repr(C, align(4)))] -struct PossiblyZeroSize { +pub struct PossiblyZeroSize { marker: u8, data: [MaybeUninit; std::mem::size_of::() - 1], } From b0f4ce7dbe94e782b7bdc23782ede3d49e3ed9a4 Mon Sep 17 00:00:00 2001 From: AnttiP Date: Thu, 9 Jul 2020 13:31:26 +0300 Subject: [PATCH 8/8] Cargo format --- src/boxed.rs | 37 ++++++++++++------ src/casts.rs | 2 +- src/config.rs | 15 ++++---- src/inline.rs | 10 ++--- src/iter.rs | 16 +++++--- src/lib.rs | 105 +++++++++++++++++++++++--------------------------- src/test.rs | 7 ++-- 7 files changed, 103 insertions(+), 89 deletions(-) diff --git a/src/boxed.rs b/src/boxed.rs index dd95086..c760eaa 100644 --- a/src/boxed.rs +++ b/src/boxed.rs @@ -2,8 +2,8 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at http://mozilla.org/MPL/2.0/. +use crate::{inline::InlineString, SmartString, SmartStringMode}; use std::ops::{Deref, DerefMut}; -use crate::{SmartStringMode, SmartString, inline::InlineString}; pub trait BoxedString: Deref + DerefMut + Into { //This is unsafe when null pointer optimizations are used with LazyCompact @@ -67,19 +67,22 @@ impl From for String { #[inline(always)] fn from(string: PseudoString) -> Self { unsafe { - String::from_raw_parts(string.ptr.as_ptr(), string.size, usize::from(string.capacity)) + String::from_raw_parts( + string.ptr.as_ptr(), + string.size, + usize::from(string.capacity), + ) } } } - #[cfg(feature = "lazy_null_pointer_optimizations")] -unsafe fn to_capacity(size: usize) -> std::num::NonZeroUsize { +unsafe fn to_capacity(size: usize) -> std::num::NonZeroUsize { std::num::NonZeroUsize::new_unchecked(size) } #[cfg(not(feature = "lazy_null_pointer_optimizations"))] -fn to_capacity(size: usize) -> usize { +fn to_capacity(size: usize) -> usize { size } @@ -94,7 +97,11 @@ impl BoxedString for PseudoString { let size = bytes.len(); std::mem::forget(string); - Self {ptr: std::ptr::NonNull::new_unchecked(ptr), size, capacity: to_capacity(capacity)} + Self { + ptr: std::ptr::NonNull::new_unchecked(ptr), + size, + capacity: to_capacity(capacity), + } } fn capacity(&self) -> usize { @@ -111,12 +118,15 @@ pub(crate) struct StringReference<'a, Mode: SmartStringMode> { impl<'a, Mode: SmartStringMode> StringReference<'a, Mode> { //Safety: Discriminant must be boxed pub(crate) unsafe fn from_smart_unchecked(smart: &'a mut SmartString) -> Self { - debug_assert_eq!(smart.discriminant(), crate::marker_byte::Discriminant::Boxed); - let boxed : Mode::BoxedString = std::mem::transmute_copy(smart); + debug_assert_eq!( + smart.discriminant(), + crate::marker_byte::Discriminant::Boxed + ); + let boxed: Mode::BoxedString = std::mem::transmute_copy(smart); let string = boxed.into(); Self { referrant: smart, - string + string, } } } @@ -124,7 +134,9 @@ impl<'a, Mode: SmartStringMode> StringReference<'a, Mode> { impl<'a, Mode: SmartStringMode> Drop for StringReference<'a, Mode> { fn drop(&mut self) { let string = std::mem::replace(&mut self.string, String::new()); - if (Mode::DEALLOC && string.len() <= Mode::MAX_INLINE) || (!Mode::DEALLOC && cfg!(lazy_null_pointer_optimizations) && string.capacity() == 0) { + if (Mode::DEALLOC && string.len() <= Mode::MAX_INLINE) + || (!Mode::DEALLOC && cfg!(lazy_null_pointer_optimizations) && string.capacity() == 0) + { let transmuted = (self as *mut Self).cast(); unsafe { std::ptr::write(*transmuted, InlineString::::from(string.as_bytes())); @@ -132,7 +144,10 @@ impl<'a, Mode: SmartStringMode> Drop for StringReference<'a, Mode> { } else { let transmuted = (self as *mut Self).cast(); unsafe { - std::ptr::write(*transmuted, Mode::BoxedString::from_string_unchecked(string)); + std::ptr::write( + *transmuted, + Mode::BoxedString::from_string_unchecked(string), + ); } } } diff --git a/src/casts.rs b/src/casts.rs index 7e1ada4..4f56b92 100644 --- a/src/casts.rs +++ b/src/casts.rs @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at http://mozilla.org/MPL/2.0/. -use crate::{inline::InlineString, SmartStringMode, boxed::StringReference}; +use crate::{boxed::StringReference, inline::InlineString, SmartStringMode}; pub(crate) enum StringCast<'a, Mode: SmartStringMode> { Boxed(&'a Mode::BoxedString), diff --git a/src/config.rs b/src/config.rs index bb59745..760d3a6 100644 --- a/src/config.rs +++ b/src/config.rs @@ -2,7 +2,11 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at http://mozilla.org/MPL/2.0/. -use crate::{boxed::{BoxedString, PseudoString}, inline::InlineString, SmartString}; +use crate::{ + boxed::{BoxedString, PseudoString}, + inline::InlineString, + SmartString, +}; use static_assertions::{assert_eq_size, const_assert, const_assert_eq}; use std::mem::{align_of, size_of, MaybeUninit}; @@ -90,13 +94,11 @@ pub trait DiscriminantContainer { impl DiscriminantContainer for std::num::NonZeroUsize { fn get_full_marker(&self) -> u8 { - (self.get() >> ((std::mem::size_of::() - 1)*8)) as u8 + (self.get() >> ((std::mem::size_of::() - 1) * 8)) as u8 } fn new(marker: u8) -> Self { unsafe { - Self::new_unchecked( - ((marker as usize) << ((std::mem::size_of::() - 1)*8)) + 1 - ) + Self::new_unchecked(((marker as usize) << ((std::mem::size_of::() - 1) * 8)) + 1) } } unsafe fn flip_bit(&mut self) { @@ -134,7 +136,7 @@ impl DiscriminantContainer for PossiblyZeroSize { } } unsafe fn flip_bit(&mut self) { - self.marker^= 128; + self.marker ^= 128; } } @@ -145,7 +147,6 @@ unsafe impl SmartStringMode for Compact { type DiscriminantContainer = std::num::NonZeroUsize; } - #[cfg(not(feature = "lazy_null_pointer_optimizations"))] unsafe impl SmartStringMode for LazyCompact { type BoxedString = PseudoString; diff --git a/src/inline.rs b/src/inline.rs index 25ce2c5..7cd0b11 100644 --- a/src/inline.rs +++ b/src/inline.rs @@ -5,9 +5,9 @@ use crate::SmartStringMode; use std::{ mem::MaybeUninit, + ops::{Deref, DerefMut}, slice::{from_raw_parts, from_raw_parts_mut}, str::{from_utf8_unchecked, from_utf8_unchecked_mut}, - ops::{Deref, DerefMut}, }; #[cfg(target_endian = "big")] @@ -15,7 +15,7 @@ use std::{ #[cfg_attr(target_pointer_width = "32", repr(C, align(4)))] pub(crate) struct InlineString { pub(crate) marker: u8, - pub(crate) data: [MaybeUninit; 3*std::mem::size_of::() - 1], + pub(crate) data: [MaybeUninit; 3 * std::mem::size_of::() - 1], phantom: std::marker::PhantomData<*const Mode>, } @@ -23,7 +23,7 @@ pub(crate) struct InlineString { #[cfg_attr(target_pointer_width = "64", repr(C, align(8)))] #[cfg_attr(target_pointer_width = "32", repr(C, align(4)))] pub(crate) struct InlineString { - pub(crate) data: [MaybeUninit; 3*std::mem::size_of::() - 1], + pub(crate) data: [MaybeUninit; 3 * std::mem::size_of::() - 1], pub(crate) marker: u8, phantom: std::marker::PhantomData<*const Mode>, } @@ -59,7 +59,7 @@ impl InlineString { pub(crate) fn new() -> Self { let mut ret = Self { marker: 128, - data: [MaybeUninit::uninit(); 3*std::mem::size_of::() - 1], + data: [MaybeUninit::uninit(); 3 * std::mem::size_of::() - 1], phantom: std::marker::PhantomData, }; //Are nullptr optimizations on? @@ -67,7 +67,7 @@ impl InlineString { //Initialize the 7 highest bytes of data for j in 0..(std::mem::size_of::() - 1) { #[cfg(target_endian = "little")] - let j = 3*std::mem::size_of::() - 3 - j; + let j = 3 * std::mem::size_of::() - 3 - j; ret.data[j] = MaybeUninit::zeroed(); } diff --git a/src/iter.rs b/src/iter.rs index 96291ba..9ab6ed6 100644 --- a/src/iter.rs +++ b/src/iter.rs @@ -1,8 +1,8 @@ -use crate::{SmartString, SmartStringMode, casts::StringCastMut}; +use crate::{casts::StringCastMut, SmartString, SmartStringMode}; use std::{ fmt::{Debug, Error, Formatter}, iter::FusedIterator, - ops::{RangeBounds, Bound}, + ops::{Bound, RangeBounds}, }; /// A draining iterator for a [`SmartString`][SmartString]. @@ -75,9 +75,13 @@ where //We must first replace the iterator with a dummy one, so it won't dangle self.iterator = "".chars(); //Now we can safely clear the string - unsafe { match (*self.string).cast_mut() { - StringCastMut::Boxed(mut string) => {string.drain(self.start..self.end);}, - StringCastMut::Inline(string) => string.remove_bytes(self.start, self.end), - }} + unsafe { + match (*self.string).cast_mut() { + StringCastMut::Boxed(mut string) => { + string.drain(self.start..self.end); + } + StringCastMut::Inline(string) => string.remove_bytes(self.start, self.end), + } + } } } diff --git a/src/lib.rs b/src/lib.rs index 052d243..25a9c0f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -135,7 +135,7 @@ use std::{ }; mod config; -pub use config::{Compact, LazyCompact, SmartStringMode, DiscriminantContainer, PossiblyZeroSize}; +pub use config::{Compact, DiscriminantContainer, LazyCompact, PossiblyZeroSize, SmartStringMode}; mod marker_byte; use marker_byte::Discriminant; @@ -147,7 +147,7 @@ mod boxed; use boxed::BoxedString; mod casts; -use casts::{StringCast, StringCastInto, StringCastMut, please_transmute}; +use casts::{please_transmute, StringCast, StringCastInto, StringCastMut}; mod iter; pub use iter::Drain; @@ -207,7 +207,7 @@ pub mod alias { #[cfg_attr(target_pointer_width = "32", repr(C, align(4)))] pub struct SmartString { disc: Mode::DiscriminantContainer, - data: [MaybeUninit; 2*std::mem::size_of::()], + data: [MaybeUninit; 2 * std::mem::size_of::()], } //Documentation needs to be duplicated @@ -238,7 +238,7 @@ pub struct SmartString { #[cfg_attr(target_pointer_width = "64", repr(C, align(8)))] #[cfg_attr(target_pointer_width = "32", repr(C, align(4)))] pub struct SmartString { - data: [MaybeUninit; 2*std::mem::size_of::()], + data: [MaybeUninit; 2 * std::mem::size_of::()], disc: Mode::DiscriminantContainer, } @@ -281,16 +281,18 @@ impl Deref for SmartString { impl DerefMut for SmartString { #[inline(always)] fn deref_mut(&mut self) -> &mut Self::Target { - unsafe { match self.discriminant() { - Discriminant::Boxed => { - let boxed : &mut Mode::BoxedString = &mut *(self as *mut Self).cast(); - boxed - } - Discriminant::Inline => { - let inline : &mut InlineString = &mut *(self as *mut Self).cast(); - inline + unsafe { + match self.discriminant() { + Discriminant::Boxed => { + let boxed: &mut Mode::BoxedString = &mut *(self as *mut Self).cast(); + boxed + } + Discriminant::Inline => { + let inline: &mut InlineString = &mut *(self as *mut Self).cast(); + inline + } } - }} + } } } @@ -302,15 +304,11 @@ impl SmartString { } fn from_boxed(boxed: Mode::BoxedString) -> Self { - unsafe { - please_transmute(boxed) - } + unsafe { please_transmute(boxed) } } fn from_inline(inline: InlineString) -> Self { - unsafe { - please_transmute(inline) - } + unsafe { please_transmute(inline) } } fn discriminant(&self) -> Discriminant { @@ -330,11 +328,11 @@ impl SmartString { fn cast_mut(&mut self) -> StringCastMut<'_, Mode> { match self.discriminant() { - Discriminant::Inline => StringCastMut::Inline(unsafe { &mut *(self as *mut Self).cast() }), + Discriminant::Inline => { + StringCastMut::Inline(unsafe { &mut *(self as *mut Self).cast() }) + } Discriminant::Boxed => { - StringCastMut::Boxed( unsafe { - boxed::StringReference::from_smart_unchecked(self) - }) + StringCastMut::Boxed(unsafe { boxed::StringReference::from_smart_unchecked(self) }) } } } @@ -367,9 +365,7 @@ impl SmartString { } else { return true; }; - std::mem::forget( - std::mem::replace(self, inlined) - ); + std::mem::forget(std::mem::replace(self, inlined)); true } @@ -423,15 +419,19 @@ impl SmartString { //These have to be initialized, as we are passing u8:s into encode_utf8 *e = std::mem::MaybeUninit::new(0); } - let written = ch.encode_utf8( unsafe { - &mut *(&mut string.data[len..new_len] as *mut [std::mem::MaybeUninit] as *mut [u8]) - }).len(); + let written = ch + .encode_utf8(unsafe { + &mut *(&mut string.data[len..new_len] + as *mut [std::mem::MaybeUninit] + as *mut [u8]) + }) + .len(); unsafe { string.set_len(len + written) }; return; } } }; - unsafe {self.promote_from(promote)}; + unsafe { self.promote_from(promote) }; } /// Copy a string slice onto the end of the string. @@ -452,15 +452,16 @@ impl SmartString { } else { unsafe { this.as_mut_slice()[len..new_len].copy_from_slice( - &*(string.as_bytes() as *const [u8] as *const [std::mem::MaybeUninit]) + &*(string.as_bytes() as *const [u8] + as *const [std::mem::MaybeUninit]), ); this.set_len(new_len); } - return + return; } } }; - unsafe {self.promote_from(promote)}; + unsafe { self.promote_from(promote) }; } /// Return the currently allocated capacity of the string. @@ -517,9 +518,7 @@ impl SmartString { StringCastMut::Inline(string) => { if new_len < string.len() { assert!(string.is_char_boundary(new_len)); - unsafe { - string.set_len(new_len) - }; + unsafe { string.set_len(new_len) }; } } } @@ -531,7 +530,9 @@ impl SmartString { StringCastMut::Boxed(mut string) => string.pop()?, StringCastMut::Inline(string) => { let ch = string.chars().rev().next()?; - unsafe {string.set_len(string.len() - ch.len_utf8());} + unsafe { + string.set_len(string.len() - ch.len_utf8()); + } return Some(ch); } }; @@ -554,7 +555,9 @@ impl SmartString { let tail_len = len - next; unsafe { if tail_len > 0 { - string.data[index].as_mut_ptr().copy_from(string.data[next].as_ptr(), len - next); + string.data[index] + .as_mut_ptr() + .copy_from(string.data[next].as_ptr(), len - next); } string.set_len(len - (next - index)); } @@ -589,7 +592,7 @@ impl SmartString { string } }; - unsafe {self.promote_from(promote)}; + unsafe { self.promote_from(promote) }; } /// Insert a string slice into the string at the given index. @@ -605,16 +608,16 @@ impl SmartString { if !this.is_char_boundary(index) { panic!(); } - unsafe {this.insert_bytes(index, string.as_bytes())}; + unsafe { this.insert_bytes(index, string.as_bytes()) }; return; - }, + } StringCastMut::Inline(this) => { let mut this = this.to_string(); this.insert_str(index, string); this } }; - unsafe {self.promote_from(promote)}; + unsafe { self.promote_from(promote) }; } /// Split the string into two at the given index. @@ -658,13 +661,7 @@ impl SmartString { let mut index = 0; while index < len { - let ch = unsafe { - string - .get_unchecked(index..len) - .chars() - .next() - .unwrap() - }; + let ch = unsafe { string.get_unchecked(index..len).chars().next().unwrap() }; let ch_len = ch.len_utf8(); if !f(ch) { @@ -706,7 +703,7 @@ impl SmartString { StringCastMut::Boxed(mut string) => { string.replace_range(range, replace_with); return; - }, + } StringCastMut::Inline(string) => { let len = string.len(); let (start, end) = bounds_for(&range, len); @@ -886,9 +883,7 @@ impl From<&'_ str> for SmartString { impl From<&'_ String> for SmartString { fn from(string: &'_ String) -> Self { if string.len() > Mode::MAX_INLINE { - Self::from_boxed(unsafe { - Mode::BoxedString::from_string_unchecked(string.clone()) - }) + Self::from_boxed(unsafe { Mode::BoxedString::from_string_unchecked(string.clone()) }) } else { Self::from_inline(string.as_bytes().into()) } @@ -898,9 +893,7 @@ impl From<&'_ String> for SmartString { impl From for SmartString { fn from(string: String) -> Self { if string.len() > Mode::MAX_INLINE { - Self::from_boxed(unsafe { - Mode::BoxedString::from_string_unchecked(string) - }) + Self::from_boxed(unsafe { Mode::BoxedString::from_string_unchecked(string) }) } else { Self::from_inline(string.as_bytes().into()) } diff --git a/src/test.rs b/src/test.rs index ec000e2..895906e 100644 --- a/src/test.rs +++ b/src/test.rs @@ -363,10 +363,11 @@ fn assert_invariants(control: &str, subject: &SmartString ); let control_smart: SmartString = control.into(); assert_eq!(Ordering::Equal, subject.cmp(&control_smart)); - if std::mem::size_of::>() == std::mem::size_of::>>() { - let ptr : *const Option> = (subject as *const SmartString).cast(); + if std::mem::size_of::>() == std::mem::size_of::>>() + { + let ptr: *const Option> = (subject as *const SmartString).cast(); //Todo: should include a black box to prevent rustc from optimizing this - assert!(unsafe {(*ptr).is_some()}); + assert!(unsafe { (*ptr).is_some() }); } }