List of all items
Structs
- BStr
- Bytes
- CharIndices
- Chars
- EscapeBytes
- FieldsWith
- Find
- FindReverse
- Finder
- FinderReverse
- Lines
- LinesWithTerminator
- Split
- SplitN
- SplitNReverse
- SplitReverse
- Utf8Chunk
- Utf8Chunks
- Utf8Error
diff --git a/.lock b/.lock new file mode 100644 index 000000000..e69de29bb diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 000000000..e69de29bb diff --git a/bstr/all.html b/bstr/all.html new file mode 100644 index 000000000..7ed273fdf --- /dev/null +++ b/bstr/all.html @@ -0,0 +1 @@ +
Redirecting to ../../bstr/struct.BStr.html...
+ + + \ No newline at end of file diff --git a/bstr/escape_bytes/struct.EscapeBytes.html b/bstr/escape_bytes/struct.EscapeBytes.html new file mode 100644 index 000000000..9f3f91ccf --- /dev/null +++ b/bstr/escape_bytes/struct.EscapeBytes.html @@ -0,0 +1,11 @@ + + + + +Redirecting to ../../bstr/struct.EscapeBytes.html...
+ + + \ No newline at end of file diff --git a/bstr/ext_slice/fn.B.html b/bstr/ext_slice/fn.B.html new file mode 100644 index 000000000..b2172fd37 --- /dev/null +++ b/bstr/ext_slice/fn.B.html @@ -0,0 +1,11 @@ + + + + +Redirecting to ../../bstr/fn.B.html...
+ + + \ No newline at end of file diff --git a/bstr/ext_slice/struct.Bytes.html b/bstr/ext_slice/struct.Bytes.html new file mode 100644 index 000000000..498033243 --- /dev/null +++ b/bstr/ext_slice/struct.Bytes.html @@ -0,0 +1,11 @@ + + + + +Redirecting to ../../bstr/struct.Bytes.html...
+ + + \ No newline at end of file diff --git a/bstr/ext_slice/struct.FieldsWith.html b/bstr/ext_slice/struct.FieldsWith.html new file mode 100644 index 000000000..e778e6990 --- /dev/null +++ b/bstr/ext_slice/struct.FieldsWith.html @@ -0,0 +1,11 @@ + + + + +Redirecting to ../../bstr/struct.FieldsWith.html...
+ + + \ No newline at end of file diff --git a/bstr/ext_slice/struct.Find.html b/bstr/ext_slice/struct.Find.html new file mode 100644 index 000000000..269561d7d --- /dev/null +++ b/bstr/ext_slice/struct.Find.html @@ -0,0 +1,11 @@ + + + + +Redirecting to ../../bstr/struct.Find.html...
+ + + \ No newline at end of file diff --git a/bstr/ext_slice/struct.FindReverse.html b/bstr/ext_slice/struct.FindReverse.html new file mode 100644 index 000000000..c89f2081f --- /dev/null +++ b/bstr/ext_slice/struct.FindReverse.html @@ -0,0 +1,11 @@ + + + + +Redirecting to ../../bstr/struct.FindReverse.html...
+ + + \ No newline at end of file diff --git a/bstr/ext_slice/struct.Finder.html b/bstr/ext_slice/struct.Finder.html new file mode 100644 index 000000000..fd517547c --- /dev/null +++ b/bstr/ext_slice/struct.Finder.html @@ -0,0 +1,11 @@ + + + + +Redirecting to ../../bstr/struct.Finder.html...
+ + + \ No newline at end of file diff --git a/bstr/ext_slice/struct.FinderReverse.html b/bstr/ext_slice/struct.FinderReverse.html new file mode 100644 index 000000000..50b9ad2ce --- /dev/null +++ b/bstr/ext_slice/struct.FinderReverse.html @@ -0,0 +1,11 @@ + + + + +Redirecting to ../../bstr/struct.FinderReverse.html...
+ + + \ No newline at end of file diff --git a/bstr/ext_slice/struct.Lines.html b/bstr/ext_slice/struct.Lines.html new file mode 100644 index 000000000..9f640093c --- /dev/null +++ b/bstr/ext_slice/struct.Lines.html @@ -0,0 +1,11 @@ + + + + +Redirecting to ../../bstr/struct.Lines.html...
+ + + \ No newline at end of file diff --git a/bstr/ext_slice/struct.LinesWithTerminator.html b/bstr/ext_slice/struct.LinesWithTerminator.html new file mode 100644 index 000000000..1caf77778 --- /dev/null +++ b/bstr/ext_slice/struct.LinesWithTerminator.html @@ -0,0 +1,11 @@ + + + + +Redirecting to ../../bstr/struct.LinesWithTerminator.html...
+ + + \ No newline at end of file diff --git a/bstr/ext_slice/struct.Split.html b/bstr/ext_slice/struct.Split.html new file mode 100644 index 000000000..f7b99d074 --- /dev/null +++ b/bstr/ext_slice/struct.Split.html @@ -0,0 +1,11 @@ + + + + +Redirecting to ../../bstr/struct.Split.html...
+ + + \ No newline at end of file diff --git a/bstr/ext_slice/struct.SplitN.html b/bstr/ext_slice/struct.SplitN.html new file mode 100644 index 000000000..bf7f8d991 --- /dev/null +++ b/bstr/ext_slice/struct.SplitN.html @@ -0,0 +1,11 @@ + + + + +Redirecting to ../../bstr/struct.SplitN.html...
+ + + \ No newline at end of file diff --git a/bstr/ext_slice/struct.SplitNReverse.html b/bstr/ext_slice/struct.SplitNReverse.html new file mode 100644 index 000000000..fc62cf8b9 --- /dev/null +++ b/bstr/ext_slice/struct.SplitNReverse.html @@ -0,0 +1,11 @@ + + + + +Redirecting to ../../bstr/struct.SplitNReverse.html...
+ + + \ No newline at end of file diff --git a/bstr/ext_slice/struct.SplitReverse.html b/bstr/ext_slice/struct.SplitReverse.html new file mode 100644 index 000000000..95a6dba77 --- /dev/null +++ b/bstr/ext_slice/struct.SplitReverse.html @@ -0,0 +1,11 @@ + + + + +Redirecting to ../../bstr/struct.SplitReverse.html...
+ + + \ No newline at end of file diff --git a/bstr/ext_slice/trait.ByteSlice.html b/bstr/ext_slice/trait.ByteSlice.html new file mode 100644 index 000000000..e28197ace --- /dev/null +++ b/bstr/ext_slice/trait.ByteSlice.html @@ -0,0 +1,11 @@ + + + + +Redirecting to ../../bstr/trait.ByteSlice.html...
+ + + \ No newline at end of file diff --git a/bstr/fn.B.html b/bstr/fn.B.html new file mode 100644 index 000000000..ee5939a12 --- /dev/null +++ b/bstr/fn.B.html @@ -0,0 +1,31 @@ +pub fn B<'a, B: ?Sized + AsRef<[u8]>>(bytes: &'a B) -> &'a [u8] ⓘ
A short-hand constructor for building a &[u8]
.
This idiosyncratic constructor is useful for concisely building byte string +slices. Its primary utility is in conveniently writing byte string literals +in a uniform way. For example, consider this code that does not compile:
+ +let strs = vec![b"a", b"xy"];
The above code doesn’t compile because the type of the byte string literal
+b"a"
is &'static [u8; 1]
, and the type of b"xy"
is
+&'static [u8; 2]
. Since their types aren’t the same, they can’t be stored
+in the same Vec
. (This is dissimilar from normal Unicode string slices,
+where both "a"
and "xy"
have the same type of &'static str
.)
One way of getting the above code to compile is to convert byte strings to +slices. You might try this:
+ +let strs = vec![&b"a", &b"xy"];
But this just creates values with type & &'static [u8; 1]
and
+& &'static [u8; 2]
. Instead, you need to force the issue like so:
let strs = vec![&b"a"[..], &b"xy"[..]];
+// or
+let strs = vec![b"a".as_ref(), b"xy".as_ref()];
But neither of these are particularly convenient to type, especially when +it’s something as common as a string literal. Thus, this constructor +permits writing the following instead:
+ +use bstr::B;
+
+let strs = vec![B("a"), B(b"xy")];
Notice that this also lets you mix and match both string literals and byte +string literals. This can be quite convenient!
+pub fn decode_last_utf8<B: AsRef<[u8]>>(slice: B) -> (Option<char>, usize)
UTF-8 decode a single Unicode scalar value from the end of a slice.
+When successful, the corresponding Unicode scalar value is returned along +with the number of bytes it was encoded with. The number of bytes consumed +for a successful decode is always between 1 and 4, inclusive.
+When unsuccessful, None
is returned along with the number of bytes that
+make up a maximal prefix of a valid UTF-8 code unit sequence. In this case,
+the number of bytes consumed is always between 0 and 3, inclusive, where
+0 is only returned when slice
is empty.
Basic usage:
+ +use bstr::decode_last_utf8;
+
+// Decoding a valid codepoint.
+let (ch, size) = decode_last_utf8(b"\xE2\x98\x83");
+assert_eq!(Some('☃'), ch);
+assert_eq!(3, size);
+
+// Decoding an incomplete codepoint.
+let (ch, size) = decode_last_utf8(b"\xE2\x98");
+assert_eq!(None, ch);
+assert_eq!(2, size);
This example shows how to iterate over all codepoints in UTF-8 encoded +bytes in reverse, while replacing invalid UTF-8 sequences with the +replacement codepoint:
+ +use bstr::{B, decode_last_utf8};
+
+let mut bytes = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61");
+let mut chars = vec![];
+while !bytes.is_empty() {
+ let (ch, size) = decode_last_utf8(bytes);
+ bytes = &bytes[..bytes.len()-size];
+ chars.push(ch.unwrap_or('\u{FFFD}'));
+}
+assert_eq!(vec!['a', '\u{FFFD}', '𝞃', '\u{FFFD}', '☃'], chars);
pub fn decode_utf8<B: AsRef<[u8]>>(slice: B) -> (Option<char>, usize)
UTF-8 decode a single Unicode scalar value from the beginning of a slice.
+When successful, the corresponding Unicode scalar value is returned along +with the number of bytes it was encoded with. The number of bytes consumed +for a successful decode is always between 1 and 4, inclusive.
+When unsuccessful, None
is returned along with the number of bytes that
+make up a maximal prefix of a valid UTF-8 code unit sequence. In this case,
+the number of bytes consumed is always between 0 and 3, inclusive, where
+0 is only returned when slice
is empty.
Basic usage:
+ +use bstr::decode_utf8;
+
+// Decoding a valid codepoint.
+let (ch, size) = decode_utf8(b"\xE2\x98\x83");
+assert_eq!(Some('☃'), ch);
+assert_eq!(3, size);
+
+// Decoding an incomplete codepoint.
+let (ch, size) = decode_utf8(b"\xE2\x98");
+assert_eq!(None, ch);
+assert_eq!(2, size);
This example shows how to iterate over all codepoints in UTF-8 encoded +bytes, while replacing invalid UTF-8 sequences with the replacement +codepoint:
+ +use bstr::{B, decode_utf8};
+
+let mut bytes = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61");
+let mut chars = vec![];
+while !bytes.is_empty() {
+ let (ch, size) = decode_utf8(bytes);
+ bytes = &bytes[size..];
+ chars.push(ch.unwrap_or('\u{FFFD}'));
+}
+assert_eq!(vec!['☃', '\u{FFFD}', '𝞃', '\u{FFFD}', 'a'], chars);
A byte string library.
+Byte strings are just like standard Unicode strings with one very important +difference: byte strings are only conventionally UTF-8 while Rust’s standard +Unicode strings are guaranteed to be valid UTF-8. The primary motivation for +byte strings is for handling arbitrary bytes that are mostly UTF-8.
+This crate provides two important traits that provide string oriented methods
+on &[u8]
and Vec<u8>
types:
ByteSlice
extends the [u8]
type with additional
+string oriented methods.ByteVec
extends the Vec<u8>
type with additional
+string oriented methods.Additionally, this crate provides two concrete byte string types that deref to
+[u8]
and Vec<u8>
. These are useful for storing byte string types, and come
+with convenient std::fmt::Debug
implementations:
BStr
is a byte string slice, analogous to str
.BString
is an owned growable byte string buffer,
+analogous to String
.Additionally, the free function B
serves as a convenient short
+hand for writing byte string literals.
Byte strings build on the existing APIs for Vec<u8>
and &[u8]
, with
+additional string oriented methods. Operations such as iterating over
+graphemes, searching for substrings, replacing substrings, trimming and case
+conversion are examples of things not provided on the standard library &[u8]
+APIs but are provided by this crate. For example, this code iterates over all
+occurrences of a substring:
use bstr::ByteSlice;
+
+let s = b"foo bar foo foo quux foo";
+
+let mut matches = vec![];
+for start in s.find_iter("foo") {
+ matches.push(start);
+}
+assert_eq!(matches, [0, 8, 12, 21]);
Here’s another example showing how to do a search and replace (and also showing
+use of the B
function):
use bstr::{B, ByteSlice};
+
+let old = B("foo ☃☃☃ foo foo quux foo");
+let new = old.replace("foo", "hello");
+assert_eq!(new, B("hello ☃☃☃ hello hello quux hello"));
And here’s an example that shows case conversion, even in the presence of +invalid UTF-8:
+ +use bstr::{ByteSlice, ByteVec};
+
+let mut lower = Vec::from("hello β");
+lower[0] = b'\xFF';
+// lowercase β is uppercased to Β
+assert_eq!(lower.to_uppercase(), b"\xFFELLO \xCE\x92");
When working with byte strings, it is often useful to be able to print them
+as if they were byte strings and not sequences of integers. While this crate
+cannot affect the std::fmt::Debug
implementations for [u8]
and Vec<u8>
,
+this crate does provide the BStr
and BString
types which have convenient
+std::fmt::Debug
implementations.
For example, this
+ +use bstr::ByteSlice;
+
+let mut bytes = Vec::from("hello β");
+bytes[0] = b'\xFF';
+
+println!("{:?}", bytes.as_bstr());
will output "\xFFello β"
.
This example works because the
+ByteSlice::as_bstr
+method converts any &[u8]
to a &BStr
.
This library reflects my belief that UTF-8 by convention is a better trade +off in some circumstances than guaranteed UTF-8.
+The first time this idea hit me was in the implementation of Rust’s regex
+engine. In particular, very little of the internal implementation cares at all
+about searching valid UTF-8 encoded strings. Indeed, internally, the
+implementation converts &str
from the API to &[u8]
fairly quickly and
+just deals with raw bytes. UTF-8 match boundaries are then guaranteed by the
+finite state machine itself rather than any specific string type. This makes it
+possible to not only run regexes on &str
values, but also on &[u8]
values.
Why would you ever want to run a regex on a &[u8]
though? Well, &[u8]
is
+the fundamental way in which one reads data from all sorts of streams, via the
+standard library’s Read
+trait. In particular, there is no platform independent way to determine whether
+what you’re reading from is some binary file or a human readable text file.
+Therefore, if you’re writing a program to search files, you probably need to
+deal with &[u8]
directly unless you’re okay with first converting it to a
+&str
and dropping any bytes that aren’t valid UTF-8. (Or otherwise determine
+the encoding—which is often impractical—and perform a transcoding step.)
+Often, the simplest and most robust way to approach this is to simply treat the
+contents of a file as if it were mostly valid UTF-8 and pass through invalid
+UTF-8 untouched. This may not be the most correct approach though!
One case in particular exacerbates these issues, and that’s memory mapping
+a file. When you memory map a file, that file may be gigabytes big, but all
+you get is a &[u8]
. Converting that to a &str
all in one go is generally
+not a good idea because of the costs associated with doing so, and also
+because it generally causes one to do two passes over the data instead of
+one, which is quite undesirable. It is of course usually possible to do it in an
+incremental way by only parsing chunks at a time, but this is often complex to
+do or impractical. For example, many regex engines only accept one contiguous
+sequence of bytes at a time with no way to perform incremental matching.
bstr
in public APIsThis library is past version 1
and is expected to remain at version 1
for
+the foreseeable future. Therefore, it is encouraged to put types from bstr
+(like BStr
and BString
) in your public API if that makes sense for your
+crate.
With that said, in general, it should be possible to avoid putting anything
+in this crate into your public APIs. Namely, you should never need to use the
+ByteSlice
or ByteVec
traits as bounds on public APIs, since their only
+purpose is to extend the methods on the concrete types [u8]
and Vec<u8>
,
+respectively. Similarly, it should not be necessary to put either the BStr
or
+BString
types into public APIs. If you want to use them internally, then they
+can be converted to/from [u8]
/Vec<u8>
as needed. The conversions are free.
So while it shouldn’t ever be 100% necessary to make bstr
a public
+dependency, there may be cases where it is convenient to do so. This is an
+explicitly supported use case of bstr
, and as such, major version releases
+should be exceptionally rare.
The primary difference between [u8]
and str
is that the former is
+conventionally UTF-8 while the latter is guaranteed to be UTF-8. The phrase
+“conventionally UTF-8” means that a [u8]
may contain bytes that do not form
+a valid UTF-8 sequence, but operations defined on the type in this crate are
+generally most useful on valid UTF-8 sequences. For example, iterating over
+Unicode codepoints or grapheme clusters is an operation that is only defined
+on valid UTF-8. Therefore, when invalid UTF-8 is encountered, the Unicode
+replacement codepoint is substituted. Thus, a byte string that is not UTF-8 at
+all is of limited utility when using this crate.
However, not all operations on byte strings are specifically Unicode aware. For +example, substring search has no specific Unicode semantics ascribed to it. It +works just as well for byte strings that are completely valid UTF-8 as for byte +strings that contain no valid UTF-8 at all. Similarly for replacements and +various other operations that do not need any Unicode specific tailoring.
+Aside from the difference in how UTF-8 is handled, the APIs between [u8]
and
+str
(and Vec<u8>
and String
) are intentionally very similar, including
+maintaining the same behavior for corner cases in things like substring
+splitting. There are, however, some differences:
matches
, but instead, find_iter
.
+In general, this crate does not define any generic
+Pattern
+infrastructure, and instead prefers adding new methods for different
+argument types. For example, matches
can search by a char
or a &str
,
+whereas find_iter
can only search by a byte string. find_char
can be
+used for searching by a char
.SliceConcatExt
in the standard library is unstable, it is not
+possible to reuse that to implement join
and concat
methods. Instead,
+join
and concat
are provided as free
+functions that perform a similar task.String
/str
APIs will panic if a particular index was not on a valid
+UTF-8 code unit sequence boundary. Conversely, no such checking is performed
+in this crate, as is consistent with treating byte strings as a sequence of
+bytes. This means callers are responsible for maintaining a UTF-8 invariant
+if that’s important.starts_with_str
, have a
+_str
suffix to differentiate them from similar routines already defined
+on the [u8]
type. The difference is that starts_with
requires its
+parameter to be a &[u8]
, whereas starts_with_str
permits its parameter
+to be anything that implements AsRef<[u8]>
, which is more flexible. This
+means you can write bytes.starts_with_str("☃")
instead of
+bytes.starts_with("☃".as_bytes())
.Otherwise, you should find most of the APIs between this crate and the standard +library string APIs to be very similar, if not identical.
+Since byte strings are only conventionally UTF-8, there is no guarantee +that byte strings contain valid UTF-8. Indeed, it is perfectly legal for a +byte string to contain arbitrary bytes. However, since this library defines +a string type, it provides many operations specified by Unicode. These +operations are typically only defined over codepoints, and thus have no real +meaning on bytes that are invalid UTF-8 because they do not map to a particular +codepoint.
+For this reason, whenever operations defined only on codepoints are used, this
+library will automatically convert invalid UTF-8 to the Unicode replacement
+codepoint, U+FFFD
, which looks like this: �
. For example, an
+iterator over codepoints will yield a Unicode
+replacement codepoint whenever it comes across bytes that are not valid UTF-8:
use bstr::ByteSlice;
+
+let bs = b"a\xFF\xFFz";
+let chars: Vec<char> = bs.chars().collect();
+assert_eq!(vec!['a', '\u{FFFD}', '\u{FFFD}', 'z'], chars);
There are a few ways in which invalid bytes can be substituted with a Unicode +replacement codepoint. One way, not used by this crate, is to replace every +individual invalid byte with a single replacement codepoint. In contrast, the +approach this crate uses is called the “substitution of maximal subparts,” as +specified by the Unicode Standard (Chapter 3, Section 9). (This approach is +also used by W3C’s Encoding Standard.) In +this strategy, a replacement codepoint is inserted whenever a byte is found +that cannot possibly lead to a valid UTF-8 code unit sequence. If there were +previous bytes that represented a prefix of a well-formed UTF-8 code unit +sequence, then all of those bytes (up to 3) are substituted with a single +replacement codepoint. For example:
+ +use bstr::ByteSlice;
+
+let bs = b"a\xF0\x9F\x87z";
+let chars: Vec<char> = bs.chars().collect();
+// The bytes \xF0\x9F\x87 could lead to a valid UTF-8 sequence, but 3 of them
+// on their own are invalid. Only one replacement codepoint is substituted,
+// which demonstrates the "substitution of maximal subparts" strategy.
+assert_eq!(vec!['a', '\u{FFFD}', 'z'], chars);
If you do need to access the raw bytes for some reason in an iterator like
+Chars
, then you should use the iterator’s “indices” variant, which gives
+the byte offsets containing the invalid UTF-8 bytes that were substituted with
+the replacement codepoint. For example:
use bstr::{B, ByteSlice};
+
+let bs = b"a\xE2\x98z";
+let chars: Vec<(usize, usize, char)> = bs.char_indices().collect();
+// Even though the replacement codepoint is encoded as 3 bytes itself, the
+// byte range given here is only two bytes, corresponding to the original
+// raw bytes.
+assert_eq!(vec![(0, 1, 'a'), (1, 3, '\u{FFFD}'), (3, 4, 'z')], chars);
+
+// Thus, getting the original raw bytes is as simple as slicing the original
+// byte string:
+let chars: Vec<&[u8]> = bs.char_indices().map(|(s, e, _)| &bs[s..e]).collect();
+assert_eq!(vec![B("a"), B(b"\xE2\x98"), B("z")], chars);
One of the premier features of Rust’s standard library is how it handles file +paths. In particular, it makes it very hard to write incorrect code while +simultaneously providing a correct cross platform abstraction for manipulating +file paths. The key challenge that one faces with file paths across platforms +is derived from the following observations:
+(In both cases, certain sequences aren’t allowed. For example a NUL
byte is
+not allowed in either case. But we can ignore this for the purposes of this
+section.)
Byte strings, like the ones provided in this crate, line up really well with +file paths on Unix like systems, which are themselves just arbitrary sequences +of bytes. It turns out that if you treat them as “mostly UTF-8,” then things +work out pretty well. On the contrary, byte strings don’t really work +that well on Windows because it’s not possible to correctly roundtrip file +paths between 16-bit integers and something that looks like UTF-8 without +explicitly defining an encoding to do this for you, which is anathema to byte +strings, which are just bytes.
+Rust’s standard library elegantly solves this problem by specifying an +internal encoding for file paths that’s only used on Windows called +WTF-8. Its key properties are that it +permits losslessly roundtripping file paths on Windows by extending UTF-8 to +support an encoding of surrogate codepoints, while simultaneously supporting +zero-cost conversion from Rust’s Unicode strings to file paths. (Since UTF-8 is +a proper subset of WTF-8.)
+The fundamental point at which the above strategy fails is when you want to +treat file paths as things that look like strings in a zero cost way. In most +cases, this is actually the wrong thing to do, but some cases call for it, +for example, glob or regex matching on file paths. This is because WTF-8 is +treated as an internal implementation detail, and there is no way to access +those bytes via a public API. Therefore, such consumers are limited in what +they can do:
+os_str_bytes
crate.While this library may provide facilities for (1) in the future, currently, +this library only provides facilities for (2) and (3). In particular, a suite +of conversion functions are provided that permit converting between byte +strings, OS strings and file paths. For owned byte strings, they are:
+ByteVec::from_os_string
ByteVec::from_os_str_lossy
ByteVec::from_path_buf
ByteVec::from_path_lossy
ByteVec::into_os_string
ByteVec::into_os_string_lossy
ByteVec::into_path_buf
ByteVec::into_path_buf_lossy
For byte string slices, they are:
+ByteSlice::from_os_str
ByteSlice::from_path
ByteSlice::to_os_str
ByteSlice::to_os_str_lossy
ByteSlice::to_path
ByteSlice::to_path_lossy
On Unix, all of these conversions are rigorously zero cost, which gives one +a way to ergonomically deal with raw file paths exactly as they are using +normal string-related functions. On Windows, these conversion routines perform +a UTF-8 check and either return an error or lossily decode the file path +into valid UTF-8, depending on which function you use. This means that you +cannot roundtrip all file paths on Windows correctly using these conversion +routines. However, this may be an acceptable downside since such file paths +are exceptionally rare. Moreover, roundtripping isn’t always necessary, for +example, if all you’re doing is filtering based on file paths.
+The reason why using byte strings for this is potentially superior to the +standard library’s approach is that a lot of Rust code is already lossily +converting file paths to Rust’s Unicode strings, which are required to be valid +UTF-8, and thus contain latent bugs on Unix where paths with invalid UTF-8 are +not terribly uncommon. If you instead use byte strings, then you’re guaranteed +to write correct code for Unix, at the cost of getting a corner case wrong on +Windows.
+This crate comes with a few features that control standard library, serde +and Unicode support.
+std
- Enabled by default. This provides APIs that require the standard
+library, such as Vec<u8>
and PathBuf
. Enabling this feature also enables
+the alloc
feature and any other relevant std
features for dependencies.alloc
- Enabled by default. This provides APIs that require allocations
+via the alloc
crate, such as Vec<u8>
.unicode
- Enabled by default. This provides APIs that require sizable
+Unicode data compiled into the binary. This includes, but is not limited to,
+grapheme/word/sentence segmenters. When this is disabled, basic support such
+as UTF-8 decoding is still included. Note that currently, enabling this
+feature also requires enabling the std
feature. It is expected that this
+limitation will be lifted at some point.serde
- Enables implementations of serde traits for BStr
, and also
+BString
when alloc
is enabled.&[u8]
that provides convenient string oriented trait impls.char
values that represent an escaping of arbitrary bytes.n
substrings in a byte string, split by a
+separator.n
substrings in a byte string, split by a
+separator, in reverse.&[u8]
with string oriented methods.&[u8]
.pub struct BStr { /* private fields */ }
A wrapper for &[u8]
that provides convenient string oriented trait impls.
If you need ownership or a growable byte string buffer, then use
+BString
.
Using a &BStr
is just like using a &[u8]
, since BStr
+implements Deref
to [u8]
. So all methods available on [u8]
+are also available on BStr
.
A &BStr
has the same representation as a &str
. That is, a &BStr
is
+a fat pointer which consists of a pointer to some bytes and a length.
The BStr
type has a number of trait implementations, and in particular,
+defines equality and ordinal comparisons between &BStr
, &str
and
+&[u8]
for convenience.
The Debug
implementation for BStr
shows its bytes as a normal string.
+For invalid UTF-8, hex escape sequences are used.
The Display
implementation behaves as if BStr
were first lossily
+converted to a str
. Invalid UTF-8 bytes are substituted with the Unicode
+replacement codepoint, which looks like this: �.
Directly creates a BStr
slice from anything that can be converted
+to a byte slice.
This is very similar to the B
function, except this
+returns a &BStr
instead of a &[u8]
.
This is a cost-free conversion.
+You can create BStr
’s from byte arrays, byte slices or even string
+slices:
use bstr::BStr;
+
+let a = BStr::new(b"abc");
+let b = BStr::new(&b"abc"[..]);
+let c = BStr::new("abc");
+
+assert_eq!(a, b);
+assert_eq!(a, c);
sort_floats
)Sorts the slice of floats.
+This sort is in-place (i.e. does not allocate), O(n * log(n)) worst-case, and uses
+the ordering defined by f32::total_cmp
.
This uses the same sorting algorithm as sort_unstable_by
.
#![feature(sort_floats)]
+let mut v = [2.6, -5e-8, f32::NAN, 8.29, f32::INFINITY, -1.0, 0.0, -f32::INFINITY, -0.0];
+
+v.sort_floats();
+let sorted = [-f32::INFINITY, -1.0, -5e-8, -0.0, 0.0, 2.6, 8.29, f32::INFINITY, f32::NAN];
+assert_eq!(&v[..8], &sorted[..8]);
+assert!(v[8].is_nan());
Checks if all bytes in this slice are within the ASCII range.
+ascii_char
)If this slice is_ascii
, returns it as a slice of
+ASCII characters, otherwise returns None
.
ascii_char
)Converts this slice of bytes into a slice of ASCII characters, +without checking whether they’re valid.
+Every byte in the slice must be in 0..=127
, or else this is UB.
Checks that two slices are an ASCII case-insensitive match.
+Same as to_ascii_lowercase(a) == to_ascii_lowercase(b)
,
+but without allocating and copying temporaries.
Converts this slice to its ASCII upper case equivalent in-place.
+ASCII letters ‘a’ to ‘z’ are mapped to ‘A’ to ‘Z’, +but non-ASCII letters are unchanged.
+To return a new uppercased value without modifying the existing one, use
+to_ascii_uppercase
.
Converts this slice to its ASCII lower case equivalent in-place.
+ASCII letters ‘A’ to ‘Z’ are mapped to ‘a’ to ‘z’, +but non-ASCII letters are unchanged.
+To return a new lowercased value without modifying the existing one, use
+to_ascii_lowercase
.
Returns an iterator that produces an escaped version of this slice, +treating it as an ASCII string.
+
+let s = b"0\t\r\n'\"\\\x9d";
+let escaped = s.escape_ascii().to_string();
+assert_eq!(escaped, "0\\t\\r\\n\\'\\\"\\\\\\x9d");
byte_slice_trim_ascii
)Returns a byte slice with leading ASCII whitespace bytes removed.
+‘Whitespace’ refers to the definition used by
+u8::is_ascii_whitespace
.
#![feature(byte_slice_trim_ascii)]
+
+assert_eq!(b" \t hello world\n".trim_ascii_start(), b"hello world\n");
+assert_eq!(b" ".trim_ascii_start(), b"");
+assert_eq!(b"".trim_ascii_start(), b"");
byte_slice_trim_ascii
)Returns a byte slice with trailing ASCII whitespace bytes removed.
+‘Whitespace’ refers to the definition used by
+u8::is_ascii_whitespace
.
#![feature(byte_slice_trim_ascii)]
+
+assert_eq!(b"\r hello world\n ".trim_ascii_end(), b"\r hello world");
+assert_eq!(b" ".trim_ascii_end(), b"");
+assert_eq!(b"".trim_ascii_end(), b"");
byte_slice_trim_ascii
)Returns a byte slice with leading and trailing ASCII whitespace bytes +removed.
+‘Whitespace’ refers to the definition used by
+u8::is_ascii_whitespace
.
#![feature(byte_slice_trim_ascii)]
+
+assert_eq!(b"\r hello world\n ".trim_ascii(), b"hello world");
+assert_eq!(b" ".trim_ascii(), b"");
+assert_eq!(b"".trim_ascii(), b"");
Returns the number of elements in the slice.
+let a = [1, 2, 3];
+assert_eq!(a.len(), 3);
Returns true
if the slice has a length of 0.
let a = [1, 2, 3];
+assert!(!a.is_empty());
Returns the first element of the slice, or None
if it is empty.
let v = [10, 40, 30];
+assert_eq!(Some(&10), v.first());
+
+let w: &[i32] = &[];
+assert_eq!(None, w.first());
Returns a mutable reference to the first element of the slice, or None
if it is empty.
let x = &mut [0, 1, 2];
+
+if let Some(first) = x.first_mut() {
+ *first = 5;
+}
+assert_eq!(x, &[5, 1, 2]);
Returns the first and all the rest of the elements of the slice, or None
if it is empty.
let x = &[0, 1, 2];
+
+if let Some((first, elements)) = x.split_first() {
+ assert_eq!(first, &0);
+ assert_eq!(elements, &[1, 2]);
+}
Returns the first and all the rest of the elements of the slice, or None
if it is empty.
let x = &mut [0, 1, 2];
+
+if let Some((first, elements)) = x.split_first_mut() {
+ *first = 3;
+ elements[0] = 4;
+ elements[1] = 5;
+}
+assert_eq!(x, &[3, 4, 5]);
Returns the last and all the rest of the elements of the slice, or None
if it is empty.
let x = &[0, 1, 2];
+
+if let Some((last, elements)) = x.split_last() {
+ assert_eq!(last, &2);
+ assert_eq!(elements, &[0, 1]);
+}
Returns the last and all the rest of the elements of the slice, or None
if it is empty.
let x = &mut [0, 1, 2];
+
+if let Some((last, elements)) = x.split_last_mut() {
+ *last = 3;
+ elements[0] = 4;
+ elements[1] = 5;
+}
+assert_eq!(x, &[4, 5, 3]);
Returns the last element of the slice, or None
if it is empty.
let v = [10, 40, 30];
+assert_eq!(Some(&30), v.last());
+
+let w: &[i32] = &[];
+assert_eq!(None, w.last());
Returns a mutable reference to the last element of the slice, or None
if it is empty.
+let x = &mut [0, 1, 2];
+
+if let Some(last) = x.last_mut() {
+ *last = 10;
+}
+assert_eq!(x, &[0, 1, 10]);
slice_first_last_chunk
)Returns the first N
elements of the slice, or None
if it has fewer than N
elements.
#![feature(slice_first_last_chunk)]
+
+let u = [10, 40, 30];
+assert_eq!(Some(&[10, 40]), u.first_chunk::<2>());
+
+let v: &[i32] = &[10];
+assert_eq!(None, v.first_chunk::<2>());
+
+let w: &[i32] = &[];
+assert_eq!(Some(&[]), w.first_chunk::<0>());
slice_first_last_chunk
)Returns a mutable reference to the first N
elements of the slice,
+or None
if it has fewer than N
elements.
#![feature(slice_first_last_chunk)]
+
+let x = &mut [0, 1, 2];
+
+if let Some(first) = x.first_chunk_mut::<2>() {
+ first[0] = 5;
+ first[1] = 4;
+}
+assert_eq!(x, &[5, 4, 2]);
slice_first_last_chunk
)Returns the first N
elements of the slice and the remainder,
+or None
if it has fewer than N
elements.
#![feature(slice_first_last_chunk)]
+
+let x = &[0, 1, 2];
+
+if let Some((first, elements)) = x.split_first_chunk::<2>() {
+ assert_eq!(first, &[0, 1]);
+ assert_eq!(elements, &[2]);
+}
slice_first_last_chunk
)Returns a mutable reference to the first N
elements of the slice and the remainder,
+or None
if it has fewer than N
elements.
#![feature(slice_first_last_chunk)]
+
+let x = &mut [0, 1, 2];
+
+if let Some((first, elements)) = x.split_first_chunk_mut::<2>() {
+ first[0] = 3;
+ first[1] = 4;
+ elements[0] = 5;
+}
+assert_eq!(x, &[3, 4, 5]);
slice_first_last_chunk
)Returns the last N
elements of the slice and the remainder,
+or None
if it has fewer than N
elements.
#![feature(slice_first_last_chunk)]
+
+let x = &[0, 1, 2];
+
+if let Some((last, elements)) = x.split_last_chunk::<2>() {
+ assert_eq!(last, &[1, 2]);
+ assert_eq!(elements, &[0]);
+}
slice_first_last_chunk
)Returns a mutable reference to the last N
elements of the slice and the remainder,
+or None
if it has fewer than N
elements.
#![feature(slice_first_last_chunk)]
+
+let x = &mut [0, 1, 2];
+
+if let Some((last, elements)) = x.split_last_chunk_mut::<2>() {
+ last[0] = 3;
+ last[1] = 4;
+ elements[0] = 5;
+}
+assert_eq!(x, &[5, 3, 4]);
slice_first_last_chunk
)Returns the last N
elements of the slice, or None
if it has fewer than N
elements.
#![feature(slice_first_last_chunk)]
+
+let u = [10, 40, 30];
+assert_eq!(Some(&[40, 30]), u.last_chunk::<2>());
+
+let v: &[i32] = &[10];
+assert_eq!(None, v.last_chunk::<2>());
+
+let w: &[i32] = &[];
+assert_eq!(Some(&[]), w.last_chunk::<0>());
slice_first_last_chunk
)Returns a mutable reference to the last N
elements of the slice,
+or None
if it has fewer than N
elements.
+#![feature(slice_first_last_chunk)]
+
+let x = &mut [0, 1, 2];
+
+if let Some(last) = x.last_chunk_mut::<2>() {
+ last[0] = 10;
+ last[1] = 20;
+}
+assert_eq!(x, &[0, 10, 20]);
Returns a reference to an element or subslice depending on the type of +index.
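+If given a position, returns a reference to the element at that position, or None
if out of bounds. If given a range, returns the subslice corresponding to that range, or None
if out of bounds.
let v = [10, 40, 30];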
+assert_eq!(Some(&40), v.get(1));
+assert_eq!(Some(&[10, 40][..]), v.get(0..2));
+assert_eq!(None, v.get(3));
+assert_eq!(None, v.get(0..4));
Returns a reference to an element or subslice, without doing bounds +checking.
+For a safe alternative see get
.
Calling this method with an out-of-bounds index is undefined behavior +even if the resulting reference is not used.
+let x = &[1, 2, 4];
+
+unsafe {
+ assert_eq!(x.get_unchecked(1), &2);
+}
Returns a mutable reference to an element or subslice, without doing +bounds checking.
+For a safe alternative see get_mut
.
Calling this method with an out-of-bounds index is undefined behavior +even if the resulting reference is not used.
+let x = &mut [1, 2, 4];
+
+unsafe {
+ let elem = x.get_unchecked_mut(1);
+ *elem = 13;
+}
+assert_eq!(x, &[1, 13, 4]);
Returns a raw pointer to the slice’s buffer.
+The caller must ensure that the slice outlives the pointer this +function returns, or else it will end up pointing to garbage.
+The caller must also ensure that the memory the pointer (non-transitively) points to
+is never written to (except inside an UnsafeCell
) using this pointer or any pointer
+derived from it. If you need to mutate the contents of the slice, use as_mut_ptr
.
Modifying the container referenced by this slice may cause its buffer +to be reallocated, which would also make any pointers to it invalid.
+let x = &[1, 2, 4];
+let x_ptr = x.as_ptr();
+
+unsafe {
+ for i in 0..x.len() {
+ assert_eq!(x.get_unchecked(i), &*x_ptr.add(i));
+ }
+}
Returns an unsafe mutable pointer to the slice’s buffer.
+The caller must ensure that the slice outlives the pointer this +function returns, or else it will end up pointing to garbage.
+Modifying the container referenced by this slice may cause its buffer +to be reallocated, which would also make any pointers to it invalid.
+let x = &mut [1, 2, 4];
+let x_ptr = x.as_mut_ptr();
+
+unsafe {
+ for i in 0..x.len() {
+ *x_ptr.add(i) += 2;
+ }
+}
+assert_eq!(x, &[3, 4, 6]);
Returns the two raw pointers spanning the slice.
+The returned range is half-open, which means that the end pointer +points one past the last element of the slice. This way, an empty +slice is represented by two equal pointers, and the difference between +the two pointers represents the size of the slice.
+See as_ptr
for warnings on using these pointers. The end pointer
+requires extra caution, as it does not point to a valid element in the
+slice.
This function is useful for interacting with foreign interfaces which +use two pointers to refer to a range of elements in memory, as is +common in C++.
+It can also be useful to check if a pointer to an element refers to an +element of this slice:
+ +let a = [1, 2, 3];
+let x = &a[1] as *const _;
+let y = &5 as *const _;
+
+assert!(a.as_ptr_range().contains(&x));
+assert!(!a.as_ptr_range().contains(&y));
Returns the two unsafe mutable pointers spanning the slice.
+The returned range is half-open, which means that the end pointer +points one past the last element of the slice. This way, an empty +slice is represented by two equal pointers, and the difference between +the two pointers represents the size of the slice.
+See as_mut_ptr
for warnings on using these pointers. The end
+pointer requires extra caution, as it does not point to a valid element
+in the slice.
This function is useful for interacting with foreign interfaces which +use two pointers to refer to a range of elements in memory, as is +common in C++.
+Swaps two elements in the slice.
+If a
is equal to b
, it’s guaranteed that elements won’t change value.
Panics if a
or b
are out of bounds.
let mut v = ["a", "b", "c", "d", "e"];
+v.swap(2, 4);
+assert!(v == ["a", "b", "e", "d", "c"]);
slice_swap_unchecked
)Swaps two elements in the slice, without doing bounds checking.
+For a safe alternative see swap
.
Calling this method with an out-of-bounds index is undefined behavior.
+The caller has to ensure that a < self.len()
and b < self.len()
.
#![feature(slice_swap_unchecked)]
+
+let mut v = ["a", "b", "c", "d"];
+// SAFETY: we know that 1 and 3 are both indices of the slice
+unsafe { v.swap_unchecked(1, 3) };
+assert!(v == ["a", "d", "c", "b"]);
Reverses the order of elements in the slice, in place.
+let mut v = [1, 2, 3];
+v.reverse();
+assert!(v == [3, 2, 1]);
Returns an iterator over the slice.
+The iterator yields all items from start to end.
+let x = &[1, 2, 4];
+let mut iterator = x.iter();
+
+assert_eq!(iterator.next(), Some(&1));
+assert_eq!(iterator.next(), Some(&2));
+assert_eq!(iterator.next(), Some(&4));
+assert_eq!(iterator.next(), None);
Returns an iterator that allows modifying each value.
+The iterator yields all items from start to end.
+let x = &mut [1, 2, 4];
+for elem in x.iter_mut() {
+ *elem += 2;
+}
+assert_eq!(x, &[3, 4, 6]);
Returns an iterator over all contiguous windows of length
+size
. The windows overlap. If the slice is shorter than
+size
, the iterator returns no values.
Panics if size
is 0.
let slice = ['r', 'u', 's', 't'];
+let mut iter = slice.windows(2);
+assert_eq!(iter.next().unwrap(), &['r', 'u']);
+assert_eq!(iter.next().unwrap(), &['u', 's']);
+assert_eq!(iter.next().unwrap(), &['s', 't']);
+assert!(iter.next().is_none());
If the slice is shorter than size
:
let slice = ['f', 'o', 'o'];
+let mut iter = slice.windows(4);
+assert!(iter.next().is_none());
There’s no windows_mut
, as that existing would let safe code violate the
+“only one &mut
at a time to the same thing” rule. However, you can sometimes
+use Cell::as_slice_of_cells
in
+conjunction with windows
to accomplish something similar:
use std::cell::Cell;
+
+let mut array = ['R', 'u', 's', 't', ' ', '2', '0', '1', '5'];
+let slice = &mut array[..];
+let slice_of_cells: &[Cell<char>] = Cell::from_mut(slice).as_slice_of_cells();
+for w in slice_of_cells.windows(3) {
+ Cell::swap(&w[0], &w[2]);
+}
+assert_eq!(array, ['s', 't', ' ', '2', '0', '1', '5', 'u', 'R']);
Returns an iterator over chunk_size
elements of the slice at a time, starting at the
+beginning of the slice.
The chunks are slices and do not overlap. If chunk_size
does not divide the length of the
+slice, then the last chunk will not have length chunk_size
.
See chunks_exact
for a variant of this iterator that returns chunks of always exactly
+chunk_size
elements, and rchunks
for the same iterator but starting at the end of the
+slice.
Panics if chunk_size
is 0.
let slice = ['l', 'o', 'r', 'e', 'm'];
+let mut iter = slice.chunks(2);
+assert_eq!(iter.next().unwrap(), &['l', 'o']);
+assert_eq!(iter.next().unwrap(), &['r', 'e']);
+assert_eq!(iter.next().unwrap(), &['m']);
+assert!(iter.next().is_none());
Returns an iterator over chunk_size
elements of the slice at a time, starting at the
+beginning of the slice.
The chunks are mutable slices, and do not overlap. If chunk_size
does not divide the
+length of the slice, then the last chunk will not have length chunk_size
.
See chunks_exact_mut
for a variant of this iterator that returns chunks of always
+exactly chunk_size
elements, and rchunks_mut
for the same iterator but starting at
+the end of the slice.
Panics if chunk_size
is 0.
let v = &mut [0, 0, 0, 0, 0];
+let mut count = 1;
+
+for chunk in v.chunks_mut(2) {
+ for elem in chunk.iter_mut() {
+ *elem += count;
+ }
+ count += 1;
+}
+assert_eq!(v, &[1, 1, 2, 2, 3]);
Returns an iterator over chunk_size
elements of the slice at a time, starting at the
+beginning of the slice.
The chunks are slices and do not overlap. If chunk_size
does not divide the length of the
+slice, then the last up to chunk_size-1
elements will be omitted and can be retrieved
+from the remainder
function of the iterator.
Due to each chunk having exactly chunk_size
elements, the compiler can often optimize the
+resulting code better than in the case of chunks
.
See chunks
for a variant of this iterator that also returns the remainder as a smaller
+chunk, and rchunks_exact
for the same iterator but starting at the end of the slice.
Panics if chunk_size
is 0.
let slice = ['l', 'o', 'r', 'e', 'm'];
+let mut iter = slice.chunks_exact(2);
+assert_eq!(iter.next().unwrap(), &['l', 'o']);
+assert_eq!(iter.next().unwrap(), &['r', 'e']);
+assert!(iter.next().is_none());
+assert_eq!(iter.remainder(), &['m']);
Returns an iterator over chunk_size
elements of the slice at a time, starting at the
+beginning of the slice.
The chunks are mutable slices, and do not overlap. If chunk_size
does not divide the
+length of the slice, then the last up to chunk_size-1
elements will be omitted and can be
+retrieved from the into_remainder
function of the iterator.
Due to each chunk having exactly chunk_size
elements, the compiler can often optimize the
+resulting code better than in the case of chunks_mut
.
See chunks_mut
for a variant of this iterator that also returns the remainder as a
+smaller chunk, and rchunks_exact_mut
for the same iterator but starting at the end of
+the slice.
Panics if chunk_size
is 0.
let v = &mut [0, 0, 0, 0, 0];
+let mut count = 1;
+
+for chunk in v.chunks_exact_mut(2) {
+ for elem in chunk.iter_mut() {
+ *elem += count;
+ }
+ count += 1;
+}
+assert_eq!(v, &[1, 1, 2, 2, 0]);
slice_as_chunks
)Splits the slice into a slice of N
-element arrays,
+assuming that there’s no remainder.
This may only be called when both of the following hold:
+The slice splits exactly into N-element chunks (aka self.len() % N == 0).
+N != 0.
#![feature(slice_as_chunks)]
+let slice: &[char] = &['l', 'o', 'r', 'e', 'm', '!'];
+let chunks: &[[char; 1]] =
+ // SAFETY: 1-element chunks never have remainder
+ unsafe { slice.as_chunks_unchecked() };
+assert_eq!(chunks, &[['l'], ['o'], ['r'], ['e'], ['m'], ['!']]);
+let chunks: &[[char; 3]] =
+ // SAFETY: The slice length (6) is a multiple of 3
+ unsafe { slice.as_chunks_unchecked() };
+assert_eq!(chunks, &[['l', 'o', 'r'], ['e', 'm', '!']]);
+
+// These would be unsound:
+// let chunks: &[[_; 5]] = slice.as_chunks_unchecked() // The slice length is not a multiple of 5
+// let chunks: &[[_; 0]] = slice.as_chunks_unchecked() // Zero-length chunks are never allowed
slice_as_chunks
)Splits the slice into a slice of N
-element arrays,
+starting at the beginning of the slice,
+and a remainder slice with length strictly less than N
.
Panics if N
is 0. This check will most probably get changed to a compile time
+error before this method gets stabilized.
#![feature(slice_as_chunks)]
+let slice = ['l', 'o', 'r', 'e', 'm'];
+let (chunks, remainder) = slice.as_chunks();
+assert_eq!(chunks, &[['l', 'o'], ['r', 'e']]);
+assert_eq!(remainder, &['m']);
If you expect the slice to be an exact multiple, you can combine
+let
-else
with an empty slice pattern:
#![feature(slice_as_chunks)]
+let slice = ['R', 'u', 's', 't'];
+let (chunks, []) = slice.as_chunks::<2>() else {
+ panic!("slice didn't have even length")
+};
+assert_eq!(chunks, &[['R', 'u'], ['s', 't']]);
slice_as_chunks
)Splits the slice into a slice of N
-element arrays,
+starting at the end of the slice,
+and a remainder slice with length strictly less than N
.
Panics if N
is 0. This check will most probably get changed to a compile time
+error before this method gets stabilized.
#![feature(slice_as_chunks)]
+let slice = ['l', 'o', 'r', 'e', 'm'];
+let (remainder, chunks) = slice.as_rchunks();
+assert_eq!(remainder, &['l']);
+assert_eq!(chunks, &[['o', 'r'], ['e', 'm']]);
array_chunks
)Returns an iterator over N
elements of the slice at a time, starting at the
+beginning of the slice.
The chunks are array references and do not overlap. If N
does not divide the
+length of the slice, then the last up to N-1
elements will be omitted and can be
+retrieved from the remainder
function of the iterator.
This method is the const generic equivalent of chunks_exact
.
Panics if N
is 0. This check will most probably get changed to a compile time
+error before this method gets stabilized.
#![feature(array_chunks)]
+let slice = ['l', 'o', 'r', 'e', 'm'];
+let mut iter = slice.array_chunks();
+assert_eq!(iter.next().unwrap(), &['l', 'o']);
+assert_eq!(iter.next().unwrap(), &['r', 'e']);
+assert!(iter.next().is_none());
+assert_eq!(iter.remainder(), &['m']);
slice_as_chunks
)Splits the slice into a slice of N
-element arrays,
+assuming that there’s no remainder.
This may only be called when both of the following hold:
+The slice splits exactly into N-element chunks (aka self.len() % N == 0).
+N != 0.
#![feature(slice_as_chunks)]
+let slice: &mut [char] = &mut ['l', 'o', 'r', 'e', 'm', '!'];
+let chunks: &mut [[char; 1]] =
+ // SAFETY: 1-element chunks never have remainder
+ unsafe { slice.as_chunks_unchecked_mut() };
+chunks[0] = ['L'];
+assert_eq!(chunks, &[['L'], ['o'], ['r'], ['e'], ['m'], ['!']]);
+let chunks: &mut [[char; 3]] =
+ // SAFETY: The slice length (6) is a multiple of 3
+ unsafe { slice.as_chunks_unchecked_mut() };
+chunks[1] = ['a', 'x', '?'];
+assert_eq!(slice, &['L', 'o', 'r', 'a', 'x', '?']);
+
+// These would be unsound:
+// let chunks: &[[_; 5]] = slice.as_chunks_unchecked_mut() // The slice length is not a multiple of 5
+// let chunks: &[[_; 0]] = slice.as_chunks_unchecked_mut() // Zero-length chunks are never allowed
slice_as_chunks
)Splits the slice into a slice of N
-element arrays,
+starting at the beginning of the slice,
+and a remainder slice with length strictly less than N
.
Panics if N
is 0. This check will most probably get changed to a compile time
+error before this method gets stabilized.
#![feature(slice_as_chunks)]
+let v = &mut [0, 0, 0, 0, 0];
+let mut count = 1;
+
+let (chunks, remainder) = v.as_chunks_mut();
+remainder[0] = 9;
+for chunk in chunks {
+ *chunk = [count; 2];
+ count += 1;
+}
+assert_eq!(v, &[1, 1, 2, 2, 9]);
slice_as_chunks
)Splits the slice into a slice of N
-element arrays,
+starting at the end of the slice,
+and a remainder slice with length strictly less than N
.
Panics if N
is 0. This check will most probably get changed to a compile time
+error before this method gets stabilized.
#![feature(slice_as_chunks)]
+let v = &mut [0, 0, 0, 0, 0];
+let mut count = 1;
+
+let (remainder, chunks) = v.as_rchunks_mut();
+remainder[0] = 9;
+for chunk in chunks {
+ *chunk = [count; 2];
+ count += 1;
+}
+assert_eq!(v, &[9, 1, 1, 2, 2]);
array_chunks
)Returns an iterator over N
elements of the slice at a time, starting at the
+beginning of the slice.
The chunks are mutable array references and do not overlap. If N
does not divide
+the length of the slice, then the last up to N-1
elements will be omitted and
+can be retrieved from the into_remainder
function of the iterator.
This method is the const generic equivalent of chunks_exact_mut
.
Panics if N
is 0. This check will most probably get changed to a compile time
+error before this method gets stabilized.
#![feature(array_chunks)]
+let v = &mut [0, 0, 0, 0, 0];
+let mut count = 1;
+
+for chunk in v.array_chunks_mut() {
+ *chunk = [count; 2];
+ count += 1;
+}
+assert_eq!(v, &[1, 1, 2, 2, 0]);
array_windows
)Returns an iterator over overlapping windows of N
elements of a slice,
+starting at the beginning of the slice.
This is the const generic equivalent of windows
.
If N
is greater than the size of the slice, it will return no windows.
Panics if N
is 0. This check will most probably get changed to a compile time
+error before this method gets stabilized.
#![feature(array_windows)]
+let slice = [0, 1, 2, 3];
+let mut iter = slice.array_windows();
+assert_eq!(iter.next().unwrap(), &[0, 1]);
+assert_eq!(iter.next().unwrap(), &[1, 2]);
+assert_eq!(iter.next().unwrap(), &[2, 3]);
+assert!(iter.next().is_none());
Returns an iterator over chunk_size
elements of the slice at a time, starting at the end
+of the slice.
The chunks are slices and do not overlap. If chunk_size
does not divide the length of the
+slice, then the last chunk will not have length chunk_size
.
See rchunks_exact
for a variant of this iterator that returns chunks of always exactly
+chunk_size
elements, and chunks
for the same iterator but starting at the beginning
+of the slice.
Panics if chunk_size
is 0.
let slice = ['l', 'o', 'r', 'e', 'm'];
+let mut iter = slice.rchunks(2);
+assert_eq!(iter.next().unwrap(), &['e', 'm']);
+assert_eq!(iter.next().unwrap(), &['o', 'r']);
+assert_eq!(iter.next().unwrap(), &['l']);
+assert!(iter.next().is_none());
Returns an iterator over chunk_size
elements of the slice at a time, starting at the end
+of the slice.
The chunks are mutable slices, and do not overlap. If chunk_size
does not divide the
+length of the slice, then the last chunk will not have length chunk_size
.
See rchunks_exact_mut
for a variant of this iterator that returns chunks of always
+exactly chunk_size
elements, and chunks_mut
for the same iterator but starting at the
+beginning of the slice.
Panics if chunk_size
is 0.
let v = &mut [0, 0, 0, 0, 0];
+let mut count = 1;
+
+for chunk in v.rchunks_mut(2) {
+ for elem in chunk.iter_mut() {
+ *elem += count;
+ }
+ count += 1;
+}
+assert_eq!(v, &[3, 2, 2, 1, 1]);
Returns an iterator over chunk_size
elements of the slice at a time, starting at the
+end of the slice.
The chunks are slices and do not overlap. If chunk_size
does not divide the length of the
+slice, then the last up to chunk_size-1
elements will be omitted and can be retrieved
+from the remainder
function of the iterator.
Due to each chunk having exactly chunk_size
elements, the compiler can often optimize the
+resulting code better than in the case of rchunks
.
See rchunks
for a variant of this iterator that also returns the remainder as a smaller
+chunk, and chunks_exact
for the same iterator but starting at the beginning of the
+slice.
Panics if chunk_size
is 0.
let slice = ['l', 'o', 'r', 'e', 'm'];
+let mut iter = slice.rchunks_exact(2);
+assert_eq!(iter.next().unwrap(), &['e', 'm']);
+assert_eq!(iter.next().unwrap(), &['o', 'r']);
+assert!(iter.next().is_none());
+assert_eq!(iter.remainder(), &['l']);
Returns an iterator over chunk_size
elements of the slice at a time, starting at the end
+of the slice.
The chunks are mutable slices, and do not overlap. If chunk_size
does not divide the
+length of the slice, then the last up to chunk_size-1
elements will be omitted and can be
+retrieved from the into_remainder
function of the iterator.
Due to each chunk having exactly chunk_size
elements, the compiler can often optimize the
+resulting code better than in the case of rchunks_mut
.
See rchunks_mut
for a variant of this iterator that also returns the remainder as a
+smaller chunk, and chunks_exact_mut
for the same iterator but starting at the beginning
+of the slice.
Panics if chunk_size
is 0.
let v = &mut [0, 0, 0, 0, 0];
+let mut count = 1;
+
+for chunk in v.rchunks_exact_mut(2) {
+ for elem in chunk.iter_mut() {
+ *elem += count;
+ }
+ count += 1;
+}
+assert_eq!(v, &[0, 2, 2, 1, 1]);
slice_group_by
)Returns an iterator over the slice producing non-overlapping runs +of elements using the predicate to separate them.
+The predicate is called on each pair of consecutive elements;
+that is, it is called on slice[0]
and slice[1]
+then on slice[1]
and slice[2]
and so on.
#![feature(slice_group_by)]
+
+let slice = &[1, 1, 1, 3, 3, 2, 2, 2];
+
+let mut iter = slice.group_by(|a, b| a == b);
+
+assert_eq!(iter.next(), Some(&[1, 1, 1][..]));
+assert_eq!(iter.next(), Some(&[3, 3][..]));
+assert_eq!(iter.next(), Some(&[2, 2, 2][..]));
+assert_eq!(iter.next(), None);
This method can be used to extract the sorted subslices:
+ +#![feature(slice_group_by)]
+
+let slice = &[1, 1, 2, 3, 2, 3, 2, 3, 4];
+
+let mut iter = slice.group_by(|a, b| a <= b);
+
+assert_eq!(iter.next(), Some(&[1, 1, 2, 3][..]));
+assert_eq!(iter.next(), Some(&[2, 3][..]));
+assert_eq!(iter.next(), Some(&[2, 3, 4][..]));
+assert_eq!(iter.next(), None);
slice_group_by
)Returns an iterator over the slice producing non-overlapping mutable +runs of elements using the predicate to separate them.
+The predicate is called on each pair of consecutive elements;
+that is, it is called on slice[0]
and slice[1]
+then on slice[1]
and slice[2]
and so on.
#![feature(slice_group_by)]
+
+let slice = &mut [1, 1, 1, 3, 3, 2, 2, 2];
+
+let mut iter = slice.group_by_mut(|a, b| a == b);
+
+assert_eq!(iter.next(), Some(&mut [1, 1, 1][..]));
+assert_eq!(iter.next(), Some(&mut [3, 3][..]));
+assert_eq!(iter.next(), Some(&mut [2, 2, 2][..]));
+assert_eq!(iter.next(), None);
This method can be used to extract the sorted subslices:
+ +#![feature(slice_group_by)]
+
+let slice = &mut [1, 1, 2, 3, 2, 3, 2, 3, 4];
+
+let mut iter = slice.group_by_mut(|a, b| a <= b);
+
+assert_eq!(iter.next(), Some(&mut [1, 1, 2, 3][..]));
+assert_eq!(iter.next(), Some(&mut [2, 3][..]));
+assert_eq!(iter.next(), Some(&mut [2, 3, 4][..]));
+assert_eq!(iter.next(), None);
Divides one slice into two at an index.
+The first will contain all indices from [0, mid)
(excluding
+the index mid
itself) and the second will contain all
+indices from [mid, len)
(excluding the index len
itself).
Panics if mid > len
.
let v = [1, 2, 3, 4, 5, 6];
+
+{
+ let (left, right) = v.split_at(0);
+ assert_eq!(left, []);
+ assert_eq!(right, [1, 2, 3, 4, 5, 6]);
+}
+
+{
+ let (left, right) = v.split_at(2);
+ assert_eq!(left, [1, 2]);
+ assert_eq!(right, [3, 4, 5, 6]);
+}
+
+{
+ let (left, right) = v.split_at(6);
+ assert_eq!(left, [1, 2, 3, 4, 5, 6]);
+ assert_eq!(right, []);
+}
Divides one mutable slice into two at an index.
+The first will contain all indices from [0, mid)
(excluding
+the index mid
itself) and the second will contain all
+indices from [mid, len)
(excluding the index len
itself).
Panics if mid > len
.
let mut v = [1, 0, 3, 0, 5, 6];
+let (left, right) = v.split_at_mut(2);
+assert_eq!(left, [1, 0]);
+assert_eq!(right, [3, 0, 5, 6]);
+left[1] = 2;
+right[1] = 4;
+assert_eq!(v, [1, 2, 3, 4, 5, 6]);
slice_split_at_unchecked
)Divides one slice into two at an index, without doing bounds checking.
+The first will contain all indices from [0, mid)
(excluding
+the index mid
itself) and the second will contain all
+indices from [mid, len)
(excluding the index len
itself).
For a safe alternative see split_at
.
Calling this method with an out-of-bounds index is undefined behavior
+even if the resulting reference is not used. The caller has to ensure that
+0 <= mid <= self.len()
.
#![feature(slice_split_at_unchecked)]
+
+let v = [1, 2, 3, 4, 5, 6];
+
+unsafe {
+ let (left, right) = v.split_at_unchecked(0);
+ assert_eq!(left, []);
+ assert_eq!(right, [1, 2, 3, 4, 5, 6]);
+}
+
+unsafe {
+ let (left, right) = v.split_at_unchecked(2);
+ assert_eq!(left, [1, 2]);
+ assert_eq!(right, [3, 4, 5, 6]);
+}
+
+unsafe {
+ let (left, right) = v.split_at_unchecked(6);
+ assert_eq!(left, [1, 2, 3, 4, 5, 6]);
+ assert_eq!(right, []);
+}
slice_split_at_unchecked
)Divides one mutable slice into two at an index, without doing bounds checking.
+The first will contain all indices from [0, mid)
(excluding
+the index mid
itself) and the second will contain all
+indices from [mid, len)
(excluding the index len
itself).
For a safe alternative see split_at_mut
.
Calling this method with an out-of-bounds index is undefined behavior
+even if the resulting reference is not used. The caller has to ensure that
+0 <= mid <= self.len()
.
#![feature(slice_split_at_unchecked)]
+
+let mut v = [1, 0, 3, 0, 5, 6];
+// scoped to restrict the lifetime of the borrows
+unsafe {
+ let (left, right) = v.split_at_mut_unchecked(2);
+ assert_eq!(left, [1, 0]);
+ assert_eq!(right, [3, 0, 5, 6]);
+ left[1] = 2;
+ right[1] = 4;
+}
+assert_eq!(v, [1, 2, 3, 4, 5, 6]);
split_array
)Divides one slice into an array and a remainder slice at an index.
+The array will contain all indices from [0, N)
(excluding
+the index N
itself) and the slice will contain all
+indices from [N, len)
(excluding the index len
itself).
Panics if N > len
.
#![feature(split_array)]
+
+let v = &[1, 2, 3, 4, 5, 6][..];
+
+{
+ let (left, right) = v.split_array_ref::<0>();
+ assert_eq!(left, &[]);
+ assert_eq!(right, [1, 2, 3, 4, 5, 6]);
+}
+
+{
+ let (left, right) = v.split_array_ref::<2>();
+ assert_eq!(left, &[1, 2]);
+ assert_eq!(right, [3, 4, 5, 6]);
+}
+
+{
+ let (left, right) = v.split_array_ref::<6>();
+ assert_eq!(left, &[1, 2, 3, 4, 5, 6]);
+ assert_eq!(right, []);
+}
split_array
)Divides one mutable slice into an array and a remainder slice at an index.
+The array will contain all indices from [0, N)
(excluding
+the index N
itself) and the slice will contain all
+indices from [N, len)
(excluding the index len
itself).
Panics if N > len
.
#![feature(split_array)]
+
+let mut v = &mut [1, 0, 3, 0, 5, 6][..];
+let (left, right) = v.split_array_mut::<2>();
+assert_eq!(left, &mut [1, 0]);
+assert_eq!(right, [3, 0, 5, 6]);
+left[1] = 2;
+right[1] = 4;
+assert_eq!(v, [1, 2, 3, 4, 5, 6]);
split_array
)Divides one slice into an array and a remainder slice at an index from +the end.
+The slice will contain all indices from [0, len - N)
(excluding
+the index len - N
itself) and the array will contain all
+indices from [len - N, len)
(excluding the index len
itself).
Panics if N > len
.
#![feature(split_array)]
+
+let v = &[1, 2, 3, 4, 5, 6][..];
+
+{
+ let (left, right) = v.rsplit_array_ref::<0>();
+ assert_eq!(left, [1, 2, 3, 4, 5, 6]);
+ assert_eq!(right, &[]);
+}
+
+{
+ let (left, right) = v.rsplit_array_ref::<2>();
+ assert_eq!(left, [1, 2, 3, 4]);
+ assert_eq!(right, &[5, 6]);
+}
+
+{
+ let (left, right) = v.rsplit_array_ref::<6>();
+ assert_eq!(left, []);
+ assert_eq!(right, &[1, 2, 3, 4, 5, 6]);
+}
split_array
)Divides one mutable slice into an array and a remainder slice at an +index from the end.
+The slice will contain all indices from [0, len - N)
(excluding
+the index len - N
itself) and the array will contain all
+indices from [len - N, len)
(excluding the index len
itself).
Panics if N > len
.
#![feature(split_array)]
+
+let mut v = &mut [1, 0, 3, 0, 5, 6][..];
+let (left, right) = v.rsplit_array_mut::<4>();
+assert_eq!(left, [1, 0]);
+assert_eq!(right, &mut [3, 0, 5, 6]);
+left[1] = 2;
+right[1] = 4;
+assert_eq!(v, [1, 2, 3, 4, 5, 6]);
Returns an iterator over subslices separated by elements that match
+pred
. The matched element is not contained in the subslices.
let slice = [10, 40, 33, 20];
+let mut iter = slice.split(|num| num % 3 == 0);
+
+assert_eq!(iter.next().unwrap(), &[10, 40]);
+assert_eq!(iter.next().unwrap(), &[20]);
+assert!(iter.next().is_none());
If the first element is matched, an empty slice will be the first item +returned by the iterator. Similarly, if the last element in the slice +is matched, an empty slice will be the last item returned by the +iterator:
+ +let slice = [10, 40, 33];
+let mut iter = slice.split(|num| num % 3 == 0);
+
+assert_eq!(iter.next().unwrap(), &[10, 40]);
+assert_eq!(iter.next().unwrap(), &[]);
+assert!(iter.next().is_none());
If two matched elements are directly adjacent, an empty slice will be +present between them:
+ +let slice = [10, 6, 33, 20];
+let mut iter = slice.split(|num| num % 3 == 0);
+
+assert_eq!(iter.next().unwrap(), &[10]);
+assert_eq!(iter.next().unwrap(), &[]);
+assert_eq!(iter.next().unwrap(), &[20]);
+assert!(iter.next().is_none());
Returns an iterator over mutable subslices separated by elements that
+match pred
. The matched element is not contained in the subslices.
let mut v = [10, 40, 30, 20, 60, 50];
+
+for group in v.split_mut(|num| *num % 3 == 0) {
+ group[0] = 1;
+}
+assert_eq!(v, [1, 40, 30, 1, 60, 1]);
Returns an iterator over subslices separated by elements that match
+pred
. The matched element is contained in the end of the previous
+subslice as a terminator.
let slice = [10, 40, 33, 20];
+let mut iter = slice.split_inclusive(|num| num % 3 == 0);
+
+assert_eq!(iter.next().unwrap(), &[10, 40, 33]);
+assert_eq!(iter.next().unwrap(), &[20]);
+assert!(iter.next().is_none());
If the last element of the slice is matched, +that element will be considered the terminator of the preceding slice. +That slice will be the last item returned by the iterator.
+ +let slice = [3, 10, 40, 33];
+let mut iter = slice.split_inclusive(|num| num % 3 == 0);
+
+assert_eq!(iter.next().unwrap(), &[3]);
+assert_eq!(iter.next().unwrap(), &[10, 40, 33]);
+assert!(iter.next().is_none());
Returns an iterator over mutable subslices separated by elements that
+match pred
. The matched element is contained in the previous
+subslice as a terminator.
let mut v = [10, 40, 30, 20, 60, 50];
+
+for group in v.split_inclusive_mut(|num| *num % 3 == 0) {
+ let terminator_idx = group.len()-1;
+ group[terminator_idx] = 1;
+}
+assert_eq!(v, [10, 40, 1, 20, 1, 1]);
Returns an iterator over subslices separated by elements that match
+pred
, starting at the end of the slice and working backwards.
+The matched element is not contained in the subslices.
let slice = [11, 22, 33, 0, 44, 55];
+let mut iter = slice.rsplit(|num| *num == 0);
+
+assert_eq!(iter.next().unwrap(), &[44, 55]);
+assert_eq!(iter.next().unwrap(), &[11, 22, 33]);
+assert_eq!(iter.next(), None);
As with split()
, if the first or last element is matched, an empty
+slice will be the first (or last) item returned by the iterator.
let v = &[0, 1, 1, 2, 3, 5, 8];
+let mut it = v.rsplit(|n| *n % 2 == 0);
+assert_eq!(it.next().unwrap(), &[]);
+assert_eq!(it.next().unwrap(), &[3, 5]);
+assert_eq!(it.next().unwrap(), &[1, 1]);
+assert_eq!(it.next().unwrap(), &[]);
+assert_eq!(it.next(), None);
Returns an iterator over mutable subslices separated by elements that
+match pred
, starting at the end of the slice and working
+backwards. The matched element is not contained in the subslices.
let mut v = [100, 400, 300, 200, 600, 500];
+
+let mut count = 0;
+for group in v.rsplit_mut(|num| *num % 3 == 0) {
+ count += 1;
+ group[0] = count;
+}
+assert_eq!(v, [3, 400, 300, 2, 600, 1]);
Returns an iterator over subslices separated by elements that match
+pred
, limited to returning at most n
items. The matched element is
+not contained in the subslices.
The last element returned, if any, will contain the remainder of the +slice.
+Print the slice split once by numbers divisible by 3 (i.e., [10, 40]
,
+[20, 60, 50]
):
let v = [10, 40, 30, 20, 60, 50];
+
+for group in v.splitn(2, |num| *num % 3 == 0) {
+ println!("{group:?}");
+}
Returns an iterator over mutable subslices separated by elements that match
+pred
, limited to returning at most n
items. The matched element is
+not contained in the subslices.
The last element returned, if any, will contain the remainder of the +slice.
+let mut v = [10, 40, 30, 20, 60, 50];
+
+for group in v.splitn_mut(2, |num| *num % 3 == 0) {
+ group[0] = 1;
+}
+assert_eq!(v, [1, 40, 30, 1, 60, 50]);
Returns an iterator over subslices separated by elements that match
+pred
limited to returning at most n
items. This starts at the end of
+the slice and works backwards. The matched element is not contained in
+the subslices.
The last element returned, if any, will contain the remainder of the +slice.
+Print the slice split once, starting from the end, by numbers divisible
+by 3 (i.e., [50]
, [10, 40, 30, 20]
):
let v = [10, 40, 30, 20, 60, 50];
+
+for group in v.rsplitn(2, |num| *num % 3 == 0) {
+ println!("{group:?}");
+}
Returns an iterator over subslices separated by elements that match
+pred
limited to returning at most n
items. This starts at the end of
+the slice and works backwards. The matched element is not contained in
+the subslices.
The last element returned, if any, will contain the remainder of the +slice.
+let mut s = [10, 40, 30, 20, 60, 50];
+
+for group in s.rsplitn_mut(2, |num| *num % 3 == 0) {
+ group[0] = 1;
+}
+assert_eq!(s, [1, 40, 30, 20, 60, 1]);
slice_split_once
)Splits the slice on the first element that matches the specified +predicate.
+If any matching elements are present in the slice, returns the prefix
+before the match and suffix after. The matching element itself is not
+included. If no elements match, returns None
.
#![feature(slice_split_once)]
+let s = [1, 2, 3, 2, 4];
+assert_eq!(s.split_once(|&x| x == 2), Some((
+ &[1][..],
+ &[3, 2, 4][..]
+)));
+assert_eq!(s.split_once(|&x| x == 0), None);
slice_split_once
)Splits the slice on the last element that matches the specified +predicate.
+If any matching elements are present in the slice, returns the prefix
+before the match and suffix after. The matching element itself is not
+included. If no elements match, returns None
.
#![feature(slice_split_once)]
+let s = [1, 2, 3, 2, 4];
+assert_eq!(s.rsplit_once(|&x| x == 2), Some((
+ &[1, 2, 3][..],
+ &[4][..]
+)));
+assert_eq!(s.rsplit_once(|&x| x == 0), None);
Returns true
if the slice contains an element with the given value.
This operation is O(n).
+Note that if you have a sorted slice, binary_search
may be faster.
let v = [10, 40, 30];
+assert!(v.contains(&30));
+assert!(!v.contains(&50));
If you do not have a &T
, but some other value that you can compare
+with one (for example, String
implements PartialEq<str>
), you can
+use iter().any
:
let v = [String::from("hello"), String::from("world")]; // slice of `String`
+assert!(v.iter().any(|e| e == "hello")); // search with `&str`
+assert!(!v.iter().any(|e| e == "hi"));
Returns true
if needle
is a prefix of the slice.
let v = [10, 40, 30];
+assert!(v.starts_with(&[10]));
+assert!(v.starts_with(&[10, 40]));
+assert!(!v.starts_with(&[50]));
+assert!(!v.starts_with(&[10, 50]));
Always returns true
if needle
is an empty slice:
let v = &[10, 40, 30];
+assert!(v.starts_with(&[]));
+let v: &[u8] = &[];
+assert!(v.starts_with(&[]));
Returns true
if needle
is a suffix of the slice.
let v = [10, 40, 30];
+assert!(v.ends_with(&[30]));
+assert!(v.ends_with(&[40, 30]));
+assert!(!v.ends_with(&[50]));
+assert!(!v.ends_with(&[50, 30]));
Always returns true
if needle
is an empty slice:
let v = &[10, 40, 30];
+assert!(v.ends_with(&[]));
+let v: &[u8] = &[];
+assert!(v.ends_with(&[]));
Returns a subslice with the prefix removed.
+If the slice starts with prefix
, returns the subslice after the prefix, wrapped in Some
.
+If prefix
is empty, simply returns the original slice.
If the slice does not start with prefix
, returns None
.
let v = &[10, 40, 30];
+assert_eq!(v.strip_prefix(&[10]), Some(&[40, 30][..]));
+assert_eq!(v.strip_prefix(&[10, 40]), Some(&[30][..]));
+assert_eq!(v.strip_prefix(&[50]), None);
+assert_eq!(v.strip_prefix(&[10, 50]), None);
+
+let prefix : &str = "he";
+assert_eq!(b"hello".strip_prefix(prefix.as_bytes()),
+ Some(b"llo".as_ref()));
Returns a subslice with the suffix removed.
+If the slice ends with suffix
, returns the subslice before the suffix, wrapped in Some
.
+If suffix
is empty, simply returns the original slice.
If the slice does not end with suffix
, returns None
.
let v = &[10, 40, 30];
+assert_eq!(v.strip_suffix(&[30]), Some(&[10, 40][..]));
+assert_eq!(v.strip_suffix(&[40, 30]), Some(&[10][..]));
+assert_eq!(v.strip_suffix(&[50]), None);
+assert_eq!(v.strip_suffix(&[50, 30]), None);
Binary searches this slice for a given element. +If the slice is not sorted, the returned result is unspecified and +meaningless.
+If the value is found then Result::Ok
is returned, containing the
+index of the matching element. If there are multiple matches, then any
+one of the matches could be returned. The index is chosen
+deterministically, but is subject to change in future versions of Rust.
+If the value is not found then Result::Err
is returned, containing
+the index where a matching element could be inserted while maintaining
+sorted order.
See also binary_search_by
, binary_search_by_key
, and partition_point
.
Looks up a series of four elements. The first is found, with a
+uniquely determined position; the second and third are not
+found; the fourth could match any position in [1, 4]
.
let s = [0, 1, 1, 1, 1, 2, 3, 5, 8, 13, 21, 34, 55];
+
+assert_eq!(s.binary_search(&13), Ok(9));
+assert_eq!(s.binary_search(&4), Err(7));
+assert_eq!(s.binary_search(&100), Err(13));
+let r = s.binary_search(&1);
+assert!(match r { Ok(1..=4) => true, _ => false, });
If you want to find that whole range of matching items, rather than
+an arbitrary matching one, that can be done using partition_point
:
let s = [0, 1, 1, 1, 1, 2, 3, 5, 8, 13, 21, 34, 55];
+
+let low = s.partition_point(|x| x < &1);
+assert_eq!(low, 1);
+let high = s.partition_point(|x| x <= &1);
+assert_eq!(high, 5);
+let r = s.binary_search(&1);
+assert!((low..high).contains(&r.unwrap()));
+
+assert!(s[..low].iter().all(|&x| x < 1));
+assert!(s[low..high].iter().all(|&x| x == 1));
+assert!(s[high..].iter().all(|&x| x > 1));
+
+// For something not found, the "range" of equal items is empty
+assert_eq!(s.partition_point(|x| x < &11), 9);
+assert_eq!(s.partition_point(|x| x <= &11), 9);
+assert_eq!(s.binary_search(&11), Err(9));
If you want to insert an item to a sorted vector, while maintaining
+sort order, consider using partition_point
:
let mut s = vec![0, 1, 1, 1, 1, 2, 3, 5, 8, 13, 21, 34, 55];
+let num = 42;
+let idx = s.partition_point(|&x| x < num);
+// The above is equivalent to `let idx = s.binary_search(&num).unwrap_or_else(|x| x);`
+s.insert(idx, num);
+assert_eq!(s, [0, 1, 1, 1, 1, 2, 3, 5, 8, 13, 21, 34, 42, 55]);
Binary searches this slice with a comparator function.
+The comparator function should return an order code that indicates
+whether its argument is Less
, Equal
or Greater
than the desired
+target.
+If the slice is not sorted or if the comparator function does not
+implement an order consistent with the sort order of the underlying
+slice, the returned result is unspecified and meaningless.
If the value is found then Result::Ok
is returned, containing the
+index of the matching element. If there are multiple matches, then any
+one of the matches could be returned. The index is chosen
+deterministically, but is subject to change in future versions of Rust.
+If the value is not found then Result::Err
is returned, containing
+the index where a matching element could be inserted while maintaining
+sorted order.
See also binary_search
, binary_search_by_key
, and partition_point
.
Looks up a series of four elements. The first is found, with a
+uniquely determined position; the second and third are not
+found; the fourth could match any position in [1, 4]
.
let s = [0, 1, 1, 1, 1, 2, 3, 5, 8, 13, 21, 34, 55];
+
+let seek = 13;
+assert_eq!(s.binary_search_by(|probe| probe.cmp(&seek)), Ok(9));
+let seek = 4;
+assert_eq!(s.binary_search_by(|probe| probe.cmp(&seek)), Err(7));
+let seek = 100;
+assert_eq!(s.binary_search_by(|probe| probe.cmp(&seek)), Err(13));
+let seek = 1;
+let r = s.binary_search_by(|probe| probe.cmp(&seek));
+assert!(match r { Ok(1..=4) => true, _ => false, });
Binary searches this slice with a key extraction function.
+Assumes that the slice is sorted by the key, for instance with
+sort_by_key
using the same key extraction function.
+If the slice is not sorted by the key, the returned result is
+unspecified and meaningless.
If the value is found then Result::Ok
is returned, containing the
+index of the matching element. If there are multiple matches, then any
+one of the matches could be returned. The index is chosen
+deterministically, but is subject to change in future versions of Rust.
+If the value is not found then Result::Err
is returned, containing
+the index where a matching element could be inserted while maintaining
+sorted order.
See also binary_search
, binary_search_by
, and partition_point
.
Looks up a series of four elements in a slice of pairs sorted by
+their second elements. The first is found, with a uniquely
+determined position; the second and third are not found; the
+fourth could match any position in [1, 4]
.
let s = [(0, 0), (2, 1), (4, 1), (5, 1), (3, 1),
+ (1, 2), (2, 3), (4, 5), (5, 8), (3, 13),
+ (1, 21), (2, 34), (4, 55)];
+
+assert_eq!(s.binary_search_by_key(&13, |&(a, b)| b), Ok(9));
+assert_eq!(s.binary_search_by_key(&4, |&(a, b)| b), Err(7));
+assert_eq!(s.binary_search_by_key(&100, |&(a, b)| b), Err(13));
+let r = s.binary_search_by_key(&1, |&(a, b)| b);
+assert!(match r { Ok(1..=4) => true, _ => false, });
Sorts the slice, but might not preserve the order of equal elements.
+This sort is unstable (i.e., may reorder equal elements), in-place +(i.e., does not allocate), and O(n * log(n)) worst-case.
+The current algorithm is based on pattern-defeating quicksort by Orson Peters, +which combines the fast average case of randomized quicksort with the fast worst case of +heapsort, while achieving linear time on slices with certain patterns. It uses some +randomization to avoid degenerate cases, but with a fixed seed to always provide +deterministic behavior.
+It is typically faster than stable sorting, except in a few special cases, e.g., when the +slice consists of several concatenated sorted sequences.
+let mut v = [-5, 4, 1, -3, 2];
+
+v.sort_unstable();
+assert!(v == [-5, -3, 1, 2, 4]);
Sorts the slice with a comparator function, but might not preserve the order of equal +elements.
+This sort is unstable (i.e., may reorder equal elements), in-place +(i.e., does not allocate), and O(n * log(n)) worst-case.
+The comparator function must define a total ordering for the elements in the slice. If
+the ordering is not total, the order of the elements is unspecified. An order is a
+total order if it is (for all a
, b
and c
):
- total and antisymmetric: exactly one of a < b, a == b or a > b is true, and
- transitive: a < b and b < c implies a < c. The same must hold for both == and >.

For example, while f64
doesn’t implement Ord
because NaN != NaN
, we can use
+partial_cmp
as our sort function when we know the slice doesn’t contain a NaN
.
let mut floats = [5f64, 4.0, 1.0, 3.0, 2.0];
+floats.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap());
+assert_eq!(floats, [1.0, 2.0, 3.0, 4.0, 5.0]);
The current algorithm is based on pattern-defeating quicksort by Orson Peters, +which combines the fast average case of randomized quicksort with the fast worst case of +heapsort, while achieving linear time on slices with certain patterns. It uses some +randomization to avoid degenerate cases, but with a fixed seed to always provide +deterministic behavior.
+It is typically faster than stable sorting, except in a few special cases, e.g., when the +slice consists of several concatenated sorted sequences.
+let mut v = [5, 4, 1, 3, 2];
+v.sort_unstable_by(|a, b| a.cmp(b));
+assert!(v == [1, 2, 3, 4, 5]);
+
+// reverse sorting
+v.sort_unstable_by(|a, b| b.cmp(a));
+assert!(v == [5, 4, 3, 2, 1]);
Sorts the slice with a key extraction function, but might not preserve the order of equal +elements.
+This sort is unstable (i.e., may reorder equal elements), in-place +(i.e., does not allocate), and O(m * n * log(n)) worst-case, where the key function is +O(m).
+The current algorithm is based on pattern-defeating quicksort by Orson Peters, +which combines the fast average case of randomized quicksort with the fast worst case of +heapsort, while achieving linear time on slices with certain patterns. It uses some +randomization to avoid degenerate cases, but with a fixed seed to always provide +deterministic behavior.
+Due to its key calling strategy, sort_unstable_by_key
+is likely to be slower than sort_by_cached_key
in
+cases where the key function is expensive.
let mut v = [-5i32, 4, 1, -3, 2];
+
+v.sort_unstable_by_key(|k| k.abs());
+assert!(v == [1, 2, -3, 4, -5]);
Reorder the slice such that the element at index
is at its final sorted position.
This reordering has the additional property that any value at position i < index
will be
+less than or equal to any value at a position j > index
. Additionally, this reordering is
+unstable (i.e. any number of equal elements may end up at position index
), in-place
+(i.e. does not allocate), and runs in O(n) time.
+This function is also known as “kth element” in other libraries.
It returns a triplet of the following from the reordered slice:
+the subslice prior to index
, the element at index
, and the subslice after index
;
+accordingly, the values in those two subslices will respectively all be less-than-or-equal-to
+and greater-than-or-equal-to the value of the element at index
.
The current algorithm is an introselect implementation based on Pattern Defeating Quicksort, which is also
+the basis for sort_unstable
. The fallback algorithm is Median of Medians using Tukey’s Ninther for
+pivot selection, which guarantees linear runtime for all inputs.
Panics when index >= len()
, meaning it always panics on empty slices.
let mut v = [-5i32, 4, 1, -3, 2];
+
+// Find the median
+v.select_nth_unstable(2);
+
+// We are only guaranteed the slice will be one of the following, based on the way we sort
+// about the specified index.
+assert!(v == [-3, -5, 1, 2, 4] ||
+ v == [-5, -3, 1, 2, 4] ||
+ v == [-3, -5, 1, 4, 2] ||
+ v == [-5, -3, 1, 4, 2]);
Reorder the slice with a comparator function such that the element at index
is at its
+final sorted position.
This reordering has the additional property that any value at position i < index
will be
+less than or equal to any value at a position j > index
using the comparator function.
+Additionally, this reordering is unstable (i.e. any number of equal elements may end up at
+position index
), in-place (i.e. does not allocate), and runs in O(n) time.
+This function is also known as “kth element” in other libraries.
It returns a triplet of the following from
+the slice reordered according to the provided comparator function: the subslice prior to
+index
, the element at index
, and the subslice after index
; accordingly, the values in
+those two subslices will respectively all be less-than-or-equal-to and greater-than-or-equal-to
+the value of the element at index
.
The current algorithm is an introselect implementation based on Pattern Defeating Quicksort, which is also
+the basis for sort_unstable
. The fallback algorithm is Median of Medians using Tukey’s Ninther for
+pivot selection, which guarantees linear runtime for all inputs.
Panics when index >= len()
, meaning it always panics on empty slices.
let mut v = [-5i32, 4, 1, -3, 2];
+
+// Find the median as if the slice were sorted in descending order.
+v.select_nth_unstable_by(2, |a, b| b.cmp(a));
+
+// We are only guaranteed the slice will be one of the following, based on the way we sort
+// about the specified index.
+assert!(v == [2, 4, 1, -5, -3] ||
+ v == [2, 4, 1, -3, -5] ||
+ v == [4, 2, 1, -5, -3] ||
+ v == [4, 2, 1, -3, -5]);
Reorder the slice with a key extraction function such that the element at index
is at its
+final sorted position.
This reordering has the additional property that any value at position i < index
will be
+less than or equal to any value at a position j > index
using the key extraction function.
+Additionally, this reordering is unstable (i.e. any number of equal elements may end up at
+position index
), in-place (i.e. does not allocate), and runs in O(n) time.
+This function is also known as “kth element” in other libraries.
It returns a triplet of the following from
+the slice reordered according to the provided key extraction function: the subslice prior to
+index
, the element at index
, and the subslice after index
; accordingly, the values in
+those two subslices will respectively all be less-than-or-equal-to and greater-than-or-equal-to
+the value of the element at index
.
The current algorithm is an introselect implementation based on Pattern Defeating Quicksort, which is also
+the basis for sort_unstable
. The fallback algorithm is Median of Medians using Tukey’s Ninther for
+pivot selection, which guarantees linear runtime for all inputs.
Panics when index >= len()
, meaning it always panics on empty slices.
let mut v = [-5i32, 4, 1, -3, 2];
+
+// Return the median as if the array were sorted according to absolute value.
+v.select_nth_unstable_by_key(2, |a| a.abs());
+
+// We are only guaranteed the slice will be one of the following, based on the way we sort
+// about the specified index.
+assert!(v == [1, 2, -3, 4, -5] ||
+ v == [1, 2, -3, -5, 4] ||
+ v == [2, 1, -3, 4, -5] ||
+ v == [2, 1, -3, -5, 4]);
slice_partition_dedup
)Moves all consecutive repeated elements to the end of the slice according to the
+PartialEq
trait implementation.
Returns two slices. The first contains no consecutive repeated elements. +The second contains all the duplicates in no specified order.
+If the slice is sorted, the first returned slice contains no duplicates.
+#![feature(slice_partition_dedup)]
+
+let mut slice = [1, 2, 2, 3, 3, 2, 1, 1];
+
+let (dedup, duplicates) = slice.partition_dedup();
+
+assert_eq!(dedup, [1, 2, 3, 2, 1]);
+assert_eq!(duplicates, [2, 3, 1]);
slice_partition_dedup
)Moves all but the first of consecutive elements to the end of the slice satisfying +a given equality relation.
+Returns two slices. The first contains no consecutive repeated elements. +The second contains all the duplicates in no specified order.
+The same_bucket
function is passed references to two elements from the slice and
+must determine if the elements compare equal. The elements are passed in opposite order
+from their order in the slice, so if same_bucket(a, b)
returns true
, a
is moved
+to the end of the slice.
If the slice is sorted, the first returned slice contains no duplicates.
+#![feature(slice_partition_dedup)]
+
+let mut slice = ["foo", "Foo", "BAZ", "Bar", "bar", "baz", "BAZ"];
+
+let (dedup, duplicates) = slice.partition_dedup_by(|a, b| a.eq_ignore_ascii_case(b));
+
+assert_eq!(dedup, ["foo", "BAZ", "Bar", "baz"]);
+assert_eq!(duplicates, ["bar", "Foo", "BAZ"]);
slice_partition_dedup
)Moves all but the first of consecutive elements to the end of the slice that resolve +to the same key.
+Returns two slices. The first contains no consecutive repeated elements. +The second contains all the duplicates in no specified order.
+If the slice is sorted, the first returned slice contains no duplicates.
+#![feature(slice_partition_dedup)]
+
+let mut slice = [10, 20, 21, 30, 30, 20, 11, 13];
+
+let (dedup, duplicates) = slice.partition_dedup_by_key(|i| *i / 10);
+
+assert_eq!(dedup, [10, 20, 30, 20, 11]);
+assert_eq!(duplicates, [21, 30, 13]);
Rotates the slice in-place such that the first mid
elements of the
+slice move to the end while the last self.len() - mid
elements move to
+the front. After calling rotate_left
, the element previously at index
+mid
will become the first element in the slice.
This function will panic if mid
is greater than the length of the
+slice. Note that mid == self.len()
does not panic and is a no-op
+rotation.
Takes linear (in self.len()
) time.
let mut a = ['a', 'b', 'c', 'd', 'e', 'f'];
+a.rotate_left(2);
+assert_eq!(a, ['c', 'd', 'e', 'f', 'a', 'b']);
Rotating a subslice:
+ +let mut a = ['a', 'b', 'c', 'd', 'e', 'f'];
+a[1..5].rotate_left(1);
+assert_eq!(a, ['a', 'c', 'd', 'e', 'b', 'f']);
Rotates the slice in-place such that the first self.len() - k
+elements of the slice move to the end while the last k
elements move
+to the front. After calling rotate_right
, the element previously at
+index self.len() - k
will become the first element in the slice.
This function will panic if k
is greater than the length of the
+slice. Note that k == self.len()
does not panic and is a no-op
+rotation.
Takes linear (in self.len()
) time.
let mut a = ['a', 'b', 'c', 'd', 'e', 'f'];
+a.rotate_right(2);
+assert_eq!(a, ['e', 'f', 'a', 'b', 'c', 'd']);
Rotating a subslice:
+ +let mut a = ['a', 'b', 'c', 'd', 'e', 'f'];
+a[1..5].rotate_right(1);
+assert_eq!(a, ['a', 'e', 'b', 'c', 'd', 'f']);
Fills self
with elements by cloning value
.
let mut buf = vec![0; 10];
+buf.fill(1);
+assert_eq!(buf, vec![1; 10]);
Fills self
with elements returned by calling a closure repeatedly.
This method uses a closure to create new values. If you’d rather
+Clone
a given value, use fill
. If you want to use the Default
+trait to generate values, you can pass Default::default
as the
+argument.
let mut buf = vec![1; 10];
+buf.fill_with(Default::default);
+assert_eq!(buf, vec![0; 10]);
Copies the elements from src
into self
.
The length of src
must be the same as self
.
This function will panic if the two slices have different lengths.
+Cloning two elements from a slice into another:
+ +let src = [1, 2, 3, 4];
+let mut dst = [0, 0];
+
+// Because the slices have to be the same length,
+// we slice the source slice from four elements
+// to two. It will panic if we don't do this.
+dst.clone_from_slice(&src[2..]);
+
+assert_eq!(src, [1, 2, 3, 4]);
+assert_eq!(dst, [3, 4]);
Rust enforces that there can only be one mutable reference with no
+immutable references to a particular piece of data in a particular
+scope. Because of this, attempting to use clone_from_slice
on a
+single slice will result in a compile failure:
let mut slice = [1, 2, 3, 4, 5];
+
+slice[..2].clone_from_slice(&slice[3..]); // compile fail!
To work around this, we can use split_at_mut
to create two distinct
+sub-slices from a slice:
let mut slice = [1, 2, 3, 4, 5];
+
+{
+ let (left, right) = slice.split_at_mut(2);
+ left.clone_from_slice(&right[1..]);
+}
+
+assert_eq!(slice, [4, 5, 3, 4, 5]);
Copies all elements from src
into self
, using a memcpy.
The length of src
must be the same as self
.
If T
does not implement Copy
, use clone_from_slice
.
This function will panic if the two slices have different lengths.
+Copying two elements from a slice into another:
+ +let src = [1, 2, 3, 4];
+let mut dst = [0, 0];
+
+// Because the slices have to be the same length,
+// we slice the source slice from four elements
+// to two. It will panic if we don't do this.
+dst.copy_from_slice(&src[2..]);
+
+assert_eq!(src, [1, 2, 3, 4]);
+assert_eq!(dst, [3, 4]);
Rust enforces that there can only be one mutable reference with no
+immutable references to a particular piece of data in a particular
+scope. Because of this, attempting to use copy_from_slice
on a
+single slice will result in a compile failure:
let mut slice = [1, 2, 3, 4, 5];
+
+slice[..2].copy_from_slice(&slice[3..]); // compile fail!
To work around this, we can use split_at_mut
to create two distinct
+sub-slices from a slice:
let mut slice = [1, 2, 3, 4, 5];
+
+{
+ let (left, right) = slice.split_at_mut(2);
+ left.copy_from_slice(&right[1..]);
+}
+
+assert_eq!(slice, [4, 5, 3, 4, 5]);
Copies elements from one part of the slice to another part of itself, +using a memmove.
+src
is the range within self
to copy from. dest
is the starting
+index of the range within self
to copy to, which will have the same
+length as src
. The two ranges may overlap. The ends of the two ranges
+must be less than or equal to self.len()
.
This function will panic if either range exceeds the end of the slice,
+or if the end of src
is before the start.
Copying four bytes within a slice:
+ +let mut bytes = *b"Hello, World!";
+
+bytes.copy_within(1..5, 8);
+
+assert_eq!(&bytes, b"Hello, Wello!");
Swaps all elements in self
with those in other
.
The length of other
must be the same as self
.
This function will panic if the two slices have different lengths.
+Swapping two elements across slices:
+ +let mut slice1 = [0, 0];
+let mut slice2 = [1, 2, 3, 4];
+
+slice1.swap_with_slice(&mut slice2[2..]);
+
+assert_eq!(slice1, [3, 4]);
+assert_eq!(slice2, [1, 2, 0, 0]);
Rust enforces that there can only be one mutable reference to a
+particular piece of data in a particular scope. Because of this,
+attempting to use swap_with_slice
on a single slice will result in
+a compile failure:
let mut slice = [1, 2, 3, 4, 5];
+slice[..2].swap_with_slice(&mut slice[3..]); // compile fail!
To work around this, we can use split_at_mut
to create two distinct
+mutable sub-slices from a slice:
let mut slice = [1, 2, 3, 4, 5];
+
+{
+ let (left, right) = slice.split_at_mut(2);
+ left.swap_with_slice(&mut right[1..]);
+}
+
+assert_eq!(slice, [4, 5, 3, 1, 2]);
Transmute the slice to a slice of another type, ensuring alignment of the types is +maintained.
+This method splits the slice into three distinct slices: prefix, correctly aligned middle +slice of a new type, and the suffix slice. How exactly the slice is split up is not +specified; the middle part may be smaller than necessary. However, if this fails to return a +maximal middle part, that is because code is running in a context where performance does not +matter, such as a sanitizer attempting to find alignment bugs. Regular code running +in a default (debug or release) execution will return a maximal middle part.
+This method has no purpose when either input element T
or output element U
are
+zero-sized and will return the original slice without splitting anything.
This method is essentially a transmute
with respect to the elements in the returned
+middle slice, so all the usual caveats pertaining to transmute::<T, U>
also apply here.
Basic usage:
+ +unsafe {
+ let bytes: [u8; 7] = [1, 2, 3, 4, 5, 6, 7];
+ let (prefix, shorts, suffix) = bytes.align_to::<u16>();
+ // less_efficient_algorithm_for_bytes(prefix);
+ // more_efficient_algorithm_for_aligned_shorts(shorts);
+ // less_efficient_algorithm_for_bytes(suffix);
+}
Transmute the mutable slice to a mutable slice of another type, ensuring alignment of the +types is maintained.
+This method splits the slice into three distinct slices: prefix, correctly aligned middle +slice of a new type, and the suffix slice. How exactly the slice is split up is not +specified; the middle part may be smaller than necessary. However, if this fails to return a +maximal middle part, that is because code is running in a context where performance does not +matter, such as a sanitizer attempting to find alignment bugs. Regular code running +in a default (debug or release) execution will return a maximal middle part.
+This method has no purpose when either input element T
or output element U
are
+zero-sized and will return the original slice without splitting anything.
This method is essentially a transmute
with respect to the elements in the returned
+middle slice, so all the usual caveats pertaining to transmute::<T, U>
also apply here.
Basic usage:
+ +unsafe {
+ let mut bytes: [u8; 7] = [1, 2, 3, 4, 5, 6, 7];
+ let (prefix, shorts, suffix) = bytes.align_to_mut::<u16>();
+ // less_efficient_algorithm_for_bytes(prefix);
+ // more_efficient_algorithm_for_aligned_shorts(shorts);
+ // less_efficient_algorithm_for_bytes(suffix);
+}
portable_simd
)Split a slice into a prefix, a middle of aligned SIMD types, and a suffix.
+This is a safe wrapper around slice::align_to
, so has the same weak
+postconditions as that method. You’re only assured that
+self.len() == prefix.len() + middle.len() * LANES + suffix.len()
.
Notably, all of the following are possible:
+- prefix.len() >= LANES.
- middle.is_empty() despite self.len() >= 3 * LANES.
- suffix.len() >= LANES.

That said, this is a safe method, so if you’re only writing safe code, +then this can at most cause incorrect logic, not unsoundness.
+This will panic if the size of the SIMD type is different from
+LANES
times that of the scalar.
At the time of writing, the trait restrictions on Simd<T, LANES>
keep
+that from ever happening, as only power-of-two numbers of lanes are
+supported. It’s possible that, in the future, those restrictions might
+be lifted in a way that would make it possible to see panics from this
+method for something like LANES == 3
.
#![feature(portable_simd)]
+use core::simd::SimdFloat;
+
+let short = &[1, 2, 3];
+let (prefix, middle, suffix) = short.as_simd::<4>();
+assert_eq!(middle, []); // Not enough elements for anything in the middle
+
+// They might be split in any possible way between prefix and suffix
+let it = prefix.iter().chain(suffix).copied();
+assert_eq!(it.collect::<Vec<_>>(), vec![1, 2, 3]);
+
+fn basic_simd_sum(x: &[f32]) -> f32 {
+ use std::ops::Add;
+ use std::simd::f32x4;
+ let (prefix, middle, suffix) = x.as_simd();
+ let sums = f32x4::from_array([
+ prefix.iter().copied().sum(),
+ 0.0,
+ 0.0,
+ suffix.iter().copied().sum(),
+ ]);
+ let sums = middle.iter().copied().fold(sums, f32x4::add);
+ sums.reduce_sum()
+}
+
+let numbers: Vec<f32> = (1..101).map(|x| x as _).collect();
+assert_eq!(basic_simd_sum(&numbers[1..99]), 4949.0);
portable_simd
)Split a mutable slice into a mutable prefix, a middle of aligned SIMD types, +and a mutable suffix.
+This is a safe wrapper around slice::align_to_mut
, so has the same weak
+postconditions as that method. You’re only assured that
+self.len() == prefix.len() + middle.len() * LANES + suffix.len()
.
Notably, all of the following are possible:
+- prefix.len() >= LANES.
- middle.is_empty() despite self.len() >= 3 * LANES.
- suffix.len() >= LANES.

That said, this is a safe method, so if you’re only writing safe code, +then this can at most cause incorrect logic, not unsoundness.
+This is the mutable version of slice::as_simd
; see that for examples.
This will panic if the size of the SIMD type is different from
+LANES
times that of the scalar.
At the time of writing, the trait restrictions on Simd<T, LANES>
keep
+that from ever happening, as only power-of-two numbers of lanes are
+supported. It’s possible that, in the future, those restrictions might
+be lifted in a way that would make it possible to see panics from this
+method for something like LANES == 3
.
is_sorted
)Checks if the elements of this slice are sorted.
+That is, for each element a
and its following element b
, a <= b
must hold. If the
+slice yields exactly zero or one element, true
is returned.
Note that if Self::Item
is only PartialOrd
, but not Ord
, the above definition
+implies that this function returns false
if any two consecutive items are not
+comparable.
#![feature(is_sorted)]
+let empty: [i32; 0] = [];
+
+assert!([1, 2, 2, 9].is_sorted());
+assert!(![1, 3, 2, 4].is_sorted());
+assert!([0].is_sorted());
+assert!(empty.is_sorted());
+assert!(![0.0, 1.0, f32::NAN].is_sorted());
is_sorted
)Checks if the elements of this slice are sorted using the given comparator function.
+Instead of using PartialOrd::partial_cmp
, this function uses the given compare
+function to determine the ordering of two elements. Apart from that, it’s equivalent to
+is_sorted
; see its documentation for more information.
is_sorted
)Checks if the elements of this slice are sorted using the given key extraction function.
+Instead of comparing the slice’s elements directly, this function compares the keys of the
+elements, as determined by f
. Apart from that, it’s equivalent to is_sorted
; see its
+documentation for more information.
#![feature(is_sorted)]
+
+assert!(["c", "bb", "aaa"].is_sorted_by_key(|s| s.len()));
+assert!(![-2i32, -1, 0, 3].is_sorted_by_key(|n| n.abs()));
Returns the index of the partition point according to the given predicate +(the index of the first element of the second partition).
+The slice is assumed to be partitioned according to the given predicate.
+This means that all elements for which the predicate returns true are at the start of the slice
+and all elements for which the predicate returns false are at the end.
+For example, [7, 15, 3, 5, 4, 12, 6]
is partitioned under the predicate x % 2 != 0
+(all odd numbers are at the start, all even at the end).
If this slice is not partitioned, the returned result is unspecified and meaningless, +as this method performs a kind of binary search.
+See also binary_search
, binary_search_by
, and binary_search_by_key
.
let v = [1, 2, 3, 3, 5, 6, 7];
+let i = v.partition_point(|&x| x < 5);
+
+assert_eq!(i, 4);
+assert!(v[..i].iter().all(|&x| x < 5));
+assert!(v[i..].iter().all(|&x| !(x < 5)));
If all elements of the slice match the predicate, including if the slice +is empty, then the length of the slice will be returned:
+ +let a = [2, 4, 8];
+assert_eq!(a.partition_point(|x| x < &100), a.len());
+let a: [i32; 0] = [];
+assert_eq!(a.partition_point(|x| x < &100), 0);
If you want to insert an item to a sorted vector, while maintaining +sort order:
+ +let mut s = vec![0, 1, 1, 1, 1, 2, 3, 5, 8, 13, 21, 34, 55];
+let num = 42;
+let idx = s.partition_point(|&x| x < num);
+s.insert(idx, num);
+assert_eq!(s, [0, 1, 1, 1, 1, 2, 3, 5, 8, 13, 21, 34, 42, 55]);
This is a nightly-only experimental API. (slice_take)
Removes the subslice corresponding to the given range
+and returns a reference to it.
+Returns None
and does not modify the slice if the given
+range is out of bounds.
Note that this method only accepts one-sided ranges such as
+2..
or ..6
, but not 2..6
.
Taking the first three elements of a slice:
+ +#![feature(slice_take)]
+
+let mut slice: &[_] = &['a', 'b', 'c', 'd'];
+let mut first_three = slice.take(..3).unwrap();
+
+assert_eq!(slice, &['d']);
+assert_eq!(first_three, &['a', 'b', 'c']);
Taking the last two elements of a slice:
+ +#![feature(slice_take)]
+
+let mut slice: &[_] = &['a', 'b', 'c', 'd'];
+let mut tail = slice.take(2..).unwrap();
+
+assert_eq!(slice, &['a', 'b']);
+assert_eq!(tail, &['c', 'd']);
Getting None
when range
is out of bounds:
#![feature(slice_take)]
+
+let mut slice: &[_] = &['a', 'b', 'c', 'd'];
+
+assert_eq!(None, slice.take(5..));
+assert_eq!(None, slice.take(..5));
+assert_eq!(None, slice.take(..=4));
+let expected: &[char] = &['a', 'b', 'c', 'd'];
+assert_eq!(Some(expected), slice.take(..4));
This is a nightly-only experimental API. (slice_take)
Removes the subslice corresponding to the given range
+and returns a mutable reference to it.
+Returns None
and does not modify the slice if the given
+range is out of bounds.
Note that this method only accepts one-sided ranges such as
+2..
or ..6
, but not 2..6
.
Taking the first three elements of a slice:
+ +#![feature(slice_take)]
+
+let mut slice: &mut [_] = &mut ['a', 'b', 'c', 'd'];
+let mut first_three = slice.take_mut(..3).unwrap();
+
+assert_eq!(slice, &mut ['d']);
+assert_eq!(first_three, &mut ['a', 'b', 'c']);
Taking the last two elements of a slice:
+ +#![feature(slice_take)]
+
+let mut slice: &mut [_] = &mut ['a', 'b', 'c', 'd'];
+let mut tail = slice.take_mut(2..).unwrap();
+
+assert_eq!(slice, &mut ['a', 'b']);
+assert_eq!(tail, &mut ['c', 'd']);
Getting None
when range
is out of bounds:
#![feature(slice_take)]
+
+let mut slice: &mut [_] = &mut ['a', 'b', 'c', 'd'];
+
+assert_eq!(None, slice.take_mut(5..));
+assert_eq!(None, slice.take_mut(..5));
+assert_eq!(None, slice.take_mut(..=4));
+let expected: &mut [_] = &mut ['a', 'b', 'c', 'd'];
+assert_eq!(Some(expected), slice.take_mut(..4));
This is a nightly-only experimental API. (slice_take)
Removes the first element of the slice and returns a reference
+to it.
+Returns None
if the slice is empty.
#![feature(slice_take)]
+
+let mut slice: &[_] = &['a', 'b', 'c'];
+let first = slice.take_first().unwrap();
+
+assert_eq!(slice, &['b', 'c']);
+assert_eq!(first, &'a');
This is a nightly-only experimental API. (slice_take)
Removes the first element of the slice and returns a mutable
+reference to it.
+Returns None
if the slice is empty.
#![feature(slice_take)]
+
+let mut slice: &mut [_] = &mut ['a', 'b', 'c'];
+let first = slice.take_first_mut().unwrap();
+*first = 'd';
+
+assert_eq!(slice, &['b', 'c']);
+assert_eq!(first, &'d');
This is a nightly-only experimental API. (slice_take)
Removes the last element of the slice and returns a reference
+to it.
+Returns None
if the slice is empty.
#![feature(slice_take)]
+
+let mut slice: &[_] = &['a', 'b', 'c'];
+let last = slice.take_last().unwrap();
+
+assert_eq!(slice, &['a', 'b']);
+assert_eq!(last, &'c');
This is a nightly-only experimental API. (slice_take)
Removes the last element of the slice and returns a mutable
+reference to it.
+Returns None
if the slice is empty.
#![feature(slice_take)]
+
+let mut slice: &mut [_] = &mut ['a', 'b', 'c'];
+let last = slice.take_last_mut().unwrap();
+*last = 'd';
+
+assert_eq!(slice, &['a', 'b']);
+assert_eq!(last, &'d');
This is a nightly-only experimental API. (get_many_mut)
Returns mutable references to many indices at once, without doing any checks.
+For a safe alternative see get_many_mut.
Calling this method with overlapping or out-of-bounds indices is undefined behavior
+even if the resulting references are not used.
+#![feature(get_many_mut)]
+
+let x = &mut [1, 2, 4];
+
+unsafe {
+ let [a, b] = x.get_many_unchecked_mut([0, 2]);
+ *a *= 10;
+ *b *= 100;
+}
+assert_eq!(x, &[10, 2, 400]);
This is a nightly-only experimental API. (get_many_mut)
Returns mutable references to many indices at once.
+Returns an error if any index is out-of-bounds, or if the same index was
+passed more than once.
+#![feature(get_many_mut)]
+
+let v = &mut [1, 2, 3];
+if let Ok([a, b]) = v.get_many_mut([0, 2]) {
+ *a = 413;
+ *b = 612;
+}
+assert_eq!(v, &[413, 2, 612]);
This is a nightly-only experimental API. (slice_flatten)
Takes a &[[T; N]]
, and flattens it to a &[T]
.
This panics if the length of the resulting slice would overflow a usize
.
This is only possible when flattening a slice of arrays of zero-sized
+types, and thus tends to be irrelevant in practice. If
+size_of::<T>() > 0
, this will never panic.
#![feature(slice_flatten)]
+
+assert_eq!([[1, 2, 3], [4, 5, 6]].flatten(), &[1, 2, 3, 4, 5, 6]);
+
+assert_eq!(
+ [[1, 2, 3], [4, 5, 6]].flatten(),
+ [[1, 2], [3, 4], [5, 6]].flatten(),
+);
+
+let slice_of_empty_arrays: &[[i32; 0]] = &[[], [], [], [], []];
+assert!(slice_of_empty_arrays.flatten().is_empty());
+
+let empty_slice_of_arrays: &[[u32; 10]] = &[];
+assert!(empty_slice_of_arrays.flatten().is_empty());
This is a nightly-only experimental API. (slice_flatten)
Takes a &mut [[T; N]]
, and flattens it to a &mut [T]
.
This panics if the length of the resulting slice would overflow a usize
.
This is only possible when flattening a slice of arrays of zero-sized
+types, and thus tends to be irrelevant in practice. If
+size_of::<T>() > 0
, this will never panic.
#![feature(slice_flatten)]
+
+fn add_5_to_all(slice: &mut [i32]) {
+ for i in slice {
+ *i += 5;
+ }
+}
+
+let mut array = [[1, 2, 3], [4, 5, 6], [7, 8, 9]];
+add_5_to_all(array.flatten_mut());
+assert_eq!(array, [[6, 7, 8], [9, 10, 11], [12, 13, 14]]);
This is a nightly-only experimental API. (ascii_char)
Views this slice of ASCII characters as a UTF-8 str
.
This is a nightly-only experimental API. (ascii_char)
Views this slice of ASCII characters as a slice of u8
bytes.
This is a nightly-only experimental API. (sort_floats)
Sorts the slice of floats.
+This sort is in-place (i.e. does not allocate), O(n * log(n)) worst-case, and uses
+the ordering defined by f64::total_cmp
.
This uses the same sorting algorithm as sort_unstable_by
.
#![feature(sort_floats)]
+let mut v = [2.6, -5e-8, f64::NAN, 8.29, f64::INFINITY, -1.0, 0.0, -f64::INFINITY, -0.0];
+
+v.sort_floats();
+let sorted = [-f64::INFINITY, -1.0, -5e-8, -0.0, 0.0, 2.6, 8.29, f64::INFINITY, f64::NAN];
+assert_eq!(&v[..8], &sorted[..8]);
+assert!(v[8].is_nan());
Available on non-crate feature no_global_oom_handling only.
Sorts the slice.
+This sort is stable (i.e., does not reorder equal elements) and O(n * log(n)) worst-case.
+When applicable, unstable sorting is preferred because it is generally faster than stable
+sorting and it doesn’t allocate auxiliary memory.
+See sort_unstable
.
The current algorithm is an adaptive, iterative merge sort inspired by
+timsort.
+It is designed to be very fast in cases where the slice is nearly sorted, or consists of
+two or more sorted sequences concatenated one after another.
+Also, it allocates temporary storage half the size of self
, but for short slices a
+non-allocating insertion sort is used instead.
let mut v = [-5, 4, 1, -3, 2];
+
+v.sort();
+assert!(v == [-5, -3, 1, 2, 4]);
Available on non-crate feature no_global_oom_handling only.
Sorts the slice with a comparator function.
+This sort is stable (i.e., does not reorder equal elements) and O(n * log(n)) worst-case.
+The comparator function must define a total ordering for the elements in the slice. If
+the ordering is not total, the order of the elements is unspecified. An order is a
+total order if it is (for all a
, b
and c
):
- total and antisymmetric: exactly one of a < b, a == b or a > b is true, and
- transitive: a < b and b < c implies a < c. The same must hold for both == and >.
For example, while f64
doesn’t implement Ord
because NaN != NaN
, we can use
+partial_cmp
as our sort function when we know the slice doesn’t contain a NaN
.
let mut floats = [5f64, 4.0, 1.0, 3.0, 2.0];
+floats.sort_by(|a, b| a.partial_cmp(b).unwrap());
+assert_eq!(floats, [1.0, 2.0, 3.0, 4.0, 5.0]);
When applicable, unstable sorting is preferred because it is generally faster than stable
+sorting and it doesn’t allocate auxiliary memory.
+See sort_unstable_by
.
The current algorithm is an adaptive, iterative merge sort inspired by
+timsort.
+It is designed to be very fast in cases where the slice is nearly sorted, or consists of
+two or more sorted sequences concatenated one after another.
+Also, it allocates temporary storage half the size of self
, but for short slices a
+non-allocating insertion sort is used instead.
let mut v = [5, 4, 1, 3, 2];
+v.sort_by(|a, b| a.cmp(b));
+assert!(v == [1, 2, 3, 4, 5]);
+
+// reverse sorting
+v.sort_by(|a, b| b.cmp(a));
+assert!(v == [5, 4, 3, 2, 1]);
Available on non-crate feature no_global_oom_handling only.
Sorts the slice with a key extraction function.
+This sort is stable (i.e., does not reorder equal elements) and O(m * n * log(n))
+worst-case, where the key function is O(m).
+For expensive key functions (e.g. functions that are not simple property accesses or
+basic operations), sort_by_cached_key
is likely to be
+significantly faster, as it does not recompute element keys.
When applicable, unstable sorting is preferred because it is generally faster than stable
+sorting and it doesn’t allocate auxiliary memory.
+See sort_unstable_by_key
.
The current algorithm is an adaptive, iterative merge sort inspired by
+timsort.
+It is designed to be very fast in cases where the slice is nearly sorted, or consists of
+two or more sorted sequences concatenated one after another.
+Also, it allocates temporary storage half the size of self
, but for short slices a
+non-allocating insertion sort is used instead.
let mut v = [-5i32, 4, 1, -3, 2];
+
+v.sort_by_key(|k| k.abs());
+assert!(v == [1, 2, -3, 4, -5]);
Available on non-crate feature no_global_oom_handling only.
Sorts the slice with a key extraction function.
+During sorting, the key function is called at most once per element, by using
+temporary storage to remember the results of key evaluation.
+The order of calls to the key function is unspecified and may change in future versions
+of the standard library.
+This sort is stable (i.e., does not reorder equal elements) and O(m * n + n * log(n))
+worst-case, where the key function is O(m).
+For simple key functions (e.g., functions that are property accesses or
+basic operations), sort_by_key
is likely to be
+faster.
The current algorithm is based on pattern-defeating quicksort by Orson Peters,
+which combines the fast average case of randomized quicksort with the fast worst case of
+heapsort, while achieving linear time on slices with certain patterns. It uses some
+randomization to avoid degenerate cases, but with a fixed seed to always provide
+deterministic behavior.
+In the worst case, the algorithm allocates temporary storage in a Vec<(K, usize)>
the
+length of the slice.
let mut v = [-5i32, 4, 32, -3, 2];
+
+v.sort_by_cached_key(|k| k.to_string());
+assert!(v == [-3, -5, 2, 32, 4]);
Available on non-crate feature no_global_oom_handling only.
Copies self
into a new Vec
.
let s = [10, 40, 30];
+let x = s.to_vec();
+// Here, `s` and `x` can be modified independently.
This is a nightly-only experimental API. (allocator_api)
Available on non-crate feature no_global_oom_handling only.
Copies self
into a new Vec
with an allocator.
#![feature(allocator_api)]
+
+use std::alloc::System;
+
+let s = [10, 40, 30];
+let x = s.to_vec_in(System);
+// Here, `s` and `x` can be modified independently.
Available on non-crate feature no_global_oom_handling only.
Flattens a slice of T
into a single value Self::Output
.
assert_eq!(["hello", "world"].concat(), "helloworld");
+assert_eq!([[1, 2], [3, 4]].concat(), [1, 2, 3, 4]);
Flattens a slice of T
into a single value Self::Output
, placing a
+given separator between each.
assert_eq!(["hello", "world"].join(" "), "hello world");
+assert_eq!([[1, 2], [3, 4]].join(&0), [1, 2, 0, 3, 4]);
+assert_eq!([[1, 2], [3, 4]].join(&[0, 0][..]), [1, 2, 0, 0, 3, 4]);
Flattens a slice of T
into a single value Self::Output
, placing a
+given separator between each.
assert_eq!(["hello", "world"].connect(" "), "hello world");
+assert_eq!([[1, 2], [3, 4]].connect(&0), [1, 2, 0, 3, 4]);
Available on non-crate feature no_global_oom_handling only.
Returns a vector containing a copy of this slice where each byte
+is mapped to its ASCII upper case equivalent.
+ASCII letters ‘a’ to ‘z’ are mapped to ‘A’ to ‘Z’,
+but non-ASCII letters are unchanged.
+To uppercase the value in-place, use make_ascii_uppercase
.
Available on non-crate feature no_global_oom_handling only.
Returns a vector containing a copy of this slice where each byte
+is mapped to its ASCII lower case equivalent.
+ASCII letters ‘A’ to ‘Z’ are mapped to ‘a’ to ‘z’,
+but non-ASCII letters are unchanged.
+To lowercase the value in-place, use make_ascii_lowercase
.
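This method tests less than or equal to (for self and other) and is used by the <=
+operator. Read more
This method tests less than or equal to (for self and other) and is used by the <=
+operator. Read more
This method tests less than or equal to (for self and other) and is used by the <=
+operator. Read more
This method tests less than or equal to (for self and other) and is used by the <=
+operator. Read more
This method tests less than or equal to (for self and other) and is used by the <=
+operator. Read more
This method tests less than or equal to (for self and other) and is used by the <=
+operator. Read more
This method tests less than or equal to (for self and other) and is used by the <=
+operator. Read more
This method tests less than or equal to (for self and other) and is used by the <=
+operator. Read more
This method tests less than or equal to (for self and other) and is used by the <=
+operator. Read more
pub struct Bytes<'a> { /* private fields */ }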
An iterator over the bytes in a byte string.
+'a
is the lifetime of the byte string being traversed.
iter_advance_by
)n
elements. Read moren
th element from the end of the iterator. Read moreIterator::try_fold()
: it takes
+elements starting from the back of the iterator. Read moreiter_next_chunk
)N
values. Read moreiter_advance_by
)n
elements. Read moren
th element of the iterator. Read moreiter_intersperse
)separator
+between adjacent items of the original iterator. Read moren
elements. Read moren
elements, or fewer
+if the underlying iterator ends sooner. Read moreiter_map_windows
)f
for each contiguous window of size N
over
+self
and returns an iterator over the outputs of f
. Like slice::windows()
,
+the windows during mapping overlap as well. Read moreiter_collect_into
)iter_partition_in_place
)true
precede all those that return false
.
+Returns the number of true
elements found. Read moreiter_is_partitioned
)true
precede all those that return false
. Read moreiterator_try_reduce
)try_find
)iter_array_chunks
)N
elements of the iterator at a time. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read morePartialOrd
elements of
+this Iterator
with those of another. The comparison works like short-circuit
+evaluation, returning a result without comparing the remaining elements.
+As soon as an order can be determined, the evaluation stops and a result is returned. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read moreiter_order_by
)Iterator
are lexicographically
+less than those of another. Read moreIterator
are lexicographically
+less or equal to those of another. Read moreIterator
are lexicographically
+greater than those of another. Read moreIterator
are lexicographically
+greater than or equal to those of another. Read moreis_sorted
)is_sorted
)pub struct CharIndices<'a> { /* private fields */ }
An iterator over Unicode scalar values in a byte string and their +byte index positions.
+When invalid UTF-8 byte sequences are found, they are substituted with the
+Unicode replacement codepoint (U+FFFD
) using the
+“maximal subpart” strategy.
Note that this is slightly different from the CharIndices
iterator
+provided by the standard library. Aside from working on possibly invalid
+UTF-8, this iterator provides both the corresponding starting and ending
+byte indices of each codepoint yielded. The ending position is necessary to
+slice the original byte string when invalid UTF-8 bytes are converted into
+a Unicode replacement codepoint, since a single replacement codepoint can
+substitute anywhere from 1 to 3 invalid bytes (inclusive).
This iterator is created by the
+char_indices
method provided
+by the ByteSlice
extension trait for &[u8]
.
View the underlying data as a subslice of the original data.
+The slice returned has the same lifetime as the original slice, and so +the iterator can continue to be used while this exists.
+use bstr::ByteSlice;
+
+let mut it = b"abc".char_indices();
+
+assert_eq!(b"abc", it.as_bytes());
+it.next();
+assert_eq!(b"bc", it.as_bytes());
+it.next();
+it.next();
+assert_eq!(b"", it.as_bytes());
iter_advance_by
)n
elements. Read moren
th element from the end of the iterator. Read moreIterator::try_fold()
: it takes
+elements starting from the back of the iterator. Read moreiter_next_chunk
)N
values. Read moreiter_advance_by
)n
elements. Read moren
th element of the iterator. Read moreiter_intersperse
)separator
+between adjacent items of the original iterator. Read moren
elements. Read moren
elements, or fewer
+if the underlying iterator ends sooner. Read moreiter_map_windows
)f
for each contiguous window of size N
over
+self
and returns an iterator over the outputs of f
. Like slice::windows()
,
+the windows during mapping overlap as well. Read moreiter_collect_into
)iter_partition_in_place
)true
precede all those that return false
.
+Returns the number of true
elements found. Read moreiter_is_partitioned
)true
precede all those that return false
. Read moreiterator_try_reduce
)try_find
)iter_array_chunks
)N
elements of the iterator at a time. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read morePartialOrd
elements of
+this Iterator
with those of another. The comparison works like short-circuit
+evaluation, returning a result without comparing the remaining elements.
+As soon as an order can be determined, the evaluation stops and a result is returned. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read moreiter_order_by
)Iterator
are lexicographically
+less than those of another. Read moreIterator
are lexicographically
+less or equal to those of another. Read moreIterator
are lexicographically
+greater than those of another. Read moreIterator
are lexicographically
+greater than or equal to those of another. Read moreis_sorted
)is_sorted
)pub struct Chars<'a> { /* private fields */ }
An iterator over Unicode scalar values in a byte string.
+When invalid UTF-8 byte sequences are found, they are substituted with the
+Unicode replacement codepoint (U+FFFD
) using the
+“maximal subpart” strategy.
This iterator is created by the
+chars
method provided by the
+ByteSlice
extension trait for &[u8]
.
View the underlying data as a subslice of the original data.
+The slice returned has the same lifetime as the original slice, and so +the iterator can continue to be used while this exists.
+use bstr::ByteSlice;
+
+let mut chars = b"abc".chars();
+
+assert_eq!(b"abc", chars.as_bytes());
+chars.next();
+assert_eq!(b"bc", chars.as_bytes());
+chars.next();
+chars.next();
+assert_eq!(b"", chars.as_bytes());
iter_advance_by
)n
elements. Read moren
th element from the end of the iterator. Read moreIterator::try_fold()
: it takes
+elements starting from the back of the iterator. Read moreiter_next_chunk
)N
values. Read moreiter_advance_by
)n
elements. Read moren
th element of the iterator. Read moreiter_intersperse
)separator
+between adjacent items of the original iterator. Read moren
elements. Read moren
elements, or fewer
+if the underlying iterator ends sooner. Read moreiter_map_windows
)f
for each contiguous window of size N
over
+self
and returns an iterator over the outputs of f
. Like slice::windows()
,
+the windows during mapping overlap as well. Read moreiter_collect_into
)iter_partition_in_place
)true
precede all those that return false
.
+Returns the number of true
elements found. Read moreiter_is_partitioned
)true
precede all those that return false
. Read moreiterator_try_reduce
)try_find
)iter_array_chunks
)N
elements of the iterator at a time. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read morePartialOrd
elements of
+this Iterator
with those of another. The comparison works like short-circuit
+evaluation, returning a result without comparing the remaining elements.
+As soon as an order can be determined, the evaluation stops and a result is returned. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read moreiter_order_by
)Iterator
are lexicographically
+less than those of another. Read moreIterator
are lexicographically
+less or equal to those of another. Read moreIterator
are lexicographically
+greater than those of another. Read moreIterator
are lexicographically
+greater than or equal to those of another. Read moreis_sorted
)is_sorted
)pub struct EscapeBytes<'a> { /* private fields */ }
An iterator of char
values that represent an escaping of arbitrary bytes.
The lifetime parameter 'a
refers to the lifetime of the bytes being
+escaped.
This iterator is created by the
+ByteSlice::escape_bytes
method.
iter_next_chunk
)N
values. Read moreiter_advance_by
)n
elements. Read moren
th element of the iterator. Read moreiter_intersperse
)separator
+between adjacent items of the original iterator. Read moren
elements. Read moren
elements, or fewer
+if the underlying iterator ends sooner. Read moreiter_map_windows
)f
for each contiguous window of size N
over
+self
and returns an iterator over the outputs of f
. Like slice::windows()
,
+the windows during mapping overlap as well. Read moreiter_collect_into
)iter_is_partitioned
)true
precede all those that return false
. Read moreiterator_try_reduce
)try_find
)iter_array_chunks
)N
elements of the iterator at a time. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read morePartialOrd
elements of
+this Iterator
with those of another. The comparison works like short-circuit
+evaluation, returning a result without comparing the remaining elements.
+As soon as an order can be determined, the evaluation stops and a result is returned. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read moreiter_order_by
)Iterator
are lexicographically
+less than those of another. Read moreIterator
are lexicographically
+less or equal to those of another. Read moreIterator
are lexicographically
+greater than those of another. Read moreIterator
are lexicographically
+greater than or equal to those of another. Read moreis_sorted
)is_sorted
)pub struct FieldsWith<'a, F> { /* private fields */ }
An iterator over fields in the byte string, separated by a predicate over +codepoints.
+This iterator splits a byte string based on its predicate function such +that the elements returned are separated by contiguous runs of codepoints +for which the predicate returns true.
+'a
is the lifetime of the byte string being split, while F
is the type
+of the predicate, i.e., FnMut(char) -> bool
.
iter_next_chunk
)N
values. Read moreiter_advance_by
)n
elements. Read moren
th element of the iterator. Read moreiter_intersperse
)separator
+between adjacent items of the original iterator. Read moren
elements. Read moren
elements, or fewer
+if the underlying iterator ends sooner. Read moreiter_map_windows
)f
for each contiguous window of size N
over
+self
and returns an iterator over the outputs of f
. Like slice::windows()
,
+the windows during mapping overlap as well. Read moreiter_collect_into
)iter_is_partitioned
)true
precede all those that return false
. Read moreiterator_try_reduce
)try_find
)iter_array_chunks
)N
elements of the iterator at a time. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read morePartialOrd
elements of
+this Iterator
with those of another. The comparison works like short-circuit
+evaluation, returning a result without comparing the remaining elements.
+As soon as an order can be determined, the evaluation stops and a result is returned. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read moreiter_order_by
)Iterator
are lexicographically
+less than those of another. Read moreIterator
are lexicographically
+less or equal to those of another. Read moreIterator
are lexicographically
+greater than those of another. Read moreIterator
are lexicographically
+greater than or equal to those of another. Read moreis_sorted
)is_sorted
)pub struct Find<'h, 'n> { /* private fields */ }
An iterator over non-overlapping substring matches.
+Matches are reported by the byte offset at which they begin.
+'h
is the lifetime of the haystack while 'n
is the lifetime of the
+needle.
iter_next_chunk
)N
values. Read moreiter_advance_by
)n
elements. Read moren
th element of the iterator. Read moreiter_intersperse
)separator
+between adjacent items of the original iterator. Read moren
elements. Read moren
elements, or fewer
+if the underlying iterator ends sooner. Read moreiter_map_windows
)f
for each contiguous window of size N
over
+self
and returns an iterator over the outputs of f
. Like slice::windows()
,
+the windows during mapping overlap as well. Read moreiter_collect_into
)iter_is_partitioned
)true
precede all those that return false
. Read moreiterator_try_reduce
)try_find
)iter_array_chunks
)N
elements of the iterator at a time. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read morePartialOrd
elements of
+this Iterator
with those of another. The comparison works like short-circuit
+evaluation, returning a result without comparing the remaining elements.
+As soon as an order can be determined, the evaluation stops and a result is returned. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read moreiter_order_by
)Iterator
are lexicographically
+less than those of another. Read moreIterator
are lexicographically
+less or equal to those of another. Read moreIterator
are lexicographically
+greater than those of another. Read moreIterator
are lexicographically
+greater than or equal to those of another. Read moreis_sorted
)is_sorted
)pub struct FindReverse<'h, 'n> { /* private fields */ }
An iterator over non-overlapping substring matches in reverse.
+Matches are reported by the byte offset at which they begin.
+'h
is the lifetime of the haystack while 'n
is the lifetime of the
+needle.
iter_next_chunk
)N
values. Read moreiter_advance_by
)n
elements. Read moren
th element of the iterator. Read moreiter_intersperse
)separator
+between adjacent items of the original iterator. Read moren
elements. Read moren
elements, or fewer
+if the underlying iterator ends sooner. Read moreiter_map_windows
)f
for each contiguous window of size N
over
+self
and returns an iterator over the outputs of f
. Like slice::windows()
,
+the windows during mapping overlap as well. Read moreiter_collect_into
)iter_is_partitioned
)true
precede all those that return false
. Read moreiterator_try_reduce
)try_find
)iter_array_chunks
)N
elements of the iterator at a time. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read morePartialOrd
elements of
+this Iterator
with those of another. The comparison works like short-circuit
+evaluation, returning a result without comparing the remaining elements.
+As soon as an order can be determined, the evaluation stops and a result is returned. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read moreiter_order_by
)Iterator
are lexicographically
+less than those of another. Read moreIterator
are lexicographically
+less or equal to those of another. Read moreIterator
are lexicographically
+greater than those of another. Read moreIterator
are lexicographically
+greater than or equal to those of another. Read moreis_sorted
)is_sorted
)pub struct Finder<'a>(/* private fields */);
A single substring searcher fixed to a particular needle.
+The purpose of this type is to permit callers to construct a substring
+searcher that can be used to search haystacks without the overhead of
+constructing the searcher in the first place. This is a somewhat niche
+concern when it’s necessary to re-use the same needle to search multiple
+different haystacks with as little overhead as possible. In general, using
+ByteSlice::find
+or
+ByteSlice::find_iter
+is good enough, but Finder
is useful when you can meaningfully observe
+searcher construction time in a profile.
When the std
feature is enabled, then this type has an into_owned
+version which permits building a Finder
that is not connected to the
+lifetime of its needle.
Create a new finder for the given needle.
+Returns the needle that this finder searches for.
+Note that the lifetime of the needle returned is tied to the lifetime
+of the finder, and may be shorter than the 'a
lifetime. Namely, a
+finder’s needle can be either borrowed or owned, so the lifetime of the
+needle returned must necessarily be the shorter of the two.
Returns the index of the first occurrence of this needle in the given +haystack.
+The haystack may be any type that can be cheaply converted into a
+&[u8]
. This includes, but is not limited to, &str
and &[u8]
.
This routine is guaranteed to have worst case linear time complexity
+with respect to both the needle and the haystack. That is, this runs
+in O(needle.len() + haystack.len())
time.
This routine is also guaranteed to have worst case constant space +complexity.
+Basic usage:
+ +use bstr::Finder;
+
+let haystack = "foo bar baz";
+assert_eq!(Some(0), Finder::new("foo").find(haystack));
+assert_eq!(Some(4), Finder::new("bar").find(haystack));
+assert_eq!(None, Finder::new("quux").find(haystack));
pub struct FinderReverse<'a>(/* private fields */);
A single substring reverse searcher fixed to a particular needle.
+The purpose of this type is to permit callers to construct a substring
+searcher that can be used to search haystacks without the overhead of
+constructing the searcher in the first place. This is a somewhat niche
+concern when it’s necessary to re-use the same needle to search multiple
+different haystacks with as little overhead as possible. In general, using
+ByteSlice::rfind
+or
+ByteSlice::rfind_iter
+is good enough, but FinderReverse
is useful when you can meaningfully
+observe searcher construction time in a profile.
When the std
feature is enabled, then this type has an into_owned
+version which permits building a FinderReverse
that is not connected to
+the lifetime of its needle.
Create a new reverse finder for the given needle.
+Returns the needle that this finder searches for.
+Note that the lifetime of the needle returned is tied to the lifetime
+of this finder, and may be shorter than the 'a
lifetime. Namely,
+a finder’s needle can be either borrowed or owned, so the lifetime of
+the needle returned must necessarily be the shorter of the two.
Returns the index of the last occurrence of this needle in the given +haystack.
+The haystack may be any type that can be cheaply converted into a
+&[u8]
. This includes, but is not limited to, &str
and &[u8]
.
This routine is guaranteed to have worst case linear time complexity
+with respect to both the needle and the haystack. That is, this runs
+in O(needle.len() + haystack.len())
time.
This routine is also guaranteed to have worst case constant space +complexity.
+Basic usage:
+ +use bstr::FinderReverse;
+
+let haystack = "foo bar baz";
+assert_eq!(Some(0), FinderReverse::new("foo").rfind(haystack));
+assert_eq!(Some(4), FinderReverse::new("bar").rfind(haystack));
+assert_eq!(None, FinderReverse::new("quux").rfind(haystack));
source
. Read morepub struct Lines<'a> { /* private fields */ }
An iterator over all lines in a byte string, without their terminators.
+For this iterator, the only line terminators recognized are \r\n
and
+\n
.
'a
is the lifetime of the byte string being iterated over.
Return a copy of the rest of the underlying bytes without affecting the +iterator itself.
+Basic usage:
+ +use bstr::{B, ByteSlice};
+
+let s = b"\
+foo
+bar\r
+baz";
+let mut lines = s.lines();
+assert_eq!(lines.next(), Some(B("foo")));
+assert_eq!(lines.as_bytes(), B("bar\r\nbaz"));
iter_advance_by
)n
elements. Read moren
th element from the end of the iterator. Read moreIterator::try_fold()
: it takes
+elements starting from the back of the iterator. Read moreiter_next_chunk
)N
values. Read moreiter_advance_by
)n
elements. Read moren
th element of the iterator. Read moreiter_intersperse
)separator
+between adjacent items of the original iterator. Read moren
elements. Read moren
elements, or fewer
+if the underlying iterator ends sooner. Read moreiter_map_windows
)f
for each contiguous window of size N
over
+self
and returns an iterator over the outputs of f
. Like slice::windows()
,
+the windows during mapping overlap as well. Read moreiter_collect_into
)iter_partition_in_place
)true
precede all those that return false
.
+Returns the number of true
elements found. Read moreiter_is_partitioned
)true
precede all those that return false
. Read moreiterator_try_reduce
)try_find
)iter_array_chunks
)N
elements of the iterator at a time. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read morePartialOrd
elements of
+this Iterator
with those of another. The comparison works like short-circuit
+evaluation, returning a result without comparing the remaining elements.
+As soon as an order can be determined, the evaluation stops and a result is returned. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read moreiter_order_by
)Iterator
are lexicographically
+less than those of another. Read moreIterator
are lexicographically
+less or equal to those of another. Read moreIterator
are lexicographically
+greater than those of another. Read moreIterator
are lexicographically
+greater than or equal to those of another. Read moreis_sorted
)is_sorted
)pub struct LinesWithTerminator<'a> { /* private fields */ }
An iterator over all lines in a byte string, including their terminators.
+For this iterator, the only line terminator recognized is \n
. (Since
+line terminators are included, this also handles \r\n
line endings.)
Line terminators are only included if they are present in the original +byte string. For example, the last line in a byte string may not end with +a line terminator.
+Concatenating all elements yielded by this iterator is guaranteed to yield +the original byte string.
+'a
is the lifetime of the byte string being iterated over.
Return a copy of the rest of the underlying bytes without affecting the +iterator itself.
+Basic usage:
+ +use bstr::{B, ByteSlice};
+
+let s = b"\
+foo
+bar\r
+baz";
+let mut lines = s.lines_with_terminator();
+assert_eq!(lines.next(), Some(B("foo\n")));
+assert_eq!(lines.as_bytes(), B("bar\r\nbaz"));
iter_advance_by
)n
elements. Read moren
th element from the end of the iterator. Read moreIterator::try_fold()
: it takes
+elements starting from the back of the iterator. Read moreiter_next_chunk
)N
values. Read moreiter_advance_by
)n
elements. Read moren
th element of the iterator. Read moreiter_intersperse
)separator
+between adjacent items of the original iterator. Read moren
elements. Read moren
elements, or fewer
+if the underlying iterator ends sooner. Read moreiter_map_windows
)f
for each contiguous window of size N
over
+self
and returns an iterator over the outputs of f
. Like slice::windows()
,
+the windows during mapping overlap as well. Read moreiter_collect_into
)iter_partition_in_place
)true
precede all those that return false
.
+Returns the number of true
elements found. Read moreiter_is_partitioned
)true
precede all those that return false
. Read moreiterator_try_reduce
)try_find
)iter_array_chunks
)N
elements of the iterator at a time. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read morePartialOrd
elements of
+this Iterator
with those of another. The comparison works like short-circuit
+evaluation, returning a result without comparing the remaining elements.
+As soon as an order can be determined, the evaluation stops and a result is returned. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read moreiter_order_by
)Iterator
are lexicographically
+less than those of another. Read moreIterator
are lexicographically
+less or equal to those of another. Read moreIterator
are lexicographically
+greater than those of another. Read moreIterator
are lexicographically
+greater than or equal to those of another. Read moreis_sorted
)is_sorted
)pub struct Split<'h, 's> { /* private fields */ }
An iterator over substrings in a byte string, split by a separator.
+'h
is the lifetime of the byte string being split (the haystack), while
+'s
is the lifetime of the byte string doing the splitting.
iter_next_chunk
)N
values. Read moreiter_advance_by
)n
elements. Read moren
th element of the iterator. Read moreiter_intersperse
)separator
+between adjacent items of the original iterator. Read moren
elements. Read moren
elements, or fewer
+if the underlying iterator ends sooner. Read moreiter_map_windows
)f
for each contiguous window of size N
over
+self
and returns an iterator over the outputs of f
. Like slice::windows()
,
+the windows during mapping overlap as well. Read moreiter_collect_into
)iter_is_partitioned
)true
precede all those that return false
. Read moreiterator_try_reduce
)try_find
)iter_array_chunks
)N
elements of the iterator at a time. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read morePartialOrd
elements of
+this Iterator
with those of another. The comparison works like short-circuit
+evaluation, returning a result without comparing the remaining elements.
+As soon as an order can be determined, the evaluation stops and a result is returned. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read moreiter_order_by
)Iterator
are lexicographically
+less than those of another. Read moreIterator
are lexicographically
+less or equal to those of another. Read moreIterator
are lexicographically
+greater than those of another. Read moreIterator
are lexicographically
+greater than or equal to those of another. Read moreis_sorted
)is_sorted
)pub struct SplitN<'h, 's> { /* private fields */ }
An iterator over at most n
substrings in a byte string, split by a
+separator.
'h
is the lifetime of the byte string being split (the haystack), while
+'s
is the lifetime of the byte string doing the splitting.
iter_next_chunk
)N
values. Read moreiter_advance_by
)n
elements. Read moren
th element of the iterator. Read moreiter_intersperse
)separator
+between adjacent items of the original iterator. Read moren
elements. Read moren
elements, or fewer
+if the underlying iterator ends sooner. Read moreiter_map_windows
)f
for each contiguous window of size N
over
+self
and returns an iterator over the outputs of f
. Like slice::windows()
,
+the windows during mapping overlap as well. Read moreiter_collect_into
)iter_is_partitioned
)true
precede all those that return false
. Read moreiterator_try_reduce
)try_find
)iter_array_chunks
)N
elements of the iterator at a time. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read morePartialOrd
elements of
+this Iterator
with those of another. The comparison works like short-circuit
+evaluation, returning a result without comparing the remaining elements.
+As soon as an order can be determined, the evaluation stops and a result is returned. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read moreiter_order_by
)Iterator
are lexicographically
+less than those of another. Read moreIterator
are lexicographically
+less or equal to those of another. Read moreIterator
are lexicographically
+greater than those of another. Read moreIterator
are lexicographically
+greater than or equal to those of another. Read moreis_sorted
)is_sorted
)pub struct SplitNReverse<'h, 's> { /* private fields */ }
An iterator over at most n
substrings in a byte string, split by a
+separator, in reverse.
'h
is the lifetime of the byte string being split (the haystack), while
+'s
is the lifetime of the byte string doing the splitting.
iter_next_chunk
)N
values. Read moreiter_advance_by
)n
elements. Read moren
th element of the iterator. Read moreiter_intersperse
)separator
+between adjacent items of the original iterator. Read moren
elements. Read moren
elements, or fewer
+if the underlying iterator ends sooner. Read moreiter_map_windows
)f
for each contiguous window of size N
over
+self
and returns an iterator over the outputs of f
. Like slice::windows()
,
+the windows during mapping overlap as well. Read moreiter_collect_into
)iter_is_partitioned
)true
precede all those that return false
. Read moreiterator_try_reduce
)try_find
)iter_array_chunks
)N
elements of the iterator at a time. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read morePartialOrd
elements of
+this Iterator
with those of another. The comparison works like short-circuit
+evaluation, returning a result without comparing the remaining elements.
+As soon as an order can be determined, the evaluation stops and a result is returned. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read moreiter_order_by
)Iterator
are lexicographically
+less than those of another. Read moreIterator
are lexicographically
+less or equal to those of another. Read moreIterator
are lexicographically
+greater than those of another. Read moreIterator
are lexicographically
+greater than or equal to those of another. Read moreis_sorted
)is_sorted
)pub struct SplitReverse<'h, 's> { /* private fields */ }
An iterator over substrings in a byte string, split by a separator, in +reverse.
+'h
is the lifetime of the byte string being split (the haystack), while
+'s
is the lifetime of the byte string doing the splitting.
iter_next_chunk
)N
values. Read moreiter_advance_by
)n
elements. Read moren
th element of the iterator. Read moreiter_intersperse
)separator
+between adjacent items of the original iterator. Read moren
elements. Read moren
elements, or fewer
+if the underlying iterator ends sooner. Read moreiter_map_windows
)f
for each contiguous window of size N
over
+self
and returns an iterator over the outputs of f
. Like slice::windows()
,
+the windows during mapping overlap as well. Read moreiter_collect_into
)iter_is_partitioned
)true
precede all those that return false
. Read moreiterator_try_reduce
)try_find
)iter_array_chunks
)N
elements of the iterator at a time. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read morePartialOrd
elements of
+this Iterator
with those of another. The comparison works like short-circuit
+evaluation, returning a result without comparing the remaining elements.
+As soon as an order can be determined, the evaluation stops and a result is returned. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read moreiter_order_by
)Iterator
are lexicographically
+less than those of another. Read moreIterator
are lexicographically
+less or equal to those of another. Read moreIterator
are lexicographically
+greater than those of another. Read moreIterator
are lexicographically
+greater than or equal to those of another. Read moreis_sorted
)is_sorted
)pub struct Utf8Chunk<'a> { /* private fields */ }
A chunk of valid UTF-8, possibly followed by invalid UTF-8 bytes.
+This is yielded by the
+Utf8Chunks
+iterator, which can be created via the
+ByteSlice::utf8_chunks
+method.
The 'a
lifetime parameter corresponds to the lifetime of the bytes that
+are being iterated over.
Returns the (possibly empty) valid UTF-8 bytes in this chunk.
+This may be empty if there are consecutive sequences of invalid UTF-8 +bytes.
+Returns the (possibly empty) invalid UTF-8 bytes in this chunk that +immediately follow the valid UTF-8 bytes in this chunk.
+This is only empty when this chunk corresponds to the last chunk in +the original bytes.
+The maximum length of this slice is 3. That is, invalid UTF-8 byte
+sequences longer than 1 byte always correspond to a valid prefix of
+a valid UTF-8 encoded codepoint. This corresponds to the “substitution
+of maximal subparts” strategy that is described in more detail in the
+docs for the
+ByteSlice::to_str_lossy
+method.
Returns whether the invalid sequence might still become valid if more +bytes are added.
+Returns true if the end of the input was reached unexpectedly, +without encountering an unexpected byte.
+This can only be the case for the last chunk.
+pub struct Utf8Chunks<'a> { /* private fields */ }
An iterator over chunks of valid UTF-8 in a byte slice.
+See utf8_chunks
.
iter_next_chunk
)N
values. Read moreiter_advance_by
)n
elements. Read moren
th element of the iterator. Read moreiter_intersperse
)separator
+between adjacent items of the original iterator. Read moren
elements. Read moren
elements, or fewer
+if the underlying iterator ends sooner. Read moreiter_map_windows
)f
for each contiguous window of size N
over
+self
and returns an iterator over the outputs of f
. Like slice::windows()
,
+the windows during mapping overlap as well. Read moreiter_collect_into
)iter_is_partitioned
)true
precede all those that return false
. Read moreiterator_try_reduce
)try_find
)iter_array_chunks
)N
elements of the iterator at a time. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read morePartialOrd
elements of
+this Iterator
with those of another. The comparison works like short-circuit
+evaluation, returning a result without comparing the remaining elements.
+As soon as an order can be determined, the evaluation stops and a result is returned. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read moreiter_order_by
)Iterator
are lexicographically
+less than those of another. Read moreIterator
are lexicographically
+less or equal to those of another. Read moreIterator
are lexicographically
+greater than those of another. Read moreIterator
are lexicographically
+greater than or equal to those of another. Read moreis_sorted
)is_sorted
)pub struct Utf8Error { /* private fields */ }
An error that occurs when UTF-8 decoding fails.
+This error occurs when attempting to convert a non-UTF-8 byte
+string to a Rust string that must be valid UTF-8. For example,
+to_str
is one such method.
This example shows what happens when a given byte sequence is invalid, +but ends with a sequence that is a possible prefix of valid UTF-8.
+ +use bstr::{B, ByteSlice};
+
+let s = B(b"foobar\xF1\x80\x80");
+let err = s.to_str().unwrap_err();
+assert_eq!(err.valid_up_to(), 6);
+assert_eq!(err.error_len(), None);
This example shows what happens when a given byte sequence contains +invalid UTF-8.
+ +use bstr::ByteSlice;
+
+let s = b"foobar\xF1\x80\x80quux";
+let err = s.to_str().unwrap_err();
+assert_eq!(err.valid_up_to(), 6);
+// The error length reports the maximum number of bytes that correspond to
+// a valid prefix of a UTF-8 encoded codepoint.
+assert_eq!(err.error_len(), Some(3));
+
+// In contrast to the above which contains a single invalid prefix,
+// consider the case of multiple individual bytes that are never valid
+// prefixes. Note how the value of error_len changes!
+let s = b"foobar\xFF\xFFquux";
+let err = s.to_str().unwrap_err();
+assert_eq!(err.valid_up_to(), 6);
+assert_eq!(err.error_len(), Some(1));
+
+// The fact that it's an invalid prefix does not change error_len even
+// when it immediately precedes the end of the string.
+let s = b"foobar\xFF";
+let err = s.to_str().unwrap_err();
+assert_eq!(err.valid_up_to(), 6);
+assert_eq!(err.error_len(), Some(1));
Returns the byte index of the position immediately following the last +valid UTF-8 byte.
+This example shows how valid_up_to
can be used to retrieve a
+possibly empty prefix that is guaranteed to be valid UTF-8:
use bstr::ByteSlice;
+
+let s = b"foobar\xF1\x80\x80quux";
+let err = s.to_str().unwrap_err();
+
+// This is guaranteed to never panic.
+let string = s[..err.valid_up_to()].to_str().unwrap();
+assert_eq!(string, "foobar");
Returns the total number of invalid UTF-8 bytes immediately following
+the position returned by valid_up_to
. This value is always at least
+1
, but can be up to 3
if bytes form a valid prefix of some UTF-8
+encoded codepoint.
If the end of the original input was found before a valid UTF-8 encoded
+codepoint could be completed, then this returns None
. This is useful
+when processing streams, where a None
value signals that more input
+might be needed.
pub trait ByteSlice: Sealed {
+Show 44 methods
// Provided methods
+ fn as_bstr(&self) -> &BStr { ... }
+ fn as_bstr_mut(&mut self) -> &mut BStr { ... }
+ fn to_str(&self) -> Result<&str, Utf8Error> { ... }
+ unsafe fn to_str_unchecked(&self) -> &str { ... }
+ fn contains_str<B: AsRef<[u8]>>(&self, needle: B) -> bool { ... }
+ fn starts_with_str<B: AsRef<[u8]>>(&self, prefix: B) -> bool { ... }
+ fn ends_with_str<B: AsRef<[u8]>>(&self, suffix: B) -> bool { ... }
+ fn find<B: AsRef<[u8]>>(&self, needle: B) -> Option<usize> { ... }
+ fn rfind<B: AsRef<[u8]>>(&self, needle: B) -> Option<usize> { ... }
+ fn find_iter<'h, 'n, B: ?Sized + AsRef<[u8]>>(
+ &'h self,
+ needle: &'n B
+ ) -> Find<'h, 'n> ⓘ { ... }
+ fn rfind_iter<'h, 'n, B: ?Sized + AsRef<[u8]>>(
+ &'h self,
+ needle: &'n B
+ ) -> FindReverse<'h, 'n> ⓘ { ... }
+ fn find_byte(&self, byte: u8) -> Option<usize> { ... }
+ fn rfind_byte(&self, byte: u8) -> Option<usize> { ... }
+ fn find_char(&self, ch: char) -> Option<usize> { ... }
+ fn rfind_char(&self, ch: char) -> Option<usize> { ... }
+ fn find_byteset<B: AsRef<[u8]>>(&self, byteset: B) -> Option<usize> { ... }
+ fn find_not_byteset<B: AsRef<[u8]>>(&self, byteset: B) -> Option<usize> { ... }
+ fn rfind_byteset<B: AsRef<[u8]>>(&self, byteset: B) -> Option<usize> { ... }
+ fn rfind_not_byteset<B: AsRef<[u8]>>(&self, byteset: B) -> Option<usize> { ... }
+ fn fields_with<F: FnMut(char) -> bool>(&self, f: F) -> FieldsWith<'_, F> ⓘ { ... }
+ fn split_str<'h, 's, B: ?Sized + AsRef<[u8]>>(
+ &'h self,
+ splitter: &'s B
+ ) -> Split<'h, 's> ⓘ { ... }
+ fn rsplit_str<'h, 's, B: ?Sized + AsRef<[u8]>>(
+ &'h self,
+ splitter: &'s B
+ ) -> SplitReverse<'h, 's> ⓘ { ... }
+ fn split_once_str<'a, B: ?Sized + AsRef<[u8]>>(
+ &'a self,
+ splitter: &B
+ ) -> Option<(&'a [u8], &'a [u8])> { ... }
+ fn rsplit_once_str<'a, B: ?Sized + AsRef<[u8]>>(
+ &'a self,
+ splitter: &B
+ ) -> Option<(&'a [u8], &'a [u8])> { ... }
+ fn splitn_str<'h, 's, B: ?Sized + AsRef<[u8]>>(
+ &'h self,
+ limit: usize,
+ splitter: &'s B
+ ) -> SplitN<'h, 's> ⓘ { ... }
+ fn rsplitn_str<'h, 's, B: ?Sized + AsRef<[u8]>>(
+ &'h self,
+ limit: usize,
+ splitter: &'s B
+ ) -> SplitNReverse<'h, 's> ⓘ { ... }
+ fn bytes(&self) -> Bytes<'_> ⓘ { ... }
+ fn chars(&self) -> Chars<'_> ⓘ { ... }
+ fn char_indices(&self) -> CharIndices<'_> ⓘ { ... }
+ fn utf8_chunks(&self) -> Utf8Chunks<'_> ⓘ { ... }
+ fn lines(&self) -> Lines<'_> ⓘ { ... }
+ fn lines_with_terminator(&self) -> LinesWithTerminator<'_> ⓘ { ... }
+ fn trim_with<F: FnMut(char) -> bool>(&self, trim: F) -> &[u8] ⓘ { ... }
+ fn trim_start_with<F: FnMut(char) -> bool>(&self, trim: F) -> &[u8] ⓘ { ... }
+ fn trim_end_with<F: FnMut(char) -> bool>(&self, trim: F) -> &[u8] ⓘ { ... }
+ fn make_ascii_lowercase(&mut self) { ... }
+ fn make_ascii_uppercase(&mut self) { ... }
+ fn escape_bytes(&self) -> EscapeBytes<'_> ⓘ { ... }
+ fn reverse_bytes(&mut self) { ... }
+ fn reverse_chars(&mut self) { ... }
+ fn is_ascii(&self) -> bool { ... }
+ fn is_utf8(&self) -> bool { ... }
+ fn last_byte(&self) -> Option<u8> { ... }
+ fn find_non_ascii_byte(&self) -> Option<usize> { ... }
+}
A trait that extends &[u8]
with string oriented methods.
This trait is sealed and cannot be implemented outside of bstr
.
Return this byte slice as a &BStr
.
Using &BStr
is useful because of its fmt::Debug
representation
+and various other trait implementations (such as PartialEq
and
+PartialOrd
). In particular, the Debug
implementation for BStr
+shows its bytes as a normal string. For invalid UTF-8, hex escape
+sequences are used.
Basic usage:
+ +use bstr::ByteSlice;
+
+println!("{:?}", b"foo\xFFbar".as_bstr());
Return this byte slice as a &mut BStr
.
Using &mut BStr
is useful because of its fmt::Debug
representation
+and various other trait implementations (such as PartialEq
and
+PartialOrd
). In particular, the Debug
implementation for BStr
+shows its bytes as a normal string. For invalid UTF-8, hex escape
+sequences are used.
Basic usage:
+ +use bstr::ByteSlice;
+
+let mut bytes = *b"foo\xFFbar";
+println!("{:?}", &mut bytes.as_bstr_mut());
Safely convert this byte string into a &str
if it’s valid UTF-8.
If this byte string is not valid UTF-8, then an error is returned. The +error returned indicates the first invalid byte found and the length +of the error.
+In cases where a lossy conversion to &str
is acceptable, then use one
+of the to_str_lossy
or
+to_str_lossy_into
+methods.
Basic usage:
+ +use bstr::{B, ByteSlice, ByteVec};
+
+let s = B("☃βツ").to_str()?;
+assert_eq!("☃βツ", s);
+
+let mut bstring = <Vec<u8>>::from("☃βツ");
+bstring.push(b'\xFF');
+let err = bstring.to_str().unwrap_err();
+assert_eq!(8, err.valid_up_to());
Unsafely convert this byte string into a &str
, without checking for
+valid UTF-8.
Callers must ensure that this byte string is valid UTF-8 before
+calling this method. Converting a byte string into a &str
that is
+not valid UTF-8 is considered undefined behavior.
This routine is useful in performance sensitive contexts where the
+UTF-8 validity of the byte string is already known and it is
+undesirable to pay the cost of an additional UTF-8 validation check
+that to_str
performs.
Basic usage:
+ +use bstr::{B, ByteSlice};
+
+// SAFETY: This is safe because string literals are guaranteed to be
+// valid UTF-8 by the Rust compiler.
+let s = unsafe { B("☃βツ").to_str_unchecked() };
+assert_eq!("☃βツ", s);
Returns true if and only if this byte string contains the given needle.
+Basic usage:
+ +use bstr::ByteSlice;
+
+assert!(b"foo bar".contains_str("foo"));
+assert!(b"foo bar".contains_str("bar"));
+assert!(!b"foo".contains_str("foobar"));
Returns true if and only if this byte string has the given prefix.
+Basic usage:
+ +use bstr::ByteSlice;
+
+assert!(b"foo bar".starts_with_str("foo"));
+assert!(!b"foo bar".starts_with_str("bar"));
+assert!(!b"foo".starts_with_str("foobar"));
Returns true if and only if this byte string has the given suffix.
+Basic usage:
+ +use bstr::ByteSlice;
+
+assert!(b"foo bar".ends_with_str("bar"));
+assert!(!b"foo bar".ends_with_str("foo"));
+assert!(!b"bar".ends_with_str("foobar"));
Returns the index of the first occurrence of the given needle.
+The needle may be any type that can be cheaply converted into a
+&[u8]
. This includes, but is not limited to, &str
and &[u8]
.
Note that if you’re searching for the same needle in many
+different small haystacks, it may be faster to initialize a
+Finder
once, and reuse it for each search.
This routine is guaranteed to have worst case linear time complexity
+with respect to both the needle and the haystack. That is, this runs
+in O(needle.len() + haystack.len())
time.
This routine is also guaranteed to have worst case constant space +complexity.
+Basic usage:
+ +use bstr::ByteSlice;
+
+let s = b"foo bar baz";
+assert_eq!(Some(0), s.find("foo"));
+assert_eq!(Some(4), s.find("bar"));
+assert_eq!(None, s.find("quux"));
Returns the index of the last occurrence of the given needle.
+The needle may be any type that can be cheaply converted into a
+&[u8]
. This includes, but is not limited to, &str
and &[u8]
.
Note that if you’re searching for the same needle in many
+different small haystacks, it may be faster to initialize a
+FinderReverse
once, and reuse it for
+each search.
This routine is guaranteed to have worst case linear time complexity
+with respect to both the needle and the haystack. That is, this runs
+in O(needle.len() + haystack.len())
time.
This routine is also guaranteed to have worst case constant space +complexity.
+Basic usage:
+ +use bstr::ByteSlice;
+
+let s = b"foo bar baz";
+assert_eq!(Some(0), s.rfind("foo"));
+assert_eq!(Some(4), s.rfind("bar"));
+assert_eq!(Some(8), s.rfind("ba"));
+assert_eq!(None, s.rfind("quux"));
Returns an iterator of the non-overlapping occurrences of the given +needle. The iterator yields byte offset positions indicating the start +of each match.
+This routine is guaranteed to have worst case linear time complexity
+with respect to both the needle and the haystack. That is, this runs
+in O(needle.len() + haystack.len())
time.
This routine is also guaranteed to have worst case constant space +complexity.
+Basic usage:
+ +use bstr::ByteSlice;
+
+let s = b"foo bar foo foo quux foo";
+let matches: Vec<usize> = s.find_iter("foo").collect();
+assert_eq!(matches, vec![0, 8, 12, 21]);
An empty string matches at every position, including the position +immediately following the last byte:
+ +use bstr::ByteSlice;
+
+let matches: Vec<usize> = b"foo".find_iter("").collect();
+assert_eq!(matches, vec![0, 1, 2, 3]);
+
+let matches: Vec<usize> = b"".find_iter("").collect();
+assert_eq!(matches, vec![0]);
Returns an iterator of the non-overlapping occurrences of the given +needle in reverse. The iterator yields byte offset positions indicating +the start of each match.
+This routine is guaranteed to have worst case linear time complexity
+with respect to both the needle and the haystack. That is, this runs
+in O(needle.len() + haystack.len())
time.
This routine is also guaranteed to have worst case constant space +complexity.
+Basic usage:
+ +use bstr::ByteSlice;
+
+let s = b"foo bar foo foo quux foo";
+let matches: Vec<usize> = s.rfind_iter("foo").collect();
+assert_eq!(matches, vec![21, 12, 8, 0]);
An empty string matches at every position, including the position +immediately following the last byte:
+ +use bstr::ByteSlice;
+
+let matches: Vec<usize> = b"foo".rfind_iter("").collect();
+assert_eq!(matches, vec![3, 2, 1, 0]);
+
+let matches: Vec<usize> = b"".rfind_iter("").collect();
+assert_eq!(matches, vec![0]);
Returns the index of the first occurrence of the given byte. If the
+byte does not occur in this byte string, then None
is returned.
Basic usage:
+ +use bstr::ByteSlice;
+
+assert_eq!(Some(10), b"foo bar baz".find_byte(b'z'));
+assert_eq!(None, b"foo bar baz".find_byte(b'y'));
Returns the index of the last occurrence of the given byte. If the
+byte does not occur in this byte string, then None
is returned.
Basic usage:
+ +use bstr::ByteSlice;
+
+assert_eq!(Some(10), b"foo bar baz".rfind_byte(b'z'));
+assert_eq!(None, b"foo bar baz".rfind_byte(b'y'));
Returns the index of the first occurrence of the given codepoint.
+If the codepoint does not occur in this byte string, then None
is
+returned.
Note that if one searches for the replacement codepoint, \u{FFFD}
,
+then only explicit occurrences of that encoding will be found. Invalid
+UTF-8 sequences will not be matched.
Basic usage:
+ +use bstr::{B, ByteSlice};
+
+assert_eq!(Some(10), b"foo bar baz".find_char('z'));
+assert_eq!(Some(4), B("αβγγδ").find_char('γ'));
+assert_eq!(None, b"foo bar baz".find_char('y'));
Returns the index of the last occurrence of the given codepoint.
+If the codepoint does not occur in this byte string, then None
is
+returned.
Note that if one searches for the replacement codepoint, \u{FFFD}
,
+then only explicit occurrences of that encoding will be found. Invalid
+UTF-8 sequences will not be matched.
Basic usage:
+ +use bstr::{B, ByteSlice};
+
+assert_eq!(Some(10), b"foo bar baz".rfind_char('z'));
+assert_eq!(Some(6), B("αβγγδ").rfind_char('γ'));
+assert_eq!(None, b"foo bar baz".rfind_char('y'));
Returns the index of the first occurrence of any of the bytes in the +provided set.
+The byteset
may be any type that can be cheaply converted into a
+&[u8]
. This includes, but is not limited to, &str
and &[u8]
, but
+note that passing a &str
which contains multibyte characters may not
+behave as you expect: each byte in the &str
is treated as an
+individual member of the byte set.
Note that order is irrelevant for the byteset
parameter, and
+duplicate bytes present in its body are ignored.
This routine is guaranteed to have worst case linear time complexity
+with respect to both the set of bytes and the haystack. That is, this
+runs in O(byteset.len() + haystack.len())
time.
This routine is also guaranteed to have worst case constant space +complexity.
+Basic usage:
+ +use bstr::ByteSlice;
+
+assert_eq!(b"foo bar baz".find_byteset(b"zr"), Some(6));
+assert_eq!(b"foo baz bar".find_byteset(b"bzr"), Some(4));
+assert_eq!(None, b"foo baz bar".find_byteset(b"\t\n"));
+// The empty byteset never matches.
+assert_eq!(None, b"abc".find_byteset(b""));
+assert_eq!(None, b"".find_byteset(b""));
Returns the index of the first occurrence of a byte that is not a +member of the provided set.
+The byteset
may be any type that can be cheaply converted into a
+&[u8]
. This includes, but is not limited to, &str
and &[u8]
, but
+note that passing a &str
which contains multibyte characters may not
+behave as you expect: each byte in the &str
is treated as an
+individual member of the byte set.
Note that order is irrelevant for the byteset
parameter, and
+duplicate bytes present in its body are ignored.
This routine is guaranteed to have worst case linear time complexity
+with respect to both the set of bytes and the haystack. That is, this
+runs in O(byteset.len() + haystack.len())
time.
This routine is also guaranteed to have worst case constant space +complexity.
+Basic usage:
+ +use bstr::ByteSlice;
+
+assert_eq!(b"foo bar baz".find_not_byteset(b"fo "), Some(4));
+assert_eq!(b"\t\tbaz bar".find_not_byteset(b" \t\r\n"), Some(2));
+assert_eq!(b"foo\nbaz\tbar".find_not_byteset(b"\t\n"), Some(0));
+// The negation of the empty byteset matches everything.
+assert_eq!(Some(0), b"abc".find_not_byteset(b""));
+// But an empty string never contains anything.
+assert_eq!(None, b"".find_not_byteset(b""));
Returns the index of the last occurrence of any of the bytes in the +provided set.
+The byteset
may be any type that can be cheaply converted into a
+&[u8]
. This includes, but is not limited to, &str
and &[u8]
, but
+note that passing a &str
which contains multibyte characters may not
+behave as you expect: each byte in the &str
is treated as an
+individual member of the byte set.
Note that order is irrelevant for the byteset
parameter, and duplicate
+bytes present in its body are ignored.
This routine is guaranteed to have worst case linear time complexity
+with respect to both the set of bytes and the haystack. That is, this
+runs in O(byteset.len() + haystack.len())
time.
This routine is also guaranteed to have worst case constant space +complexity.
+Basic usage:
+ +use bstr::ByteSlice;
+
+assert_eq!(b"foo bar baz".rfind_byteset(b"agb"), Some(9));
+assert_eq!(b"foo baz bar".rfind_byteset(b"rabz "), Some(10));
+assert_eq!(b"foo baz bar".rfind_byteset(b"\n123"), None);
Returns the index of the last occurrence of a byte that is not a member +of the provided set.
+The byteset
may be any type that can be cheaply converted into a
+&[u8]
. This includes, but is not limited to, &str
and &[u8]
, but
+note that passing a &str
which contains multibyte characters may not
+behave as you expect: each byte in the &str
is treated as an
+individual member of the byte set.
Note that order is irrelevant for the byteset
parameter, and
+duplicate bytes present in its body are ignored.
This routine is guaranteed to have worst case linear time complexity
+with respect to both the set of bytes and the haystack. That is, this
+runs in O(byteset.len() + haystack.len())
time.
This routine is also guaranteed to have worst case constant space +complexity.
+Basic usage:
+ +use bstr::ByteSlice;
+
+assert_eq!(b"foo bar baz,\t".rfind_not_byteset(b",\t"), Some(10));
+assert_eq!(b"foo baz bar".rfind_not_byteset(b"rabz "), Some(2));
+assert_eq!(None, b"foo baz bar".rfind_not_byteset(b"barfoz "));
Returns an iterator over the fields in a byte string, separated by +contiguous codepoints satisfying the given predicate.
+If this byte string is not valid UTF-8, then the given closure will +be called with a Unicode replacement codepoint when invalid UTF-8 +bytes are seen.
+Basic usage:
+ +use bstr::{B, ByteSlice};
+
+let s = b"123foo999999bar1quux123456";
+let fields: Vec<&[u8]> = s.fields_with(|c| c.is_numeric()).collect();
+assert_eq!(fields, vec![B("foo"), B("bar"), B("quux")]);
A byte string consisting of all codepoints satisfying the predicate +yields no elements:
+ +use bstr::ByteSlice;
+
+assert_eq!(0, b"1911354563".fields_with(|c| c.is_numeric()).count());
Returns an iterator over substrings of this byte string, separated +by the given byte string. Each element yielded is guaranteed not to +include the splitter substring.
+The splitter may be any type that can be cheaply converted into a
+&[u8]
. This includes, but is not limited to, &str
and &[u8]
.
Basic usage:
+ +use bstr::{B, ByteSlice};
+
+let x: Vec<&[u8]> = b"Mary had a little lamb".split_str(" ").collect();
+assert_eq!(x, vec![
+ B("Mary"), B("had"), B("a"), B("little"), B("lamb"),
+]);
+
+let x: Vec<&[u8]> = b"".split_str("X").collect();
+assert_eq!(x, vec![b""]);
+
+let x: Vec<&[u8]> = b"lionXXtigerXleopard".split_str("X").collect();
+assert_eq!(x, vec![B("lion"), B(""), B("tiger"), B("leopard")]);
+
+let x: Vec<&[u8]> = b"lion::tiger::leopard".split_str("::").collect();
+assert_eq!(x, vec![B("lion"), B("tiger"), B("leopard")]);
If a string contains multiple contiguous separators, you will end up +with empty strings yielded by the iterator:
+ +use bstr::{B, ByteSlice};
+
+let x: Vec<&[u8]> = b"||||a||b|c".split_str("|").collect();
+assert_eq!(x, vec![
+ B(""), B(""), B(""), B(""), B("a"), B(""), B("b"), B("c"),
+]);
+
+let x: Vec<&[u8]> = b"(///)".split_str("/").collect();
+assert_eq!(x, vec![B("("), B(""), B(""), B(")")]);
Separators at the start or end of a string are neighbored by empty +strings.
+ +use bstr::{B, ByteSlice};
+
+let x: Vec<&[u8]> = b"010".split_str("0").collect();
+assert_eq!(x, vec![B(""), B("1"), B("")]);
When the empty string is used as a separator, it splits every byte +in the byte string, along with the beginning and end of the byte +string.
+ +use bstr::{B, ByteSlice};
+
+let x: Vec<&[u8]> = b"rust".split_str("").collect();
+assert_eq!(x, vec![
+ B(""), B("r"), B("u"), B("s"), B("t"), B(""),
+]);
+
+// Splitting by an empty string is not UTF-8 aware. Elements yielded
+// may not be valid UTF-8!
+let x: Vec<&[u8]> = B("☃").split_str("").collect();
+assert_eq!(x, vec![
+ B(""), B(b"\xE2"), B(b"\x98"), B(b"\x83"), B(""),
+]);
Contiguous separators, especially whitespace, can lead to possibly +surprising behavior. For example, this code is correct:
+ +use bstr::{B, ByteSlice};
+
+let x: Vec<&[u8]> = b"    a  b c".split_str(" ").collect();
+assert_eq!(x, vec![
+ B(""), B(""), B(""), B(""), B("a"), B(""), B("b"), B("c"),
+]);
It does not give you ["a", "b", "c"]
. For that behavior, use
+fields
instead.
Returns an iterator over substrings of this byte string, separated by +the given byte string, in reverse. Each element yielded is guaranteed +not to include the splitter substring.
+The splitter may be any type that can be cheaply converted into a
+&[u8]
. This includes, but is not limited to, &str
and &[u8]
.
Basic usage:
+ +use bstr::{B, ByteSlice};
+
+let x: Vec<&[u8]> =
+ b"Mary had a little lamb".rsplit_str(" ").collect();
+assert_eq!(x, vec![
+ B("lamb"), B("little"), B("a"), B("had"), B("Mary"),
+]);
+
+let x: Vec<&[u8]> = b"".rsplit_str("X").collect();
+assert_eq!(x, vec![b""]);
+
+let x: Vec<&[u8]> = b"lionXXtigerXleopard".rsplit_str("X").collect();
+assert_eq!(x, vec![B("leopard"), B("tiger"), B(""), B("lion")]);
+
+let x: Vec<&[u8]> = b"lion::tiger::leopard".rsplit_str("::").collect();
+assert_eq!(x, vec![B("leopard"), B("tiger"), B("lion")]);
If a string contains multiple contiguous separators, you will end up +with empty strings yielded by the iterator:
+ +use bstr::{B, ByteSlice};
+
+let x: Vec<&[u8]> = b"||||a||b|c".rsplit_str("|").collect();
+assert_eq!(x, vec![
+ B("c"), B("b"), B(""), B("a"), B(""), B(""), B(""), B(""),
+]);
+
+let x: Vec<&[u8]> = b"(///)".rsplit_str("/").collect();
+assert_eq!(x, vec![B(")"), B(""), B(""), B("(")]);
Separators at the start or end of a string are neighbored by empty +strings.
+ +use bstr::{B, ByteSlice};
+
+let x: Vec<&[u8]> = b"010".rsplit_str("0").collect();
+assert_eq!(x, vec![B(""), B("1"), B("")]);
When the empty string is used as a separator, it splits every byte +in the byte string, along with the beginning and end of the byte +string.
+ +use bstr::{B, ByteSlice};
+
+let x: Vec<&[u8]> = b"rust".rsplit_str("").collect();
+assert_eq!(x, vec![
+ B(""), B("t"), B("s"), B("u"), B("r"), B(""),
+]);
+
+// Splitting by an empty string is not UTF-8 aware. Elements yielded
+// may not be valid UTF-8!
+let x: Vec<&[u8]> = B("☃").rsplit_str("").collect();
+assert_eq!(x, vec![B(""), B(b"\x83"), B(b"\x98"), B(b"\xE2"), B("")]);
Contiguous separators, especially whitespace, can lead to possibly +surprising behavior. For example, this code is correct:
+ +use bstr::{B, ByteSlice};
+
+let x: Vec<&[u8]> = b"    a  b c".rsplit_str(" ").collect();
+assert_eq!(x, vec![
+ B("c"), B("b"), B(""), B("a"), B(""), B(""), B(""), B(""),
+]);
It does not give you ["a", "b", "c"]
.
Split this byte string at the first occurrence of splitter
.
If the splitter
is found in the byte string, returns a tuple
+containing the parts of the string before and after the first occurrence
+of splitter
, respectively. Otherwise, if there are no occurrences of
+splitter
in the byte string, returns None
.
The splitter may be any type that can be cheaply converted into a
+&[u8]
. This includes, but is not limited to, &str
and &[u8]
.
If you need to split on the last instance of a delimiter instead, see
+the ByteSlice::rsplit_once_str
method.
Basic usage:
+ +use bstr::{B, ByteSlice};
+
+assert_eq!(
+ B("foo,bar").split_once_str(","),
+ Some((B("foo"), B("bar"))),
+);
+assert_eq!(
+ B("foo,bar,baz").split_once_str(","),
+ Some((B("foo"), B("bar,baz"))),
+);
+assert_eq!(B("foo").split_once_str(","), None);
+assert_eq!(B("foo,").split_once_str(b","), Some((B("foo"), B(""))));
+assert_eq!(B(",foo").split_once_str(b","), Some((B(""), B("foo"))));
Split this byte string at the last occurrence of splitter
.
If the splitter
is found in the byte string, returns a tuple
+containing the parts of the string before and after the last occurrence
+of splitter
, respectively. Otherwise, if there are no occurrences of
+splitter
in the byte string, returns None
.
The splitter may be any type that can be cheaply converted into a
+&[u8]
. This includes, but is not limited to, &str
and &[u8]
.
If you need to split on the first instance of a delimiter instead, see
+the ByteSlice::split_once_str
method.
Basic usage:
+ +use bstr::{B, ByteSlice};
+
+assert_eq!(
+ B("foo,bar").rsplit_once_str(","),
+ Some((B("foo"), B("bar"))),
+);
+assert_eq!(
+ B("foo,bar,baz").rsplit_once_str(","),
+ Some((B("foo,bar"), B("baz"))),
+);
+assert_eq!(B("foo").rsplit_once_str(","), None);
+assert_eq!(B("foo,").rsplit_once_str(b","), Some((B("foo"), B(""))));
+assert_eq!(B(",foo").rsplit_once_str(b","), Some((B(""), B("foo"))));
Returns an iterator of at most limit
substrings of this byte string,
+separated by the given byte string. If limit
substrings are yielded,
+then the last substring will contain the remainder of this byte string.
The needle may be any type that can be cheaply converted into a
+&[u8]
. This includes, but is not limited to, &str
and &[u8]
.
Basic usage:
+ +use bstr::{B, ByteSlice};
+
+let x: Vec<_> = b"Mary had a little lamb".splitn_str(3, " ").collect();
+assert_eq!(x, vec![B("Mary"), B("had"), B("a little lamb")]);
+
+let x: Vec<_> = b"".splitn_str(3, "X").collect();
+assert_eq!(x, vec![b""]);
+
+let x: Vec<_> = b"lionXXtigerXleopard".splitn_str(3, "X").collect();
+assert_eq!(x, vec![B("lion"), B(""), B("tigerXleopard")]);
+
+let x: Vec<_> = b"lion::tiger::leopard".splitn_str(2, "::").collect();
+assert_eq!(x, vec![B("lion"), B("tiger::leopard")]);
+
+let x: Vec<_> = b"abcXdef".splitn_str(1, "X").collect();
+assert_eq!(x, vec![B("abcXdef")]);
+
+let x: Vec<_> = b"abcdef".splitn_str(2, "X").collect();
+assert_eq!(x, vec![B("abcdef")]);
+
+let x: Vec<_> = b"abcXdef".splitn_str(0, "X").collect();
+assert!(x.is_empty());
Returns an iterator of at most limit
substrings of this byte string,
+separated by the given byte string, in reverse. If limit
substrings
+are yielded, then the last substring will contain the remainder of this
+byte string.
The needle may be any type that can be cheaply converted into a
+&[u8]
. This includes, but is not limited to, &str
and &[u8]
.
Basic usage:
+ +use bstr::{B, ByteSlice};
+
+let x: Vec<_> =
+ b"Mary had a little lamb".rsplitn_str(3, " ").collect();
+assert_eq!(x, vec![B("lamb"), B("little"), B("Mary had a")]);
+
+let x: Vec<_> = b"".rsplitn_str(3, "X").collect();
+assert_eq!(x, vec![b""]);
+
+let x: Vec<_> = b"lionXXtigerXleopard".rsplitn_str(3, "X").collect();
+assert_eq!(x, vec![B("leopard"), B("tiger"), B("lionX")]);
+
+let x: Vec<_> = b"lion::tiger::leopard".rsplitn_str(2, "::").collect();
+assert_eq!(x, vec![B("leopard"), B("lion::tiger")]);
+
+let x: Vec<_> = b"abcXdef".rsplitn_str(1, "X").collect();
+assert_eq!(x, vec![B("abcXdef")]);
+
+let x: Vec<_> = b"abcdef".rsplitn_str(2, "X").collect();
+assert_eq!(x, vec![B("abcdef")]);
+
+let x: Vec<_> = b"abcXdef".rsplitn_str(0, "X").collect();
+assert!(x.is_empty());
Returns an iterator over the bytes in this byte string.
+Basic usage:
+ +use bstr::ByteSlice;
+
+let bs = b"foobar";
+let bytes: Vec<u8> = bs.bytes().collect();
+assert_eq!(bytes, bs);
Returns an iterator over the Unicode scalar values in this byte string. +If invalid UTF-8 is encountered, then the Unicode replacement codepoint +is yielded instead.
+Basic usage:
+ +use bstr::ByteSlice;
+
+let bs = b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61";
+let chars: Vec<char> = bs.chars().collect();
+assert_eq!(vec!['☃', '\u{FFFD}', '𝞃', '\u{FFFD}', 'a'], chars);
Codepoints can also be iterated over in reverse:
+ +use bstr::ByteSlice;
+
+let bs = b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61";
+let chars: Vec<char> = bs.chars().rev().collect();
+assert_eq!(vec!['a', '\u{FFFD}', '𝞃', '\u{FFFD}', '☃'], chars);
Returns an iterator over the Unicode scalar values in this byte string +along with their starting and ending byte index positions. If invalid +UTF-8 is encountered, then the Unicode replacement codepoint is yielded +instead.
+Note that this is slightly different from the CharIndices
iterator
+provided by the standard library. Aside from working on possibly
+invalid UTF-8, this iterator provides both the corresponding starting
+and ending byte indices of each codepoint yielded. The ending position
+is necessary to slice the original byte string when invalid UTF-8 bytes
+are converted into a Unicode replacement codepoint, since a single
+replacement codepoint can substitute anywhere from 1 to 3 invalid bytes
+(inclusive).
Basic usage:
+ +use bstr::ByteSlice;
+
+let bs = b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61";
+let chars: Vec<(usize, usize, char)> = bs.char_indices().collect();
+assert_eq!(chars, vec![
+ (0, 3, '☃'),
+ (3, 4, '\u{FFFD}'),
+ (4, 8, '𝞃'),
+ (8, 10, '\u{FFFD}'),
+ (10, 11, 'a'),
+]);
Codepoints can also be iterated over in reverse:
+ +use bstr::ByteSlice;
+
+let bs = b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61";
+let chars: Vec<(usize, usize, char)> = bs
+ .char_indices()
+ .rev()
+ .collect();
+assert_eq!(chars, vec![
+ (10, 11, 'a'),
+ (8, 10, '\u{FFFD}'),
+ (4, 8, '𝞃'),
+ (3, 4, '\u{FFFD}'),
+ (0, 3, '☃'),
+]);
Iterate over chunks of valid UTF-8.
+The iterator returned yields chunks of valid UTF-8 separated by invalid
+UTF-8 bytes, if they exist. Invalid UTF-8 bytes are always 1-3 bytes,
+which are determined via the “substitution of maximal subparts”
+strategy described in the docs for the
+ByteSlice::to_str_lossy
+method.
This example shows how to gather all valid and invalid chunks from a +byte slice:
+ +use bstr::{ByteSlice, Utf8Chunk};
+
+let bytes = b"foo\xFD\xFEbar\xFF";
+
+let (mut valid_chunks, mut invalid_chunks) = (vec![], vec![]);
+for chunk in bytes.utf8_chunks() {
+ if !chunk.valid().is_empty() {
+ valid_chunks.push(chunk.valid());
+ }
+ if !chunk.invalid().is_empty() {
+ invalid_chunks.push(chunk.invalid());
+ }
+}
+
+assert_eq!(valid_chunks, vec!["foo", "bar"]);
+assert_eq!(invalid_chunks, vec![b"\xFD", b"\xFE", b"\xFF"]);
An iterator over all lines in a byte string, without their +terminators.
+For this iterator, the only line terminators recognized are \r\n
and
+\n
.
Basic usage:
+ +use bstr::{B, ByteSlice};
+
+let s = b"\
+foo
+
+bar\r
+baz
+
+
+quux";
+let lines: Vec<&[u8]> = s.lines().collect();
+assert_eq!(lines, vec![
+ B("foo"), B(""), B("bar"), B("baz"), B(""), B(""), B("quux"),
+]);
An iterator over all lines in a byte string, including their +terminators.
+For this iterator, the only line terminator recognized is \n
. (Since
+line terminators are included, this also handles \r\n
line endings.)
Line terminators are only included if they are present in the original +byte string. For example, the last line in a byte string may not end +with a line terminator.
+Concatenating all elements yielded by this iterator is guaranteed to +yield the original byte string.
+Basic usage:
+ +use bstr::{B, ByteSlice};
+
+let s = b"\
+foo
+
+bar\r
+baz
+
+
+quux";
+let lines: Vec<&[u8]> = s.lines_with_terminator().collect();
+assert_eq!(lines, vec![
+ B("foo\n"),
+ B("\n"),
+ B("bar\r\n"),
+ B("baz\n"),
+ B("\n"),
+ B("\n"),
+ B("quux"),
+]);
Return a byte string slice with leading and trailing characters +satisfying the given predicate removed.
+Basic usage:
+ +use bstr::{B, ByteSlice};
+
+let s = b"123foo5bar789";
+assert_eq!(s.trim_with(|c| c.is_numeric()), B("foo5bar"));
Return a byte string slice with leading characters satisfying the given +predicate removed.
+Basic usage:
+ +use bstr::{B, ByteSlice};
+
+let s = b"123foo5bar789";
+assert_eq!(s.trim_start_with(|c| c.is_numeric()), B("foo5bar789"));
Return a byte string slice with trailing characters satisfying the +given predicate removed.
+Basic usage:
+ +use bstr::{B, ByteSlice};
+
+let s = b"123foo5bar789";
+assert_eq!(s.trim_end_with(|c| c.is_numeric()), B("123foo5bar"));
Convert this byte string to its lowercase ASCII equivalent in place.
+In this case, lowercase is only defined in ASCII letters. Namely, the
+letters A-Z
are converted to a-z
. All other bytes remain unchanged.
If you don’t need to do the conversion in
+place and instead prefer convenience, then use
+to_ascii_lowercase
instead.
Basic usage:
+ +use bstr::ByteSlice;
+
+let mut s = <Vec<u8>>::from("HELLO Β");
+s.make_ascii_lowercase();
+assert_eq!(s, "hello Β".as_bytes());
Invalid UTF-8 remains as is:
+ +use bstr::{B, ByteSlice, ByteVec};
+
+let mut s = <Vec<u8>>::from_slice(b"FOO\xFFBAR\xE2\x98BAZ");
+s.make_ascii_lowercase();
+assert_eq!(s, B(b"foo\xFFbar\xE2\x98baz"));
Convert this byte string to its uppercase ASCII equivalent in place.
+In this case, uppercase is only defined in ASCII letters. Namely, the
+letters a-z
are converted to A-Z
. All other bytes remain unchanged.
If you don’t need to do the conversion in
+place and instead prefer convenience, then use
+to_ascii_uppercase
instead.
Basic usage:
+ +use bstr::{B, ByteSlice};
+
+let mut s = <Vec<u8>>::from("hello β");
+s.make_ascii_uppercase();
+assert_eq!(s, B("HELLO β"));
Invalid UTF-8 remains as is:
+ +use bstr::{B, ByteSlice, ByteVec};
+
+let mut s = <Vec<u8>>::from_slice(b"foo\xFFbar\xE2\x98baz");
+s.make_ascii_uppercase();
+assert_eq!(s, B(b"FOO\xFFBAR\xE2\x98BAZ"));
Escapes this byte string into a sequence of char
values.
When the sequence of char
values is concatenated into a string, the
+result is always valid UTF-8. Any unprintable or invalid UTF-8 in this
+byte string is escaped using \xNN
notation. Moreover, the
+characters \0
, \r
, \n
, \t
and \
are escaped as well.
This is useful when one wants to get a human readable view of the raw +bytes that is also valid UTF-8.
+The iterator returned implements the Display
trait. So one can do
+b"foo\xFFbar".escape_bytes().to_string()
to get a String
with its
+bytes escaped.
The dual of this function is ByteVec::unescape_bytes
.
Note that this is similar to, but not equivalent to, the Debug
+implementation on BStr
and BString
. The Debug
implementations
+also use the debug representation for all Unicode codepoints. However,
+this escaping routine only escapes individual bytes. All Unicode
+codepoints above U+007F
are passed through unchanged without any
+escaping.
use bstr::{B, ByteSlice};
+
+assert_eq!(r"foo\xFFbar", b"foo\xFFbar".escape_bytes().to_string());
+assert_eq!(r"foo\nbar", b"foo\nbar".escape_bytes().to_string());
+assert_eq!(r"foo\tbar", b"foo\tbar".escape_bytes().to_string());
+assert_eq!(r"foo\\bar", b"foo\\bar".escape_bytes().to_string());
+assert_eq!(r"foo☃bar", B("foo☃bar").escape_bytes().to_string());
Reverse the bytes in this string, in place.
+This is not necessarily a well formed operation! For example, if this +byte string contains valid UTF-8 that isn’t ASCII, then reversing the +string will likely result in invalid UTF-8 and otherwise nonsensical +content.
+Note that this is equivalent to the generic [u8]::reverse
method.
+This method is provided to permit callers to explicitly differentiate
+between reversing bytes, codepoints and graphemes.
Basic usage:
+ +use bstr::ByteSlice;
+
+let mut s = <Vec<u8>>::from("hello");
+s.reverse_bytes();
+assert_eq!(s, "olleh".as_bytes());
Reverse the codepoints in this string, in place.
+If this byte string is valid UTF-8, then its reversal by codepoint +is also guaranteed to be valid UTF-8.
+This operation is equivalent to the following, but without allocating:
+ +use bstr::ByteSlice;
+
+let mut s = <Vec<u8>>::from("foo☃bar");
+
+let mut chars: Vec<char> = s.chars().collect();
+chars.reverse();
+
+let reversed: String = chars.into_iter().collect();
+assert_eq!(reversed, "rab☃oof");
Note that this is not necessarily a well formed operation. For example,
+if this byte string contains grapheme clusters with more than one
+codepoint, then those grapheme clusters will not necessarily be
+preserved. If you’d like to preserve grapheme clusters, then use
+reverse_graphemes
instead.
Basic usage:
+ +use bstr::ByteSlice;
+
+let mut s = <Vec<u8>>::from("foo☃bar");
+s.reverse_chars();
+assert_eq!(s, "rab☃oof".as_bytes());
This example shows that not all reversals lead to a well formed string. +For example, in this case, combining marks are used to put accents over +some letters, and those accent marks must appear after the codepoints +they modify.
+ +use bstr::{B, ByteSlice};
+
+let mut s = <Vec<u8>>::from("résumé");
+s.reverse_chars();
+assert_eq!(s, B(b"\xCC\x81emus\xCC\x81er"));
A word of warning: the above example relies on the fact that
+résumé
is in decomposed normal form, which means there are separate
+codepoints for the accents above e
. If it is instead in composed
+normal form, then the example works:
use bstr::{B, ByteSlice};
+
+let mut s = <Vec<u8>>::from("résumé");
+s.reverse_chars();
+assert_eq!(s, B("émusér"));
The point here is to be cautious and not assume that just because
+reverse_chars
works in one case, that it therefore works in all
+cases.
Returns true if and only if every byte in this byte string is ASCII.
+ASCII is an encoding that defines 128 codepoints. A byte corresponds to
+an ASCII codepoint if and only if it is in the inclusive range
+[0, 127]
.
Basic usage:
+ +use bstr::{B, ByteSlice};
+
+assert!(B("abc").is_ascii());
+assert!(!B("☃βツ").is_ascii());
+assert!(!B(b"\xFF").is_ascii());
Returns true if and only if the entire byte string is valid UTF-8.
+If you need location information about where a byte string’s first
+invalid UTF-8 byte is, then use the to_str
method.
Basic usage:
+ +use bstr::{B, ByteSlice};
+
+assert!(B("abc").is_utf8());
+assert!(B("☃βツ").is_utf8());
+// invalid bytes
+assert!(!B(b"abc\xFF").is_utf8());
+// surrogate encoding
+assert!(!B(b"\xED\xA0\x80").is_utf8());
+// incomplete sequence
+assert!(!B(b"\xF0\x9D\x9Ca").is_utf8());
+// overlong sequence
+assert!(!B(b"\xF0\x82\x82\xAC").is_utf8());
Returns the last byte in this byte string, if it’s non-empty. If this
+byte string is empty, this returns None
.
Note that this is like the generic [u8]::last
, except this returns
+the byte by value instead of a reference to the byte.
Basic usage:
+ +use bstr::ByteSlice;
+
+assert_eq!(Some(b'z'), b"baz".last_byte());
+assert_eq!(None, b"".last_byte());
Returns the index of the first non-ASCII byte in this byte string (if
+any such indices exist). Specifically, it returns the index of the
+first byte with a value greater than or equal to 0x80
.
Basic usage:
+ +use bstr::{ByteSlice, B};
+
+assert_eq!(Some(3), b"abc\xff".find_non_ascii_byte());
+assert_eq!(None, b"abcde".find_non_ascii_byte());
+assert_eq!(Some(0), B("😀").find_non_ascii_byte());
Redirecting to ../../bstr/fn.decode_utf8.html...
+ + + \ No newline at end of file diff --git a/bstr/utf8/fn.decode_last.html b/bstr/utf8/fn.decode_last.html new file mode 100644 index 000000000..266c86194 --- /dev/null +++ b/bstr/utf8/fn.decode_last.html @@ -0,0 +1,11 @@ + + + + +Redirecting to ../../bstr/fn.decode_last_utf8.html...
+ + + \ No newline at end of file diff --git a/bstr/utf8/struct.CharIndices.html b/bstr/utf8/struct.CharIndices.html new file mode 100644 index 000000000..170d585f1 --- /dev/null +++ b/bstr/utf8/struct.CharIndices.html @@ -0,0 +1,11 @@ + + + + +Redirecting to ../../bstr/struct.CharIndices.html...
+ + + \ No newline at end of file diff --git a/bstr/utf8/struct.Chars.html b/bstr/utf8/struct.Chars.html new file mode 100644 index 000000000..cd0f70649 --- /dev/null +++ b/bstr/utf8/struct.Chars.html @@ -0,0 +1,11 @@ + + + + +Redirecting to ../../bstr/struct.Chars.html...
+ + + \ No newline at end of file diff --git a/bstr/utf8/struct.Utf8Chunk.html b/bstr/utf8/struct.Utf8Chunk.html new file mode 100644 index 000000000..33d58af5c --- /dev/null +++ b/bstr/utf8/struct.Utf8Chunk.html @@ -0,0 +1,11 @@ + + + + +Redirecting to ../../bstr/struct.Utf8Chunk.html...
+ + + \ No newline at end of file diff --git a/bstr/utf8/struct.Utf8Chunks.html b/bstr/utf8/struct.Utf8Chunks.html new file mode 100644 index 000000000..da5b637a9 --- /dev/null +++ b/bstr/utf8/struct.Utf8Chunks.html @@ -0,0 +1,11 @@ + + + + +Redirecting to ../../bstr/struct.Utf8Chunks.html...
+ + + \ No newline at end of file diff --git a/bstr/utf8/struct.Utf8Error.html b/bstr/utf8/struct.Utf8Error.html new file mode 100644 index 000000000..2c0fe9c9b --- /dev/null +++ b/bstr/utf8/struct.Utf8Error.html @@ -0,0 +1,11 @@ + + + + +Redirecting to ../../bstr/struct.Utf8Error.html...
+ + + \ No newline at end of file diff --git a/crates.js b/crates.js new file mode 100644 index 000000000..0b0382bd8 --- /dev/null +++ b/crates.js @@ -0,0 +1 @@ +window.ALL_CRATES = ["bstr","memchr","roe"]; \ No newline at end of file diff --git a/help.html b/help.html new file mode 100644 index 000000000..ca5a10cfe --- /dev/null +++ b/help.html @@ -0,0 +1 @@ +pub fn is_equal(x: &[u8], y: &[u8]) -> bool
Compare corresponding bytes in x
and y
for equality.
That is, this returns true if and only if x.len() == y.len()
and
+x[i] == y[i]
for all 0 <= i < x.len()
.
This routine is marked inline(always)
. If you want to call this function
+in a way that is not always inlined, you’ll need to wrap a call to it in
+another function that is marked as inline(never)
or just inline
.
Why not use slice equality instead? Well, slice equality usually results in
+a call out to the current platform’s libc
which might not be inlineable
+or have other overhead. This routine isn’t guaranteed to be a win, but it
+might be in some cases.
pub unsafe fn is_equal_raw(x: *const u8, y: *const u8, n: usize) -> bool
Compare n
bytes at the given pointers for equality.
This returns true if and only if *x.add(i) == *y.add(i)
for all
+0 <= i < n
.
This routine is marked inline(always)
. If you want to call this function
+in a way that is not always inlined, you’ll need to wrap a call to it in
+another function that is marked as inline(never)
or just inline
.
Why not use slice equality instead? Well, slice equality usually results in
+a call out to the current platform’s libc
which might not be inlineable
+or have other overhead. This routine isn’t guaranteed to be a win, but it
+might be in some cases.
x
and y
must be valid for reads of up to n
bytes.x
and y
must point to an initialized value.x
and y
must each point to an allocated object and
+must either be in bounds or at most one byte past the end of the
+allocated object. x
and y
do not need to point to the same allocated
+object, but they may.x
and y
must be derived from a pointer to their respective
+allocated objects.x
and x+n
must not overflow isize
. Similarly
+for y
and y+n
.pub fn is_prefix(haystack: &[u8], needle: &[u8]) -> bool
Returns true if and only if needle
is a prefix of haystack
.
This uses a latency optimized variant of memcmp
internally which might
+make this faster for very short strings.
This routine is marked inline(always)
. If you want to call this function
+in a way that is not always inlined, you’ll need to wrap a call to it in
+another function that is marked as inline(never)
or just inline
.
pub fn is_suffix(haystack: &[u8], needle: &[u8]) -> bool
Returns true if and only if needle
is a suffix of haystack
.
This uses a latency optimized variant of memcmp
internally which might
+make this faster for very short strings.
This routine is marked inline(always)
. If you want to call this function
+in a way that is not always inlined, you’ll need to wrap a call to it in
+another function that is marked as inline(never)
or just inline
.
Contains architecture independent routines.
+These routines are often used as a “fallback” implementation when the more +specialized architecture dependent routines are unavailable.
+memchr
and friends.x
and y
for equality.n
bytes at the given pointers for equality.needle
is a prefix of haystack
.needle
is a suffix of haystack
.Provides architecture independent implementations of memchr
and friends.
The main types in this module are One
, Two
and Three
. They are for
+searching for one, two or three distinct bytes, respectively, in a haystack.
+Each type also has corresponding double ended iterators. These searchers
+are typically slower than hand-coded vector routines accomplishing the same
+task, but are also typically faster than naive scalar code. These routines
+effectively work by treating a usize
as a vector of 8-bit lanes, and thus
+achieve some level of data parallelism even without explicit vector support.
The One
searcher also provides a One::count
routine for efficiently
+counting the number of times a single byte occurs in a haystack. This is
+useful, for example, for counting the number of lines in a haystack. This
+routine exists because it is usually faster, especially with a high match
+count, than using One::find
repeatedly. (OneIter
specializes its
+Iterator::count
implementation to use this routine.)
Only one, two and three bytes are supported because three bytes is about
+the point where one sees diminishing returns. Beyond this point, it’s
+probably (but not necessarily) better to just use a simple [bool; 256]
array
+or similar. However, it depends mightily on the specific work-load and the
+expected match frequency.
pub struct One { /* private fields */ }
Finds all occurrences of a single byte in a haystack.
+Create a new searcher that finds occurrences of the byte given.
+Return the first occurrence of the needle in the given haystack. If no
+such occurrence exists, then None
is returned.
The occurrence is reported as an offset into haystack
. Its maximum
+value for a non-empty haystack is haystack.len() - 1
.
Return the last occurrence of the needle in the given haystack. If no
+such occurrence exists, then None
is returned.
The occurrence is reported as an offset into haystack
. Its maximum
+value for a non-empty haystack is haystack.len() - 1
.
Counts all occurrences of this byte in the given haystack.
+Like find
, but accepts and returns raw pointers.
When a match is found, the pointer returned is guaranteed to be
+>= start
and < end
.
This routine is useful if you’re already using raw pointers and would +like to avoid converting back to a slice before executing a search.
+start
and end
must be valid for reads.start
and end
must point to an initialized value.start
and end
must point to the same allocated object and
+must either be in bounds or at most one byte past the end of the
+allocated object.start
and end
must be derived from a pointer to the same
+object.start
and end
must not overflow isize
.Note that callers may pass a pair of pointers such that start >= end
.
+In that case, None
will always be returned.
Like rfind
, but accepts and returns raw pointers.
When a match is found, the pointer returned is guaranteed to be
+>= start
and < end
.
This routine is useful if you’re already using raw pointers and would +like to avoid converting back to a slice before executing a search.
+start
and end
must be valid for reads.start
and end
must point to an initialized value.start
and end
must point to the same allocated object and
+must either be in bounds or at most one byte past the end of the
+allocated object.start
and end
must be derived from a pointer to the same
+object.start
and end
must not overflow isize
.Note that callers may pass a pair of pointers such that start >= end
.
+In that case, None
will always be returned.
Counts all occurrences of this byte in the given haystack represented +by raw pointers.
+This routine is useful if you’re already using raw pointers and would +like to avoid converting back to a slice before executing a search.
+start
and end
must be valid for reads.start
and end
must point to an initialized value.start
and end
must point to the same allocated object and
+must either be in bounds or at most one byte past the end of the
+allocated object.start
and end
must be derived from a pointer to the same
+object.start
and end
must not overflow isize
.Note that callers may pass a pair of pointers such that start >= end
.
+In that case, 0
will always be returned.
pub struct OneIter<'a, 'h> { /* private fields */ }
An iterator over all occurrences of a single byte in a haystack.
+This iterator implements DoubleEndedIterator
, which means it can also be
+used to find occurrences in reverse order.
This iterator is created by the One::iter
method.
The lifetime parameters are as follows:
+'a
refers to the lifetime of the underlying One
searcher.'h
refers to the lifetime of the haystack being searched.iter_advance_by
)n
elements. Read moren
th element from the end of the iterator. Read moreIterator::try_fold()
: it takes
+elements starting from the back of the iterator. Read moreiter_next_chunk
)N
values. Read moreiter_advance_by
)n
elements. Read moren
th element of the iterator. Read moreiter_intersperse
)separator
+between adjacent items of the original iterator. Read moren
elements. Read moren
elements, or fewer
+if the underlying iterator ends sooner. Read moreiter_map_windows
)f
for each contiguous window of size N
over
+self
and returns an iterator over the outputs of f
. Like slice::windows()
,
+the windows during mapping overlap as well. Read moreiter_collect_into
)iter_partition_in_place
)true
precede all those that return false
.
+Returns the number of true
elements found. Read moreiter_is_partitioned
)true
precede all those that return false
. Read moreiterator_try_reduce
)try_find
)iter_array_chunks
)N
elements of the iterator at a time. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read morePartialOrd
elements of
+this Iterator
with those of another. The comparison works like short-circuit
+evaluation, returning a result without comparing the remaining elements.
+As soon as an order can be determined, the evaluation stops and a result is returned. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read moreiter_order_by
)Iterator
are lexicographically
+less than those of another. Read moreIterator
are lexicographically
+less or equal to those of another. Read moreIterator
are lexicographically
+greater than those of another. Read moreIterator
are lexicographically
+greater than or equal to those of another. Read moreis_sorted
)is_sorted
)pub struct Three { /* private fields */ }
Finds all occurrences of three bytes in a haystack.
+That is, this reports matches of one of three possible bytes. For example,
+searching for a
, b
or o
in afoobar
would report matches at offsets
+0
, 2
, 3
, 4
and 5
.
Create a new searcher that finds occurrences of the three needle bytes +given.
+Return the first occurrence of one of the needle bytes in the given
+haystack. If no such occurrence exists, then None
is returned.
The occurrence is reported as an offset into haystack
. Its maximum
+value for a non-empty haystack is haystack.len() - 1
.
Return the last occurrence of one of the needle bytes in the given
+haystack. If no such occurrence exists, then None
is returned.
The occurrence is reported as an offset into haystack
. Its maximum
+value for a non-empty haystack is haystack.len() - 1
.
Like find
, but accepts and returns raw pointers.
When a match is found, the pointer returned is guaranteed to be
+>= start
and < end
.
This routine is useful if you’re already using raw pointers and would +like to avoid converting back to a slice before executing a search.
+start
and end
must be valid for reads.start
and end
must point to an initialized value.start
and end
must point to the same allocated object and
+must either be in bounds or at most one byte past the end of the
+allocated object.start
and end
must be derived from a pointer to the same
+object.start
and end
must not overflow isize
.Note that callers may pass a pair of pointers such that start >= end
.
+In that case, None
will always be returned.
Like rfind
, but accepts and returns raw pointers.
When a match is found, the pointer returned is guaranteed to be
+>= start
and < end
.
This routine is useful if you’re already using raw pointers and would +like to avoid converting back to a slice before executing a search.
+start
and end
must be valid for reads.start
and end
must point to an initialized value.start
and end
must point to the same allocated object and
+must either be in bounds or at most one byte past the end of the
+allocated object.start
and end
must be derived from a pointer to the same
+object.start
and end
must not overflow isize
.Note that callers may pass a pair of pointers such that start >= end
.
+In that case, None
will always be returned.
pub struct ThreeIter<'a, 'h> { /* private fields */ }
An iterator over all occurrences of three possible bytes in a haystack.
+This iterator implements DoubleEndedIterator
, which means it can also be
+used to find occurrences in reverse order.
This iterator is created by the Three::iter
method.
The lifetime parameters are as follows:
+'a
refers to the lifetime of the underlying Three
searcher.'h
refers to the lifetime of the haystack being searched.iter_advance_by
)n
elements. Read moren
th element from the end of the iterator. Read moreIterator::try_fold()
: it takes
+elements starting from the back of the iterator. Read moreiter_next_chunk
)N
values. Read moreiter_advance_by
)n
elements. Read moren
th element of the iterator. Read moreiter_intersperse
)separator
+between adjacent items of the original iterator. Read moren
elements. Read moren
elements, or fewer
+if the underlying iterator ends sooner. Read moreiter_map_windows
)f
for each contiguous window of size N
over
+self
and returns an iterator over the outputs of f
. Like slice::windows()
,
+the windows during mapping overlap as well. Read moreiter_collect_into
)iter_partition_in_place
)true
precede all those that return false
.
+Returns the number of true
elements found. Read moreiter_is_partitioned
)true
precede all those that return false
. Read moreiterator_try_reduce
)try_find
)iter_array_chunks
)N
elements of the iterator at a time. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read morePartialOrd
elements of
+this Iterator
with those of another. The comparison works like short-circuit
+evaluation, returning a result without comparing the remaining elements.
+As soon as an order can be determined, the evaluation stops and a result is returned. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read moreiter_order_by
)Iterator
are lexicographically
+less than those of another. Read moreIterator
are lexicographically
+less or equal to those of another. Read moreIterator
are lexicographically
+greater than those of another. Read moreIterator
are lexicographically
+greater than or equal to those of another. Read moreis_sorted
)is_sorted
)pub struct Two { /* private fields */ }
Finds all occurrences of two bytes in a haystack.
+That is, this reports matches of one of two possible bytes. For example,
+searching for a
or b
in afoobar
would report matches at offsets 0
,
+4
and 5
.
Create a new searcher that finds occurrences of the two needle bytes +given.
+Return the first occurrence of one of the needle bytes in the given
+haystack. If no such occurrence exists, then None
is returned.
The occurrence is reported as an offset into haystack
. Its maximum
+value for a non-empty haystack is haystack.len() - 1
.
Return the last occurrence of one of the needle bytes in the given
+haystack. If no such occurrence exists, then None
is returned.
The occurrence is reported as an offset into haystack
. Its maximum
+value for a non-empty haystack is haystack.len() - 1
.
Like find
, but accepts and returns raw pointers.
When a match is found, the pointer returned is guaranteed to be
+>= start
and < end
.
This routine is useful if you’re already using raw pointers and would +like to avoid converting back to a slice before executing a search.
+start
and end
must be valid for reads.start
and end
must point to an initialized value.start
and end
must point to the same allocated object and
+must either be in bounds or at most one byte past the end of the
+allocated object.start
and end
must be derived from a pointer to the same
+object.start
and end
must not overflow isize
.Note that callers may pass a pair of pointers such that start >= end
.
+In that case, None
will always be returned.
Like rfind
, but accepts and returns raw pointers.
When a match is found, the pointer returned is guaranteed to be
+>= start
and < end
.
This routine is useful if you’re already using raw pointers and would +like to avoid converting back to a slice before executing a search.
+start
and end
must be valid for reads.start
and end
must point to an initialized value.start
and end
must point to the same allocated object and
+must either be in bounds or at most one byte past the end of the
+allocated object.start
and end
must be derived from a pointer to the same
+object.start
and end
must not overflow isize
.Note that callers may pass a pair of pointers such that start >= end
.
+In that case, None
will always be returned.
pub struct TwoIter<'a, 'h> { /* private fields */ }
An iterator over all occurrences of two possible bytes in a haystack.
+This iterator implements DoubleEndedIterator
, which means it can also be
+used to find occurrences in reverse order.
This iterator is created by the Two::iter
method.
The lifetime parameters are as follows:
+'a
refers to the lifetime of the underlying Two
searcher.'h
refers to the lifetime of the haystack being searched.iter_advance_by
)n
elements. Read moren
th element from the end of the iterator. Read moreIterator::try_fold()
: it takes
+elements starting from the back of the iterator. Read moreiter_next_chunk
)N
values. Read moreiter_advance_by
)n
elements. Read moren
th element of the iterator. Read moreiter_intersperse
)separator
+between adjacent items of the original iterator. Read moren
elements. Read moren
elements, or fewer
+if the underlying iterator ends sooner. Read moreiter_map_windows
)f
for each contiguous window of size N
over
+self
and returns an iterator over the outputs of f
. Like slice::windows()
,
+the windows during mapping overlap as well. Read moreiter_collect_into
)iter_partition_in_place
)true
precede all those that return false
.
+Returns the number of true
elements found. Read moreiter_is_partitioned
)true
precede all those that return false
. Read moreiterator_try_reduce
)try_find
)iter_array_chunks
)N
elements of the iterator at a time. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read morePartialOrd
elements of
+this Iterator
with those of another. The comparison works like short-circuit
+evaluation, returning a result without comparing the remaining elements.
+As soon as an order can be determined, the evaluation stops and a result is returned. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read moreiter_order_by
)Iterator
are lexicographically
+less than those of another. Read moreIterator
are lexicographically
+less or equal to those of another. Read moreIterator
are lexicographically
+greater than those of another. Read moreIterator
are lexicographically
+greater than or equal to those of another. Read moreis_sorted
)is_sorted
)Provides an architecture independent implementation of the “packed pair” +algorithm.
+The “packed pair” algorithm is based on the generic SIMD algorithm. The main +difference is that it (by default) uses a background distribution of byte +frequencies to heuristically select the pair of bytes to search for. Note that +this module provides an architecture independent version that doesn’t do as +good of a job keeping the search for candidates inside a SIMD hot path. It +however can be good enough in many circumstances.
+pub struct Finder { /* private fields */ }
An architecture independent “packed pair” finder.
+This finder picks two bytes that it believes have high predictive power for +indicating an overall match of a needle. At search time, it reports offsets +where the needle could match based on whether the pair of bytes it chose +match.
+This is architecture independent because it utilizes memchr
to find the
+occurrence of one of the bytes in the pair, and then checks whether the
+second byte matches. If it does, in the case of Finder::find_prefilter
,
+the location at which the needle could match is returned.
It is generally preferred to use architecture specific routines for a +“packed pair” prefilter, but this can be a useful fallback when the +architecture specific routines are unavailable.
+Create a new prefilter that reports possible locations where the given +needle matches.
+Create a new prefilter using the pair given.
+If the prefilter could not be constructed, then None
is returned.
This constructor permits callers to control precisely which pair of +bytes is used as a predicate.
+Run this finder on the given haystack as a prefilter.
+If a candidate match is found, then an offset where the needle could +begin in the haystack is returned.
+pub struct Pair { /* private fields */ }
A pair of byte offsets into a needle to use as a predicate.
+This pair is used as a predicate to quickly filter out positions in a +haystack in which a needle cannot match. In some cases, this pair can even +be used in vector algorithms such that the vector algorithm only switches +over to scalar code once this pair has been found.
+A pair of offsets can be used in both substring search implementations and +in prefilters. The former will report matches of a needle in a haystack +whereas the latter will only report possible matches of a needle.
+The offsets are limited each to a maximum of 255 to keep memory usage low. +Moreover, it’s rarely advantageous to create a predicate using offsets +greater than 255 anyway.
+The only guarantee enforced on the pair of offsets is that they are not
+equivalent. It is not necessarily the case that index1 < index2
for
+example. By convention, index1
corresponds to the byte in the needle
+that is believed to be the most predictive. Note also that because of the
+requirement that the indices be both valid for the needle used to build
+the pair and not equal, it follows that a pair can only be constructed for
+needles with length at least 2.
Create a new pair of offsets from the given needle.
+If a pair could not be created (for example, if the needle is too
+short), then None
is returned.
This chooses the pair in the needle that is believed to be as +predictive of an overall match of the needle as possible.
+Create a new pair of offsets from the given needle and ranker.
+This permits the caller to choose a background frequency distribution +with which bytes are selected. The idea is to select a pair of bytes +that is believed to strongly predict a match in the haystack. This +usually means selecting bytes that occur rarely in a haystack.
+If a pair could not be created (for example, if the needle is too
+short), then None
is returned.
Create a new pair using the offsets given for the needle given.
+This bypasses any sort of heuristic process for choosing the offsets +and permits the caller to choose the offsets themselves.
+Indices are limited to valid u8
values so that a Pair
uses less
+memory. It is not possible to create a Pair
with offsets bigger than
+u8::MAX
. It’s likely that such a thing is not needed, but if it is,
+it’s suggested to build your own bespoke algorithm because you’re
+likely working on a very niche case. (File an issue if this suggestion
+does not make sense to you.)
If a pair could not be created (for example, if the needle is too
+short), then None
is returned.
pub trait HeuristicFrequencyRank {
+ // Required method
+ fn rank(&self, byte: u8) -> u8;
+}
This trait allows the user to customize the heuristic used to determine the +relative frequency of a given byte in the dataset being searched.
+The use of this trait can have a dramatic impact on performance depending
+on the type of data being searched. The details of why are explained in the
+docs of crate::memmem::Prefilter
. To summarize, the core algorithm uses
+a prefilter to quickly identify candidate matches that are later verified
+more slowly. This prefilter is implemented in terms of trying to find
+rare
bytes at specific offsets that will occur less frequently in the
+dataset. While the concept of a rare
byte is similar for most datasets,
+there are some specific datasets (like binary executables) that have
+dramatically different byte distributions. For these datasets customizing
+the byte frequency heuristic can have a massive impact on performance, and
+might even need to be done at runtime.
The default implementation of HeuristicFrequencyRank
reads from the
+static frequency table defined in src/memmem/byte_frequencies.rs
. This
+is optimal for most inputs, so if you are unsure of the impact of using a
+custom HeuristicFrequencyRank
you should probably just use the default.
use memchr::{
+ arch::all::packedpair::HeuristicFrequencyRank,
+ memmem::FinderBuilder,
+};
+
+/// A byte-frequency table that is good for scanning binary executables.
+struct Binary;
+
+impl HeuristicFrequencyRank for Binary {
+ fn rank(&self, byte: u8) -> u8 {
+ const TABLE: [u8; 256] = [
+ 255, 128, 61, 43, 50, 41, 27, 28, 57, 15, 21, 13, 24, 17, 17,
+ 89, 58, 16, 11, 7, 14, 23, 7, 6, 24, 9, 6, 5, 9, 4, 7, 16,
+ 68, 11, 9, 6, 88, 7, 4, 4, 23, 9, 4, 8, 8, 5, 10, 4, 30, 11,
+ 9, 24, 11, 5, 5, 5, 19, 11, 6, 17, 9, 9, 6, 8,
+ 48, 58, 11, 14, 53, 40, 9, 9, 254, 35, 3, 6, 52, 23, 6, 6, 27,
+ 4, 7, 11, 14, 13, 10, 11, 11, 5, 2, 10, 16, 12, 6, 19,
+ 19, 20, 5, 14, 16, 31, 19, 7, 14, 20, 4, 4, 19, 8, 18, 20, 24,
+ 1, 25, 19, 58, 29, 10, 5, 15, 20, 2, 2, 9, 4, 3, 5,
+ 51, 11, 4, 53, 23, 39, 6, 4, 13, 81, 4, 186, 5, 67, 3, 2, 15,
+ 0, 0, 1, 3, 2, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0,
+ 12, 2, 1, 1, 3, 1, 1, 1, 6, 1, 2, 1, 3, 1, 1, 2, 9, 1, 1, 0,
+ 2, 2, 4, 4, 11, 6, 7, 3, 6, 9, 4, 5,
+ 46, 18, 8, 18, 17, 3, 8, 20, 16, 10, 3, 7, 175, 4, 6, 7, 13,
+ 3, 7, 3, 3, 1, 3, 3, 10, 3, 1, 5, 2, 0, 1, 2,
+ 16, 3, 5, 1, 6, 1, 1, 2, 58, 20, 3, 14, 12, 2, 1, 3, 16, 3, 5,
+ 8, 3, 1, 8, 6, 17, 6, 5, 3, 8, 6, 13, 175,
+ ];
+ TABLE[byte as usize]
+ }
+}
+// Create a new finder with the custom heuristic.
+let finder = FinderBuilder::new()
+ .build_forward_with_ranker(Binary, b"\x00\x00\xdd\xdd");
+// Find needle with custom heuristic.
+assert!(finder.find(b"\x00\x00\x00\xdd\xdd").is_some());
Return the heuristic frequency rank of the given byte. A lower rank +means the byte is believed to occur less frequently in the haystack.
+Some uses of this heuristic may treat arbitrary absolute rank values as +significant. For example, an implementation detail in this crate may +determine that heuristic prefilters are inappropriate if every byte in +the needle has a “high” rank.
+This permits passing any implementation of HeuristicFrequencyRank
as a
+borrowed version of itself.
An implementation of the Rabin-Karp substring search algorithm.
+Rabin-Karp works by creating a hash of the needle provided and then computing
+a rolling hash for each needle sized window in the haystack. When the rolling
+hash matches the hash of the needle, a byte-wise comparison is done to check
+if a match exists. The worst case time complexity of Rabin-Karp is O(m * n)
where m ~ len(needle)
and n ~ len(haystack)
. Its worst case space
+complexity is constant.
The main utility of Rabin-Karp is that the searcher can be constructed very +quickly with very little memory. This makes it especially useful when searching +for small needles in small haystacks, as it might finish its search before a +beefier algorithm (like Two-Way) even starts.
+pub struct Finder { /* private fields */ }
A forward substring searcher using the Rabin-Karp algorithm.
+Note that, as a lower level API, a Finder
does not have access to the
+needle it was constructed with. For this reason, executing a search
+with a Finder
requires passing both the needle and the haystack,
+where the needle is exactly equivalent to the one given to the Finder
+at construction time. This design was chosen so that callers can have
+more precise control over where and how many times a needle is stored.
+For example, in cases where Rabin-Karp is just one of several possible
+substring search algorithms.
Create a new Rabin-Karp forward searcher for the given needle
.
The needle may be empty. The empty needle matches at every byte offset.
+Note that callers must pass the same needle to all search calls using
+this Finder
.
Return the first occurrence of the needle
in the haystack
+given. If no such occurrence exists, then None
is returned.
The needle
provided must match the needle given to this finder at
+construction time.
The maximum value this can return is haystack.len()
, which can only
+occur when the needle and haystack both have length zero. Otherwise,
+for non-empty haystacks, the maximum value is haystack.len() - 1
.
Like find
, but accepts and returns raw pointers.
When a match is found, the pointer returned is guaranteed to be
+>= start
and <= end
. The pointer returned is only ever equivalent
+to end
when both the needle and haystack are empty. (That is, the
+empty string matches the empty string.)
This routine is useful if you’re already using raw pointers and would +like to avoid converting back to a slice before executing a search.
+Note that start
and end
below refer to both pairs of pointers given
+to this routine. That is, the conditions apply to both hstart
/hend
+and nstart
/nend
.
start
and end
must be valid for reads.start
and end
must point to an initialized value.start
and end
must point to the same allocated object and
+must either be in bounds or at most one byte past the end of the
+allocated object.start
and end
must be derived from a pointer to the same
+object.start
and end
must not overflow isize
.start <= end
.pub struct FinderRev(/* private fields */);
A reverse substring searcher using the Rabin-Karp algorithm.
+Create a new Rabin-Karp reverse searcher for the given needle
.
Return the last occurrence of the needle
in the haystack
+given. If no such occurrence exists, then None
is returned.
The needle
provided must match the needle given to this finder at
+construction time.
The maximum value this can return is haystack.len()
, which can only
+occur when the needle and haystack both have length zero. Otherwise,
+for non-empty haystacks, the maximum value is haystack.len() - 1
.
Like rfind
, but accepts and returns raw pointers.
When a match is found, the pointer returned is guaranteed to be
+>= start
and <= end
. The pointer returned is only ever equivalent
+to end
when both the needle and haystack are empty. (That is, the
+empty string matches the empty string.)
This routine is useful if you’re already using raw pointers and would +like to avoid converting back to a slice before executing a search.
+Note that start
and end
below refer to both pairs of pointers given
+to this routine. That is, the conditions apply to both hstart
/hend
+and nstart
/nend
.
start
and end
must be valid for reads.start
and end
must point to an initialized value.start
and end
must point to the same allocated object and
+must either be in bounds or at most one byte past the end of the
+allocated object.start
and end
must be derived from a pointer to the same
+object.start
and end
must not overflow isize
.start <= end
.pub struct Finder { /* private fields */ }
A forward substring searcher using the Shift-Or algorithm.
+Create a new Shift-Or forward searcher for the given needle
.
The needle may be empty. The empty needle matches at every byte offset.
+Return the first occurrence of the needle given to Finder::new
in
+the haystack
given. If no such occurrence exists, then None
is
+returned.
Unlike most other substring search implementations in this crate, this +finder does not require passing the needle at search time. A match can +be determined without the needle at all since the required information +is already encoded into this finder at construction time.
+The maximum value this can return is haystack.len()
, which can only
+occur when the needle and haystack both have length zero. Otherwise,
+for non-empty haystacks, the maximum value is haystack.len() - 1
.
An implementation of the Two-Way substring search algorithm.
+Finder
can be built for forward searches, while FinderRev
can be built
+for reverse searches.
Two-Way makes for a nice general purpose substring search algorithm because of
+its time and space complexity properties. It also performs well in practice.
+Namely, with m = len(needle)
and n = len(haystack)
, Two-Way takes O(m)
+time to create a finder, O(1)
space and O(n)
search time. In other words,
+the preprocessing step is quick, doesn’t require any heap memory and the worst
+case search time is guaranteed to be linear in the haystack regardless of the
+size of the needle.
While vector algorithms will usually beat Two-Way handily, vector algorithms +also usually have pathological or edge cases that are better handled by Two-Way. +Moreover, not all targets support vector algorithms or implementations for them +simply may not exist yet.
+Two-Way can be found in the memmem
implementations in at least GNU libc and
+musl.
pub struct Finder(/* private fields */);
A forward substring searcher that uses the Two-Way algorithm.
+Create a searcher that finds occurrences of the given needle
.
An empty needle
results in a match at every position in a haystack,
+including at haystack.len()
.
Returns the first occurrence of needle
in the given haystack
, or
+None
if no such occurrence could be found.
The needle
given must be the same as the needle
provided to
+Finder::new
.
An empty needle
results in a match at every position in a haystack,
+including at haystack.len()
.
pub struct FinderRev(/* private fields */);
A reverse substring searcher that uses the Two-Way algorithm.
+Create a searcher that finds occurrences of the given needle
.
An empty needle
results in a match at every position in a haystack,
+including at haystack.len()
.
Returns the last occurrence of needle
in the given haystack
, or
+None
if no such occurrence could be found.
The needle
given must be the same as the needle
provided to
+FinderRev::new
.
An empty needle
results in a match at every position in a haystack,
+including at haystack.len()
.
This module defines 256-bit vector implementations of memchr
and friends.
The main types in this module are One
, Two
and Three
. They are for
+searching for one, two or three distinct bytes, respectively, in a haystack.
+Each type also has corresponding double ended iterators. These searchers are
+typically much faster than scalar routines accomplishing the same task.
The One
searcher also provides a One::count
routine for efficiently
+counting the number of times a single byte occurs in a haystack. This is
+useful, for example, for counting the number of lines in a haystack. This
+routine exists because it is usually faster, especially with a high match
+count, than using One::find
repeatedly. (OneIter
specializes its
+Iterator::count
implementation to use this routine.)
Only one, two and three bytes are supported because three bytes is about
+the point where one sees diminishing returns. Beyond this point, it’s
+probably (but not necessarily) better to just use a simple [bool; 256]
array
+or similar. However, it depends mightily on the specific work-load and the
+expected match frequency.
pub struct One { /* private fields */ }
Finds all occurrences of a single byte in a haystack.
+Create a new searcher that finds occurrences of the needle byte given.
+This particular searcher is specialized to use AVX2 vector instructions +that typically make it quite fast. (SSE2 is used for haystacks that +are too short to accommodate an AVX2 vector.)
+If either SSE2 or AVX2 is unavailable in the current environment, then
+None
is returned.
sse2
and avx2
only.Create a new finder specific to AVX2 vectors and routines without +checking that either SSE2 or AVX2 is available.
+Callers must guarantee that it is safe to execute both sse2
and
+avx2
instructions in the current environment.
Note that it is a common misconception that if one compiles for an
+x86_64
target, then they therefore automatically have access to SSE2
+instructions. While this is almost always the case, it isn’t true in
+100% of cases.
Returns true when this implementation is available in the current +environment.
+When this is true, it is guaranteed that One::new
will return
+a Some
value. Similarly, when it is false, it is guaranteed that
+One::new
will return a None
value.
Note also that for the lifetime of a single program, if this returns +true then it will always return true.
+Return the first occurrence of one of the needle bytes in the given
+haystack. If no such occurrence exists, then None
is returned.
The occurrence is reported as an offset into haystack
. Its maximum
+value is haystack.len() - 1
.
Return the last occurrence of one of the needle bytes in the given
+haystack. If no such occurrence exists, then None
is returned.
The occurrence is reported as an offset into haystack
. Its maximum
+value is haystack.len() - 1
.
Counts all occurrences of this byte in the given haystack.
+Like find
, but accepts and returns raw pointers.
When a match is found, the pointer returned is guaranteed to be
+>= start
and < end
.
This routine is useful if you’re already using raw pointers and would +like to avoid converting back to a slice before executing a search.
+start
and end
must be valid for reads.start
and end
must point to an initialized value.start
and end
must point to the same allocated object and
+must either be in bounds or at most one byte past the end of the
+allocated object.start
and end
must be derived from a pointer to the same
+object.start
and end
must not overflow isize
.Note that callers may pass a pair of pointers such that start >= end
.
+In that case, None
will always be returned.
Like rfind
, but accepts and returns raw pointers.
When a match is found, the pointer returned is guaranteed to be
+>= start
and < end
.
This routine is useful if you’re already using raw pointers and would +like to avoid converting back to a slice before executing a search.
+start
and end
must be valid for reads.start
and end
must point to an initialized value.start
and end
must point to the same allocated object and
+must either be in bounds or at most one byte past the end of the
+allocated object.start
and end
must be derived from a pointer to the same
+object.start
and end
must not overflow isize
.Note that callers may pass a pair of pointers such that start >= end
.
+In that case, None
will always be returned.
Counts all occurrences of this byte in the given haystack represented +by raw pointers.
+This routine is useful if you’re already using raw pointers and would +like to avoid converting back to a slice before executing a search.
+start
and end
must be valid for reads.start
and end
must point to an initialized value.start
and end
must point to the same allocated object and
+must either be in bounds or at most one byte past the end of the
+allocated object.start
and end
must be derived from a pointer to the same
+object.start
and end
must not overflow isize
.Note that callers may pass a pair of pointers such that start >= end
.
+In that case, 0
will always be returned.
pub struct OneIter<'a, 'h> { /* private fields */ }
An iterator over all occurrences of a single byte in a haystack.
+This iterator implements DoubleEndedIterator
, which means it can also be
+used to find occurrences in reverse order.
This iterator is created by the One::iter
method.
The lifetime parameters are as follows:
+'a
refers to the lifetime of the underlying One
searcher.'h
refers to the lifetime of the haystack being searched.iter_advance_by
)n
elements. Read moren
th element from the end of the iterator. Read moreIterator::try_fold()
: it takes
+elements starting from the back of the iterator. Read moreiter_next_chunk
)N
values. Read moreiter_advance_by
)n
elements. Read moren
th element of the iterator. Read moreiter_intersperse
)separator
+between adjacent items of the original iterator. Read moren
elements. Read moren
elements, or fewer
+if the underlying iterator ends sooner. Read moreiter_map_windows
)f
for each contiguous window of size N
over
+self
and returns an iterator over the outputs of f
. Like slice::windows()
,
+the windows during mapping overlap as well. Read moreiter_collect_into
)iter_partition_in_place
)true
precede all those that return false
.
+Returns the number of true
elements found. Read moreiter_is_partitioned
)true
precede all those that return false
. Read moreiterator_try_reduce
)try_find
)iter_array_chunks
)N
elements of the iterator at a time. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read morePartialOrd
elements of
+this Iterator
with those of another. The comparison works like short-circuit
+evaluation, returning a result without comparing the remaining elements.
+As soon as an order can be determined, the evaluation stops and a result is returned. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read moreiter_order_by
)Iterator
are lexicographically
+less than those of another. Read moreIterator
are lexicographically
+less or equal to those of another. Read moreIterator
are lexicographically
+greater than those of another. Read moreIterator
are lexicographically
+greater than or equal to those of another. Read moreis_sorted
)is_sorted
)pub struct Three { /* private fields */ }
Finds all occurrences of three bytes in a haystack.
+That is, this reports matches of one of three possible bytes. For example,
+searching for a
, b
or o
in afoobar
would report matches at offsets
+0
, 2
, 3
, 4
and 5
.
Create a new searcher that finds occurrences of the needle bytes given.
+This particular searcher is specialized to use AVX2 vector instructions +that typically make it quite fast. (SSE2 is used for haystacks that +are too short to accommodate an AVX2 vector.)
+If either SSE2 or AVX2 is unavailable in the current environment, then
+None
is returned.
sse2
and avx2
only.Create a new finder specific to AVX2 vectors and routines without +checking that either SSE2 or AVX2 is available.
+Callers must guarantee that it is safe to execute both sse2
and
+avx2
instructions in the current environment.
Note that it is a common misconception that if one compiles for an
+x86_64
target, then they therefore automatically have access to SSE2
+instructions. While this is almost always the case, it isn’t true in
+100% of cases.
Returns true when this implementation is available in the current +environment.
+When this is true, it is guaranteed that Three::new
will return
+a Some
value. Similarly, when it is false, it is guaranteed that
+Three::new
will return a None
value.
Note also that for the lifetime of a single program, if this returns +true then it will always return true.
+Return the first occurrence of one of the needle bytes in the given
+haystack. If no such occurrence exists, then None
is returned.
The occurrence is reported as an offset into haystack
. Its maximum
+value is haystack.len() - 1
.
Return the last occurrence of one of the needle bytes in the given
+haystack. If no such occurrence exists, then None
is returned.
The occurrence is reported as an offset into haystack
. Its maximum
+value is haystack.len() - 1
.
Like find
, but accepts and returns raw pointers.
When a match is found, the pointer returned is guaranteed to be
+>= start
and < end
.
This routine is useful if you’re already using raw pointers and would +like to avoid converting back to a slice before executing a search.
+start and end must be valid for reads.
start and end must point to an initialized value.
start and end must point to the same allocated object and
+must either be in bounds or at most one byte past the end of the
+allocated object.
start and end must be derived from a pointer to the same
+object.
start and end must not overflow isize.
Note that callers may pass a pair of pointers such that start >= end.
+In that case, None will always be returned.
Like rfind
, but accepts and returns raw pointers.
When a match is found, the pointer returned is guaranteed to be
+>= start
and < end
.
This routine is useful if you’re already using raw pointers and would +like to avoid converting back to a slice before executing a search.
+start and end must be valid for reads.
start and end must point to an initialized value.
start and end must point to the same allocated object and
+must either be in bounds or at most one byte past the end of the
+allocated object.
start and end must be derived from a pointer to the same
+object.
start and end must not overflow isize.
Note that callers may pass a pair of pointers such that start >= end.
+In that case, None will always be returned.
pub struct ThreeIter<'a, 'h> { /* private fields */ }
An iterator over all occurrences of three possible bytes in a haystack.
+This iterator implements DoubleEndedIterator
, which means it can also be
+used to find occurrences in reverse order.
This iterator is created by the Three::iter
method.
The lifetime parameters are as follows:
+'a
refers to the lifetime of the underlying Three
searcher.'h
refers to the lifetime of the haystack being searched.iter_advance_by
)n
elements. Read moren
th element from the end of the iterator. Read moreIterator::try_fold()
: it takes
+elements starting from the back of the iterator. Read moreiter_next_chunk
)N
values. Read moreiter_advance_by
)n
elements. Read moren
th element of the iterator. Read moreiter_intersperse
)separator
+between adjacent items of the original iterator. Read moren
elements. Read moren
elements, or fewer
+if the underlying iterator ends sooner. Read moreiter_map_windows
)f
for each contiguous window of size N
over
+self
and returns an iterator over the outputs of f
. Like slice::windows()
,
+the windows during mapping overlap as well. Read moreiter_collect_into
)iter_partition_in_place
)true
precede all those that return false
.
+Returns the number of true
elements found. Read moreiter_is_partitioned
)true
precede all those that return false
. Read moreiterator_try_reduce
)try_find
)iter_array_chunks
)N
elements of the iterator at a time. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read morePartialOrd
elements of
+this Iterator
with those of another. The comparison works like short-circuit
+evaluation, returning a result without comparing the remaining elements.
+As soon as an order can be determined, the evaluation stops and a result is returned. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read moreiter_order_by
)Iterator
are lexicographically
+less than those of another. Read moreIterator
are lexicographically
+less or equal to those of another. Read moreIterator
are lexicographically
+greater than those of another. Read moreIterator
are lexicographically
+greater than or equal to those of another. Read moreis_sorted
)is_sorted
)pub struct Two { /* private fields */ }
Finds all occurrences of two bytes in a haystack.
+That is, this reports matches of one of two possible bytes. For example,
+searching for a
or b
in afoobar
would report matches at offsets 0
,
+4
and 5
.
Create a new searcher that finds occurrences of the needle bytes given.
+This particular searcher is specialized to use AVX2 vector instructions +that typically make it quite fast. (SSE2 is used for haystacks that +are too short to accommodate an AVX2 vector.)
+If either SSE2 or AVX2 is unavailable in the current environment, then
+None
is returned.
sse2
and avx2
only.Create a new finder specific to AVX2 vectors and routines without +checking that either SSE2 or AVX2 is available.
+Callers must guarantee that it is safe to execute both sse2
and
+avx2
instructions in the current environment.
Note that it is a common misconception that if one compiles for an
+x86_64
target, then they therefore automatically have access to SSE2
+instructions. While this is almost always the case, it isn’t true in
+100% of cases.
Returns true when this implementation is available in the current +environment.
+When this is true, it is guaranteed that Two::new
will return
+a Some
value. Similarly, when it is false, it is guaranteed that
+Two::new
will return a None
value.
Note also that for the lifetime of a single program, if this returns +true then it will always return true.
+Return the first occurrence of one of the needle bytes in the given
+haystack. If no such occurrence exists, then None
is returned.
The occurrence is reported as an offset into haystack
. Its maximum
+value is haystack.len() - 1
.
Return the last occurrence of one of the needle bytes in the given
+haystack. If no such occurrence exists, then None
is returned.
The occurrence is reported as an offset into haystack
. Its maximum
+value is haystack.len() - 1
.
Like find
, but accepts and returns raw pointers.
When a match is found, the pointer returned is guaranteed to be
+>= start
and < end
.
This routine is useful if you’re already using raw pointers and would +like to avoid converting back to a slice before executing a search.
+start and end must be valid for reads.
start and end must point to an initialized value.
start and end must point to the same allocated object and
+must either be in bounds or at most one byte past the end of the
+allocated object.
start and end must be derived from a pointer to the same
+object.
start and end must not overflow isize.
Note that callers may pass a pair of pointers such that start >= end.
+In that case, None will always be returned.
Like rfind
, but accepts and returns raw pointers.
When a match is found, the pointer returned is guaranteed to be
+>= start
and < end
.
This routine is useful if you’re already using raw pointers and would +like to avoid converting back to a slice before executing a search.
+start and end must be valid for reads.
start and end must point to an initialized value.
start and end must point to the same allocated object and
+must either be in bounds or at most one byte past the end of the
+allocated object.
start and end must be derived from a pointer to the same
+object.
start and end must not overflow isize.
Note that callers may pass a pair of pointers such that start >= end.
+In that case, None will always be returned.
pub struct TwoIter<'a, 'h> { /* private fields */ }
An iterator over all occurrences of two possible bytes in a haystack.
+This iterator implements DoubleEndedIterator
, which means it can also be
+used to find occurrences in reverse order.
This iterator is created by the Two::iter
method.
The lifetime parameters are as follows:
+'a
refers to the lifetime of the underlying Two
searcher.'h
refers to the lifetime of the haystack being searched.iter_advance_by
)n
elements. Read moren
th element from the end of the iterator. Read moreIterator::try_fold()
: it takes
+elements starting from the back of the iterator. Read moreiter_next_chunk
)N
values. Read moreiter_advance_by
)n
elements. Read moren
th element of the iterator. Read moreiter_intersperse
)separator
+between adjacent items of the original iterator. Read moren
elements. Read moren
elements, or fewer
+if the underlying iterator ends sooner. Read moreiter_map_windows
)f
for each contiguous window of size N
over
+self
and returns an iterator over the outputs of f
. Like slice::windows()
,
+the windows during mapping overlap as well. Read moreiter_collect_into
)iter_partition_in_place
)true
precede all those that return false
.
+Returns the number of true
elements found. Read moreiter_is_partitioned
)true
precede all those that return false
. Read moreiterator_try_reduce
)try_find
)iter_array_chunks
)N
elements of the iterator at a time. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read morePartialOrd
elements of
+this Iterator
with those of another. The comparison works like short-circuit
+evaluation, returning a result without comparing the remaining elements.
+As soon as an order can be determined, the evaluation stops and a result is returned. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read moreiter_order_by
)Iterator
are lexicographically
+less than those of another. Read moreIterator
are lexicographically
+less or equal to those of another. Read moreIterator
are lexicographically
+greater than those of another. Read moreIterator
are lexicographically
+greater than or equal to those of another. Read moreis_sorted
)is_sorted
)A 256-bit vector implementation of the “packed pair” SIMD algorithm.
+The “packed pair” algorithm is based on the generic SIMD algorithm. The main +difference is that it (by default) uses a background distribution of byte +frequencies to heuristically select the pair of bytes to search for.
+pub struct Finder { /* private fields */ }
A “packed pair” finder that uses 256-bit vector operations.
+This finder picks two bytes that it believes have high predictive power
+for indicating an overall match of a needle. Depending on whether
+Finder::find
or Finder::find_prefilter
is used, it reports offsets
+where the needle matches or could match. In the prefilter case, candidates
+are reported whenever the Pair
of bytes given matches.
Create a new pair searcher. The searcher returned can either report
+exact matches of needle
or act as a prefilter and report candidate
+positions of needle
.
If AVX2 is unavailable in the current environment or if a Pair
+could not be constructed from the needle given, then None
is
+returned.
Create a new “packed pair” finder using the pair of bytes given.
+This constructor permits callers to control precisely which pair of +bytes is used as a predicate.
+If AVX2 is unavailable in the current environment, then None
is
+returned.
Returns true when this implementation is available in the current +environment.
+When this is true, it is guaranteed that Finder::with_pair
will
+return a Some
value. Similarly, when it is false, it is guaranteed
+that Finder::with_pair
will return a None
value. Notice that this
+does not guarantee that Finder::new
will return a Finder
. Namely,
+even when Finder::is_available
is true, it is not guaranteed that a
+valid Pair
can be found from the needle given.
Note also that for the lifetime of a single program, if this returns +true then it will always return true.
+Execute a search using AVX2 vectors and routines.
+Panics when haystack.len()
is less than Finder::min_haystack_len
.
Run this finder on the given haystack as a prefilter.
+If a candidate match is found, then an offset where the needle could +begin in the haystack is returned.
+Panics when haystack.len()
is less than Finder::min_haystack_len
.
Returns the pair of offsets (into the needle) used to check as a +predicate before confirming whether a needle exists at a particular +position.
+Returns the minimum haystack length that this Finder
can search.
Using a haystack with length smaller than this in a search will result +in a panic. The reason for this restriction is that this finder is +meant to be a low-level component that is part of a larger substring +strategy. In that sense, it avoids trying to handle all cases and +instead only handles the cases that it can handle very well.
+This module defines 128-bit vector implementations of memchr
and friends.
The main types in this module are One
, Two
and Three
. They are for
+searching for one, two or three distinct bytes, respectively, in a haystack.
+Each type also has corresponding double ended iterators. These searchers are
+typically much faster than scalar routines accomplishing the same task.
The One
searcher also provides a One::count
routine for efficiently
+counting the number of times a single byte occurs in a haystack. This is
+useful, for example, for counting the number of lines in a haystack. This
+routine exists because it is usually faster, especially with a high match
+count, than using One::find
repeatedly. (OneIter
specializes its
+Iterator::count
implementation to use this routine.)
Only one, two and three bytes are supported because three bytes is about
+the point where one sees diminishing returns. Beyond this point, it’s
+probably (but not necessarily) better to just use a simple [bool; 256]
array
+or similar. However, it depends mightily on the specific work-load and the
+expected match frequency.
pub struct One(/* private fields */);
Finds all occurrences of a single byte in a haystack.
+Create a new searcher that finds occurrences of the needle byte given.
+This particular searcher is specialized to use SSE2 vector instructions +that typically make it quite fast.
+If SSE2 is unavailable in the current environment, then None
is
+returned.
sse2
only.Create a new finder specific to SSE2 vectors and routines without +checking that SSE2 is available.
+Callers must guarantee that it is safe to execute sse2
instructions
+in the current environment.
Note that it is a common misconception that if one compiles for an
+x86_64
target, then they therefore automatically have access to SSE2
+instructions. While this is almost always the case, it isn’t true in
+100% of cases.
Returns true when this implementation is available in the current +environment.
+When this is true, it is guaranteed that One::new
will return
+a Some
value. Similarly, when it is false, it is guaranteed that
+One::new
will return a None
value.
Note also that for the lifetime of a single program, if this returns +true then it will always return true.
+Return the first occurrence of the needle byte in the given
+haystack. If no such occurrence exists, then None
is returned.
The occurrence is reported as an offset into haystack
. Its maximum
+value is haystack.len() - 1
.
Return the last occurrence of the needle byte in the given
+haystack. If no such occurrence exists, then None
is returned.
The occurrence is reported as an offset into haystack
. Its maximum
+value is haystack.len() - 1
.
Counts all occurrences of this byte in the given haystack.
+Like find
, but accepts and returns raw pointers.
When a match is found, the pointer returned is guaranteed to be
+>= start
and < end
.
This routine is useful if you’re already using raw pointers and would +like to avoid converting back to a slice before executing a search.
+start and end must be valid for reads.
start and end must point to an initialized value.
start and end must point to the same allocated object and
+must either be in bounds or at most one byte past the end of the
+allocated object.
start and end must be derived from a pointer to the same
+object.
start and end must not overflow isize.
Note that callers may pass a pair of pointers such that start >= end.
+In that case, None will always be returned.
Like rfind
, but accepts and returns raw pointers.
When a match is found, the pointer returned is guaranteed to be
+>= start
and < end
.
This routine is useful if you’re already using raw pointers and would +like to avoid converting back to a slice before executing a search.
+start and end must be valid for reads.
start and end must point to an initialized value.
start and end must point to the same allocated object and
+must either be in bounds or at most one byte past the end of the
+allocated object.
start and end must be derived from a pointer to the same
+object.
start and end must not overflow isize.
Note that callers may pass a pair of pointers such that start >= end.
+In that case, None will always be returned.
Counts all occurrences of this byte in the given haystack represented +by raw pointers.
+This routine is useful if you’re already using raw pointers and would +like to avoid converting back to a slice before executing a search.
+start and end must be valid for reads.
start and end must point to an initialized value.
start and end must point to the same allocated object and
+must either be in bounds or at most one byte past the end of the
+allocated object.
start and end must be derived from a pointer to the same
+object.
start and end must not overflow isize.
Note that callers may pass a pair of pointers such that start >= end.
+In that case, 0 will always be returned.
pub struct OneIter<'a, 'h> { /* private fields */ }
An iterator over all occurrences of a single byte in a haystack.
+This iterator implements DoubleEndedIterator
, which means it can also be
+used to find occurrences in reverse order.
This iterator is created by the One::iter
method.
The lifetime parameters are as follows:
+'a
refers to the lifetime of the underlying One
searcher.'h
refers to the lifetime of the haystack being searched.iter_advance_by
)n
elements. Read moren
th element from the end of the iterator. Read moreIterator::try_fold()
: it takes
+elements starting from the back of the iterator. Read moreiter_next_chunk
)N
values. Read moreiter_advance_by
)n
elements. Read moren
th element of the iterator. Read moreiter_intersperse
)separator
+between adjacent items of the original iterator. Read moren
elements. Read moren
elements, or fewer
+if the underlying iterator ends sooner. Read moreiter_map_windows
)f
for each contiguous window of size N
over
+self
and returns an iterator over the outputs of f
. Like slice::windows()
,
+the windows during mapping overlap as well. Read moreiter_collect_into
)iter_partition_in_place
)true
precede all those that return false
.
+Returns the number of true
elements found. Read moreiter_is_partitioned
)true
precede all those that return false
. Read moreiterator_try_reduce
)try_find
)iter_array_chunks
)N
elements of the iterator at a time. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read morePartialOrd
elements of
+this Iterator
with those of another. The comparison works like short-circuit
+evaluation, returning a result without comparing the remaining elements.
+As soon as an order can be determined, the evaluation stops and a result is returned. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read moreiter_order_by
)Iterator
are lexicographically
+less than those of another. Read moreIterator
are lexicographically
+less or equal to those of another. Read moreIterator
are lexicographically
+greater than those of another. Read moreIterator
are lexicographically
+greater than or equal to those of another. Read moreis_sorted
)is_sorted
)pub struct Three(/* private fields */);
Finds all occurrences of three bytes in a haystack.
+That is, this reports matches of one of three possible bytes. For example,
+searching for a
, b
or o
in afoobar
would report matches at offsets
+0
, 2
, 3
, 4
and 5
.
Create a new searcher that finds occurrences of the needle bytes given.
+This particular searcher is specialized to use SSE2 vector instructions +that typically make it quite fast.
+If SSE2 is unavailable in the current environment, then None
is
+returned.
sse2
only.Create a new finder specific to SSE2 vectors and routines without +checking that SSE2 is available.
+Callers must guarantee that it is safe to execute sse2
instructions
+in the current environment.
Note that it is a common misconception that if one compiles for an
+x86_64
target, then they therefore automatically have access to SSE2
+instructions. While this is almost always the case, it isn’t true in
+100% of cases.
Returns true when this implementation is available in the current +environment.
+When this is true, it is guaranteed that Three::new
will return
+a Some
value. Similarly, when it is false, it is guaranteed that
+Three::new
will return a None
value.
Note also that for the lifetime of a single program, if this returns +true then it will always return true.
+Return the first occurrence of one of the needle bytes in the given
+haystack. If no such occurrence exists, then None
is returned.
The occurrence is reported as an offset into haystack
. Its maximum
+value is haystack.len() - 1
.
Return the last occurrence of one of the needle bytes in the given
+haystack. If no such occurrence exists, then None
is returned.
The occurrence is reported as an offset into haystack
. Its maximum
+value is haystack.len() - 1
.
Like find
, but accepts and returns raw pointers.
When a match is found, the pointer returned is guaranteed to be
+>= start
and < end
.
This routine is useful if you’re already using raw pointers and would +like to avoid converting back to a slice before executing a search.
+start and end must be valid for reads.
start and end must point to an initialized value.
start and end must point to the same allocated object and
+must either be in bounds or at most one byte past the end of the
+allocated object.
start and end must be derived from a pointer to the same
+object.
start and end must not overflow isize.
Note that callers may pass a pair of pointers such that start >= end.
+In that case, None will always be returned.
Like rfind
, but accepts and returns raw pointers.
When a match is found, the pointer returned is guaranteed to be
+>= start
and < end
.
This routine is useful if you’re already using raw pointers and would +like to avoid converting back to a slice before executing a search.
+start and end must be valid for reads.
start and end must point to an initialized value.
start and end must point to the same allocated object and
+must either be in bounds or at most one byte past the end of the
+allocated object.
start and end must be derived from a pointer to the same
+object.
start and end must not overflow isize.
Note that callers may pass a pair of pointers such that start >= end.
+In that case, None will always be returned.
pub struct ThreeIter<'a, 'h> { /* private fields */ }
An iterator over all occurrences of three possible bytes in a haystack.
+This iterator implements DoubleEndedIterator
, which means it can also be
+used to find occurrences in reverse order.
This iterator is created by the Three::iter
method.
The lifetime parameters are as follows:
+'a
refers to the lifetime of the underlying Three
searcher.'h
refers to the lifetime of the haystack being searched.iter_advance_by
)n
elements. Read moren
th element from the end of the iterator. Read moreIterator::try_fold()
: it takes
+elements starting from the back of the iterator. Read moreiter_next_chunk
)N
values. Read moreiter_advance_by
)n
elements. Read moren
th element of the iterator. Read moreiter_intersperse
)separator
+between adjacent items of the original iterator. Read moren
elements. Read moren
elements, or fewer
+if the underlying iterator ends sooner. Read moreiter_map_windows
)f
for each contiguous window of size N
over
+self
and returns an iterator over the outputs of f
. Like slice::windows()
,
+the windows during mapping overlap as well. Read moreiter_collect_into
)iter_partition_in_place
)true
precede all those that return false
.
+Returns the number of true
elements found. Read moreiter_is_partitioned
)true
precede all those that return false
. Read moreiterator_try_reduce
)try_find
)iter_array_chunks
)N
elements of the iterator at a time. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read morePartialOrd
elements of
+this Iterator
with those of another. The comparison works like short-circuit
+evaluation, returning a result without comparing the remaining elements.
+As soon as an order can be determined, the evaluation stops and a result is returned. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read moreiter_order_by
)Iterator
are lexicographically
+less than those of another. Read moreIterator
are lexicographically
+less or equal to those of another. Read moreIterator
are lexicographically
+greater than those of another. Read moreIterator
are lexicographically
+greater than or equal to those of another. Read moreis_sorted
)is_sorted
)pub struct Two(/* private fields */);
Finds all occurrences of two bytes in a haystack.
+That is, this reports matches of one of two possible bytes. For example,
+searching for a
or b
in afoobar
would report matches at offsets 0
,
+4
and 5
.
Create a new searcher that finds occurrences of the needle bytes given.
+This particular searcher is specialized to use SSE2 vector instructions +that typically make it quite fast.
+If SSE2 is unavailable in the current environment, then None
is
+returned.
sse2
only.Create a new finder specific to SSE2 vectors and routines without +checking that SSE2 is available.
+Callers must guarantee that it is safe to execute sse2
instructions
+in the current environment.
Note that it is a common misconception that if one compiles for an
+x86_64
target, then they therefore automatically have access to SSE2
+instructions. While this is almost always the case, it isn’t true in
+100% of cases.
Returns true when this implementation is available in the current +environment.
+When this is true, it is guaranteed that Two::new
will return
+a Some
value. Similarly, when it is false, it is guaranteed that
+Two::new
will return a None
value.
Note also that for the lifetime of a single program, if this returns +true then it will always return true.
+Return the first occurrence of one of the needle bytes in the given
+haystack. If no such occurrence exists, then None
is returned.
The occurrence is reported as an offset into haystack
. Its maximum
+value is haystack.len() - 1
.
Return the last occurrence of one of the needle bytes in the given
+haystack. If no such occurrence exists, then None
is returned.
The occurrence is reported as an offset into haystack
. Its maximum
+value is haystack.len() - 1
.
Like find
, but accepts and returns raw pointers.
When a match is found, the pointer returned is guaranteed to be
+>= start
and < end
.
This routine is useful if you’re already using raw pointers and would +like to avoid converting back to a slice before executing a search.
+start and end must be valid for reads.
start and end must point to an initialized value.
start and end must point to the same allocated object and
+must either be in bounds or at most one byte past the end of the
+allocated object.
start and end must be derived from a pointer to the same
+object.
start and end must not overflow isize.
Note that callers may pass a pair of pointers such that start >= end.
+In that case, None will always be returned.
Like rfind
, but accepts and returns raw pointers.
When a match is found, the pointer returned is guaranteed to be
+>= start
and < end
.
This routine is useful if you’re already using raw pointers and would +like to avoid converting back to a slice before executing a search.
+start and end must be valid for reads.
start and end must point to an initialized value.
start and end must point to the same allocated object and
+must either be in bounds or at most one byte past the end of the
+allocated object.
start and end must be derived from a pointer to the same
+object.
start and end must not overflow isize.
Note that callers may pass a pair of pointers such that start >= end.
+In that case, None will always be returned.
pub struct TwoIter<'a, 'h> { /* private fields */ }
An iterator over all occurrences of two possible bytes in a haystack.
+This iterator implements DoubleEndedIterator
, which means it can also be
+used to find occurrences in reverse order.
This iterator is created by the Two::iter
method.
The lifetime parameters are as follows:
+'a
refers to the lifetime of the underlying Two
searcher.'h
refers to the lifetime of the haystack being searched.iter_advance_by
)n
elements. Read moren
th element from the end of the iterator. Read moreIterator::try_fold()
: it takes
+elements starting from the back of the iterator. Read moreiter_next_chunk
)N
values. Read moreiter_advance_by
)n
elements. Read moren
th element of the iterator. Read moreiter_intersperse
)separator
+between adjacent items of the original iterator. Read moren
elements. Read moren
elements, or fewer
+if the underlying iterator ends sooner. Read moreiter_map_windows
)f
for each contiguous window of size N
over
+self
and returns an iterator over the outputs of f
. Like slice::windows()
,
+the windows during mapping overlap as well. Read moreiter_collect_into
)iter_partition_in_place
)true
precede all those that return false
.
+Returns the number of true
elements found. Read moreiter_is_partitioned
)true
precede all those that return false
. Read moreiterator_try_reduce
)try_find
)iter_array_chunks
)N
elements of the iterator at a time. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read morePartialOrd
elements of
+this Iterator
with those of another. The comparison works like short-circuit
+evaluation, returning a result without comparing the remaining elements.
+As soon as an order can be determined, the evaluation stops and a result is returned. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read moreiter_order_by
)Iterator
are lexicographically
+less than those of another. Read moreIterator
are lexicographically
+less or equal to those of another. Read moreIterator
are lexicographically
+greater than those of another. Read moreIterator
are lexicographically
+greater than or equal to those of another. Read moreis_sorted
)is_sorted
)A 128-bit vector implementation of the “packed pair” SIMD algorithm.
+The “packed pair” algorithm is based on the generic SIMD algorithm. The main +difference is that it (by default) uses a background distribution of byte +frequencies to heuristically select the pair of bytes to search for.
+pub struct Finder(/* private fields */);
A “packed pair” finder that uses 128-bit vector operations.
+This finder picks two bytes that it believes have high predictive power
+for indicating an overall match of a needle. Depending on whether
+Finder::find
or Finder::find_prefilter
is used, it reports offsets
+where the needle matches or could match. In the prefilter case, candidates
+are reported whenever the Pair
of bytes given matches.
Create a new pair searcher. The searcher returned can either report
+exact matches of needle
or act as a prefilter and report candidate
+positions of needle
.
If SSE2 is unavailable in the current environment or if a Pair
+could not be constructed from the needle given, then None
is
+returned.
Create a new “packed pair” finder using the pair of bytes given.
+This constructor permits callers to control precisely which pair of +bytes is used as a predicate.
+If SSE2 is unavailable in the current environment, then None
is
+returned.
Returns true when this implementation is available in the current +environment.
+When this is true, it is guaranteed that Finder::with_pair
will
+return a Some
value. Similarly, when it is false, it is guaranteed
+that Finder::with_pair
will return a None
value. Notice that this
+does not guarantee that Finder::new
will return a Finder
. Namely,
+even when Finder::is_available
is true, it is not guaranteed that a
+valid Pair
can be found from the needle given.
Note also that for the lifetime of a single program, if this returns +true then it will always return true.
+Execute a search using SSE2 vectors and routines.
+When haystack.len()
is less than Finder::min_haystack_len
.
Run this finder on the given haystack as a prefilter.
+If a candidate match is found, then an offset where the needle could +begin in the haystack is returned.
+When haystack.len()
is less than Finder::min_haystack_len
.
Returns the pair of offsets (into the needle) used to check as a +predicate before confirming whether a needle exists at a particular +position.
+Returns the minimum haystack length that this Finder
can search.
Using a haystack with length smaller than this in a search will result +in a panic. The reason for this restriction is that this finder is +meant to be a low-level component that is part of a larger substring +strategy. In that sense, it avoids trying to handle all cases and +instead only handles the cases that it can handle very well.
+pub fn memchr(needle: u8, haystack: &[u8]) -> Option<usize>
Search for the first occurrence of a byte in a slice.
+This returns the index corresponding to the first occurrence of needle
in
+haystack
, or None
if one is not found. If an index is returned, it is
+guaranteed to be less than haystack.len()
.
While this is semantically the same as something like
+haystack.iter().position(|&b| b == needle)
, this routine will attempt to
+use highly optimized vector operations that can be an order of magnitude
+faster (or more).
This shows how to find the first position of a byte in a byte string.
+ +use memchr::memchr;
+
+let haystack = b"the quick brown fox";
+assert_eq!(memchr(b'k', haystack), Some(8));
pub fn memchr2(needle1: u8, needle2: u8, haystack: &[u8]) -> Option<usize>
Search for the first occurrence of two possible bytes in a haystack.
+This returns the index corresponding to the first occurrence of one of the
+needle bytes in haystack
, or None
if one is not found. If an index is
+returned, it is guaranteed to be less than haystack.len()
.
While this is semantically the same as something like
+haystack.iter().position(|&b| b == needle1 || b == needle2)
, this routine
+will attempt to use highly optimized vector operations that can be an order
+of magnitude faster (or more).
This shows how to find the first position of one of two possible bytes in a +haystack.
+ +use memchr::memchr2;
+
+let haystack = b"the quick brown fox";
+assert_eq!(memchr2(b'k', b'q', haystack), Some(4));
pub fn memchr2_iter<'h>(
+ needle1: u8,
+ needle2: u8,
+ haystack: &'h [u8]
+) -> Memchr2<'h> ⓘ
Returns an iterator over all occurrences of the needles in a haystack.
+The iterator returned implements DoubleEndedIterator
. This means it
+can also be used to find occurrences in reverse order.
pub fn memchr3(
+ needle1: u8,
+ needle2: u8,
+ needle3: u8,
+ haystack: &[u8]
+) -> Option<usize>
Search for the first occurrence of three possible bytes in a haystack.
+This returns the index corresponding to the first occurrence of one of the
+needle bytes in haystack
, or None
if one is not found. If an index is
+returned, it is guaranteed to be less than haystack.len()
.
While this is semantically the same as something like
+haystack.iter().position(|&b| b == needle1 || b == needle2 || b == needle3)
,
+this routine will attempt to use highly optimized vector operations that
+can be an order of magnitude faster (or more).
This shows how to find the first position of one of three possible bytes in +a haystack.
+ +use memchr::memchr3;
+
+let haystack = b"the quick brown fox";
+assert_eq!(memchr3(b'k', b'q', b'u', haystack), Some(4));
pub fn memchr3_iter<'h>(
+ needle1: u8,
+ needle2: u8,
+ needle3: u8,
+ haystack: &'h [u8]
+) -> Memchr3<'h> ⓘ
Returns an iterator over all occurrences of the needles in a haystack.
+The iterator returned implements DoubleEndedIterator
. This means it
+can also be used to find occurrences in reverse order.
pub fn memchr_iter<'h>(needle: u8, haystack: &'h [u8]) -> Memchr<'h> ⓘ
Returns an iterator over all occurrences of the needle in a haystack.
+The iterator returned implements DoubleEndedIterator
. This means it
+can also be used to find occurrences in reverse order.
pub fn memrchr(needle: u8, haystack: &[u8]) -> Option<usize>
Search for the last occurrence of a byte in a slice.
+This returns the index corresponding to the last occurrence of needle
in
+haystack
, or None
if one is not found. If an index is returned, it is
+guaranteed to be less than haystack.len()
.
While this is semantically the same as something like
+haystack.iter().rposition(|&b| b == needle)
, this routine will attempt to
+use highly optimized vector operations that can be an order of magnitude
+faster (or more).
This shows how to find the last position of a byte in a byte string.
+ +use memchr::memrchr;
+
+let haystack = b"the quick brown fox";
+assert_eq!(memrchr(b'o', haystack), Some(17));
pub fn memrchr2(needle1: u8, needle2: u8, haystack: &[u8]) -> Option<usize>
Search for the last occurrence of two possible bytes in a haystack.
+This returns the index corresponding to the last occurrence of one of the
+needle bytes in haystack
, or None
if one is not found. If an index is
+returned, it is guaranteed to be less than haystack.len()
.
While this is semantically the same as something like
+haystack.iter().rposition(|&b| b == needle1 || b == needle2)
, this
+routine will attempt to use highly optimized vector operations that can be
+an order of magnitude faster (or more).
This shows how to find the last position of one of two possible bytes in a +haystack.
+ +use memchr::memrchr2;
+
+let haystack = b"the quick brown fox";
+assert_eq!(memrchr2(b'k', b'o', haystack), Some(17));
pub fn memrchr3(
+ needle1: u8,
+ needle2: u8,
+ needle3: u8,
+ haystack: &[u8]
+) -> Option<usize>
Search for the last occurrence of three possible bytes in a haystack.
+This returns the index corresponding to the last occurrence of one of the
+needle bytes in haystack
, or None
if one is not found. If an index is
+returned, it is guaranteed to be less than haystack.len()
.
While this is semantically the same as something like
+haystack.iter().rposition(|&b| b == needle1 || b == needle2 || b == needle3)
,
+this routine will attempt to use highly optimized vector operations that
+can be an order of magnitude faster (or more).
This shows how to find the last position of one of three possible bytes in +a haystack.
+ +use memchr::memrchr3;
+
+let haystack = b"the quick brown fox";
+assert_eq!(memrchr3(b'k', b'o', b'n', haystack), Some(17));
This library provides heavily optimized routines for string search primitives.
+This section gives a brief high level overview of what this crate offers.
+memmem
sub-module provides forward and reverse substring search
+routines.In all such cases, routines operate on &[u8]
without regard to encoding. This
+is exactly what you want when searching either UTF-8 or arbitrary bytes.
memchr
This example shows how to use memchr
to find the first occurrence of z
in
+a haystack:
use memchr::memchr;
+
+let haystack = b"foo bar baz quuz";
+assert_eq!(Some(10), memchr(b'z', haystack));
This example shows how to use memrchr3
to find occurrences of a
, b
or
+c
, starting at the end of the haystack.
use memchr::memchr3_iter;
+
+let haystack = b"xyzaxyzbxyzc";
+
+let mut it = memchr3_iter(b'a', b'b', b'c', haystack).rev();
+assert_eq!(Some(11), it.next());
+assert_eq!(Some(7), it.next());
+assert_eq!(Some(3), it.next());
+assert_eq!(None, it.next());
This example shows how to use the memmem
sub-module to find occurrences of
+a substring in a haystack.
use memchr::memmem;
+
+let haystack = b"foo bar foo baz foo";
+
+let mut it = memmem::find_iter(haystack, "foo");
+assert_eq!(Some(0), it.next());
+assert_eq!(Some(8), it.next());
+assert_eq!(Some(16), it.next());
+assert_eq!(None, it.next());
It may be possible for the overhead of constructing a substring searcher to be
+measurable in some workloads. In cases where the same needle is used to search
+many haystacks, it is possible to do construction once and thus to avoid it for
+subsequent searches. This can be done with a memmem::Finder
:
use memchr::memmem;
+
+let finder = memmem::Finder::new("foo");
+
+assert_eq!(Some(4), finder.find(b"baz foo quux"));
+assert_eq!(None, finder.find(b"quux baz bar"));
At first glance, the APIs provided by this crate might seem weird. Why provide
+a dedicated routine like memchr
for something that could be implemented
+clearly and trivially in one line:
fn memchr(needle: u8, haystack: &[u8]) -> Option<usize> {
+ haystack.iter().position(|&b| b == needle)
+}
Or similarly, why does this crate provide substring search routines when Rust’s +core library already provides them?
+ +fn search(haystack: &str, needle: &str) -> Option<usize> {
+ haystack.find(needle)
+}
The primary reason for both of them to exist is performance. When it comes to +performance, at a high level at least, there are two primary ways to look at +it:
+The memchr
routine in this crate has slightly worse latency than the
+solution presented above; however, its throughput can easily be over an
+order of magnitude faster. This is a good general purpose trade off to make.
+You rarely lose, but often gain big.
NOTE: The name memchr
comes from the corresponding routine in libc
. A
+key advantage of using this library is that its performance is not tied to its
+quality of implementation in the libc
you happen to be using, which can vary
+greatly from platform to platform.
But what about substring search? This one is a bit more complicated. The +primary reason for its existence is still indeed performance, but it’s also +useful because Rust’s core library doesn’t actually expose any substring +search routine on arbitrary bytes. The only substring search routine that +exists works exclusively on valid UTF-8.
+So if you have valid UTF-8, is there a reason to use this over the standard +library substring search routine? Yes. This routine is faster on almost every +metric, including latency. The natural question then, is why isn’t this +implementation in the standard library, even if only for searching on UTF-8? +The reason is that the implementation details for using SIMD in the standard +library haven’t quite been worked out yet.
+NOTE: Currently, only x86_64
, wasm32
and aarch64
targets have vector
+accelerated implementations of memchr
(and friends) and memmem
.
x86_64
targets without enabling
+the avx2
feature at compile time, for example. When std
is not enabled,
+this crate will still attempt to use SSE2 accelerated routines on x86_64
. It
+will also use AVX2 accelerated routines when the avx2
feature is enabled at
+compile time. In general, enable this feature if you can.memmem::Finder::into_owned
API and the
+arch::all::shiftor
substring search
+implementation. Otherwise, this crate is designed from the ground up to be
+usable in core-only contexts, so the alloc
feature doesn’t add much
+currently. Notably, disabling std
but enabling alloc
will not result
+in the use of AVX2 on x86_64
targets unless the avx2
feature is enabled
+at compile time. (With std
enabled, AVX2 can be used even without the avx2
+feature enabled at compile time by way of runtime CPU feature detection.)log
crate is used
+to emit log messages about what kinds of memchr
and memmem
algorithms
+are used. Namely, both memchr
and memmem
have a number of different
+implementation choices depending on the target and CPU, and the log messages
+can help show what specific implementations are being used. Generally, this is
+useful for debugging performance issues.memchr
function from whatever libc
was linked into the program. This
+feature is now a no-op because this crate’s implementation of memchr
should
+now be sufficiently fast on a number of platforms that libc
should no longer
+be needed. (This feature is somewhat of a holdover from this crate’s origins.
+Originally, this crate was literally just a safe wrapper function around the
+memchr
function from libc
.)Redirecting to ../../memchr/fn.memchr.html...
+ + + \ No newline at end of file diff --git a/memchr/memchr/fn.memchr2.html b/memchr/memchr/fn.memchr2.html new file mode 100644 index 000000000..b43634a74 --- /dev/null +++ b/memchr/memchr/fn.memchr2.html @@ -0,0 +1,11 @@ + + + + +Redirecting to ../../memchr/fn.memchr2.html...
+ + + \ No newline at end of file diff --git a/memchr/memchr/fn.memchr2_iter.html b/memchr/memchr/fn.memchr2_iter.html new file mode 100644 index 000000000..bed40a2a0 --- /dev/null +++ b/memchr/memchr/fn.memchr2_iter.html @@ -0,0 +1,11 @@ + + + + +Redirecting to ../../memchr/fn.memchr2_iter.html...
+ + + \ No newline at end of file diff --git a/memchr/memchr/fn.memchr3.html b/memchr/memchr/fn.memchr3.html new file mode 100644 index 000000000..275f4bbef --- /dev/null +++ b/memchr/memchr/fn.memchr3.html @@ -0,0 +1,11 @@ + + + + +Redirecting to ../../memchr/fn.memchr3.html...
+ + + \ No newline at end of file diff --git a/memchr/memchr/fn.memchr3_iter.html b/memchr/memchr/fn.memchr3_iter.html new file mode 100644 index 000000000..dc6d7fa6e --- /dev/null +++ b/memchr/memchr/fn.memchr3_iter.html @@ -0,0 +1,11 @@ + + + + +Redirecting to ../../memchr/fn.memchr3_iter.html...
+ + + \ No newline at end of file diff --git a/memchr/memchr/fn.memchr_iter.html b/memchr/memchr/fn.memchr_iter.html new file mode 100644 index 000000000..e6424a721 --- /dev/null +++ b/memchr/memchr/fn.memchr_iter.html @@ -0,0 +1,11 @@ + + + + +Redirecting to ../../memchr/fn.memchr_iter.html...
+ + + \ No newline at end of file diff --git a/memchr/memchr/fn.memrchr.html b/memchr/memchr/fn.memrchr.html new file mode 100644 index 000000000..57d98444a --- /dev/null +++ b/memchr/memchr/fn.memrchr.html @@ -0,0 +1,11 @@ + + + + +Redirecting to ../../memchr/fn.memrchr.html...
+ + + \ No newline at end of file diff --git a/memchr/memchr/fn.memrchr2.html b/memchr/memchr/fn.memrchr2.html new file mode 100644 index 000000000..ffc1e5875 --- /dev/null +++ b/memchr/memchr/fn.memrchr2.html @@ -0,0 +1,11 @@ + + + + +Redirecting to ../../memchr/fn.memrchr2.html...
+ + + \ No newline at end of file diff --git a/memchr/memchr/fn.memrchr2_iter.html b/memchr/memchr/fn.memrchr2_iter.html new file mode 100644 index 000000000..c80c1ebfa --- /dev/null +++ b/memchr/memchr/fn.memrchr2_iter.html @@ -0,0 +1,11 @@ + + + + +Redirecting to ../../memchr/fn.memrchr2_iter.html...
+ + + \ No newline at end of file diff --git a/memchr/memchr/fn.memrchr3.html b/memchr/memchr/fn.memrchr3.html new file mode 100644 index 000000000..ceb0775d2 --- /dev/null +++ b/memchr/memchr/fn.memrchr3.html @@ -0,0 +1,11 @@ + + + + +Redirecting to ../../memchr/fn.memrchr3.html...
+ + + \ No newline at end of file diff --git a/memchr/memchr/fn.memrchr3_iter.html b/memchr/memchr/fn.memrchr3_iter.html new file mode 100644 index 000000000..781ba508b --- /dev/null +++ b/memchr/memchr/fn.memrchr3_iter.html @@ -0,0 +1,11 @@ + + + + +Redirecting to ../../memchr/fn.memrchr3_iter.html...
+ + + \ No newline at end of file diff --git a/memchr/memchr/fn.memrchr_iter.html b/memchr/memchr/fn.memrchr_iter.html new file mode 100644 index 000000000..ad5d91a36 --- /dev/null +++ b/memchr/memchr/fn.memrchr_iter.html @@ -0,0 +1,11 @@ + + + + +Redirecting to ../../memchr/fn.memrchr_iter.html...
+ + + \ No newline at end of file diff --git a/memchr/memchr/struct.Memchr.html b/memchr/memchr/struct.Memchr.html new file mode 100644 index 000000000..b360bbc06 --- /dev/null +++ b/memchr/memchr/struct.Memchr.html @@ -0,0 +1,11 @@ + + + + +Redirecting to ../../memchr/struct.Memchr.html...
+ + + \ No newline at end of file diff --git a/memchr/memchr/struct.Memchr2.html b/memchr/memchr/struct.Memchr2.html new file mode 100644 index 000000000..7aa16d909 --- /dev/null +++ b/memchr/memchr/struct.Memchr2.html @@ -0,0 +1,11 @@ + + + + +Redirecting to ../../memchr/struct.Memchr2.html...
+ + + \ No newline at end of file diff --git a/memchr/memchr/struct.Memchr3.html b/memchr/memchr/struct.Memchr3.html new file mode 100644 index 000000000..0feeb01a8 --- /dev/null +++ b/memchr/memchr/struct.Memchr3.html @@ -0,0 +1,11 @@ + + + + +Redirecting to ../../memchr/struct.Memchr3.html...
+ + + \ No newline at end of file diff --git a/memchr/memmem/enum.Prefilter.html b/memchr/memmem/enum.Prefilter.html new file mode 100644 index 000000000..5b1d9645e --- /dev/null +++ b/memchr/memmem/enum.Prefilter.html @@ -0,0 +1,40 @@ +#[non_exhaustive]pub enum Prefilter {
+ None,
+ Auto,
+}
Prefilter controls whether heuristics are used to accelerate searching.
+A prefilter refers to the idea of detecting candidate matches very quickly, +and then confirming whether those candidates are full matches. This +idea can be quite effective since it’s often the case that looking for +candidates can be a lot faster than running a complete substring search +over the entire input. Namely, looking for candidates can be done with +extremely fast vectorized code.
+The downside of a prefilter is that it assumes false positives (which are +candidates generated by a prefilter that aren’t matches) are somewhat rare +relative to the frequency of full matches. That is, if a lot of false +positives are generated, then it’s possible for search time to be worse +than if the prefilter wasn’t enabled in the first place.
+Another downside of a prefilter is that it can result in highly variable +performance, where some cases are extraordinarily fast and others aren’t. +Typically, variable performance isn’t a problem, but it may be for your use +case.
+The use of prefilters in this implementation does use a heuristic to detect +when a prefilter might not be carrying its weight, and will dynamically +disable its use. Nevertheless, this configuration option gives callers +the ability to disable prefilters when they know that prefilters won’t be +useful.
+Never use a prefilter in substring search.
+Automatically detect whether a heuristic prefilter should be used. If +it is used, then heuristics will be used to dynamically disable the +prefilter if it is believed to not be carrying its weight.
+source
. Read morepub fn find(haystack: &[u8], needle: &[u8]) -> Option<usize>
Returns the index of the first occurrence of the given needle.
+Note that if you’re searching for the same needle in many different
+small haystacks, it may be faster to initialize a Finder
once,
+and reuse it for each search.
This routine is guaranteed to have worst case linear time complexity
+with respect to both the needle and the haystack. That is, this runs
+in O(needle.len() + haystack.len())
time.
This routine is also guaranteed to have worst case constant space +complexity.
+Basic usage:
+ +use memchr::memmem;
+
+let haystack = b"foo bar baz";
+assert_eq!(Some(0), memmem::find(haystack, b"foo"));
+assert_eq!(Some(4), memmem::find(haystack, b"bar"));
+assert_eq!(None, memmem::find(haystack, b"quux"));
pub fn find_iter<'h, 'n, N: 'n + ?Sized + AsRef<[u8]>>(
+ haystack: &'h [u8],
+ needle: &'n N
+) -> FindIter<'h, 'n> ⓘ
Returns an iterator over all non-overlapping occurrences of a substring in +a haystack.
+This routine is guaranteed to have worst case linear time complexity
+with respect to both the needle and the haystack. That is, this runs
+in O(needle.len() + haystack.len())
time.
This routine is also guaranteed to have worst case constant space +complexity.
+Basic usage:
+ +use memchr::memmem;
+
+let haystack = b"foo bar foo baz foo";
+let mut it = memmem::find_iter(haystack, b"foo");
+assert_eq!(Some(0), it.next());
+assert_eq!(Some(8), it.next());
+assert_eq!(Some(16), it.next());
+assert_eq!(None, it.next());
pub fn rfind(haystack: &[u8], needle: &[u8]) -> Option<usize>
Returns the index of the last occurrence of the given needle.
+Note that if you’re searching for the same needle in many different
+small haystacks, it may be faster to initialize a FinderRev
once,
+and reuse it for each search.
This routine is guaranteed to have worst case linear time complexity
+with respect to both the needle and the haystack. That is, this runs
+in O(needle.len() + haystack.len())
time.
This routine is also guaranteed to have worst case constant space +complexity.
+Basic usage:
+ +use memchr::memmem;
+
+let haystack = b"foo bar baz";
+assert_eq!(Some(0), memmem::rfind(haystack, b"foo"));
+assert_eq!(Some(4), memmem::rfind(haystack, b"bar"));
+assert_eq!(Some(8), memmem::rfind(haystack, b"ba"));
+assert_eq!(None, memmem::rfind(haystack, b"quux"));
pub fn rfind_iter<'h, 'n, N: 'n + ?Sized + AsRef<[u8]>>(
+ haystack: &'h [u8],
+ needle: &'n N
+) -> FindRevIter<'h, 'n> ⓘ
Returns a reverse iterator over all non-overlapping occurrences of a +substring in a haystack.
+This routine is guaranteed to have worst case linear time complexity
+with respect to both the needle and the haystack. That is, this runs
+in O(needle.len() + haystack.len())
time.
This routine is also guaranteed to have worst case constant space +complexity.
+Basic usage:
+ +use memchr::memmem;
+
+let haystack = b"foo bar foo baz foo";
+let mut it = memmem::rfind_iter(haystack, b"foo");
+assert_eq!(Some(16), it.next());
+assert_eq!(Some(8), it.next());
+assert_eq!(Some(0), it.next());
+assert_eq!(None, it.next());
This module provides forward and reverse substring search routines.
+Unlike the standard library’s substring search routines, these work on +arbitrary bytes. For all non-empty needles, these routines will report exactly +the same values as the corresponding routines in the standard library. For +the empty needle, the standard library reports matches only at valid UTF-8 +boundaries, whereas these routines will report matches at every position.
+Other than being able to work on arbitrary bytes, the primary reason to prefer +these routines over the standard library routines is that these will generally +be faster. In some cases, significantly so.
+This example shows how to use find_iter
to find occurrences of a substring
+in a haystack.
use memchr::memmem;
+
+let haystack = b"foo bar foo baz foo";
+
+let mut it = memmem::find_iter(haystack, "foo");
+assert_eq!(Some(0), it.next());
+assert_eq!(Some(8), it.next());
+assert_eq!(Some(16), it.next());
+assert_eq!(None, it.next());
This example shows how to use rfind_iter
to find occurrences of a substring
+in a haystack starting from the end of the haystack.
NOTE: This module does not implement double ended iterators, so reverse
+searches aren’t done by calling rev
on a forward iterator.
use memchr::memmem;
+
+let haystack = b"foo bar foo baz foo";
+
+let mut it = memmem::rfind_iter(haystack, "foo");
+assert_eq!(Some(16), it.next());
+assert_eq!(Some(8), it.next());
+assert_eq!(Some(0), it.next());
+assert_eq!(None, it.next());
It may be possible for the overhead of constructing a substring searcher to be
+measurable in some workloads. In cases where the same needle is used to search
+many haystacks, it is possible to do construction once and thus to avoid it for
+subsequent searches. This can be done with a Finder
(or a FinderRev
for
+reverse searches).
use memchr::memmem;
+
+let finder = memmem::Finder::new("foo");
+
+assert_eq!(Some(4), finder.find(b"baz foo quux"));
+assert_eq!(None, finder.find(b"quux baz bar"));
Redirecting to ../../../memchr/memmem/enum.Prefilter.html...
+ + + \ No newline at end of file diff --git a/memchr/memmem/sidebar-items.js b/memchr/memmem/sidebar-items.js new file mode 100644 index 000000000..5013bc312 --- /dev/null +++ b/memchr/memmem/sidebar-items.js @@ -0,0 +1 @@ +window.SIDEBAR_ITEMS = {"enum":["Prefilter"],"fn":["find","find_iter","rfind","rfind_iter"],"struct":["FindIter","FindRevIter","Finder","FinderBuilder","FinderRev"]}; \ No newline at end of file diff --git a/memchr/memmem/struct.FindIter.html b/memchr/memmem/struct.FindIter.html new file mode 100644 index 000000000..e5548e9b2 --- /dev/null +++ b/memchr/memmem/struct.FindIter.html @@ -0,0 +1,198 @@ +pub struct FindIter<'h, 'n> { /* private fields */ }
An iterator over non-overlapping substring matches.
+Matches are reported by the byte offset at which they begin.
+'h
is the lifetime of the haystack while 'n
is the lifetime of the
+needle.
Convert this iterator into its owned variant, such that it no longer +borrows the finder and needle.
+If this is already an owned iterator, then this is a no-op. Otherwise, +this copies the needle.
+This is only available when the alloc
feature is enabled.
iter_next_chunk
)N
values. Read moreiter_advance_by
)n
elements. Read moren
th element of the iterator. Read moreiter_intersperse
)separator
+between adjacent items of the original iterator. Read moren
elements. Read moren
elements, or fewer
+if the underlying iterator ends sooner. Read moreiter_map_windows
)f
for each contiguous window of size N
over
+self
and returns an iterator over the outputs of f
. Like slice::windows()
,
+the windows during mapping overlap as well. Read moreiter_collect_into
)iter_is_partitioned
)true
precede all those that return false
. Read moreiterator_try_reduce
)try_find
)iter_array_chunks
)N
elements of the iterator at a time. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read morePartialOrd
elements of
+this Iterator
with those of another. The comparison works like short-circuit
+evaluation, returning a result without comparing the remaining elements.
+As soon as an order can be determined, the evaluation stops and a result is returned. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read moreiter_order_by
)Iterator
are lexicographically
+less than those of another. Read moreIterator
are lexicographically
+less or equal to those of another. Read moreIterator
are lexicographically
+greater than those of another. Read moreIterator
are lexicographically
+greater than or equal to those of another. Read moreis_sorted
)is_sorted
)pub struct FindRevIter<'h, 'n> { /* private fields */ }
An iterator over non-overlapping substring matches in reverse.
+Matches are reported by the byte offset at which they begin.
+'h
is the lifetime of the haystack while 'n
is the lifetime of the
+needle.
Convert this iterator into its owned variant, such that it no longer +borrows the finder and needle.
+If this is already an owned iterator, then this is a no-op. Otherwise, +this copies the needle.
+This is only available when the std
feature is enabled.
iter_next_chunk
)N
values. Read moreiter_advance_by
)n
elements. Read moren
th element of the iterator. Read moreiter_intersperse
)separator
+between adjacent items of the original iterator. Read moren
elements. Read moren
elements, or fewer
+if the underlying iterator ends sooner. Read moreiter_map_windows
)f
for each contiguous window of size N
over
+self
and returns an iterator over the outputs of f
. Like slice::windows()
,
+the windows during mapping overlap as well. Read moreiter_collect_into
)iter_is_partitioned
)true
precede all those that return false
. Read moreiterator_try_reduce
)try_find
)iter_array_chunks
)N
elements of the iterator at a time. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read morePartialOrd
elements of
+this Iterator
with those of another. The comparison works like short-circuit
+evaluation, returning a result without comparing the remaining elements.
+As soon as an order can be determined, the evaluation stops and a result is returned. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read moreiter_order_by
)Iterator
are lexicographically
+less than those of another. Read moreIterator
are lexicographically
+less or equal to those of another. Read moreIterator
are lexicographically
+greater than those of another. Read moreIterator
are lexicographically
+greater than or equal to those of another. Read moreis_sorted
)is_sorted
)pub struct Finder<'n> { /* private fields */ }
A single substring searcher fixed to a particular needle.
+The purpose of this type is to permit callers to construct a substring
+searcher that can be used to search haystacks without the overhead of
+constructing the searcher in the first place. This is a somewhat niche
+concern when it’s necessary to re-use the same needle to search multiple
+different haystacks with as little overhead as possible. In general, using
+find
is good enough, but Finder
is useful when you can meaningfully
+observe searcher construction time in a profile.
When the std
feature is enabled, then this type has an into_owned
+version which permits building a Finder
that is not connected to
+the lifetime of its needle.
Create a new finder for the given needle.
+Returns the index of the first occurrence of this needle in the given +haystack.
+This routine is guaranteed to have worst case linear time complexity
+with respect to both the needle and the haystack. That is, this runs
+in O(needle.len() + haystack.len())
time.
This routine is also guaranteed to have worst case constant space +complexity.
+Basic usage:
+ +use memchr::memmem::Finder;
+
+let haystack = b"foo bar baz";
+assert_eq!(Some(0), Finder::new("foo").find(haystack));
+assert_eq!(Some(4), Finder::new("bar").find(haystack));
+assert_eq!(None, Finder::new("quux").find(haystack));
Returns an iterator over all occurrences of a substring in a haystack.
+This routine is guaranteed to have worst case linear time complexity
+with respect to both the needle and the haystack. That is, this runs
+in O(needle.len() + haystack.len())
time.
This routine is also guaranteed to have worst case constant space +complexity.
+Basic usage:
+ +use memchr::memmem::Finder;
+
+let haystack = b"foo bar foo baz foo";
+let finder = Finder::new(b"foo");
+let mut it = finder.find_iter(haystack);
+assert_eq!(Some(0), it.next());
+assert_eq!(Some(8), it.next());
+assert_eq!(Some(16), it.next());
+assert_eq!(None, it.next());
Convert this finder into its owned variant, such that it no longer +borrows the needle.
+If this is already an owned finder, then this is a no-op. Otherwise, +this copies the needle.
+This is only available when the alloc
feature is enabled.
Convert this finder into its borrowed variant.
+This is primarily useful if your finder is owned and you’d like to +store its borrowed variant in some intermediate data structure.
+Note that the lifetime parameter of the returned finder is tied to the
+lifetime of self
, and may be shorter than the 'n
lifetime of the
+needle itself. Namely, a finder’s needle can be either borrowed or
+owned, so the lifetime of the needle returned must necessarily be the
+shorter of the two.
Returns the needle that this finder searches for.
+Note that the lifetime of the needle returned is tied to the lifetime
+of the finder, and may be shorter than the 'n
lifetime. Namely, a
+finder’s needle can be either borrowed or owned, so the lifetime of the
+needle returned must necessarily be the shorter of the two.
pub struct FinderBuilder { /* private fields */ }
A builder for constructing non-default forward or reverse memmem finders.
+A builder is primarily useful for configuring a substring searcher. +Currently, the only configuration exposed is the ability to disable +heuristic prefilters used to speed up certain searches.
+Create a new finder builder with default settings.
+Build a forward finder using the given needle from the current +settings.
+Build a forward finder using the given needle and a custom heuristic for
+determining the frequency of a given byte in the dataset.
+See HeuristicFrequencyRank
for more details.
Build a reverse finder using the given needle from the current +settings.
+Configure the prefilter setting for the finder.
+See the documentation for Prefilter
for more discussion on why
+you might want to configure this.
source
. Read morepub struct FinderRev<'n> { /* private fields */ }
A single substring reverse searcher fixed to a particular needle.
+The purpose of this type is to permit callers to construct a substring
+searcher that can be used to search haystacks without the overhead of
+constructing the searcher in the first place. This is a somewhat niche
+concern when it’s necessary to re-use the same needle to search multiple
+different haystacks with as little overhead as possible. In general,
+using rfind
is good enough, but FinderRev
is useful when you can
+meaningfully observe searcher construction time in a profile.
When the std
feature is enabled, then this type has an into_owned
+version which permits building a FinderRev
that is not connected to
+the lifetime of its needle.
Create a new reverse finder for the given needle.
+Returns the index of the last occurrence of this needle in the given +haystack.
+The haystack may be any type that can be cheaply converted into a
+&[u8]
. This includes, but is not limited to, &str
and &[u8]
.
This routine is guaranteed to have worst case linear time complexity
+with respect to both the needle and the haystack. That is, this runs
+in O(needle.len() + haystack.len())
time.
This routine is also guaranteed to have worst case constant space +complexity.
+Basic usage:
+ +use memchr::memmem::FinderRev;
+
+let haystack = b"foo bar baz";
+assert_eq!(Some(0), FinderRev::new("foo").rfind(haystack));
+assert_eq!(Some(4), FinderRev::new("bar").rfind(haystack));
+assert_eq!(None, FinderRev::new("quux").rfind(haystack));
Returns a reverse iterator over all occurrences of a substring in a +haystack.
+This routine is guaranteed to have worst case linear time complexity
+with respect to both the needle and the haystack. That is, this runs
+in O(needle.len() + haystack.len())
time.
This routine is also guaranteed to have worst case constant space +complexity.
+Basic usage:
+ +use memchr::memmem::FinderRev;
+
+let haystack = b"foo bar foo baz foo";
+let finder = FinderRev::new(b"foo");
+let mut it = finder.rfind_iter(haystack);
+assert_eq!(Some(16), it.next());
+assert_eq!(Some(8), it.next());
+assert_eq!(Some(0), it.next());
+assert_eq!(None, it.next());
Convert this finder into its owned variant, such that it no longer +borrows the needle.
+If this is already an owned finder, then this is a no-op. Otherwise, +this copies the needle.
+This is only available when the std
feature is enabled.
Convert this finder into its borrowed variant.
+This is primarily useful if your finder is owned and you’d like to +store its borrowed variant in some intermediate data structure.
+Note that the lifetime parameter of the returned finder is tied to the
+lifetime of self
, and may be shorter than the 'n
lifetime of the
+needle itself. Namely, a finder’s needle can be either borrowed or
+owned, so the lifetime of the needle returned must necessarily be the
+shorter of the two.
Returns the needle that this finder searches for.
+Note that the lifetime of the needle returned is tied to the lifetime
+of the finder, and may be shorter than the 'n
lifetime. Namely, a
+finder’s needle can be either borrowed or owned, so the lifetime of the
+needle returned must necessarily be the shorter of the two.
pub struct Memchr<'h> { /* private fields */ }
An iterator over all occurrences of a single byte in a haystack.
+This iterator implements DoubleEndedIterator
, which means it can also be
+used to find occurrences in reverse order.
This iterator is created by the memchr_iter
or [memrchr_iter
]
+functions. It can also be created with the Memchr::new
method.
The lifetime parameter 'h
refers to the lifetime of the haystack being
+searched.
iter_advance_by
)n
elements. Read moren
th element from the end of the iterator. Read moreIterator::try_fold()
: it takes
+elements starting from the back of the iterator. Read moreiter_next_chunk
)N
values. Read moreiter_advance_by
)n
elements. Read moren
th element of the iterator. Read moreiter_intersperse
)separator
+between adjacent items of the original iterator. Read moren
elements. Read moren
elements, or fewer
+if the underlying iterator ends sooner. Read moreiter_map_windows
)f
for each contiguous window of size N
over
+self
and returns an iterator over the outputs of f
. Like slice::windows()
,
+the windows during mapping overlap as well. Read moreiter_collect_into
)iter_partition_in_place
)true
precede all those that return false
.
+Returns the number of true
elements found. Read moreiter_is_partitioned
)true
precede all those that return false
. Read moreiterator_try_reduce
)try_find
)iter_array_chunks
)N
elements of the iterator at a time. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read morePartialOrd
elements of
+this Iterator
with those of another. The comparison works like short-circuit
+evaluation, returning a result without comparing the remaining elements.
+As soon as an order can be determined, the evaluation stops and a result is returned. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read moreiter_order_by
)Iterator
are lexicographically
+less than those of another. Read moreIterator
are lexicographically
+less or equal to those of another. Read moreIterator
are lexicographically
+greater than those of another. Read moreIterator
are lexicographically
+greater than or equal to those of another. Read moreis_sorted
)is_sorted
)pub struct Memchr2<'h> { /* private fields */ }
An iterator over all occurrences of two possible bytes in a haystack.
+This iterator implements DoubleEndedIterator
, which means it can also be
+used to find occurrences in reverse order.
This iterator is created by the memchr2_iter
or [memrchr2_iter
]
+functions. It can also be created with the Memchr2::new
method.
The lifetime parameter 'h
refers to the lifetime of the haystack being
+searched.
iter_advance_by
)n
elements. Read moren
th element from the end of the iterator. Read moreIterator::try_fold()
: it takes
+elements starting from the back of the iterator. Read moreiter_next_chunk
)N
values. Read moreiter_advance_by
)n
elements. Read moren
th element of the iterator. Read moreiter_intersperse
)separator
+between adjacent items of the original iterator. Read moren
elements. Read moren
elements, or fewer
+if the underlying iterator ends sooner. Read moreiter_map_windows
)f
for each contiguous window of size N
over
+self
and returns an iterator over the outputs of f
. Like slice::windows()
,
+the windows during mapping overlap as well. Read moreiter_collect_into
)iter_partition_in_place
)true
precede all those that return false
.
+Returns the number of true
elements found. Read moreiter_is_partitioned
)true
precede all those that return false
. Read moreiterator_try_reduce
)try_find
)iter_array_chunks
)N
elements of the iterator at a time. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read morePartialOrd
elements of
+this Iterator
with those of another. The comparison works like short-circuit
+evaluation, returning a result without comparing the remaining elements.
+As soon as an order can be determined, the evaluation stops and a result is returned. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read moreiter_order_by
)Iterator
are lexicographically
+less than those of another. Read moreIterator
are lexicographically
+less or equal to those of another. Read moreIterator
are lexicographically
+greater than those of another. Read moreIterator
are lexicographically
+greater than or equal to those of another. Read moreis_sorted
)is_sorted
)pub struct Memchr3<'h> { /* private fields */ }
An iterator over all occurrences of three possible bytes in a haystack.
+This iterator implements DoubleEndedIterator
, which means it can also be
+used to find occurrences in reverse order.
This iterator is created by the memchr3_iter
or [memrchr3_iter
]
+functions. It can also be created with the Memchr3::new
method.
The lifetime parameter 'h
refers to the lifetime of the haystack being
+searched.
Returns an iterator over all occurrences of the needle bytes in the +given haystack.
+The iterator returned implements DoubleEndedIterator
. This means it
+can also be used to find occurrences in reverse order.
iter_advance_by
)n
elements. Read moren
th element from the end of the iterator. Read moreIterator::try_fold()
: it takes
+elements starting from the back of the iterator. Read moreiter_next_chunk
)N
values. Read moreiter_advance_by
)n
elements. Read moren
th element of the iterator. Read moreiter_intersperse
)separator
+between adjacent items of the original iterator. Read moren
elements. Read moren
elements, or fewer
+if the underlying iterator ends sooner. Read moreiter_map_windows
)f
for each contiguous window of size N
over
+self
and returns an iterator over the outputs of f
. Like slice::windows()
,
+the windows during mapping overlap as well. Read moreiter_collect_into
)iter_partition_in_place
)true
precede all those that return false
.
+Returns the number of true
elements found. Read moreiter_is_partitioned
)true
precede all those that return false
. Read moreiterator_try_reduce
)try_find
)iter_array_chunks
)N
elements of the iterator at a time. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read morePartialOrd
elements of
+this Iterator
with those of another. The comparison works like short-circuit
+evaluation, returning a result without comparing the remaining elements.
+As soon as an order can be determined, the evaluation stops and a result is returned. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read moreiter_order_by
)Iterator
are lexicographically
+less than those of another. Read moreIterator
are lexicographically
+less or equal to those of another. Read moreIterator
are lexicographically
+greater than those of another. Read moreIterator
are lexicographically
+greater than or equal to those of another. Read moreis_sorted
)is_sorted
)Redirecting to ../../../roe/fn.make_ascii_lowercase.html...
+ + + \ No newline at end of file diff --git a/roe/ascii/lowercase/fn.to_ascii_lowercase.html b/roe/ascii/lowercase/fn.to_ascii_lowercase.html new file mode 100644 index 000000000..5b38530d4 --- /dev/null +++ b/roe/ascii/lowercase/fn.to_ascii_lowercase.html @@ -0,0 +1,11 @@ + + + + +Redirecting to ../../../roe/fn.to_ascii_lowercase.html...
+ + + \ No newline at end of file diff --git a/roe/ascii/titlecase/fn.make_ascii_titlecase.html b/roe/ascii/titlecase/fn.make_ascii_titlecase.html new file mode 100644 index 000000000..b9f98e5c2 --- /dev/null +++ b/roe/ascii/titlecase/fn.make_ascii_titlecase.html @@ -0,0 +1,11 @@ + + + + +Redirecting to ../../../roe/fn.make_ascii_titlecase.html...
+ + + \ No newline at end of file diff --git a/roe/ascii/titlecase/fn.to_ascii_titlecase.html b/roe/ascii/titlecase/fn.to_ascii_titlecase.html new file mode 100644 index 000000000..97a2850d2 --- /dev/null +++ b/roe/ascii/titlecase/fn.to_ascii_titlecase.html @@ -0,0 +1,11 @@ + + + + +Redirecting to ../../../roe/fn.to_ascii_titlecase.html...
+ + + \ No newline at end of file diff --git a/roe/ascii/uppercase/fn.make_ascii_uppercase.html b/roe/ascii/uppercase/fn.make_ascii_uppercase.html new file mode 100644 index 000000000..63b87efd7 --- /dev/null +++ b/roe/ascii/uppercase/fn.make_ascii_uppercase.html @@ -0,0 +1,11 @@ + + + + +Redirecting to ../../../roe/fn.make_ascii_uppercase.html...
+ + + \ No newline at end of file diff --git a/roe/ascii/uppercase/fn.to_ascii_uppercase.html b/roe/ascii/uppercase/fn.to_ascii_uppercase.html new file mode 100644 index 000000000..d244df5b0 --- /dev/null +++ b/roe/ascii/uppercase/fn.to_ascii_uppercase.html @@ -0,0 +1,11 @@ + + + + +Redirecting to ../../../roe/fn.to_ascii_uppercase.html...
+ + + \ No newline at end of file diff --git a/roe/enum.LowercaseMode.html b/roe/enum.LowercaseMode.html new file mode 100644 index 000000000..534acb236 --- /dev/null +++ b/roe/enum.LowercaseMode.html @@ -0,0 +1,51 @@ +pub enum LowercaseMode {
+ Full,
+ Ascii,
+ Turkic,
+ Lithuanian,
+ Fold,
+}
Options to configure the behavior of lowercase
.
Which letters exactly are replaced, and by which other letters, depends on +the given options.
+See individual variants for a description of the available behaviors.
+If you’re not sure which mode to choose, LowercaseMode::Full
is a good
+default.
Full Unicode case mapping, suitable for most languages.
+See the Turkic and Lithuanian variants for exceptions.
+Context-dependent case mapping as described in Table 3-14 of the Unicode +standard is currently not supported.
+Only the ASCII region, i.e. the characters 'A'..='Z'
and 'a'..='z'
,
+are affected.
This option cannot be combined with any other option.
+Full Unicode case mapping, adapted for Turkic languages (Turkish, +Azerbaijani, …).
+This means that upper case I is mapped to lower case dotless i, and so +on.
+Currently, just full Unicode case mapping.
+In the future, full Unicode case mapping adapted for Lithuanian (keeping +the dot on the lower case i even if there is an accent on top).
+Unicode case folding, which is more far-reaching than Unicode case +mapping.
+This option currently cannot be combined with any other option (i.e. +there is currently no variant for Turkic languages).
+source
. Read moreself
and other
values to be equal, and is used
+by ==
.self
and other
) and is used by the <=
+operator. Read morepub enum UppercaseMode {
+ Full,
+ Ascii,
+ Turkic,
+ Lithuanian,
+}
Options to configure the behavior of uppercase
.
Which letters exactly are replaced, and by which other letters, depends on +the given options.
+See individual variants for a description of the available behaviors.
+If you’re not sure which mode to choose, UppercaseMode::Full
is a good
+default.
Full Unicode case mapping, suitable for most languages.
+See the Turkic and Lithuanian variants for exceptions.
+Context-dependent case mapping as described in Table 3-14 of the Unicode +standard is currently not supported.
+Only the ASCII region, i.e. the characters 'A'..='Z'
and 'a'..='z'
,
+are affected.
This option cannot be combined with any other option.
+Full Unicode case mapping, adapted for Turkic languages (Turkish, +Azerbaijani, …).
+This means that upper case I is mapped to lower case dotless i, and so +on.
+Currently, just full Unicode case mapping.
+In the future, full Unicode case mapping adapted for Lithuanian (keeping +the dot on the lower case i even if there is an accent on top).
+source
. Read moreself
and other
values to be equal, and is used
+by ==
.self
and other
) and is used by the <=
+operator. Read morepub fn lowercase(slice: &[u8], options: LowercaseMode) -> Lowercase<'_> ⓘ
Returns an iterator that yields a copy of the bytes in the given slice with +all uppercase letters replaced with their lowercase counterparts.
+This function treats the given slice as a conventionally UTF-8 string. +UTF-8 byte sequences are converted to their Unicode lowercase equivalents. +Invalid UTF-8 byte sequences are yielded as is.
+The case mapping mode is determined by the given LowercaseMode
. See its
+documentation for details on the available case mapping modes.
Not all LowercaseMode
s are currently implemented. This function will
+panic if the caller supplies Turkic or case folding lowercasing mode.
pub fn make_ascii_lowercase<T: AsMut<[u8]>>(slice: &mut T)
Converts the given slice to its ASCII lower case equivalent in-place.
+ASCII letters ‘A’ to ‘Z’ are mapped to ‘a’ to ‘z’, but non-ASCII letters are +unchanged.
+This function can be used to implement String#downcase!
for ASCII
+strings in Ruby.
To return a new lowercased value without modifying the existing one, use to_ascii_lowercase
.
+This function is analogous to <[u8]>::make_ascii_lowercase
.
let mut buf = *b"ABCxyz";
+make_ascii_lowercase(&mut buf);
+assert_eq!(buf, *b"abcxyz");
+
+let mut buf = *b"1234%&*";
+make_ascii_lowercase(&mut buf);
+assert_eq!(buf, *b"1234%&*");
pub fn make_ascii_titlecase<T: AsMut<[u8]>>(slice: &mut T)
Converts the given slice to its ASCII title case equivalent in-place.
+ASCII letters ‘a’ to ‘z’ are mapped to ‘A’ to ‘Z’ in the first byte; +subsequent bytes with ASCII letters ‘A’ to ‘Z’ are mapped to ‘a’ to ‘z’; +non-ASCII letters are unchanged.
+This function can be used to implement String#capitalize!
for ASCII
+strings in Ruby.
To return a new titlecased value without modifying the existing one, use to_ascii_titlecase
.
let mut buf = *b"ABCxyz";
+make_ascii_titlecase(&mut buf);
+assert_eq!(buf, *b"Abcxyz");
+
+let mut buf = *b"1234%&*";
+make_ascii_titlecase(&mut buf);
+assert_eq!(buf, *b"1234%&*");
+
+let mut buf = *b"ABC1234%&*";
+make_ascii_titlecase(&mut buf);
+assert_eq!(buf, *b"Abc1234%&*");
+
+let mut buf = *b"1234%&*abcXYZ";
+make_ascii_titlecase(&mut buf);
+assert_eq!(buf, *b"1234%&*abcxyz");
+
+let mut buf = *b"ABC, XYZ";
+make_ascii_titlecase(&mut buf);
+assert_eq!(buf, *b"Abc, xyz");
pub fn make_ascii_uppercase<T: AsMut<[u8]>>(slice: &mut T)
Converts the given slice to its ASCII upper case equivalent in-place.
+ASCII letters ‘a’ to ‘z’ are mapped to ‘A’ to ‘Z’, but non-ASCII letters are +unchanged.
+This function can be used to implement String#upcase!
for ASCII strings
+in Ruby.
To return a new uppercased value without modifying the existing one, use to_ascii_uppercase
.
+This function is analogous to <[u8]>::make_ascii_uppercase
.
let mut buf = *b"ABCxyz";
+make_ascii_uppercase(&mut buf);
+assert_eq!(buf, *b"ABCXYZ");
+
+let mut buf = *b"1234%&*";
+make_ascii_uppercase(&mut buf);
+assert_eq!(buf, *b"1234%&*");
pub fn to_ascii_lowercase<T: AsRef<[u8]>>(slice: T) -> Vec<u8>
alloc
only.Returns a vector containing a copy of the given slice where each byte is +mapped to its ASCII lower case equivalent.
+ASCII letters ‘A’ to ‘Z’ are mapped to ‘a’ to ‘z’, but non-ASCII letters are +unchanged.
+This function can be used to implement String#downcase
and
+Symbol#downcase
for ASCII strings in Ruby.
To lowercase the value in-place, use make_ascii_lowercase
. This function
+is analogous to <[u8]>::to_ascii_lowercase
.
assert_eq!(to_ascii_lowercase("ABCxyz"), &b"abcxyz"[..]);
+assert_eq!(to_ascii_lowercase("1234%&*"), &b"1234%&*"[..]);
pub fn to_ascii_titlecase<T: AsRef<[u8]>>(slice: T) -> Vec<u8>
alloc
only.Returns a vector containing a copy of the given slice where each byte is +mapped to its ASCII title case equivalent.
+ASCII letters ‘a’ to ‘z’ are mapped to ‘A’ to ‘Z’ in the first byte; +subsequent bytes with ASCII letters ‘A’ to ‘Z’ are mapped to ‘a’ to ‘z’; +non-ASCII letters are unchanged.
+This function can be used to implement String#capitalize
and
+Symbol#capitalize
for ASCII strings in Ruby.
To titlecase the value in-place, use make_ascii_titlecase
.
assert_eq!(to_ascii_titlecase("ABCxyz"), &b"Abcxyz"[..]);
+assert_eq!(to_ascii_titlecase("1234%&*"), &b"1234%&*"[..]);
+assert_eq!(to_ascii_titlecase("ABC1234%&*"), &b"Abc1234%&*"[..]);
+assert_eq!(to_ascii_titlecase("1234%&*abcXYZ"), &b"1234%&*abcxyz"[..]);
+assert_eq!(to_ascii_titlecase("ABC, XYZ"), &b"Abc, xyz"[..]);
pub fn to_ascii_uppercase<T: AsRef<[u8]>>(slice: T) -> Vec<u8>
alloc
only.Returns a vector containing a copy of the given slice where each byte is +mapped to its ASCII upper case equivalent.
+ASCII letters ‘a’ to ‘z’ are mapped to ‘A’ to ‘Z’, but non-ASCII letters are +unchanged.
+This function can be used to implement String#upcase
and
+Symbol#upcase
for ASCII strings in Ruby.
To uppercase the value in-place, use make_ascii_uppercase
. This function
+is analogous to <[u8]>::to_ascii_uppercase
.
assert_eq!(to_ascii_uppercase("ABCxyz"), &b"ABCXYZ"[..]);
+assert_eq!(to_ascii_uppercase("1234%&*"), &b"1234%&*"[..]);
pub fn uppercase(slice: &[u8], options: UppercaseMode) -> Uppercase<'_> ⓘ
Returns an iterator that yields a copy of the bytes in the given slice with +all lowercase letters replaced with their uppercase counterparts.
+This function treats the given slice as a conventionally UTF-8 string. +UTF-8 byte sequences are converted to their Unicode uppercase equivalents. +Invalid UTF-8 byte sequences are yielded as is.
+The case mapping mode is determined by the given UppercaseMode
. See its
+documentation for details on the available case mapping modes.
Not all UppercaseMode
s are currently implemented. This function will
+panic if the caller supplies Turkic uppercasing mode.
This crate provides Unicode case mapping routines and iterators for +conventionally UTF-8 binary strings.
+Unicode case mapping or case conversion can be used to transform the +characters in a string. To quote the Unicode FAQ:
+++Case mapping or case conversion is a process whereby strings are converted +to a particular form—uppercase, lowercase, or titlecase—possibly for +display to the user.
+
This crate is currently a work in progress. When the API is complete, Roe +will support lowercase, uppercase, titlecase, and case folding iterators for +conventionally UTF-8 byte slices.
+Roe will implement support for full, Turkic, ASCII, and case folding +transforms.
+You can convert case like:
+ +assert_eq!(
+ roe::lowercase(b"Artichoke Ruby", LowercaseMode::Ascii).collect::<Vec<_>>(),
+ b"artichoke ruby"
+);
+assert_eq!(
+ roe::uppercase("Αύριο".as_bytes(), UppercaseMode::Full).collect::<Vec<_>>(),
+ "ΑΎΡΙΟ".as_bytes()
+);
Roe provides fast path routines that assume the byte slice is ASCII-only.
+Roe is no_std
compatible with an optional dependency on the alloc
+crate.
Roe has several Cargo features, all of which are enabled by default:
+std
, the Rust Standard Library. This
+feature enables std::error::Error
implementations on error types in
+this crate. Enabling the std feature also enables the alloc
+feature.alloc
, the Rust allocation and
+collections library. This feature enables APIs that allocate String
or
+Vec
.LowercaseMode
or
+UppercaseMode
.alloc
alloc
alloc
Redirecting to ../../roe/struct.Lowercase.html...
+ + + \ No newline at end of file diff --git a/roe/sidebar-items.js b/roe/sidebar-items.js new file mode 100644 index 000000000..fb522411d --- /dev/null +++ b/roe/sidebar-items.js @@ -0,0 +1 @@ +window.SIDEBAR_ITEMS = {"enum":["LowercaseMode","UppercaseMode"],"fn":["lowercase","make_ascii_lowercase","make_ascii_titlecase","make_ascii_uppercase","to_ascii_lowercase","to_ascii_titlecase","to_ascii_uppercase","uppercase"],"struct":["InvalidCaseMappingMode","Lowercase","Uppercase"]}; \ No newline at end of file diff --git a/roe/struct.InvalidCaseMappingMode.html b/roe/struct.InvalidCaseMappingMode.html new file mode 100644 index 000000000..51e2d63f9 --- /dev/null +++ b/roe/struct.InvalidCaseMappingMode.html @@ -0,0 +1,38 @@ +pub struct InvalidCaseMappingMode { /* private fields */ }
Error that indicates a failure to parse a LowercaseMode
or
+UppercaseMode
.
This error corresponds to the Ruby ArgumentError
Exception class.
let err = InvalidCaseMappingMode::new();
+assert_eq!(err.message(), "invalid option");
+
+let mode: Result<LowercaseMode, InvalidCaseMappingMode> = "full".try_into();
source
. Read moreself
and other
values to be equal, and is used
+by ==
.self
and other
) and is used by the <=
+operator. Read morepub struct Lowercase<'a> { /* private fields */ }
Create a new, empty lowercase iterator.
+let mut lowercase = Lowercase::new();
+assert_eq!(lowercase.next(), None);
Create a new lowercase iterator with the given byte slice using full Unicode case mapping.
+let mut lowercase = Lowercase::with_slice(b"abcXYZ");
+assert_eq!(lowercase.next(), Some(b'a'));
+assert_eq!(lowercase.next(), Some(b'b'));
+assert_eq!(lowercase.next(), Some(b'c'));
+assert_eq!(lowercase.next(), Some(b'x'));
+assert_eq!(lowercase.next(), Some(b'y'));
+assert_eq!(lowercase.next(), Some(b'z'));
+assert_eq!(lowercase.next(), None);
Non-ASCII characters are case mapped:
+ +let lowercase = Lowercase::with_slice("Αύριο".as_bytes());
+assert_eq!(lowercase.collect::<Vec<_>>(), "αύριο".as_bytes());
Invalid UTF-8 bytes are yielded as is without impacting Unicode characters:
+ +let mut s = "Αύριο".to_string().into_bytes();
+s.extend(b"\xFF\xFE");
+let lowercase = Lowercase::with_slice(s.as_slice());
+
+let mut expected = "αύριο".to_string().into_bytes();
+expected.extend(b"\xFF\xFE");
+assert_eq!(lowercase.collect::<Vec<_>>(), expected);
Create a new lowercase iterator with the given byte slice using ASCII case mapping.
+let mut lowercase = Lowercase::with_ascii_slice(b"abcXYZ");
+assert_eq!(lowercase.next(), Some(b'a'));
+assert_eq!(lowercase.next(), Some(b'b'));
+assert_eq!(lowercase.next(), Some(b'c'));
+assert_eq!(lowercase.next(), Some(b'x'));
+assert_eq!(lowercase.next(), Some(b'y'));
+assert_eq!(lowercase.next(), Some(b'z'));
+assert_eq!(lowercase.next(), None);
Non-ASCII characters are left unchanged:
+ +let lowercase = Lowercase::with_ascii_slice("Αύριο".as_bytes());
+assert_eq!(lowercase.collect::<Vec<_>>(), "Αύριο".as_bytes());
Invalid UTF-8 bytes are yielded as is without impacting ASCII bytes:
+ +let lowercase = Lowercase::with_ascii_slice(b"abc\xFF\xFEXYZ");
+assert_eq!(lowercase.collect::<Vec<_>>(), b"abc\xFF\xFExyz");
iter_next_chunk
)N
values. Read moreiter_advance_by
)n
elements. Read moren
th element of the iterator. Read moreiter_intersperse
)separator
+between adjacent items of the original iterator. Read moren
elements. Read moren
elements, or fewer
+if the underlying iterator ends sooner. Read moreiter_map_windows
)f
for each contiguous window of size N
over
+self
and returns an iterator over the outputs of f
. Like slice::windows()
,
+the windows during mapping overlap as well. Read moreiter_collect_into
)iter_is_partitioned
)true
precede all those that return false
. Read moreiterator_try_reduce
)try_find
)iter_array_chunks
)N
elements of the iterator at a time. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read morePartialOrd
elements of
+this Iterator
with those of another. The comparison works like short-circuit
+evaluation, returning a result without comparing the remaining elements.
+As soon as an order can be determined, the evaluation stops and a result is returned. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read moreiter_order_by
)Iterator
are lexicographically
+less than those of another. Read moreIterator
are lexicographically
+less or equal to those of another. Read moreIterator
are lexicographically
+greater than those of another. Read moreIterator
are lexicographically
+greater than or equal to those of another. Read moreis_sorted
)is_sorted
)pub struct Uppercase<'a> { /* private fields */ }
Create a new, empty uppercase iterator.
+let mut uppercase = Uppercase::new();
+assert_eq!(uppercase.next(), None);
Create a new uppercase iterator with the given byte slice using full Unicode case mapping.
+let mut uppercase = Uppercase::with_slice(b"abcXYZ");
+assert_eq!(uppercase.next(), Some(b'A'));
+assert_eq!(uppercase.next(), Some(b'B'));
+assert_eq!(uppercase.next(), Some(b'C'));
+assert_eq!(uppercase.next(), Some(b'X'));
+assert_eq!(uppercase.next(), Some(b'Y'));
+assert_eq!(uppercase.next(), Some(b'Z'));
+assert_eq!(uppercase.next(), None);
Non-ASCII characters are case mapped:
+ +let uppercase = Uppercase::with_slice("Αύριο".as_bytes());
+assert_eq!(uppercase.collect::<Vec<_>>(), "ΑΎΡΙΟ".as_bytes());
Invalid UTF-8 bytes are yielded as is without impacting Unicode characters:
+ +let mut s = "Αύριο".to_string().into_bytes();
+s.extend(b"\xFF\xFE");
+let uppercase = Uppercase::with_slice(s.as_slice());
+
+let mut expected = "ΑΎΡΙΟ".to_string().into_bytes();
+expected.extend(b"\xFF\xFE");
+assert_eq!(uppercase.collect::<Vec<_>>(), expected);
Create a new uppercase iterator with the given byte slice using ASCII case mapping.
+let mut uppercase = Uppercase::with_ascii_slice(b"abcXYZ");
+assert_eq!(uppercase.next(), Some(b'A'));
+assert_eq!(uppercase.next(), Some(b'B'));
+assert_eq!(uppercase.next(), Some(b'C'));
+assert_eq!(uppercase.next(), Some(b'X'));
+assert_eq!(uppercase.next(), Some(b'Y'));
+assert_eq!(uppercase.next(), Some(b'Z'));
+assert_eq!(uppercase.next(), None);
Non-ASCII characters are left unchanged:
+ +let uppercase = Uppercase::with_ascii_slice("Αύριο".as_bytes());
+assert_eq!(uppercase.collect::<Vec<_>>(), "Αύριο".as_bytes());
Invalid UTF-8 bytes are yielded as is without impacting ASCII bytes:
+ +let uppercase = Uppercase::with_ascii_slice(b"abc\xFF\xFEXYZ");
+assert_eq!(uppercase.collect::<Vec<_>>(), b"ABC\xFF\xFEXYZ");
iter_next_chunk
)N
values. Read moreiter_advance_by
)n
elements. Read moren
th element of the iterator. Read moreiter_intersperse
)separator
+between adjacent items of the original iterator. Read moren
elements. Read moren
elements, or fewer
+if the underlying iterator ends sooner. Read moreiter_map_windows
)f
for each contiguous window of size N
over
+self
and returns an iterator over the outputs of f
. Like slice::windows()
,
+the windows during mapping overlap as well. Read moreiter_collect_into
)iter_is_partitioned
)true
precede all those that return false
. Read moreiterator_try_reduce
)try_find
)iter_array_chunks
)N
elements of the iterator at a time. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read morePartialOrd
elements of
+this Iterator
with those of another. The comparison works like short-circuit
+evaluation, returning a result without comparing the remaining elements.
+As soon as an order can be determined, the evaluation stops and a result is returned. Read moreiter_order_by
)Iterator
with those
+of another with respect to the specified comparison function. Read moreiter_order_by
)Iterator
are lexicographically
+less than those of another. Read moreIterator
are lexicographically
+less or equal to those of another. Read moreIterator
are lexicographically
+greater than those of another. Read moreIterator
are lexicographically
+greater than or equal to those of another. Read moreis_sorted
)is_sorted
)Redirecting to ../../roe/struct.Uppercase.html...
+ + + \ No newline at end of file diff --git a/search-index.js b/search-index.js new file mode 100644 index 000000000..5a2a50163 --- /dev/null +++ b/search-index.js @@ -0,0 +1,7 @@ +var searchIndex = JSON.parse('{\ +"bstr":{"doc":"A byte string library.","t":"FDIDDDDDDDDDDDDDDDDDDLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLFFLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL","n":["B","BStr","ByteSlice","Bytes","CharIndices","Chars","EscapeBytes","FieldsWith","Find","FindReverse","Finder","FinderReverse","Lines","LinesWithTerminator","Split","SplitN","SplitNReverse","SplitReverse","Utf8Chunk","Utf8Chunks","Utf8Error","as_bstr","as_bstr","as_bstr_mut","as_bstr_mut","as_bytes","as_bytes","as_bytes","as_bytes","as_bytes","as_mut","as_ref","as_ref","borrow","borrow","borrow","borrow","borrow","borrow","borrow","borrow","borrow","borrow","borrow","borrow","borrow","borrow","borrow","borrow","borrow","borrow","borrow","borrow","borrow_mut","borrow_mut","borrow_mut","borrow_mut","borrow_mut","borrow_mut","borrow_mut","borrow_mut","borrow_mut","borrow_mut","borrow_mut","borrow_mut","borrow_mut","borrow_mut","borrow_mut","borrow_mut","borrow_mut","borrow_mut","borrow_mut","borrow_mut","bytes","bytes","char_indices","char_indices","chars","chars","clone","clone","clone","clone","clone","clone","clone","clone","clone","clone","clone_into","clone_into","clone_into","clone_into","clone_into","clone_into","clone_into","clone_into","clone_into","clone_into","cmp","contains_str","contains_str","decode_last_utf8","decode_utf8","default","default","deref","deref_mut","ends_with_str","ends_with_str","eq","eq","eq","eq","eq","eq","error_len","escape_bytes","escape_bytes","fields_with","fields_with","find","find","find","f
ind_byte","find_byte","find_byteset","find_byteset","find_char","find_char","find_iter","find_iter","find_non_ascii_byte","find_non_ascii_byte","find_not_byteset","find_not_byteset","fmt","fmt","fmt","fmt","fmt","fmt","fmt","fmt","fmt","fmt","fmt","fmt","fmt","fmt","fmt","fmt","fmt","fmt","fmt","fmt","fmt","from","from","from","from","from","from","from","from","from","from","from","from","from","from","from","from","from","from","from","from","from","hash","incomplete","index","index","index","index","index","index","index","index_mut","index_mut","index_mut","index_mut","index_mut","index_mut","index_mut","into","into","into","into","into","into","into","into","into","into","into","into","into","into","into","into","into","into","into_iter","into_iter","into_iter","into_iter","into_iter","into_iter","into_iter","into_iter","into_iter","into_iter","into_iter","into_iter","into_iter","into_iter","invalid","is_ascii","is_ascii","is_utf8","is_utf8","last_byte","last_byte","len","lines","lines","lines_with_terminator","lines_with_terminator","make_ascii_lowercase","make_ascii_lowercase","make_ascii_uppercase","make_ascii_uppercase","needle","needle","new","new","new","next","next","next","next","next","next","next","next","next","next","next","next","next","next","next_back","next_back","next_back","next_back","next_back","partial_cmp","partial_cmp","partial_cmp","partial_cmp","partial_cmp","reverse_bytes","reverse_bytes","reverse_chars","reverse_chars","rfind","rfind","rfind","rfind_byte","rfind_byte","rfind_byteset","rfind_byteset","rfind_char","rfind_char","rfind_iter","rfind_iter","rfind_not_byteset","rfind_not_byteset","rsplit_once_str","rsplit_once_str","rsplit_str","rsplit_str","rsplitn_str","rsplitn_str","size_hint","size_hint","split_once_str","split_once_str","split_str","split_str","splitn_str","splitn_str","starts_with_str","starts_with_str","to_owned","to_owned","to_owned","to_owned","to_owned","to_owned","to_owned","to_owned","to_owned","to_owned","to_str
","to_str","to_str_unchecked","to_str_unchecked","to_string","to_string","to_string","trim_end_with","trim_end_with","trim_start_with","trim_start_with","trim_with","trim_with","try_from","try_from","try_from","try_from","try_from","try_from","try_from","try_from","try_from","try_from","try_from","try_from","try_from","try_from","try_from","try_from","try_from","try_from","try_into","try_into","try_into","try_into","try_into","try_into","try_into","try_into","try_into","try_into","try_into","try_into","try_into","try_into","try_into","try_into","try_into","try_into","type_id","type_id","type_id","type_id","type_id","type_id","type_id","type_id","type_id","type_id","type_id","type_id","type_id","type_id","type_id","type_id","type_id","type_id","type_id","utf8_chunks","utf8_chunks","valid","valid_up_to"],"q":[[0,"bstr"],[386,"core::marker"],[387,"core::convert"],[388,"core::cmp"],[389,"core::option"],[390,"core::ops::function"],[391,"core::fmt"],[392,"core::fmt"],[393,"core::ops::range"],[394,"core::ops::range"],[395,"alloc::string"],[396,"core::any"]],"d":["A short-hand constructor for building a&[u8]
.","A wrapper for &[u8]
that provides convenient string …","A trait that extends &[u8]
with string oriented methods.","An iterator over the bytes in a byte string.","An iterator over Unicode scalar values in a byte string …","An iterator over Unicode scalar values in a byte string.","An iterator of char
values that represent an escaping of …","An iterator over fields in the byte string, separated by a …","An iterator over non-overlapping substring matches.","An iterator over non-overlapping substring matches in …","A single substring searcher fixed to a particular needle.","A single substring reverse searcher fixed to a particular …","An iterator over all lines in a byte string, without their …","An iterator over all lines in a byte string, including …","An iterator over substrings in a byte string, split by a …","An iterator over at most n
substrings in a byte string, …","An iterator over at most n
substrings in a byte string, …","An iterator over substrings in a byte string, split by a …","A chunk of valid UTF-8, possibly followed by invalid UTF-8 …","An iterator over chunks of valid UTF-8 in a byte slice.","An error that occurs when UTF-8 decoding fails.","Return this byte slice as a &BStr
.","Return this byte slice as a &BStr
.","Return this byte slice as a &mut BStr
.","Return this byte slice as a &mut BStr
.","Views the remaining underlying data as a subslice of the …","Return a copy of the rest of the underlying bytes without …","Return a copy of the rest of the underlying bytes without …","View the underlying data as a subslice of the original …","View the underlying data as a subslice of the original …","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","Returns an iterator over the bytes in this byte string.","Returns an iterator over the bytes in this byte string.","Returns an iterator over the Unicode scalar values in this …","Returns an iterator over the Unicode scalar values in this …","Returns an iterator over the Unicode scalar values in this …","Returns an iterator over the Unicode scalar values in this …","","","","","","","","","","","","","","","","","","","","","","Returns true if and only if this byte string contains the …","Returns true if and only if this byte string contains the …","UTF-8 decode a single Unicode scalar value from the end of …","UTF-8 decode a single Unicode scalar value from the …","","","","","Returns true if and only if this byte string has the given …","Returns true if and only if this byte string has the given …","","","","","","","Returns the total number of invalid UTF-8 bytes …","Escapes this byte string into a sequence of char
values.","Escapes this byte string into a sequence of char
values.","Returns an iterator over the fields in a byte string, …","Returns an iterator over the fields in a byte string, …","Returns the index of the first occurrence of the given …","Returns the index of the first occurrence of the given …","Returns the index of the first occurrence of this needle …","Returns the index of the first occurrence of the given …","Returns the index of the first occurrence of the given …","Returns the index of the first occurrence of any of the …","Returns the index of the first occurrence of any of the …","Returns the index of the first occurrence of the given …","Returns the index of the first occurrence of the given …","Returns an iterator of the non-overlapping occurrences of …","Returns an iterator of the non-overlapping occurrences of …","Returns the index of the first non-ASCII byte in this byte …","Returns the index of the first non-ASCII byte in this byte …","Returns the index of the first occurrence of a byte that …","Returns the index of the first occurrence of a byte that …","","","","","","","","","","","","","","","","","","","","","","Returns the argument unchanged.","","","","Returns the argument unchanged.","Returns the argument unchanged.","Returns the argument unchanged.","Returns the argument unchanged.","Returns the argument unchanged.","Returns the argument unchanged.","Returns the argument unchanged.","Returns the argument unchanged.","Returns the argument unchanged.","Returns the argument unchanged.","Returns the argument unchanged.","Returns the argument unchanged.","Returns the argument unchanged.","Returns the argument unchanged.","Returns the argument unchanged.","Returns the argument unchanged.","Returns the argument unchanged.","","Returns whether the invalid sequence might still become …","","","","","","","","","","","","","","","Calls U::from(self)
.","Calls U::from(self)
.","Calls U::from(self)
.","Calls U::from(self)
.","Calls U::from(self)
.","Calls U::from(self)
.","Calls U::from(self)
.","Calls U::from(self)
.","Calls U::from(self)
.","Calls U::from(self)
.","Calls U::from(self)
.","Calls U::from(self)
.","Calls U::from(self)
.","Calls U::from(self)
.","Calls U::from(self)
.","Calls U::from(self)
.","Calls U::from(self)
.","Calls U::from(self)
.","","","","","","","","","","","","","","","Returns the (possibly empty) invalid UTF-8 bytes in this …","Returns true if and only if every byte in this byte string …","Returns true if and only if every byte in this byte string …","Returns true if and only if the entire byte string is …","Returns true if and only if the entire byte string is …","Returns the last byte in this byte string, if it’s …","Returns the last byte in this byte string, if it’s …","","An iterator over all lines in a byte string, without their …","An iterator over all lines in a byte string, without their …","An iterator over all lines in a byte string, including …","An iterator over all lines in a byte string, including …","Convert this byte string to its lowercase ASCII equivalent …","Convert this byte string to its lowercase ASCII equivalent …","Convert this byte string to its uppercase ASCII equivalent …","Convert this byte string to its uppercase ASCII equivalent …","Returns the needle that this finder searches for.","Returns the needle that this finder searches for.","Directly creates a BStr
slice from anything that can be …","Create a new finder for the given needle.","Create a new reverse finder for the given needle.","","","","","","","","","","","","","","","","","","","","","","","","","Reverse the bytes in this string, in place.","Reverse the bytes in this string, in place.","Reverse the codepoints in this string, in place.","Reverse the codepoints in this string, in place.","Returns the index of the last occurrence of the given …","Returns the index of the last occurrence of the given …","Returns the index of the last occurrence of this needle in …","Returns the index of the last occurrence of the given …","Returns the index of the last occurrence of the given …","Returns the index of the last occurrence of any of the …","Returns the index of the last occurrence of any of the …","Returns the index of the last occurrence of the given …","Returns the index of the last occurrence of the given …","Returns an iterator of the non-overlapping occurrences of …","Returns an iterator of the non-overlapping occurrences of …","Returns the index of the last occurrence of a byte that is …","Returns the index of the last occurrence of a byte that is …","Split this byte string at the last occurrence of splitter
.","Split this byte string at the last occurrence of splitter
.","Returns an iterator over substrings of this byte string, …","Returns an iterator over substrings of this byte string, …","Returns an iterator of at most limit
substrings of this …","Returns an iterator of at most limit
substrings of this …","","","Split this byte string at the first occurrence of splitter
.","Split this byte string at the first occurrence of splitter
.","Returns an iterator over substrings of this byte string, …","Returns an iterator over substrings of this byte string, …","Returns an iterator of at most limit
substrings of this …","Returns an iterator of at most limit
substrings of this …","Returns true if and only if this byte string has the given …","Returns true if and only if this byte string has the given …","","","","","","","","","","","Safely convert this byte string into a &str
if it’s …","Safely convert this byte string into a &str
if it’s …","Unsafely convert this byte string into a &str
, without …","Unsafely convert this byte string into a &str
, without …","","","","Return a byte string slice with trailing characters …","Return a byte string slice with trailing characters …","Return a byte string slice with leading characters …","Return a byte string slice with leading characters …","Return a byte string slice with leading and trailing …","Return a byte string slice with leading and trailing …","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","Iterate over chunks of valid UTF-8.","Iterate over chunks of valid UTF-8.","Returns the (possibly empty) valid UTF-8 bytes in this …","Returns the byte index of the position immediately …"],"i":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,46,46,46,46,6,7,8,9,10,5,5,5,36,5,5,11,12,13,25,28,6,23,30,31,32,33,7,8,9,10,14,15,36,5,5,11,12,13,25,28,6,23,30,31,32,33,7,8,9,10,14,15,46,46,46,46,46,46,11,12,13,6,7,8,9,10,14,15,11,12,13,6,7,8,9,10,14,15,5,46,46,0,0,5,5,5,5,46,46,5,5,5,5,5,15,15,46,46,46,46,46,46,12,46,46,46,46,46,46,46,46,46,46,46,46,5,5,11,11,12,13,25,28,6,23,30,31,32,33,7,8,9,10,14,15,15,36,5,5,5,11,12,13,25,28,6,23,30,31,32,33,7,8,9,10,14,15,5,36,5,5,5,5,5,5,5,5,5,5,5,5,5,5,36,11,12,13,25,28,6,23,30,31,32,33,7,8,9,10,14,15,11,25,28,6,23,30,31,32,33,7,8,9,10,14,36,46,46,46,46,46,46,6,46,46,46,46,46,46,46,46,12,13,5,12,13,11,25,28,6,23,30,31,32,33,7,8,9,10,14,6,7,8,9,10,5,5,5,5,5,46,46,46,46,46,46,13,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,6,14,46,46,46,46,46,46,46,46,11,12,13,6,7,8,9,10,14,15,46,46,46,46,5,11,15,46,46,46,46,46,46,36,11,12,13,25,28,6,23,30,31,32,33,7,8,9,10,14,15,36,11,12,13,25,28,6,23,30,31,32,33,7,8,9,10,14,15,36,5,11,12,13,25,28,6,23,30,31,32,33,7,8,9,10,14,15,46,46,36,15],"f":[[-1,[[2,[1]]],[3,[4,[[2,[1]]]]]],0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,[-1,5,[]],[-1,5,[]],[-1,5,[]],[-1,5,[]],[6,[[2,[1]]]],[7,[[2,[1]]]],[8,[[2,[1]]]],[9,[[2,[1]]]],[10,[[2,[1]]]],[5,[[2,[1]]]],[5,[[2,[1]]]],[5,5],[-1,-2,[],[]],[5,[[2,[1]]]],[
-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[5,[[2,[1]]]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,6,[]],[-1,6,[]],[-1,10,[]],[-1,10,[]],[-1,9,[]],[-1,9,[]],[11,11],[12,12],[13,13],[6,6],[7,7],[8,8],[9,9],[10,10],[14,14],[15,15],[[-1,-2],16,[],[]],[[-1,-2],16,[],[]],[[-1,-2],16,[],[]],[[-1,-2],16,[],[]],[[-1,-2],16,[],[]],[[-1,-2],16,[],[]],[[-1,-2],16,[],[]],[[-1,-2],16,[],[]],[[-1,-2],16,[],[]],[[-1,-2],16,[],[]],[[5,5],17],[[-1,-2],18,[],[[4,[[2,[1]]]]]],[[-1,-2],18,[],[[4,[[2,[1]]]]]],[-1,[[16,[[20,[19]],21]]],[[4,[[2,[1]]]]]],[-1,[[16,[[20,[19]],21]]],[[4,[[2,[1]]]]]],[[],5],[[],5],[5,[[2,[1]]]],[5,[[2,[1]]]],[[-1,-2],18,[],[[4,[[2,[1]]]]]],[[-1,-2],18,[],[[4,[[2,[1]]]]]],[[5,5],18],[[5,22],18],[[5,22],18],[[5,[2,[1]]],18],[[5,[2,[1]]],18],[[15,15],18],[15,[[20,[21]]]],[-1,11,[]],[-1,11,[]],[[-1,-2],[[23,[-2]]],[],24],[[-1,-2],[[23,[-2]]],[],24],[[-1,-2],[[20,[21]]],[],[[4,[[2,[1]]]]]],[[-1,-2],[[20,[21]]],[],[[4,[[2,[1]]]]]],[[12,-1],[[20,[21]]],[[4,[[2,[1]]]]]],[[-1,1],[[20,[21]]],[]],[[-1,1],[[20,[21]]],[]],[[-1,-2],[[20,[21]]],[],[[4,[[2,[1]]]]]],[[-1,-2],[[20,[21]]],[],[[4,[[2,[1]]]]]],[[-1,19],[[20,[21]]],[]],[[-1,19],[[20,[21]]],[]],[[-1,-2],25,[],[3,[4,[[2,[1]]]]]],[[-1,-2],25,[],[3,[4,[[2,[1]]]]]],[-1,[[20,[21]]],[]],[-1,[[20,[21]]],[]],[[-1,-2],[[20,[21]]],[],[[4,[[2,[1]]]]]],[[-1,-2],[[20,[21]]],[],[[4,[[2,[1]]]]]],[[5,26],27],[[5,26],27],[[11,26],27],[[11,26],27],[[12,26],27],[[13,26],27],[[25,26],27],[[28,26],27],[[6,26],27],[[[23,[-1]],26],27,29],[[30,26],27],[[31,26],27],[[32,26],27],[[33,26],27],[[7,26],27],[[8,26],27
],[[9,26],27],[[10,26],27],[[14,26],27],[[15,26],27],[[15,26],27],[-1,-1,[]],[[[2,[1]]],5],[[[34,[1]]],5],[22,5],[-1,-1,[]],[-1,-1,[]],[-1,-1,[]],[-1,-1,[]],[-1,-1,[]],[-1,-1,[]],[-1,-1,[]],[-1,-1,[]],[-1,-1,[]],[-1,-1,[]],[-1,-1,[]],[-1,-1,[]],[-1,-1,[]],[-1,-1,[]],[-1,-1,[]],[-1,-1,[]],[-1,-1,[]],[[5,-1],16,35],[36,18],[[5,[37,[21]]],5],[[5,[38,[21]]],5],[[5,39],5],[[5,[40,[21]]],5],[[5,21],1],[[5,[41,[21]]],5],[[5,[42,[21]]],5],[[5,[42,[21]]],5],[[5,[37,[21]]],5],[[5,39],5],[[5,[41,[21]]],5],[[5,21],1],[[5,[38,[21]]],5],[[5,[40,[21]]],5],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[36,[[2,[1]]]],[-1,18,[]],[-1,18,[]],[-1,18,[]],[-1,18,[]],[-1,[[20,[1]]],[]],[-1,[[20,[1]]],[]],[6,21],[-1,7,[]],[-1,7,[]],[-1,8,[]],[-1,8,[]],[-1,16,[]],[-1,16,[]],[-1,16,[]],[-1,16,[]],[12,[[2,[1]]]],[13,[[2,[1]]]],[-1,5,[3,[4,[[2,[1]]]]]],[-1,12,[3,[4,[[2,[1]]]]]],[-1,13,[3,[4,[[2,[1]]]]]],[11,[[20,[19]]]],[25,[[20,[21]]]],[28,[[20,[21]]]],[6,[[20,[1]]]],[[[23,[-1]]],[[20,[[2,[1]]]]],24],[30,[[20,[[2,[1]]]]]],[31,[[20,[[2,[1]]]]]],[32,[[20,[[2,[1]]]]]],[33,[[20,[[2,[1]]]]]],[7,[[20,[[2,[1]]]]]],[8,[[20,[[2,[1]]]]]],[9,[[20,[19]]]],[10,[[20,[[16,[21,21,19]]]]]],[14,[[20,[36]]]],[6,[[20,[1]]]],[7,20],[8,20],[9,[[20,[19]]]],[10,[[20,[[16,[21,21,19]]]]]],[[5,[2,[1]]],[[20,[17]]]],[[5,5],[[20,[17]]]],[[5,22],[[20,[17]]]],[[5,[2,[1]]],[[20,[17]]]],[[5,22],[[20,[17]]]],[-1,16,[]],[-1,16,[]],[-1,16,[]],[-1,16,[]],[[-1,-2],[[20,[21]]],[],[[4,[[2,[1]]]]]],[[-1,-2],[[20,[21]]],[],[[4,[[2,[1]]]]]],[[13,-1],[[20,[21]]],[[4,[[2,[1]]]]]],[[-1,1],[[20,[21]]],[]],[[-1,1],[[20,[21]]],[]],[[-1,-2],[[20,[2
1]]],[],[[4,[[2,[1]]]]]],[[-1,-2],[[20,[21]]],[],[[4,[[2,[1]]]]]],[[-1,19],[[20,[21]]],[]],[[-1,19],[[20,[21]]],[]],[[-1,-2],28,[],[3,[4,[[2,[1]]]]]],[[-1,-2],28,[],[3,[4,[[2,[1]]]]]],[[-1,-2],[[20,[21]]],[],[[4,[[2,[1]]]]]],[[-1,-2],[[20,[21]]],[],[[4,[[2,[1]]]]]],[[-1,-2],[[20,[[16,[[2,[1]],[2,[1]]]]]]],[],[3,[4,[[2,[1]]]]]],[[-1,-2],[[20,[[16,[[2,[1]],[2,[1]]]]]]],[],[3,[4,[[2,[1]]]]]],[[-1,-2],31,[],[3,[4,[[2,[1]]]]]],[[-1,-2],31,[],[3,[4,[[2,[1]]]]]],[[-1,21,-2],33,[],[3,[4,[[2,[1]]]]]],[[-1,21,-2],33,[],[3,[4,[[2,[1]]]]]],[6,[[16,[21,[20,[21]]]]]],[14,[[16,[21,[20,[21]]]]]],[[-1,-2],[[20,[[16,[[2,[1]],[2,[1]]]]]]],[],[3,[4,[[2,[1]]]]]],[[-1,-2],[[20,[[16,[[2,[1]],[2,[1]]]]]]],[],[3,[4,[[2,[1]]]]]],[[-1,-2],30,[],[3,[4,[[2,[1]]]]]],[[-1,-2],30,[],[3,[4,[[2,[1]]]]]],[[-1,21,-2],32,[],[3,[4,[[2,[1]]]]]],[[-1,21,-2],32,[],[3,[4,[[2,[1]]]]]],[[-1,-2],18,[],[[4,[[2,[1]]]]]],[[-1,-2],18,[],[[4,[[2,[1]]]]]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,[[43,[22,15]]],[]],[-1,[[43,[22,15]]],[]],[-1,22,[]],[-1,22,[]],[-1,44,[]],[-1,44,[]],[-1,44,[]],[[-1,-2],[[2,[1]]],[],24],[[-1,-2],[[2,[1]]],[],24],[[-1,-2],[[2,[1]]],[],24],[[-1,-2],[[2,[1]]],[],24],[[-1,-2],[[2,[1]]],[],24],[[-1,-2],[[2,[1]]],[],24],[-1,[[43,[-2]]],[],[]],[-1,[[43,[-2]]],[],[]],[-1,[[43,[-2]]],[],[]],[-1,[[43,[-2]]],[],[]],[-1,[[43,[-2]]],[],[]],[-1,[[43,[-2]]],[],[]],[-1,[[43,[-2]]],[],[]],[-1,[[43,[-2]]],[],[]],[-1,[[43,[-2]]],[],[]],[-1,[[43,[-2]]],[],[]],[-1,[[43,[-2]]],[],[]],[-1,[[43,[-2]]],[],[]],[-1,[[43,[-2]]],[],[]],[-1,[[43,[-2]]],[],[]],[-1,[[43,[-2]]],[],[]],[-1,[[43,[-2]]],[],[]],[-1,[[43,[-2]]],[],[]],[-1,[[43,[-2]]],[],[]],[-1,[[43,[-2]]],[],[]],[-1,[[43,[-2]]],[],[]],[-1,[[43,[-2]]],[],[]],[-1,[[43,[-2]]],[],[]],[-1,[[43,[-2]]],[],[]],[-1,[[43,[-2]]],[],[]],[-1,[[43,[-2]]],[],[]],[-1,[[43,[-2]]],[],[]],[-1,[[43,[-2]]],[],[]],[-1,[[43,[-2]]],[],[]],[-1,[[43,[-2]]],[],[]],[-1,[[43,[-2]]],[
],[]],[-1,[[43,[-2]]],[],[]],[-1,[[43,[-2]]],[],[]],[-1,[[43,[-2]]],[],[]],[-1,[[43,[-2]]],[],[]],[-1,[[43,[-2]]],[],[]],[-1,[[43,[-2]]],[],[]],[-1,45,[]],[-1,45,[]],[-1,45,[]],[-1,45,[]],[-1,45,[]],[-1,45,[]],[-1,45,[]],[-1,45,[]],[-1,45,[]],[-1,45,[]],[-1,45,[]],[-1,45,[]],[-1,45,[]],[-1,45,[]],[-1,45,[]],[-1,45,[]],[-1,45,[]],[-1,45,[]],[-1,45,[]],[-1,14,[]],[-1,14,[]],[36,22],[15,21]],"c":[],"p":[[15,"u8"],[15,"slice"],[8,"Sized",386],[8,"AsRef",387],[3,"BStr",0],[3,"Bytes",0],[3,"Lines",0],[3,"LinesWithTerminator",0],[3,"Chars",0],[3,"CharIndices",0],[3,"EscapeBytes",0],[3,"Finder",0],[3,"FinderReverse",0],[3,"Utf8Chunks",0],[3,"Utf8Error",0],[15,"tuple"],[4,"Ordering",388],[15,"bool"],[15,"char"],[4,"Option",389],[15,"usize"],[15,"str"],[3,"FieldsWith",0],[8,"FnMut",390],[3,"Find",0],[3,"Formatter",391],[6,"Result",391],[3,"FindReverse",0],[8,"Debug",391],[3,"Split",0],[3,"SplitReverse",0],[3,"SplitN",0],[3,"SplitNReverse",0],[15,"array"],[8,"Hasher",392],[3,"Utf8Chunk",0],[3,"Range",393],[3,"RangeTo",393],[3,"RangeFull",393],[3,"RangeFrom",393],[3,"RangeInclusive",393],[3,"RangeToInclusive",393],[4,"Result",394],[3,"String",395],[3,"TypeId",396],[8,"ByteSlice",0]],"b":[[31,"impl-AsRef%3C%5Bu8%5D%3E-for-BStr"],[32,"impl-AsRef%3CBStr%3E-for-BStr"],[104,"impl-Default-for-%26BStr"],[105,"impl-Default-for-%26mut+BStr"],[110,"impl-PartialEq-for-BStr"],[111,"impl-PartialEq%3Cstr%3E-for-BStr"],[112,"impl-PartialEq%3C%26str%3E-for-BStr"],[113,"impl-PartialEq%3C%26%5Bu8%5D%3E-for-BStr"],[114,"impl-PartialEq%3C%5Bu8%5D%3E-for-BStr"],[136,"impl-Debug-for-BStr"],[137,"impl-Display-for-BStr"],[138,"impl-Debug-for-EscapeBytes%3C\'a%3E"],[139,"impl-Display-for-EscapeBytes%3C\'a%3E"],[155,"impl-Debug-for-Utf8Error"],[156,"impl-Display-for-Utf8Error"],[158,"impl-From%3C%26%5Bu8%5D%3E-for-%26BStr"],[159,"impl-From%3C%26%5Bu8;+N%5D%3E-for-%26BStr"],[160,"impl-From%3C%26str%3E-for-%26BStr"],[180,"impl-Index%3CRange%3Cusize%3E%3E-for-BStr"],[181,"impl-Index%3CRangeTo%3Cusize%3E%3E
-for-BStr"],[182,"impl-Index%3CRangeFull%3E-for-BStr"],[183,"impl-Index%3CRangeFrom%3Cusize%3E%3E-for-BStr"],[184,"impl-Index%3Cusize%3E-for-BStr"],[185,"impl-Index%3CRangeInclusive%3Cusize%3E%3E-for-BStr"],[186,"impl-Index%3CRangeToInclusive%3Cusize%3E%3E-for-BStr"],[187,"impl-IndexMut%3CRangeToInclusive%3Cusize%3E%3E-for-BStr"],[188,"impl-IndexMut%3CRange%3Cusize%3E%3E-for-BStr"],[189,"impl-IndexMut%3CRangeFull%3E-for-BStr"],[190,"impl-IndexMut%3CRangeInclusive%3Cusize%3E%3E-for-BStr"],[191,"impl-IndexMut%3Cusize%3E-for-BStr"],[192,"impl-IndexMut%3CRangeTo%3Cusize%3E%3E-for-BStr"],[193,"impl-IndexMut%3CRangeFrom%3Cusize%3E%3E-for-BStr"],[266,"impl-PartialOrd%3C%26%5Bu8%5D%3E-for-BStr"],[267,"impl-PartialOrd-for-BStr"],[268,"impl-PartialOrd%3C%26str%3E-for-BStr"],[269,"impl-PartialOrd%3C%5Bu8%5D%3E-for-BStr"],[270,"impl-PartialOrd%3Cstr%3E-for-BStr"]]},\
+"memchr":{"doc":"This library provides heavily optimized routines for …","t":"DDDALLLLLLLLLLLLLLLLLLLLLLLLLFFFFFFAFFFFFFLLLLLLLLLLLLLLLLLLLLLLLLAAFFFFAAAAADDDDDDLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLDIDLLLLLLLLLLLLLLLLLLLLKLLLLLLLLLLLDDLLLLLLLLLLLLLLLLLLLLLLLLLLLLDLLLLLLLLLLDDLLLLLLLLLLLLLLLLLLLLLLLLLLAAAADDDDDDLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLDLLLLLLLLLLLLLLLLLLAADDDDDDLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLDLLLLLLLLLLLLLLLLLLNDDDDDNELLLLLLLLLLLLLLLLLLLLLLLLLLLFLFLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLFLFLLLLLLLLLLLLLLLLLLLLLLLL","n":["Memchr","Memchr2","Memchr3","arch","borrow","borrow","borrow","borrow_mut","borrow_mut","borrow_mut","clone","clone","clone","clone_into","clone_into","clone_into","count","fmt","fmt","fmt","from","from","from","into","into","into","into_iter","into_iter","into_iter","memchr","memchr2","memchr2_iter","memchr3","memchr3_iter","memchr_iter","memmem","memrchr","memrchr2","memrchr2_iter","memrchr3","memrchr3_iter","memrchr_iter","new","new","new","next","next","next","next_back","next_back","next_back","size_hint","size_hint","size_hint","to_owned","to_owned","to_owned","try_from","try_from","try_from","try_into","try_into","try_into","type_id","type_id","type_id","all","x86_64","is_equal","is_equal_raw","is_prefix","is_suffix","memchr","packedpair","rabinkarp","shiftor","twoway","One","OneIter","Three","ThreeIter","Two","TwoIter","borrow","borrow","borrow","borrow","borrow","borrow","borrow_mut","borrow_mut","borrow_mut","borrow_mut","borrow_mut","borrow_mut","clone","clone","clone","clone","clone","clone","clone_into","clone_into","clone_into","clone_into","clone_into","clone_into","count","count","count_raw","find","find","find","find_raw","find_raw","find_raw","fmt","fmt","fmt","fmt","fmt","fmt","from","from","from","from","from","from","into"
,"into","into","into","into","into","into_iter","into_iter","into_iter","iter","iter","iter","new","new","new","next","next","next","next_back","next_back","next_back","rfind","rfind","rfind","rfind_raw","rfind_raw","rfind_raw","size_hint","size_hint","size_hint","to_owned","to_owned","to_owned","to_owned","to_owned","to_owned","try_from","try_from","try_from","try_from","try_from","try_from","try_into","try_into","try_into","try_into","try_into","try_into","type_id","type_id","type_id","type_id","type_id","type_id","Finder","HeuristicFrequencyRank","Pair","borrow","borrow","borrow_mut","borrow_mut","clone","clone","clone_into","clone_into","find_prefilter","fmt","fmt","from","from","index1","index2","into","into","new","new","pair","rank","to_owned","to_owned","try_from","try_from","try_into","try_into","type_id","type_id","with_indices","with_pair","with_ranker","Finder","FinderRev","borrow","borrow","borrow_mut","borrow_mut","clone","clone","clone_into","clone_into","find","find_raw","fmt","fmt","from","from","into","into","new","new","rfind","rfind_raw","to_owned","to_owned","try_from","try_from","try_into","try_into","type_id","type_id","Finder","borrow","borrow_mut","find","fmt","from","into","new","try_from","try_into","type_id","Finder","FinderRev","borrow","borrow","borrow_mut","borrow_mut","clone","clone","clone_into","clone_into","find","fmt","fmt","from","from","into","into","new","new","rfind","to_owned","to_owned","try_from","try_from","try_into","try_into","type_id","type_id","avx2","sse2","memchr","packedpair","One","OneIter","Three","ThreeIter","Two","TwoIter","borrow","borrow","borrow","borrow","borrow","borrow","borrow_mut","borrow_mut","borrow_mut","borrow_mut","borrow_mut","borrow_mut","clone","clone","clone","clone","clone","clone","clone_into","clone_into","clone_into","clone_into","clone_into","clone_into","count","count","count_raw","find","find","find","find_raw","find_raw","find_raw","fmt","fmt","fmt","fmt","fmt","fmt","from","from","from"
,"from","from","from","into","into","into","into","into","into","into_iter","into_iter","into_iter","is_available","is_available","is_available","iter","iter","iter","new","new","new","new_unchecked","new_unchecked","new_unchecked","next","next","next","next_back","next_back","next_back","rfind","rfind","rfind","rfind_raw","rfind_raw","rfind_raw","size_hint","size_hint","size_hint","to_owned","to_owned","to_owned","to_owned","to_owned","to_owned","try_from","try_from","try_from","try_from","try_from","try_from","try_into","try_into","try_into","try_into","try_into","try_into","type_id","type_id","type_id","type_id","type_id","type_id","Finder","borrow","borrow_mut","clone","clone_into","find","find_prefilter","fmt","from","into","is_available","min_haystack_len","new","pair","to_owned","try_from","try_into","type_id","with_pair","memchr","packedpair","One","OneIter","Three","ThreeIter","Two","TwoIter","borrow","borrow","borrow","borrow","borrow","borrow","borrow_mut","borrow_mut","borrow_mut","borrow_mut","borrow_mut","borrow_mut","clone","clone","clone","clone","clone","clone","clone_into","clone_into","clone_into","clone_into","clone_into","clone_into","count","count","count_raw","find","find","find","find_raw","find_raw","find_raw","fmt","fmt","fmt","fmt","fmt","fmt","from","from","from","from","from","from","into","into","into","into","into","into","into_iter","into_iter","into_iter","is_available","is_available","is_available","iter","iter","iter","new","new","new","new_unchecked","new_unchecked","new_unchecked","next","next","next","next_back","next_back","next_back","rfind","rfind","rfind","rfind_raw","rfind_raw","rfind_raw","size_hint","size_hint","size_hint","to_owned","to_owned","to_owned","to_owned","to_owned","to_owned","try_from","try_from","try_from","try_from","try_from","try_from","try_into","try_into","try_into","try_into","try_into","try_into","type_id","type_id","type_id","type_id","type_id","type_id","Finder","borrow","borrow_mut","clone","clone_
into","find","find_prefilter","fmt","from","into","is_available","min_haystack_len","new","pair","to_owned","try_from","try_into","type_id","with_pair","Auto","FindIter","FindRevIter","Finder","FinderBuilder","FinderRev","None","Prefilter","as_ref","as_ref","borrow","borrow","borrow","borrow","borrow","borrow","borrow_mut","borrow_mut","borrow_mut","borrow_mut","borrow_mut","borrow_mut","build_forward","build_forward_with_ranker","build_reverse","clone","clone","clone","clone","clone_into","clone_into","clone_into","clone_into","default","default","find","find","find_iter","find_iter","fmt","fmt","fmt","fmt","fmt","fmt","from","from","from","from","from","from","into","into","into","into","into","into","into_iter","into_iter","into_owned","into_owned","into_owned","into_owned","needle","needle","new","new","new","next","next","prefilter","rfind","rfind","rfind_iter","rfind_iter","size_hint","to_owned","to_owned","to_owned","to_owned","try_from","try_from","try_from","try_from","try_from","try_from","try_into","try_into","try_into","try_into","try_into","try_into","type_id","type_id","type_id","type_id","type_id","type_id"],"q":[[0,"memchr"],[66,"memchr::arch"],[68,"memchr::arch::all"],[77,"memchr::arch::all::memchr"],[182,"memchr::arch::all::packedpair"],[217,"memchr::arch::all::rabinkarp"],[247,"memchr::arch::all::shiftor"],[258,"memchr::arch::all::twoway"],[286,"memchr::arch::x86_64"],[288,"memchr::arch::x86_64::avx2"],[290,"memchr::arch::x86_64::avx2::memchr"],[401,"memchr::arch::x86_64::avx2::packedpair"],[420,"memchr::arch::x86_64::sse2"],[422,"memchr::arch::x86_64::sse2::memchr"],[533,"memchr::arch::x86_64::sse2::packedpair"],[552,"memchr::memmem"],[650,"core::fmt"],[651,"core::fmt"],[652,"core::iter::adapters::rev"],[653,"core::result"],[654,"core::any"],[655,"core::marker"],[656,"core::convert"]],"d":["An iterator over all occurrences of a single byte in a …","An iterator over all occurrences of two possible bytes in …","An iterator over all occurrences of 
three possible bytes …","A module with low-level architecture dependent routines.","","","","","","","","","","","","","","","","","Returns the argument unchanged.","Returns the argument unchanged.","Returns the argument unchanged.","Calls U::from(self)
.","Calls U::from(self)
.","Calls U::from(self)
.","","","","Search for the first occurrence of a byte in a slice.","Search for the first occurrence of two possible bytes in a …","Returns an iterator over all occurrences of the needles in …","Search for the first occurrence of three possible bytes in …","Returns an iterator over all occurrences of the needles in …","Returns an iterator over all occurrences of the needle in …","This module provides forward and reverse substring search …","Search for the last occurrence of a byte in a slice.","Search for the last occurrence of two possible bytes in a …","Returns an iterator over all occurrences of the needles in …","Search for the last occurrence of three possible bytes in …","Returns an iterator over all occurrences of the needles in …","Returns an iterator over all occurrences of the needle in …","Returns an iterator over all occurrences of the needle …","Returns an iterator over all occurrences of the needle …","Returns an iterator over all occurrences of the needle …","","","","","","","","","","","","","","","","","","","","","","Contains architecture independent routines.","Vector algorithms for the x86_64
target.","Compare corresponding bytes in x
and y
for equality.","Compare n
bytes at the given pointers for equality.","Returns true if and only if needle
is a prefix of haystack
.","Returns true if and only if needle
is a suffix of haystack
.","Provides architecture independent implementations of memchr
…","Provides an architecture independent implementation of the …","An implementation of the Rabin-Karp substring search …","An implementation of the Shift-Or substring search …","An implementation of the Two-Way substring search algorithm…","Finds all occurrences of a single byte in a haystack.","An iterator over all occurrences of a single byte in a …","Finds all occurrences of three bytes in a haystack.","An iterator over all occurrences of three possible bytes …","Finds all occurrences of two bytes in a haystack.","An iterator over all occurrences of two possible bytes in …","","","","","","","","","","","","","","","","","","","","","","","","","Counts all occurrences of this byte in the given haystack.","","Counts all occurrences of this byte in the given haystack …","Return the first occurrence of the needle in the given …","Return the first occurrence of one of the needle bytes in …","Return the first occurrence of one of the needle bytes in …","Like find
, but accepts and returns raw pointers.","Like find
, but accepts and returns raw pointers.","Like find
, but accepts and returns raw pointers.","","","","","","","Returns the argument unchanged.","Returns the argument unchanged.","Returns the argument unchanged.","Returns the argument unchanged.","Returns the argument unchanged.","Returns the argument unchanged.","Calls U::from(self)
.","Calls U::from(self)
.","Calls U::from(self)
.","Calls U::from(self)
.","Calls U::from(self)
.","Calls U::from(self)
.","","","","Returns an iterator over all occurrences of the needle …","Returns an iterator over all occurrences of one of the …","Returns an iterator over all occurrences of one of the …","Create a new searcher that finds occurrences of the byte …","Create a new searcher that finds occurrences of the two …","Create a new searcher that finds occurrences of the three …","","","","","","","Return the last occurrence of the needle in the given …","Return the last occurrence of one of the needle bytes in …","Return the last occurrence of one of the needle bytes in …","Like rfind
, but accepts and returns raw pointers.","Like rfind
, but accepts and returns raw pointers.","Like rfind
, but accepts and returns raw pointers.","","","","","","","","","","","","","","","","","","","","","","","","","","","","An architecture independent “packed pair” finder.","This trait allows the user to customize the heuristic used …","A pair of byte offsets into a needle to use as a predicate.","","","","","","","","","Run this finder on the given haystack as a prefilter.","","","Returns the argument unchanged.","Returns the argument unchanged.","Returns the first offset of the pair.","Returns the second offset of the pair.","Calls U::from(self)
.","Calls U::from(self)
.","Create a new prefilter that reports possible locations …","Create a new pair of offsets from the given needle.","Returns the pair of offsets (into the needle) used to …","Return the heuristic frequency rank of the given byte. A …","","","","","","","","","Create a new pair using the offsets given for the needle …","Create a new prefilter using the pair given.","Create a new pair of offsets from the given needle and …","A forward substring searcher using the Rabin-Karp …","A reverse substring searcher using the Rabin-Karp …","","","","","","","","","Return the first occurrence of the needle
in the haystack
…","Like find
, but accepts and returns raw pointers.","","","Returns the argument unchanged.","Returns the argument unchanged.","Calls U::from(self)
.","Calls U::from(self)
.","Create a new Rabin-Karp forward searcher for the given …","Create a new Rabin-Karp reverse searcher for the given …","Return the last occurrence of the needle
in the haystack
…","Like rfind
, but accepts and returns raw pointers.","","","","","","","","","A forward substring searcher using the Shift-Or algorithm.","","","Return the first occurrence of the needle given to …","","Returns the argument unchanged.","Calls U::from(self)
.","Create a new Shift-Or forward searcher for the given needle
…","","","","A forward substring searcher that uses the Two-Way …","A reverse substring searcher that uses the Two-Way …","","","","","","","","","Returns the first occurrence of needle
in the given …","","","Returns the argument unchanged.","Returns the argument unchanged.","Calls U::from(self)
.","Calls U::from(self)
.","Create a searcher that finds occurrences of the given …","Create a searcher that finds occurrences of the given …","Returns the last occurrence of needle
in the given haystack
…","","","","","","","","","Algorithms for the x86_64
target using 256-bit vectors via …","Algorithms for the x86_64
target using 128-bit vectors via …","This module defines 256-bit vector implementations of …","A 256-bit vector implementation of the “packed pair” …","Finds all occurrences of a single byte in a haystack.","An iterator over all occurrences of a single byte in a …","Finds all occurrences of three bytes in a haystack.","An iterator over all occurrences of three possible bytes …","Finds all occurrences of two bytes in a haystack.","An iterator over all occurrences of two possible bytes in …","","","","","","","","","","","","","","","","","","","","","","","","","Counts all occurrences of this byte in the given haystack.","","Counts all occurrences of this byte in the given haystack …","Return the first occurrence of one of the needle bytes in …","Return the first occurrence of one of the needle bytes in …","Return the first occurrence of one of the needle bytes in …","Like find
, but accepts and returns raw pointers.","Like find
, but accepts and returns raw pointers.","Like find
, but accepts and returns raw pointers.","","","","","","","Returns the argument unchanged.","Returns the argument unchanged.","Returns the argument unchanged.","Returns the argument unchanged.","Returns the argument unchanged.","Returns the argument unchanged.","Calls U::from(self)
.","Calls U::from(self)
.","Calls U::from(self)
.","Calls U::from(self)
.","Calls U::from(self)
.","Calls U::from(self)
.","","","","Returns true when this implementation is available in the …","Returns true when this implementation is available in the …","Returns true when this implementation is available in the …","Returns an iterator over all occurrences of the needle …","Returns an iterator over all occurrences of the needle …","Returns an iterator over all occurrences of the needle …","Create a new searcher that finds occurrences of the needle …","Create a new searcher that finds occurrences of the needle …","Create a new searcher that finds occurrences of the needle …","Create a new finder specific to AVX2 vectors and routines …","Create a new finder specific to AVX2 vectors and routines …","Create a new finder specific to AVX2 vectors and routines …","","","","","","","Return the last occurrence of one of the needle bytes in …","Return the last occurrence of one of the needle bytes in …","Return the last occurrence of one of the needle bytes in …","Like rfind
, but accepts and returns raw pointers.","Like rfind
, but accepts and returns raw pointers.","Like rfind
, but accepts and returns raw pointers.","","","","","","","","","","","","","","","","","","","","","","","","","","","","A “packed pair” finder that uses 256-bit vector …","","","","","Execute a search using AVX2 vectors and routines.","Run this finder on the given haystack as a prefilter.","","Returns the argument unchanged.","Calls U::from(self)
.","Returns true when this implementation is available in the …","Returns the minimum haystack length that this Finder
can …","Create a new pair searcher. The searcher returned can …","Returns the pair of offsets (into the needle) used to …","","","","","Create a new “packed pair” finder using the pair of …","This module defines 128-bit vector implementations of …","A 128-bit vector implementation of the “packed pair” …","Finds all occurrences of a single byte in a haystack.","An iterator over all occurrences of a single byte in a …","Finds all occurrences of three bytes in a haystack.","An iterator over all occurrences of three possible bytes …","Finds all occurrences of two bytes in a haystack.","An iterator over all occurrences of two possible bytes in …","","","","","","","","","","","","","","","","","","","","","","","","","Counts all occurrences of this byte in the given haystack.","","Counts all occurrences of this byte in the given haystack …","Return the first occurrence of one of the needle bytes in …","Return the first occurrence of one of the needle bytes in …","Return the first occurrence of one of the needle bytes in …","Like find
, but accepts and returns raw pointers.","Like find
, but accepts and returns raw pointers.","Like find
, but accepts and returns raw pointers.","","","","","","","Returns the argument unchanged.","Returns the argument unchanged.","Returns the argument unchanged.","Returns the argument unchanged.","Returns the argument unchanged.","Returns the argument unchanged.","Calls U::from(self)
.","Calls U::from(self)
.","Calls U::from(self)
.","Calls U::from(self)
.","Calls U::from(self)
.","Calls U::from(self)
.","","","","Returns true when this implementation is available in the …","Returns true when this implementation is available in the …","Returns true when this implementation is available in the …","Returns an iterator over all occurrences of the needle …","Returns an iterator over all occurrences of the needle …","Returns an iterator over all occurrences of the needle …","Create a new searcher that finds occurrences of the needle …","Create a new searcher that finds occurrences of the needle …","Create a new searcher that finds occurrences of the needle …","Create a new finder specific to SSE2 vectors and routines …","Create a new finder specific to SSE2 vectors and routines …","Create a new finder specific to SSE2 vectors and routines …","","","","","","","Return the last occurrence of one of the needle bytes in …","Return the last occurrence of one of the needle bytes in …","Return the last occurrence of one of the needle bytes in …","Like rfind
, but accepts and returns raw pointers.","Like rfind
, but accepts and returns raw pointers.","Like rfind
, but accepts and returns raw pointers.","","","","","","","","","","","","","","","","","","","","","","","","","","","","A “packed pair” finder that uses 128-bit vector …","","","","","Execute a search using SSE2 vectors and routines.","Run this finder on the given haystack as a prefilter.","","Returns the argument unchanged.","Calls U::from(self)
.","Returns true when this implementation is available in the …","Returns the minimum haystack length that this Finder
can …","Create a new pair searcher. The searcher returned can …","Returns the pair of offsets (into the needle) used to …","","","","","Create a new “packed pair” finder using the pair of …","Automatically detect whether a heuristic prefilter should …","An iterator over non-overlapping substring matches.","An iterator over non-overlapping substring matches in …","A single substring searcher fixed to a particular needle.","A builder for constructing non-default forward or reverse …","A single substring reverse searcher fixed to a particular …","Never used a prefilter in substring search.","Prefilter controls whether heuristics are used to …","Convert this finder into its borrowed variant.","Convert this finder into its borrowed variant.","","","","","","","","","","","","","Build a forward finder using the given needle from the …","Build a forward finder using the given needle and a custom …","Build a reverse finder using the given needle from the …","","","","","","","","","","","Returns the index of the first occurrence of the given …","Returns the index of the first occurrence of this needle …","Returns an iterator over all non-overlapping occurrences …","Returns an iterator over all occurrences of a substring in …","","","","","","","Returns the argument unchanged.","Returns the argument unchanged.","Returns the argument unchanged.","Returns the argument unchanged.","Returns the argument unchanged.","Returns the argument unchanged.","Calls U::from(self)
.","Calls U::from(self)
.","Calls U::from(self)
.","Calls U::from(self)
.","Calls U::from(self)
.","Calls U::from(self)
.","","","Convert this iterator into its owned variant, such that it …","Convert this iterator into its owned variant, such that it …","Convert this finder into its owned variant, such that it …","Convert this finder into its owned variant, such that it …","Returns the needle that this finder searches for.","Returns the needle that this finder searches for.","Create a new finder for the given needle.","Create a new reverse finder for the given needle.","Create a new finder builder with default settings.","","","Configure the prefilter setting for the finder.","Returns the index of the last occurrence of the given …","Returns the index of the last occurrence of this needle in …","Returns a reverse iterator over all non-overlapping …","Returns a reverse iterator over all occurrences of a …","","","","","","","","","","","","","","","","","","","","","","",""],"i":[0,0,0,0,1,2,3,1,2,3,1,2,3,1,2,3,1,1,2,3,1,2,3,1,2,3,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,3,1,2,3,1,2,3,1,2,3,1,2,3,1,2,3,1,2,3,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15,16,17,18,19,20,15,16,17,18,19,20,15,16,17,18,19,20,15,16,17,18,19,20,15,16,15,15,17,19,15,17,19,15,16,17,18,19,20,15,16,17,18,19,20,15,16,17,18,19,20,16,18,20,15,17,19,15,17,19,16,18,20,16,18,20,15,17,19,15,17,19,16,18,20,15,16,17,18,19,20,15,16,17,18,19,20,15,16,17,18,19,20,15,16,17,18,19,20,0,0,0,21,22,21,22,21,22,21,22,21,21,22,21,22,22,22,21,22,21,22,21,23,21,22,21,22,21,22,21,22,22,21,22,0,0,24,25,24,25,24,25,24,25,24,24,24,25,24,25,24,25,24,25,25,25,24,25,24,25,24,25,24,25,0,26,26,26,26,26,26,26,26,26,26,0,0,27,28,27,28,27,28,27,28,27,27,28,27,28,27,28,27,28,28,27,28,27,28,27,28,27,28,0,0,0,0,0,0,0,0,0,0,29,30,31,32,33,34,29,30,31,32,33,34,29,30,31,32,33,34,29,30,31,32,33,34,29,30,29,29,31,33,29,31,33,29,30,31,32,33,34,29,30,31,32,33,34,29,30,31,32,33,34,30,32,34,29,31,33,29,31,33,29,31,33,29,31,33,30,32,34,30,32,34,29,31,33,29,31,33,30,32,34,29,30,31,32,33,34,29,30,31,32,33,34,29,30,31,32,33,34,29,30,31,32,33,34,0,35,35,35,35,35,35,35,
35,35,35,35,35,35,35,35,35,35,35,0,0,0,0,0,0,0,0,36,37,38,39,40,41,36,37,38,39,40,41,36,37,38,39,40,41,36,37,38,39,40,41,36,37,36,36,38,40,36,38,40,36,37,38,39,40,41,36,37,38,39,40,41,36,37,38,39,40,41,37,39,41,36,38,40,36,38,40,36,38,40,36,38,40,37,39,41,37,39,41,36,38,40,36,38,40,37,39,41,36,37,38,39,40,41,36,37,38,39,40,41,36,37,38,39,40,41,36,37,38,39,40,41,0,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,48,0,0,0,0,0,48,0,43,44,48,49,50,43,44,45,48,49,50,43,44,45,45,45,45,48,43,44,45,48,43,44,45,48,45,0,43,0,43,48,49,50,43,44,45,48,49,50,43,44,45,48,49,50,43,44,45,49,50,49,50,43,44,43,44,43,44,45,49,50,45,0,44,0,44,49,48,43,44,45,48,49,50,43,44,45,48,49,50,43,44,45,48,49,50,43,44,45],"f":[0,0,0,0,[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[1,1],[2,2],[3,3],[[-1,-2],4,[],[]],[[-1,-2],4,[],[]],[[-1,-2],4,[],[]],[1,5],[[1,6],7],[[2,6],7],[[3,6],7],[-1,-1,[]],[-1,-1,[]],[-1,-1,[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[[8,[9,[8]]],[[10,[5]]]],[[8,8,[9,[8]]],[[10,[5]]]],[[8,8,[9,[8]]],2],[[8,8,8,[9,[8]]],[[10,[5]]]],[[8,8,8,[9,[8]]],3],[[8,[9,[8]]],1],0,[[8,[9,[8]]],[[10,[5]]]],[[8,8,[9,[8]]],[[10,[5]]]],[[8,8,[9,[8]]],[[11,[2]]]],[[8,8,8,[9,[8]]],[[10,[5]]]],[[8,8,8,[9,[8]]],[[11,[3]]]],[[8,[9,[8]]],[[11,[1]]]],[[8,[9,[8]]],1],[[8,8,[9,[8]]],2],[[8,8,8,[9,[8]]],3],[1,[[10,[5]]]],[2,[[10,[5]]]],[3,[[10,[5]]]],[1,[[10,[5]]]],[2,[[10,[5]]]],[3,[[10,[5]]]],[1,[[4,[5,[10,[5]]]]]],[2,[[4,[5,[10,[5]]]]]],[3,[[4,[5,[10,[5]]]]]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,13,[]],[-1,13,[]],[-1,13,[]],0,0,[[[9,[8]],[9,[8]]],14],[[8,8,5],14],[[[9,[8]],[9,[8]]],14],[[[9,[8]],[9,[8]]],14],0,0,0,0,0,0,0,0,0,0,0,[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],
[-1,-2,[],[]],[-1,-2,[],[]],[15,15],[16,16],[17,17],[18,18],[19,19],[20,20],[[-1,-2],4,[],[]],[[-1,-2],4,[],[]],[[-1,-2],4,[],[]],[[-1,-2],4,[],[]],[[-1,-2],4,[],[]],[[-1,-2],4,[],[]],[[15,[9,[8]]],5],[16,5],[[15,8,8],5],[[15,[9,[8]]],[[10,[5]]]],[[17,[9,[8]]],[[10,[5]]]],[[19,[9,[8]]],[[10,[5]]]],[[15,8,8],[[10,[8]]]],[[17,8,8],[[10,[8]]]],[[19,8,8],[[10,[8]]]],[[15,6],7],[[16,6],7],[[17,6],7],[[18,6],7],[[19,6],7],[[20,6],7],[-1,-1,[]],[-1,-1,[]],[-1,-1,[]],[-1,-1,[]],[-1,-1,[]],[-1,-1,[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[[15,[9,[8]]],16],[[17,[9,[8]]],18],[[19,[9,[8]]],20],[8,15],[[8,8],17],[[8,8,8],19],[16,[[10,[5]]]],[18,[[10,[5]]]],[20,[[10,[5]]]],[16,[[10,[5]]]],[18,[[10,[5]]]],[20,[[10,[5]]]],[[15,[9,[8]]],[[10,[5]]]],[[17,[9,[8]]],[[10,[5]]]],[[19,[9,[8]]],[[10,[5]]]],[[15,8,8],[[10,[8]]]],[[17,8,8],[[10,[8]]]],[[19,8,8],[[10,[8]]]],[16,[[4,[5,[10,[5]]]]]],[18,[[4,[5,[10,[5]]]]]],[20,[[4,[5,[10,[5]]]]]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,13,[]],[-1,13,[]],[-1,13,[]],[-1,13,[]],[-1,13,[]],[-1,13,[]],0,0,0,[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[21,21],[22,22],[[-1,-2],4,[],[]],[[-1,-2],4,[],[]],[[21,[9,[8]]],[[10,[5]]]],[[21,6],7],[[22,6],7],[-1,-1,[]],[-1,-1,[]],[22,8],[22,8],[-1,-2,[],[]],[-1,-2,[],[]],[[[9,[8]]],[[10,[21]]]],[[[9,[8]]],[[10,[22]]]],[21,22],[[-1,8],8,[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,13,[]],[-1,13,[]],[[[9,[8]],8,8],[[10,[22]]]],[[[9,[8]],22],[[10,[21]]]],[[[9,[8]],-1],[[10,[22]]],23],0,0,[-1,-2,[],[]],[-1,-2,[],[]]
,[-1,-2,[],[]],[-1,-2,[],[]],[24,24],[25,25],[[-1,-2],4,[],[]],[[-1,-2],4,[],[]],[[24,[9,[8]],[9,[8]]],[[10,[5]]]],[[24,8,8,8,8],[[10,[8]]]],[[24,6],7],[[25,6],7],[-1,-1,[]],[-1,-1,[]],[-1,-2,[],[]],[-1,-2,[],[]],[[[9,[8]]],24],[[[9,[8]]],25],[[25,[9,[8]],[9,[8]]],[[10,[5]]]],[[25,8,8,8,8],[[10,[8]]]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,13,[]],[-1,13,[]],0,[-1,-2,[],[]],[-1,-2,[],[]],[[26,[9,[8]]],[[10,[5]]]],[[26,6],7],[-1,-1,[]],[-1,-2,[],[]],[[[9,[8]]],[[10,[26]]]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,13,[]],0,0,[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[27,27],[28,28],[[-1,-2],4,[],[]],[[-1,-2],4,[],[]],[[27,[9,[8]],[9,[8]]],[[10,[5]]]],[[27,6],7],[[28,6],7],[-1,-1,[]],[-1,-1,[]],[-1,-2,[],[]],[-1,-2,[],[]],[[[9,[8]]],27],[[[9,[8]]],28],[[28,[9,[8]],[9,[8]]],[[10,[5]]]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,13,[]],[-1,13,[]],0,0,0,0,0,0,0,0,0,0,[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[29,29],[30,30],[31,31],[32,32],[33,33],[34,34],[[-1,-2],4,[],[]],[[-1,-2],4,[],[]],[[-1,-2],4,[],[]],[[-1,-2],4,[],[]],[[-1,-2],4,[],[]],[[-1,-2],4,[],[]],[[29,[9,[8]]],5],[30,5],[[29,8,8],5],[[29,[9,[8]]],[[10,[5]]]],[[31,[9,[8]]],[[10,[5]]]],[[33,[9,[8]]],[[10,[5]]]],[[29,8,8],[[10,[8]]]],[[31,8,8],[[10,[8]]]],[[33,8,8],[[10,[8]]]],[[29,6],7],[[30,6],7],[[31,6],7],[[32,6],7],[[33,6],7],[[34,6],7],[-1,-1,[]],[-1,-1,[]],[-1,-1,[]],[-1,-1,[]],[-1,-1,[]],[-1,-1,[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[[],14],[[],14],[[],14],[[29,[9,[8]]],30],[[31,[9,[8]]],32],[[33,[9,[8]]],34],[8,[[10,[29]]]],[[8,8],[[10,[31]]]],[[8,8,8],[[10,[33]]]],[8,29],[[8,8],31],[[8,8,8],33],[30,[[10
,[5]]]],[32,[[10,[5]]]],[34,[[10,[5]]]],[30,[[10,[5]]]],[32,[[10,[5]]]],[34,[[10,[5]]]],[[29,[9,[8]]],[[10,[5]]]],[[31,[9,[8]]],[[10,[5]]]],[[33,[9,[8]]],[[10,[5]]]],[[29,8,8],[[10,[8]]]],[[31,8,8],[[10,[8]]]],[[33,8,8],[[10,[8]]]],[30,[[4,[5,[10,[5]]]]]],[32,[[4,[5,[10,[5]]]]]],[34,[[4,[5,[10,[5]]]]]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,13,[]],[-1,13,[]],[-1,13,[]],[-1,13,[]],[-1,13,[]],[-1,13,[]],0,[-1,-2,[],[]],[-1,-2,[],[]],[35,35],[[-1,-2],4,[],[]],[[35,[9,[8]],[9,[8]]],[[10,[5]]]],[[35,[9,[8]]],[[10,[5]]]],[[35,6],7],[-1,-1,[]],[-1,-2,[],[]],[[],14],[35,5],[[[9,[8]]],[[10,[35]]]],[35,22],[-1,-2,[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,13,[]],[[[9,[8]],22],[[10,[35]]]],0,0,0,0,0,0,0,0,[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[36,36],[37,37],[38,38],[39,39],[40,40],[41,41],[[-1,-2],4,[],[]],[[-1,-2],4,[],[]],[[-1,-2],4,[],[]],[[-1,-2],4,[],[]],[[-1,-2],4,[],[]],[[-1,-2],4,[],[]],[[36,[9,[8]]],5],[37,5],[[36,8,8],5],[[36,[9,[8]]],[[10,[5]]]],[[38,[9,[8]]],[[10,[5]]]],[[40,[9,[8]]],[[10,[5]]]],[[36,8,8],[[10,[8]]]],[[38,8,8],[[10,[8]]]],[[40,8,8],[[10,[8]]]],[[36,6],7],[[37,6],7],[[38,6],7],[[39,6],7],[[40,6],7],[[41,6],7],[-1,-1,[]],[-1,-1,[]],[-1,-1,[]],[-1,-1,[]],[-1,-1,[]],[-1,-1,[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[[],14],[[],14],[[],14],[[36,[9,[8]]],37],[[38,[9,[8]]],39],[[40,[9,[8]]],41],[8,[[10,[36]]]],[[8,8],[[10,[38]]]],[[8,8,8],[[10,[40]]]],[8,36],[[8,8],38],[[8,8,8],40],[37,[[10,[5]]]],[39,[[10,[5
]]]],[41,[[10,[5]]]],[37,[[10,[5]]]],[39,[[10,[5]]]],[41,[[10,[5]]]],[[36,[9,[8]]],[[10,[5]]]],[[38,[9,[8]]],[[10,[5]]]],[[40,[9,[8]]],[[10,[5]]]],[[36,8,8],[[10,[8]]]],[[38,8,8],[[10,[8]]]],[[40,8,8],[[10,[8]]]],[37,[[4,[5,[10,[5]]]]]],[39,[[4,[5,[10,[5]]]]]],[41,[[4,[5,[10,[5]]]]]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,13,[]],[-1,13,[]],[-1,13,[]],[-1,13,[]],[-1,13,[]],[-1,13,[]],0,[-1,-2,[],[]],[-1,-2,[],[]],[42,42],[[-1,-2],4,[],[]],[[42,[9,[8]],[9,[8]]],[[10,[5]]]],[[42,[9,[8]]],[[10,[5]]]],[[42,6],7],[-1,-1,[]],[-1,-2,[],[]],[[],14],[42,5],[[[9,[8]]],[[10,[42]]]],[42,22],[-1,-2,[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,13,[]],[[[9,[8]],22],[[10,[42]]]],0,0,0,0,0,0,0,0,[43,43],[44,44],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[[45,-1],43,[46,[47,[[9,[8]]]]]],[[45,-1,-2],43,23,[46,[47,[[9,[8]]]]]],[[45,-1],44,[46,[47,[[9,[8]]]]]],[48,48],[43,43],[44,44],[45,45],[[-1,-2],4,[],[]],[[-1,-2],4,[],[]],[[-1,-2],4,[],[]],[[-1,-2],4,[],[]],[[],48],[[],45],[[[9,[8]],[9,[8]]],[[10,[5]]]],[[43,[9,[8]]],[[10,[5]]]],[[[9,[8]],-1],49,[46,[47,[[9,[8]]]]]],[[43,[9,[8]]],49],[[48,6],7],[[49,6],7],[[50,6],7],[[43,6],7],[[44,6],7],[[45,6],7],[-1,-1,[]],[-1,-1,[]],[-1,-1,[]],[-1,-1,[]],[-1,-1,[]],[-1,-1,[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[49,49],[50,50],[43,43],[44,44],[43,[[9,[8]]]],[44,[[9,[8]]]],[-1,43,[46,[47,[[9,[8]]]]]],[-1,44,[46,[47,[[9,[8]]]]]],[[],45],[49,[[10,[5]]]],[50,[[10,[5]]]],[[45,48],45],[[[9,[8]],[9,[8]]],[[10,[5]]]],[[44,-1]
,[[10,[5]]],[[47,[[9,[8]]]]]],[[[9,[8]],-1],50,[46,[47,[[9,[8]]]]]],[[44,[9,[8]]],50],[49,[[4,[5,[10,[5]]]]]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,[[12,[-2]]],[],[]],[-1,13,[]],[-1,13,[]],[-1,13,[]],[-1,13,[]],[-1,13,[]],[-1,13,[]]],"c":[],"p":[[3,"Memchr",0],[3,"Memchr2",0],[3,"Memchr3",0],[15,"tuple"],[15,"usize"],[3,"Formatter",650],[6,"Result",650],[15,"u8"],[15,"slice"],[4,"Option",651],[3,"Rev",652],[4,"Result",653],[3,"TypeId",654],[15,"bool"],[3,"One",77],[3,"OneIter",77],[3,"Two",77],[3,"TwoIter",77],[3,"Three",77],[3,"ThreeIter",77],[3,"Finder",182],[3,"Pair",182],[8,"HeuristicFrequencyRank",182],[3,"Finder",217],[3,"FinderRev",217],[3,"Finder",247],[3,"Finder",258],[3,"FinderRev",258],[3,"One",290],[3,"OneIter",290],[3,"Two",290],[3,"TwoIter",290],[3,"Three",290],[3,"ThreeIter",290],[3,"Finder",401],[3,"One",422],[3,"OneIter",422],[3,"Two",422],[3,"TwoIter",422],[3,"Three",422],[3,"ThreeIter",422],[3,"Finder",533],[3,"Finder",552],[3,"FinderRev",552],[3,"FinderBuilder",552],[8,"Sized",655],[8,"AsRef",656],[4,"Prefilter",552],[3,"FindIter",552],[3,"FindRevIter",552]],"b":[]},\
+"roe":{"doc":"This crate provides Unicode case mapping routines and …","t":"NNNNNDNNDENNDELLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLFFFFLLLLLLLLLLLFFFLLLLLLLLLLLLLLLLLLLLLLLLLLLLLFLLLL","n":["Ascii","Ascii","Fold","Full","Full","InvalidCaseMappingMode","Lithuanian","Lithuanian","Lowercase","LowercaseMode","Turkic","Turkic","Uppercase","UppercaseMode","borrow","borrow","borrow","borrow","borrow","borrow_mut","borrow_mut","borrow_mut","borrow_mut","borrow_mut","clone","clone","clone","clone","clone","clone_into","clone_into","clone_into","clone_into","clone_into","cmp","cmp","cmp","count","count","default","default","default","eq","eq","eq","fmt","fmt","fmt","fmt","fmt","fmt","from","from","from","from","from","from_str","from_str","hash","hash","hash","into","into","into","into","into","into_iter","into_iter","lowercase","make_ascii_lowercase","make_ascii_titlecase","make_ascii_uppercase","message","new","new","new","next","next","partial_cmp","partial_cmp","partial_cmp","size_hint","size_hint","to_ascii_lowercase","to_ascii_titlecase","to_ascii_uppercase","to_owned","to_owned","to_owned","to_owned","to_owned","to_string","try_from","try_from","try_from","try_from","try_from","try_from","try_from","try_from","try_from","try_from","try_from","try_from","try_from","try_into","try_into","try_into","try_into","try_into","type_id","type_id","type_id","type_id","type_id","uppercase","with_ascii_slice","with_ascii_slice","with_slice","with_slice"],"q":[[0,"roe"],[120,"core::cmp"],[121,"core::fmt"],[122,"core::fmt"],[123,"core::hash"],[124,"core::convert"],[125,"core::option"],[126,"alloc::vec"],[127,"core::convert"],[128,"core::any"]],"d":["Only the ASCII region, i.e. the characters 'A'..='Z'
and …","Only the ASCII region, i.e. the characters 'A'..='Z'
and …","Unicode case folding, which is more far-reaching than …","Full Unicode case mapping, suitable for most languages.","Full Unicode case mapping, suitable for most languages.","Error that indicates a failure to parse a LowercaseMode
or …","Currently, just full Unicode case mapping.","Currently, just full Unicode case mapping.","An iterator that yields the lowercase equivalent of a …","Options to configure the behavior of lowercase
.","Full Unicode case mapping, adapted for Turkic languages …","Full Unicode case mapping, adapted for Turkic languages …","An iterator that yields the uppercase equivalent of a …","Options to configure the behavior of uppercase
.","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","Returns the argument unchanged.","Returns the argument unchanged.","Returns the argument unchanged.","Returns the argument unchanged.","Returns the argument unchanged.","","","","","","Calls U::from(self)
.","Calls U::from(self)
.","Calls U::from(self)
.","Calls U::from(self)
.","Calls U::from(self)
.","","","Returns an iterator that yields a copy of the bytes in the …","Converts the given slice to its ASCII lower case …","Converts the given slice to its ASCII title case …","Converts the given slice to its ASCII upper case …","Retrieve the error message associated with this …","Create a new, empty lowercase iterator.","Create a new, empty uppercase iterator.","Construct a new InvalidCaseMappingMode
error.","","","","","","","","Returns a vector containing a copy of the given slice …","Returns a vector containing a copy of the given slice …","Returns a vector containing a copy of the given slice …","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","Returns an iterator that yields a copy of the bytes in the …","Create a new lowercase iterator with the given byte slice …","Create a new uppercase iterator with the given byte slice …","Create a new lowercase iterator with the given byte slice …","Create a new uppercase iterator with the given byte slice …"],"i":[4,5,4,4,5,0,4,5,0,0,4,5,0,0,1,2,3,4,5,1,2,3,4,5,1,2,3,4,5,1,2,3,4,5,3,4,5,1,2,3,4,5,3,4,5,1,2,3,3,4,5,1,2,3,4,5,4,5,3,4,5,1,2,3,4,5,1,2,0,0,0,0,3,1,2,3,1,2,3,4,5,1,2,0,0,0,1,2,3,4,5,3,1,2,3,4,4,4,4,4,5,5,5,5,5,1,2,3,4,5,1,2,3,4,5,0,1,2,1,2],"f":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[1,1],[2,2],[3,3],[4,4],[5,5],[[-1,-2],6,[],[]],[[-1,-2],6,[],[]],[[-1,-2],6,[],[]],[[-1,-2],6,[],[]],[[-1,-2],6,[],[]],[[3,3],7],[[4,4],7],[[5,5],7],[1,8],[2,8],[[],3],[[],4],[[],5],[[3,3],9],[[4,4],9],[[5,5],9],[[1,10],11],[[2,10],11],[[3,10],11],[[3,10],11],[[4,10],11],[[5,10],11],[-1,-1,[]],[-1,-1,[]],[-1,-1,[]],[-1,-1,[]],[-1,-1,[]],[12,[[13,[4]]]],[12,[[13,[5]]]],[[3,-1],6,14],[[4,-1],6,14],[[5,-1],6,14],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[[[16,[15]],4],1],[-1,6,[[17,[[16,[15]]]]]],[-1,6,[[17,[[16,[15]]]]]],[-1,6,[[17,[[16,[15]]]]]],[3,12],[[],1],[[],2],[[],3],[1,18],[2,18],[[3,3],[[18,[7]]]],[[4,4],[[18,[7]]]],[[5,5],[[18,[7]]]],[1,[[6,[8,[18,[8]]]]]],[2,[[6,[8,[18,[8]]]]]],[-1,[[19,[15]]],[[20,[[16,[15]]]]]],[-1,[[19,[15]]],[[20,[[16,[15]]]]]],[-1,[[19,[15]]],[[20,[[16,[15]]]]]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,-2,[],[]],[-1,21,[]],[-1,[[13,[-2]]],[],[]],[-1,[[13,[-2]]
],[],[]],[-1,[[13,[-2]]],[],[]],[[[18,[[16,[15]]]]],[[13,[4]]]],[[[18,[12]]],[[13,[4]]]],[-1,[[13,[-2]]],[],[]],[[[16,[15]]],[[13,[4]]]],[12,[[13,[4]]]],[-1,[[13,[-2]]],[],[]],[[[18,[[16,[15]]]]],[[13,[5]]]],[12,[[13,[5]]]],[[[18,[12]]],[[13,[5]]]],[[[16,[15]]],[[13,[5]]]],[-1,[[13,[-2]]],[],[]],[-1,[[13,[-2]]],[],[]],[-1,[[13,[-2]]],[],[]],[-1,[[13,[-2]]],[],[]],[-1,[[13,[-2]]],[],[]],[-1,22,[]],[-1,22,[]],[-1,22,[]],[-1,22,[]],[-1,22,[]],[[[16,[15]],5],2],[[[16,[15]]],1],[[[16,[15]]],2],[[[16,[15]]],1],[[[16,[15]]],2]],"c":[],"p":[[3,"Lowercase",0],[3,"Uppercase",0],[3,"InvalidCaseMappingMode",0],[4,"LowercaseMode",0],[4,"UppercaseMode",0],[15,"tuple"],[4,"Ordering",120],[15,"usize"],[15,"bool"],[3,"Formatter",121],[6,"Result",121],[15,"str"],[4,"Result",122],[8,"Hasher",123],[15,"u8"],[15,"slice"],[8,"AsMut",124],[4,"Option",125],[3,"Vec",126],[8,"AsRef",124],[3,"String",127],[3,"TypeId",128]],"b":[[47,"impl-Debug-for-InvalidCaseMappingMode"],[48,"impl-Display-for-InvalidCaseMappingMode"],[95,"impl-TryFrom%3COption%3C%26%5Bu8%5D%3E%3E-for-LowercaseMode"],[96,"impl-TryFrom%3COption%3C%26str%3E%3E-for-LowercaseMode"],[98,"impl-TryFrom%3C%26%5Bu8%5D%3E-for-LowercaseMode"],[99,"impl-TryFrom%3C%26str%3E-for-LowercaseMode"],[101,"impl-TryFrom%3COption%3C%26%5Bu8%5D%3E%3E-for-UppercaseMode"],[102,"impl-TryFrom%3C%26str%3E-for-UppercaseMode"],[103,"impl-TryFrom%3COption%3C%26str%3E%3E-for-UppercaseMode"],[104,"impl-TryFrom%3C%26%5Bu8%5D%3E-for-UppercaseMode"]]}\
+}');
+if (typeof window !== 'undefined' && window.initSearch) {window.initSearch(searchIndex)};
+if (typeof exports !== 'undefined') {exports.searchIndex = searchIndex};
diff --git a/settings.html b/settings.html
new file mode 100644
index 000000000..aced12316
--- /dev/null
+++ b/settings.html
@@ -0,0 +1 @@
+1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 +95 +96 +97 +98 +99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 +247 +248 +249 +250 +251 +252 +253 +254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 +265 +266 +267 +268 +269 +270 +271 +272 +273 +274 +275 +276 +277 +278 +279 +280 +281 +282 +283 +284 +285 +286 +287 +288 +289 +290 +291 +292 +293 +294 +295 +296 +297 +298 +299 +300 +301 +302 +303 +304 +305 +306 +307 +308 +309 +310 +311 +312 +313 +314 +315 +316 +317 +318 +319 +320 +321 +322 +323 +324 +325 +326 +327 +328 +329 +330 +331 +332 +333 +334 +335 +336 +337 +
// The following ~400 lines of code exist for exactly one purpose, which is
+// to optimize this code:
+//
+// byte_slice.iter().position(|&b| b > 0x7F).unwrap_or(byte_slice.len())
+//
+// Yes... Overengineered is a word that comes to mind, but this is effectively
+// a very similar problem to memchr, and virtually nobody has been able to
+// resist optimizing the crap out of that (except for perhaps the BSD and MUSL
+// folks). In particular, this routine makes a very common case (ASCII) very
+// fast, which seems worth it. We do stop short of adding AVX variants of the
+// code below in order to retain our sanity and also to avoid needing to deal
+// with runtime target feature detection. RESIST!
+//
+// In order to understand the SIMD version below, it would be good to read this
+// comment describing how my memchr routine works:
+// https://github.com/BurntSushi/rust-memchr/blob/b0a29f267f4a7fad8ffcc8fe8377a06498202883/src/x86/sse2.rs#L19-L106
+//
+// The primary difference with memchr is that for ASCII, we can do a bit less
+// work. In particular, we don't need to detect the presence of a specific
+// byte, but rather, whether any byte has its most significant bit set. That
+// means we can effectively skip the _mm_cmpeq_epi8 step and jump straight to
+// _mm_movemask_epi8.
+
+#[cfg(any(test, miri, not(target_arch = "x86_64")))]
+const USIZE_BYTES: usize = core::mem::size_of::<usize>();
+#[cfg(any(test, miri, not(target_arch = "x86_64")))]
+const FALLBACK_LOOP_SIZE: usize = 2 * USIZE_BYTES;
+
+// This is a mask where the most significant bit of each byte in the usize
+// is set. We test this bit to determine whether a character is ASCII or not.
+// Namely, a single byte is regarded as an ASCII codepoint if and only if its
+// most significant bit is not set.
+#[cfg(any(test, miri, not(target_arch = "x86_64")))]
+const ASCII_MASK_U64: u64 = 0x8080808080808080;
+#[cfg(any(test, miri, not(target_arch = "x86_64")))]
+const ASCII_MASK: usize = ASCII_MASK_U64 as usize;
+
+/// Returns the index of the first non ASCII byte in the given slice.
+///
+/// If slice only contains ASCII bytes, then the length of the slice is
+/// returned.
+pub fn first_non_ascii_byte(slice: &[u8]) -> usize {
+ #[cfg(any(miri, not(target_arch = "x86_64")))]
+ {
+ first_non_ascii_byte_fallback(slice)
+ }
+
+ #[cfg(all(not(miri), target_arch = "x86_64"))]
+ {
+ first_non_ascii_byte_sse2(slice)
+ }
+}
+
+#[cfg(any(test, miri, not(target_arch = "x86_64")))]
+fn first_non_ascii_byte_fallback(slice: &[u8]) -> usize {
+ let align = USIZE_BYTES - 1;
+ let start_ptr = slice.as_ptr();
+ let end_ptr = slice[slice.len()..].as_ptr();
+ let mut ptr = start_ptr;
+
+ unsafe {
+ if slice.len() < USIZE_BYTES {
+ return first_non_ascii_byte_slow(start_ptr, end_ptr, ptr);
+ }
+
+ let chunk = read_unaligned_usize(ptr);
+ let mask = chunk & ASCII_MASK;
+ if mask != 0 {
+ return first_non_ascii_byte_mask(mask);
+ }
+
+ ptr = ptr_add(ptr, USIZE_BYTES - (start_ptr as usize & align));
+ debug_assert!(ptr > start_ptr);
+ debug_assert!(ptr_sub(end_ptr, USIZE_BYTES) >= start_ptr);
+ if slice.len() >= FALLBACK_LOOP_SIZE {
+ while ptr <= ptr_sub(end_ptr, FALLBACK_LOOP_SIZE) {
+ debug_assert_eq!(0, (ptr as usize) % USIZE_BYTES);
+
+ let a = *(ptr as *const usize);
+ let b = *(ptr_add(ptr, USIZE_BYTES) as *const usize);
+ if (a | b) & ASCII_MASK != 0 {
+ // What a kludge. We wrap the position finding code into
+ // a non-inlineable function, which makes the codegen in
+ // the tight loop above a bit better by avoiding a
+ // couple extra movs. We pay for it by two additional
+ // stores, but only in the case of finding a non-ASCII
+ // byte.
+ #[inline(never)]
+ unsafe fn findpos(
+ start_ptr: *const u8,
+ ptr: *const u8,
+ ) -> usize {
+ let a = *(ptr as *const usize);
+ let b = *(ptr_add(ptr, USIZE_BYTES) as *const usize);
+
+ let mut at = sub(ptr, start_ptr);
+ let maska = a & ASCII_MASK;
+ if maska != 0 {
+ return at + first_non_ascii_byte_mask(maska);
+ }
+
+ at += USIZE_BYTES;
+ let maskb = b & ASCII_MASK;
+ debug_assert!(maskb != 0);
+ return at + first_non_ascii_byte_mask(maskb);
+ }
+ return findpos(start_ptr, ptr);
+ }
+ ptr = ptr_add(ptr, FALLBACK_LOOP_SIZE);
+ }
+ }
+ first_non_ascii_byte_slow(start_ptr, end_ptr, ptr)
+ }
+}
+
+#[cfg(all(not(miri), target_arch = "x86_64"))]
+fn first_non_ascii_byte_sse2(slice: &[u8]) -> usize {
+ use core::arch::x86_64::*;
+
+ const VECTOR_SIZE: usize = core::mem::size_of::<__m128i>();
+ const VECTOR_ALIGN: usize = VECTOR_SIZE - 1;
+ const VECTOR_LOOP_SIZE: usize = 4 * VECTOR_SIZE;
+
+ let start_ptr = slice.as_ptr();
+ let end_ptr = slice[slice.len()..].as_ptr();
+ let mut ptr = start_ptr;
+
+ unsafe {
+ if slice.len() < VECTOR_SIZE {
+ return first_non_ascii_byte_slow(start_ptr, end_ptr, ptr);
+ }
+
+ let chunk = _mm_loadu_si128(ptr as *const __m128i);
+ let mask = _mm_movemask_epi8(chunk);
+ if mask != 0 {
+ return mask.trailing_zeros() as usize;
+ }
+
+ ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN));
+ debug_assert!(ptr > start_ptr);
+ debug_assert!(end_ptr.sub(VECTOR_SIZE) >= start_ptr);
+ if slice.len() >= VECTOR_LOOP_SIZE {
+ while ptr <= ptr_sub(end_ptr, VECTOR_LOOP_SIZE) {
+ debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);
+
+ let a = _mm_load_si128(ptr as *const __m128i);
+ let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
+ let c =
+ _mm_load_si128(ptr.add(2 * VECTOR_SIZE) as *const __m128i);
+ let d =
+ _mm_load_si128(ptr.add(3 * VECTOR_SIZE) as *const __m128i);
+
+ let or1 = _mm_or_si128(a, b);
+ let or2 = _mm_or_si128(c, d);
+ let or3 = _mm_or_si128(or1, or2);
+ if _mm_movemask_epi8(or3) != 0 {
+ let mut at = sub(ptr, start_ptr);
+ let mask = _mm_movemask_epi8(a);
+ if mask != 0 {
+ return at + mask.trailing_zeros() as usize;
+ }
+
+ at += VECTOR_SIZE;
+ let mask = _mm_movemask_epi8(b);
+ if mask != 0 {
+ return at + mask.trailing_zeros() as usize;
+ }
+
+ at += VECTOR_SIZE;
+ let mask = _mm_movemask_epi8(c);
+ if mask != 0 {
+ return at + mask.trailing_zeros() as usize;
+ }
+
+ at += VECTOR_SIZE;
+ let mask = _mm_movemask_epi8(d);
+ debug_assert!(mask != 0);
+ return at + mask.trailing_zeros() as usize;
+ }
+ ptr = ptr_add(ptr, VECTOR_LOOP_SIZE);
+ }
+ }
+ while ptr <= end_ptr.sub(VECTOR_SIZE) {
+ debug_assert!(sub(end_ptr, ptr) >= VECTOR_SIZE);
+
+ let chunk = _mm_loadu_si128(ptr as *const __m128i);
+ let mask = _mm_movemask_epi8(chunk);
+ if mask != 0 {
+ return sub(ptr, start_ptr) + mask.trailing_zeros() as usize;
+ }
+ ptr = ptr.add(VECTOR_SIZE);
+ }
+ first_non_ascii_byte_slow(start_ptr, end_ptr, ptr)
+ }
+}
+
+#[inline(always)]
+unsafe fn first_non_ascii_byte_slow(
+ start_ptr: *const u8,
+ end_ptr: *const u8,
+ mut ptr: *const u8,
+) -> usize {
+ debug_assert!(start_ptr <= ptr);
+ debug_assert!(ptr <= end_ptr);
+
+ while ptr < end_ptr {
+ if *ptr > 0x7F {
+ return sub(ptr, start_ptr);
+ }
+ ptr = ptr.offset(1);
+ }
+ sub(end_ptr, start_ptr)
+}
+
+/// Compute the position of the first non-ASCII byte in the given mask.
+///
+/// The mask should be computed by `chunk & ASCII_MASK`, where `chunk` is
+/// `USIZE_BYTES` contiguous bytes (8 on 64-bit targets) of the slice being
+/// checked where *at least* one of those bytes is not an ASCII byte.
+///
+/// The position returned is always in the inclusive range
+/// [0, `USIZE_BYTES` - 1], i.e. [0, 7] on 64-bit targets.
+#[cfg(any(test, miri, not(target_arch = "x86_64")))]
+fn first_non_ascii_byte_mask(mask: usize) -> usize {
+ #[cfg(target_endian = "little")]
+ {
+ mask.trailing_zeros() as usize / 8
+ }
+ #[cfg(target_endian = "big")]
+ {
+ mask.leading_zeros() as usize / 8
+ }
+}
+
+/// Increment the given pointer by the given amount.
+unsafe fn ptr_add(ptr: *const u8, amt: usize) -> *const u8 {
+ debug_assert!(amt < ::core::isize::MAX as usize);
+ ptr.offset(amt as isize)
+}
+
+/// Decrement the given pointer by the given amount.
+unsafe fn ptr_sub(ptr: *const u8, amt: usize) -> *const u8 {
+ debug_assert!(amt < ::core::isize::MAX as usize);
+ ptr.offset((amt as isize).wrapping_neg())
+}
+
+#[cfg(any(test, miri, not(target_arch = "x86_64")))]
+unsafe fn read_unaligned_usize(ptr: *const u8) -> usize {
+ use core::ptr;
+
+ let mut n: usize = 0;
+ ptr::copy_nonoverlapping(ptr, &mut n as *mut _ as *mut u8, USIZE_BYTES);
+ n
+}
+
+/// Subtract `b` from `a` and return the difference. `a` should be greater than
+/// or equal to `b`.
+fn sub(a: *const u8, b: *const u8) -> usize {
+ debug_assert!(a >= b);
+ (a as usize) - (b as usize)
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ // Our testing approach here is to try and exhaustively test every case.
+ // This includes the position at which a non-ASCII byte occurs in addition
+ // to the alignment of the slice that we're searching.
+
+ #[test]
+ fn positive_fallback_forward() {
+ for i in 0..517 {
+ let s = "a".repeat(i);
+ assert_eq!(
+ i,
+ first_non_ascii_byte_fallback(s.as_bytes()),
+ "i: {:?}, len: {:?}, s: {:?}",
+ i,
+ s.len(),
+ s
+ );
+ }
+ }
+
+ #[test]
+ #[cfg(target_arch = "x86_64")]
+ #[cfg(not(miri))]
+ fn positive_sse2_forward() {
+ for i in 0..517 {
+ let b = "a".repeat(i).into_bytes();
+ assert_eq!(b.len(), first_non_ascii_byte_sse2(&b));
+ }
+ }
+
+ #[test]
+ #[cfg(not(miri))]
+ fn negative_fallback_forward() {
+ for i in 0..517 {
+ for align in 0..65 {
+ let mut s = "a".repeat(i);
+ s.push_str("☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃");
+ let s = s.get(align..).unwrap_or("");
+ assert_eq!(
+ i.saturating_sub(align),
+ first_non_ascii_byte_fallback(s.as_bytes()),
+ "i: {:?}, align: {:?}, len: {:?}, s: {:?}",
+ i,
+ align,
+ s.len(),
+ s
+ );
+ }
+ }
+ }
+
+ #[test]
+ #[cfg(target_arch = "x86_64")]
+ #[cfg(not(miri))]
+ fn negative_sse2_forward() {
+ for i in 0..517 {
+ for align in 0..65 {
+ let mut s = "a".repeat(i);
+ s.push_str("☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃");
+ let s = s.get(align..).unwrap_or("");
+ assert_eq!(
+ i.saturating_sub(align),
+ first_non_ascii_byte_sse2(s.as_bytes()),
+ "i: {:?}, align: {:?}, len: {:?}, s: {:?}",
+ i,
+ align,
+ s.len(),
+ s
+ );
+ }
+ }
+ }
+}
+
1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 +95 +96 +97 +98 +99 +100 +
use core::mem;
+
+#[cfg(feature = "alloc")]
+use alloc::boxed::Box;
+
+/// A wrapper for `&[u8]` that provides convenient string oriented trait impls.
+///
+/// If you need ownership or a growable byte string buffer, then use
+/// [`BString`](struct.BString.html).
+///
+/// Using a `&BStr` is just like using a `&[u8]`, since `BStr`
+/// implements `Deref` to `[u8]`. So all methods available on `[u8]`
+/// are also available on `BStr`.
+///
+/// # Representation
+///
+/// A `&BStr` has the same representation as a `&str`. That is, a `&BStr` is
+/// a fat pointer which consists of a pointer to some bytes and a length.
+///
+/// # Trait implementations
+///
+/// The `BStr` type has a number of trait implementations, and in particular,
+/// defines equality and ordinal comparisons between `&BStr`, `&str` and
+/// `&[u8]` for convenience.
+///
+/// The `Debug` implementation for `BStr` shows its bytes as a normal string.
+/// For invalid UTF-8, hex escape sequences are used.
+///
+/// The `Display` implementation behaves as if `BStr` were first lossily
+/// converted to a `str`. Invalid UTF-8 bytes are substituted with the Unicode
+/// replacement codepoint, which looks like this: �.
+#[derive(Hash)]
+#[repr(transparent)]
+pub struct BStr {
+ pub(crate) bytes: [u8],
+}
+
+impl BStr {
+ /// Directly creates a `BStr` slice from anything that can be converted
+ /// to a byte slice.
+ ///
+ /// This is very similar to the [`B`](crate::B) function, except this
+ /// returns a `&BStr` instead of a `&[u8]`.
+ ///
+ /// This is a cost-free conversion.
+ ///
+ /// # Example
+ ///
+ /// You can create `BStr`'s from byte arrays, byte slices or even string
+ /// slices:
+ ///
+ /// ```
+ /// use bstr::BStr;
+ ///
+ /// let a = BStr::new(b"abc");
+ /// let b = BStr::new(&b"abc"[..]);
+ /// let c = BStr::new("abc");
+ ///
+ /// assert_eq!(a, b);
+ /// assert_eq!(a, c);
+ /// ```
+ #[inline]
+ pub fn new<'a, B: ?Sized + AsRef<[u8]>>(bytes: &'a B) -> &'a BStr {
+ BStr::from_bytes(bytes.as_ref())
+ }
+
+ #[inline]
+ pub(crate) fn new_mut<B: ?Sized + AsMut<[u8]>>(
+ bytes: &mut B,
+ ) -> &mut BStr {
+ BStr::from_bytes_mut(bytes.as_mut())
+ }
+
+ #[inline]
+ pub(crate) fn from_bytes(slice: &[u8]) -> &BStr {
+ unsafe { mem::transmute(slice) }
+ }
+
+ #[inline]
+ pub(crate) fn from_bytes_mut(slice: &mut [u8]) -> &mut BStr {
+ unsafe { mem::transmute(slice) }
+ }
+
+ #[inline]
+ #[cfg(feature = "alloc")]
+ pub(crate) fn from_boxed_bytes(slice: Box<[u8]>) -> Box<BStr> {
+ unsafe { Box::from_raw(Box::into_raw(slice) as _) }
+ }
+
+ #[inline]
+ #[cfg(feature = "alloc")]
+ pub(crate) fn into_boxed_bytes(slice: Box<BStr>) -> Box<[u8]> {
+ unsafe { Box::from_raw(Box::into_raw(slice) as _) }
+ }
+
+ #[inline]
+ pub(crate) fn as_bytes(&self) -> &[u8] {
+ &self.bytes
+ }
+}
+
1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 +95 +96 +97 +98 +99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +
use memchr::{memchr, memchr2, memchr3, memrchr, memrchr2, memrchr3};
+
+mod scalar;
+
+#[inline]
+fn build_table(byteset: &[u8]) -> [u8; 256] {
+ let mut table = [0u8; 256];
+ for &b in byteset {
+ table[b as usize] = 1;
+ }
+ table
+}
+
+#[inline]
+pub(crate) fn find(haystack: &[u8], byteset: &[u8]) -> Option<usize> {
+ match byteset.len() {
+ 0 => return None,
+ 1 => memchr(byteset[0], haystack),
+ 2 => memchr2(byteset[0], byteset[1], haystack),
+ 3 => memchr3(byteset[0], byteset[1], byteset[2], haystack),
+ _ => {
+ let table = build_table(byteset);
+ scalar::forward_search_bytes(haystack, |b| table[b as usize] != 0)
+ }
+ }
+}
+
+#[inline]
+pub(crate) fn rfind(haystack: &[u8], byteset: &[u8]) -> Option<usize> {
+ match byteset.len() {
+ 0 => return None,
+ 1 => memrchr(byteset[0], haystack),
+ 2 => memrchr2(byteset[0], byteset[1], haystack),
+ 3 => memrchr3(byteset[0], byteset[1], byteset[2], haystack),
+ _ => {
+ let table = build_table(byteset);
+ scalar::reverse_search_bytes(haystack, |b| table[b as usize] != 0)
+ }
+ }
+}
+
+#[inline]
+pub(crate) fn find_not(haystack: &[u8], byteset: &[u8]) -> Option<usize> {
+ if haystack.is_empty() {
+ return None;
+ }
+ match byteset.len() {
+ 0 => return Some(0),
+ 1 => scalar::inv_memchr(byteset[0], haystack),
+ 2 => scalar::forward_search_bytes(haystack, |b| {
+ b != byteset[0] && b != byteset[1]
+ }),
+ 3 => scalar::forward_search_bytes(haystack, |b| {
+ b != byteset[0] && b != byteset[1] && b != byteset[2]
+ }),
+ _ => {
+ let table = build_table(byteset);
+ scalar::forward_search_bytes(haystack, |b| table[b as usize] == 0)
+ }
+ }
+}
+#[inline]
+pub(crate) fn rfind_not(haystack: &[u8], byteset: &[u8]) -> Option<usize> {
+ if haystack.is_empty() {
+ return None;
+ }
+ match byteset.len() {
+ 0 => return Some(haystack.len() - 1),
+ 1 => scalar::inv_memrchr(byteset[0], haystack),
+ 2 => scalar::reverse_search_bytes(haystack, |b| {
+ b != byteset[0] && b != byteset[1]
+ }),
+ 3 => scalar::reverse_search_bytes(haystack, |b| {
+ b != byteset[0] && b != byteset[1] && b != byteset[2]
+ }),
+ _ => {
+ let table = build_table(byteset);
+ scalar::reverse_search_bytes(haystack, |b| table[b as usize] == 0)
+ }
+ }
+}
+
+#[cfg(all(test, feature = "std", not(miri)))]
+mod tests {
+ quickcheck::quickcheck! {
+ fn qc_byteset_forward_matches_naive(
+ haystack: Vec<u8>,
+ needles: Vec<u8>
+ ) -> bool {
+ super::find(&haystack, &needles)
+ == haystack.iter().position(|b| needles.contains(b))
+ }
+ fn qc_byteset_backwards_matches_naive(
+ haystack: Vec<u8>,
+ needles: Vec<u8>
+ ) -> bool {
+ super::rfind(&haystack, &needles)
+ == haystack.iter().rposition(|b| needles.contains(b))
+ }
+ fn qc_byteset_forward_not_matches_naive(
+ haystack: Vec<u8>,
+ needles: Vec<u8>
+ ) -> bool {
+ super::find_not(&haystack, &needles)
+ == haystack.iter().position(|b| !needles.contains(b))
+ }
+ fn qc_byteset_backwards_not_matches_naive(
+ haystack: Vec<u8>,
+ needles: Vec<u8>
+ ) -> bool {
+ super::rfind_not(&haystack, &needles)
+ == haystack.iter().rposition(|b| !needles.contains(b))
+ }
+ }
+}
+
1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 +95 +96 +97 +98 +99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 +247 +248 +249 +250 +251 +252 +253 +254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 +265 +266 +267 +268 +269 +270 +271 +272 +273 +274 +275 +276 +277 +278 +279 +280 +281 +282 +283 +284 +285 +286 +287 +288 +289 +290 +291 +292 +293 +294 +295 +296 +297 +298 +299 +300 +301 +302 +303 +304 +305 +
// This is adapted from `fallback.rs` from rust-memchr. It's modified to return
+// the 'inverse' query of memchr, e.g. finding the first byte not in the
+// provided set. This is simple for the 1-byte case.
+
+use core::{cmp, usize};
+
/// Width of one machine word (`usize`) in bytes.
const USIZE_BYTES: usize = core::mem::size_of::<usize>();

/// Number of bytes examined per iteration of the word-at-a-time loops in
/// `inv_memchr`/`inv_memrchr`: two machine words.
const LOOP_SIZE: usize = USIZE_BYTES + USIZE_BYTES;
+
/// Broadcast `b` into every byte of a machine word.
///
/// Each of the word's bytes is set to `b`. For instance, with `b` equal to
/// `\x4E` (`01001110` in binary), a 32-bit target yields
/// `01001110_01001110_01001110_01001110`.
#[inline(always)]
fn repeat_byte(b: u8) -> usize {
    // Every byte is identical, so the byte order used to assemble the word
    // does not matter.
    usize::from_ne_bytes([b; core::mem::size_of::<usize>()])
}
+
+pub fn inv_memchr(n1: u8, haystack: &[u8]) -> Option<usize> {
+ let vn1 = repeat_byte(n1);
+ let confirm = |byte| byte != n1;
+ let loop_size = cmp::min(LOOP_SIZE, haystack.len());
+ let align = USIZE_BYTES - 1;
+ let start_ptr = haystack.as_ptr();
+
+ unsafe {
+ let end_ptr = haystack.as_ptr().add(haystack.len());
+ let mut ptr = start_ptr;
+
+ if haystack.len() < USIZE_BYTES {
+ return forward_search(start_ptr, end_ptr, ptr, confirm);
+ }
+
+ let chunk = read_unaligned_usize(ptr);
+ if (chunk ^ vn1) != 0 {
+ return forward_search(start_ptr, end_ptr, ptr, confirm);
+ }
+
+ ptr = ptr.add(USIZE_BYTES - (start_ptr as usize & align));
+ debug_assert!(ptr > start_ptr);
+ debug_assert!(end_ptr.sub(USIZE_BYTES) >= start_ptr);
+ while loop_size == LOOP_SIZE && ptr <= end_ptr.sub(loop_size) {
+ debug_assert_eq!(0, (ptr as usize) % USIZE_BYTES);
+
+ let a = *(ptr as *const usize);
+ let b = *(ptr.add(USIZE_BYTES) as *const usize);
+ let eqa = (a ^ vn1) != 0;
+ let eqb = (b ^ vn1) != 0;
+ if eqa || eqb {
+ break;
+ }
+ ptr = ptr.add(LOOP_SIZE);
+ }
+ forward_search(start_ptr, end_ptr, ptr, confirm)
+ }
+}
+
+/// Return the last index not matching the byte `x` in `text`.
+pub fn inv_memrchr(n1: u8, haystack: &[u8]) -> Option<usize> {
+ let vn1 = repeat_byte(n1);
+ let confirm = |byte| byte != n1;
+ let loop_size = cmp::min(LOOP_SIZE, haystack.len());
+ let align = USIZE_BYTES - 1;
+ let start_ptr = haystack.as_ptr();
+
+ unsafe {
+ let end_ptr = haystack.as_ptr().add(haystack.len());
+ let mut ptr = end_ptr;
+
+ if haystack.len() < USIZE_BYTES {
+ return reverse_search(start_ptr, end_ptr, ptr, confirm);
+ }
+
+ let chunk = read_unaligned_usize(ptr.sub(USIZE_BYTES));
+ if (chunk ^ vn1) != 0 {
+ return reverse_search(start_ptr, end_ptr, ptr, confirm);
+ }
+
+ ptr = ptr.sub(end_ptr as usize & align);
+ debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
+ while loop_size == LOOP_SIZE && ptr >= start_ptr.add(loop_size) {
+ debug_assert_eq!(0, (ptr as usize) % USIZE_BYTES);
+
+ let a = *(ptr.sub(2 * USIZE_BYTES) as *const usize);
+ let b = *(ptr.sub(1 * USIZE_BYTES) as *const usize);
+ let eqa = (a ^ vn1) != 0;
+ let eqb = (b ^ vn1) != 0;
+ if eqa || eqb {
+ break;
+ }
+ ptr = ptr.sub(loop_size);
+ }
+ reverse_search(start_ptr, end_ptr, ptr, confirm)
+ }
+}
+
+#[inline(always)]
+unsafe fn forward_search<F: Fn(u8) -> bool>(
+ start_ptr: *const u8,
+ end_ptr: *const u8,
+ mut ptr: *const u8,
+ confirm: F,
+) -> Option<usize> {
+ debug_assert!(start_ptr <= ptr);
+ debug_assert!(ptr <= end_ptr);
+
+ while ptr < end_ptr {
+ if confirm(*ptr) {
+ return Some(sub(ptr, start_ptr));
+ }
+ ptr = ptr.offset(1);
+ }
+ None
+}
+
+#[inline(always)]
+unsafe fn reverse_search<F: Fn(u8) -> bool>(
+ start_ptr: *const u8,
+ end_ptr: *const u8,
+ mut ptr: *const u8,
+ confirm: F,
+) -> Option<usize> {
+ debug_assert!(start_ptr <= ptr);
+ debug_assert!(ptr <= end_ptr);
+
+ while ptr > start_ptr {
+ ptr = ptr.offset(-1);
+ if confirm(*ptr) {
+ return Some(sub(ptr, start_ptr));
+ }
+ }
+ None
+}
+
/// Read one `usize` starting at `ptr`, without requiring `ptr` to be
/// word-aligned.
///
/// # Safety
///
/// `ptr` must be valid for reading `size_of::<usize>()` bytes.
unsafe fn read_unaligned_usize(ptr: *const u8) -> usize {
    ptr.cast::<usize>().read_unaligned()
}
+
/// Compute the distance in bytes from `b` up to `a`, i.e. `a - b` on the
/// pointer addresses. `a` is expected to be greater than or equal to `b`;
/// this is asserted in debug builds only.
fn sub(a: *const u8, b: *const u8) -> usize {
    let (hi, lo) = (a as usize, b as usize);
    debug_assert!(hi >= lo);
    hi - lo
}
+
/// Return the index of the first byte in `s` for which `confirm` returns
/// `true`, or `None` if there is no such byte (including when `s` is empty).
///
/// This is the safe, slice-based counterpart of `forward_search`. It yields
/// the same result as running that routine over the whole slice, but is
/// expressed with the standard slice iterator so no `unsafe` is required.
#[inline]
pub(crate) fn forward_search_bytes<F: Fn(u8) -> bool>(
    s: &[u8],
    confirm: F,
) -> Option<usize> {
    s.iter().position(|&byte| confirm(byte))
}
+
/// Return the index of the last byte in `s` for which `confirm` returns
/// `true`, or `None` if there is no such byte (including when `s` is empty).
///
/// This is the safe, slice-based counterpart of `reverse_search`. It yields
/// the same result as running that routine over the whole slice from its
/// end, but is expressed with the standard slice iterator so no `unsafe` is
/// required.
#[inline]
pub(crate) fn reverse_search_bytes<F: Fn(u8) -> bool>(
    s: &[u8],
    confirm: F,
) -> Option<usize> {
    s.iter().rposition(|&byte| confirm(byte))
}
+
#[cfg(all(test, feature = "std"))]
mod tests {
    use super::{inv_memchr, inv_memrchr};

    // Seed cases: (haystack, search byte, expected `inv_memchr` index,
    // expected `inv_memrchr` index). `build_tests` pads each of these with
    // runs of the search byte to derive a much larger set of cases.
    const TESTS: &[(&[u8], u8, usize, usize)] = &[
        (b"z", b'a', 0, 0),
        (b"zz", b'a', 0, 1),
        (b"aza", b'a', 1, 1),
        (b"zaz", b'a', 0, 2),
        (b"zza", b'a', 0, 1),
        (b"zaa", b'a', 0, 0),
        (b"zzz", b'a', 0, 2),
    ];

    // (haystack, search byte, expected (forward, reverse) indices or `None`
    // when the haystack consists solely of the search byte).
    type TestCase = (Vec<u8>, u8, Option<(usize, usize)>);

    // Expand `TESTS` by padding each seed with 1..MAX_PER copies of the
    // search byte at the end, at the start and on both sides, then add
    // haystacks made only of the search byte (expected result: `None`).
    fn build_tests() -> Vec<TestCase> {
        #[cfg(not(miri))]
        const MAX_PER: usize = 515;
        #[cfg(miri)]
        const MAX_PER: usize = 10;

        let mut cases = Vec::new();
        for &(search, byte, fwd_pos, rev_pos) in TESTS {
            cases.push((search.to_vec(), byte, Some((fwd_pos, rev_pos))));
            for pad in 1..MAX_PER {
                // Search byte appended: expected indices are unchanged.
                let mut suffixed = search.to_vec();
                suffixed.resize(search.len() + pad, byte);
                cases.push((suffixed, byte, Some((fwd_pos, rev_pos))));

                // Search byte prepended: both indices shift by `pad`.
                let mut prefixed = vec![byte; pad];
                prefixed.extend_from_slice(search);
                cases.push((
                    prefixed,
                    byte,
                    Some((fwd_pos + pad, rev_pos + pad)),
                ));

                // Search byte on both sides: both indices shift by `pad`.
                let mut surrounded = vec![byte; pad];
                surrounded.extend_from_slice(search);
                surrounded.resize(pad + search.len() + pad, byte);
                cases.push((
                    surrounded,
                    byte,
                    Some((fwd_pos + pad, rev_pos + pad)),
                ));
            }
        }

        // Haystacks of several sizes in which nothing differs from the
        // search byte.
        for len in 0..MAX_PER {
            cases.push((vec![b'\0'; len], b'\0', None));
        }

        cases
    }

    #[test]
    fn test_inv_memchr() {
        use crate::{ByteSlice, B};

        #[cfg(not(miri))]
        const MAX_OFFSET: usize = 130;
        #[cfg(miri)]
        const MAX_OFFSET: usize = 13;

        for (search, byte, matching) in build_tests() {
            let want_fwd = matching.map(|(fwd, _)| fwd);
            let want_rev = matching.map(|(_, rev)| rev);

            assert_eq!(
                inv_memchr(byte, &search),
                want_fwd,
                "inv_memchr when searching for {:?} in {:?}",
                byte as char,
                // Wrapped as a byte string for more readable output.
                B(&search).as_bstr(),
            );
            assert_eq!(
                inv_memrchr(byte, &search),
                want_rev,
                "inv_memrchr when searching for {:?} in {:?}",
                byte as char,
                // Wrapped as a byte string for more readable output.
                B(&search).as_bstr(),
            );

            // Re-run on sub-slices starting at a fairly large number of
            // offsets to shake out potential alignment issues.
            for offset in 1..MAX_OFFSET {
                if offset >= search.len() {
                    break;
                }
                // Stop once the offset would cut off an expected position,
                // so the expectations never need recomputing.
                if matching.map_or(false, |(f, r)| offset > f || offset > r) {
                    break;
                }
                let realigned = &search[offset..];

                let forward_pos = want_fwd.map(|pos| pos - offset);
                let reverse_pos = want_rev.map(|pos| pos - offset);

                assert_eq!(
                    inv_memchr(byte, realigned),
                    forward_pos,
                    "inv_memchr when searching (realigned by {}) for {:?} in {:?}",
                    offset,
                    byte as char,
                    realigned.as_bstr(),
                );
                assert_eq!(
                    inv_memrchr(byte, realigned),
                    reverse_pos,
                    "inv_memrchr when searching (realigned by {}) for {:?} in {:?}",
                    offset,
                    byte as char,
                    realigned.as_bstr(),
                );
            }
        }
    }
}
+
1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 +95 +96 +97 +98 +99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 +247 +248 +249 +250 +251 +252 +253 +254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 +265 +266 +267 +268 +269 +270 +271 +272 +273 +274 +275 +276 +277 +278 +279 +280 +281 +282 +283 +284 +285 +286 +287 +288 +289 +290 +291 +292 +293 +294 +295 +296 +297 +298 +299 +300 +301 +302 +303 +304 +305 +306 +307 +308 +309 +310 +311 +312 +313 +314 +315 +316 +317 +318 +319 +320 +321 +322 +323 +324 +325 +326 +327 +328 +329 +330 +331 +332 +333 +334 +335 +336 +337 +338 +339 +340 +341 +342 +343 +344 +345 +346 +347 +348 +349 +350 +351 +352 +353 +354 +355 +356 +357 +358 +359 +360 +361 +362 +363 +364 +365 +366 +367 +368 +369 +370 +371 +372 +373 +374 +375 +376 +377 +378 +379 +380 +381 +382 +383 +384 +385 +386 +387 +388 +389 +390 +391 +392 +393 +394 +395 +396 +397 +398 +399 +400 +401 +402 +403 +404 +405 +406 +407 +408 +409 +410 +411 +412 +413 +414 +415 +416 +417 +418 +419 +420 +421 
+422 +423 +424 +425 +426 +427 +428 +429 +430 +431 +432 +433 +434 +435 +436 +437 +438 +439 +440 +441 +442 +443 +444 +445 +446 +447 +448 +449 +450 +451 +
/// An iterator of `char` values that represent an escaping of arbitrary bytes.
+///
+/// The lifetime parameter `'a` refers to the lifetime of the bytes being
+/// escaped.
+///
+/// This iterator is created by the
+/// [`ByteSlice::escape_bytes`](crate::ByteSlice::escape_bytes) method.
+#[derive(Clone, Debug)]
+pub struct EscapeBytes<'a> {
+ remaining: &'a [u8],
+ state: EscapeState,
+}
+
+impl<'a> EscapeBytes<'a> {
+ pub(crate) fn new(bytes: &'a [u8]) -> EscapeBytes {
+ EscapeBytes { remaining: bytes, state: EscapeState::Start }
+ }
+}
+
+impl<'a> Iterator for EscapeBytes<'a> {
+ type Item = char;
+
+ #[inline]
+ fn next(&mut self) -> Option<char> {
+ use self::EscapeState::*;
+
+ match self.state {
+ Start => {
+ let byte = match crate::decode_utf8(self.remaining) {
+ (None, 0) => return None,
+ // If we see invalid UTF-8 or ASCII, then we always just
+ // peel one byte off. If it's printable ASCII, we'll pass
+ // it through as-is below. Otherwise, below, it will get
+ // escaped in some way.
+ (None, _) | (Some(_), 1) => {
+ let byte = self.remaining[0];
+ self.remaining = &self.remaining[1..];
+ byte
+ }
+ // For any valid UTF-8 that is not ASCII, we pass it
+ // through as-is. We don't do any Unicode escaping.
+ (Some(ch), size) => {
+ self.remaining = &self.remaining[size..];
+ return Some(ch);
+ }
+ };
+ self.state = match byte {
+ 0x21..=0x5B | 0x5D..=0x7E => {
+ return Some(char::from(byte))
+ }
+ b'\0' => SpecialEscape('0'),
+ b'\n' => SpecialEscape('n'),
+ b'\r' => SpecialEscape('r'),
+ b'\t' => SpecialEscape('t'),
+ b'\\' => SpecialEscape('\\'),
+ _ => HexEscapeX(byte),
+ };
+ Some('\\')
+ }
+ SpecialEscape(ch) => {
+ self.state = Start;
+ Some(ch)
+ }
+ HexEscapeX(byte) => {
+ self.state = HexEscapeHighNybble(byte);
+ Some('x')
+ }
+ HexEscapeHighNybble(byte) => {
+ self.state = HexEscapeLowNybble(byte);
+ let nybble = byte >> 4;
+ Some(hexdigit_to_char(nybble))
+ }
+ HexEscapeLowNybble(byte) => {
+ self.state = Start;
+ let nybble = byte & 0xF;
+ Some(hexdigit_to_char(nybble))
+ }
+ }
+ }
+}
+
+impl<'a> core::fmt::Display for EscapeBytes<'a> {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ use core::fmt::Write;
+ for ch in self.clone() {
+ f.write_char(ch)?;
+ }
+ Ok(())
+ }
+}
+
/// The states of the finite state machine that drives the escaping iterator.
#[derive(Clone, Debug)]
enum EscapeState {
    /// Take the next byte off the front of 'remaining'. If 'remaining' is
    /// empty, return None. Otherwise the byte is either emitted as-is or
    /// escaped according to these rules:
    ///
    /// If it is \n, \r, \t, \\ or \0, emit a '\' and move to
    /// 'SpecialEscape(n | r | t | \ | 0)'. Otherwise, if the byte is not in
    /// [\x21-\x5B\x5D-\x7E], emit a '\' and move to 'HexEscapeX(byte)'.
    Start,
    /// Emit the given codepoint as-is, assuming a '\' has just been emitted,
    /// then move to 'Start'.
    SpecialEscape(char),
    /// Emit the 'x' of a hex escape, assuming a '\' has just been emitted,
    /// then move to 'HexEscapeHighNybble(byte)'.
    HexEscapeX(u8),
    /// Emit the high nybble of the byte as a hexadecimal digit, assuming
    /// '\x' has just been emitted, then move to 'HexEscapeLowNybble(byte)'.
    HexEscapeHighNybble(u8),
    /// Emit the low nybble of the byte as a hexadecimal digit, assuming
    /// '\xZ' has just been emitted (where 'Z' is the high nybble of this
    /// byte), then move to 'Start'.
    HexEscapeLowNybble(u8),
}
+
/// An iterator over the `u8` values obtained by unescaping a sequence of
/// codepoints.
///
/// The type parameter `I` is the iterator of codepoints being unescaped.
///
/// This iterator is currently not part of the crate API; only a
/// `ByteVec::unescape` method is exposed, which requires an allocation.
/// That is the most convenient form, although in principle this iterator
/// could be exposed for core-only use cases as well. What that API should
/// look like has not been settled.
#[derive(Clone, Debug)]
#[cfg(feature = "alloc")]
pub(crate) struct UnescapeBytes<I> {
    /// The source of codepoints.
    it: I,
    /// Where the iterator currently is within an escape sequence.
    state: UnescapeState,
}

#[cfg(feature = "alloc")]
impl<I: Iterator<Item = char>> UnescapeBytes<I> {
    /// Create an unescaping iterator over the codepoints yielded by `t`,
    /// positioned at the start state.
    pub(crate) fn new<T>(t: T) -> UnescapeBytes<I>
    where
        T: IntoIterator<IntoIter = I>,
    {
        let it = t.into_iter();
        UnescapeBytes { it, state: UnescapeState::Start }
    }
}
+
#[cfg(feature = "alloc")]
impl<I: Iterator<Item = char>> Iterator for UnescapeBytes<I> {
    type Item = u8;

    /// Produce the next unescaped byte.
    ///
    /// Recognized escapes are `\0`, `\\`, `\r`, `\n`, `\t` and `\xNN` with
    /// two hexadecimal digits. Incomplete or invalid escape sequences
    /// unescape as themselves.
    fn next(&mut self) -> Option<u8> {
        use self::UnescapeState as State;

        loop {
            match self.state {
                State::Start => {
                    let ch = self.it.next()?;
                    self.state = if ch == '\\' {
                        State::Escape
                    } else {
                        // Not an escape: queue the codepoint's UTF-8 bytes.
                        State::bytes(&[], ch)
                    };
                }
                State::Bytes { buf, cur, len } => {
                    let byte = buf[cur];
                    let advanced = cur + 1;
                    self.state = if advanced >= len {
                        State::Start
                    } else {
                        State::Bytes { buf, cur: advanced, len }
                    };
                    return Some(byte);
                }
                State::Escape => {
                    let ch = match self.it.next() {
                        Some(ch) => ch,
                        None => {
                            // A trailing '\' unescapes as itself.
                            self.state = State::Start;
                            return Some(b'\\');
                        }
                    };
                    let literal = match ch {
                        '0' => b'\x00',
                        '\\' => b'\\',
                        'r' => b'\r',
                        'n' => b'\n',
                        't' => b'\t',
                        'x' => {
                            self.state = State::HexFirst;
                            continue;
                        }
                        other => {
                            // An invalid escape sequence unescapes as
                            // itself: queue the '\' and the codepoint.
                            self.state = State::bytes(&[b'\\'], other);
                            continue;
                        }
                    };
                    self.state = State::Start;
                    return Some(literal);
                }
                State::HexFirst => {
                    // Unless a hex digit follows, the sequence unescapes as
                    // itself: emit the '\' now and queue the rest.
                    let leftover = match self.it.next() {
                        Some(ch) if ch.is_ascii_hexdigit() => {
                            self.state = State::HexSecond(ch);
                            continue;
                        }
                        Some(ch) => State::bytes(&[b'x'], ch),
                        None => State::bytes_raw(&[b'x']),
                    };
                    self.state = leftover;
                    return Some(b'\\');
                }
                State::HexSecond(first) => {
                    match self.it.next() {
                        Some(second) if second.is_ascii_hexdigit() => {
                            self.state = State::Start;
                            let hi = char_to_hexdigit(first);
                            let lo = char_to_hexdigit(second);
                            return Some((hi << 4) | lo);
                        }
                        // An invalid escape sequence unescapes as itself.
                        Some(other) => {
                            self.state = State::bytes2(&[b'x'], first, other);
                        }
                        // An incomplete escape sequence unescapes as itself.
                        None => {
                            self.state = State::bytes(&[b'x'], first);
                        }
                    }
                    return Some(b'\\');
                }
            }
        }
    }
}
+
/// The states of the finite state machine that drives the unescaping
/// iterator.
#[derive(Clone, Debug)]
#[cfg(feature = "alloc")]
enum UnescapeState {
    /// The initial state: look for the start of an escape sequence, and
    /// otherwise emit the next codepoint as-is.
    Start,
    /// Emit the byte stored at `buf[cur]`.
    ///
    /// This state must never be constructed with `cur >= len`; whenever it
    /// is visited, `cur < len` is assumed to hold.
    Bytes { buf: [u8; 11], cur: usize, len: usize },
    /// Entered once a `\` has been seen.
    Escape,
    /// Entered once a `\x` has been seen.
    HexFirst,
    /// Entered once a `\xN` has been seen, where `N` is in `[0-9A-Fa-f]`.
    /// The codepoint carried by this state is that `N`.
    HexSecond(char),
}
+
#[cfg(feature = "alloc")]
impl UnescapeState {
    /// Build a `Bytes` state that will emit exactly the bytes in `bytes`.
    ///
    /// # Panics
    ///
    /// Panics if `bytes.len() > 11`.
    fn bytes_raw(bytes: &[u8]) -> UnescapeState {
        let len = bytes.len();
        // The limit may be raised, provided 'buf' in the 'Bytes' state is
        // made large enough to hold the data.
        assert!(len <= 11, "no more than 11 bytes allowed");
        let mut buf = [0u8; 11];
        buf[..len].copy_from_slice(bytes);
        UnescapeState::Bytes { buf, cur: 0, len }
    }

    /// Build a `Bytes` state that will emit `prefix` followed by the UTF-8
    /// encoding of `ch`.
    ///
    /// # Panics
    ///
    /// Panics if `prefix.len() > 3`.
    fn bytes(prefix: &[u8], ch: char) -> UnescapeState {
        // The limit may be raised, provided 'buf' in the 'Bytes' state is
        // made large enough to hold the data.
        assert!(prefix.len() <= 3, "no more than 3 bytes allowed");
        let mut buf = [0u8; 11];
        let (head, tail) = buf.split_at_mut(prefix.len());
        head.copy_from_slice(prefix);
        let len = prefix.len() + ch.encode_utf8(tail).len();
        UnescapeState::Bytes { buf, cur: 0, len }
    }

    /// Build a `Bytes` state that will emit `prefix` followed by the UTF-8
    /// encoding of `ch1` and then that of `ch2`.
    ///
    /// # Panics
    ///
    /// Panics if `prefix.len() > 3`.
    fn bytes2(prefix: &[u8], ch1: char, ch2: char) -> UnescapeState {
        // The limit may be raised, provided 'buf' in the 'Bytes' state is
        // made large enough to hold the data.
        assert!(prefix.len() <= 3, "no more than 3 bytes allowed");
        let mut buf = [0u8; 11];
        let (head, tail) = buf.split_at_mut(prefix.len());
        head.copy_from_slice(prefix);
        let len1 = ch1.encode_utf8(tail).len();
        let len2 = ch2.encode_utf8(&mut tail[len1..]).len();
        let len = prefix.len() + len1 + len2;
        UnescapeState::Bytes { buf, cur: 0, len }
    }
}
+
/// Map a hexadecimal digit codepoint to its numeric value (0 through 15).
///
/// # Panics
///
/// Panics if `ch` is not in `[0-9A-Fa-f]`.
#[cfg(feature = "alloc")]
fn char_to_hexdigit(ch: char) -> u8 {
    let value: u32 = ch.to_digit(16).unwrap();
    u8::try_from(value).unwrap()
}
+
/// Map a numeric value in `0..=15` to its uppercase hexadecimal digit
/// codepoint (`'0'`-`'9'`, `'A'`-`'F'`).
///
/// # Panics
///
/// Panics when `digit > 15`.
fn hexdigit_to_char(digit: u8) -> char {
    let lowercase = char::from_digit(digit.into(), 16).unwrap();
    lowercase.to_ascii_uppercase()
}
+
#[cfg(all(test, feature = "std"))]
mod tests {
    use crate::BString;

    use super::*;

    // Shorthand for building the expected `BString` from anything byte-like.
    #[allow(non_snake_case)]
    fn B<T: AsRef<[u8]>>(bytes: T) -> BString {
        BString::from(bytes.as_ref())
    }

    // Escape `bytes` and collect the result into a `String`.
    fn e<T: AsRef<[u8]>>(bytes: T) -> String {
        EscapeBytes::new(bytes.as_ref()).to_string()
    }

    // Unescape `string` and collect the result into a `BString`.
    fn u(string: &str) -> BString {
        UnescapeBytes::new(string.chars()).collect()
    }

    #[test]
    fn escape() {
        // (expected escaped text, raw input bytes)
        let cases: &[(&str, &[u8])] = &[
            (r"a", br"a"),
            (r"\\x61", br"\x61"),
            (r"a", b"\x61"),
            (r"~", b"\x7E"),
            (r"\x7F", b"\x7F"),
            // Special single-character escapes.
            (r"\n", b"\n"),
            (r"\r", b"\r"),
            (r"\t", b"\t"),
            (r"\\", b"\\"),
            (r"\0", b"\0"),
            (r"\0", b"\x00"),
            // Bytes that are not valid UTF-8 on their own.
            (r"\x88", b"\x88"),
            (r"\x8F", b"\x8F"),
            (r"\xF8", b"\xF8"),
            (r"\xFF", b"\xFF"),
            // Prefixes of a three-byte sequence, then the full sequence.
            (r"\xE2", b"\xE2"),
            (r"\xE2\x98", b"\xE2\x98"),
            (r"☃", b"\xE2\x98\x83"),
            // Prefixes of a four-byte sequence, then the full sequence.
            (r"\xF0", b"\xF0"),
            (r"\xF0\x9F", b"\xF0\x9F"),
            (r"\xF0\x9F\x92", b"\xF0\x9F\x92"),
            (r"💩", b"\xF0\x9F\x92\xA9"),
        ];
        for &(want, input) in cases {
            assert_eq!(want, e(input));
        }
    }

    #[test]
    fn unescape() {
        // (expected raw bytes, escaped input text)
        let cases: &[(&[u8], &str)] = &[
            (b"a", r"a"),
            (br"\x61", r"\\x61"),
            (b"a", r"\x61"),
            (b"~", r"\x7E"),
            (b"\x7F", r"\x7F"),
            // Special single-character escapes.
            (b"\n", r"\n"),
            (b"\r", r"\r"),
            (b"\t", r"\t"),
            (b"\\", r"\\"),
            (b"\0", r"\0"),
            (b"\0", r"\x00"),
            // Bytes that are not valid UTF-8 on their own.
            (b"\x88", r"\x88"),
            (b"\x8F", r"\x8F"),
            (b"\xF8", r"\xF8"),
            (b"\xFF", r"\xFF"),
            // Prefixes of a three-byte sequence, then the full sequence.
            (b"\xE2", r"\xE2"),
            (b"\xE2\x98", r"\xE2\x98"),
            ("☃".as_bytes(), r"\xE2\x98\x83"),
            // Same idea with lowercase hex digits.
            (b"\xF0", r"\xf0"),
            (b"\xF0\x9F", r"\xf0\x9f"),
            (b"\xF0\x9F\x92", r"\xf0\x9f\x92"),
            ("💩".as_bytes(), r"\xf0\x9f\x92\xa9"),
        ];
        for &(want, input) in cases {
            assert_eq!(B(want), u(input));
        }
    }

    #[test]
    fn unescape_weird() {
        // Incomplete and invalid escape sequences unescape as themselves.
        // (expected raw bytes, escaped input text)
        let cases: &[(&[u8], &str)] = &[
            (b"\\", r"\"),
            (b"\\", r"\\"),
            (b"\\x", r"\x"),
            (b"\\xA", r"\xA"),
            (b"\\xZ", r"\xZ"),
            (b"\\xZZ", r"\xZZ"),
            (b"\\i", r"\i"),
            (b"\\u", r"\u"),
            (b"\\u{2603}", r"\u{2603}"),
        ];
        for &(want, input) in cases {
            assert_eq!(B(want), u(input));
        }
    }
}
+
1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 +95 +96 +97 +98 +99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 +247 +248 +249 +250 +251 +252 +253 +254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 +265 +266 +267 +268 +269 +270 +271 +272 +273 +274 +275 +276 +277 +278 +279 +280 +281 +282 +283 +284 +285 +286 +287 +288 +289 +290 +291 +292 +293 +294 +295 +296 +297 +298 +299 +300 +301 +302 +303 +304 +305 +306 +307 +308 +309 +310 +311 +312 +313 +314 +315 +316 +317 +318 +319 +320 +321 +322 +323 +324 +325 +326 +327 +328 +329 +330 +331 +332 +333 +334 +335 +336 +337 +338 +339 +340 +341 +342 +343 +344 +345 +346 +347 +348 +349 +350 +351 +352 +353 +354 +355 +356 +357 +358 +359 +360 +361 +362 +363 +364 +365 +366 +367 +368 +369 +370 +371 +372 +373 +374 +375 +376 +377 +378 +379 +380 +381 +382 +383 +384 +385 +386 +387 +388 +389 +390 +391 +392 +393 +394 +395 +396 +397 +398 +399 +400 +401 +402 +403 +404 +405 +406 +407 +408 +409 +410 +411 +412 +413 +414 +415 +416 +417 +418 +419 +420 +421 
+422 +423 +424 +425 +426 +427 +428 +429 +430 +431 +432 +433 +434 +435 +436 +437 +438 +439 +440 +441 +442 +443 +444 +445 +446 +447 +448 +449 +450 +451 +452 +453 +454 +455 +456 +457 +458 +459 +460 +461 +462 +463 +464 +465 +466 +467 +468 +469 +470 +471 +472 +473 +474 +475 +476 +477 +478 +479 +480 +481 +482 +483 +484 +485 +486 +487 +488 +489 +490 +491 +492 +493 +494 +495 +496 +497 +498 +499 +500 +501 +502 +503 +504 +505 +506 +507 +508 +509 +510 +511 +512 +513 +514 +515 +516 +517 +518 +519 +520 +521 +522 +523 +524 +525 +526 +527 +528 +529 +530 +531 +532 +533 +534 +535 +536 +537 +538 +539 +540 +541 +542 +543 +544 +545 +546 +547 +548 +549 +550 +551 +552 +553 +554 +555 +556 +557 +558 +559 +560 +561 +562 +563 +564 +565 +566 +567 +568 +569 +570 +571 +572 +573 +574 +575 +576 +577 +578 +579 +580 +581 +582 +583 +584 +585 +586 +587 +588 +589 +590 +591 +592 +593 +594 +595 +596 +597 +598 +599 +600 +601 +602 +603 +604 +605 +606 +607 +608 +609 +610 +611 +612 +613 +614 +615 +616 +617 +618 +619 +620 +621 +622 +623 +624 +625 +626 +627 +628 +629 +630 +631 +632 +633 +634 +635 +636 +637 +638 +639 +640 +641 +642 +643 +644 +645 +646 +647 +648 +649 +650 +651 +652 +653 +654 +655 +656 +657 +658 +659 +660 +661 +662 +663 +664 +665 +666 +667 +668 +669 +670 +671 +672 +673 +674 +675 +676 +677 +678 +679 +680 +681 +682 +683 +684 +685 +686 +687 +688 +689 +690 +691 +692 +693 +694 +695 +696 +697 +698 +699 +700 +701 +702 +703 +704 +705 +706 +707 +708 +709 +710 +711 +712 +713 +714 +715 +716 +717 +718 +719 +720 +721 +722 +723 +724 +725 +726 +727 +728 +729 +730 +731 +732 +733 +734 +735 +736 +737 +738 +739 +740 +741 +742 +743 +744 +745 +746 +747 +748 +749 +750 +751 +752 +753 +754 +755 +756 +757 +758 +759 +760 +761 +762 +763 +764 +765 +766 +767 +768 +769 +770 +771 +772 +773 +774 +775 +776 +777 +778 +779 +780 +781 +782 +783 +784 +785 +786 +787 +788 +789 +790 +791 +792 +793 +794 +795 +796 +797 +798 +799 +800 +801 +802 +803 +804 +805 +806 +807 +808 +809 +810 +811 +812 +813 +814 +815 +816 +817 +818 +819 +820 +821 
+822 +823 +824 +825 +826 +827 +828 +829 +830 +831 +832 +833 +834 +835 +836 +837 +838 +839 +840 +841 +842 +843 +844 +845 +846 +847 +848 +849 +850 +851 +852 +853 +854 +855 +856 +857 +858 +859 +860 +861 +862 +863 +864 +865 +866 +867 +868 +869 +870 +871 +872 +873 +874 +875 +876 +877 +878 +879 +880 +881 +882 +883 +884 +885 +886 +887 +888 +889 +890 +891 +892 +893 +894 +895 +896 +897 +898 +899 +900 +901 +902 +903 +904 +905 +906 +907 +908 +909 +910 +911 +912 +913 +914 +915 +916 +917 +918 +919 +920 +921 +922 +923 +924 +925 +926 +927 +928 +929 +930 +931 +932 +933 +934 +935 +936 +937 +938 +939 +940 +941 +942 +943 +944 +945 +946 +947 +948 +949 +950 +951 +952 +953 +954 +955 +956 +957 +958 +959 +960 +961 +962 +963 +964 +965 +966 +967 +968 +969 +970 +971 +972 +973 +974 +975 +976 +977 +978 +979 +980 +981 +982 +983 +984 +985 +986 +987 +988 +989 +990 +991 +992 +993 +994 +995 +996 +997 +998 +999 +1000 +1001 +1002 +1003 +1004 +1005 +1006 +1007 +1008 +1009 +1010 +1011 +1012 +1013 +1014 +1015 +1016 +1017 +1018 +1019 +1020 +1021 +1022 +1023 +1024 +1025 +1026 +1027 +1028 +1029 +1030 +1031 +1032 +1033 +1034 +1035 +1036 +1037 +1038 +1039 +1040 +1041 +1042 +1043 +1044 +1045 +1046 +1047 +1048 +1049 +1050 +1051 +1052 +1053 +1054 +1055 +1056 +1057 +1058 +1059 +1060 +1061 +1062 +1063 +1064 +1065 +1066 +1067 +1068 +1069 +1070 +1071 +1072 +1073 +1074 +1075 +1076 +1077 +1078 +1079 +1080 +1081 +1082 +1083 +1084 +1085 +1086 +1087 +1088 +1089 +1090 +1091 +1092 +1093 +1094 +1095 +1096 +1097 +1098 +1099 +1100 +1101 +1102 +1103 +1104 +1105 +1106 +1107 +1108 +1109 +1110 +1111 +1112 +1113 +1114 +1115 +1116 +1117 +1118 +1119 +1120 +1121 +1122 +1123 +1124 +1125 +1126 +1127 +1128 +1129 +1130 +1131 +1132 +1133 +1134 +1135 +1136 +1137 +1138 +1139 +1140 +1141 +1142 +1143 +1144 +1145 +1146 +1147 +1148 +1149 +1150 +1151 +1152 +1153 +1154 +1155 +1156 +1157 +1158 +1159 +1160 +1161 +1162 +1163 +1164 +1165 +1166 +1167 +1168 +1169 +1170 +1171 +1172 +1173 +1174 +1175 +1176 +1177 +1178 +1179 +1180 +1181 +1182 +1183 +1184 
+1185 +1186 +1187 +1188 +1189 +1190 +1191 +1192 +1193 +1194 +1195 +1196 +1197 +1198 +1199 +1200 +1201 +1202 +1203 +1204 +1205 +1206 +1207 +1208 +1209 +1210 +1211 +1212 +1213 +1214 +1215 +1216 +1217 +1218 +1219 +1220 +1221 +1222 +1223 +1224 +1225 +1226 +1227 +1228 +1229 +1230 +1231 +1232 +1233 +1234 +1235 +1236 +1237 +1238 +1239 +1240 +1241 +1242 +1243 +1244 +1245 +1246 +1247 +1248 +1249 +1250 +1251 +1252 +1253 +1254 +1255 +1256 +1257 +1258 +1259 +1260 +1261 +1262 +1263 +1264 +1265 +1266 +1267 +1268 +1269 +1270 +1271 +1272 +1273 +1274 +1275 +1276 +1277 +1278 +1279 +1280 +1281 +1282 +1283 +1284 +1285 +1286 +1287 +1288 +1289 +1290 +1291 +1292 +1293 +1294 +1295 +1296 +1297 +1298 +1299 +1300 +1301 +1302 +1303 +1304 +1305 +1306 +1307 +1308 +1309 +1310 +1311 +1312 +1313 +1314 +1315 +1316 +1317 +1318 +1319 +1320 +1321 +1322 +1323 +1324 +1325 +1326 +1327 +1328 +1329 +1330 +1331 +1332 +1333 +1334 +1335 +1336 +1337 +1338 +1339 +1340 +1341 +1342 +1343 +1344 +1345 +1346 +1347 +1348 +1349 +1350 +1351 +1352 +1353 +1354 +1355 +1356 +1357 +1358 +1359 +1360 +1361 +1362 +1363 +1364 +1365 +1366 +1367 +1368 +1369 +1370 +1371 +1372 +1373 +1374 +1375 +1376 +1377 +1378 +1379 +1380 +1381 +1382 +1383 +1384 +1385 +1386 +1387 +1388 +1389 +1390 +1391 +1392 +1393 +1394 +1395 +1396 +1397 +1398 +1399 +1400 +1401 +1402 +1403 +1404 +1405 +1406 +1407 +1408 +1409 +1410 +1411 +1412 +1413 +1414 +1415 +1416 +1417 +1418 +1419 +1420 +1421 +1422 +1423 +1424 +1425 +1426 +1427 +1428 +1429 +1430 +1431 +1432 +1433 +1434 +1435 +1436 +1437 +1438 +1439 +1440 +1441 +1442 +1443 +1444 +1445 +1446 +1447 +1448 +1449 +1450 +1451 +1452 +1453 +1454 +1455 +1456 +1457 +1458 +1459 +1460 +1461 +1462 +1463 +1464 +1465 +1466 +1467 +1468 +1469 +1470 +1471 +1472 +1473 +1474 +1475 +1476 +1477 +1478 +1479 +1480 +1481 +1482 +1483 +1484 +1485 +1486 +1487 +1488 +1489 +1490 +1491 +1492 +1493 +1494 +1495 +1496 +1497 +1498 +1499 +1500 +1501 +1502 +1503 +1504 +1505 +1506 +1507 +1508 +1509 +1510 +1511 +1512 +1513 +1514 +1515 +1516 +1517 
+1518 +1519 +1520 +1521 +1522 +1523 +1524 +1525 +1526 +1527 +1528 +1529 +1530 +1531 +1532 +1533 +1534 +1535 +1536 +1537 +1538 +1539 +1540 +1541 +1542 +1543 +1544 +1545 +1546 +1547 +1548 +1549 +1550 +1551 +1552 +1553 +1554 +1555 +1556 +1557 +1558 +1559 +1560 +1561 +1562 +1563 +1564 +1565 +1566 +1567 +1568 +1569 +1570 +1571 +1572 +1573 +1574 +1575 +1576 +1577 +1578 +1579 +1580 +1581 +1582 +1583 +1584 +1585 +1586 +1587 +1588 +1589 +1590 +1591 +1592 +1593 +1594 +1595 +1596 +1597 +1598 +1599 +1600 +1601 +1602 +1603 +1604 +1605 +1606 +1607 +1608 +1609 +1610 +1611 +1612 +1613 +1614 +1615 +1616 +1617 +1618 +1619 +1620 +1621 +1622 +1623 +1624 +1625 +1626 +1627 +1628 +1629 +1630 +1631 +1632 +1633 +1634 +1635 +1636 +1637 +1638 +1639 +1640 +1641 +1642 +1643 +1644 +1645 +1646 +1647 +1648 +1649 +1650 +1651 +1652 +1653 +1654 +1655 +1656 +1657 +1658 +1659 +1660 +1661 +1662 +1663 +1664 +1665 +1666 +1667 +1668 +1669 +1670 +1671 +1672 +1673 +1674 +1675 +1676 +1677 +1678 +1679 +1680 +1681 +1682 +1683 +1684 +1685 +1686 +1687 +1688 +1689 +1690 +1691 +1692 +1693 +1694 +1695 +1696 +1697 +1698 +1699 +1700 +1701 +1702 +1703 +1704 +1705 +1706 +1707 +1708 +1709 +1710 +1711 +1712 +1713 +1714 +1715 +1716 +1717 +1718 +1719 +1720 +1721 +1722 +1723 +1724 +1725 +1726 +1727 +1728 +1729 +1730 +1731 +1732 +1733 +1734 +1735 +1736 +1737 +1738 +1739 +1740 +1741 +1742 +1743 +1744 +1745 +1746 +1747 +1748 +1749 +1750 +1751 +1752 +1753 +1754 +1755 +1756 +1757 +1758 +1759 +1760 +1761 +1762 +1763 +1764 +1765 +1766 +1767 +1768 +1769 +1770 +1771 +1772 +1773 +1774 +1775 +1776 +1777 +1778 +1779 +1780 +1781 +1782 +1783 +1784 +1785 +1786 +1787 +1788 +1789 +1790 +1791 +1792 +1793 +1794 +1795 +1796 +1797 +1798 +1799 +1800 +1801 +1802 +1803 +1804 +1805 +1806 +1807 +1808 +1809 +1810 +1811 +1812 +1813 +1814 +1815 +1816 +1817 +1818 +1819 +1820 +1821 +1822 +1823 +1824 +1825 +1826 +1827 +1828 +1829 +1830 +1831 +1832 +1833 +1834 +1835 +1836 +1837 +1838 +1839 +1840 +1841 +1842 +1843 +1844 +1845 +1846 +1847 +1848 +1849 +1850 
+1851 +1852 +1853 +1854 +1855 +1856 +1857 +1858 +1859 +1860 +1861 +1862 +1863 +1864 +1865 +1866 +1867 +1868 +1869 +1870 +1871 +1872 +1873 +1874 +1875 +1876 +1877 +1878 +1879 +1880 +1881 +1882 +1883 +1884 +1885 +1886 +1887 +1888 +1889 +1890 +1891 +1892 +1893 +1894 +1895 +1896 +1897 +1898 +1899 +1900 +1901 +1902 +1903 +1904 +1905 +1906 +1907 +1908 +1909 +1910 +1911 +1912 +1913 +1914 +1915 +1916 +1917 +1918 +1919 +1920 +1921 +1922 +1923 +1924 +1925 +1926 +1927 +1928 +1929 +1930 +1931 +1932 +1933 +1934 +1935 +1936 +1937 +1938 +1939 +1940 +1941 +1942 +1943 +1944 +1945 +1946 +1947 +1948 +1949 +1950 +1951 +1952 +1953 +1954 +1955 +1956 +1957 +1958 +1959 +1960 +1961 +1962 +1963 +1964 +1965 +1966 +1967 +1968 +1969 +1970 +1971 +1972 +1973 +1974 +1975 +1976 +1977 +1978 +1979 +1980 +1981 +1982 +1983 +1984 +1985 +1986 +1987 +1988 +1989 +1990 +1991 +1992 +1993 +1994 +1995 +1996 +1997 +1998 +1999 +2000 +2001 +2002 +2003 +2004 +2005 +2006 +2007 +2008 +2009 +2010 +2011 +2012 +2013 +2014 +2015 +2016 +2017 +2018 +2019 +2020 +2021 +2022 +2023 +2024 +2025 +2026 +2027 +2028 +2029 +2030 +2031 +2032 +2033 +2034 +2035 +2036 +2037 +2038 +2039 +2040 +2041 +2042 +2043 +2044 +2045 +2046 +2047 +2048 +2049 +2050 +2051 +2052 +2053 +2054 +2055 +2056 +2057 +2058 +2059 +2060 +2061 +2062 +2063 +2064 +2065 +2066 +2067 +2068 +2069 +2070 +2071 +2072 +2073 +2074 +2075 +2076 +2077 +2078 +2079 +2080 +2081 +2082 +2083 +2084 +2085 +2086 +2087 +2088 +2089 +2090 +2091 +2092 +2093 +2094 +2095 +2096 +2097 +2098 +2099 +2100 +2101 +2102 +2103 +2104 +2105 +2106 +2107 +2108 +2109 +2110 +2111 +2112 +2113 +2114 +2115 +2116 +2117 +2118 +2119 +2120 +2121 +2122 +2123 +2124 +2125 +2126 +2127 +2128 +2129 +2130 +2131 +2132 +2133 +2134 +2135 +2136 +2137 +2138 +2139 +2140 +2141 +2142 +2143 +2144 +2145 +2146 +2147 +2148 +2149 +2150 +2151 +2152 +2153 +2154 +2155 +2156 +2157 +2158 +2159 +2160 +2161 +2162 +2163 +2164 +2165 +2166 +2167 +2168 +2169 +2170 +2171 +2172 +2173 +2174 +2175 +2176 +2177 +2178 +2179 +2180 +2181 +2182 +2183 
+2184 +2185 +2186 +2187 +2188 +2189 +2190 +2191 +2192 +2193 +2194 +2195 +2196 +2197 +2198 +2199 +2200 +2201 +2202 +2203 +2204 +2205 +2206 +2207 +2208 +2209 +2210 +2211 +2212 +2213 +2214 +2215 +2216 +2217 +2218 +2219 +2220 +2221 +2222 +2223 +2224 +2225 +2226 +2227 +2228 +2229 +2230 +2231 +2232 +2233 +2234 +2235 +2236 +2237 +2238 +2239 +2240 +2241 +2242 +2243 +2244 +2245 +2246 +2247 +2248 +2249 +2250 +2251 +2252 +2253 +2254 +2255 +2256 +2257 +2258 +2259 +2260 +2261 +2262 +2263 +2264 +2265 +2266 +2267 +2268 +2269 +2270 +2271 +2272 +2273 +2274 +2275 +2276 +2277 +2278 +2279 +2280 +2281 +2282 +2283 +2284 +2285 +2286 +2287 +2288 +2289 +2290 +2291 +2292 +2293 +2294 +2295 +2296 +2297 +2298 +2299 +2300 +2301 +2302 +2303 +2304 +2305 +2306 +2307 +2308 +2309 +2310 +2311 +2312 +2313 +2314 +2315 +2316 +2317 +2318 +2319 +2320 +2321 +2322 +2323 +2324 +2325 +2326 +2327 +2328 +2329 +2330 +2331 +2332 +2333 +2334 +2335 +2336 +2337 +2338 +2339 +2340 +2341 +2342 +2343 +2344 +2345 +2346 +2347 +2348 +2349 +2350 +2351 +2352 +2353 +2354 +2355 +2356 +2357 +2358 +2359 +2360 +2361 +2362 +2363 +2364 +2365 +2366 +2367 +2368 +2369 +2370 +2371 +2372 +2373 +2374 +2375 +2376 +2377 +2378 +2379 +2380 +2381 +2382 +2383 +2384 +2385 +2386 +2387 +2388 +2389 +2390 +2391 +2392 +2393 +2394 +2395 +2396 +2397 +2398 +2399 +2400 +2401 +2402 +2403 +2404 +2405 +2406 +2407 +2408 +2409 +2410 +2411 +2412 +2413 +2414 +2415 +2416 +2417 +2418 +2419 +2420 +2421 +2422 +2423 +2424 +2425 +2426 +2427 +2428 +2429 +2430 +2431 +2432 +2433 +2434 +2435 +2436 +2437 +2438 +2439 +2440 +2441 +2442 +2443 +2444 +2445 +2446 +2447 +2448 +2449 +2450 +2451 +2452 +2453 +2454 +2455 +2456 +2457 +2458 +2459 +2460 +2461 +2462 +2463 +2464 +2465 +2466 +2467 +2468 +2469 +2470 +2471 +2472 +2473 +2474 +2475 +2476 +2477 +2478 +2479 +2480 +2481 +2482 +2483 +2484 +2485 +2486 +2487 +2488 +2489 +2490 +2491 +2492 +2493 +2494 +2495 +2496 +2497 +2498 +2499 +2500 +2501 +2502 +2503 +2504 +2505 +2506 +2507 +2508 +2509 +2510 +2511 +2512 +2513 +2514 +2515 +2516 
+2517 +2518 +2519 +2520 +2521 +2522 +2523 +2524 +2525 +2526 +2527 +2528 +2529 +2530 +2531 +2532 +2533 +2534 +2535 +2536 +2537 +2538 +2539 +2540 +2541 +2542 +2543 +2544 +2545 +2546 +2547 +2548 +2549 +2550 +2551 +2552 +2553 +2554 +2555 +2556 +2557 +2558 +2559 +2560 +2561 +2562 +2563 +2564 +2565 +2566 +2567 +2568 +2569 +2570 +2571 +2572 +2573 +2574 +2575 +2576 +2577 +2578 +2579 +2580 +2581 +2582 +2583 +2584 +2585 +2586 +2587 +2588 +2589 +2590 +2591 +2592 +2593 +2594 +2595 +2596 +2597 +2598 +2599 +2600 +2601 +2602 +2603 +2604 +2605 +2606 +2607 +2608 +2609 +2610 +2611 +2612 +2613 +2614 +2615 +2616 +2617 +2618 +2619 +2620 +2621 +2622 +2623 +2624 +2625 +2626 +2627 +2628 +2629 +2630 +2631 +2632 +2633 +2634 +2635 +2636 +2637 +2638 +2639 +2640 +2641 +2642 +2643 +2644 +2645 +2646 +2647 +2648 +2649 +2650 +2651 +2652 +2653 +2654 +2655 +2656 +2657 +2658 +2659 +2660 +2661 +2662 +2663 +2664 +2665 +2666 +2667 +2668 +2669 +2670 +2671 +2672 +2673 +2674 +2675 +2676 +2677 +2678 +2679 +2680 +2681 +2682 +2683 +2684 +2685 +2686 +2687 +2688 +2689 +2690 +2691 +2692 +2693 +2694 +2695 +2696 +2697 +2698 +2699 +2700 +2701 +2702 +2703 +2704 +2705 +2706 +2707 +2708 +2709 +2710 +2711 +2712 +2713 +2714 +2715 +2716 +2717 +2718 +2719 +2720 +2721 +2722 +2723 +2724 +2725 +2726 +2727 +2728 +2729 +2730 +2731 +2732 +2733 +2734 +2735 +2736 +2737 +2738 +2739 +2740 +2741 +2742 +2743 +2744 +2745 +2746 +2747 +2748 +2749 +2750 +2751 +2752 +2753 +2754 +2755 +2756 +2757 +2758 +2759 +2760 +2761 +2762 +2763 +2764 +2765 +2766 +2767 +2768 +2769 +2770 +2771 +2772 +2773 +2774 +2775 +2776 +2777 +2778 +2779 +2780 +2781 +2782 +2783 +2784 +2785 +2786 +2787 +2788 +2789 +2790 +2791 +2792 +2793 +2794 +2795 +2796 +2797 +2798 +2799 +2800 +2801 +2802 +2803 +2804 +2805 +2806 +2807 +2808 +2809 +2810 +2811 +2812 +2813 +2814 +2815 +2816 +2817 +2818 +2819 +2820 +2821 +2822 +2823 +2824 +2825 +2826 +2827 +2828 +2829 +2830 +2831 +2832 +2833 +2834 +2835 +2836 +2837 +2838 +2839 +2840 +2841 +2842 +2843 +2844 +2845 +2846 +2847 +2848 +2849 
+2850 +2851 +2852 +2853 +2854 +2855 +2856 +2857 +2858 +2859 +2860 +2861 +2862 +2863 +2864 +2865 +2866 +2867 +2868 +2869 +2870 +2871 +2872 +2873 +2874 +2875 +2876 +2877 +2878 +2879 +2880 +2881 +2882 +2883 +2884 +2885 +2886 +2887 +2888 +2889 +2890 +2891 +2892 +2893 +2894 +2895 +2896 +2897 +2898 +2899 +2900 +2901 +2902 +2903 +2904 +2905 +2906 +2907 +2908 +2909 +2910 +2911 +2912 +2913 +2914 +2915 +2916 +2917 +2918 +2919 +2920 +2921 +2922 +2923 +2924 +2925 +2926 +2927 +2928 +2929 +2930 +2931 +2932 +2933 +2934 +2935 +2936 +2937 +2938 +2939 +2940 +2941 +2942 +2943 +2944 +2945 +2946 +2947 +2948 +2949 +2950 +2951 +2952 +2953 +2954 +2955 +2956 +2957 +2958 +2959 +2960 +2961 +2962 +2963 +2964 +2965 +2966 +2967 +2968 +2969 +2970 +2971 +2972 +2973 +2974 +2975 +2976 +2977 +2978 +2979 +2980 +2981 +2982 +2983 +2984 +2985 +2986 +2987 +2988 +2989 +2990 +2991 +2992 +2993 +2994 +2995 +2996 +2997 +2998 +2999 +3000 +3001 +3002 +3003 +3004 +3005 +3006 +3007 +3008 +3009 +3010 +3011 +3012 +3013 +3014 +3015 +3016 +3017 +3018 +3019 +3020 +3021 +3022 +3023 +3024 +3025 +3026 +3027 +3028 +3029 +3030 +3031 +3032 +3033 +3034 +3035 +3036 +3037 +3038 +3039 +3040 +3041 +3042 +3043 +3044 +3045 +3046 +3047 +3048 +3049 +3050 +3051 +3052 +3053 +3054 +3055 +3056 +3057 +3058 +3059 +3060 +3061 +3062 +3063 +3064 +3065 +3066 +3067 +3068 +3069 +3070 +3071 +3072 +3073 +3074 +3075 +3076 +3077 +3078 +3079 +3080 +3081 +3082 +3083 +3084 +3085 +3086 +3087 +3088 +3089 +3090 +3091 +3092 +3093 +3094 +3095 +3096 +3097 +3098 +3099 +3100 +3101 +3102 +3103 +3104 +3105 +3106 +3107 +3108 +3109 +3110 +3111 +3112 +3113 +3114 +3115 +3116 +3117 +3118 +3119 +3120 +3121 +3122 +3123 +3124 +3125 +3126 +3127 +3128 +3129 +3130 +3131 +3132 +3133 +3134 +3135 +3136 +3137 +3138 +3139 +3140 +3141 +3142 +3143 +3144 +3145 +3146 +3147 +3148 +3149 +3150 +3151 +3152 +3153 +3154 +3155 +3156 +3157 +3158 +3159 +3160 +3161 +3162 +3163 +3164 +3165 +3166 +3167 +3168 +3169 +3170 +3171 +3172 +3173 +3174 +3175 +3176 +3177 +3178 +3179 +3180 +3181 +3182 
+3183 +3184 +3185 +3186 +3187 +3188 +3189 +3190 +3191 +3192 +3193 +3194 +3195 +3196 +3197 +3198 +3199 +3200 +3201 +3202 +3203 +3204 +3205 +3206 +3207 +3208 +3209 +3210 +3211 +3212 +3213 +3214 +3215 +3216 +3217 +3218 +3219 +3220 +3221 +3222 +3223 +3224 +3225 +3226 +3227 +3228 +3229 +3230 +3231 +3232 +3233 +3234 +3235 +3236 +3237 +3238 +3239 +3240 +3241 +3242 +3243 +3244 +3245 +3246 +3247 +3248 +3249 +3250 +3251 +3252 +3253 +3254 +3255 +3256 +3257 +3258 +3259 +3260 +3261 +3262 +3263 +3264 +3265 +3266 +3267 +3268 +3269 +3270 +3271 +3272 +3273 +3274 +3275 +3276 +3277 +3278 +3279 +3280 +3281 +3282 +3283 +3284 +3285 +3286 +3287 +3288 +3289 +3290 +3291 +3292 +3293 +3294 +3295 +3296 +3297 +3298 +3299 +3300 +3301 +3302 +3303 +3304 +3305 +3306 +3307 +3308 +3309 +3310 +3311 +3312 +3313 +3314 +3315 +3316 +3317 +3318 +3319 +3320 +3321 +3322 +3323 +3324 +3325 +3326 +3327 +3328 +3329 +3330 +3331 +3332 +3333 +3334 +3335 +3336 +3337 +3338 +3339 +3340 +3341 +3342 +3343 +3344 +3345 +3346 +3347 +3348 +3349 +3350 +3351 +3352 +3353 +3354 +3355 +3356 +3357 +3358 +3359 +3360 +3361 +3362 +3363 +3364 +3365 +3366 +3367 +3368 +3369 +3370 +3371 +3372 +3373 +3374 +3375 +3376 +3377 +3378 +3379 +3380 +3381 +3382 +3383 +3384 +3385 +3386 +3387 +3388 +3389 +3390 +3391 +3392 +3393 +3394 +3395 +3396 +3397 +3398 +3399 +3400 +3401 +3402 +3403 +3404 +3405 +3406 +3407 +3408 +3409 +3410 +3411 +3412 +3413 +3414 +3415 +3416 +3417 +3418 +3419 +3420 +3421 +3422 +3423 +3424 +3425 +3426 +3427 +3428 +3429 +3430 +3431 +3432 +3433 +3434 +3435 +3436 +3437 +3438 +3439 +3440 +3441 +3442 +3443 +3444 +3445 +3446 +3447 +3448 +3449 +3450 +3451 +3452 +3453 +3454 +3455 +3456 +3457 +3458 +3459 +3460 +3461 +3462 +3463 +3464 +3465 +3466 +3467 +3468 +3469 +3470 +3471 +3472 +3473 +3474 +3475 +3476 +3477 +3478 +3479 +3480 +3481 +3482 +3483 +3484 +3485 +3486 +3487 +3488 +3489 +3490 +3491 +3492 +3493 +3494 +3495 +3496 +3497 +3498 +3499 +3500 +3501 +3502 +3503 +3504 +3505 +3506 +3507 +3508 +3509 +3510 +3511 +3512 +3513 +3514 +3515 
+3516 +3517 +3518 +3519 +3520 +3521 +3522 +3523 +3524 +3525 +3526 +3527 +3528 +3529 +3530 +3531 +3532 +3533 +3534 +3535 +3536 +3537 +3538 +3539 +3540 +3541 +3542 +3543 +3544 +3545 +3546 +3547 +3548 +3549 +3550 +3551 +3552 +3553 +3554 +3555 +3556 +3557 +3558 +3559 +3560 +3561 +3562 +3563 +3564 +3565 +3566 +3567 +3568 +3569 +3570 +3571 +3572 +3573 +3574 +3575 +3576 +3577 +3578 +3579 +3580 +3581 +3582 +3583 +3584 +3585 +3586 +3587 +3588 +3589 +3590 +3591 +3592 +3593 +3594 +3595 +3596 +3597 +3598 +3599 +3600 +3601 +3602 +3603 +3604 +3605 +3606 +3607 +3608 +3609 +3610 +3611 +3612 +3613 +3614 +3615 +3616 +3617 +3618 +3619 +3620 +3621 +3622 +3623 +3624 +3625 +3626 +3627 +3628 +3629 +3630 +3631 +3632 +3633 +3634 +3635 +3636 +3637 +3638 +3639 +3640 +3641 +3642 +3643 +3644 +3645 +3646 +3647 +3648 +3649 +3650 +3651 +3652 +3653 +3654 +3655 +3656 +3657 +3658 +3659 +3660 +3661 +3662 +3663 +3664 +3665 +3666 +3667 +3668 +3669 +3670 +3671 +3672 +3673 +3674 +3675 +3676 +3677 +3678 +3679 +3680 +3681 +3682 +3683 +3684 +3685 +3686 +3687 +3688 +3689 +3690 +3691 +3692 +3693 +3694 +3695 +3696 +3697 +3698 +3699 +3700 +3701 +3702 +3703 +3704 +3705 +3706 +3707 +3708 +3709 +3710 +3711 +3712 +3713 +3714 +3715 +3716 +3717 +3718 +3719 +3720 +3721 +3722 +3723 +3724 +3725 +3726 +3727 +3728 +3729 +3730 +3731 +3732 +3733 +3734 +3735 +3736 +3737 +3738 +3739 +3740 +3741 +3742 +3743 +3744 +3745 +3746 +3747 +3748 +3749 +3750 +3751 +3752 +3753 +3754 +3755 +3756 +3757 +3758 +3759 +3760 +3761 +3762 +3763 +3764 +3765 +3766 +3767 +3768 +3769 +3770 +3771 +3772 +3773 +3774 +3775 +3776 +3777 +3778 +3779 +3780 +3781 +3782 +3783 +3784 +3785 +3786 +3787 +3788 +3789 +3790 +3791 +3792 +3793 +3794 +3795 +3796 +3797 +3798 +3799 +3800 +3801 +3802 +3803 +3804 +3805 +3806 +3807 +3808 +3809 +3810 +3811 +3812 +3813 +3814 +3815 +3816 +3817 +3818 +3819 +3820 +3821 +3822 +3823 +3824 +3825 +3826 +3827 +3828 +3829 +3830 +3831 +3832 +3833 +3834 +3835 +3836 +3837 +3838 +3839 +3840 +3841 +3842 +3843 +3844 +3845 +3846 +3847 +3848 
+3849 +3850 +3851 +3852 +3853 +3854 +3855 +3856 +3857 +3858 +3859 +3860 +3861 +3862 +3863 +3864 +3865 +3866 +3867 +3868 +3869 +3870 +
use core::{iter, slice, str};
+
+#[cfg(all(feature = "alloc", feature = "unicode"))]
+use alloc::vec;
+#[cfg(feature = "alloc")]
+use alloc::{borrow::Cow, string::String, vec::Vec};
+
+#[cfg(feature = "std")]
+use std::{ffi::OsStr, path::Path};
+
+use memchr::{memchr, memmem, memrchr};
+
+use crate::escape_bytes::EscapeBytes;
+#[cfg(feature = "alloc")]
+use crate::ext_vec::ByteVec;
+#[cfg(feature = "unicode")]
+use crate::unicode::{
+ whitespace_len_fwd, whitespace_len_rev, GraphemeIndices, Graphemes,
+ SentenceIndices, Sentences, WordIndices, Words, WordsWithBreakIndices,
+ WordsWithBreaks,
+};
+use crate::{
+ ascii,
+ bstr::BStr,
+ byteset,
+ utf8::{self, CharIndices, Chars, Utf8Chunks, Utf8Error},
+};
+
+/// A short-hand constructor for building a `&[u8]`.
+///
+/// This idiosyncratic constructor is useful for concisely building byte string
+/// slices. Its primary utility is in conveniently writing byte string literals
+/// in a uniform way. For example, consider this code that does not compile:
+///
+/// ```ignore
+/// let strs = vec![b"a", b"xy"];
+/// ```
+///
+/// The above code doesn't compile because the type of the byte string literal
+/// `b"a"` is `&'static [u8; 1]`, and the type of `b"xy"` is
+/// `&'static [u8; 2]`. Since their types aren't the same, they can't be stored
+/// in the same `Vec`. (This is dissimilar from normal Unicode string slices,
+/// where both `"a"` and `"xy"` have the same type of `&'static str`.)
+///
+/// One way of getting the above code to compile is to convert byte strings to
+/// slices. You might try this:
+///
+/// ```ignore
+/// let strs = vec![&b"a", &b"xy"];
+/// ```
+///
+/// But this just creates values with type `& &'static [u8; 1]` and
+/// `& &'static [u8; 2]`. Instead, you need to force the issue like so:
+///
+/// ```
+/// let strs = vec![&b"a"[..], &b"xy"[..]];
+/// // or
+/// let strs = vec![b"a".as_ref(), b"xy".as_ref()];
+/// ```
+///
+/// But neither of these are particularly convenient to type, especially when
+/// it's something as common as a string literal. Thus, this constructor
+/// permits writing the following instead:
+///
+/// ```
+/// use bstr::B;
+///
+/// let strs = vec![B("a"), B(b"xy")];
+/// ```
+///
+/// Notice that this also lets you mix and match both string literals and byte
+/// string literals. This can be quite convenient!
+#[allow(non_snake_case)]
+#[inline]
+pub fn B<'a, B: ?Sized + AsRef<[u8]>>(bytes: &'a B) -> &'a [u8] {
+    // Spell out the trait so it is obvious which conversion is performed:
+    // the input is borrowed as a byte slice for the same lifetime `'a`.
+    AsRef::<[u8]>::as_ref(bytes)
+}
+
+// `[u8]` is the canonical implementor: a byte slice already is its own raw
+// byte view, so both accessors hand back `self` unchanged.
+impl ByteSlice for [u8] {
+ #[inline]
+ fn as_bytes(&self) -> &[u8] {
+ self
+ }
+
+ #[inline]
+ fn as_bytes_mut(&mut self) -> &mut [u8] {
+ self
+ }
+}
+
+impl<const N: usize> ByteSlice for [u8; N] {
+    // Fixed-size byte arrays expose their contents as a slice covering the
+    // whole array; the conversion is written out explicitly here.
+    #[inline]
+    fn as_bytes(&self) -> &[u8] {
+        self.as_slice()
+    }
+
+    #[inline]
+    fn as_bytes_mut(&mut self) -> &mut [u8] {
+        self.as_mut_slice()
+    }
+}
+
+/// Ensure that callers cannot implement `ByteSlice` by making an
+/// unimplementable trait its super trait.
+///
+/// The module itself is not `pub`, so `Sealed` is declared `pub` only to
+/// satisfy the super trait bound on `ByteSlice`.
+mod private {
+ pub trait Sealed {}
+}
+// Implement the seal for byte slices and for fixed-size byte arrays.
+impl private::Sealed for [u8] {}
+impl<const N: usize> private::Sealed for [u8; N] {}
+
+/// A trait that extends `&[u8]` with string oriented methods.
+///
+/// This trait is sealed and cannot be implemented outside of `bstr`.
+pub trait ByteSlice: private::Sealed {
+ /// A method for accessing the raw bytes of this type. This is always a
+ /// no-op and callers shouldn't care about it. This only exists for making
+ /// the extension trait work.
+ ///
+ /// The provided methods of this trait read the underlying bytes through
+ /// this accessor.
+ #[doc(hidden)]
+ fn as_bytes(&self) -> &[u8];
+
+ /// A method for accessing the raw bytes of this type, mutably. This is
+ /// always a no-op and callers shouldn't care about it. This only exists
+ /// for making the extension trait work.
+ ///
+ /// This is the mutable counterpart of `as_bytes`.
+ #[doc(hidden)]
+ fn as_bytes_mut(&mut self) -> &mut [u8];
+
+ /// Return this byte slice as a `&BStr`.
+ ///
+ /// Using `&BStr` is useful because of its `fmt::Debug` representation
+ /// and various other trait implementations (such as `PartialEq` and
+ /// `PartialOrd`). In particular, the `Debug` implementation for `BStr`
+ /// shows its bytes as a normal string. For invalid UTF-8, hex escape
+ /// sequences are used.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::ByteSlice;
+ ///
+ /// println!("{:?}", b"foo\xFFbar".as_bstr());
+ /// ```
+ #[inline]
+ fn as_bstr(&self) -> &BStr {
+     // Wrap the raw bytes in the `BStr` view type.
+     let raw: &[u8] = self.as_bytes();
+     BStr::new(raw)
+ }
+
+ /// Return this byte slice as a `&mut BStr`.
+ ///
+ /// Using `&mut BStr` is useful because of its `fmt::Debug` representation
+ /// and various other trait implementations (such as `PartialEq` and
+ /// `PartialOrd`). In particular, the `Debug` implementation for `BStr`
+ /// shows its bytes as a normal string. For invalid UTF-8, hex escape
+ /// sequences are used.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::ByteSlice;
+ ///
+ /// let mut bytes = *b"foo\xFFbar";
+ /// println!("{:?}", &mut bytes.as_bstr_mut());
+ /// ```
+ #[inline]
+ fn as_bstr_mut(&mut self) -> &mut BStr {
+     // Wrap the raw bytes, mutably, in the `BStr` view type.
+     let raw: &mut [u8] = self.as_bytes_mut();
+     BStr::new_mut(raw)
+ }
+
+ /// Create an immutable byte string from an OS string slice.
+ ///
+ /// When the underlying bytes of OS strings are accessible, then this
+ /// always succeeds and is zero cost. Otherwise, this returns `None` if the
+ /// given OS string is not valid UTF-8. (For example, when the underlying
+ /// bytes are inaccessible on Windows, file paths are allowed to be a
+ /// sequence of arbitrary 16-bit integers. Not all such sequences can be
+ /// transcoded to valid UTF-8.)
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use std::ffi::OsStr;
+ ///
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// let os_str = OsStr::new("foo");
+ /// let bs = <[u8]>::from_os_str(os_str).expect("should be valid UTF-8");
+ /// assert_eq!(bs, B("foo"));
+ /// ```
+ #[cfg(feature = "std")]
+ #[inline]
+ fn from_os_str(os_str: &OsStr) -> Option<&[u8]> {
+     // On Unix the bytes of an `OsStr` are directly accessible, so this
+     // variant always succeeds.
+     #[cfg(unix)]
+     #[inline]
+     fn raw_bytes(source: &OsStr) -> Option<&[u8]> {
+         use std::os::unix::ffi::OsStrExt;
+
+         let bytes = source.as_bytes();
+         Some(bytes)
+     }
+
+     // Elsewhere, succeed only when the OS string is valid UTF-8.
+     #[cfg(not(unix))]
+     #[inline]
+     fn raw_bytes(source: &OsStr) -> Option<&[u8]> {
+         let text = source.to_str()?;
+         Some(text.as_bytes())
+     }
+
+     raw_bytes(os_str)
+ }
+
+ /// Create an immutable byte string from a file path.
+ ///
+ /// When the underlying bytes of paths are accessible, then this always
+ /// succeeds and is zero cost. Otherwise, this returns `None` if the given
+ /// path is not valid UTF-8. (For example, when the underlying bytes are
+ /// inaccessible on Windows, file paths are allowed to be a sequence of
+ /// arbitrary 16-bit integers. Not all such sequences can be transcoded to
+ /// valid UTF-8.)
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use std::path::Path;
+ ///
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// let path = Path::new("foo");
+ /// let bs = <[u8]>::from_path(path).expect("should be valid UTF-8");
+ /// assert_eq!(bs, B("foo"));
+ /// ```
+ #[cfg(feature = "std")]
+ #[inline]
+ fn from_path(path: &Path) -> Option<&[u8]> {
+     // A path is handled exactly like its underlying OS string.
+     let os_str = path.as_os_str();
+     Self::from_os_str(os_str)
+ }
+
+ /// Safely convert this byte string into a `&str` if it's valid UTF-8.
+ ///
+ /// If this byte string is not valid UTF-8, then an error is returned. The
+ /// error returned indicates the first invalid byte found and the length
+ /// of the error.
+ ///
+ /// In cases where a lossy conversion to `&str` is acceptable, then use one
+ /// of the [`to_str_lossy`](trait.ByteSlice.html#method.to_str_lossy) or
+ /// [`to_str_lossy_into`](trait.ByteSlice.html#method.to_str_lossy_into)
+ /// methods.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// # #[cfg(feature = "alloc")] {
+ /// use bstr::{B, ByteSlice, ByteVec};
+ ///
+ /// # fn example() -> Result<(), bstr::Utf8Error> {
+ /// let s = B("☃βツ").to_str()?;
+ /// assert_eq!("☃βツ", s);
+ ///
+ /// let mut bstring = <Vec<u8>>::from("☃βツ");
+ /// bstring.push(b'\xFF');
+ /// let err = bstring.to_str().unwrap_err();
+ /// assert_eq!(8, err.valid_up_to());
+ /// # Ok(()) }; example().unwrap()
+ /// # }
+ /// ```
+ #[inline]
+ fn to_str(&self) -> Result<&str, Utf8Error> {
+     let bytes = self.as_bytes();
+     // Bail out with the validation error if the bytes are not UTF-8.
+     utf8::validate(bytes)?;
+     // SAFETY: This is safe because of the guarantees provided by
+     // utf8::validate, which just succeeded on `bytes`.
+     Ok(unsafe { str::from_utf8_unchecked(bytes) })
+ }
+
+ /// Unsafely convert this byte string into a `&str`, without checking for
+ /// valid UTF-8.
+ ///
+ /// # Safety
+ ///
+ /// Callers *must* ensure that this byte string is valid UTF-8 before
+ /// calling this method. Converting a byte string into a `&str` that is
+ /// not valid UTF-8 is considered undefined behavior.
+ ///
+ /// This routine is useful in performance sensitive contexts where the
+ /// UTF-8 validity of the byte string is already known and it is
+ /// undesirable to pay the cost of an additional UTF-8 validation check
+ /// that [`to_str`](trait.ByteSlice.html#method.to_str) performs.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// // SAFETY: This is safe because string literals are guaranteed to be
+ /// // valid UTF-8 by the Rust compiler.
+ /// let s = unsafe { B("☃βツ").to_str_unchecked() };
+ /// assert_eq!("☃βツ", s);
+ /// ```
+ #[inline]
+ unsafe fn to_str_unchecked(&self) -> &str {
+     // No validation happens here: the caller is responsible for the
+     // UTF-8 validity of these bytes.
+     let bytes = self.as_bytes();
+     str::from_utf8_unchecked(bytes)
+ }
+
+ /// Convert this byte string to a valid UTF-8 string by replacing invalid
+ /// UTF-8 bytes with the Unicode replacement codepoint (`U+FFFD`).
+ ///
+ /// If the byte string is already valid UTF-8, then no copying or
+ /// allocation is performed and a borrowed string slice is returned. If
+ /// the byte string is not valid UTF-8, then an owned string buffer is
+ /// returned with invalid bytes replaced by the replacement codepoint.
+ ///
+ /// This method uses the "substitution of maximal subparts" (Unicode
+ /// Standard, Chapter 3, Section 9) strategy for inserting the replacement
+ /// codepoint. Specifically, a replacement codepoint is inserted whenever a
+ /// byte is found that cannot possibly lead to a valid code unit sequence.
+ /// If there were previous bytes that represented a prefix of a well-formed
+ /// code unit sequence, then all of those bytes are substituted with a
+ /// single replacement codepoint. The "substitution of maximal subparts"
+ /// strategy is the same strategy used by
+ /// [W3C's Encoding standard](https://www.w3.org/TR/encoding/).
+ /// For a more precise description of the maximal subpart strategy, see
+ /// the Unicode Standard, Chapter 3, Section 9. See also
+ /// [Public Review Issue #121](https://www.unicode.org/review/pr-121.html).
+ ///
+ /// N.B. Rust's standard library also appears to use the same strategy,
+ /// but it does not appear to be an API guarantee.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use std::borrow::Cow;
+ ///
+ /// use bstr::ByteSlice;
+ ///
+ /// let mut bstring = <Vec<u8>>::from("☃βツ");
+ /// assert_eq!(Cow::Borrowed("☃βツ"), bstring.to_str_lossy());
+ ///
+ /// // Add a byte that makes the sequence invalid.
+ /// bstring.push(b'\xFF');
+ /// assert_eq!(Cow::Borrowed("☃βツ\u{FFFD}"), bstring.to_str_lossy());
+ /// ```
+ ///
+ /// This demonstrates the "maximal subpart" substitution logic.
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// // \x61 is the ASCII codepoint for 'a'.
+ /// // \xF1\x80\x80 is a valid 3-byte code unit prefix.
+ /// // \xE1\x80 is a valid 2-byte code unit prefix.
+ /// // \xC2 is a valid 1-byte code unit prefix.
+ /// // \x62 is the ASCII codepoint for 'b'.
+ /// //
+ /// // In sum, each of the prefixes is replaced by a single replacement
+ /// // codepoint since none of the prefixes are properly completed. This
+ /// // is in contrast to other strategies that might insert a replacement
+ /// // codepoint for every single byte.
+ /// let bs = B(b"\x61\xF1\x80\x80\xE1\x80\xC2\x62");
+ /// assert_eq!("a\u{FFFD}\u{FFFD}\u{FFFD}b", bs.to_str_lossy());
+ /// ```
+ #[cfg(feature = "alloc")]
+ #[inline]
+ fn to_str_lossy(&self) -> Cow<'_, str> {
+     let bytes = self.as_bytes();
+     let err = match utf8::validate(bytes) {
+         Ok(()) => {
+             // SAFETY: This is safe because of the guarantees provided by
+             // utf8::validate.
+             return Cow::Borrowed(unsafe {
+                 str::from_utf8_unchecked(bytes)
+             });
+         }
+         Err(err) => err,
+     };
+
+     // Invalid input: build an owned copy, starting with the valid prefix
+     // followed by one replacement codepoint for the first bad sequence.
+     let mut lossy = String::with_capacity(bytes.len());
+     let (valid, after) = bytes.split_at(err.valid_up_to());
+     // SAFETY: This is safe because utf8::validate guarantees
+     // that all of `valid` is valid UTF-8.
+     lossy.push_str(unsafe { str::from_utf8_unchecked(valid) });
+     lossy.push_str("\u{FFFD}");
+     // When the error has a known length, continue lossily with whatever
+     // follows the invalid sequence.
+     if let Some(len) = err.error_len() {
+         after[len..].to_str_lossy_into(&mut lossy);
+     }
+     Cow::Owned(lossy)
+ }
+
+ /// Copy the contents of this byte string into the given owned string
+ /// buffer, while replacing invalid UTF-8 code unit sequences with the
+ /// Unicode replacement codepoint (`U+FFFD`).
+ ///
+ /// This method uses the same "substitution of maximal subparts" strategy
+ /// for inserting the replacement codepoint as the
+ /// [`to_str_lossy`](trait.ByteSlice.html#method.to_str_lossy) method.
+ ///
+ /// This routine is useful for amortizing allocation. However, unlike
+ /// `to_str_lossy`, this routine will _always_ copy the contents of this
+ /// byte string into the destination buffer, even if this byte string is
+ /// valid UTF-8.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use std::borrow::Cow;
+ ///
+ /// use bstr::ByteSlice;
+ ///
+ /// let mut bstring = <Vec<u8>>::from("☃βツ");
+ /// // Add a byte that makes the sequence invalid.
+ /// bstring.push(b'\xFF');
+ ///
+ /// let mut dest = String::new();
+ /// bstring.to_str_lossy_into(&mut dest);
+ /// assert_eq!("☃βツ\u{FFFD}", dest);
+ /// ```
+ #[cfg(feature = "alloc")]
+ #[inline]
+ fn to_str_lossy_into(&self, dest: &mut String) {
+     let mut rest = self.as_bytes();
+     dest.reserve(rest.len());
+     // Each iteration copies one valid run plus a replacement codepoint
+     // for the invalid sequence that ended it.
+     while let Err(err) = utf8::validate(rest) {
+         let (valid, after) = rest.split_at(err.valid_up_to());
+         // SAFETY: This is safe because utf8::validate guarantees
+         // that all of `valid` is valid UTF-8.
+         dest.push_str(unsafe { str::from_utf8_unchecked(valid) });
+         dest.push_str("\u{FFFD}");
+         match err.error_len() {
+             // No error length is reported, so there is nothing further
+             // to copy.
+             None => return,
+             Some(len) => rest = &after[len..],
+         }
+     }
+     // SAFETY: This is safe because utf8::validate guarantees
+     // that all of `rest` is valid UTF-8.
+     dest.push_str(unsafe { str::from_utf8_unchecked(rest) });
+ }
+
+ /// Create an OS string slice from this byte string.
+ ///
+ /// When OS strings can be constructed from arbitrary byte sequences, this
+ /// always succeeds and is zero cost. Otherwise, this returns a UTF-8
+ /// decoding error if this byte string is not valid UTF-8. (For example,
+ /// assuming the representation of `OsStr` is opaque on Windows, file paths
+ /// are allowed to be a sequence of arbitrary 16-bit integers. There is
+ /// no obvious mapping from an arbitrary sequence of 8-bit integers to an
+ /// arbitrary sequence of 16-bit integers. If the representation of `OsStr`
+ /// is ever opened up, then this will convert any sequence of bytes to an
+ /// `OsStr` without cost.)
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// let os_str = b"foo".to_os_str().expect("should be valid UTF-8");
+ /// assert_eq!(os_str, "foo");
+ /// ```
+ #[cfg(feature = "std")]
+ #[inline]
+ fn to_os_str(&self) -> Result<&OsStr, Utf8Error> {
+     // On Unix any byte sequence can be viewed as an `OsStr`, so this
+     // variant never fails.
+     #[cfg(unix)]
+     #[inline]
+     fn view(raw: &[u8]) -> Result<&OsStr, Utf8Error> {
+         use std::os::unix::ffi::OsStrExt;
+
+         let os_str = OsStr::from_bytes(raw);
+         Ok(os_str)
+     }
+
+     // Elsewhere the bytes must first pass UTF-8 validation.
+     #[cfg(not(unix))]
+     #[inline]
+     fn view(raw: &[u8]) -> Result<&OsStr, Utf8Error> {
+         let text = raw.to_str()?;
+         Ok(OsStr::new(text))
+     }
+
+     view(self.as_bytes())
+ }
+
+ /// Lossily create an OS string slice from this byte string.
+ ///
+ /// When OS strings can be constructed from arbitrary byte sequences, this
+ /// is zero cost and always returns a slice. Otherwise, this will perform a
+ /// UTF-8 check and lossily convert this byte string into valid UTF-8 using
+ /// the Unicode replacement codepoint.
+ ///
+ /// Note that this can prevent the correct roundtripping of file paths when
+ /// the representation of `OsStr` is opaque.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::ByteSlice;
+ ///
+ /// let os_str = b"foo\xFFbar".to_os_str_lossy();
+ /// assert_eq!(os_str.to_string_lossy(), "foo\u{FFFD}bar");
+ /// ```
+ #[cfg(feature = "std")]
+ #[inline]
+ fn to_os_str_lossy(&self) -> Cow<'_, OsStr> {
+     // On Unix the bytes are borrowed as-is; nothing is replaced.
+     #[cfg(unix)]
+     #[inline]
+     fn imp(bytes: &[u8]) -> Cow<'_, OsStr> {
+         use std::os::unix::ffi::OsStrExt;
+
+         Cow::Borrowed(OsStr::from_bytes(bytes))
+     }
+
+     // Elsewhere, go through the lossy UTF-8 conversion and keep its
+     // borrowed/owned distinction. The return type spells out the elided
+     // lifetime (`'_`) so it matches the Unix variant and the method itself.
+     #[cfg(not(unix))]
+     #[inline]
+     fn imp(bytes: &[u8]) -> Cow<'_, OsStr> {
+         use std::ffi::OsString;
+
+         match bytes.to_str_lossy() {
+             Cow::Borrowed(x) => Cow::Borrowed(OsStr::new(x)),
+             Cow::Owned(x) => Cow::Owned(OsString::from(x)),
+         }
+     }
+
+     imp(self.as_bytes())
+ }
+
+ /// Create a path slice from this byte string.
+ ///
+ /// When paths can be constructed from arbitrary byte sequences, this
+ /// always succeeds and is zero cost. Otherwise, this returns a UTF-8
+ /// decoding error if this byte string is not valid UTF-8. (For example,
+ /// assuming the representation of `Path` is opaque on Windows, file paths
+ /// are allowed to be a sequence of arbitrary 16-bit integers. There is
+ /// no obvious mapping from an arbitrary sequence of 8-bit integers to an
+ /// arbitrary sequence of 16-bit integers. If the representation of `Path`
+ /// is ever opened up, then this will convert any sequence of bytes to a
+ /// `Path` without cost.)
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::ByteSlice;
+ ///
+ /// let path = b"foo".to_path().expect("should be valid UTF-8");
+ /// assert_eq!(path.as_os_str(), "foo");
+ /// ```
+ #[cfg(feature = "std")]
+ #[inline]
+ fn to_path(&self) -> Result<&Path, Utf8Error> {
+     // A path is a view over an OS string, so any failure comes from the
+     // OS string conversion.
+     let os_str = self.to_os_str()?;
+     Ok(Path::new(os_str))
+ }
+
+ /// Lossily create a path slice from this byte string.
+ ///
+ /// When paths can be constructed from arbitrary byte sequences, this is
+ /// zero cost and always returns a slice. Otherwise, this will perform a
+ /// UTF-8 check and lossily convert this byte string into valid UTF-8 using
+ /// the Unicode replacement codepoint.
+ ///
+ /// Note that this can prevent the correct roundtripping of file paths when
+ /// the representation of `Path` is opaque.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::ByteSlice;
+ ///
+ /// let bs = b"foo\xFFbar";
+ /// let path = bs.to_path_lossy();
+ /// assert_eq!(path.to_string_lossy(), "foo\u{FFFD}bar");
+ /// ```
+ #[cfg(feature = "std")]
+ #[inline]
+ fn to_path_lossy(&self) -> Cow<'_, Path> {
+     use std::path::PathBuf;
+
+     // Re-wrap the lossy OS string as a path, preserving whether the data
+     // is borrowed or owned.
+     let os_str = self.to_os_str_lossy();
+     match os_str {
+         Cow::Owned(owned) => Cow::Owned(PathBuf::from(owned)),
+         Cow::Borrowed(borrowed) => Cow::Borrowed(Path::new(borrowed)),
+     }
+ }
+
+ /// Create a new byte string by repeating this byte string `n` times.
+ ///
+ /// # Panics
+ ///
+ /// This function panics if the capacity of the new byte string would
+ /// overflow.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// assert_eq!(b"foo".repeatn(4), B("foofoofoofoo"));
+ /// assert_eq!(b"foo".repeatn(0), B(""));
+ /// ```
+ #[cfg(feature = "alloc")]
+ #[inline]
+ fn repeatn(&self, n: usize) -> Vec<u8> {
+     // Delegate to the slice `repeat` routine on the raw bytes.
+     let unit = self.as_bytes();
+     unit.repeat(n)
+ }
+
+ /// Returns true if and only if this byte string contains the given needle.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::ByteSlice;
+ ///
+ /// assert!(b"foo bar".contains_str("foo"));
+ /// assert!(b"foo bar".contains_str("bar"));
+ /// assert!(!b"foo".contains_str("foobar"));
+ /// ```
+ #[inline]
+ fn contains_str<B: AsRef<[u8]>>(&self, needle: B) -> bool {
+     // The needle is contained exactly when a forward search finds it.
+     let first_match = self.find(needle);
+     first_match.is_some()
+ }
+
+ /// Returns true if and only if this byte string has the given prefix.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::ByteSlice;
+ ///
+ /// assert!(b"foo bar".starts_with_str("foo"));
+ /// assert!(!b"foo bar".starts_with_str("bar"));
+ /// assert!(!b"foo".starts_with_str("foobar"));
+ /// ```
+ #[inline]
+ fn starts_with_str<B: AsRef<[u8]>>(&self, prefix: B) -> bool {
+     // Compare raw bytes using the slice `starts_with` method.
+     let haystack = self.as_bytes();
+     let wanted = prefix.as_ref();
+     haystack.starts_with(wanted)
+ }
+
+ /// Returns true if and only if this byte string has the given suffix.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::ByteSlice;
+ ///
+ /// assert!(b"foo bar".ends_with_str("bar"));
+ /// assert!(!b"foo bar".ends_with_str("foo"));
+ /// assert!(!b"bar".ends_with_str("foobar"));
+ /// ```
+ #[inline]
+ fn ends_with_str<B: AsRef<[u8]>>(&self, suffix: B) -> bool {
+     // Compare raw bytes using the slice `ends_with` method.
+     let haystack = self.as_bytes();
+     let wanted = suffix.as_ref();
+     haystack.ends_with(wanted)
+ }
+
+ /// Returns the index of the first occurrence of the given needle.
+ ///
+ /// The needle may be any type that can be cheaply converted into a
+ /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`.
+ ///
+ /// Note that if you are searching for the same needle in many
+ /// different small haystacks, it may be faster to initialize a
+ /// [`Finder`](struct.Finder.html) once, and reuse it for each search.
+ ///
+ /// # Complexity
+ ///
+ /// This routine is guaranteed to have worst case linear time complexity
+ /// with respect to both the needle and the haystack. That is, this runs
+ /// in `O(needle.len() + haystack.len())` time.
+ ///
+ /// This routine is also guaranteed to have worst case constant space
+ /// complexity.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::ByteSlice;
+ ///
+ /// let s = b"foo bar baz";
+ /// assert_eq!(Some(0), s.find("foo"));
+ /// assert_eq!(Some(4), s.find("bar"));
+ /// assert_eq!(None, s.find("quux"));
+ /// ```
+ #[inline]
+ fn find<B: AsRef<[u8]>>(&self, needle: B) -> Option<usize> {
+     // Build a one-shot forward searcher for this needle and run it over
+     // the underlying bytes.
+     let searcher = Finder::new(needle.as_ref());
+     searcher.find(self.as_bytes())
+ }
+
+ /// Returns the index of the last occurrence of the given needle.
+ ///
+ /// The needle may be any type that can be cheaply converted into a
+ /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`.
+ ///
+ /// Note that if you are searching for the same needle in many
+ /// different small haystacks, it may be faster to initialize a
+ /// [`FinderReverse`](struct.FinderReverse.html) once, and reuse it for
+ /// each search.
+ ///
+ /// # Complexity
+ ///
+ /// This routine is guaranteed to have worst case linear time complexity
+ /// with respect to both the needle and the haystack. That is, this runs
+ /// in `O(needle.len() + haystack.len())` time.
+ ///
+ /// This routine is also guaranteed to have worst case constant space
+ /// complexity.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::ByteSlice;
+ ///
+ /// let s = b"foo bar baz";
+ /// assert_eq!(Some(0), s.rfind("foo"));
+ /// assert_eq!(Some(4), s.rfind("bar"));
+ /// assert_eq!(Some(8), s.rfind("ba"));
+ /// assert_eq!(None, s.rfind("quux"));
+ /// ```
+ #[inline]
+ fn rfind<B: AsRef<[u8]>>(&self, needle: B) -> Option<usize> {
+     // Build a one-shot reverse searcher for this needle and run it over
+     // the underlying bytes.
+     let searcher = FinderReverse::new(needle.as_ref());
+     searcher.rfind(self.as_bytes())
+ }
+
+ /// Returns an iterator of the non-overlapping occurrences of the given
+ /// needle. The iterator yields byte offset positions indicating the start
+ /// of each match.
+ ///
+ /// # Complexity
+ ///
+ /// This routine is guaranteed to have worst case linear time complexity
+ /// with respect to both the needle and the haystack. That is, this runs
+ /// in `O(needle.len() + haystack.len())` time.
+ ///
+ /// This routine is also guaranteed to have worst case constant space
+ /// complexity.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::ByteSlice;
+ ///
+ /// let s = b"foo bar foo foo quux foo";
+ /// let matches: Vec<usize> = s.find_iter("foo").collect();
+ /// assert_eq!(matches, vec![0, 8, 12, 21]);
+ /// ```
+ ///
+ /// An empty string matches at every position, including the position
+ /// immediately following the last byte:
+ ///
+ /// ```
+ /// use bstr::ByteSlice;
+ ///
+ /// let matches: Vec<usize> = b"foo".find_iter("").collect();
+ /// assert_eq!(matches, vec![0, 1, 2, 3]);
+ ///
+ /// let matches: Vec<usize> = b"".find_iter("").collect();
+ /// assert_eq!(matches, vec![0]);
+ /// ```
+ #[inline]
+ fn find_iter<'h, 'n, B: ?Sized + AsRef<[u8]>>(
+     &'h self,
+     needle: &'n B,
+ ) -> Find<'h, 'n> {
+     // The iterator borrows both the haystack and the needle.
+     let haystack = self.as_bytes();
+     let pattern = needle.as_ref();
+     Find::new(haystack, pattern)
+ }
+
+ /// Returns an iterator of the non-overlapping occurrences of the given
+ /// needle in reverse. The iterator yields byte offset positions indicating
+ /// the start of each match.
+ ///
+ /// # Complexity
+ ///
+ /// This routine is guaranteed to have worst case linear time complexity
+ /// with respect to both the needle and the haystack. That is, this runs
+ /// in `O(needle.len() + haystack.len())` time.
+ ///
+ /// This routine is also guaranteed to have worst case constant space
+ /// complexity.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::ByteSlice;
+ ///
+ /// let s = b"foo bar foo foo quux foo";
+ /// let matches: Vec<usize> = s.rfind_iter("foo").collect();
+ /// assert_eq!(matches, vec![21, 12, 8, 0]);
+ /// ```
+ ///
+ /// An empty string matches at every position, including the position
+ /// immediately following the last byte:
+ ///
+ /// ```
+ /// use bstr::ByteSlice;
+ ///
+ /// let matches: Vec<usize> = b"foo".rfind_iter("").collect();
+ /// assert_eq!(matches, vec![3, 2, 1, 0]);
+ ///
+ /// let matches: Vec<usize> = b"".rfind_iter("").collect();
+ /// assert_eq!(matches, vec![0]);
+ /// ```
+ #[inline]
+ fn rfind_iter<'h, 'n, B: ?Sized + AsRef<[u8]>>(
+     &'h self,
+     needle: &'n B,
+ ) -> FindReverse<'h, 'n> {
+     // The iterator borrows both the haystack and the needle.
+     let haystack = self.as_bytes();
+     let pattern = needle.as_ref();
+     FindReverse::new(haystack, pattern)
+ }
+
+ /// Returns the index of the first occurrence of the given byte. If the
+ /// byte does not occur in this byte string, then `None` is returned.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::ByteSlice;
+ ///
+ /// assert_eq!(Some(10), b"foo bar baz".find_byte(b'z'));
+ /// assert_eq!(None, b"foo bar baz".find_byte(b'y'));
+ /// ```
+ #[inline]
+ fn find_byte(&self, byte: u8) -> Option<usize> {
+     // Single-byte search is handed to `memchr`.
+     let haystack = self.as_bytes();
+     memchr(byte, haystack)
+ }
+
+ /// Returns the index of the last occurrence of the given byte. If the
+ /// byte does not occur in this byte string, then `None` is returned.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::ByteSlice;
+ ///
+ /// assert_eq!(Some(10), b"foo bar baz".rfind_byte(b'z'));
+ /// assert_eq!(None, b"foo bar baz".rfind_byte(b'y'));
+ /// ```
+ #[inline]
+ fn rfind_byte(&self, byte: u8) -> Option<usize> {
+     // Single-byte reverse search is handed to `memrchr`.
+     let haystack = self.as_bytes();
+     memrchr(byte, haystack)
+ }
+
+ /// Returns the index of the first occurrence of the given codepoint.
+ /// If the codepoint does not occur in this byte string, then `None` is
+ /// returned.
+ ///
+ /// Note that if one searches for the replacement codepoint, `\u{FFFD}`,
+ /// then only explicit occurrences of that encoding will be found. Invalid
+ /// UTF-8 sequences will not be matched.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// assert_eq!(Some(10), b"foo bar baz".find_char('z'));
+ /// assert_eq!(Some(4), B("αβγγδ").find_char('γ'));
+ /// assert_eq!(None, b"foo bar baz".find_char('y'));
+ /// ```
+ #[inline]
+ fn find_char(&self, ch: char) -> Option<usize> {
+     // Encode the codepoint as UTF-8 into a scratch buffer (four bytes is
+     // the maximum length of an encoded `char`), then run an ordinary
+     // substring search for that encoding.
+     let mut scratch = [0; 4];
+     let encoded = ch.encode_utf8(&mut scratch);
+     self.find(encoded)
+ }
+
+ /// Returns the index of the last occurrence of the given codepoint.
+ /// If the codepoint does not occur in this byte string, then `None` is
+ /// returned.
+ ///
+ /// Note that if one searches for the replacement codepoint, `\u{FFFD}`,
+ /// then only explicit occurrences of that encoding will be found. Invalid
+ /// UTF-8 sequences will not be matched.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// assert_eq!(Some(10), b"foo bar baz".rfind_char('z'));
+ /// assert_eq!(Some(6), B("αβγγδ").rfind_char('γ'));
+ /// assert_eq!(None, b"foo bar baz".rfind_char('y'));
+ /// ```
+ #[inline]
+ fn rfind_char(&self, ch: char) -> Option<usize> {
+     // Encode the codepoint as UTF-8 into a scratch buffer (four bytes is
+     // the maximum length of an encoded `char`), then run an ordinary
+     // reverse substring search for that encoding.
+     let mut scratch = [0; 4];
+     let encoded = ch.encode_utf8(&mut scratch);
+     self.rfind(encoded)
+ }
+
+ /// Returns the index of the first occurrence of any of the bytes in the
+ /// provided set.
+ ///
+ /// The `byteset` may be any type that can be cheaply converted into a
+ /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`, but
+ /// note that passing a `&str` which contains multibyte characters may not
+ /// behave as you expect: each byte in the `&str` is treated as an
+ /// individual member of the byte set.
+ ///
+ /// Note that order is irrelevant for the `byteset` parameter, and
+ /// duplicate bytes present in its body are ignored.
+ ///
+ /// # Complexity
+ ///
+ /// This routine is guaranteed to have worst case linear time complexity
+ /// with respect to both the set of bytes and the haystack. That is, this
+ /// runs in `O(byteset.len() + haystack.len())` time.
+ ///
+ /// This routine is also guaranteed to have worst case constant space
+ /// complexity.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::ByteSlice;
+ ///
+ /// assert_eq!(b"foo bar baz".find_byteset(b"zr"), Some(6));
+ /// assert_eq!(b"foo baz bar".find_byteset(b"bzr"), Some(4));
+ /// assert_eq!(None, b"foo baz bar".find_byteset(b"\t\n"));
+ /// // The empty byteset never matches.
+ /// assert_eq!(None, b"abc".find_byteset(b""));
+ /// assert_eq!(None, b"".find_byteset(b""));
+ /// ```
+ #[inline]
+ fn find_byteset<B: AsRef<[u8]>>(&self, byteset: B) -> Option<usize> {
+     // Forward search for any member of the set; the work is done by the
+     // `byteset` module.
+     let haystack = self.as_bytes();
+     let members = byteset.as_ref();
+     byteset::find(haystack, members)
+ }
+
+ /// Returns the index of the first occurrence of a byte that is not a
+ /// member of the provided set.
+ ///
+ /// The `byteset` may be any type that can be cheaply converted into a
+ /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`, but
+ /// note that passing a `&str` which contains multibyte characters may not
+ /// behave as you expect: each byte in the `&str` is treated as an
+ /// individual member of the byte set.
+ ///
+ /// Note that order is irrelevant for the `byteset` parameter, and
+ /// duplicate bytes present in its body are ignored.
+ ///
+ /// # Complexity
+ ///
+ /// This routine is guaranteed to have worst case linear time complexity
+ /// with respect to both the set of bytes and the haystack. That is, this
+ /// runs in `O(byteset.len() + haystack.len())` time.
+ ///
+ /// This routine is also guaranteed to have worst case constant space
+ /// complexity.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::ByteSlice;
+ ///
+ /// assert_eq!(b"foo bar baz".find_not_byteset(b"fo "), Some(4));
+ /// assert_eq!(b"\t\tbaz bar".find_not_byteset(b" \t\r\n"), Some(2));
+ /// assert_eq!(b"foo\nbaz\tbar".find_not_byteset(b"\t\n"), Some(0));
+ /// // The negation of the empty byteset matches everything.
+ /// assert_eq!(Some(0), b"abc".find_not_byteset(b""));
+ /// // But an empty string never contains anything.
+ /// assert_eq!(None, b"".find_not_byteset(b""));
+ /// ```
+ #[inline]
+ fn find_not_byteset<B: AsRef<[u8]>>(&self, byteset: B) -> Option<usize> {
+     // Forward search for a byte outside the set; the work is done by the
+     // `byteset` module.
+     let haystack = self.as_bytes();
+     let members = byteset.as_ref();
+     byteset::find_not(haystack, members)
+ }
+
+ /// Returns the index of the last occurrence of any of the bytes in the
+ /// provided set.
+ ///
+ /// The `byteset` may be any type that can be cheaply converted into a
+ /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`, but
+ /// note that passing a `&str` which contains multibyte characters may not
+ /// behave as you expect: each byte in the `&str` is treated as an
+ /// individual member of the byte set.
+ ///
+ /// Note that order is irrelevant for the `byteset` parameter, and duplicate
+ /// bytes present in its body are ignored.
+ ///
+ /// # Complexity
+ ///
+ /// This routine is guaranteed to have worst case linear time complexity
+ /// with respect to both the set of bytes and the haystack. That is, this
+ /// runs in `O(byteset.len() + haystack.len())` time.
+ ///
+ /// This routine is also guaranteed to have worst case constant space
+ /// complexity.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::ByteSlice;
+ ///
+ /// assert_eq!(b"foo bar baz".rfind_byteset(b"agb"), Some(9));
+ /// assert_eq!(b"foo baz bar".rfind_byteset(b"rabz "), Some(10));
+ /// assert_eq!(b"foo baz bar".rfind_byteset(b"\n123"), None);
+ /// ```
+ #[inline]
+ fn rfind_byteset<B: AsRef<[u8]>>(&self, byteset: B) -> Option<usize> {
+     // Reverse search for any member of the set; the work is done by the
+     // `byteset` module.
+     let haystack = self.as_bytes();
+     let members = byteset.as_ref();
+     byteset::rfind(haystack, members)
+ }
+
+ /// Returns the index of the last occurrence of a byte that is not a member
+ /// of the provided set.
+ ///
+ /// The `byteset` may be any type that can be cheaply converted into a
+ /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`, but
+ /// note that passing a `&str` which contains multibyte characters may not
+ /// behave as you expect: each byte in the `&str` is treated as an
+ /// individual member of the byte set.
+ ///
+ /// Note that order is irrelevant for the `byteset` parameter, and
+ /// duplicate bytes present in its body are ignored.
+ ///
+ /// # Complexity
+ ///
+ /// This routine is guaranteed to have worst case linear time complexity
+ /// with respect to both the set of bytes and the haystack. That is, this
+ /// runs in `O(byteset.len() + haystack.len())` time.
+ ///
+ /// This routine is also guaranteed to have worst case constant space
+ /// complexity.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::ByteSlice;
+ ///
+ /// assert_eq!(b"foo bar baz,\t".rfind_not_byteset(b",\t"), Some(10));
+ /// assert_eq!(b"foo baz bar".rfind_not_byteset(b"rabz "), Some(2));
+ /// assert_eq!(None, b"foo baz bar".rfind_not_byteset(b"barfoz "));
+ /// ```
+ #[inline]
+ fn rfind_not_byteset<B: AsRef<[u8]>>(&self, byteset: B) -> Option<usize> {
+     // Reverse search for a byte outside the set; the work is done by the
+     // `byteset` module.
+     let haystack = self.as_bytes();
+     let members = byteset.as_ref();
+     byteset::rfind_not(haystack, members)
+ }
+
+ /// Returns an iterator over the fields in a byte string, separated
+ /// by contiguous whitespace (according to the Unicode property
+ /// `White_Space`).
+ ///
+ /// # Example
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// let s = B(" foo\tbar\t\u{2003}\nquux \n");
+ /// let fields: Vec<&[u8]> = s.fields().collect();
+ /// assert_eq!(fields, vec![B("foo"), B("bar"), B("quux")]);
+ /// ```
+ ///
+ /// A byte string consisting of just whitespace yields no elements:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// assert_eq!(0, B(" \n\t\u{2003}\n \t").fields().count());
+ /// ```
+ #[cfg(feature = "unicode")]
+ #[inline]
+ fn fields(&self) -> Fields<'_> {
+     // The iterator borrows the underlying bytes.
+     let bytes = self.as_bytes();
+     Fields::new(bytes)
+ }
+
+ /// Returns an iterator over the fields in a byte string, separated by
+ /// contiguous codepoints satisfying the given predicate.
+ ///
+ /// If this byte string is not valid UTF-8, then the given closure will
+ /// be called with a Unicode replacement codepoint when invalid UTF-8
+ /// bytes are seen.
+ ///
+ /// # Example
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// let s = b"123foo999999bar1quux123456";
+ /// let fields: Vec<&[u8]> = s.fields_with(|c| c.is_numeric()).collect();
+ /// assert_eq!(fields, vec![B("foo"), B("bar"), B("quux")]);
+ /// ```
+ ///
+ /// A byte string consisting of all codepoints satisfying the predicate
+ /// yields no elements:
+ ///
+ /// ```
+ /// use bstr::ByteSlice;
+ ///
+ /// assert_eq!(0, b"1911354563".fields_with(|c| c.is_numeric()).count());
+ /// ```
+ #[inline]
+ fn fields_with<F: FnMut(char) -> bool>(&self, f: F) -> FieldsWith<'_, F> {
+     // The iterator borrows the underlying bytes and takes ownership of
+     // the predicate.
+     let bytes = self.as_bytes();
+     FieldsWith::new(bytes, f)
+ }
+
+ /// Returns an iterator over substrings of this byte string, separated
+ /// by the given byte string. Each element yielded is guaranteed not to
+ /// include the splitter substring.
+ ///
+ /// The splitter may be any type that can be cheaply converted into a
+ /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// let x: Vec<&[u8]> = b"Mary had a little lamb".split_str(" ").collect();
+ /// assert_eq!(x, vec![
+ /// B("Mary"), B("had"), B("a"), B("little"), B("lamb"),
+ /// ]);
+ ///
+ /// let x: Vec<&[u8]> = b"".split_str("X").collect();
+ /// assert_eq!(x, vec![b""]);
+ ///
+ /// let x: Vec<&[u8]> = b"lionXXtigerXleopard".split_str("X").collect();
+ /// assert_eq!(x, vec![B("lion"), B(""), B("tiger"), B("leopard")]);
+ ///
+ /// let x: Vec<&[u8]> = b"lion::tiger::leopard".split_str("::").collect();
+ /// assert_eq!(x, vec![B("lion"), B("tiger"), B("leopard")]);
+ /// ```
+ ///
+ /// If a string contains multiple contiguous separators, you will end up
+ /// with empty strings yielded by the iterator:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// let x: Vec<&[u8]> = b"||||a||b|c".split_str("|").collect();
+ /// assert_eq!(x, vec![
+ /// B(""), B(""), B(""), B(""), B("a"), B(""), B("b"), B("c"),
+ /// ]);
+ ///
+ /// let x: Vec<&[u8]> = b"(///)".split_str("/").collect();
+ /// assert_eq!(x, vec![B("("), B(""), B(""), B(")")]);
+ /// ```
+ ///
+ /// Separators at the start or end of a string are neighbored by empty
+ /// strings.
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// let x: Vec<&[u8]> = b"010".split_str("0").collect();
+ /// assert_eq!(x, vec![B(""), B("1"), B("")]);
+ /// ```
+ ///
+ /// When the empty string is used as a separator, it splits every **byte**
+ /// in the byte string, along with the beginning and end of the byte
+ /// string.
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// let x: Vec<&[u8]> = b"rust".split_str("").collect();
+ /// assert_eq!(x, vec![
+ /// B(""), B("r"), B("u"), B("s"), B("t"), B(""),
+ /// ]);
+ ///
+ /// // Splitting by an empty string is not UTF-8 aware. Elements yielded
+ /// // may not be valid UTF-8!
+ /// let x: Vec<&[u8]> = B("☃").split_str("").collect();
+ /// assert_eq!(x, vec![
+ /// B(""), B(b"\xE2"), B(b"\x98"), B(b"\x83"), B(""),
+ /// ]);
+ /// ```
+ ///
+ /// Contiguous separators, especially whitespace, can lead to possibly
+ /// surprising behavior. For example, this code is correct:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// let x: Vec<&[u8]> = b" a b c".split_str(" ").collect();
+ /// assert_eq!(x, vec![
+ /// B(""), B(""), B(""), B(""), B("a"), B(""), B("b"), B("c"),
+ /// ]);
+ /// ```
+ ///
+ /// It does *not* give you `["a", "b", "c"]`. For that behavior, use
+ /// [`fields`](#method.fields) instead.
+ #[inline]
+ fn split_str<'h, 's, B: ?Sized + AsRef<[u8]>>(
+     &'h self,
+     splitter: &'s B,
+ ) -> Split<'h, 's> {
+     // The iterator borrows both the haystack and the separator.
+     let haystack = self.as_bytes();
+     let separator = splitter.as_ref();
+     Split::new(haystack, separator)
+ }
+
+ /// Returns an iterator over substrings of this byte string, separated by
+ /// the given byte string, in reverse. Each element yielded is guaranteed
+ /// not to include the splitter substring.
+ ///
+ /// The splitter may be any type that can be cheaply converted into a
+ /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// let x: Vec<&[u8]> =
+ /// b"Mary had a little lamb".rsplit_str(" ").collect();
+ /// assert_eq!(x, vec![
+ /// B("lamb"), B("little"), B("a"), B("had"), B("Mary"),
+ /// ]);
+ ///
+ /// let x: Vec<&[u8]> = b"".rsplit_str("X").collect();
+ /// assert_eq!(x, vec![b""]);
+ ///
+ /// let x: Vec<&[u8]> = b"lionXXtigerXleopard".rsplit_str("X").collect();
+ /// assert_eq!(x, vec![B("leopard"), B("tiger"), B(""), B("lion")]);
+ ///
+ /// let x: Vec<&[u8]> = b"lion::tiger::leopard".rsplit_str("::").collect();
+ /// assert_eq!(x, vec![B("leopard"), B("tiger"), B("lion")]);
+ /// ```
+ ///
+ /// If a string contains multiple contiguous separators, you will end up
+ /// with empty strings yielded by the iterator:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// let x: Vec<&[u8]> = b"||||a||b|c".rsplit_str("|").collect();
+ /// assert_eq!(x, vec![
+ /// B("c"), B("b"), B(""), B("a"), B(""), B(""), B(""), B(""),
+ /// ]);
+ ///
+ /// let x: Vec<&[u8]> = b"(///)".rsplit_str("/").collect();
+ /// assert_eq!(x, vec![B(")"), B(""), B(""), B("(")]);
+ /// ```
+ ///
+ /// Separators at the start or end of a string are neighbored by empty
+ /// strings.
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// let x: Vec<&[u8]> = b"010".rsplit_str("0").collect();
+ /// assert_eq!(x, vec![B(""), B("1"), B("")]);
+ /// ```
+ ///
+ /// When the empty string is used as a separator, it splits every **byte**
+ /// in the byte string, along with the beginning and end of the byte
+ /// string.
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// let x: Vec<&[u8]> = b"rust".rsplit_str("").collect();
+ /// assert_eq!(x, vec![
+ /// B(""), B("t"), B("s"), B("u"), B("r"), B(""),
+ /// ]);
+ ///
+ /// // Splitting by an empty string is not UTF-8 aware. Elements yielded
+ /// // may not be valid UTF-8!
+ /// let x: Vec<&[u8]> = B("☃").rsplit_str("").collect();
+ /// assert_eq!(x, vec![B(""), B(b"\x83"), B(b"\x98"), B(b"\xE2"), B("")]);
+ /// ```
+ ///
+ /// Contiguous separators, especially whitespace, can lead to possibly
+ /// surprising behavior. For example, this code is correct:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// let x: Vec<&[u8]> = b" a b c".rsplit_str(" ").collect();
+ /// assert_eq!(x, vec![
+ /// B("c"), B("b"), B(""), B("a"), B(""), B(""), B(""), B(""),
+ /// ]);
+ /// ```
+ ///
+ /// It does *not* give you `["a", "b", "c"]`.
+ #[inline]
+ fn rsplit_str<'h, 's, B: ?Sized + AsRef<[u8]>>(
+     &'h self,
+     splitter: &'s B,
+ ) -> SplitReverse<'h, 's> {
+     // The iterator borrows both the haystack and the separator.
+     let haystack = self.as_bytes();
+     let separator = splitter.as_ref();
+     SplitReverse::new(haystack, separator)
+ }
+
+ /// Split this byte string at the first occurrence of `splitter`.
+ ///
+ /// If the `splitter` is found in the byte string, returns a tuple
+ /// containing the parts of the string before and after the first occurrence
+ /// of `splitter` respectively. Otherwise, if there are no occurrences of
+ /// `splitter` in the byte string, returns `None`.
+ ///
+ /// The splitter may be any type that can be cheaply converted into a
+ /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`.
+ ///
+ /// If you need to split on the *last* instance of a delimiter instead, see
+ /// the [`ByteSlice::rsplit_once_str`](#method.rsplit_once_str) method.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// assert_eq!(
+ /// B("foo,bar").split_once_str(","),
+ /// Some((B("foo"), B("bar"))),
+ /// );
+ /// assert_eq!(
+ /// B("foo,bar,baz").split_once_str(","),
+ /// Some((B("foo"), B("bar,baz"))),
+ /// );
+ /// assert_eq!(B("foo").split_once_str(","), None);
+ /// assert_eq!(B("foo,").split_once_str(b","), Some((B("foo"), B(""))));
+ /// assert_eq!(B(",foo").split_once_str(b","), Some((B(""), B("foo"))));
+ /// ```
+ #[inline]
+ fn split_once_str<'a, B: ?Sized + AsRef<[u8]>>(
+     &'a self,
+     splitter: &B,
+ ) -> Option<(&'a [u8], &'a [u8])> {
+     let haystack = self.as_bytes();
+     let separator = splitter.as_ref();
+     match Finder::new(separator).find(haystack) {
+         // No occurrence: nothing to split on.
+         None => None,
+         Some(at) => {
+             // Cut at the match start, then drop the separator itself from
+             // the front of the second half.
+             let (before, rest) = haystack.split_at(at);
+             Some((before, &rest[separator.len()..]))
+         }
+     }
+ }
+
+ /// Split this byte string at the last occurrence of `splitter`.
+ ///
+ /// If the `splitter` is found in the byte string, returns a tuple
+ /// containing the parts of the string before and after the last occurrence
+ /// of `splitter`, respectively. Otherwise, if there are no occurrences of
+ /// `splitter` in the byte string, returns `None`.
+ ///
+ /// The splitter may be any type that can be cheaply converted into a
+ /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`.
+ ///
+ /// If you need to split on the *first* instance of a delimiter instead, see
+ /// the [`ByteSlice::split_once_str`](#method.split_once_str) method.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// assert_eq!(
+ /// B("foo,bar").rsplit_once_str(","),
+ /// Some((B("foo"), B("bar"))),
+ /// );
+ /// assert_eq!(
+ /// B("foo,bar,baz").rsplit_once_str(","),
+ /// Some((B("foo,bar"), B("baz"))),
+ /// );
+ /// assert_eq!(B("foo").rsplit_once_str(","), None);
+ /// assert_eq!(B("foo,").rsplit_once_str(b","), Some((B("foo"), B(""))));
+ /// assert_eq!(B(",foo").rsplit_once_str(b","), Some((B(""), B("foo"))));
+ /// ```
+ #[inline]
+ fn rsplit_once_str<'a, B: ?Sized + AsRef<[u8]>>(
+     &'a self,
+     splitter: &B,
+ ) -> Option<(&'a [u8], &'a [u8])> {
+     let haystack = self.as_bytes();
+     let separator = splitter.as_ref();
+     // Locate the last occurrence; without one there is nothing to split.
+     let at = match FinderReverse::new(separator).rfind(haystack) {
+         Some(at) => at,
+         None => return None,
+     };
+     // The second half begins right after the matched separator.
+     let after = at + separator.len();
+     Some((&haystack[..at], &haystack[after..]))
+ }
+
+ /// Returns an iterator of at most `limit` substrings of this byte string,
+ /// separated by the given byte string. If `limit` substrings are yielded,
+ /// then the last substring will contain the remainder of this byte string.
+ ///
+ /// The splitter may be any type that can be cheaply converted into a
+ /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// let x: Vec<_> = b"Mary had a little lamb".splitn_str(3, " ").collect();
+ /// assert_eq!(x, vec![B("Mary"), B("had"), B("a little lamb")]);
+ ///
+ /// let x: Vec<_> = b"".splitn_str(3, "X").collect();
+ /// assert_eq!(x, vec![b""]);
+ ///
+ /// let x: Vec<_> = b"lionXXtigerXleopard".splitn_str(3, "X").collect();
+ /// assert_eq!(x, vec![B("lion"), B(""), B("tigerXleopard")]);
+ ///
+ /// let x: Vec<_> = b"lion::tiger::leopard".splitn_str(2, "::").collect();
+ /// assert_eq!(x, vec![B("lion"), B("tiger::leopard")]);
+ ///
+ /// let x: Vec<_> = b"abcXdef".splitn_str(1, "X").collect();
+ /// assert_eq!(x, vec![B("abcXdef")]);
+ ///
+ /// let x: Vec<_> = b"abcdef".splitn_str(2, "X").collect();
+ /// assert_eq!(x, vec![B("abcdef")]);
+ ///
+ /// let x: Vec<_> = b"abcXdef".splitn_str(0, "X").collect();
+ /// assert!(x.is_empty());
+ /// ```
+ #[inline]
+ fn splitn_str<'h, 's, B: ?Sized + AsRef<[u8]>>(
+     &'h self,
+     limit: usize,
+     splitter: &'s B,
+ ) -> SplitN<'h, 's> {
+     // Note the argument order: the iterator constructor takes the limit
+     // last, whereas this method takes it first.
+     let haystack = self.as_bytes();
+     let separator = splitter.as_ref();
+     SplitN::new(haystack, separator, limit)
+ }
+
+ /// Returns an iterator of at most `limit` substrings of this byte string,
+ /// separated by the given byte string, in reverse. If `limit` substrings
+ /// are yielded, then the last substring will contain the remainder of this
+ /// byte string.
+ ///
+ /// The splitter may be any type that can be cheaply converted into a
+ /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// let x: Vec<_> =
+ /// b"Mary had a little lamb".rsplitn_str(3, " ").collect();
+ /// assert_eq!(x, vec![B("lamb"), B("little"), B("Mary had a")]);
+ ///
+ /// let x: Vec<_> = b"".rsplitn_str(3, "X").collect();
+ /// assert_eq!(x, vec![b""]);
+ ///
+ /// let x: Vec<_> = b"lionXXtigerXleopard".rsplitn_str(3, "X").collect();
+ /// assert_eq!(x, vec![B("leopard"), B("tiger"), B("lionX")]);
+ ///
+ /// let x: Vec<_> = b"lion::tiger::leopard".rsplitn_str(2, "::").collect();
+ /// assert_eq!(x, vec![B("leopard"), B("lion::tiger")]);
+ ///
+ /// let x: Vec<_> = b"abcXdef".rsplitn_str(1, "X").collect();
+ /// assert_eq!(x, vec![B("abcXdef")]);
+ ///
+ /// let x: Vec<_> = b"abcdef".rsplitn_str(2, "X").collect();
+ /// assert_eq!(x, vec![B("abcdef")]);
+ ///
+ /// let x: Vec<_> = b"abcXdef".rsplitn_str(0, "X").collect();
+ /// assert!(x.is_empty());
+ /// ```
+ #[inline]
+ fn rsplitn_str<'h, 's, B: ?Sized + AsRef<[u8]>>(
+     &'h self,
+     limit: usize,
+     splitter: &'s B,
+ ) -> SplitNReverse<'h, 's> {
+     // Note the argument order: the iterator constructor takes the limit
+     // last, whereas this method takes it first.
+     let haystack = self.as_bytes();
+     let separator = splitter.as_ref();
+     SplitNReverse::new(haystack, separator, limit)
+ }
+
+ /// Replace all matches of the given needle with the given replacement, and
+ /// return the result as a new `Vec<u8>`.
+ ///
+ /// This routine is useful as a convenience. If you need to reuse an
+ /// allocation, use [`replace_into`](#method.replace_into) instead.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::ByteSlice;
+ ///
+ /// let s = b"this is old".replace("old", "new");
+ /// assert_eq!(s, "this is new".as_bytes());
+ /// ```
+ ///
+ /// When the pattern doesn't match:
+ ///
+ /// ```
+ /// use bstr::ByteSlice;
+ ///
+ /// let s = b"this is old".replace("nada nada", "limonada");
+ /// assert_eq!(s, "this is old".as_bytes());
+ /// ```
+ ///
+ /// When the needle is an empty string:
+ ///
+ /// ```
+ /// use bstr::ByteSlice;
+ ///
+ /// let s = b"foo".replace("", "Z");
+ /// assert_eq!(s, "ZfZoZoZ".as_bytes());
+ /// ```
+ #[cfg(feature = "alloc")]
+ #[inline]
+ fn replace<N: AsRef<[u8]>, R: AsRef<[u8]>>(
+     &self,
+     needle: N,
+     replacement: R,
+ ) -> Vec<u8> {
+     // Start with room for the input length; the vector still grows as
+     // needed if the result ends up longer.
+     let capacity = self.as_bytes().len();
+     let mut out = Vec::with_capacity(capacity);
+     self.replace_into(needle, replacement, &mut out);
+     out
+ }
+
+ /// Replace up to `limit` matches of the given needle with the given
+ /// replacement, and return the result as a new `Vec<u8>`.
+ ///
+ /// This routine is useful as a convenience. If you need to reuse an
+ /// allocation, use [`replacen_into`](#method.replacen_into) instead.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::ByteSlice;
+ ///
+ /// let s = b"foofoo".replacen("o", "z", 2);
+ /// assert_eq!(s, "fzzfoo".as_bytes());
+ /// ```
+ ///
+ /// When the pattern doesn't match:
+ ///
+ /// ```
+ /// use bstr::ByteSlice;
+ ///
+ /// let s = b"foofoo".replacen("a", "z", 2);
+ /// assert_eq!(s, "foofoo".as_bytes());
+ /// ```
+ ///
+ /// When the needle is an empty string:
+ ///
+ /// ```
+ /// use bstr::ByteSlice;
+ ///
+ /// let s = b"foo".replacen("", "Z", 2);
+ /// assert_eq!(s, "ZfZoo".as_bytes());
+ /// ```
+ #[cfg(feature = "alloc")]
+ #[inline]
+ fn replacen<N: AsRef<[u8]>, R: AsRef<[u8]>>(
+     &self,
+     needle: N,
+     replacement: R,
+     limit: usize,
+ ) -> Vec<u8> {
+     // Start with room for the input length; the vector still grows as
+     // needed if the result ends up longer.
+     let capacity = self.as_bytes().len();
+     let mut out = Vec::with_capacity(capacity);
+     self.replacen_into(needle, replacement, limit, &mut out);
+     out
+ }
+
+ /// Replace all matches of the given needle with the given replacement,
+ /// and write the result into the provided `Vec<u8>`.
+ ///
+ /// This does **not** clear `dest` before writing to it.
+ ///
+ /// This routine is useful for reusing an allocation. For a more convenient
+ /// API, use [`replace`](#method.replace) instead.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::ByteSlice;
+ ///
+ /// let s = b"this is old";
+ ///
+ /// let mut dest = vec![];
+ /// s.replace_into("old", "new", &mut dest);
+ /// assert_eq!(dest, "this is new".as_bytes());
+ /// ```
+ ///
+ /// When the pattern doesn't match:
+ ///
+ /// ```
+ /// use bstr::ByteSlice;
+ ///
+ /// let s = b"this is old";
+ ///
+ /// let mut dest = vec![];
+ /// s.replace_into("nada nada", "limonada", &mut dest);
+ /// assert_eq!(dest, "this is old".as_bytes());
+ /// ```
+ ///
+ /// When the needle is an empty string:
+ ///
+ /// ```
+ /// use bstr::ByteSlice;
+ ///
+ /// let s = b"foo";
+ ///
+ /// let mut dest = vec![];
+ /// s.replace_into("", "Z", &mut dest);
+ /// assert_eq!(dest, "ZfZoZoZ".as_bytes());
+ /// ```
+ #[cfg(feature = "alloc")]
+ #[inline]
+ fn replace_into<N: AsRef<[u8]>, R: AsRef<[u8]>>(
+     &self,
+     needle: N,
+     replacement: R,
+     dest: &mut Vec<u8>,
+ ) {
+     let haystack = self.as_bytes();
+     let needle = needle.as_ref();
+     let replacement = replacement.as_ref();
+     // `copied_to` is the offset in `haystack` up to which input has
+     // already been accounted for in `dest`.
+     let mut copied_to = 0;
+     for at in self.find_iter(needle) {
+         // Emit the unmatched gap, then the replacement in place of the
+         // match, and skip past the matched needle.
+         dest.push_str(&haystack[copied_to..at]);
+         dest.push_str(replacement);
+         copied_to = at + needle.len();
+     }
+     // Whatever follows the final match is copied through unchanged.
+     dest.push_str(&haystack[copied_to..]);
+ }
+
+ /// Replace up to `limit` matches of the given needle with the given
+ /// replacement, and write the result into the provided `Vec<u8>`.
+ ///
+ /// This does **not** clear `dest` before writing to it.
+ ///
+ /// This routine is useful for reusing allocation. For a more convenient
+ /// API, use [`replacen`](#method.replacen) instead.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::ByteSlice;
+ ///
+ /// let s = b"foofoo";
+ ///
+ /// let mut dest = vec![];
+ /// s.replacen_into("o", "z", 2, &mut dest);
+ /// assert_eq!(dest, "fzzfoo".as_bytes());
+ /// ```
+ ///
+ /// When the pattern doesn't match:
+ ///
+ /// ```
+ /// use bstr::ByteSlice;
+ ///
+ /// let s = b"foofoo";
+ ///
+ /// let mut dest = vec![];
+ /// s.replacen_into("a", "z", 2, &mut dest);
+ /// assert_eq!(dest, "foofoo".as_bytes());
+ /// ```
+ ///
+ /// When the needle is an empty string:
+ ///
+ /// ```
+ /// use bstr::ByteSlice;
+ ///
+ /// let s = b"foo";
+ ///
+ /// let mut dest = vec![];
+ /// s.replacen_into("", "Z", 2, &mut dest);
+ /// assert_eq!(dest, "ZfZoo".as_bytes());
+ /// ```
+ #[cfg(feature = "alloc")]
+ #[inline]
+ fn replacen_into<N: AsRef<[u8]>, R: AsRef<[u8]>>(
+     &self,
+     needle: N,
+     replacement: R,
+     limit: usize,
+     dest: &mut Vec<u8>,
+ ) {
+     let needle = needle.as_ref();
+     let replacement = replacement.as_ref();
+     let haystack = self.as_bytes();
+
+     // `copied_upto` marks the end of the previous match, i.e. the first
+     // haystack byte that has not been accounted for in `dest` yet.
+     let mut copied_upto = 0;
+     // Only the first `limit` matches are substituted; later occurrences
+     // fall into the verbatim tail below.
+     let matches = self.find_iter(needle).take(limit);
+     for match_start in matches {
+         dest.push_str(&haystack[copied_upto..match_start]);
+         dest.push_str(replacement);
+         copied_upto = match_start + needle.len();
+     }
+     dest.push_str(&haystack[copied_upto..]);
+ }
+
+ /// Returns an iterator over the bytes in this byte string.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::ByteSlice;
+ ///
+ /// let bs = b"foobar";
+ /// let bytes: Vec<u8> = bs.bytes().collect();
+ /// assert_eq!(bytes, bs);
+ /// ```
+ #[inline]
+ fn bytes(&self) -> Bytes<'_> {
+     // `Bytes` is a thin wrapper around the slice's own byte iterator.
+     let it = self.as_bytes().iter();
+     Bytes { it }
+ }
+
+ /// Returns an iterator over the Unicode scalar values in this byte string.
+ /// If invalid UTF-8 is encountered, then the Unicode replacement codepoint
+ /// is yielded instead.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::ByteSlice;
+ ///
+ /// let bs = b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61";
+ /// let chars: Vec<char> = bs.chars().collect();
+ /// assert_eq!(vec!['☃', '\u{FFFD}', '𝞃', '\u{FFFD}', 'a'], chars);
+ /// ```
+ ///
+ /// Codepoints can also be iterated over in reverse:
+ ///
+ /// ```
+ /// use bstr::ByteSlice;
+ ///
+ /// let bs = b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61";
+ /// let chars: Vec<char> = bs.chars().rev().collect();
+ /// assert_eq!(vec!['a', '\u{FFFD}', '𝞃', '\u{FFFD}', '☃'], chars);
+ /// ```
+ #[inline]
+ fn chars(&self) -> Chars<'_> {
+     // The iterator borrows the raw bytes of this byte string.
+     let haystack = self.as_bytes();
+     Chars::new(haystack)
+ }
+
+ /// Returns an iterator over the Unicode scalar values in this byte string
+ /// along with their starting and ending byte index positions. If invalid
+ /// UTF-8 is encountered, then the Unicode replacement codepoint is yielded
+ /// instead.
+ ///
+ /// Note that this is slightly different from the `CharIndices` iterator
+ /// provided by the standard library. Aside from working on possibly
+ /// invalid UTF-8, this iterator provides both the corresponding starting
+ /// and ending byte indices of each codepoint yielded. The ending position
+ /// is necessary to slice the original byte string when invalid UTF-8 bytes
+ /// are converted into a Unicode replacement codepoint, since a single
+ /// replacement codepoint can substitute anywhere from 1 to 3 invalid bytes
+ /// (inclusive).
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::ByteSlice;
+ ///
+ /// let bs = b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61";
+ /// let chars: Vec<(usize, usize, char)> = bs.char_indices().collect();
+ /// assert_eq!(chars, vec![
+ /// (0, 3, '☃'),
+ /// (3, 4, '\u{FFFD}'),
+ /// (4, 8, '𝞃'),
+ /// (8, 10, '\u{FFFD}'),
+ /// (10, 11, 'a'),
+ /// ]);
+ /// ```
+ ///
+ /// Codepoints can also be iterated over in reverse:
+ ///
+ /// ```
+ /// use bstr::ByteSlice;
+ ///
+ /// let bs = b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61";
+ /// let chars: Vec<(usize, usize, char)> = bs
+ /// .char_indices()
+ /// .rev()
+ /// .collect();
+ /// assert_eq!(chars, vec![
+ /// (10, 11, 'a'),
+ /// (8, 10, '\u{FFFD}'),
+ /// (4, 8, '𝞃'),
+ /// (3, 4, '\u{FFFD}'),
+ /// (0, 3, '☃'),
+ /// ]);
+ /// ```
+ #[inline]
+ fn char_indices(&self) -> CharIndices<'_> {
+     // The iterator borrows the raw bytes of this byte string.
+     let haystack = self.as_bytes();
+     CharIndices::new(haystack)
+ }
+
+ /// Iterate over chunks of valid UTF-8.
+ ///
+ /// The iterator returned yields chunks of valid UTF-8 separated by invalid
+ /// UTF-8 bytes, if they exist. Invalid UTF-8 bytes are always 1-3 bytes,
+ /// which are determined via the "substitution of maximal subparts"
+ /// strategy described in the docs for the
+ /// [`ByteSlice::to_str_lossy`](trait.ByteSlice.html#method.to_str_lossy)
+ /// method.
+ ///
+ /// # Examples
+ ///
+ /// This example shows how to gather all valid and invalid chunks from a
+ /// byte slice:
+ ///
+ /// ```
+ /// use bstr::{ByteSlice, Utf8Chunk};
+ ///
+ /// let bytes = b"foo\xFD\xFEbar\xFF";
+ ///
+ /// let (mut valid_chunks, mut invalid_chunks) = (vec![], vec![]);
+ /// for chunk in bytes.utf8_chunks() {
+ /// if !chunk.valid().is_empty() {
+ /// valid_chunks.push(chunk.valid());
+ /// }
+ /// if !chunk.invalid().is_empty() {
+ /// invalid_chunks.push(chunk.invalid());
+ /// }
+ /// }
+ ///
+ /// assert_eq!(valid_chunks, vec!["foo", "bar"]);
+ /// assert_eq!(invalid_chunks, vec![b"\xFD", b"\xFE", b"\xFF"]);
+ /// ```
+ #[inline]
+ fn utf8_chunks(&self) -> Utf8Chunks<'_> {
+     // The chunk iterator starts out holding the entire byte string.
+     let bytes = self.as_bytes();
+     Utf8Chunks { bytes }
+ }
+
+ /// Returns an iterator over the grapheme clusters in this byte string.
+ /// If invalid UTF-8 is encountered, then the Unicode replacement codepoint
+ /// is yielded instead.
+ ///
+ /// # Examples
+ ///
+ /// This example shows how multiple codepoints can combine to form a
+ /// single grapheme cluster:
+ ///
+ /// ```
+ /// use bstr::ByteSlice;
+ ///
+ /// let bs = "a\u{0300}\u{0316}\u{1F1FA}\u{1F1F8}".as_bytes();
+ /// let graphemes: Vec<&str> = bs.graphemes().collect();
+ /// assert_eq!(vec!["à̖", "🇺🇸"], graphemes);
+ /// ```
+ ///
+ /// This shows that graphemes can be iterated over in reverse:
+ ///
+ /// ```
+ /// use bstr::ByteSlice;
+ ///
+ /// let bs = "a\u{0300}\u{0316}\u{1F1FA}\u{1F1F8}".as_bytes();
+ /// let graphemes: Vec<&str> = bs.graphemes().rev().collect();
+ /// assert_eq!(vec!["🇺🇸", "à̖"], graphemes);
+ /// ```
+ #[cfg(feature = "unicode")]
+ #[inline]
+ fn graphemes(&self) -> Graphemes<'_> {
+     // The iterator borrows the raw bytes of this byte string.
+     let haystack = self.as_bytes();
+     Graphemes::new(haystack)
+ }
+
+ /// Returns an iterator over the grapheme clusters in this byte string
+ /// along with their starting and ending byte index positions. If invalid
+ /// UTF-8 is encountered, then the Unicode replacement codepoint is yielded
+ /// instead.
+ ///
+ /// # Examples
+ ///
+ /// This example shows how to get the byte offsets of each individual
+ /// grapheme cluster:
+ ///
+ /// ```
+ /// use bstr::ByteSlice;
+ ///
+ /// let bs = "a\u{0300}\u{0316}\u{1F1FA}\u{1F1F8}".as_bytes();
+ /// let graphemes: Vec<(usize, usize, &str)> =
+ /// bs.grapheme_indices().collect();
+ /// assert_eq!(vec![(0, 5, "à̖"), (5, 13, "🇺🇸")], graphemes);
+ /// ```
+ ///
+ /// This example shows what happens when invalid UTF-8 is encountered. Note
+ /// that the offsets are valid indices into the original string, and do
+ /// not necessarily correspond to the length of the `&str` returned!
+ ///
+ /// ```
+ /// # #[cfg(all(feature = "alloc"))] {
+ /// use bstr::{ByteSlice, ByteVec};
+ ///
+ /// let mut bytes = vec![];
+ /// bytes.push_str("a\u{0300}\u{0316}");
+ /// bytes.push(b'\xFF');
+ /// bytes.push_str("\u{1F1FA}\u{1F1F8}");
+ ///
+ /// let graphemes: Vec<(usize, usize, &str)> =
+ /// bytes.grapheme_indices().collect();
+ /// assert_eq!(
+ /// graphemes,
+ /// vec![(0, 5, "à̖"), (5, 6, "\u{FFFD}"), (6, 14, "🇺🇸")]
+ /// );
+ /// # }
+ /// ```
+ #[cfg(feature = "unicode")]
+ #[inline]
+ fn grapheme_indices(&self) -> GraphemeIndices<'_> {
+     // The iterator borrows the raw bytes of this byte string.
+     let haystack = self.as_bytes();
+     GraphemeIndices::new(haystack)
+ }
+
+ /// Returns an iterator over the words in this byte string. If invalid
+ /// UTF-8 is encountered, then the Unicode replacement codepoint is yielded
+ /// instead.
+ ///
+ /// This is similar to
+ /// [`words_with_breaks`](trait.ByteSlice.html#method.words_with_breaks),
+ /// except it only returns elements that contain a "word" character. A word
+ /// character is defined by UTS #18 (Annex C) to be the combination of the
+ /// `Alphabetic` and `Join_Control` properties, along with the
+ /// `Decimal_Number`, `Mark` and `Connector_Punctuation` general
+ /// categories.
+ ///
+ /// Since words are made up of one or more codepoints, this iterator
+ /// yields `&str` elements. When invalid UTF-8 is encountered, replacement
+ /// codepoints are [substituted](index.html#handling-of-invalid-utf-8).
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::ByteSlice;
+ ///
+ /// let bs = br#"The quick ("brown") fox can't jump 32.3 feet, right?"#;
+ /// let words: Vec<&str> = bs.words().collect();
+ /// assert_eq!(words, vec![
+ /// "The", "quick", "brown", "fox", "can't",
+ /// "jump", "32.3", "feet", "right",
+ /// ]);
+ /// ```
+ #[cfg(feature = "unicode")]
+ #[inline]
+ fn words(&self) -> Words<'_> {
+     // The iterator borrows the raw bytes of this byte string.
+     let haystack = self.as_bytes();
+     Words::new(haystack)
+ }
+
+ /// Returns an iterator over the words in this byte string along with
+ /// their starting and ending byte index positions.
+ ///
+ /// This is similar to
+ /// [`words_with_break_indices`](trait.ByteSlice.html#method.words_with_break_indices),
+ /// except it only returns elements that contain a "word" character. A word
+ /// character is defined by UTS #18 (Annex C) to be the combination of the
+ /// `Alphabetic` and `Join_Control` properties, along with the
+ /// `Decimal_Number`, `Mark` and `Connector_Punctuation` general
+ /// categories.
+ ///
+ /// Since words are made up of one or more codepoints, this iterator
+ /// yields `&str` elements. When invalid UTF-8 is encountered, replacement
+ /// codepoints are [substituted](index.html#handling-of-invalid-utf-8).
+ ///
+ /// # Examples
+ ///
+ /// This example shows how to get the byte offsets of each individual
+ /// word:
+ ///
+ /// ```
+ /// use bstr::ByteSlice;
+ ///
+ /// let bs = b"can't jump 32.3 feet";
+ /// let words: Vec<(usize, usize, &str)> = bs.word_indices().collect();
+ /// assert_eq!(words, vec![
+ /// (0, 5, "can't"),
+ /// (6, 10, "jump"),
+ /// (11, 15, "32.3"),
+ /// (16, 20, "feet"),
+ /// ]);
+ /// ```
+ #[cfg(feature = "unicode")]
+ #[inline]
+ fn word_indices(&self) -> WordIndices<'_> {
+     // The iterator borrows the raw bytes of this byte string.
+     let haystack = self.as_bytes();
+     WordIndices::new(haystack)
+ }
+
+ /// Returns an iterator over the words in this byte string, along with
+ /// all breaks between the words. Concatenating all elements yielded by
+ /// the iterator results in the original string (modulo Unicode replacement
+ /// codepoint substitutions if invalid UTF-8 is encountered).
+ ///
+ /// Since words are made up of one or more codepoints, this iterator
+ /// yields `&str` elements. When invalid UTF-8 is encountered, replacement
+ /// codepoints are [substituted](index.html#handling-of-invalid-utf-8).
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::ByteSlice;
+ ///
+ /// let bs = br#"The quick ("brown") fox can't jump 32.3 feet, right?"#;
+ /// let words: Vec<&str> = bs.words_with_breaks().collect();
+ /// assert_eq!(words, vec![
+ /// "The", " ", "quick", " ", "(", "\"", "brown", "\"", ")",
+ /// " ", "fox", " ", "can't", " ", "jump", " ", "32.3", " ", "feet",
+ /// ",", " ", "right", "?",
+ /// ]);
+ /// ```
+ #[cfg(feature = "unicode")]
+ #[inline]
+ fn words_with_breaks(&self) -> WordsWithBreaks<'_> {
+     // The iterator borrows the raw bytes of this byte string.
+     let haystack = self.as_bytes();
+     WordsWithBreaks::new(haystack)
+ }
+
+ /// Returns an iterator over the words and their byte offsets in this
+ /// byte string, along with all breaks between the words. Concatenating
+ /// all elements yielded by the iterator results in the original string
+ /// (modulo Unicode replacement codepoint substitutions if invalid UTF-8 is
+ /// encountered).
+ ///
+ /// Since words are made up of one or more codepoints, this iterator
+ /// yields `&str` elements. When invalid UTF-8 is encountered, replacement
+ /// codepoints are [substituted](index.html#handling-of-invalid-utf-8).
+ ///
+ /// # Examples
+ ///
+ /// This example shows how to get the byte offsets of each individual
+ /// word:
+ ///
+ /// ```
+ /// use bstr::ByteSlice;
+ ///
+ /// let bs = b"can't jump 32.3 feet";
+ /// let words: Vec<(usize, usize, &str)> =
+ /// bs.words_with_break_indices().collect();
+ /// assert_eq!(words, vec![
+ /// (0, 5, "can't"),
+ /// (5, 6, " "),
+ /// (6, 10, "jump"),
+ /// (10, 11, " "),
+ /// (11, 15, "32.3"),
+ /// (15, 16, " "),
+ /// (16, 20, "feet"),
+ /// ]);
+ /// ```
+ #[cfg(feature = "unicode")]
+ #[inline]
+ fn words_with_break_indices(&self) -> WordsWithBreakIndices<'_> {
+     // The iterator borrows the raw bytes of this byte string.
+     let haystack = self.as_bytes();
+     WordsWithBreakIndices::new(haystack)
+ }
+
+ /// Returns an iterator over the sentences in this byte string.
+ ///
+ /// Typically, a sentence will include its trailing punctuation and
+ /// whitespace. Concatenating all elements yielded by the iterator
+ /// results in the original string (modulo Unicode replacement codepoint
+ /// substitutions if invalid UTF-8 is encountered).
+ ///
+ /// Since sentences are made up of one or more codepoints, this iterator
+ /// yields `&str` elements. When invalid UTF-8 is encountered, replacement
+ /// codepoints are [substituted](index.html#handling-of-invalid-utf-8).
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::ByteSlice;
+ ///
+ /// let bs = b"I want this. Not that. Right now.";
+ /// let sentences: Vec<&str> = bs.sentences().collect();
+ /// assert_eq!(sentences, vec![
+ /// "I want this. ",
+ /// "Not that. ",
+ /// "Right now.",
+ /// ]);
+ /// ```
+ #[cfg(feature = "unicode")]
+ #[inline]
+ fn sentences(&self) -> Sentences<'_> {
+     // The iterator borrows the raw bytes of this byte string.
+     let haystack = self.as_bytes();
+     Sentences::new(haystack)
+ }
+
+ /// Returns an iterator over the sentences in this byte string along with
+ /// their starting and ending byte index positions.
+ ///
+ /// Typically, a sentence will include its trailing punctuation and
+ /// whitespace. Concatenating all elements yielded by the iterator
+ /// results in the original string (modulo Unicode replacement codepoint
+ /// substitutions if invalid UTF-8 is encountered).
+ ///
+ /// Since sentences are made up of one or more codepoints, this iterator
+ /// yields `&str` elements. When invalid UTF-8 is encountered, replacement
+ /// codepoints are [substituted](index.html#handling-of-invalid-utf-8).
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::ByteSlice;
+ ///
+ /// let bs = b"I want this. Not that. Right now.";
+ /// let sentences: Vec<(usize, usize, &str)> =
+ /// bs.sentence_indices().collect();
+ /// assert_eq!(sentences, vec![
+ /// (0, 13, "I want this. "),
+ /// (13, 23, "Not that. "),
+ /// (23, 33, "Right now."),
+ /// ]);
+ /// ```
+ #[cfg(feature = "unicode")]
+ #[inline]
+ fn sentence_indices(&self) -> SentenceIndices<'_> {
+     // The iterator borrows the raw bytes of this byte string.
+     let haystack = self.as_bytes();
+     SentenceIndices::new(haystack)
+ }
+
+ /// An iterator over all lines in a byte string, without their
+ /// terminators.
+ ///
+ /// For this iterator, the only line terminators recognized are `\r\n` and
+ /// `\n`.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// let s = b"\
+ /// foo
+ ///
+ /// bar\r
+ /// baz
+ ///
+ ///
+ /// quux";
+ /// let lines: Vec<&[u8]> = s.lines().collect();
+ /// assert_eq!(lines, vec![
+ /// B("foo"), B(""), B("bar"), B("baz"), B(""), B(""), B("quux"),
+ /// ]);
+ /// ```
+ #[inline]
+ fn lines(&self) -> Lines<'_> {
+     // The iterator borrows the raw bytes of this byte string.
+     let haystack = self.as_bytes();
+     Lines::new(haystack)
+ }
+
+ /// An iterator over all lines in a byte string, including their
+ /// terminators.
+ ///
+ /// For this iterator, the only line terminator recognized is `\n`. (Since
+ /// line terminators are included, this also handles `\r\n` line endings.)
+ ///
+ /// Line terminators are only included if they are present in the original
+ /// byte string. For example, the last line in a byte string may not end
+ /// with a line terminator.
+ ///
+ /// Concatenating all elements yielded by this iterator is guaranteed to
+ /// yield the original byte string.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// let s = b"\
+ /// foo
+ ///
+ /// bar\r
+ /// baz
+ ///
+ ///
+ /// quux";
+ /// let lines: Vec<&[u8]> = s.lines_with_terminator().collect();
+ /// assert_eq!(lines, vec![
+ /// B("foo\n"),
+ /// B("\n"),
+ /// B("bar\r\n"),
+ /// B("baz\n"),
+ /// B("\n"),
+ /// B("\n"),
+ /// B("quux"),
+ /// ]);
+ /// ```
+ #[inline]
+ fn lines_with_terminator(&self) -> LinesWithTerminator<'_> {
+     // The iterator borrows the raw bytes of this byte string.
+     let haystack = self.as_bytes();
+     LinesWithTerminator::new(haystack)
+ }
+
+ /// Return a byte string slice with leading and trailing whitespace
+ /// removed.
+ ///
+ /// Whitespace is defined according to the terms of the `White_Space`
+ /// Unicode property.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// let s = B(" foo\tbar\t\u{2003}\n");
+ /// assert_eq!(s.trim(), B("foo\tbar"));
+ /// ```
+ #[cfg(feature = "unicode")]
+ #[inline]
+ fn trim(&self) -> &[u8] {
+     // Strip the front first, then strip the back of what remains.
+     let without_leading = self.trim_start();
+     without_leading.trim_end()
+ }
+
+ /// Return a byte string slice with leading whitespace removed.
+ ///
+ /// Whitespace is defined according to the terms of the `White_Space`
+ /// Unicode property.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// let s = B(" foo\tbar\t\u{2003}\n");
+ /// assert_eq!(s.trim_start(), B("foo\tbar\t\u{2003}\n"));
+ /// ```
+ #[cfg(feature = "unicode")]
+ #[inline]
+ fn trim_start(&self) -> &[u8] {
+     let bytes = self.as_bytes();
+     // The helper's result is used as the offset at which the kept
+     // portion of the byte string begins.
+     let skip = whitespace_len_fwd(bytes);
+     &bytes[skip..]
+ }
+
+ /// Return a byte string slice with trailing whitespace removed.
+ ///
+ /// Whitespace is defined according to the terms of the `White_Space`
+ /// Unicode property.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// let s = B(" foo\tbar\t\u{2003}\n");
+ /// assert_eq!(s.trim_end(), B(" foo\tbar"));
+ /// ```
+ #[cfg(feature = "unicode")]
+ #[inline]
+ fn trim_end(&self) -> &[u8] {
+     let bytes = self.as_bytes();
+     // The helper's result is used as the offset at which the kept
+     // portion of the byte string ends.
+     let keep = whitespace_len_rev(bytes);
+     &bytes[..keep]
+ }
+
+ /// Return a byte string slice with leading and trailing characters
+ /// satisfying the given predicate removed.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// let s = b"123foo5bar789";
+ /// assert_eq!(s.trim_with(|c| c.is_numeric()), B("foo5bar"));
+ /// ```
+ #[inline]
+ fn trim_with<F: FnMut(char) -> bool>(&self, mut trim: F) -> &[u8] {
+     // The predicate is lent out by mutable reference so the same closure
+     // can be used for the front pass and then for the back pass.
+     let without_leading = self.trim_start_with(&mut trim);
+     without_leading.trim_end_with(&mut trim)
+ }
+
+ /// Return a byte string slice with leading characters satisfying the given
+ /// predicate removed.
+ ///
+ /// The returned slice is always a subslice of this byte string. When
+ /// every character satisfies the predicate, the result is the empty slice
+ /// positioned at the end of this byte string.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// let s = b"123foo5bar789";
+ /// assert_eq!(s.trim_start_with(|c| c.is_numeric()), B("foo5bar789"));
+ /// ```
+ #[inline]
+ fn trim_start_with<F: FnMut(char) -> bool>(&self, mut trim: F) -> &[u8] {
+     let bytes = self.as_bytes();
+     // Start offset of the first character the predicate rejects. If
+     // nothing is rejected, everything is trimmed and the kept part is
+     // the empty tail of `bytes` (rather than an unrelated static slice).
+     let start = self
+         .char_indices()
+         .find(|&(_, _, ch)| !trim(ch))
+         .map_or(bytes.len(), |(s, _, _)| s);
+     &bytes[start..]
+ }
+
+ /// Return a byte string slice with trailing characters satisfying the
+ /// given predicate removed.
+ ///
+ /// The returned slice is always a subslice of this byte string. When
+ /// every character satisfies the predicate, the result is the empty slice
+ /// positioned at the start of this byte string.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// let s = b"123foo5bar789";
+ /// assert_eq!(s.trim_end_with(|c| c.is_numeric()), B("123foo5bar"));
+ /// ```
+ #[inline]
+ fn trim_end_with<F: FnMut(char) -> bool>(&self, mut trim: F) -> &[u8] {
+     let bytes = self.as_bytes();
+     // Walking backwards, find the end offset of the first character the
+     // predicate rejects. If nothing is rejected, everything is trimmed
+     // and the kept part is the empty head of `bytes` (rather than an
+     // unrelated static slice).
+     let end = self
+         .char_indices()
+         .rev()
+         .find(|&(_, _, ch)| !trim(ch))
+         .map_or(0, |(_, e, _)| e);
+     &bytes[..end]
+ }
+
+ /// Returns a new `Vec<u8>` containing the lowercase equivalent of this
+ /// byte string.
+ ///
+ /// In this case, lowercase is defined according to the `Lowercase` Unicode
+ /// property.
+ ///
+ /// If invalid UTF-8 is seen, or if a character has no lowercase variant,
+ /// then it is copied to the returned `Vec<u8>` unchanged.
+ ///
+ /// Note that some characters in this byte string may expand into multiple
+ /// characters when changing the case, so the number of bytes in the
+ /// returned `Vec<u8>` may not be equivalent to the number of bytes in
+ /// this byte string.
+ ///
+ /// If you'd like to reuse an allocation for performance reasons, then use
+ /// [`to_lowercase_into`](#method.to_lowercase_into) instead.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// let s = B("HELLO Β");
+ /// assert_eq!("hello β".as_bytes(), s.to_lowercase().as_bytes());
+ /// ```
+ ///
+ /// Scripts without case are not changed:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// let s = B("农历新年");
+ /// assert_eq!("农历新年".as_bytes(), s.to_lowercase().as_bytes());
+ /// ```
+ ///
+ /// Invalid UTF-8 remains as is:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// let s = B(b"FOO\xFFBAR\xE2\x98BAZ");
+ /// assert_eq!(B(b"foo\xFFbar\xE2\x98baz"), s.to_lowercase().as_bytes());
+ /// ```
+ #[cfg(all(feature = "alloc", feature = "unicode"))]
+ #[inline]
+ fn to_lowercase(&self) -> Vec<u8> {
+     // Begin with an empty vector and let the `_into` variant do all of
+     // the work, including reserving space.
+     let mut lowered = Vec::new();
+     self.to_lowercase_into(&mut lowered);
+     lowered
+ }
+
+ /// Writes the lowercase equivalent of this byte string into the given
+ /// buffer. The buffer is not cleared before written to.
+ ///
+ /// In this case, lowercase is defined according to the `Lowercase`
+ /// Unicode property.
+ ///
+ /// If invalid UTF-8 is seen, or if a character has no lowercase variant,
+ /// then it is written to the given buffer unchanged.
+ ///
+ /// Note that some characters in this byte string may expand into multiple
+ /// characters when changing the case, so the number of bytes written to
+ /// the given byte string may not be equivalent to the number of bytes in
+ /// this byte string.
+ ///
+ /// If you don't need to amortize allocation and instead prefer
+ /// convenience, then use [`to_lowercase`](#method.to_lowercase) instead.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// let s = B("HELLO Β");
+ ///
+ /// let mut buf = vec![];
+ /// s.to_lowercase_into(&mut buf);
+ /// assert_eq!("hello β".as_bytes(), buf.as_bytes());
+ /// ```
+ ///
+ /// Scripts without case are not changed:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// let s = B("农历新年");
+ ///
+ /// let mut buf = vec![];
+ /// s.to_lowercase_into(&mut buf);
+ /// assert_eq!("农历新年".as_bytes(), buf.as_bytes());
+ /// ```
+ ///
+ /// Invalid UTF-8 remains as is:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// let s = B(b"FOO\xFFBAR\xE2\x98BAZ");
+ ///
+ /// let mut buf = vec![];
+ /// s.to_lowercase_into(&mut buf);
+ /// assert_eq!(B(b"foo\xFFbar\xE2\x98baz"), buf.as_bytes());
+ /// ```
+ #[cfg(all(feature = "alloc", feature = "unicode"))]
+ #[inline]
+ fn to_lowercase_into(&self, buf: &mut Vec<u8>) {
+     // TODO: This is the best we can do given what std exposes I think.
+     // If we roll our own case handling, then we might be able to do this
+     // a bit faster. We shouldn't roll our own case handling unless we
+     // need to, e.g., for doing caseless matching or case folding.
+
+     // TODO(BUG): This doesn't handle any special casing rules.
+
+     let haystack = self.as_bytes();
+     buf.reserve(haystack.len());
+     for (start, end, ch) in self.char_indices() {
+         match ch {
+             // The replacement codepoint is what the iterator yields for
+             // invalid UTF-8, so the source bytes are copied through
+             // untouched instead of encoding U+FFFD.
+             '\u{FFFD}' => {
+                 buf.push_str(&haystack[start..end]);
+             }
+             // ASCII maps one-to-one, so skip the general iterator.
+             c if c.is_ascii() => {
+                 buf.push_char(c.to_ascii_lowercase());
+             }
+             // A single codepoint may lowercase to several codepoints.
+             c => {
+                 for lower in c.to_lowercase() {
+                     buf.push_char(lower);
+                 }
+             }
+         }
+     }
+ }
+
+ /// Returns a new `Vec<u8>` containing the ASCII lowercase equivalent of
+ /// this byte string.
+ ///
+ /// In this case, lowercase is only defined in ASCII letters. Namely, the
+ /// letters `A-Z` are converted to `a-z`. All other bytes remain unchanged.
+ /// In particular, the length of the byte string returned is always
+ /// equivalent to the length of this byte string.
+ ///
+ /// If you'd like to reuse an allocation for performance reasons, then use
+ /// [`make_ascii_lowercase`](#method.make_ascii_lowercase) to perform
+ /// the conversion in place.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// let s = B("HELLO Β");
+ /// assert_eq!("hello Β".as_bytes(), s.to_ascii_lowercase().as_bytes());
+ /// ```
+ ///
+ /// Invalid UTF-8 remains as is:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// let s = B(b"FOO\xFFBAR\xE2\x98BAZ");
+ /// assert_eq!(s.to_ascii_lowercase(), B(b"foo\xFFbar\xE2\x98baz"));
+ /// ```
+ #[cfg(feature = "alloc")]
+ #[inline]
+ fn to_ascii_lowercase(&self) -> Vec<u8> {
+     // Defer to the conversion available on the underlying byte slice.
+     let bytes = self.as_bytes();
+     bytes.to_ascii_lowercase()
+ }
+
+ /// Convert this byte string to its lowercase ASCII equivalent in place.
+ ///
+ /// In this case, lowercase is only defined in ASCII letters. Namely, the
+ /// letters `A-Z` are converted to `a-z`. All other bytes remain unchanged.
+ ///
+ /// If you don't need to do the conversion in
+ /// place and instead prefer convenience, then use
+ /// [`to_ascii_lowercase`](#method.to_ascii_lowercase) instead.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::ByteSlice;
+ ///
+ /// let mut s = <Vec<u8>>::from("HELLO Β");
+ /// s.make_ascii_lowercase();
+ /// assert_eq!(s, "hello Β".as_bytes());
+ /// ```
+ ///
+ /// Invalid UTF-8 remains as is:
+ ///
+ /// ```
+ /// # #[cfg(feature = "alloc")] {
+ /// use bstr::{B, ByteSlice, ByteVec};
+ ///
+ /// let mut s = <Vec<u8>>::from_slice(b"FOO\xFFBAR\xE2\x98BAZ");
+ /// s.make_ascii_lowercase();
+ /// assert_eq!(s, B(b"foo\xFFbar\xE2\x98baz"));
+ /// # }
+ /// ```
+ #[inline]
+ fn make_ascii_lowercase(&mut self) {
+     // Defer to the in-place conversion on the underlying byte slice.
+     let bytes = self.as_bytes_mut();
+     bytes.make_ascii_lowercase();
+ }
+
+ /// Returns a new `Vec<u8>` containing the uppercase equivalent of this
+ /// byte string.
+ ///
+ /// In this case, uppercase is defined according to the `Uppercase`
+ /// Unicode property.
+ ///
+ /// If invalid UTF-8 is seen, or if a character has no uppercase variant,
+ /// then it is copied to the returned `Vec<u8>` unchanged.
+ ///
+ /// Note that some characters in this byte string may expand into multiple
+ /// characters when changing the case, so the number of bytes in the
+ /// returned `Vec<u8>` may not be equivalent to the number of bytes in
+ /// this byte string.
+ ///
+ /// If you'd like to reuse an allocation for performance reasons, then use
+ /// [`to_uppercase_into`](#method.to_uppercase_into) instead.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// let s = B("hello β");
+ /// assert_eq!(s.to_uppercase(), B("HELLO Β"));
+ /// ```
+ ///
+ /// Scripts without case are not changed:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// let s = B("农历新年");
+ /// assert_eq!(s.to_uppercase(), B("农历新年"));
+ /// ```
+ ///
+ /// Invalid UTF-8 remains as is:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// let s = B(b"foo\xFFbar\xE2\x98baz");
+ /// assert_eq!(s.to_uppercase(), B(b"FOO\xFFBAR\xE2\x98BAZ"));
+ /// ```
+ #[cfg(all(feature = "alloc", feature = "unicode"))]
+ #[inline]
+ fn to_uppercase(&self) -> Vec<u8> {
+     // Begin with an empty vector and let the `_into` variant do all of
+     // the work, including reserving space.
+     let mut raised = Vec::new();
+     self.to_uppercase_into(&mut raised);
+     raised
+ }
+
+ /// Writes the uppercase equivalent of this byte string into the given
+ /// buffer. The buffer is not cleared before written to.
+ ///
+ /// In this case, uppercase is defined according to the `Uppercase`
+ /// Unicode property.
+ ///
+ /// If invalid UTF-8 is seen, or if a character has no uppercase variant,
+ /// then it is written to the given buffer unchanged.
+ ///
+ /// Note that some characters in this byte string may expand into multiple
+ /// characters when changing the case, so the number of bytes written to
+ /// the given byte string may not be equivalent to the number of bytes in
+ /// this byte string.
+ ///
+ /// If you don't need to amortize allocation and instead prefer
+ /// convenience, then use [`to_uppercase`](#method.to_uppercase) instead.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// let s = B("hello β");
+ ///
+ /// let mut buf = vec![];
+ /// s.to_uppercase_into(&mut buf);
+ /// assert_eq!(buf, B("HELLO Β"));
+ /// ```
+ ///
+ /// Scripts without case are not changed:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// let s = B("农历新年");
+ ///
+ /// let mut buf = vec![];
+ /// s.to_uppercase_into(&mut buf);
+ /// assert_eq!(buf, B("农历新年"));
+ /// ```
+ ///
+ /// Invalid UTF-8 remains as is:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// let s = B(b"foo\xFFbar\xE2\x98baz");
+ ///
+ /// let mut buf = vec![];
+ /// s.to_uppercase_into(&mut buf);
+ /// assert_eq!(buf, B(b"FOO\xFFBAR\xE2\x98BAZ"));
+ /// ```
+ #[cfg(all(feature = "alloc", feature = "unicode"))]
+ #[inline]
+ fn to_uppercase_into(&self, buf: &mut Vec<u8>) {
+     // TODO: This is the best we can do given what std exposes I think.
+     // If we roll our own case handling, then we might be able to do this
+     // a bit faster. We shouldn't roll our own case handling unless we
+     // need to, e.g., for doing caseless matching or case folding.
+     let haystack = self.as_bytes();
+     buf.reserve(haystack.len());
+     for (start, end, ch) in self.char_indices() {
+         match ch {
+             // The replacement codepoint is what the iterator yields for
+             // invalid UTF-8, so the source bytes are copied through
+             // untouched instead of encoding U+FFFD.
+             '\u{FFFD}' => {
+                 buf.push_str(&haystack[start..end]);
+             }
+             // ASCII maps one-to-one, so skip the general iterator.
+             c if c.is_ascii() => {
+                 buf.push_char(c.to_ascii_uppercase());
+             }
+             // A single codepoint may uppercase to several codepoints.
+             c => {
+                 for upper in c.to_uppercase() {
+                     buf.push_char(upper);
+                 }
+             }
+         }
+     }
+ }
+
+ /// Returns a new `Vec<u8>` containing the ASCII uppercase equivalent of
+ /// this byte string.
+ ///
+ /// In this case, uppercase is only defined in ASCII letters. Namely, the
+ /// letters `a-z` are converted to `A-Z`. All other bytes remain unchanged.
+ /// In particular, the length of the byte string returned is always
+ /// equivalent to the length of this byte string.
+ ///
+ /// If you'd like to reuse an allocation for performance reasons, then use
+ /// [`make_ascii_uppercase`](#method.make_ascii_uppercase) to perform
+ /// the conversion in place.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// let s = B("hello β");
+ /// assert_eq!(s.to_ascii_uppercase(), B("HELLO β"));
+ /// ```
+ ///
+ /// Invalid UTF-8 remains as is:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// let s = B(b"foo\xFFbar\xE2\x98baz");
+ /// assert_eq!(s.to_ascii_uppercase(), B(b"FOO\xFFBAR\xE2\x98BAZ"));
+ /// ```
+ #[cfg(feature = "alloc")]
+ #[inline]
+ fn to_ascii_uppercase(&self) -> Vec<u8> {
+     // Defer to the conversion available on the underlying byte slice.
+     let bytes = self.as_bytes();
+     bytes.to_ascii_uppercase()
+ }
+
+ /// Convert this byte string to its uppercase ASCII equivalent in place.
+ ///
+ /// In this case, uppercase is only defined in ASCII letters. Namely, the
+ /// letters `a-z` are converted to `A-Z`. All other bytes remain unchanged.
+ ///
+ /// If you don't need to do the conversion in
+ /// place and instead prefer convenience, then use
+ /// [`to_ascii_uppercase`](#method.to_ascii_uppercase) instead.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// let mut s = <Vec<u8>>::from("hello β");
+ /// s.make_ascii_uppercase();
+ /// assert_eq!(s, B("HELLO β"));
+ /// ```
+ ///
+ /// Invalid UTF-8 remains as is:
+ ///
+ /// ```
+ /// # #[cfg(feature = "alloc")] {
+ /// use bstr::{B, ByteSlice, ByteVec};
+ ///
+ /// let mut s = <Vec<u8>>::from_slice(b"foo\xFFbar\xE2\x98baz");
+ /// s.make_ascii_uppercase();
+ /// assert_eq!(s, B(b"FOO\xFFBAR\xE2\x98BAZ"));
+ /// # }
+ /// ```
+ #[inline]
+ fn make_ascii_uppercase(&mut self) {
+     // Defer to the in-place conversion on the underlying byte slice.
+     let bytes = self.as_bytes_mut();
+     bytes.make_ascii_uppercase();
+ }
+
+ /// Escapes this byte string into a sequence of `char` values.
+ ///
+ /// When the sequence of `char` values is concatenated into a string, the
+ /// result is always valid UTF-8. Any unprintable or invalid UTF-8 in this
+ /// byte string are escaped using using `\xNN` notation. Moreover, the
+ /// characters `\0`, `\r`, `\n`, `\t` and `\` are escaped as well.
+ ///
+ /// This is useful when one wants to get a human readable view of the raw
+ /// bytes that is also valid UTF-8.
+ ///
+ /// The iterator returned implements the `Display` trait. So one can do
+ /// `b"foo\xFFbar".escape_bytes().to_string()` to get a `String` with its
+ /// bytes escaped.
+ ///
+ /// The dual of this function is [`ByteVec::unescape_bytes`].
+ ///
+ /// Note that this is similar to, but not equivalent to the `Debug`
+ /// implementation on [`BStr`] and [`BString`]. The `Debug` implementations
+ /// also use the debug representation for all Unicode codepoints. However,
+ /// this escaping routine only escapes individual bytes. All Unicode
+ /// codepoints above `U+007F` are passed through unchanged without any
+ /// escaping.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # #[cfg(feature = "alloc")] {
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// assert_eq!(r"foo\xFFbar", b"foo\xFFbar".escape_bytes().to_string());
+ /// assert_eq!(r"foo\nbar", b"foo\nbar".escape_bytes().to_string());
+ /// assert_eq!(r"foo\tbar", b"foo\tbar".escape_bytes().to_string());
+ /// assert_eq!(r"foo\\bar", b"foo\\bar".escape_bytes().to_string());
+ /// assert_eq!(r"foo☃bar", B("foo☃bar").escape_bytes().to_string());
+ /// # }
+ /// ```
+ #[inline]
+ fn escape_bytes(&self) -> EscapeBytes<'_> {
+ EscapeBytes::new(self.as_bytes())
+ }
+
+ /// Reverse the bytes in this string, in place.
+ ///
+ /// This is not necessarily a well formed operation! For example, if this
+ /// byte string contains valid UTF-8 that isn't ASCII, then reversing the
+ /// string will likely result in invalid UTF-8 and otherwise non-sensical
+ /// content.
+ ///
+ /// Note that this is equivalent to the generic `[u8]::reverse` method.
+ /// This method is provided to permit callers to explicitly differentiate
+ /// between reversing bytes, codepoints and graphemes.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::ByteSlice;
+ ///
+ /// let mut s = <Vec<u8>>::from("hello");
+ /// s.reverse_bytes();
+ /// assert_eq!(s, "olleh".as_bytes());
+ /// ```
+ #[inline]
+ fn reverse_bytes(&mut self) {
+ self.as_bytes_mut().reverse();
+ }
+
+ /// Reverse the codepoints in this string, in place.
+ ///
+ /// If this byte string is valid UTF-8, then its reversal by codepoint
+ /// is also guaranteed to be valid UTF-8.
+ ///
+ /// This operation is equivalent to the following, but without allocating:
+ ///
+ /// ```
+ /// use bstr::ByteSlice;
+ ///
+ /// let mut s = <Vec<u8>>::from("foo☃bar");
+ ///
+ /// let mut chars: Vec<char> = s.chars().collect();
+ /// chars.reverse();
+ ///
+ /// let reversed: String = chars.into_iter().collect();
+ /// assert_eq!(reversed, "rab☃oof");
+ /// ```
+ ///
+ /// Note that this is not necessarily a well formed operation. For example,
+ /// if this byte string contains grapheme clusters with more than one
+ /// codepoint, then those grapheme clusters will not necessarily be
+ /// preserved. If you'd like to preserve grapheme clusters, then use
+ /// [`reverse_graphemes`](#method.reverse_graphemes) instead.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::ByteSlice;
+ ///
+ /// let mut s = <Vec<u8>>::from("foo☃bar");
+ /// s.reverse_chars();
+ /// assert_eq!(s, "rab☃oof".as_bytes());
+ /// ```
+ ///
+ /// This example shows that not all reversals lead to a well formed string.
+ /// For example, in this case, combining marks are used to put accents over
+ /// some letters, and those accent marks must appear after the codepoints
+ /// they modify.
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// let mut s = <Vec<u8>>::from("résumé");
+ /// s.reverse_chars();
+ /// assert_eq!(s, B(b"\xCC\x81emus\xCC\x81er"));
+ /// ```
+ ///
+ /// A word of warning: the above example relies on the fact that
+ /// `résumé` is in decomposed normal form, which means there are separate
+ /// codepoints for the accents above `e`. If it is instead in composed
+ /// normal form, then the example works:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// let mut s = <Vec<u8>>::from("résumé");
+ /// s.reverse_chars();
+ /// assert_eq!(s, B("émusér"));
+ /// ```
+ ///
+ /// The point here is to be cautious and not assume that just because
+ /// `reverse_chars` works in one case, that it therefore works in all
+ /// cases.
+ #[inline]
+ fn reverse_chars(&mut self) {
+ let mut i = 0;
+ loop {
+ let (_, size) = utf8::decode(&self.as_bytes()[i..]);
+ if size == 0 {
+ break;
+ }
+ if size > 1 {
+ self.as_bytes_mut()[i..i + size].reverse_bytes();
+ }
+ i += size;
+ }
+ self.reverse_bytes();
+ }
+
/// Reverse the graphemes in this string, in place.
///
/// If this byte string is valid UTF-8, then its reversal by grapheme
/// is also guaranteed to be valid UTF-8.
///
/// This operation is equivalent to the following, but without allocating:
///
/// ```
/// use bstr::ByteSlice;
///
/// let mut s = <Vec<u8>>::from("foo☃bar");
///
/// let mut graphemes: Vec<&str> = s.graphemes().collect();
/// graphemes.reverse();
///
/// let reversed = graphemes.concat();
/// assert_eq!(reversed, "rab☃oof");
/// ```
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// use bstr::ByteSlice;
///
/// let mut s = <Vec<u8>>::from("foo☃bar");
/// s.reverse_graphemes();
/// assert_eq!(s, "rab☃oof".as_bytes());
/// ```
///
/// This example shows how this correctly handles grapheme clusters,
/// unlike `reverse_chars`. The accented letters are spelled in decomposed
/// form (`e` followed by the combining acute accent `U+0301`), written as
/// escapes so that the multi-codepoint clusters are visible in the source:
///
/// ```
/// use bstr::ByteSlice;
///
/// let mut s = <Vec<u8>>::from("re\u{301}sume\u{301}");
/// s.reverse_graphemes();
/// assert_eq!(s, "e\u{301}muse\u{301}r".as_bytes());
/// ```
#[cfg(feature = "unicode")]
#[inline]
fn reverse_graphemes(&mut self) {
    use crate::unicode::decode_grapheme;

    // Pass 1: reverse the bytes of every multi-byte grapheme individually.
    let mut i = 0;
    loop {
        let (_, size) = decode_grapheme(&self.as_bytes()[i..]);
        // A size of zero is the decoder's signal that we are done.
        if size == 0 {
            break;
        }
        // Single-byte graphemes are their own reversal.
        if size > 1 {
            self.as_bytes_mut()[i..i + size].reverse_bytes();
        }
        i += size;
    }
    // Pass 2: reversing the whole buffer puts the graphemes in reverse
    // order and, at the same time, undoes the per-grapheme byte reversal
    // from pass 1, so each grapheme reads forwards again.
    self.reverse_bytes();
}
+
+ /// Returns true if and only if every byte in this byte string is ASCII.
+ ///
+ /// ASCII is an encoding that defines 128 codepoints. A byte corresponds to
+ /// an ASCII codepoint if and only if it is in the inclusive range
+ /// `[0, 127]`.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// assert!(B("abc").is_ascii());
+ /// assert!(!B("☃βツ").is_ascii());
+ /// assert!(!B(b"\xFF").is_ascii());
+ /// ```
+ #[inline]
+ fn is_ascii(&self) -> bool {
+ ascii::first_non_ascii_byte(self.as_bytes()) == self.as_bytes().len()
+ }
+
+ /// Returns true if and only if the entire byte string is valid UTF-8.
+ ///
+ /// If you need location information about where a byte string's first
+ /// invalid UTF-8 byte is, then use the [`to_str`](#method.to_str) method.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// assert!(B("abc").is_utf8());
+ /// assert!(B("☃βツ").is_utf8());
+ /// // invalid bytes
+ /// assert!(!B(b"abc\xFF").is_utf8());
+ /// // surrogate encoding
+ /// assert!(!B(b"\xED\xA0\x80").is_utf8());
+ /// // incomplete sequence
+ /// assert!(!B(b"\xF0\x9D\x9Ca").is_utf8());
+ /// // overlong sequence
+ /// assert!(!B(b"\xF0\x82\x82\xAC").is_utf8());
+ /// ```
+ #[inline]
+ fn is_utf8(&self) -> bool {
+ utf8::validate(self.as_bytes()).is_ok()
+ }
+
+ /// Returns the last byte in this byte string, if it's non-empty. If this
+ /// byte string is empty, this returns `None`.
+ ///
+ /// Note that this is like the generic `[u8]::last`, except this returns
+ /// the byte by value instead of a reference to the byte.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::ByteSlice;
+ ///
+ /// assert_eq!(Some(b'z'), b"baz".last_byte());
+ /// assert_eq!(None, b"".last_byte());
+ /// ```
+ #[inline]
+ fn last_byte(&self) -> Option<u8> {
+ let bytes = self.as_bytes();
+ bytes.get(bytes.len().saturating_sub(1)).map(|&b| b)
+ }
+
+ /// Returns the index of the first non-ASCII byte in this byte string (if
+ /// any such indices exist). Specifically, it returns the index of the
+ /// first byte with a value greater than or equal to `0x80`.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::{ByteSlice, B};
+ ///
+ /// assert_eq!(Some(3), b"abc\xff".find_non_ascii_byte());
+ /// assert_eq!(None, b"abcde".find_non_ascii_byte());
+ /// assert_eq!(Some(0), B("😀").find_non_ascii_byte());
+ /// ```
+ #[inline]
+ fn find_non_ascii_byte(&self) -> Option<usize> {
+ let index = ascii::first_non_ascii_byte(self.as_bytes());
+ if index == self.as_bytes().len() {
+ None
+ } else {
+ Some(index)
+ }
+ }
+}
+
+/// A single substring searcher fixed to a particular needle.
+///
+/// The purpose of this type is to permit callers to construct a substring
+/// searcher that can be used to search haystacks without the overhead of
+/// constructing the searcher in the first place. This is a somewhat niche
+/// concern when it's necessary to re-use the same needle to search multiple
+/// different haystacks with as little overhead as possible. In general, using
+/// [`ByteSlice::find`](trait.ByteSlice.html#method.find)
+/// or
+/// [`ByteSlice::find_iter`](trait.ByteSlice.html#method.find_iter)
+/// is good enough, but `Finder` is useful when you can meaningfully observe
+/// searcher construction time in a profile.
+///
+/// When the `std` feature is enabled, then this type has an `into_owned`
+/// version which permits building a `Finder` that is not connected to the
+/// lifetime of its needle.
+#[derive(Clone, Debug)]
+pub struct Finder<'a>(memmem::Finder<'a>);
+
+impl<'a> Finder<'a> {
+ /// Create a new finder for the given needle.
+ #[inline]
+ pub fn new<B: ?Sized + AsRef<[u8]>>(needle: &'a B) -> Finder<'a> {
+ Finder(memmem::Finder::new(needle.as_ref()))
+ }
+
+ /// Convert this finder into its owned variant, such that it no longer
+ /// borrows the needle.
+ ///
+ /// If this is already an owned finder, then this is a no-op. Otherwise,
+ /// this copies the needle.
+ ///
+ /// This is only available when the `alloc` feature is enabled.
+ #[cfg(feature = "alloc")]
+ #[inline]
+ pub fn into_owned(self) -> Finder<'static> {
+ Finder(self.0.into_owned())
+ }
+
+ /// Returns the needle that this finder searches for.
+ ///
+ /// Note that the lifetime of the needle returned is tied to the lifetime
+ /// of the finder, and may be shorter than the `'a` lifetime. Namely, a
+ /// finder's needle can be either borrowed or owned, so the lifetime of the
+ /// needle returned must necessarily be the shorter of the two.
+ #[inline]
+ pub fn needle(&self) -> &[u8] {
+ self.0.needle()
+ }
+
+ /// Returns the index of the first occurrence of this needle in the given
+ /// haystack.
+ ///
+ /// The haystack may be any type that can be cheaply converted into a
+ /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`.
+ ///
+ /// # Complexity
+ ///
+ /// This routine is guaranteed to have worst case linear time complexity
+ /// with respect to both the needle and the haystack. That is, this runs
+ /// in `O(needle.len() + haystack.len())` time.
+ ///
+ /// This routine is also guaranteed to have worst case constant space
+ /// complexity.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::Finder;
+ ///
+ /// let haystack = "foo bar baz";
+ /// assert_eq!(Some(0), Finder::new("foo").find(haystack));
+ /// assert_eq!(Some(4), Finder::new("bar").find(haystack));
+ /// assert_eq!(None, Finder::new("quux").find(haystack));
+ /// ```
+ #[inline]
+ pub fn find<B: AsRef<[u8]>>(&self, haystack: B) -> Option<usize> {
+ self.0.find(haystack.as_ref())
+ }
+}
+
+/// A single substring reverse searcher fixed to a particular needle.
+///
+/// The purpose of this type is to permit callers to construct a substring
+/// searcher that can be used to search haystacks without the overhead of
+/// constructing the searcher in the first place. This is a somewhat niche
+/// concern when it's necessary to re-use the same needle to search multiple
+/// different haystacks with as little overhead as possible. In general, using
+/// [`ByteSlice::rfind`](trait.ByteSlice.html#method.rfind)
+/// or
+/// [`ByteSlice::rfind_iter`](trait.ByteSlice.html#method.rfind_iter)
+/// is good enough, but `FinderReverse` is useful when you can meaningfully
+/// observe searcher construction time in a profile.
+///
+/// When the `std` feature is enabled, then this type has an `into_owned`
+/// version which permits building a `FinderReverse` that is not connected to
+/// the lifetime of its needle.
+#[derive(Clone, Debug)]
+pub struct FinderReverse<'a>(memmem::FinderRev<'a>);
+
+impl<'a> FinderReverse<'a> {
+ /// Create a new reverse finder for the given needle.
+ #[inline]
+ pub fn new<B: ?Sized + AsRef<[u8]>>(needle: &'a B) -> FinderReverse<'a> {
+ FinderReverse(memmem::FinderRev::new(needle.as_ref()))
+ }
+
+ /// Convert this finder into its owned variant, such that it no longer
+ /// borrows the needle.
+ ///
+ /// If this is already an owned finder, then this is a no-op. Otherwise,
+ /// this copies the needle.
+ ///
+ /// This is only available when the `alloc` feature is enabled.
+ #[cfg(feature = "alloc")]
+ #[inline]
+ pub fn into_owned(self) -> FinderReverse<'static> {
+ FinderReverse(self.0.into_owned())
+ }
+
+ /// Returns the needle that this finder searches for.
+ ///
+ /// Note that the lifetime of the needle returned is tied to the lifetime
+ /// of this finder, and may be shorter than the `'a` lifetime. Namely,
+ /// a finder's needle can be either borrowed or owned, so the lifetime of
+ /// the needle returned must necessarily be the shorter of the two.
+ #[inline]
+ pub fn needle(&self) -> &[u8] {
+ self.0.needle()
+ }
+
+ /// Returns the index of the last occurrence of this needle in the given
+ /// haystack.
+ ///
+ /// The haystack may be any type that can be cheaply converted into a
+ /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`.
+ ///
+ /// # Complexity
+ ///
+ /// This routine is guaranteed to have worst case linear time complexity
+ /// with respect to both the needle and the haystack. That is, this runs
+ /// in `O(needle.len() + haystack.len())` time.
+ ///
+ /// This routine is also guaranteed to have worst case constant space
+ /// complexity.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::FinderReverse;
+ ///
+ /// let haystack = "foo bar baz";
+ /// assert_eq!(Some(0), FinderReverse::new("foo").rfind(haystack));
+ /// assert_eq!(Some(4), FinderReverse::new("bar").rfind(haystack));
+ /// assert_eq!(None, FinderReverse::new("quux").rfind(haystack));
+ /// ```
+ #[inline]
+ pub fn rfind<B: AsRef<[u8]>>(&self, haystack: B) -> Option<usize> {
+ self.0.rfind(haystack.as_ref())
+ }
+}
+
+/// An iterator over non-overlapping substring matches.
+///
+/// Matches are reported by the byte offset at which they begin.
+///
+/// `'h` is the lifetime of the haystack while `'n` is the lifetime of the
+/// needle.
+#[derive(Debug)]
+pub struct Find<'h, 'n> {
+ it: memmem::FindIter<'h, 'n>,
+ haystack: &'h [u8],
+ needle: &'n [u8],
+}
+
+impl<'h, 'n> Find<'h, 'n> {
+ fn new(haystack: &'h [u8], needle: &'n [u8]) -> Find<'h, 'n> {
+ Find { it: memmem::find_iter(haystack, needle), haystack, needle }
+ }
+}
+
+impl<'h, 'n> Iterator for Find<'h, 'n> {
+ type Item = usize;
+
+ #[inline]
+ fn next(&mut self) -> Option<usize> {
+ self.it.next()
+ }
+}
+
+/// An iterator over non-overlapping substring matches in reverse.
+///
+/// Matches are reported by the byte offset at which they begin.
+///
+/// `'h` is the lifetime of the haystack while `'n` is the lifetime of the
+/// needle.
+#[derive(Debug)]
+pub struct FindReverse<'h, 'n> {
+ it: memmem::FindRevIter<'h, 'n>,
+ haystack: &'h [u8],
+ needle: &'n [u8],
+}
+
+impl<'h, 'n> FindReverse<'h, 'n> {
+ fn new(haystack: &'h [u8], needle: &'n [u8]) -> FindReverse<'h, 'n> {
+ FindReverse {
+ it: memmem::rfind_iter(haystack, needle),
+ haystack,
+ needle,
+ }
+ }
+
+ fn haystack(&self) -> &'h [u8] {
+ self.haystack
+ }
+
+ fn needle(&self) -> &'n [u8] {
+ self.needle
+ }
+}
+
+impl<'h, 'n> Iterator for FindReverse<'h, 'n> {
+ type Item = usize;
+
+ #[inline]
+ fn next(&mut self) -> Option<usize> {
+ self.it.next()
+ }
+}
+
/// An iterator over the bytes in a byte string.
///
/// Bytes are yielded by value rather than by reference.
///
/// `'a` is the lifetime of the byte string being traversed.
#[derive(Clone, Debug)]
pub struct Bytes<'a> {
    /// The slice iterator that this type wraps.
    it: slice::Iter<'a, u8>,
}

impl<'a> Bytes<'a> {
    /// Views the remaining underlying data as a subslice of the original data.
    /// This has the same lifetime as the original slice,
    /// and so the iterator can continue to be used while this exists.
    #[inline]
    pub fn as_bytes(&self) -> &'a [u8] {
        self.it.as_slice()
    }
}

impl<'a> Iterator for Bytes<'a> {
    type Item = u8;

    /// Yields the next byte from the front, by value.
    #[inline]
    fn next(&mut self) -> Option<u8> {
        self.it.next().copied()
    }

    /// Forwards the size hint of the wrapped slice iterator.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.it.size_hint()
    }
}

impl<'a> DoubleEndedIterator for Bytes<'a> {
    /// Yields the next byte from the back, by value.
    #[inline]
    fn next_back(&mut self) -> Option<u8> {
        self.it.next_back().copied()
    }
}

impl<'a> ExactSizeIterator for Bytes<'a> {
    /// The number of bytes that have not been yielded yet.
    #[inline]
    fn len(&self) -> usize {
        self.it.len()
    }
}

impl<'a> iter::FusedIterator for Bytes<'a> {}
+
/// An iterator over the whitespace-separated fields of a byte string.
///
/// Whitespace, for the purposes of this iterator, is whatever the Unicode
/// property `White_Space` says it is.
///
/// Contiguous runs of whitespace act as a single separator, so the fields
/// in `foo\t\t\n \nbar` are `foo` and `bar`.
///
/// `'a` is the lifetime of the byte string being split.
#[cfg(feature = "unicode")]
#[derive(Debug)]
pub struct Fields<'a> {
    /// The predicate-based field iterator that does the actual splitting.
    it: FieldsWith<'a, fn(char) -> bool>,
}

#[cfg(feature = "unicode")]
impl<'a> Fields<'a> {
    /// Creates a field iterator over `bytes` that splits on whitespace.
    fn new(bytes: &'a [u8]) -> Fields<'a> {
        // A non-capturing closure, stored as a plain function pointer.
        let is_space: fn(char) -> bool = |ch| ch.is_whitespace();
        Fields { it: bytes.fields_with(is_space) }
    }
}

#[cfg(feature = "unicode")]
impl<'a> Iterator for Fields<'a> {
    type Item = &'a [u8];

    /// Yields the next field.
    #[inline]
    fn next(&mut self) -> Option<&'a [u8]> {
        let field = self.it.next();
        field
    }
}
+
+/// An iterator over fields in the byte string, separated by a predicate over
+/// codepoints.
+///
+/// This iterator splits a byte string based on its predicate function such
+/// that the elements returned are separated by contiguous runs of codepoints
+/// for which the predicate returns true.
+///
+/// `'a` is the lifetime of the byte string being split, while `F` is the type
+/// of the predicate, i.e., `FnMut(char) -> bool`.
+#[derive(Debug)]
+pub struct FieldsWith<'a, F> {
+ f: F,
+ bytes: &'a [u8],
+ chars: CharIndices<'a>,
+}
+
+impl<'a, F: FnMut(char) -> bool> FieldsWith<'a, F> {
+ fn new(bytes: &'a [u8], f: F) -> FieldsWith<'a, F> {
+ FieldsWith { f, bytes, chars: bytes.char_indices() }
+ }
+}
+
+impl<'a, F: FnMut(char) -> bool> Iterator for FieldsWith<'a, F> {
+ type Item = &'a [u8];
+
+ #[inline]
+ fn next(&mut self) -> Option<&'a [u8]> {
+ let (start, mut end);
+ loop {
+ match self.chars.next() {
+ None => return None,
+ Some((s, e, ch)) => {
+ if !(self.f)(ch) {
+ start = s;
+ end = e;
+ break;
+ }
+ }
+ }
+ }
+ while let Some((_, e, ch)) = self.chars.next() {
+ if (self.f)(ch) {
+ break;
+ }
+ end = e;
+ }
+ Some(&self.bytes[start..end])
+ }
+}
+
+/// An iterator over substrings in a byte string, split by a separator.
+///
+/// `'h` is the lifetime of the byte string being split (the haystack), while
+/// `'s` is the lifetime of the byte string doing the splitting.
+#[derive(Debug)]
+pub struct Split<'h, 's> {
+ finder: Find<'h, 's>,
+ /// The end position of the previous match of our splitter. The element
+ /// we yield corresponds to the substring starting at `last` up to the
+ /// beginning of the next match of the splitter.
+ last: usize,
+ /// Only set when iteration is complete. A corner case here is when a
+ /// splitter is matched at the end of the haystack. At that point, we still
+ /// need to yield an empty string following it.
+ done: bool,
+}
+
+impl<'h, 's> Split<'h, 's> {
+ fn new(haystack: &'h [u8], splitter: &'s [u8]) -> Split<'h, 's> {
+ let finder = haystack.find_iter(splitter);
+ Split { finder, last: 0, done: false }
+ }
+}
+
+impl<'h, 's> Iterator for Split<'h, 's> {
+ type Item = &'h [u8];
+
+ #[inline]
+ fn next(&mut self) -> Option<&'h [u8]> {
+ let haystack = self.finder.haystack;
+ match self.finder.next() {
+ Some(start) => {
+ let next = &haystack[self.last..start];
+ self.last = start + self.finder.needle.len();
+ Some(next)
+ }
+ None => {
+ if self.last >= haystack.len() {
+ if !self.done {
+ self.done = true;
+ Some(b"")
+ } else {
+ None
+ }
+ } else {
+ let s = &haystack[self.last..];
+ self.last = haystack.len();
+ self.done = true;
+ Some(s)
+ }
+ }
+ }
+ }
+}
+
+/// An iterator over substrings in a byte string, split by a separator, in
+/// reverse.
+///
+/// `'h` is the lifetime of the byte string being split (the haystack), while
+/// `'s` is the lifetime of the byte string doing the splitting.
+#[derive(Debug)]
+pub struct SplitReverse<'h, 's> {
+ finder: FindReverse<'h, 's>,
+ /// The end position of the previous match of our splitter. The element
+ /// we yield corresponds to the substring starting at `last` up to the
+ /// beginning of the next match of the splitter.
+ last: usize,
+ /// Only set when iteration is complete. A corner case here is when a
+ /// splitter is matched at the end of the haystack. At that point, we still
+ /// need to yield an empty string following it.
+ done: bool,
+}
+
+impl<'h, 's> SplitReverse<'h, 's> {
+ fn new(haystack: &'h [u8], splitter: &'s [u8]) -> SplitReverse<'h, 's> {
+ let finder = haystack.rfind_iter(splitter);
+ SplitReverse { finder, last: haystack.len(), done: false }
+ }
+}
+
+impl<'h, 's> Iterator for SplitReverse<'h, 's> {
+ type Item = &'h [u8];
+
+ #[inline]
+ fn next(&mut self) -> Option<&'h [u8]> {
+ let haystack = self.finder.haystack();
+ match self.finder.next() {
+ Some(start) => {
+ let nlen = self.finder.needle().len();
+ let next = &haystack[start + nlen..self.last];
+ self.last = start;
+ Some(next)
+ }
+ None => {
+ if self.last == 0 {
+ if !self.done {
+ self.done = true;
+ Some(b"")
+ } else {
+ None
+ }
+ } else {
+ let s = &haystack[..self.last];
+ self.last = 0;
+ self.done = true;
+ Some(s)
+ }
+ }
+ }
+ }
+}
+
+/// An iterator over at most `n` substrings in a byte string, split by a
+/// separator.
+///
+/// `'h` is the lifetime of the byte string being split (the haystack), while
+/// `'s` is the lifetime of the byte string doing the splitting.
+#[derive(Debug)]
+pub struct SplitN<'h, 's> {
+ split: Split<'h, 's>,
+ limit: usize,
+ count: usize,
+}
+
+impl<'h, 's> SplitN<'h, 's> {
+ fn new(
+ haystack: &'h [u8],
+ splitter: &'s [u8],
+ limit: usize,
+ ) -> SplitN<'h, 's> {
+ let split = haystack.split_str(splitter);
+ SplitN { split, limit, count: 0 }
+ }
+}
+
+impl<'h, 's> Iterator for SplitN<'h, 's> {
+ type Item = &'h [u8];
+
+ #[inline]
+ fn next(&mut self) -> Option<&'h [u8]> {
+ self.count += 1;
+ if self.count > self.limit || self.split.done {
+ None
+ } else if self.count == self.limit {
+ Some(&self.split.finder.haystack[self.split.last..])
+ } else {
+ self.split.next()
+ }
+ }
+}
+
+/// An iterator over at most `n` substrings in a byte string, split by a
+/// separator, in reverse.
+///
+/// `'h` is the lifetime of the byte string being split (the haystack), while
+/// `'s` is the lifetime of the byte string doing the splitting.
+#[derive(Debug)]
+pub struct SplitNReverse<'h, 's> {
+ split: SplitReverse<'h, 's>,
+ limit: usize,
+ count: usize,
+}
+
+impl<'h, 's> SplitNReverse<'h, 's> {
+ fn new(
+ haystack: &'h [u8],
+ splitter: &'s [u8],
+ limit: usize,
+ ) -> SplitNReverse<'h, 's> {
+ let split = haystack.rsplit_str(splitter);
+ SplitNReverse { split, limit, count: 0 }
+ }
+}
+
+impl<'h, 's> Iterator for SplitNReverse<'h, 's> {
+ type Item = &'h [u8];
+
+ #[inline]
+ fn next(&mut self) -> Option<&'h [u8]> {
+ self.count += 1;
+ if self.count > self.limit || self.split.done {
+ None
+ } else if self.count == self.limit {
+ Some(&self.split.finder.haystack()[..self.split.last])
+ } else {
+ self.split.next()
+ }
+ }
+}
+
+/// An iterator over all lines in a byte string, without their terminators.
+///
+/// For this iterator, the only line terminators recognized are `\r\n` and
+/// `\n`.
+///
+/// `'a` is the lifetime of the byte string being iterated over.
+#[derive(Clone, Debug)]
+pub struct Lines<'a> {
+ it: LinesWithTerminator<'a>,
+}
+
+impl<'a> Lines<'a> {
+ fn new(bytes: &'a [u8]) -> Lines<'a> {
+ Lines { it: LinesWithTerminator::new(bytes) }
+ }
+
+ /// Return a copy of the rest of the underlying bytes without affecting the
+ /// iterator itself.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// let s = b"\
+ /// foo
+ /// bar\r
+ /// baz";
+ /// let mut lines = s.lines();
+ /// assert_eq!(lines.next(), Some(B("foo")));
+ /// assert_eq!(lines.as_bytes(), B("bar\r\nbaz"));
+ /// ```
+ pub fn as_bytes(&self) -> &'a [u8] {
+ self.it.bytes
+ }
+}
+
+impl<'a> Iterator for Lines<'a> {
+ type Item = &'a [u8];
+
+ #[inline]
+ fn next(&mut self) -> Option<&'a [u8]> {
+ Some(trim_last_terminator(self.it.next()?))
+ }
+}
+
+impl<'a> DoubleEndedIterator for Lines<'a> {
+ #[inline]
+ fn next_back(&mut self) -> Option<Self::Item> {
+ Some(trim_last_terminator(self.it.next_back()?))
+ }
+}
+
+impl<'a> iter::FusedIterator for Lines<'a> {}
+
+/// An iterator over all lines in a byte string, including their terminators.
+///
+/// For this iterator, the only line terminator recognized is `\n`. (Since
+/// line terminators are included, this also handles `\r\n` line endings.)
+///
+/// Line terminators are only included if they are present in the original
+/// byte string. For example, the last line in a byte string may not end with
+/// a line terminator.
+///
+/// Concatenating all elements yielded by this iterator is guaranteed to yield
+/// the original byte string.
+///
+/// `'a` is the lifetime of the byte string being iterated over.
+#[derive(Clone, Debug)]
+pub struct LinesWithTerminator<'a> {
+ bytes: &'a [u8],
+}
+
+impl<'a> LinesWithTerminator<'a> {
+ fn new(bytes: &'a [u8]) -> LinesWithTerminator<'a> {
+ LinesWithTerminator { bytes }
+ }
+
+ /// Return a copy of the rest of the underlying bytes without affecting the
+ /// iterator itself.
+ ///
+ /// # Examples
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use bstr::{B, ByteSlice};
+ ///
+ /// let s = b"\
+ /// foo
+ /// bar\r
+ /// baz";
+ /// let mut lines = s.lines_with_terminator();
+ /// assert_eq!(lines.next(), Some(B("foo\n")));
+ /// assert_eq!(lines.as_bytes(), B("bar\r\nbaz"));
+ /// ```
+ pub fn as_bytes(&self) -> &'a [u8] {
+ self.bytes
+ }
+}
+
+impl<'a> Iterator for LinesWithTerminator<'a> {
+ type Item = &'a [u8];
+
+ #[inline]
+ fn next(&mut self) -> Option<&'a [u8]> {
+ match self.bytes.find_byte(b'\n') {
+ None if self.bytes.is_empty() => None,
+ None => {
+ let line = self.bytes;
+ self.bytes = b"";
+ Some(line)
+ }
+ Some(end) => {
+ let line = &self.bytes[..end + 1];
+ self.bytes = &self.bytes[end + 1..];
+ Some(line)
+ }
+ }
+ }
+}
+
+impl<'a> DoubleEndedIterator for LinesWithTerminator<'a> {
+ #[inline]
+ fn next_back(&mut self) -> Option<Self::Item> {
+ let end = self.bytes.len().checked_sub(1)?;
+ match self.bytes[..end].rfind_byte(b'\n') {
+ None => {
+ let line = self.bytes;
+ self.bytes = b"";
+ Some(line)
+ }
+ Some(end) => {
+ let line = &self.bytes[end + 1..];
+ self.bytes = &self.bytes[..end + 1];
+ Some(line)
+ }
+ }
+ }
+}
+
+impl<'a> iter::FusedIterator for LinesWithTerminator<'a> {}
+
/// Strips a single trailing line terminator, either `\n` or `\r\n`, from
/// `s`.
///
/// A `\r` is only removed when it directly precedes a removed `\n`; a lone
/// trailing `\r` is kept. If `s` does not end with `\n`, it is returned
/// unchanged.
fn trim_last_terminator(s: &[u8]) -> &[u8] {
    match s.strip_suffix(b"\n") {
        None => s,
        Some(line) => line.strip_suffix(b"\r").unwrap_or(line),
    }
}
+
#[cfg(all(test, feature = "std"))]
mod tests {
    use crate::{
        ext_slice::{ByteSlice, Lines, LinesWithTerminator, B},
        tests::LOSSY_TESTS,
    };

    /// Runs both lossy conversion routines over the shared fixture table and
    /// checks the same expectations against `String::from_utf8_lossy`.
    #[test]
    fn to_str_lossy() {
        for (case, &(expected, input)) in LOSSY_TESTS.iter().enumerate() {
            let cow = B(input).to_str_lossy();
            assert_eq!(
                expected.as_bytes(),
                cow.as_bytes(),
                "to_str_lossy(ith: {:?}, given: {:?})",
                case,
                input,
            );

            let mut buf = String::new();
            B(input).to_str_lossy_into(&mut buf);
            assert_eq!(
                expected.as_bytes(),
                buf.as_bytes(),
                "to_str_lossy_into",
            );

            let std_lossy = String::from_utf8_lossy(input);
            assert_eq!(expected.as_bytes(), std_lossy.as_bytes(), "std");
        }
    }

    /// Checks that both line iterators yield the expected lines, forwards
    /// and backwards.
    #[test]
    fn lines_iteration() {
        // The iterator expression is expanded twice, so every direction
        // starts from a fresh iterator.
        macro_rules! check {
            ($iter:expr, $expected:expr) => {
                let mut want: Vec<&[u8]> = Vec::from($expected);
                assert_eq!($iter.collect::<Vec<_>>(), want);
                want.reverse();
                assert_eq!($iter.rev().collect::<Vec<_>>(), want);
            };
        }

        check!(Lines::new(b""), []);
        check!(LinesWithTerminator::new(b""), []);

        check!(Lines::new(b"\n"), [B("")]);
        check!(Lines::new(b"\r\n"), [B("")]);
        check!(LinesWithTerminator::new(b"\n"), [B("\n")]);

        check!(Lines::new(b"a"), [B("a")]);
        check!(LinesWithTerminator::new(b"a"), [B("a")]);

        check!(Lines::new(b"abc"), [B("abc")]);
        check!(LinesWithTerminator::new(b"abc"), [B("abc")]);

        check!(Lines::new(b"abc\n"), [B("abc")]);
        check!(Lines::new(b"abc\r\n"), [B("abc")]);
        check!(LinesWithTerminator::new(b"abc\n"), [B("abc\n")]);

        check!(Lines::new(b"abc\n\n"), [B("abc"), B("")]);
        check!(LinesWithTerminator::new(b"abc\n\n"), [B("abc\n"), B("\n")]);

        check!(Lines::new(b"abc\n\ndef"), [B("abc"), B(""), B("def")]);
        check!(
            LinesWithTerminator::new(b"abc\n\ndef"),
            [B("abc\n"), B("\n"), B("def")]
        );

        check!(Lines::new(b"abc\n\ndef\n"), [B("abc"), B(""), B("def")]);
        check!(
            LinesWithTerminator::new(b"abc\n\ndef\n"),
            [B("abc\n"), B("\n"), B("def\n")]
        );

        check!(Lines::new(b"\na\nb\n"), [B(""), B("a"), B("b")]);
        check!(
            LinesWithTerminator::new(b"\na\nb\n"),
            [B("\n"), B("a\n"), B("b\n")]
        );

        check!(Lines::new(b"\n\n\n"), [B(""), B(""), B("")]);
        check!(
            LinesWithTerminator::new(b"\n\n\n"),
            [B("\n"), B("\n"), B("\n")]
        );
    }
}
+
1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 +95 +96 +97 +98 +99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 +247 +248 +249 +250 +251 +252 +253 +254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 +265 +266 +267 +268 +269 +270 +271 +272 +273 +274 +275 +276 +277 +278 +279 +280 +281 +282 +283 +284 +285 +286 +287 +288 +289 +290 +291 +292 +293 +294 +295 +296 +297 +298 +299 +300 +301 +302 +303 +304 +305 +306 +307 +308 +309 +310 +311 +312 +313 +314 +315 +316 +317 +318 +319 +320 +321 +322 +323 +324 +325 +326 +327 +328 +329 +330 +331 +332 +333 +334 +335 +336 +337 +338 +339 +340 +341 +342 +343 +344 +345 +346 +347 +348 +349 +350 +351 +352 +353 +354 +355 +356 +357 +358 +359 +360 +361 +362 +363 +364 +365 +366 +367 +368 +369 +370 +371 +372 +373 +374 +375 +376 +377 +378 +379 +380 +381 +382 +383 +384 +385 +386 +387 +388 +389 +390 +391 +392 +393 +394 +395 +396 +397 +398 +399 +400 +401 +402 +403 +404 +405 +406 +407 +408 +409 +410 +411 +412 +413 +414 +415 +416 +417 +418 +419 +420 +421 
+422 +423 +424 +425 +426 +427 +428 +429 +430 +431 +432 +433 +434 +435 +436 +437 +438 +439 +440 +441 +442 +443 +444 +445 +446 +447 +448 +449 +450 +451 +452 +453 +454 +455 +456 +457 +458 +459 +460 +461 +462 +463 +464 +465 +466 +467 +468 +469 +470 +471 +472 +473 +474 +475 +476 +477 +478 +479 +480 +481 +482 +483 +484 +485 +486 +487 +488 +489 +490 +491 +492 +493 +494 +495 +496 +497 +498 +499 +500 +501 +502 +503 +504 +505 +506 +507 +508 +509 +510 +511 +512 +513 +514 +515 +516 +517 +518 +519 +520 +521 +522 +523 +524 +525 +526 +527 +528 +529 +530 +531 +532 +533 +534 +535 +536 +537 +538 +539 +540 +541 +542 +543 +544 +545 +546 +547 +548 +549 +550 +551 +552 +553 +554 +555 +556 +557 +558 +559 +560 +561 +562 +563 +564 +565 +566 +567 +568 +569 +570 +571 +572 +573 +574 +575 +576 +577 +578 +579 +580 +581 +582 +583 +584 +585 +586 +587 +588 +589 +590 +591 +592 +593 +594 +595 +596 +597 +598 +599 +600 +601 +602 +603 +604 +605 +606 +607 +608 +609 +610 +611 +612 +613 +614 +615 +616 +617 +618 +619 +620 +621 +622 +623 +624 +625 +626 +627 +628 +629 +630 +631 +632 +633 +634 +635 +636 +637 +638 +639 +640 +641 +642 +643 +644 +645 +646 +647 +648 +649 +650 +651 +652 +653 +654 +655 +656 +657 +658 +659 +660 +661 +662 +663 +664 +665 +666 +667 +668 +669 +670 +671 +672 +673 +674 +675 +676 +677 +678 +679 +680 +681 +682 +683 +684 +685 +686 +687 +688 +689 +690 +691 +692 +693 +694 +695 +696 +697 +698 +699 +700 +701 +702 +703 +704 +705 +706 +707 +708 +709 +710 +711 +712 +713 +714 +715 +716 +717 +718 +719 +720 +721 +722 +723 +724 +725 +726 +727 +728 +729 +730 +731 +732 +733 +734 +735 +736 +737 +738 +739 +740 +741 +742 +743 +744 +745 +746 +747 +748 +749 +750 +751 +752 +753 +754 +755 +756 +757 +758 +759 +760 +761 +762 +763 +764 +765 +766 +767 +768 +769 +770 +771 +772 +773 +774 +775 +776 +777 +778 +779 +780 +781 +782 +783 +784 +785 +786 +787 +788 +789 +790 +791 +792 +793 +794 +795 +796 +797 +798 +799 +800 +801 +802 +803 +804 +805 +806 +807 +808 +809 +810 +811 +812 +813 +814 +815 +816 +817 +818 +819 +820 +821 
+822 +823 +824 +825 +826 +827 +828 +829 +830 +831 +832 +833 +834 +835 +836 +837 +838 +839 +840 +841 +842 +843 +844 +845 +846 +847 +848 +849 +850 +851 +852 +853 +854 +855 +856 +857 +858 +859 +860 +861 +862 +863 +864 +865 +866 +867 +868 +869 +870 +871 +872 +873 +874 +875 +876 +877 +878 +879 +880 +881 +882 +883 +884 +885 +886 +887 +888 +889 +890 +891 +892 +893 +894 +895 +896 +897 +898 +899 +900 +901 +902 +903 +904 +905 +906 +907 +908 +909 +910 +911 +912 +913 +914 +915 +916 +917 +918 +919 +920 +921 +922 +923 +924 +925 +926 +927 +928 +929 +930 +931 +932 +933 +934 +935 +936 +937 +938 +939 +940 +941 +942 +943 +944 +945 +946 +947 +948 +949 +950 +951 +952 +953 +954 +955 +956 +957 +958 +959 +960 +961 +962 +963 +964 +965 +966 +967 +968 +969 +970 +971 +972 +973 +974 +975 +976 +977 +978 +979 +980 +981 +982 +983 +984 +985 +986 +987 +988 +989 +990 +991 +992 +993 +994 +995 +996 +997 +998 +999 +1000 +1001 +1002 +1003 +1004 +1005 +1006 +1007 +1008 +1009 +1010 +1011 +1012 +1013 +1014 +1015 +1016 +1017 +1018 +1019 +1020 +1021 +1022 +1023 +1024 +1025 +1026 +1027 +1028 +1029 +1030 +1031 +1032 +1033 +1034 +1035 +1036 +1037 +1038 +1039 +1040 +1041 +1042 +1043 +1044 +1045 +1046 +1047 +1048 +1049 +1050 +1051 +1052 +1053 +1054 +1055 +1056 +1057 +1058 +1059 +1060 +1061 +1062 +1063 +1064 +1065 +1066 +1067 +1068 +1069 +1070 +1071 +1072 +1073 +1074 +1075 +1076 +1077 +1078 +1079 +1080 +1081 +1082 +1083 +1084 +1085 +1086 +1087 +1088 +1089 +1090 +1091 +1092 +1093 +1094 +1095 +1096 +1097 +1098 +1099 +1100 +1101 +1102 +1103 +1104 +1105 +1106 +1107 +1108 +1109 +1110 +1111 +1112 +1113 +1114 +1115 +1116 +1117 +1118 +1119 +1120 +1121 +1122 +1123 +1124 +1125 +1126 +1127 +1128 +1129 +1130 +1131 +1132 +1133 +1134 +1135 +1136 +1137 +1138 +1139 +1140 +1141 +1142 +1143 +1144 +1145 +1146 +1147 +1148 +1149 +1150 +1151 +1152 +1153 +1154 +1155 +1156 +1157 +1158 +1159 +1160 +1161 +1162 +1163 +1164 +1165 +1166 +1167 +1168 +1169 +1170 +1171 +1172 +1173 +1174 +1175 +1176 +1177 +1178 +1179 +1180 +1181 +1182 +1183 +1184 
+1185 +1186 +1187 +1188 +1189 +1190 +1191 +1192 +1193 +1194 +1195 +1196 +1197 +1198 +1199 +1200 +1201 +1202 +1203 +1204 +1205 +1206 +1207 +1208 +1209 +1210 +1211 +1212 +1213 +1214 +1215 +1216 +1217 +1218 +1219 +1220 +1221 +1222 +1223 +1224 +1225 +1226 +1227 +1228 +
macro_rules! impl_partial_eq {
+ ($lhs:ty, $rhs:ty) => {
+ impl<'a, 'b> PartialEq<$rhs> for $lhs {
+ #[inline]
+ fn eq(&self, other: &$rhs) -> bool {
+ let other: &[u8] = other.as_ref();
+ PartialEq::eq(self.as_bytes(), other)
+ }
+ }
+
+ impl<'a, 'b> PartialEq<$lhs> for $rhs {
+ #[inline]
+ fn eq(&self, other: &$lhs) -> bool {
+ let this: &[u8] = self.as_ref();
+ PartialEq::eq(this, other.as_bytes())
+ }
+ }
+ };
+}
+
+#[cfg(feature = "alloc")]
+macro_rules! impl_partial_eq_cow {
+    // Same idea as `impl_partial_eq`, but one side is a `Cow`, so the
+    // non-`self` operand is dereferenced twice before asking for its bytes.
+    ($lhs:ty, $rhs:ty) => {
+        impl<'a, 'b> PartialEq<$rhs> for $lhs {
+            #[inline]
+            fn eq(&self, other: &$rhs) -> bool {
+                let other_bytes: &[u8] = (&**other).as_ref();
+                self.as_bytes() == other_bytes
+            }
+        }
+
+        impl<'a, 'b> PartialEq<$lhs> for $rhs {
+            #[inline]
+            fn eq(&self, other: &$lhs) -> bool {
+                // Note the operand order: the bytes taken here belong to
+                // `other`; equality is symmetric so the result is the same.
+                let other_bytes: &[u8] = (&**other).as_ref();
+                other_bytes == self.as_bytes()
+            }
+        }
+    };
+}
+
+macro_rules! impl_partial_ord {
+    // Emits a symmetric pair of `PartialOrd` impls between `$lhs` and `$rhs`
+    // that order the two values by their underlying byte slices.
+    ($lhs:ty, $rhs:ty) => {
+        impl<'a, 'b> PartialOrd<$rhs> for $lhs {
+            #[inline]
+            fn partial_cmp(&self, other: &$rhs) -> Option<Ordering> {
+                let rhs_bytes: &[u8] = other.as_ref();
+                <[u8] as PartialOrd>::partial_cmp(self.as_bytes(), rhs_bytes)
+            }
+        }
+
+        impl<'a, 'b> PartialOrd<$lhs> for $rhs {
+            #[inline]
+            fn partial_cmp(&self, other: &$lhs) -> Option<Ordering> {
+                let lhs_bytes: &[u8] = self.as_ref();
+                <[u8] as PartialOrd>::partial_cmp(lhs_bytes, other.as_bytes())
+            }
+        }
+    };
+}
+
+#[cfg(feature = "alloc")]
+mod bstring {
+    // Standard trait implementations for the owned `BString` type (plus a
+    // few `BStr`-related impls on `Vec<u8>` and `String`).
+
+    use core::{
+        cmp::Ordering, convert::TryFrom, fmt, iter::FromIterator, ops,
+    };
+
+    use alloc::{
+        borrow::{Borrow, BorrowMut, Cow, ToOwned},
+        string::String,
+        vec,
+        vec::Vec,
+    };
+
+    use crate::{
+        bstr::BStr, bstring::BString, ext_slice::ByteSlice, ext_vec::ByteVec,
+    };
+
+    /// Appends every item of `pieces`, in order, to a fresh buffer and wraps
+    /// the result in a `BString`. Shared by the `FromIterator` impls below.
+    #[inline]
+    fn concat_pieces<I>(pieces: I) -> BString
+    where
+        I: IntoIterator,
+        I::Item: AsRef<[u8]>,
+    {
+        let mut buf: Vec<u8> = vec![];
+        for piece in pieces {
+            buf.push_str(piece);
+        }
+        BString::from(buf)
+    }
+
+    // Formatting is delegated to the borrowed `BStr` view.
+    impl fmt::Display for BString {
+        #[inline]
+        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+            <BStr as fmt::Display>::fmt(self.as_bstr(), f)
+        }
+    }
+
+    impl fmt::Debug for BString {
+        #[inline]
+        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+            <BStr as fmt::Debug>::fmt(self.as_bstr(), f)
+        }
+    }
+
+    // A `BString` dereferences to its backing `Vec<u8>`.
+    impl ops::Deref for BString {
+        type Target = Vec<u8>;
+
+        #[inline]
+        fn deref(&self) -> &Self::Target {
+            self.as_vec()
+        }
+    }
+
+    impl ops::DerefMut for BString {
+        #[inline]
+        fn deref_mut(&mut self) -> &mut Self::Target {
+            self.as_vec_mut()
+        }
+    }
+
+    // Cheap reference conversions.
+    impl AsRef<[u8]> for BString {
+        #[inline]
+        fn as_ref(&self) -> &[u8] {
+            self.as_bytes()
+        }
+    }
+
+    impl AsRef<BStr> for BString {
+        #[inline]
+        fn as_ref(&self) -> &BStr {
+            self.as_bstr()
+        }
+    }
+
+    impl AsMut<[u8]> for BString {
+        #[inline]
+        fn as_mut(&mut self) -> &mut [u8] {
+            self.as_bytes_mut()
+        }
+    }
+
+    impl AsMut<BStr> for BString {
+        #[inline]
+        fn as_mut(&mut self) -> &mut BStr {
+            self.as_mut_bstr()
+        }
+    }
+
+    // Borrowing as raw bytes or as a `BStr`.
+    impl Borrow<[u8]> for BString {
+        #[inline]
+        fn borrow(&self) -> &[u8] {
+            self.as_bytes()
+        }
+    }
+
+    impl Borrow<BStr> for BString {
+        #[inline]
+        fn borrow(&self) -> &BStr {
+            self.as_bstr()
+        }
+    }
+
+    impl Borrow<BStr> for Vec<u8> {
+        #[inline]
+        fn borrow(&self) -> &BStr {
+            let bytes: &[u8] = self.as_slice();
+            bytes.as_bstr()
+        }
+    }
+
+    impl Borrow<BStr> for String {
+        #[inline]
+        fn borrow(&self) -> &BStr {
+            let bytes: &[u8] = self.as_bytes();
+            bytes.as_bstr()
+        }
+    }
+
+    impl BorrowMut<[u8]> for BString {
+        #[inline]
+        fn borrow_mut(&mut self) -> &mut [u8] {
+            self.as_bytes_mut()
+        }
+    }
+
+    impl BorrowMut<BStr> for BString {
+        #[inline]
+        fn borrow_mut(&mut self) -> &mut BStr {
+            self.as_mut_bstr()
+        }
+    }
+
+    impl BorrowMut<BStr> for Vec<u8> {
+        #[inline]
+        fn borrow_mut(&mut self) -> &mut BStr {
+            BStr::new_mut(self.as_mut_slice())
+        }
+    }
+
+    // `BStr` is the borrowed form of `BString`.
+    impl ToOwned for BStr {
+        type Owned = BString;
+
+        #[inline]
+        fn to_owned(&self) -> Self::Owned {
+            BString::from(self)
+        }
+    }
+
+    // The default value is the empty byte string.
+    impl Default for BString {
+        fn default() -> Self {
+            Self::from(vec![])
+        }
+    }
+
+    // Owned/borrowed conversions. Conversions from borrowed data copy the
+    // bytes; conversions from owned buffers reuse them.
+    impl<'a, const N: usize> From<&'a [u8; N]> for BString {
+        #[inline]
+        fn from(s: &'a [u8; N]) -> Self {
+            Self::from(&s[..])
+        }
+    }
+
+    impl<const N: usize> From<[u8; N]> for BString {
+        #[inline]
+        fn from(s: [u8; N]) -> Self {
+            Self::from(&s[..])
+        }
+    }
+
+    impl<'a> From<&'a [u8]> for BString {
+        #[inline]
+        fn from(s: &'a [u8]) -> Self {
+            Self::from(s.to_vec())
+        }
+    }
+
+    impl From<Vec<u8>> for BString {
+        #[inline]
+        fn from(s: Vec<u8>) -> Self {
+            BString::new(s)
+        }
+    }
+
+    impl From<BString> for Vec<u8> {
+        #[inline]
+        fn from(s: BString) -> Self {
+            s.into_vec()
+        }
+    }
+
+    impl<'a> From<&'a str> for BString {
+        #[inline]
+        fn from(s: &'a str) -> Self {
+            Self::from(s.as_bytes().to_vec())
+        }
+    }
+
+    impl From<String> for BString {
+        #[inline]
+        fn from(s: String) -> Self {
+            Self::from(s.into_bytes())
+        }
+    }
+
+    impl<'a> From<&'a BStr> for BString {
+        #[inline]
+        fn from(s: &'a BStr) -> Self {
+            Self::from(s.bytes.to_vec())
+        }
+    }
+
+    impl<'a> From<BString> for Cow<'a, BStr> {
+        #[inline]
+        fn from(s: BString) -> Self {
+            Cow::Owned(s)
+        }
+    }
+
+    // Fallible conversions back to UTF-8 string types.
+    impl TryFrom<BString> for String {
+        type Error = crate::FromUtf8Error;
+
+        #[inline]
+        fn try_from(s: BString) -> Result<Self, Self::Error> {
+            let bytes: Vec<u8> = s.into_vec();
+            bytes.into_string()
+        }
+    }
+
+    impl<'a> TryFrom<&'a BString> for &'a str {
+        type Error = crate::Utf8Error;
+
+        #[inline]
+        fn try_from(s: &'a BString) -> Result<Self, Self::Error> {
+            s.as_bytes().to_str()
+        }
+    }
+
+    // Collecting iterators into a `BString`.
+    impl FromIterator<char> for BString {
+        #[inline]
+        fn from_iter<T: IntoIterator<Item = char>>(iter: T) -> Self {
+            let text: String = iter.into_iter().collect();
+            Self::from(text)
+        }
+    }
+
+    impl FromIterator<u8> for BString {
+        #[inline]
+        fn from_iter<T: IntoIterator<Item = u8>>(iter: T) -> Self {
+            let bytes: Vec<u8> = iter.into_iter().collect();
+            Self::from(bytes)
+        }
+    }
+
+    impl<'a> FromIterator<&'a str> for BString {
+        #[inline]
+        fn from_iter<T: IntoIterator<Item = &'a str>>(iter: T) -> Self {
+            concat_pieces(iter)
+        }
+    }
+
+    impl<'a> FromIterator<&'a [u8]> for BString {
+        #[inline]
+        fn from_iter<T: IntoIterator<Item = &'a [u8]>>(iter: T) -> Self {
+            concat_pieces(iter)
+        }
+    }
+
+    impl<'a> FromIterator<&'a BStr> for BString {
+        #[inline]
+        fn from_iter<T: IntoIterator<Item = &'a BStr>>(iter: T) -> Self {
+            concat_pieces(iter)
+        }
+    }
+
+    impl FromIterator<BString> for BString {
+        #[inline]
+        fn from_iter<T: IntoIterator<Item = BString>>(iter: T) -> Self {
+            concat_pieces(iter)
+        }
+    }
+
+    // Equality is plain byte-wise equality.
+    impl Eq for BString {}
+
+    impl PartialEq for BString {
+        #[inline]
+        fn eq(&self, other: &BString) -> bool {
+            self[..] == other[..]
+        }
+    }
+
+    impl_partial_eq!(BString, Vec<u8>);
+    impl_partial_eq!(BString, [u8]);
+    impl_partial_eq!(BString, &'a [u8]);
+    impl_partial_eq!(BString, String);
+    impl_partial_eq!(BString, str);
+    impl_partial_eq!(BString, &'a str);
+    impl_partial_eq!(BString, BStr);
+    impl_partial_eq!(BString, &'a BStr);
+
+    // Ordering is the lexicographic ordering of the underlying bytes.
+    impl PartialOrd for BString {
+        #[inline]
+        fn partial_cmp(&self, other: &BString) -> Option<Ordering> {
+            Some(Ord::cmp(self, other))
+        }
+    }
+
+    impl Ord for BString {
+        #[inline]
+        fn cmp(&self, other: &BString) -> Ordering {
+            Ord::cmp(self.as_bytes(), other.as_bytes())
+        }
+    }
+
+    impl_partial_ord!(BString, Vec<u8>);
+    impl_partial_ord!(BString, [u8]);
+    impl_partial_ord!(BString, &'a [u8]);
+    impl_partial_ord!(BString, String);
+    impl_partial_ord!(BString, str);
+    impl_partial_ord!(BString, &'a str);
+    impl_partial_ord!(BString, BStr);
+    impl_partial_ord!(BString, &'a BStr);
+}
+
+mod bstr {
+    // Standard trait implementations for the borrowed `BStr` type.
+
+    use core::{
+        borrow::{Borrow, BorrowMut},
+        cmp::Ordering,
+        convert::TryFrom,
+        fmt, ops,
+    };
+
+    #[cfg(feature = "alloc")]
+    use alloc::{borrow::Cow, boxed::Box, string::String, vec::Vec};
+
+    use crate::{bstr::BStr, ext_slice::ByteSlice};
+
+    // Lossy display: valid UTF-8 is written verbatim and every invalid
+    // sequence becomes one U+FFFD replacement character. A requested width
+    // is honored by padding with the fill character; the length is measured
+    // in characters (as yielded by `chars()`), not bytes.
+    impl fmt::Display for BStr {
+        #[inline]
+        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+            /// Write the given bstr (lossily) to the given formatter.
+            fn write_bstr(
+                f: &mut fmt::Formatter<'_>,
+                bstr: &BStr,
+            ) -> Result<(), fmt::Error> {
+                for chunk in bstr.utf8_chunks() {
+                    f.write_str(chunk.valid())?;
+                    if !chunk.invalid().is_empty() {
+                        f.write_str("\u{FFFD}")?;
+                    }
+                }
+                Ok(())
+            }
+
+            /// Write 'num' fill characters to the given formatter.
+            fn write_pads(
+                f: &mut fmt::Formatter<'_>,
+                num: usize,
+            ) -> fmt::Result {
+                let fill = f.fill();
+                for _ in 0..num {
+                    f.write_fmt(format_args!("{}", fill))?;
+                }
+                Ok(())
+            }
+
+            // Without a width there is nothing to pad, whatever the
+            // alignment flag says.
+            let width = match f.width() {
+                Some(width) => width,
+                None => return write_bstr(f, self),
+            };
+            let nchars = self.chars().count();
+            let remaining_pads = width.saturating_sub(nchars);
+            // A width without an explicit alignment pads on the right, the
+            // same default the standard library uses for strings.
+            let align = f.align().unwrap_or(fmt::Alignment::Left);
+            let (before, after) = match align {
+                fmt::Alignment::Left => (0, remaining_pads),
+                fmt::Alignment::Right => (remaining_pads, 0),
+                fmt::Alignment::Center => {
+                    // An odd amount of padding puts the extra fill
+                    // character after the text.
+                    let half = remaining_pads / 2;
+                    (half, remaining_pads - half)
+                }
+            };
+            write_pads(f, before)?;
+            write_bstr(f, self)?;
+            write_pads(f, after)
+        }
+    }
+
+    // Debug output is a double-quoted, escaped rendering. Bytes that are
+    // not valid UTF-8 are written as upper-case `\xNN` escapes, ASCII
+    // control characters as lower-case `\xnn` escapes.
+    impl fmt::Debug for BStr {
+        #[inline]
+        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+            write!(f, "\"")?;
+            for (s, e, ch) in self.char_indices() {
+                match ch {
+                    '\0' => write!(f, "\\0")?,
+                    '\u{FFFD}' => {
+                        let bytes = self[s..e].as_bytes();
+                        if bytes == b"\xEF\xBF\xBD" {
+                            // A genuine, correctly encoded U+FFFD in the
+                            // input: show it like any other character.
+                            write!(f, "{}", ch.escape_debug())?;
+                        } else {
+                            // The replacement character stands in for
+                            // invalid bytes: show the raw bytes instead.
+                            for &b in bytes {
+                                write!(f, r"\x{:02X}", b)?;
+                            }
+                        }
+                    }
+                    // ASCII control characters except \0, \n, \r, \t
+                    '\x01'..='\x08'
+                    | '\x0b'
+                    | '\x0c'
+                    | '\x0e'..='\x1f'
+                    | '\x7f' => {
+                        write!(f, "\\x{:02x}", ch as u32)?;
+                    }
+                    // Everything else, including \n, \r and \t.
+                    _ => {
+                        write!(f, "{}", ch.escape_debug())?;
+                    }
+                }
+            }
+            write!(f, "\"")?;
+            Ok(())
+        }
+    }
+
+    // A `BStr` dereferences to its underlying byte slice.
+    impl ops::Deref for BStr {
+        type Target = [u8];
+
+        #[inline]
+        fn deref(&self) -> &[u8] {
+            &self.bytes
+        }
+    }
+
+    impl ops::DerefMut for BStr {
+        #[inline]
+        fn deref_mut(&mut self) -> &mut [u8] {
+            &mut self.bytes
+        }
+    }
+
+    // Indexing with a `usize` yields a byte; indexing with any range type
+    // yields a `BStr` sub-slice.
+    impl ops::Index<usize> for BStr {
+        type Output = u8;
+
+        #[inline]
+        fn index(&self, idx: usize) -> &u8 {
+            &self.as_bytes()[idx]
+        }
+    }
+
+    impl ops::Index<ops::RangeFull> for BStr {
+        type Output = BStr;
+
+        #[inline]
+        fn index(&self, _: ops::RangeFull) -> &BStr {
+            self
+        }
+    }
+
+    impl ops::Index<ops::Range<usize>> for BStr {
+        type Output = BStr;
+
+        #[inline]
+        fn index(&self, r: ops::Range<usize>) -> &BStr {
+            BStr::new(&self.as_bytes()[r.start..r.end])
+        }
+    }
+
+    impl ops::Index<ops::RangeInclusive<usize>> for BStr {
+        type Output = BStr;
+
+        #[inline]
+        fn index(&self, r: ops::RangeInclusive<usize>) -> &BStr {
+            BStr::new(&self.as_bytes()[*r.start()..=*r.end()])
+        }
+    }
+
+    impl ops::Index<ops::RangeFrom<usize>> for BStr {
+        type Output = BStr;
+
+        #[inline]
+        fn index(&self, r: ops::RangeFrom<usize>) -> &BStr {
+            BStr::new(&self.as_bytes()[r.start..])
+        }
+    }
+
+    impl ops::Index<ops::RangeTo<usize>> for BStr {
+        type Output = BStr;
+
+        #[inline]
+        fn index(&self, r: ops::RangeTo<usize>) -> &BStr {
+            BStr::new(&self.as_bytes()[..r.end])
+        }
+    }
+
+    impl ops::Index<ops::RangeToInclusive<usize>> for BStr {
+        type Output = BStr;
+
+        #[inline]
+        fn index(&self, r: ops::RangeToInclusive<usize>) -> &BStr {
+            BStr::new(&self.as_bytes()[..=r.end])
+        }
+    }
+
+    impl ops::IndexMut<usize> for BStr {
+        #[inline]
+        fn index_mut(&mut self, idx: usize) -> &mut u8 {
+            &mut self.bytes[idx]
+        }
+    }
+
+    impl ops::IndexMut<ops::RangeFull> for BStr {
+        #[inline]
+        fn index_mut(&mut self, _: ops::RangeFull) -> &mut BStr {
+            self
+        }
+    }
+
+    impl ops::IndexMut<ops::Range<usize>> for BStr {
+        #[inline]
+        fn index_mut(&mut self, r: ops::Range<usize>) -> &mut BStr {
+            BStr::from_bytes_mut(&mut self.bytes[r.start..r.end])
+        }
+    }
+
+    impl ops::IndexMut<ops::RangeInclusive<usize>> for BStr {
+        #[inline]
+        fn index_mut(&mut self, r: ops::RangeInclusive<usize>) -> &mut BStr {
+            BStr::from_bytes_mut(&mut self.bytes[*r.start()..=*r.end()])
+        }
+    }
+
+    impl ops::IndexMut<ops::RangeFrom<usize>> for BStr {
+        #[inline]
+        fn index_mut(&mut self, r: ops::RangeFrom<usize>) -> &mut BStr {
+            BStr::from_bytes_mut(&mut self.bytes[r.start..])
+        }
+    }
+
+    impl ops::IndexMut<ops::RangeTo<usize>> for BStr {
+        #[inline]
+        fn index_mut(&mut self, r: ops::RangeTo<usize>) -> &mut BStr {
+            BStr::from_bytes_mut(&mut self.bytes[..r.end])
+        }
+    }
+
+    impl ops::IndexMut<ops::RangeToInclusive<usize>> for BStr {
+        #[inline]
+        fn index_mut(&mut self, r: ops::RangeToInclusive<usize>) -> &mut BStr {
+            BStr::from_bytes_mut(&mut self.bytes[..=r.end])
+        }
+    }
+
+    // Cheap reference conversions between `BStr`, `[u8]` and `str`.
+    impl AsRef<[u8]> for BStr {
+        #[inline]
+        fn as_ref(&self) -> &[u8] {
+            self.as_bytes()
+        }
+    }
+
+    impl AsRef<BStr> for BStr {
+        #[inline]
+        fn as_ref(&self) -> &BStr {
+            self
+        }
+    }
+
+    impl AsRef<BStr> for [u8] {
+        #[inline]
+        fn as_ref(&self) -> &BStr {
+            BStr::new(self)
+        }
+    }
+
+    impl AsRef<BStr> for str {
+        #[inline]
+        fn as_ref(&self) -> &BStr {
+            BStr::new(self)
+        }
+    }
+
+    impl AsMut<[u8]> for BStr {
+        #[inline]
+        fn as_mut(&mut self) -> &mut [u8] {
+            &mut self.bytes
+        }
+    }
+
+    impl AsMut<BStr> for [u8] {
+        #[inline]
+        fn as_mut(&mut self) -> &mut BStr {
+            BStr::new_mut(self)
+        }
+    }
+
+    impl Borrow<BStr> for [u8] {
+        #[inline]
+        fn borrow(&self) -> &BStr {
+            self.as_bstr()
+        }
+    }
+
+    impl Borrow<BStr> for str {
+        #[inline]
+        fn borrow(&self) -> &BStr {
+            self.as_bytes().as_bstr()
+        }
+    }
+
+    impl Borrow<[u8]> for BStr {
+        #[inline]
+        fn borrow(&self) -> &[u8] {
+            self.as_bytes()
+        }
+    }
+
+    impl BorrowMut<BStr> for [u8] {
+        #[inline]
+        fn borrow_mut(&mut self) -> &mut BStr {
+            BStr::new_mut(self)
+        }
+    }
+
+    impl BorrowMut<[u8]> for BStr {
+        #[inline]
+        fn borrow_mut(&mut self) -> &mut [u8] {
+            self.as_bytes_mut()
+        }
+    }
+
+    // The default borrowed byte string is empty.
+    impl<'a> Default for &'a BStr {
+        fn default() -> &'a BStr {
+            BStr::from_bytes(b"")
+        }
+    }
+
+    impl<'a> Default for &'a mut BStr {
+        fn default() -> &'a mut BStr {
+            BStr::from_bytes_mut(&mut [])
+        }
+    }
+
+    // Borrowed conversions; none of these copy the bytes.
+    impl<'a, const N: usize> From<&'a [u8; N]> for &'a BStr {
+        #[inline]
+        fn from(s: &'a [u8; N]) -> &'a BStr {
+            BStr::from_bytes(s)
+        }
+    }
+
+    impl<'a> From<&'a [u8]> for &'a BStr {
+        #[inline]
+        fn from(s: &'a [u8]) -> &'a BStr {
+            BStr::from_bytes(s)
+        }
+    }
+
+    impl<'a> From<&'a BStr> for &'a [u8] {
+        #[inline]
+        fn from(s: &'a BStr) -> &'a [u8] {
+            BStr::as_bytes(s)
+        }
+    }
+
+    impl<'a> From<&'a str> for &'a BStr {
+        #[inline]
+        fn from(s: &'a str) -> &'a BStr {
+            BStr::from_bytes(s.as_bytes())
+        }
+    }
+
+    #[cfg(feature = "alloc")]
+    impl<'a> From<&'a BStr> for Cow<'a, BStr> {
+        #[inline]
+        fn from(s: &'a BStr) -> Cow<'a, BStr> {
+            Cow::Borrowed(s)
+        }
+    }
+
+    #[cfg(feature = "alloc")]
+    impl From<Box<[u8]>> for Box<BStr> {
+        #[inline]
+        fn from(s: Box<[u8]>) -> Box<BStr> {
+            BStr::from_boxed_bytes(s)
+        }
+    }
+
+    #[cfg(feature = "alloc")]
+    impl From<Box<BStr>> for Box<[u8]> {
+        #[inline]
+        fn from(s: Box<BStr>) -> Box<[u8]> {
+            BStr::into_boxed_bytes(s)
+        }
+    }
+
+    // Fallible conversions to UTF-8 string types.
+    impl<'a> TryFrom<&'a BStr> for &'a str {
+        type Error = crate::Utf8Error;
+
+        #[inline]
+        fn try_from(s: &'a BStr) -> Result<&'a str, crate::Utf8Error> {
+            s.as_bytes().to_str()
+        }
+    }
+
+    #[cfg(feature = "alloc")]
+    impl<'a> TryFrom<&'a BStr> for String {
+        type Error = crate::Utf8Error;
+
+        #[inline]
+        fn try_from(s: &'a BStr) -> Result<String, crate::Utf8Error> {
+            Ok(s.as_bytes().to_str()?.into())
+        }
+    }
+
+    #[cfg(feature = "alloc")]
+    impl Clone for Box<BStr> {
+        #[inline]
+        fn clone(&self) -> Self {
+            BStr::from_boxed_bytes(self.as_bytes().into())
+        }
+    }
+
+    // Equality is plain byte-wise equality.
+    impl Eq for BStr {}
+
+    impl PartialEq<BStr> for BStr {
+        #[inline]
+        fn eq(&self, other: &BStr) -> bool {
+            self.as_bytes() == other.as_bytes()
+        }
+    }
+
+    impl_partial_eq!(BStr, [u8]);
+    impl_partial_eq!(BStr, &'a [u8]);
+    impl_partial_eq!(BStr, str);
+    impl_partial_eq!(BStr, &'a str);
+
+    #[cfg(feature = "alloc")]
+    impl_partial_eq!(BStr, Vec<u8>);
+    #[cfg(feature = "alloc")]
+    impl_partial_eq!(&'a BStr, Vec<u8>);
+    #[cfg(feature = "alloc")]
+    impl_partial_eq!(BStr, String);
+    #[cfg(feature = "alloc")]
+    impl_partial_eq!(&'a BStr, String);
+    #[cfg(feature = "alloc")]
+    impl_partial_eq_cow!(&'a BStr, Cow<'a, BStr>);
+    #[cfg(feature = "alloc")]
+    impl_partial_eq_cow!(&'a BStr, Cow<'a, str>);
+    #[cfg(feature = "alloc")]
+    impl_partial_eq_cow!(&'a BStr, Cow<'a, [u8]>);
+
+    // Ordering is the lexicographic ordering of the underlying bytes. `Ord`
+    // is the canonical implementation and `PartialOrd` delegates to it, so
+    // no `unwrap()` is needed.
+    impl PartialOrd for BStr {
+        #[inline]
+        fn partial_cmp(&self, other: &BStr) -> Option<Ordering> {
+            Some(Ord::cmp(self, other))
+        }
+    }
+
+    impl Ord for BStr {
+        #[inline]
+        fn cmp(&self, other: &BStr) -> Ordering {
+            Ord::cmp(self.as_bytes(), other.as_bytes())
+        }
+    }
+
+    impl_partial_ord!(BStr, [u8]);
+    impl_partial_ord!(BStr, &'a [u8]);
+    impl_partial_ord!(BStr, str);
+    impl_partial_ord!(BStr, &'a str);
+
+    #[cfg(feature = "alloc")]
+    impl_partial_ord!(BStr, Vec<u8>);
+    #[cfg(feature = "alloc")]
+    impl_partial_ord!(&'a BStr, Vec<u8>);
+    #[cfg(feature = "alloc")]
+    impl_partial_ord!(BStr, String);
+    #[cfg(feature = "alloc")]
+    impl_partial_ord!(&'a BStr, String);
+}
+
+#[cfg(feature = "serde")]
+mod bstr_serde {
+    // Serde support for the borrowed `BStr` type.
+
+    use core::fmt;
+
+    use serde::{
+        de::Error, de::Visitor, Deserialize, Deserializer, Serialize,
+        Serializer,
+    };
+
+    use crate::bstr::BStr;
+
+    // A `BStr` is serialized as a byte sequence.
+    impl Serialize for BStr {
+        #[inline]
+        fn serialize<S: Serializer>(
+            &self,
+            serializer: S,
+        ) -> Result<S::Ok, S::Error> {
+            let bytes: &[u8] = self.as_bytes();
+            serializer.serialize_bytes(bytes)
+        }
+    }
+
+    // Only zero-copy deserialization is supported: the visitor accepts
+    // bytes or text borrowed from the input and nothing else.
+    impl<'a, 'de: 'a> Deserialize<'de> for &'a BStr {
+        #[inline]
+        fn deserialize<D: Deserializer<'de>>(
+            deserializer: D,
+        ) -> Result<Self, D::Error> {
+            struct BStrVisitor;
+
+            impl<'de> Visitor<'de> for BStrVisitor {
+                type Value = &'de BStr;
+
+                fn expecting(
+                    &self,
+                    f: &mut fmt::Formatter<'_>,
+                ) -> fmt::Result {
+                    f.write_str("a borrowed byte string")
+                }
+
+                #[inline]
+                fn visit_borrowed_bytes<E>(
+                    self,
+                    value: &'de [u8],
+                ) -> Result<Self::Value, E>
+                where
+                    E: Error,
+                {
+                    Ok(BStr::new(value))
+                }
+
+                #[inline]
+                fn visit_borrowed_str<E>(
+                    self,
+                    value: &'de str,
+                ) -> Result<Self::Value, E>
+                where
+                    E: Error,
+                {
+                    Ok(BStr::new(value))
+                }
+            }
+
+            deserializer.deserialize_bytes(BStrVisitor)
+        }
+    }
+}
+
+#[cfg(all(feature = "serde", feature = "alloc"))]
+mod bstring_serde {
+    // Serde support for the owned byte string types, `BString` and
+    // `Box<BStr>`.
+
+    use core::{cmp, fmt};
+
+    use alloc::{boxed::Box, string::String, vec::Vec};
+
+    use serde::{
+        de::Error, de::SeqAccess, de::Visitor, Deserialize, Deserializer,
+        Serialize, Serializer,
+    };
+
+    use crate::{bstr::BStr, bstring::BString};
+
+    /// Drains a sequence of bytes from `seq` into a vector. The initial
+    /// capacity comes from the size hint but is capped at 256 elements.
+    #[inline]
+    fn collect_seq<'de, V: SeqAccess<'de>>(
+        mut seq: V,
+    ) -> Result<Vec<u8>, V::Error> {
+        let capacity = cmp::min(seq.size_hint().unwrap_or(0), 256);
+        let mut bytes: Vec<u8> = Vec::with_capacity(capacity);
+        while let Some(byte) = seq.next_element()? {
+            bytes.push(byte);
+        }
+        Ok(bytes)
+    }
+
+    /// Converts an owned byte vector into a boxed byte string.
+    #[inline]
+    fn into_boxed_bstr(bytes: Vec<u8>) -> Box<BStr> {
+        BStr::from_boxed_bytes(bytes.into_boxed_slice())
+    }
+
+    // A `BString` is serialized as a byte sequence.
+    impl Serialize for BString {
+        #[inline]
+        fn serialize<S: Serializer>(
+            &self,
+            serializer: S,
+        ) -> Result<S::Ok, S::Error> {
+            let bytes: &[u8] = self.as_bytes();
+            serializer.serialize_bytes(bytes)
+        }
+    }
+
+    // Accepts a sequence of bytes, borrowed or owned bytes, or borrowed or
+    // owned text.
+    impl<'de> Deserialize<'de> for BString {
+        #[inline]
+        fn deserialize<D: Deserializer<'de>>(
+            deserializer: D,
+        ) -> Result<Self, D::Error> {
+            struct BStringVisitor;
+
+            impl<'de> Visitor<'de> for BStringVisitor {
+                type Value = BString;
+
+                fn expecting(
+                    &self,
+                    f: &mut fmt::Formatter<'_>,
+                ) -> fmt::Result {
+                    f.write_str("a byte string")
+                }
+
+                #[inline]
+                fn visit_seq<V: SeqAccess<'de>>(
+                    self,
+                    visitor: V,
+                ) -> Result<Self::Value, V::Error> {
+                    collect_seq(visitor).map(BString::from)
+                }
+
+                #[inline]
+                fn visit_bytes<E: Error>(
+                    self,
+                    value: &[u8],
+                ) -> Result<Self::Value, E> {
+                    Ok(BString::from(value))
+                }
+
+                #[inline]
+                fn visit_byte_buf<E: Error>(
+                    self,
+                    value: Vec<u8>,
+                ) -> Result<Self::Value, E> {
+                    Ok(BString::from(value))
+                }
+
+                #[inline]
+                fn visit_str<E: Error>(
+                    self,
+                    value: &str,
+                ) -> Result<Self::Value, E> {
+                    Ok(BString::from(value))
+                }
+
+                #[inline]
+                fn visit_string<E: Error>(
+                    self,
+                    value: String,
+                ) -> Result<Self::Value, E> {
+                    Ok(BString::from(value))
+                }
+            }
+
+            deserializer.deserialize_byte_buf(BStringVisitor)
+        }
+    }
+
+    // Same set of accepted inputs as `BString`, but the result is boxed.
+    impl<'de> Deserialize<'de> for Box<BStr> {
+        #[inline]
+        fn deserialize<D: Deserializer<'de>>(
+            deserializer: D,
+        ) -> Result<Self, D::Error> {
+            struct BoxedBStrVisitor;
+
+            impl<'de> Visitor<'de> for BoxedBStrVisitor {
+                type Value = Box<BStr>;
+
+                fn expecting(
+                    &self,
+                    f: &mut fmt::Formatter<'_>,
+                ) -> fmt::Result {
+                    f.write_str("a boxed byte string")
+                }
+
+                #[inline]
+                fn visit_seq<V: SeqAccess<'de>>(
+                    self,
+                    visitor: V,
+                ) -> Result<Self::Value, V::Error> {
+                    collect_seq(visitor).map(into_boxed_bstr)
+                }
+
+                #[inline]
+                fn visit_bytes<E: Error>(
+                    self,
+                    value: &[u8],
+                ) -> Result<Self::Value, E> {
+                    Ok(into_boxed_bstr(value.to_vec()))
+                }
+
+                #[inline]
+                fn visit_byte_buf<E: Error>(
+                    self,
+                    value: Vec<u8>,
+                ) -> Result<Self::Value, E> {
+                    Ok(into_boxed_bstr(value))
+                }
+
+                #[inline]
+                fn visit_str<E: Error>(
+                    self,
+                    value: &str,
+                ) -> Result<Self::Value, E> {
+                    Ok(into_boxed_bstr(value.as_bytes().to_vec()))
+                }
+
+                #[inline]
+                fn visit_string<E: Error>(
+                    self,
+                    value: String,
+                ) -> Result<Self::Value, E> {
+                    Ok(into_boxed_bstr(value.into_bytes()))
+                }
+            }
+
+            deserializer.deserialize_byte_buf(BoxedBStrVisitor)
+        }
+    }
+}
+
+#[cfg(all(test, feature = "std"))]
+mod display {
+    // Tests for the `Display` impl of `BStr`: lossy output and padding.
+    //
+    // The invalid input b"\xf0\x28\x8c\xbc" renders as four characters:
+    // three replacement characters and one '('.
+
+    #[cfg(not(miri))]
+    use crate::bstring::BString;
+    use crate::ByteSlice;
+
+    #[test]
+    fn clean() {
+        assert_eq!(&format!("{}", &b"abc".as_bstr()), "abc");
+        assert_eq!(&format!("{}", &b"\xf0\x28\x8c\xbc".as_bstr()), "�(��");
+    }
+
+    // With a width larger than the text, the output is padded up to exactly
+    // that many characters. For centering, an odd amount of padding puts the
+    // extra fill character on the right.
+    #[test]
+    fn width_bigger_than_bstr() {
+        assert_eq!(&format!("{:<7}!", &b"abc".as_bstr()), "abc    !");
+        assert_eq!(&format!("{:>7}!", &b"abc".as_bstr()), "    abc!");
+        assert_eq!(&format!("{:^7}!", &b"abc".as_bstr()), "  abc  !");
+        assert_eq!(&format!("{:^6}!", &b"abc".as_bstr()), " abc  !");
+        assert_eq!(&format!("{:-<7}!", &b"abc".as_bstr()), "abc----!");
+        assert_eq!(&format!("{:->7}!", &b"abc".as_bstr()), "----abc!");
+        assert_eq!(&format!("{:-^7}!", &b"abc".as_bstr()), "--abc--!");
+        assert_eq!(&format!("{:-^6}!", &b"abc".as_bstr()), "-abc--!");
+
+        assert_eq!(
+            &format!("{:<7}!", &b"\xf0\x28\x8c\xbc".as_bstr()),
+            "�(��   !"
+        );
+        assert_eq!(
+            &format!("{:>7}!", &b"\xf0\x28\x8c\xbc".as_bstr()),
+            "   �(��!"
+        );
+        assert_eq!(
+            &format!("{:^7}!", &b"\xf0\x28\x8c\xbc".as_bstr()),
+            " �(��  !"
+        );
+        assert_eq!(
+            &format!("{:^6}!", &b"\xf0\x28\x8c\xbc".as_bstr()),
+            " �(�� !"
+        );
+
+        assert_eq!(
+            &format!("{:-<7}!", &b"\xf0\x28\x8c\xbc".as_bstr()),
+            "�(��---!"
+        );
+        assert_eq!(
+            &format!("{:->7}!", &b"\xf0\x28\x8c\xbc".as_bstr()),
+            "---�(��!"
+        );
+        assert_eq!(
+            &format!("{:-^7}!", &b"\xf0\x28\x8c\xbc".as_bstr()),
+            "-�(��--!"
+        );
+        assert_eq!(
+            &format!("{:-^6}!", &b"\xf0\x28\x8c\xbc".as_bstr()),
+            "-�(��-!"
+        );
+    }
+
+    // A width smaller than the text never truncates and never pads.
+    #[test]
+    fn width_lesser_than_bstr() {
+        assert_eq!(&format!("{:<2}!", &b"abc".as_bstr()), "abc!");
+        assert_eq!(&format!("{:>2}!", &b"abc".as_bstr()), "abc!");
+        assert_eq!(&format!("{:^2}!", &b"abc".as_bstr()), "abc!");
+        assert_eq!(&format!("{:-<2}!", &b"abc".as_bstr()), "abc!");
+        assert_eq!(&format!("{:->2}!", &b"abc".as_bstr()), "abc!");
+        assert_eq!(&format!("{:-^2}!", &b"abc".as_bstr()), "abc!");
+
+        assert_eq!(
+            &format!("{:<3}!", &b"\xf0\x28\x8c\xbc".as_bstr()),
+            "�(��!"
+        );
+        assert_eq!(
+            &format!("{:>3}!", &b"\xf0\x28\x8c\xbc".as_bstr()),
+            "�(��!"
+        );
+        assert_eq!(
+            &format!("{:^3}!", &b"\xf0\x28\x8c\xbc".as_bstr()),
+            "�(��!"
+        );
+        assert_eq!(
+            &format!("{:^2}!", &b"\xf0\x28\x8c\xbc".as_bstr()),
+            "�(��!"
+        );
+
+        assert_eq!(
+            &format!("{:-<3}!", &b"\xf0\x28\x8c\xbc".as_bstr()),
+            "�(��!"
+        );
+        assert_eq!(
+            &format!("{:->3}!", &b"\xf0\x28\x8c\xbc".as_bstr()),
+            "�(��!"
+        );
+        assert_eq!(
+            &format!("{:-^3}!", &b"\xf0\x28\x8c\xbc".as_bstr()),
+            "�(��!"
+        );
+        assert_eq!(
+            &format!("{:-^2}!", &b"\xf0\x28\x8c\xbc".as_bstr()),
+            "�(��!"
+        );
+    }
+
+    // Property: formatting with a width equal to the character count never
+    // produces fewer characters than the input has.
+    #[cfg(not(miri))]
+    quickcheck::quickcheck! {
+        fn total_length(bstr: BString) -> bool {
+            let size = bstr.chars().count();
+            format!("{:<1$}", bstr.as_bstr(), size).chars().count() >= size
+        }
+    }
+}
+
+#[cfg(all(test, feature = "alloc"))]
+mod bstring_arbitrary {
+    // Lets quickcheck generate and shrink `BString` values by reusing the
+    // `Arbitrary` implementation of `Vec<u8>`.
+
+    use crate::bstring::BString;
+
+    use quickcheck::{Arbitrary, Gen};
+
+    impl Arbitrary for BString {
+        fn arbitrary(g: &mut Gen) -> BString {
+            let bytes = Vec::<u8>::arbitrary(g);
+            BString::from(bytes)
+        }
+
+        fn shrink(&self) -> Box<dyn Iterator<Item = BString>> {
+            let smaller = self.as_vec().shrink();
+            Box::new(smaller.map(BString::from))
+        }
+    }
+}
+
+#[test]
+#[cfg(feature = "std")]
+fn test_debug() {
+    use crate::{ByteSlice, B};
+
+    // NUL bytes and an ASCII control byte are rendered as escapes.
+    let rendered =
+        format!("{:?}", b"\0\0\0 ftypisom\0\0\x02\0isomiso2avc1mp".as_bstr());
+    assert_eq!(r#""\0\0\0 ftypisom\0\0\x02\0isomiso2avc1mp""#, rendered);
+
+    // Tests that if the underlying bytes contain the UTF-8 encoding of the
+    // replacement codepoint, then we emit the codepoint just like other
+    // non-printable Unicode characters.
+    //
+    // Before fixing #72, the output here would be:
+    //   \\xFF\\xEF\\xBF\\xBD\\xFF
+    let rendered = format!("{:?}", b"\xFF\xEF\xBF\xBD\xFF".as_bstr());
+    assert_eq!(
+        b"\"\\xFF\xEF\xBF\xBD\\xFF\"".as_bstr(),
+        B(&rendered).as_bstr(),
+    );
+}
+
+// Regression test: comparing a `Cow` with its borrowed form must compile
+// and must report different contents as unequal.
+// See: https://github.com/BurntSushi/bstr/issues/82
+#[test]
+#[cfg(feature = "std")]
+fn test_cows_regression() {
+    use std::borrow::Cow;
+
+    use crate::ByteSlice;
+
+    // `Cow<BStr>` against `&BStr`.
+    let cow_bstr = Cow::from(b"hello bstr".as_bstr());
+    let plain_bstr = b"goodbye bstr".as_bstr();
+    assert_ne!(cow_bstr, plain_bstr);
+
+    // `Cow<str>` against `&str`.
+    let cow_str = Cow::from("hello str");
+    let plain_str = "goodbye str";
+    assert_ne!(cow_str, plain_str);
+}
+
1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 +95 +96 +97 +98 +99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 +247 +248 +249 +250 +251 +252 +253 +254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 +265 +266 +267 +268 +269 +270 +271 +272 +273 +274 +275 +276 +277 +278 +279 +280 +281 +282 +283 +284 +285 +286 +287 +288 +289 +290 +291 +292 +293 +294 +295 +296 +297 +298 +299 +300 +301 +302 +303 +304 +305 +306 +307 +308 +309 +310 +311 +312 +313 +314 +315 +316 +317 +318 +319 +320 +321 +322 +323 +324 +325 +326 +327 +328 +329 +330 +331 +332 +333 +334 +335 +336 +337 +338 +339 +340 +341 +342 +343 +344 +345 +346 +347 +348 +349 +350 +351 +352 +353 +354 +355 +356 +357 +358 +359 +360 +361 +362 +363 +364 +365 +366 +367 +368 +369 +370 +371 +372 +373 +374 +375 +376 +377 +378 +379 +380 +381 +382 +383 +384 +385 +386 +387 +388 +389 +390 +391 +392 +393 +394 +395 +396 +397 +398 +399 +400 +401 +402 +403 +404 +405 +406 +407 +408 +409 +410 +411 +412 +413 +414 +415 +416 +417 +418 +419 +420 +421 
+422 +423 +424 +425 +426 +427 +428 +429 +430 +431 +432 +433 +434 +435 +436 +437 +438 +439 +440 +441 +442 +443 +444 +445 +446 +447 +448 +449 +450 +451 +452 +453 +454 +455 +456 +457 +458 +459 +460 +461 +462 +463 +464 +465 +466 +467 +468 +469 +470 +
/*!
+A byte string library.
+
+Byte strings are just like standard Unicode strings with one very important
+difference: byte strings are only *conventionally* UTF-8 while Rust's standard
+Unicode strings are *guaranteed* to be valid UTF-8. The primary motivation for
+byte strings is for handling arbitrary bytes that are mostly UTF-8.
+
+# Overview
+
+This crate provides two important traits that provide string oriented methods
+on `&[u8]` and `Vec<u8>` types:
+
+* [`ByteSlice`](trait.ByteSlice.html) extends the `[u8]` type with additional
+ string oriented methods.
+* [`ByteVec`](trait.ByteVec.html) extends the `Vec<u8>` type with additional
+ string oriented methods.
+
+Additionally, this crate provides two concrete byte string types that deref to
+`[u8]` and `Vec<u8>`. These are useful for storing byte string types, and come
+with convenient `std::fmt::Debug` implementations:
+
+* [`BStr`](struct.BStr.html) is a byte string slice, analogous to `str`.
+* [`BString`](struct.BString.html) is an owned growable byte string buffer,
+ analogous to `String`.
+
+Additionally, the free function [`B`](fn.B.html) serves as a convenient short
+hand for writing byte string literals.
+
+# Quick examples
+
+Byte strings build on the existing APIs for `Vec<u8>` and `&[u8]`, with
+additional string oriented methods. Operations such as iterating over
+graphemes, searching for substrings, replacing substrings, trimming and case
+conversion are examples of things not provided on the standard library `&[u8]`
+APIs but are provided by this crate. For example, this code iterates over all
+occurrences of a substring:
+
+```
+use bstr::ByteSlice;
+
+let s = b"foo bar foo foo quux foo";
+
+let mut matches = vec![];
+for start in s.find_iter("foo") {
+ matches.push(start);
+}
+assert_eq!(matches, [0, 8, 12, 21]);
+```
+
+Here's another example showing how to do a search and replace (and also showing
+use of the `B` function):
+
+```
+# #[cfg(feature = "alloc")] {
+use bstr::{B, ByteSlice};
+
+let old = B("foo ☃☃☃ foo foo quux foo");
+let new = old.replace("foo", "hello");
+assert_eq!(new, B("hello ☃☃☃ hello hello quux hello"));
+# }
+```
+
+And here's an example that shows case conversion, even in the presence of
+invalid UTF-8:
+
+```
+# #[cfg(all(feature = "alloc", feature = "unicode"))] {
+use bstr::{ByteSlice, ByteVec};
+
+let mut lower = Vec::from("hello β");
+lower[0] = b'\xFF';
+// lowercase β is uppercased to Β
+assert_eq!(lower.to_uppercase(), b"\xFFELLO \xCE\x92");
+# }
+```
+
+# Convenient debug representation
+
+When working with byte strings, it is often useful to be able to print them
+as if they were byte strings and not sequences of integers. While this crate
+cannot affect the `std::fmt::Debug` implementations for `[u8]` and `Vec<u8>`,
+this crate does provide the `BStr` and `BString` types which have convenient
+`std::fmt::Debug` implementations.
+
+For example, this
+
+```
+use bstr::ByteSlice;
+
+let mut bytes = Vec::from("hello β");
+bytes[0] = b'\xFF';
+
+println!("{:?}", bytes.as_bstr());
+```
+
+will output `"\xFFello β"`.
+
+This example works because the
+[`ByteSlice::as_bstr`](trait.ByteSlice.html#method.as_bstr)
+method converts any `&[u8]` to a `&BStr`.
+
+# When should I use byte strings?
+
+This library reflects my belief that UTF-8 by convention is a better trade
+off in some circumstances than guaranteed UTF-8.
+
+The first time this idea hit me was in the implementation of Rust's regex
+engine. In particular, very little of the internal implementation cares at all
+about searching valid UTF-8 encoded strings. Indeed, internally, the
+implementation converts `&str` from the API to `&[u8]` fairly quickly and
+just deals with raw bytes. UTF-8 match boundaries are then guaranteed by the
+finite state machine itself rather than any specific string type. This makes it
+possible to not only run regexes on `&str` values, but also on `&[u8]` values.
+
+Why would you ever want to run a regex on a `&[u8]` though? Well, `&[u8]` is
+the fundamental way in which one reads data from all sorts of streams, via the
+standard library's [`Read`](https://doc.rust-lang.org/std/io/trait.Read.html)
+trait. In particular, there is no platform independent way to determine whether
+what you're reading from is some binary file or a human readable text file.
+Therefore, if you're writing a program to search files, you probably need to
+deal with `&[u8]` directly unless you're okay with first converting it to a
+`&str` and dropping any bytes that aren't valid UTF-8. (Or otherwise determine
+the encoding---which is often impractical---and perform a transcoding step.)
+Often, the simplest and most robust way to approach this is to simply treat the
+contents of a file as if it were mostly valid UTF-8 and pass through invalid
+UTF-8 untouched. This may not be the most correct approach though!
+
+One case in particular exacerbates these issues, and that's memory mapping
+a file. When you memory map a file, that file may be gigabytes big, but all
+you get is a `&[u8]`. Converting that to a `&str` all in one go is generally
+not a good idea because of the costs associated with doing so, and also
+because it generally causes one to do two passes over the data instead of
+one, which is quite undesirable. It is of course usually possible to do it in an
+incremental way by only parsing chunks at a time, but this is often complex to
+do or impractical. For example, many regex engines only accept one contiguous
+sequence of bytes at a time with no way to perform incremental matching.
+
+# `bstr` in public APIs
+
+This library is past version `1` and is expected to remain at version `1` for
+the foreseeable future. Therefore, it is encouraged to put types from `bstr`
+(like `BStr` and `BString`) in your public API if that makes sense for your
+crate.
+
+With that said, in general, it should be possible to avoid putting anything
+in this crate into your public APIs. Namely, you should never need to use the
+`ByteSlice` or `ByteVec` traits as bounds on public APIs, since their only
+purpose is to extend the methods on the concrete types `[u8]` and `Vec<u8>`,
+respectively. Similarly, it should not be necessary to put either the `BStr` or
+`BString` types into public APIs. If you want to use them internally, then they
+can be converted to/from `[u8]`/`Vec<u8>` as needed. The conversions are free.
+
+So while it shouldn't ever be 100% necessary to make `bstr` a public
+dependency, there may be cases where it is convenient to do so. This is an
+explicitly supported use case of `bstr`, and as such, major version releases
+should be exceptionally rare.
+
+
+# Differences with standard strings
+
+The primary difference between `[u8]` and `str` is that the former is
+conventionally UTF-8 while the latter is guaranteed to be UTF-8. The phrase
+"conventionally UTF-8" means that a `[u8]` may contain bytes that do not form
+a valid UTF-8 sequence, but operations defined on the type in this crate are
+generally most useful on valid UTF-8 sequences. For example, iterating over
+Unicode codepoints or grapheme clusters is an operation that is only defined
+on valid UTF-8. Therefore, when invalid UTF-8 is encountered, the Unicode
+replacement codepoint is substituted. Thus, a byte string that is not UTF-8 at
+all is of limited utility when using this crate.
+
+However, not all operations on byte strings are specifically Unicode aware. For
+example, substring search has no specific Unicode semantics ascribed to it. It
+works just as well for byte strings that are completely valid UTF-8 as for byte
+strings that contain no valid UTF-8 at all. Similarly for replacements and
+various other operations that do not need any Unicode specific tailoring.
+
+Aside from the difference in how UTF-8 is handled, the APIs between `[u8]` and
+`str` (and `Vec<u8>` and `String`) are intentionally very similar, including
+maintaining the same behavior for corner cases in things like substring
+splitting. There are, however, some differences:
+
+* Substring search is not done with `matches`, but instead, `find_iter`.
+ In general, this crate does not define any generic
+ [`Pattern`](https://doc.rust-lang.org/std/str/pattern/trait.Pattern.html)
+ infrastructure, and instead prefers adding new methods for different
+ argument types. For example, `matches` can search by a `char` or a `&str`,
+ whereas `find_iter` can only search by a byte string. `find_char` can be
+ used for searching by a `char`.
+* Since `SliceConcatExt` in the standard library is unstable, it is not
+ possible to reuse that to implement `join` and `concat` methods. Instead,
+ [`join`](fn.join.html) and [`concat`](fn.concat.html) are provided as free
+ functions that perform a similar task.
+* This library bundles in a few more Unicode operations, such as grapheme,
+ word and sentence iterators. More operations, such as normalization and
+ case folding, may be provided in the future.
+* Some `String`/`str` APIs will panic if a particular index was not on a valid
+ UTF-8 code unit sequence boundary. Conversely, no such checking is performed
+ in this crate, as is consistent with treating byte strings as a sequence of
+ bytes. This means callers are responsible for maintaining a UTF-8 invariant
+ if that's important.
+* Some routines provided by this crate, such as `starts_with_str`, have a
+ `_str` suffix to differentiate them from similar routines already defined
+ on the `[u8]` type. The difference is that `starts_with` requires its
+ parameter to be a `&[u8]`, whereas `starts_with_str` permits its parameter
+ to be anything that implements `AsRef<[u8]>`, which is more flexible. This
+ means you can write `bytes.starts_with_str("☃")` instead of
+ `bytes.starts_with("☃".as_bytes())`.
+
+Otherwise, you should find most of the APIs between this crate and the standard
+library string APIs to be very similar, if not identical.
+
+# Handling of invalid UTF-8
+
+Since byte strings are only *conventionally* UTF-8, there is no guarantee
+that byte strings contain valid UTF-8. Indeed, it is perfectly legal for a
+byte string to contain arbitrary bytes. However, since this library defines
+a *string* type, it provides many operations specified by Unicode. These
+operations are typically only defined over codepoints, and thus have no real
+meaning on bytes that are invalid UTF-8 because they do not map to a particular
+codepoint.
+
+For this reason, whenever operations defined only on codepoints are used, this
+library will automatically convert invalid UTF-8 to the Unicode replacement
+codepoint, `U+FFFD`, which looks like this: `�`. For example, an
+[iterator over codepoints](struct.Chars.html) will yield a Unicode
+replacement codepoint whenever it comes across bytes that are not valid UTF-8:
+
+```
+use bstr::ByteSlice;
+
+let bs = b"a\xFF\xFFz";
+let chars: Vec<char> = bs.chars().collect();
+assert_eq!(vec!['a', '\u{FFFD}', '\u{FFFD}', 'z'], chars);
+```
+
+There are a few ways in which invalid bytes can be substituted with a Unicode
+replacement codepoint. One way, not used by this crate, is to replace every
+individual invalid byte with a single replacement codepoint. In contrast, the
+approach this crate uses is called the "substitution of maximal subparts," as
+specified by the Unicode Standard (Chapter 3, Section 9). (This approach is
+also used by [W3C's Encoding Standard](https://www.w3.org/TR/encoding/).) In
+this strategy, a replacement codepoint is inserted whenever a byte is found
+that cannot possibly lead to a valid UTF-8 code unit sequence. If there were
+previous bytes that represented a *prefix* of a well-formed UTF-8 code unit
+sequence, then all of those bytes (up to 3) are substituted with a single
+replacement codepoint. For example:
+
+```
+use bstr::ByteSlice;
+
+let bs = b"a\xF0\x9F\x87z";
+let chars: Vec<char> = bs.chars().collect();
+// The bytes \xF0\x9F\x87 could lead to a valid UTF-8 sequence, but 3 of them
+// on their own are invalid. Only one replacement codepoint is substituted,
+// which demonstrates the "substitution of maximal subparts" strategy.
+assert_eq!(vec!['a', '\u{FFFD}', 'z'], chars);
+```
+
+If you do need to access the raw bytes for some reason in an iterator like
+`Chars`, then you should use the iterator's "indices" variant, which gives
+the byte offsets containing the invalid UTF-8 bytes that were substituted with
+the replacement codepoint. For example:
+
+```
+use bstr::{B, ByteSlice};
+
+let bs = b"a\xE2\x98z";
+let chars: Vec<(usize, usize, char)> = bs.char_indices().collect();
+// Even though the replacement codepoint is encoded as 3 bytes itself, the
+// byte range given here is only two bytes, corresponding to the original
+// raw bytes.
+assert_eq!(vec![(0, 1, 'a'), (1, 3, '\u{FFFD}'), (3, 4, 'z')], chars);
+
+// Thus, getting the original raw bytes is as simple as slicing the original
+// byte string:
+let chars: Vec<&[u8]> = bs.char_indices().map(|(s, e, _)| &bs[s..e]).collect();
+assert_eq!(vec![B("a"), B(b"\xE2\x98"), B("z")], chars);
+```
+
+# File paths and OS strings
+
+One of the premier features of Rust's standard library is how it handles file
+paths. In particular, it makes it very hard to write incorrect code while
+simultaneously providing a correct cross platform abstraction for manipulating
+file paths. The key challenge that one faces with file paths across platforms
+is derived from the following observations:
+
+* On most Unix-like systems, file paths are an arbitrary sequence of bytes.
+* On Windows, file paths are an arbitrary sequence of 16-bit integers.
+
+(In both cases, certain sequences aren't allowed. For example a `NUL` byte is
+not allowed in either case. But we can ignore this for the purposes of this
+section.)
+
+Byte strings, like the ones provided in this crate, line up really well with
+file paths on Unix like systems, which are themselves just arbitrary sequences
+of bytes. It turns out that if you treat them as "mostly UTF-8," then things
+work out pretty well. On the contrary, byte strings _don't_ really work
+that well on Windows because it's not possible to correctly roundtrip file
+paths between 16-bit integers and something that looks like UTF-8 _without_
+explicitly defining an encoding to do this for you, which is anathema to byte
+strings, which are just bytes.
+
+Rust's standard library elegantly solves this problem by specifying an
+internal encoding for file paths that's only used on Windows called
+[WTF-8](https://simonsapin.github.io/wtf-8/). Its key properties are that they
+permit losslessly roundtripping file paths on Windows by extending UTF-8 to
+support an encoding of surrogate codepoints, while simultaneously supporting
+zero-cost conversion from Rust's Unicode strings to file paths. (Since UTF-8 is
+a proper subset of WTF-8.)
+
+The fundamental point at which the above strategy fails is when you want to
+treat file paths as things that look like strings in a zero cost way. In most
+cases, this is actually the wrong thing to do, but some cases call for it,
+for example, glob or regex matching on file paths. This is because WTF-8 is
+treated as an internal implementation detail, and there is no way to access
+those bytes via a public API. Therefore, such consumers are limited in what
+they can do:
+
+1. One could re-implement WTF-8 and re-encode file paths on Windows to WTF-8
+ by accessing their underlying 16-bit integer representation. Unfortunately,
+ this isn't zero cost (it introduces a second WTF-8 decoding step) and it's
+ not clear this is a good thing to do, since WTF-8 should ideally remain an
+ internal implementation detail. This is roughly the approach taken by the
+ [`os_str_bytes`](https://crates.io/crates/os_str_bytes) crate.
+2. One could instead declare that they will not handle paths on Windows that
+ are not valid UTF-16, and return an error when one is encountered.
+3. Like (2), but instead of returning an error, lossily decode the file path
+ on Windows that isn't valid UTF-16 into UTF-16 by replacing invalid bytes
+ with the Unicode replacement codepoint.
+
+While this library may provide facilities for (1) in the future, currently,
+this library only provides facilities for (2) and (3). In particular, a suite
+of conversion functions are provided that permit converting between byte
+strings, OS strings and file paths. For owned byte strings, they are:
+
+* [`ByteVec::from_os_string`](trait.ByteVec.html#method.from_os_string)
+* [`ByteVec::from_os_str_lossy`](trait.ByteVec.html#method.from_os_str_lossy)
+* [`ByteVec::from_path_buf`](trait.ByteVec.html#method.from_path_buf)
+* [`ByteVec::from_path_lossy`](trait.ByteVec.html#method.from_path_lossy)
+* [`ByteVec::into_os_string`](trait.ByteVec.html#method.into_os_string)
+* [`ByteVec::into_os_string_lossy`](trait.ByteVec.html#method.into_os_string_lossy)
+* [`ByteVec::into_path_buf`](trait.ByteVec.html#method.into_path_buf)
+* [`ByteVec::into_path_buf_lossy`](trait.ByteVec.html#method.into_path_buf_lossy)
+
+For byte string slices, they are:
+
+* [`ByteSlice::from_os_str`](trait.ByteSlice.html#method.from_os_str)
+* [`ByteSlice::from_path`](trait.ByteSlice.html#method.from_path)
+* [`ByteSlice::to_os_str`](trait.ByteSlice.html#method.to_os_str)
+* [`ByteSlice::to_os_str_lossy`](trait.ByteSlice.html#method.to_os_str_lossy)
+* [`ByteSlice::to_path`](trait.ByteSlice.html#method.to_path)
+* [`ByteSlice::to_path_lossy`](trait.ByteSlice.html#method.to_path_lossy)
+
+On Unix, all of these conversions are rigorously zero cost, which gives one
+a way to ergonomically deal with raw file paths exactly as they are using
+normal string-related functions. On Windows, these conversion routines perform
+a UTF-8 check and either return an error or lossily decode the file path
+into valid UTF-8, depending on which function you use. This means that you
+cannot roundtrip all file paths on Windows correctly using these conversion
+routines. However, this may be an acceptable downside since such file paths
+are exceptionally rare. Moreover, roundtripping isn't always necessary, for
+example, if all you're doing is filtering based on file paths.
+
+The reason why using byte strings for this is potentially superior to the
+standard library's approach is that a lot of Rust code is already lossily
+converting file paths to Rust's Unicode strings, which are required to be valid
+UTF-8, and thus contain latent bugs on Unix where paths with invalid UTF-8 are
+not terribly uncommon. If you instead use byte strings, then you're guaranteed
+to write correct code for Unix, at the cost of getting a corner case wrong on
+Windows.
+
+# Cargo features
+
+This crate comes with a few features that control standard library, serde
+and Unicode support.
+
+* `std` - **Enabled** by default. This provides APIs that require the standard
+ library, such as `Vec<u8>` and `PathBuf`. Enabling this feature also enables
+ the `alloc` feature and any other relevant `std` features for dependencies.
+* `alloc` - **Enabled** by default. This provides APIs that require allocations
+ via the `alloc` crate, such as `Vec<u8>`.
+* `unicode` - **Enabled** by default. This provides APIs that require sizable
+ Unicode data compiled into the binary. This includes, but is not limited to,
+ grapheme/word/sentence segmenters. When this is disabled, basic support such
+ as UTF-8 decoding is still included. Note that currently, enabling this
+ feature also requires enabling the `std` feature. It is expected that this
+ limitation will be lifted at some point.
+* `serde` - Enables implementations of serde traits for `BStr`, and also
+ `BString` when `alloc` is enabled.
+*/
+
+#![cfg_attr(not(any(feature = "std", test)), no_std)]
+#![cfg_attr(docsrs, feature(doc_auto_cfg))]
+
+#[cfg(feature = "alloc")]
+extern crate alloc;
+
+pub use crate::bstr::BStr;
+#[cfg(feature = "alloc")]
+pub use crate::bstring::BString;
+pub use crate::escape_bytes::EscapeBytes;
+#[cfg(feature = "unicode")]
+pub use crate::ext_slice::Fields;
+pub use crate::ext_slice::{
+ ByteSlice, Bytes, FieldsWith, Find, FindReverse, Finder, FinderReverse,
+ Lines, LinesWithTerminator, Split, SplitN, SplitNReverse, SplitReverse, B,
+};
+#[cfg(feature = "alloc")]
+pub use crate::ext_vec::{concat, join, ByteVec, DrainBytes, FromUtf8Error};
+#[cfg(feature = "unicode")]
+pub use crate::unicode::{
+ GraphemeIndices, Graphemes, SentenceIndices, Sentences, WordIndices,
+ Words, WordsWithBreakIndices, WordsWithBreaks,
+};
+pub use crate::utf8::{
+ decode as decode_utf8, decode_last as decode_last_utf8, CharIndices,
+ Chars, Utf8Chunk, Utf8Chunks, Utf8Error,
+};
+
+mod ascii;
+mod bstr;
+#[cfg(feature = "alloc")]
+mod bstring;
+mod byteset;
+mod escape_bytes;
+mod ext_slice;
+#[cfg(feature = "alloc")]
+mod ext_vec;
+mod impls;
+#[cfg(feature = "std")]
+pub mod io;
+#[cfg(all(test, feature = "std"))]
+mod tests;
+#[cfg(feature = "unicode")]
+mod unicode;
+mod utf8;
+
+#[cfg(all(test, feature = "std"))] // compiled only for test builds with the `std` feature
+mod apitests { // compile-time checks that public types implement the expected auto traits
+ use crate::{
+ bstr::BStr,
+ bstring::BString,
+ ext_slice::{Finder, FinderReverse},
+ };
+
+ #[test]
+ fn oibits() { // passes if it compiles: every call below only instantiates a trait bound
+ use std::panic::{RefUnwindSafe, UnwindSafe};
+
+ fn assert_send<T: Send>() {} // instantiable only when `T: Send`
+ fn assert_sync<T: Sync>() {} // instantiable only when `T: Sync`
+ fn assert_unwind_safe<T: RefUnwindSafe + UnwindSafe>() {} // requires both unwind-safety traits
+
+ assert_send::<&BStr>(); // borrowed byte string slice
+ assert_sync::<&BStr>();
+ assert_unwind_safe::<&BStr>();
+ assert_send::<BString>(); // owned byte string
+ assert_sync::<BString>();
+ assert_unwind_safe::<BString>();
+
+ assert_send::<Finder<'_>>(); // forward substring searcher
+ assert_sync::<Finder<'_>>();
+ assert_unwind_safe::<Finder<'_>>();
+ assert_send::<FinderReverse<'_>>(); // reverse substring searcher
+ assert_sync::<FinderReverse<'_>>();
+ assert_unwind_safe::<FinderReverse<'_>>();
+ }
+}
+
1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 +95 +96 +97 +98 +99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 +247 +248 +249 +250 +251 +252 +253 +254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 +265 +266 +267 +268 +269 +270 +271 +272 +273 +274 +275 +276 +277 +278 +279 +280 +281 +282 +283 +284 +285 +286 +287 +288 +289 +290 +291 +292 +293 +294 +295 +296 +297 +298 +299 +300 +301 +302 +303 +304 +305 +306 +307 +308 +309 +310 +311 +312 +313 +314 +315 +316 +317 +318 +319 +320 +321 +322 +323 +324 +325 +326 +327 +328 +329 +330 +331 +332 +333 +334 +335 +336 +337 +338 +339 +340 +341 +342 +343 +344 +345 +346 +347 +348 +349 +350 +351 +352 +353 +354 +355 +356 +357 +358 +359 +360 +361 +362 +363 +364 +365 +366 +367 +368 +369 +370 +371 +372 +373 +374 +375 +376 +377 +378 +379 +380 +381 +382 +383 +384 +385 +386 +387 +388 +389 +390 +391 +392 +393 +394 +395 +396 +397 +398 +399 +400 +401 +402 +403 +404 +405 +406 +407 +408 +409 +410 +411 +412 +413 +414 +415 +416 +417 +418 +419 +420 +421 
+422 +423 +424 +425 +426 +427 +428 +429 +430 +431 +432 +433 +434 +435 +436 +437 +438 +439 +440 +441 +442 +443 +444 +445 +446 +447 +448 +449 +450 +451 +452 +453 +454 +455 +456 +457 +458 +459 +460 +461 +462 +463 +464 +465 +466 +467 +468 +469 +470 +471 +472 +473 +474 +475 +476 +477 +478 +479 +480 +481 +482 +483 +484 +485 +486 +487 +488 +489 +490 +491 +492 +493 +494 +495 +496 +497 +498 +499 +500 +501 +502 +503 +504 +505 +506 +507 +508 +509 +510 +511 +512 +513 +514 +515 +516 +517 +518 +519 +520 +521 +522 +523 +524 +525 +526 +527 +528 +529 +530 +531 +532 +533 +534 +535 +536 +537 +538 +539 +540 +541 +542 +543 +544 +545 +546 +547 +548 +549 +550 +551 +552 +553 +554 +555 +556 +557 +558 +559 +560 +561 +562 +563 +564 +565 +566 +567 +568 +569 +570 +571 +572 +573 +574 +575 +576 +577 +578 +579 +580 +581 +582 +583 +584 +585 +586 +587 +588 +589 +590 +591 +592 +593 +594 +595 +596 +597 +598 +599 +600 +601 +602 +603 +604 +605 +606 +607 +608 +609 +610 +611 +612 +613 +614 +615 +616 +617 +618 +619 +620 +621 +622 +623 +624 +625 +626 +627 +628 +629 +630 +631 +632 +633 +634 +635 +636 +637 +638 +639 +640 +641 +642 +643 +644 +645 +646 +647 +648 +649 +650 +651 +652 +653 +654 +655 +656 +657 +658 +659 +660 +661 +662 +663 +664 +665 +666 +667 +668 +669 +670 +671 +672 +673 +674 +675 +676 +677 +678 +679 +680 +681 +682 +683 +684 +685 +686 +687 +688 +689 +690 +691 +692 +693 +694 +695 +696 +697 +698 +699 +700 +701 +702 +703 +704 +705 +706 +707 +708 +709 +710 +711 +712 +713 +714 +715 +716 +717 +718 +719 +720 +721 +722 +723 +724 +725 +726 +727 +728 +729 +730 +731 +732 +733 +734 +735 +736 +737 +738 +739 +740 +741 +742 +743 +744 +745 +746 +747 +748 +749 +750 +751 +752 +753 +754 +755 +756 +757 +758 +759 +760 +761 +762 +763 +764 +765 +766 +767 +768 +769 +770 +771 +772 +773 +774 +775 +776 +777 +778 +779 +780 +781 +782 +783 +784 +785 +786 +787 +788 +789 +790 +791 +792 +793 +794 +795 +796 +797 +798 +799 +800 +801 +802 +803 +804 +805 +806 +807 +808 +809 +810 +811 +812 +813 +814 +815 +816 +817 +818 +819 +820 +821 
+822 +823 +824 +825 +826 +827 +828 +829 +830 +831 +832 +833 +834 +835 +836 +837 +838 +839 +840 +841 +842 +843 +844 +845 +846 +847 +848 +849 +850 +851 +852 +853 +854 +855 +856 +857 +858 +859 +860 +861 +862 +863 +864 +865 +866 +867 +868 +869 +870 +871 +872 +873 +874 +875 +876 +877 +878 +879 +880 +881 +882 +883 +884 +885 +886 +887 +888 +889 +890 +891 +892 +893 +894 +895 +896 +897 +898 +899 +900 +901 +902 +903 +904 +905 +906 +907 +908 +909 +910 +911 +912 +913 +914 +915 +916 +917 +918 +919 +920 +921 +922 +923 +924 +925 +926 +927 +928 +929 +930 +931 +932 +933 +934 +935 +936 +937 +938 +939 +940 +941 +942 +943 +944 +945 +946 +947 +948 +949 +950 +951 +952 +953 +954 +955 +956 +957 +958 +959 +960 +961 +962 +963 +964 +965 +966 +967 +968 +969 +970 +971 +972 +973 +974 +975 +976 +977 +978 +979 +980 +981 +982 +983 +984 +985 +986 +987 +988 +989 +990 +991 +992 +993 +994 +995 +996 +997 +998 +999 +1000 +1001 +1002 +1003 +1004 +1005 +1006 +1007 +1008 +1009 +1010 +1011 +1012 +1013 +1014 +1015 +1016 +1017 +1018 +1019 +1020 +1021 +1022 +1023 +1024 +1025 +1026 +1027 +1028 +1029 +1030 +1031 +1032 +1033 +1034 +1035 +1036 +1037 +1038 +1039 +1040 +1041 +1042 +1043 +1044 +1045 +1046 +1047 +1048 +1049 +1050 +1051 +1052 +1053 +1054 +1055 +1056 +1057 +1058 +1059 +1060 +1061 +1062 +1063 +1064 +1065 +1066 +1067 +1068 +1069 +1070 +1071 +1072 +1073 +1074 +1075 +1076 +1077 +1078 +1079 +1080 +1081 +1082 +1083 +1084 +1085 +1086 +1087 +1088 +1089 +1090 +1091 +1092 +1093 +1094 +1095 +1096 +1097 +1098 +1099 +1100 +1101 +1102 +1103 +1104 +1105 +1106 +1107 +1108 +1109 +1110 +1111 +1112 +1113 +1114 +1115 +1116 +1117 +1118 +1119 +1120 +1121 +1122 +1123 +1124 +1125 +1126 +1127 +1128 +1129 +1130 +1131 +1132 +1133 +1134 +1135 +1136 +1137 +1138 +1139 +1140 +1141 +1142 +1143 +1144 +1145 +1146 +1147 +1148 +1149 +1150 +1151 +1152 +1153 +1154 +1155 +1156 +1157 +1158 +1159 +1160 +1161 +1162 +1163 +1164 +1165 +1166 +1167 +1168 +1169 +1170 +1171 +1172 +1173 +1174 +1175 +1176 +1177 +1178 +1179 +1180 +1181 +1182 +1183 +1184 
+1185 +1186 +1187 +1188 +1189 +1190 +1191 +1192 +1193 +1194 +1195 +1196 +1197 +1198 +1199 +1200 +1201 +1202 +1203 +1204 +1205 +1206 +1207 +1208 +1209 +1210 +1211 +1212 +1213 +1214 +1215 +1216 +1217 +1218 +1219 +1220 +1221 +1222 +1223 +1224 +1225 +1226 +1227 +1228 +1229 +1230 +1231 +1232 +1233 +1234 +1235 +1236 +1237 +1238 +1239 +1240 +1241 +1242 +1243 +1244 +1245 +1246 +1247 +1248 +1249 +1250 +1251 +1252 +1253 +1254 +1255 +1256 +1257 +1258 +1259 +1260 +1261 +1262 +1263 +1264 +1265 +1266 +1267 +1268 +1269 +1270 +1271 +1272 +1273 +1274 +1275 +1276 +1277 +1278 +1279 +1280 +1281 +1282 +1283 +1284 +1285 +1286 +1287 +1288 +1289 +1290 +1291 +1292 +1293 +1294 +1295 +1296 +1297 +1298 +1299 +1300 +1301 +1302 +1303 +1304 +1305 +1306 +1307 +1308 +1309 +1310 +1311 +1312 +1313 +1314 +1315 +1316 +1317 +1318 +1319 +1320 +1321 +1322 +1323 +1324 +1325 +1326 +1327 +1328 +1329 +1330 +1331 +1332 +1333 +1334 +1335 +1336 +1337 +1338 +1339 +1340 +1341 +1342 +1343 +1344 +1345 +1346 +1347 +1348 +1349 +1350 +1351 +1352 +1353 +1354 +1355 +1356 +1357 +1358 +1359 +1360 +1361 +1362 +1363 +1364 +1365 +1366 +1367 +1368 +1369 +
use core::{char, cmp, fmt, str};
+
+#[cfg(feature = "std")]
+use std::error;
+
+use crate::{ascii, bstr::BStr, ext_slice::ByteSlice};
+
+// The UTF-8 decoder provided here is based on the one presented here:
+// https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
+//
+// We *could* have done UTF-8 decoding by using a DFA generated by `\p{any}`
+// using regex-automata that is roughly the same size. The real benefit of
+// Hoehrmann's formulation is that the byte class mapping below is manually
+// tailored such that each byte's class doubles as a shift to mask out the
+// bits necessary for constructing the leading bits of each codepoint value
+// from the initial byte.
+//
+// There are some minor differences between this implementation and Hoehrmann's
+// formulation.
+//
+// Firstly, we make REJECT have state ID 0, since it makes the state table
+// itself a little easier to read and is consistent with the notion that 0
+// means "false" or "bad."
+//
+// Secondly, when doing bulk decoding, we add a SIMD accelerated ASCII fast
+// path.
+//
+// Thirdly, we pre-multiply the state IDs to avoid a multiplication instruction
+// in the core decoding loop. (Which is what regex-automata would do by
+// default.)
+//
+// Fourthly, we split the byte class mapping and transition table into two
+// arrays because it's clearer.
+//
+// It is unlikely that this is the fastest way to do UTF-8 decoding; however,
+// it is fairly simple.
+
+const ACCEPT: usize = 12; // pre-multiplied offset of row 1 of STATES_FORWARD; also the start state
+const REJECT: usize = 0; // row 0 of STATES_FORWARD; every transition out of it is 0 (dead state)
+
+/// SAFETY: The decode function below relies on the correctness of these
+/// equivalence classes.
+#[cfg_attr(rustfmt, rustfmt::skip)]
+const CLASSES: [u8; 256] = [ // indexed by byte value; every entry is a class in 0..=11
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x00..=0x1F (ASCII, class 0)
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x20..=0x3F (ASCII, class 0)
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x40..=0x5F (ASCII, class 0)
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x60..=0x7F (ASCII, class 0)
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 0x80..=0x8F (class 1), 0x90..=0x9F (class 9)
+ 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // 0xA0..=0xBF (class 7)
+ 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xC0..=0xDF; 0xC0/0xC1 (class 8) always reject
+ 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, // 0xE0..=0xFF; 0xF5..=0xFF (class 8) always reject
+];
+
+/// SAFETY: The decode function below relies on the correctness of this state
+/// machine.
+#[cfg_attr(rustfmt, rustfmt::skip)]
+const STATES_FORWARD: &'static [u8] = &[ // 9 rows x 12 classes; index = pre-multiplied state + class
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // state 0 (REJECT): dead state
+ 12, 0, 24, 36, 60, 96, 84, 0, 0, 0, 48, 72, // state 12 (ACCEPT): dispatch on the class of the leading byte
+ 0, 12, 0, 0, 0, 0, 0, 12, 0, 12, 0, 0, // state 24: one continuation byte left (classes 1, 7, 9)
+ 0, 24, 0, 0, 0, 0, 0, 24, 0, 24, 0, 0, // state 36: two continuation bytes left
+ 0, 0, 0, 0, 0, 0, 0, 24, 0, 0, 0, 0, // state 48: after 0xE0; next byte must be 0xA0..=0xBF
+ 0, 24, 0, 0, 0, 0, 0, 0, 0, 24, 0, 0, // state 60: after 0xED; next byte must be 0x80..=0x9F
+ 0, 0, 0, 0, 0, 0, 0, 36, 0, 36, 0, 0, // state 72: after 0xF0; next byte must be 0x90..=0xBF
+ 0, 36, 0, 0, 0, 0, 0, 36, 0, 36, 0, 0, // state 84: after 0xF1..=0xF3; any continuation byte
+ 0, 36, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // state 96: after 0xF4; next byte must be 0x80..=0x8F
+];
+
+/// An iterator over Unicode scalar values in a byte string.
+///
+/// When invalid UTF-8 byte sequences are found, they are substituted with the
+/// Unicode replacement codepoint (`U+FFFD`) using the
+/// ["maximal subpart" strategy](https://www.unicode.org/review/pr-121.html).
+///
+/// This iterator is created by the
+/// [`chars`](trait.ByteSlice.html#method.chars) method provided by the
+/// [`ByteSlice`](trait.ByteSlice.html) extension trait for `&[u8]`.
+#[derive(Clone, Debug)]
+pub struct Chars<'a> {
+ bs: &'a [u8], // bytes not yet yielded from either end
+}
+
+impl<'a> Chars<'a> {
+ pub(crate) fn new(bs: &'a [u8]) -> Chars<'a> { // crate-internal constructor: starts with all of `bs` pending
+ Chars { bs }
+ }
+
+ /// View the underlying data as a subslice of the original data.
+ ///
+ /// The slice returned has the same lifetime as the original slice, and so
+ /// the iterator can continue to be used while this exists.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use bstr::ByteSlice;
+ ///
+ /// let mut chars = b"abc".chars();
+ ///
+ /// assert_eq!(b"abc", chars.as_bytes());
+ /// chars.next();
+ /// assert_eq!(b"bc", chars.as_bytes());
+ /// chars.next();
+ /// chars.next();
+ /// assert_eq!(b"", chars.as_bytes());
+ /// ```
+ #[inline]
+ pub fn as_bytes(&self) -> &'a [u8] {
+ self.bs // only the bytes that have not been consumed yet
+ }
+}
+
+impl<'a> Iterator for Chars<'a> {
+ type Item = char;
+
+ #[inline]
+ fn next(&mut self) -> Option<char> {
+ let (ch, size) = decode_lossy(self.bs); // invalid sequences come back as U+FFFD
+ if size == 0 { // zero bytes consumed means the remaining slice is empty
+ return None;
+ }
+ self.bs = &self.bs[size..]; // drop the bytes just decoded from the front
+ Some(ch)
+ }
+}
+
+impl<'a> DoubleEndedIterator for Chars<'a> {
+ #[inline]
+ fn next_back(&mut self) -> Option<char> {
+ let (ch, size) = decode_last_lossy(self.bs); // decodes the final scalar value, or U+FFFD
+ if size == 0 { // zero bytes consumed means the remaining slice is empty
+ return None;
+ }
+ self.bs = &self.bs[..self.bs.len() - size]; // drop the bytes just decoded from the back
+ Some(ch)
+ }
+}
+
+/// An iterator over Unicode scalar values in a byte string and their
+/// byte index positions.
+///
+/// When invalid UTF-8 byte sequences are found, they are substituted with the
+/// Unicode replacement codepoint (`U+FFFD`) using the
+/// ["maximal subpart" strategy](https://www.unicode.org/review/pr-121.html).
+///
+/// Note that this is slightly different from the `CharIndices` iterator
+/// provided by the standard library. Aside from working on possibly invalid
+/// UTF-8, this iterator provides both the corresponding starting and ending
+/// byte indices of each codepoint yielded. The ending position is necessary to
+/// slice the original byte string when invalid UTF-8 bytes are converted into
+/// a Unicode replacement codepoint, since a single replacement codepoint can
+/// substitute anywhere from 1 to 3 invalid bytes (inclusive).
+///
+/// This iterator is created by the
+/// [`char_indices`](trait.ByteSlice.html#method.char_indices) method provided
+/// by the [`ByteSlice`](trait.ByteSlice.html) extension trait for `&[u8]`.
+#[derive(Clone, Debug)]
+pub struct CharIndices<'a> {
+ bs: &'a [u8], // bytes not yet yielded from either end
+ forward_index: usize, // offset in the original slice of the first byte of `bs`
+ reverse_index: usize, // offset in the original slice just past the last byte of `bs`
+}
+
+impl<'a> CharIndices<'a> {
+ pub(crate) fn new(bs: &'a [u8]) -> CharIndices<'a> { // crate-internal constructor
+ CharIndices { bs, forward_index: 0, reverse_index: bs.len() } // indices span the whole slice
+ }
+
+ /// View the underlying data as a subslice of the original data.
+ ///
+ /// The slice returned has the same lifetime as the original slice, and so
+ /// the iterator can continue to be used while this exists.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use bstr::ByteSlice;
+ ///
+ /// let mut it = b"abc".char_indices();
+ ///
+ /// assert_eq!(b"abc", it.as_bytes());
+ /// it.next();
+ /// assert_eq!(b"bc", it.as_bytes());
+ /// it.next();
+ /// it.next();
+ /// assert_eq!(b"", it.as_bytes());
+ /// ```
+ #[inline]
+ pub fn as_bytes(&self) -> &'a [u8] {
+ self.bs // only the bytes that have not been consumed yet
+ }
+}
+
+impl<'a> Iterator for CharIndices<'a> {
+ type Item = (usize, usize, char); // (start offset, end offset, scalar value)
+
+ #[inline]
+ fn next(&mut self) -> Option<(usize, usize, char)> {
+ let index = self.forward_index; // start offset of the item about to be decoded
+ let (ch, size) = decode_lossy(self.bs); // invalid sequences come back as U+FFFD
+ if size == 0 { // zero bytes consumed means the remaining slice is empty
+ return None;
+ }
+ self.bs = &self.bs[size..];
+ self.forward_index += size;
+ Some((index, index + size, ch)) // the end offset is exclusive
+ }
+}
+
+impl<'a> DoubleEndedIterator for CharIndices<'a> {
+ #[inline]
+ fn next_back(&mut self) -> Option<(usize, usize, char)> {
+ let (ch, size) = decode_last_lossy(self.bs); // decodes the final scalar value, or U+FFFD
+ if size == 0 { // zero bytes consumed means the remaining slice is empty
+ return None;
+ }
+ self.bs = &self.bs[..self.bs.len() - size];
+ self.reverse_index -= size; // now the start offset of the item just decoded
+ Some((self.reverse_index, self.reverse_index + size, ch)) // the end offset is exclusive
+ }
+}
+
+impl<'a> ::core::iter::FusedIterator for CharIndices<'a> {} // once `bs` is empty, next/next_back keep returning None
+
+/// An iterator over chunks of valid UTF-8 in a byte slice.
+///
+/// See [`utf8_chunks`](trait.ByteSlice.html#method.utf8_chunks).
+#[derive(Clone, Debug)]
+pub struct Utf8Chunks<'a> {
+ pub(super) bytes: &'a [u8], // bytes not yet split into chunks
+}
+
+/// A chunk of valid UTF-8, possibly followed by invalid UTF-8 bytes.
+///
+/// This is yielded by the
+/// [`Utf8Chunks`](struct.Utf8Chunks.html)
+/// iterator, which can be created via the
+/// [`ByteSlice::utf8_chunks`](trait.ByteSlice.html#method.utf8_chunks)
+/// method.
+///
+/// The `'a` lifetime parameter corresponds to the lifetime of the bytes that
+/// are being iterated over.
+#[cfg_attr(test, derive(Debug, PartialEq))]
+pub struct Utf8Chunk<'a> {
+ /// A valid UTF-8 piece, at the start, end, or between invalid UTF-8 bytes.
+ ///
+ /// This is empty between adjacent invalid UTF-8 byte sequences.
+ valid: &'a str,
+ /// A sequence of invalid UTF-8 bytes.
+ ///
+ /// Can only be empty in the last chunk.
+ ///
+ /// Should be replaced by a single Unicode replacement character, if not
+ /// empty.
+ invalid: &'a BStr,
+ /// Indicates whether the invalid sequence could've been valid if there
+ /// were more bytes.
+ ///
+ /// Can only be true in the last chunk.
+ incomplete: bool,
+}
+
+impl<'a> Utf8Chunk<'a> {
+ /// Returns the (possibly empty) valid UTF-8 bytes in this chunk.
+ ///
+ /// This may be empty if there are consecutive sequences of invalid UTF-8
+ /// bytes.
+ #[inline]
+ pub fn valid(&self) -> &'a str {
+ self.valid
+ }
+
+ /// Returns the (possibly empty) invalid UTF-8 bytes in this chunk that
+ /// immediately follow the valid UTF-8 bytes in this chunk.
+ ///
+ /// This is only empty when this chunk corresponds to the last chunk in
+ /// the original bytes.
+ ///
+ /// The maximum length of this slice is 3. That is, invalid UTF-8 byte
+ /// sequences longer than 1 byte always correspond to a valid _prefix_ of
+ /// a valid UTF-8 encoded codepoint. This corresponds to the "substitution
+ /// of maximal subparts" strategy that is described in more detail in the
+ /// docs for the
+ /// [`ByteSlice::to_str_lossy`](trait.ByteSlice.html#method.to_str_lossy)
+ /// method.
+ #[inline]
+ pub fn invalid(&self) -> &'a [u8] {
+ self.invalid.as_bytes() // exposed as a plain byte slice rather than as `BStr`
+ }
+
+ /// Returns whether the invalid sequence might still become valid if more
+ /// bytes are added.
+ ///
+ /// Returns true if the end of the input was reached unexpectedly,
+ /// without encountering an unexpected byte.
+ ///
+ /// This can only be the case for the last chunk.
+ #[inline]
+ pub fn incomplete(&self) -> bool {
+ self.incomplete
+ }
+}
+
+impl<'a> Iterator for Utf8Chunks<'a> {
+ type Item = Utf8Chunk<'a>;
+
+ #[inline]
+ fn next(&mut self) -> Option<Utf8Chunk<'a>> {
+ if self.bytes.is_empty() {
+ return None;
+ }
+ match validate(self.bytes) {
+ Ok(()) => { // everything left is valid: emit it as the final chunk
+ let valid = self.bytes;
+ self.bytes = &[]; // exhausts the iterator
+ Some(Utf8Chunk {
+ // SAFETY: This is safe because of the guarantees provided
+ // by utf8::validate.
+ valid: unsafe { str::from_utf8_unchecked(valid) },
+ invalid: [].as_bstr(),
+ incomplete: false,
+ })
+ }
+ Err(e) => { // split into the valid prefix, the invalid bytes, and the remainder
+ let (valid, rest) = self.bytes.split_at(e.valid_up_to());
+ // SAFETY: This is safe because of the guarantees provided by
+ // utf8::validate.
+ let valid = unsafe { str::from_utf8_unchecked(valid) };
+ let (invalid_len, incomplete) = match e.error_len() {
+ Some(n) => (n, false),
+ None => (rest.len(), true), // input ended mid-sequence: the whole tail is the invalid part
+ };
+ let (invalid, rest) = rest.split_at(invalid_len);
+ self.bytes = rest; // resume after the invalid bytes on the next call
+ Some(Utf8Chunk {
+ valid,
+ invalid: invalid.as_bstr(),
+ incomplete,
+ })
+ }
+ }
+ }
+
+ #[inline]
+ fn size_hint(&self) -> (usize, Option<usize>) {
+ if self.bytes.is_empty() {
+ (0, Some(0))
+ } else {
+ (1, Some(self.bytes.len())) // at least one chunk; at most one chunk per remaining byte
+ }
+ }
+}
+
+impl<'a> ::core::iter::FusedIterator for Utf8Chunks<'a> {} // next() returns None whenever `bytes` is empty, and `bytes` never grows
+
+/// An error that occurs when UTF-8 decoding fails.
+///
+/// This error occurs when attempting to convert a non-UTF-8 byte
+/// string to a Rust string that must be valid UTF-8. For example,
+/// [`to_str`](trait.ByteSlice.html#method.to_str) is one such method.
+///
+/// # Example
+///
+/// This example shows what happens when a given byte sequence is invalid,
+/// but ends with a sequence that is a possible prefix of valid UTF-8.
+///
+/// ```
+/// use bstr::{B, ByteSlice};
+///
+/// let s = B(b"foobar\xF1\x80\x80");
+/// let err = s.to_str().unwrap_err();
+/// assert_eq!(err.valid_up_to(), 6);
+/// assert_eq!(err.error_len(), None);
+/// ```
+///
+/// This example shows what happens when a given byte sequence contains
+/// invalid UTF-8.
+///
+/// ```
+/// use bstr::ByteSlice;
+///
+/// let s = b"foobar\xF1\x80\x80quux";
+/// let err = s.to_str().unwrap_err();
+/// assert_eq!(err.valid_up_to(), 6);
+/// // The error length reports the maximum number of bytes that correspond to
+/// // a valid prefix of a UTF-8 encoded codepoint.
+/// assert_eq!(err.error_len(), Some(3));
+///
+/// // In contrast to the above which contains a single invalid prefix,
+/// // consider the case of multiple individual bytes that are never valid
+/// // prefixes. Note how the value of error_len changes!
+/// let s = b"foobar\xFF\xFFquux";
+/// let err = s.to_str().unwrap_err();
+/// assert_eq!(err.valid_up_to(), 6);
+/// assert_eq!(err.error_len(), Some(1));
+///
+/// // The fact that it's an invalid prefix does not change error_len even
+/// // when it immediately precedes the end of the string.
+/// let s = b"foobar\xFF";
+/// let err = s.to_str().unwrap_err();
+/// assert_eq!(err.valid_up_to(), 6);
+/// assert_eq!(err.error_len(), Some(1));
+/// ```
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct Utf8Error {
+ valid_up_to: usize, // offset just past the last complete, valid UTF-8 sequence
+ error_len: Option<usize>, // None when the input ended inside an incomplete sequence
+}
+
+impl Utf8Error {
+ /// Returns the byte index of the position immediately following the last
+ /// valid UTF-8 byte.
+ ///
+ /// # Example
+ ///
+ /// This example shows how `valid_up_to` can be used to retrieve a
+ /// possibly empty prefix that is guaranteed to be valid UTF-8:
+ ///
+ /// ```
+ /// use bstr::ByteSlice;
+ ///
+ /// let s = b"foobar\xF1\x80\x80quux";
+ /// let err = s.to_str().unwrap_err();
+ ///
+ /// // This is guaranteed to never panic.
+ /// let string = s[..err.valid_up_to()].to_str().unwrap();
+ /// assert_eq!(string, "foobar");
+ /// ```
+ #[inline]
+ pub fn valid_up_to(&self) -> usize {
+ self.valid_up_to
+ }
+
+ /// Returns the total number of invalid UTF-8 bytes immediately following
+ /// the position returned by `valid_up_to`. This value is always at least
+ /// `1`, but can be up to `3` if bytes form a valid prefix of some UTF-8
+ /// encoded codepoint.
+ ///
+ /// If the end of the original input was found before a valid UTF-8 encoded
+ /// codepoint could be completed, then this returns `None`. This is useful
+ /// when processing streams, where a `None` value signals that more input
+ /// might be needed.
+ #[inline]
+ pub fn error_len(&self) -> Option<usize> {
+ self.error_len
+ }
+}
+
+#[cfg(feature = "std")]
+impl error::Error for Utf8Error {
+ fn description(&self) -> &str { // fixed summary only; the byte offset is reported by the Display impl
+ "invalid UTF-8"
+ }
+}
+
+impl fmt::Display for Utf8Error {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ write!(f, "invalid UTF-8 found at byte offset {}", self.valid_up_to) // error_len is not part of the message
+ }
+}
+
+/// Returns `Ok(())` if and only if the given slice is completely valid UTF-8.
+///
+/// If the slice isn't valid UTF-8, then an error is returned that explains
+/// the first location at which invalid UTF-8 was detected.
+pub fn validate(slice: &[u8]) -> Result<(), Utf8Error> {
+ // The fast path for validating UTF-8. It steps through a UTF-8 automaton
+ // and uses a SIMD accelerated ASCII fast path on x86_64. If an error is
+ // detected, it backs up and runs the slower version of the UTF-8 automaton
+ // to determine correct error information.
+ fn fast(slice: &[u8]) -> Result<(), Utf8Error> {
+ let mut state = ACCEPT; // ACCEPT doubles as the start state
+ let mut i = 0;
+
+ while i < slice.len() {
+ let b = slice[i];
+
+ // ASCII fast path. If we see two consecutive ASCII bytes, then try
+ // to validate as much ASCII as possible very quickly.
+ if state == ACCEPT
+ && b <= 0x7F
+ && slice.get(i + 1).map_or(false, |&b| b <= 0x7F)
+ {
+ i += ascii::first_non_ascii_byte(&slice[i..]); // presumably the length of the leading ASCII run (helper lives in crate::ascii)
+ continue;
+ }
+
+ state = step(state, b);
+ if state == REJECT {
+ return Err(find_valid_up_to(slice, i)); // recompute precise error info on the slow path
+ }
+ i += 1;
+ }
+ if state != ACCEPT {
+ Err(find_valid_up_to(slice, slice.len())) // input ended in the middle of a sequence
+ } else {
+ Ok(())
+ }
+ }
+
+ // Given the first position at which a UTF-8 sequence was determined to be
+ // invalid, return an error that correctly reports the position at which
+ // the last complete UTF-8 sequence ends.
+ #[inline(never)]
+ fn find_valid_up_to(slice: &[u8], rejected_at: usize) -> Utf8Error {
+ // In order to find the last valid byte, we need to back up an amount
+ // that guarantees every preceding byte is part of a valid UTF-8
+ // code unit sequence. To do this, we simply locate the last leading
+ // byte that occurs before rejected_at.
+ let mut backup = rejected_at.saturating_sub(1);
+ while backup > 0 && !is_leading_or_invalid_utf8_byte(slice[backup]) {
+ backup -= 1;
+ }
+ let upto = cmp::min(slice.len(), rejected_at.saturating_add(1)); // include the rejected byte itself, if there is one
+ let mut err = slow(&slice[backup..upto]).unwrap_err();
+ err.valid_up_to += backup; // translate from the sub-slice back to an offset in `slice`
+ err
+ }
+
+ // Like top-level UTF-8 decoding, except it correctly reports a UTF-8 error
+ // when an invalid sequence is found. This is split out from validate so
+ // that the fast path doesn't need to keep track of the position of the
+ // last valid UTF-8 byte. In particular, tracking this requires checking
+ // for an ACCEPT state on each byte, which degrades throughput pretty
+ // badly.
+ fn slow(slice: &[u8]) -> Result<(), Utf8Error> {
+ let mut state = ACCEPT;
+ let mut valid_up_to = 0;
+ for (i, &b) in slice.iter().enumerate() {
+ state = step(state, b);
+ if state == ACCEPT {
+ valid_up_to = i + 1; // a complete scalar value ends at byte i
+ } else if state == REJECT {
+ // Our error length must always be at least 1.
+ let error_len = Some(cmp::max(1, i - valid_up_to));
+ return Err(Utf8Error { valid_up_to, error_len });
+ }
+ }
+ if state != ACCEPT {
+ Err(Utf8Error { valid_up_to, error_len: None }) // ran out of input inside a sequence
+ } else {
+ Ok(())
+ }
+ }
+
+ // Advance to the next state given the current state and current byte.
+ fn step(state: usize, b: u8) -> usize {
+ let class = CLASSES[b as usize];
+ // SAFETY: This is safe because 'class' is always <=11 and 'state' is
+ // always <=96. Therefore, the maximal index is 96+11 = 107, where
+ // STATES_FORWARD.len() = 108 such that every index is guaranteed to be
+ // valid by construction of the state machine and the byte equivalence
+ // classes.
+ unsafe {
+ *STATES_FORWARD.get_unchecked(state + class as usize) as usize
+ }
+ }
+
+ fast(slice)
+}
+
+/// UTF-8 decode a single Unicode scalar value from the beginning of a slice.
+///
+/// When successful, the corresponding Unicode scalar value is returned along
+/// with the number of bytes it was encoded with. The number of bytes consumed
+/// for a successful decode is always between 1 and 4, inclusive.
+///
+/// When unsuccessful, `None` is returned along with the number of bytes that
+/// make up a maximal prefix of a valid UTF-8 code unit sequence. In this case,
+/// the number of bytes consumed is always between 0 and 3, inclusive, where
+/// 0 is only returned when `slice` is empty.
+///
+/// # Examples
+///
+/// Basic usage:
+///
+/// ```
+/// use bstr::decode_utf8;
+///
+/// // Decoding a valid codepoint.
+/// let (ch, size) = decode_utf8(b"\xE2\x98\x83");
+/// assert_eq!(Some('☃'), ch);
+/// assert_eq!(3, size);
+///
+/// // Decoding an incomplete codepoint.
+/// let (ch, size) = decode_utf8(b"\xE2\x98");
+/// assert_eq!(None, ch);
+/// assert_eq!(2, size);
+/// ```
+///
+/// This example shows how to iterate over all codepoints in UTF-8 encoded
+/// bytes, while replacing invalid UTF-8 sequences with the replacement
+/// codepoint:
+///
+/// ```
+/// use bstr::{B, decode_utf8};
+///
+/// let mut bytes = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61");
+/// let mut chars = vec![];
+/// while !bytes.is_empty() {
+/// let (ch, size) = decode_utf8(bytes);
+/// bytes = &bytes[size..];
+/// chars.push(ch.unwrap_or('\u{FFFD}'));
+/// }
+/// assert_eq!(vec!['☃', '\u{FFFD}', '𝞃', '\u{FFFD}', 'a'], chars);
+/// ```
+#[inline]
+pub fn decode<B: AsRef<[u8]>>(slice: B) -> (Option<char>, usize) {
+ let slice = slice.as_ref();
+ match slice.get(0) {
+ None => return (None, 0), // empty input
+ Some(&b) if b <= 0x7F => return (Some(b as char), 1), // ASCII fast path
+ _ => {}
+ }
+
+ let (mut state, mut cp, mut i) = (ACCEPT, 0, 0);
+ while i < slice.len() {
+ decode_step(&mut state, &mut cp, slice[i]);
+ i += 1; // i now counts the bytes fed to the automaton
+
+ if state == ACCEPT {
+ // SAFETY: This is safe because `decode_step` guarantees that
+ // `cp` is a valid Unicode scalar value in an ACCEPT state.
+ let ch = unsafe { char::from_u32_unchecked(cp) };
+ return (Some(ch), i);
+ } else if state == REJECT {
+ // At this point, we always want to advance at least one byte.
+ return (None, cmp::max(1, i.saturating_sub(1))); // exclude the rejected byte unless it is the only one
+ }
+ }
+ (None, i) // ran out of input mid-sequence: everything read is an incomplete prefix
+}
+
+/// Lossily UTF-8 decode a single Unicode scalar value from the beginning of a
+/// slice.
+///
+/// When successful, the corresponding Unicode scalar value is returned along
+/// with the number of bytes it was encoded with. The number of bytes consumed
+/// for a successful decode is always between 1 and 4, inclusive.
+///
+/// When unsuccessful, the Unicode replacement codepoint (`U+FFFD`) is returned
+/// along with the number of bytes that make up a maximal prefix of a valid
+/// UTF-8 code unit sequence. In this case, the number of bytes consumed is
+/// always between 0 and 3, inclusive, where 0 is only returned when `slice` is
+/// empty.
+///
+/// # Examples
+///
+/// Basic usage:
+///
+/// ```ignore
+/// use bstr::decode_utf8_lossy;
+///
+/// // Decoding a valid codepoint.
+/// let (ch, size) = decode_utf8_lossy(b"\xE2\x98\x83");
+/// assert_eq!('☃', ch);
+/// assert_eq!(3, size);
+///
+/// // Decoding an incomplete codepoint.
+/// let (ch, size) = decode_utf8_lossy(b"\xE2\x98");
+/// assert_eq!('\u{FFFD}', ch);
+/// assert_eq!(2, size);
+/// ```
+///
+/// This example shows how to iterate over all codepoints in UTF-8 encoded
+/// bytes, while replacing invalid UTF-8 sequences with the replacement
+/// codepoint:
+///
+/// ```ignore
+/// use bstr::{B, decode_utf8_lossy};
+///
+/// let mut bytes = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61");
+/// let mut chars = vec![];
+/// while !bytes.is_empty() {
+/// let (ch, size) = decode_utf8_lossy(bytes);
+/// bytes = &bytes[size..];
+/// chars.push(ch);
+/// }
+/// assert_eq!(vec!['☃', '\u{FFFD}', '𝞃', '\u{FFFD}', 'a'], chars);
+/// ```
+#[inline]
+pub fn decode_lossy<B: AsRef<[u8]>>(slice: B) -> (char, usize) {
+ match decode(slice) {
+ (Some(ch), size) => (ch, size),
+ (None, size) => ('\u{FFFD}', size), // same byte count as decode, with U+FFFD standing in for None
+ }
+}
+
+/// UTF-8 decode a single Unicode scalar value from the end of a slice.
+///
+/// When successful, the corresponding Unicode scalar value is returned along
+/// with the number of bytes it was encoded with. The number of bytes consumed
+/// for a successful decode is always between 1 and 4, inclusive.
+///
+/// When unsuccessful, `None` is returned along with the number of bytes that
+/// make up a maximal prefix of a valid UTF-8 code unit sequence. In this case,
+/// the number of bytes consumed is always between 0 and 3, inclusive, where
+/// 0 is only returned when `slice` is empty.
+///
+/// # Examples
+///
+/// Basic usage:
+///
+/// ```
+/// use bstr::decode_last_utf8;
+///
+/// // Decoding a valid codepoint.
+/// let (ch, size) = decode_last_utf8(b"\xE2\x98\x83");
+/// assert_eq!(Some('☃'), ch);
+/// assert_eq!(3, size);
+///
+/// // Decoding an incomplete codepoint.
+/// let (ch, size) = decode_last_utf8(b"\xE2\x98");
+/// assert_eq!(None, ch);
+/// assert_eq!(2, size);
+/// ```
+///
+/// This example shows how to iterate over all codepoints in UTF-8 encoded
+/// bytes in reverse, while replacing invalid UTF-8 sequences with the
+/// replacement codepoint:
+///
+/// ```
+/// use bstr::{B, decode_last_utf8};
+///
+/// let mut bytes = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61");
+/// let mut chars = vec![];
+/// while !bytes.is_empty() {
+/// let (ch, size) = decode_last_utf8(bytes);
+/// bytes = &bytes[..bytes.len()-size];
+/// chars.push(ch.unwrap_or('\u{FFFD}'));
+/// }
+/// assert_eq!(vec!['a', '\u{FFFD}', '𝞃', '\u{FFFD}', '☃'], chars);
+/// ```
+#[inline]
+pub fn decode_last<B: AsRef<[u8]>>(slice: B) -> (Option<char>, usize) {
+ // TODO: We could implement this by reversing the UTF-8 automaton, but for
+ // now, we do it the slow way by using the forward automaton.
+
+ let slice = slice.as_ref();
+ if slice.is_empty() {
+ return (None, 0);
+ }
+ let mut start = slice.len() - 1;
+ let limit = slice.len().saturating_sub(4); // never look back further than 4 bytes from the end
+ while start > limit && !is_leading_or_invalid_utf8_byte(slice[start]) {
+ start -= 1; // step back over continuation bytes
+ }
+ let (ch, size) = decode(&slice[start..]); // forward-decode from the candidate start
+ // If we didn't consume all of the bytes, then that means there's at least
+ // one stray byte that never occurs in a valid code unit prefix, so we can
+ // advance by one byte.
+ if start + size != slice.len() {
+ (None, 1)
+ } else {
+ (ch, size)
+ }
+}
+
+/// Lossily UTF-8 decode a single Unicode scalar value from the end of a slice.
+///
+/// When successful, the corresponding Unicode scalar value is returned along
+/// with the number of bytes it was encoded with. The number of bytes consumed
+/// for a successful decode is always between 1 and 4, inclusive.
+///
+/// When unsuccessful, the Unicode replacement codepoint (`U+FFFD`) is returned
+/// along with the number of bytes that make up a maximal prefix of a valid
+/// UTF-8 code unit sequence. In this case, the number of bytes consumed is
+/// always between 0 and 3, inclusive, where 0 is only returned when `slice` is
+/// empty.
+///
+/// # Examples
+///
+/// Basic usage:
+///
+/// ```ignore
+/// use bstr::decode_last_utf8_lossy;
+///
+/// // Decoding a valid codepoint.
+/// let (ch, size) = decode_last_utf8_lossy(b"\xE2\x98\x83");
+/// assert_eq!('☃', ch);
+/// assert_eq!(3, size);
+///
+/// // Decoding an incomplete codepoint.
+/// let (ch, size) = decode_last_utf8_lossy(b"\xE2\x98");
+/// assert_eq!('\u{FFFD}', ch);
+/// assert_eq!(2, size);
+/// ```
+///
+/// This example shows how to iterate over all codepoints in UTF-8 encoded
+/// bytes in reverse, while replacing invalid UTF-8 sequences with the
+/// replacement codepoint:
+///
+/// ```ignore
+/// use bstr::{B, decode_last_utf8_lossy};
+///
+/// let mut bytes = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61");
+/// let mut chars = vec![];
+/// while !bytes.is_empty() {
+/// let (ch, size) = decode_last_utf8_lossy(bytes);
+/// bytes = &bytes[..bytes.len()-size];
+/// chars.push(ch);
+/// }
+/// assert_eq!(vec!['a', '\u{FFFD}', '𝞃', '\u{FFFD}', '☃'], chars);
+/// ```
+#[inline]
+pub fn decode_last_lossy<B: AsRef<[u8]>>(slice: B) -> (char, usize) {
+ match decode_last(slice) {
+ (Some(ch), size) => (ch, size),
+ (None, size) => ('\u{FFFD}', size), // same byte count as decode_last, with U+FFFD standing in for None
+ }
+}
+
+/// SAFETY: The decode function relies on state being equal to ACCEPT only if
+/// cp is a valid Unicode scalar value.
+#[inline]
+pub fn decode_step(state: &mut usize, cp: &mut u32, b: u8) {
+ let class = CLASSES[b as usize];
+ if *state == ACCEPT {
+ *cp = (0xFF >> class) & (b as u32); // first byte of a sequence: the class doubles as the shift that masks off the prefix bits
+ } else {
+ *cp = (b as u32 & 0b111111) | (*cp << 6); // later byte: append its low 6 bits to the accumulated value
+ }
+ *state = STATES_FORWARD[*state + class as usize] as usize; // bounds-checked lookup of the next pre-multiplied state
+}
+
+/// Returns true if and only if the given byte is either a valid leading UTF-8
+/// byte, or is otherwise an invalid byte that can never appear anywhere in a
+/// valid UTF-8 sequence.
+fn is_leading_or_invalid_utf8_byte(b: u8) -> bool {
+ // In the ASCII case, the most significant bit is never set. The leading
+ // byte of a 2/3/4-byte sequence always has the top two most significant
+ // bits set. For bytes that can never appear anywhere in valid UTF-8, this
+ // also returns true, since every such byte has its two most significant
+ // bits set:
+ //
+ // \xC0 :: 11000000
+ // \xC1 :: 11000001
+ // \xF5 :: 11110101
+ // \xF6 :: 11110110
+ // \xF7 :: 11110111
+ // \xF8 :: 11111000
+ // \xF9 :: 11111001
+ // \xFA :: 11111010
+ // \xFB :: 11111011
+ // \xFC :: 11111100
+ // \xFD :: 11111101
+ // \xFE :: 11111110
+ // \xFF :: 11111111
+ (b & 0b1100_0000) != 0b1000_0000 // false only for continuation bytes, i.e. the bit pattern 10xxxxxx
+}
+
+#[cfg(all(test, feature = "std"))]
+mod tests {
+ use std::char;
+
+ use crate::{
+ ext_slice::{ByteSlice, B},
+ tests::LOSSY_TESTS,
+ utf8::{self, Utf8Error},
+ };
+
+ fn utf8e(valid_up_to: usize) -> Utf8Error {
+ Utf8Error { valid_up_to, error_len: None }
+ }
+
+ fn utf8e2(valid_up_to: usize, error_len: usize) -> Utf8Error {
+ Utf8Error { valid_up_to, error_len: Some(error_len) }
+ }
+
+ #[test]
+ #[cfg(not(miri))]
+ fn validate_all_codepoints() {
+ for i in 0..(0x10FFFF + 1) {
+ let cp = match char::from_u32(i) {
+ None => continue,
+ Some(cp) => cp,
+ };
+ let mut buf = [0; 4];
+ let s = cp.encode_utf8(&mut buf);
+ assert_eq!(Ok(()), utf8::validate(s.as_bytes()));
+ }
+ }
+
+ #[test]
+ fn validate_multiple_codepoints() {
+ assert_eq!(Ok(()), utf8::validate(b"abc"));
+ assert_eq!(Ok(()), utf8::validate(b"a\xE2\x98\x83a"));
+ assert_eq!(Ok(()), utf8::validate(b"a\xF0\x9D\x9C\xB7a"));
+ assert_eq!(Ok(()), utf8::validate(b"\xE2\x98\x83\xF0\x9D\x9C\xB7",));
+ assert_eq!(
+ Ok(()),
+ utf8::validate(b"a\xE2\x98\x83a\xF0\x9D\x9C\xB7a",)
+ );
+ assert_eq!(
+ Ok(()),
+ utf8::validate(b"\xEF\xBF\xBD\xE2\x98\x83\xEF\xBF\xBD",)
+ );
+ }
+
+ #[test]
+ fn validate_errors() {
+ // single invalid byte
+ assert_eq!(Err(utf8e2(0, 1)), utf8::validate(b"\xFF"));
+ // single invalid byte after ASCII
+ assert_eq!(Err(utf8e2(1, 1)), utf8::validate(b"a\xFF"));
+ // single invalid byte after 2 byte sequence
+ assert_eq!(Err(utf8e2(2, 1)), utf8::validate(b"\xCE\xB2\xFF"));
+ // single invalid byte after 3 byte sequence
+ assert_eq!(Err(utf8e2(3, 1)), utf8::validate(b"\xE2\x98\x83\xFF"));
+ // single invalid byte after 4 byte sequence
+ assert_eq!(Err(utf8e2(4, 1)), utf8::validate(b"\xF0\x9D\x9D\xB1\xFF"));
+
+ // An invalid 2-byte sequence with a valid 1-byte prefix.
+ assert_eq!(Err(utf8e2(0, 1)), utf8::validate(b"\xCE\xF0"));
+ // An invalid 3-byte sequence with a valid 2-byte prefix.
+ assert_eq!(Err(utf8e2(0, 2)), utf8::validate(b"\xE2\x98\xF0"));
+ // An invalid 4-byte sequence with a valid 3-byte prefix.
+ assert_eq!(Err(utf8e2(0, 3)), utf8::validate(b"\xF0\x9D\x9D\xF0"));
+
+ // An overlong sequence. Should be \xE2\x82\xAC, but we encode the
+ // same codepoint value in 4 bytes. This not only tests that we reject
+ // overlong sequences, but that we get valid_up_to correct.
+ assert_eq!(Err(utf8e2(0, 1)), utf8::validate(b"\xF0\x82\x82\xAC"));
+ assert_eq!(Err(utf8e2(1, 1)), utf8::validate(b"a\xF0\x82\x82\xAC"));
+ assert_eq!(
+ Err(utf8e2(3, 1)),
+ utf8::validate(b"\xE2\x98\x83\xF0\x82\x82\xAC",)
+ );
+
+ // Check that encoding a surrogate codepoint using the UTF-8 scheme
+ // fails validation.
+ assert_eq!(Err(utf8e2(0, 1)), utf8::validate(b"\xED\xA0\x80"));
+ assert_eq!(Err(utf8e2(1, 1)), utf8::validate(b"a\xED\xA0\x80"));
+ assert_eq!(
+ Err(utf8e2(3, 1)),
+ utf8::validate(b"\xE2\x98\x83\xED\xA0\x80",)
+ );
+
+ // Check that an incomplete 2-byte sequence fails.
+ assert_eq!(Err(utf8e2(0, 1)), utf8::validate(b"\xCEa"));
+ assert_eq!(Err(utf8e2(1, 1)), utf8::validate(b"a\xCEa"));
+ assert_eq!(
+ Err(utf8e2(3, 1)),
+ utf8::validate(b"\xE2\x98\x83\xCE\xE2\x98\x83",)
+ );
+ // Check that an incomplete 3-byte sequence fails.
+ assert_eq!(Err(utf8e2(0, 2)), utf8::validate(b"\xE2\x98a"));
+ assert_eq!(Err(utf8e2(1, 2)), utf8::validate(b"a\xE2\x98a"));
+ assert_eq!(
+ Err(utf8e2(3, 2)),
+ utf8::validate(b"\xE2\x98\x83\xE2\x98\xE2\x98\x83",)
+ );
+ // Check that an incomplete 4-byte sequence fails.
+ assert_eq!(Err(utf8e2(0, 3)), utf8::validate(b"\xF0\x9D\x9Ca"));
+ assert_eq!(Err(utf8e2(1, 3)), utf8::validate(b"a\xF0\x9D\x9Ca"));
+ assert_eq!(
+ Err(utf8e2(4, 3)),
+ utf8::validate(b"\xF0\x9D\x9C\xB1\xF0\x9D\x9C\xE2\x98\x83",)
+ );
+ assert_eq!(
+ Err(utf8e2(6, 3)),
+ utf8::validate(b"foobar\xF1\x80\x80quux",)
+ );
+
+ // Check that an incomplete (EOF) 2-byte sequence fails.
+ assert_eq!(Err(utf8e(0)), utf8::validate(b"\xCE"));
+ assert_eq!(Err(utf8e(1)), utf8::validate(b"a\xCE"));
+ assert_eq!(Err(utf8e(3)), utf8::validate(b"\xE2\x98\x83\xCE"));
+ // Check that an incomplete (EOF) 3-byte sequence fails.
+ assert_eq!(Err(utf8e(0)), utf8::validate(b"\xE2\x98"));
+ assert_eq!(Err(utf8e(1)), utf8::validate(b"a\xE2\x98"));
+ assert_eq!(Err(utf8e(3)), utf8::validate(b"\xE2\x98\x83\xE2\x98"));
+ // Check that an incomplete (EOF) 4-byte sequence fails.
+ assert_eq!(Err(utf8e(0)), utf8::validate(b"\xF0\x9D\x9C"));
+ assert_eq!(Err(utf8e(1)), utf8::validate(b"a\xF0\x9D\x9C"));
+ assert_eq!(
+ Err(utf8e(4)),
+ utf8::validate(b"\xF0\x9D\x9C\xB1\xF0\x9D\x9C",)
+ );
+
+ // Test that we report errors correctly even after long valid sequences.
+ // This checks that our "backup" logic for detecting errors is correct.
+ assert_eq!(
+ Err(utf8e2(8, 1)),
+ utf8::validate(b"\xe2\x98\x83\xce\xb2\xe3\x83\x84\xFF",)
+ );
+ }
+
+ #[test]
+ fn decode_valid() {
+     // Decodes `text` from the front, one `utf8::decode` call per
+     // codepoint, and returns the codepoints in the order produced.
+     fn decode_all(text: &str) -> Vec<char> {
+         let mut rest = text;
+         let mut decoded = Vec::new();
+         loop {
+             if rest.is_empty() {
+                 break decoded;
+             }
+             let (ch, size) = utf8::decode(rest.as_bytes());
+             rest = &rest[size..];
+             decoded.push(ch.unwrap());
+         }
+     }
+
+     // (input, codepoints expected in forward order)
+     let cases: &[(&str, &[char])] = &[
+         ("☃", &['☃']),
+         ("☃☃", &['☃', '☃']),
+         ("αβγδε", &['α', 'β', 'γ', 'δ', 'ε']),
+         ("☃⛄⛇", &['☃', '⛄', '⛇']),
+         ("𝗮𝗯𝗰𝗱𝗲", &['𝗮', '𝗯', '𝗰', '𝗱', '𝗲']),
+     ];
+     for &(input, expected) in cases {
+         assert_eq!(expected.to_vec(), decode_all(input));
+     }
+ }
+
+ #[test]
+ fn decode_invalid() {
+     // Each entry pairs an input for which `utf8::decode` must yield no
+     // codepoint with the size it must report for that input.
+     let cases: &[(&[u8], usize)] = &[
+         (b"", 0),
+         (b"\xFF", 1),
+         (b"\xCE\xF0", 1),
+         (b"\xE2\x98\xF0", 2),
+         (b"\xF0\x9D\x9D", 3),
+         (b"\xF0\x9D\x9D\xF0", 3),
+         (b"\xF0\x82\x82\xAC", 1),
+         (b"\xED\xA0\x80", 1),
+         (b"\xCEa", 1),
+         (b"\xE2\x98a", 2),
+         (b"\xF0\x9D\x9Ca", 3),
+     ];
+     for &(input, expected_size) in cases {
+         let (ch, size) = utf8::decode(input);
+         assert_eq!(None, ch);
+         assert_eq!(expected_size, size);
+     }
+ }
+
+ #[test]
+ fn decode_lossy() {
+     // Each entry pairs an input for which `utf8::decode_lossy` must yield
+     // the replacement codepoint with the size it must report.
+     let cases: &[(&[u8], usize)] = &[
+         (b"", 0),
+         (b"\xFF", 1),
+         (b"\xCE\xF0", 1),
+         (b"\xE2\x98\xF0", 2),
+         (b"\xF0\x9D\x9D\xF0", 3),
+         (b"\xF0\x82\x82\xAC", 1),
+         (b"\xED\xA0\x80", 1),
+         (b"\xCEa", 1),
+         (b"\xE2\x98a", 2),
+         (b"\xF0\x9D\x9Ca", 3),
+     ];
+     for &(input, expected_size) in cases {
+         let (ch, size) = utf8::decode_lossy(input);
+         assert_eq!('\u{FFFD}', ch);
+         assert_eq!(expected_size, size);
+     }
+ }
+
+ #[test]
+ fn decode_last_valid() {
+     // Decodes `text` from the back, one `utf8::decode_last` call per
+     // codepoint, and returns the codepoints in the order produced (that
+     // is, last codepoint first).
+     fn decode_all_rev(text: &str) -> Vec<char> {
+         let mut rest = text;
+         let mut decoded = Vec::new();
+         loop {
+             if rest.is_empty() {
+                 break decoded;
+             }
+             let (ch, size) = utf8::decode_last(rest.as_bytes());
+             rest = &rest[..rest.len() - size];
+             decoded.push(ch.unwrap());
+         }
+     }
+
+     // (input, codepoints expected in reverse order)
+     let cases: &[(&str, &[char])] = &[
+         ("☃", &['☃']),
+         ("☃☃", &['☃', '☃']),
+         ("αβγδε", &['ε', 'δ', 'γ', 'β', 'α']),
+         ("☃⛄⛇", &['⛇', '⛄', '☃']),
+         ("𝗮𝗯𝗰𝗱𝗲", &['𝗲', '𝗱', '𝗰', '𝗯', '𝗮']),
+     ];
+     for &(input, expected) in cases {
+         assert_eq!(expected.to_vec(), decode_all_rev(input));
+     }
+ }
+
+ #[test]
+ fn decode_last_invalid() {
+     // Each entry pairs an input for which `utf8::decode_last` must yield
+     // no codepoint with the size it must report for that input.
+     let cases: &[(&[u8], usize)] = &[
+         (b"", 0),
+         (b"\xFF", 1),
+         (b"\xCE\xF0", 1),
+         (b"\xCE", 1),
+         (b"\xE2\x98\xF0", 1),
+         (b"\xE2\x98", 2),
+         (b"\xF0\x9D\x9D\xF0", 1),
+         (b"\xF0\x9D\x9D", 3),
+         (b"\xF0\x82\x82\xAC", 1),
+         (b"\xED\xA0\x80", 1),
+         (b"\xED\xA0", 1),
+         (b"\xED", 1),
+         (b"a\xCE", 1),
+         (b"a\xE2\x98", 2),
+         (b"a\xF0\x9D\x9C", 3),
+     ];
+     for &(input, expected_size) in cases {
+         let (ch, size) = utf8::decode_last(input);
+         assert_eq!(None, ch);
+         assert_eq!(expected_size, size);
+     }
+ }
+
+ #[test]
+ fn decode_last_lossy() {
+     // Each entry pairs an input for which `utf8::decode_last_lossy` must
+     // yield the replacement codepoint with the size it must report.
+     let cases: &[(&[u8], usize)] = &[
+         (b"", 0),
+         (b"\xFF", 1),
+         (b"\xCE\xF0", 1),
+         (b"\xCE", 1),
+         (b"\xE2\x98\xF0", 1),
+         (b"\xE2\x98", 2),
+         (b"\xF0\x9D\x9D\xF0", 1),
+         (b"\xF0\x9D\x9D", 3),
+         (b"\xF0\x82\x82\xAC", 1),
+         (b"\xED\xA0\x80", 1),
+         (b"\xED\xA0", 1),
+         (b"\xED", 1),
+         (b"a\xCE", 1),
+         (b"a\xE2\x98", 2),
+         (b"a\xF0\x9D\x9C", 3),
+     ];
+     for &(input, expected_size) in cases {
+         let (ch, size) = utf8::decode_last_lossy(input);
+         assert_eq!('\u{FFFD}', ch);
+         assert_eq!(expected_size, size);
+     }
+ }
+
+ #[test]
+ fn chars() {
+     for (i, case) in LOSSY_TESTS.iter().enumerate() {
+         let (expected, input) = *case;
+
+         // Forward direction: `chars` and `char_indices` must both produce
+         // the expected string.
+         let via_chars: String = B(input).chars().collect();
+         assert_eq!(
+             expected, via_chars,
+             "chars(ith: {:?}, given: {:?})",
+             i, input,
+         );
+         let via_indices: String =
+             B(input).char_indices().map(|(_, _, ch)| ch).collect();
+         assert_eq!(
+             expected, via_indices,
+             "char_indices(ith: {:?}, given: {:?})",
+             i, input,
+         );
+
+         // Reverse direction: compare against the expected string reversed
+         // codepoint by codepoint.
+         let expected: String = expected.chars().rev().collect();
+
+         let via_chars: String = B(input).chars().rev().collect();
+         assert_eq!(
+             expected, via_chars,
+             "chars.rev(ith: {:?}, given: {:?})",
+             i, input,
+         );
+         let via_indices: String =
+             B(input).char_indices().rev().map(|(_, _, ch)| ch).collect();
+         assert_eq!(
+             expected, via_indices,
+             "char_indices.rev(ith: {:?}, given: {:?})",
+             i, input,
+         );
+     }
+ }
+
+ #[test]
+ fn utf8_chunks() {
+     // Builds the chunk value we expect the iterator to yield.
+     let chunk = |valid: &'static str,
+                  invalid: &'static [u8],
+                  incomplete: bool| {
+         utf8::Utf8Chunk { valid, invalid: invalid.as_bstr(), incomplete }
+     };
+     // Drives a `Utf8Chunks` over `bytes` until it first returns `None`,
+     // gathering every chunk yielded before that point.
+     let chunks_of = |bytes: &'static [u8]| {
+         utf8::Utf8Chunks { bytes }.collect::<Vec<_>>()
+     };
+
+     assert_eq!(chunks_of(b"123\xC0"), vec![chunk("123", b"\xC0", false)]);
+
+     assert_eq!(
+         chunks_of(b"123\xFF\xFF"),
+         vec![chunk("123", b"\xFF", false), chunk("", b"\xFF", false)],
+     );
+
+     assert_eq!(chunks_of(b"123\xD0"), vec![chunk("123", b"\xD0", true)]);
+
+     assert_eq!(
+         chunks_of(b"123\xD0456"),
+         vec![chunk("123", b"\xD0", false), chunk("456", b"", false)],
+     );
+
+     assert_eq!(
+         chunks_of(b"123\xE2\x98"),
+         vec![chunk("123", b"\xE2\x98", true)],
+     );
+
+     assert_eq!(
+         chunks_of(b"123\xF4\x8F\xBF"),
+         vec![chunk("123", b"\xF4\x8F\xBF", true)],
+     );
+ }
+}
+
1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 +95 +96 +97 +98 +99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 +247 +248 +249 +250 +251 +252 +253 +254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 +265 +266 +267 +268 +269 +270 +271 +272 +273 +274 +275 +276 +277 +278 +279 +280 +281 +282 +283 +284 +285 +286 +287 +288 +289 +290 +291 +292 +293 +294 +295 +296 +297 +298 +299 +300 +301 +302 +303 +304 +305 +306 +307 +308 +309 +310 +311 +312 +313 +314 +315 +316 +317 +318 +319 +320 +321 +322 +323 +324 +325 +326 +327 +328 +329 +330 +331 +332 +333 +334 +335 +336 +337 +338 +339 +340 +341 +342 +343 +344 +345 +346 +347 +348 +349 +350 +351 +352 +353 +354 +355 +356 +357 +358 +359 +360 +361 +362 +363 +364 +365 +366 +367 +368 +369 +370 +371 +372 +373 +374 +375 +376 +377 +378 +379 +380 +381 +382 +383 +384 +385 +386 +387 +388 +389 +390 +391 +392 +393 +394 +395 +396 +397 +398 +399 +400 +401 +402 +403 +404 +405 +406 +407 +408 +409 +410 +411 +412 +413 +414 +415 +416 +417 +418 +419 +420 +421 
+422 +423 +424 +425 +426 +427 +428 +429 +430 +431 +432 +433 +434 +435 +436 +437 +438 +439 +440 +441 +442 +443 +444 +445 +446 +447 +448 +449 +450 +451 +452 +453 +454 +455 +456 +457 +458 +459 +460 +461 +462 +463 +464 +465 +466 +467 +468 +469 +470 +471 +472 +473 +474 +475 +476 +477 +478 +479 +480 +481 +482 +483 +484 +485 +486 +487 +488 +489 +490 +491 +492 +493 +494 +495 +496 +497 +498 +499 +500 +501 +502 +503 +504 +505 +506 +507 +508 +509 +510 +511 +512 +513 +514 +515 +516 +517 +518 +519 +520 +521 +522 +523 +524 +525 +526 +527 +528 +529 +530 +531 +532 +533 +534 +535 +536 +537 +538 +539 +540 +541 +542 +543 +544 +545 +546 +547 +548 +549 +550 +551 +552 +553 +554 +555 +556 +557 +558 +559 +560 +561 +562 +563 +564 +565 +566 +567 +568 +569 +570 +571 +572 +573 +574 +575 +576 +577 +578 +579 +580 +581 +582 +583 +584 +585 +586 +587 +588 +589 +590 +591 +592 +593 +594 +595 +596 +597 +598 +599 +600 +601 +602 +603 +604 +605 +606 +607 +608 +609 +610 +611 +612 +613 +614 +615 +616 +617 +618 +619 +620 +621 +622 +623 +624 +625 +626 +627 +628 +629 +630 +631 +632 +633 +634 +635 +636 +637 +638 +639 +640 +641 +642 +643 +644 +645 +646 +647 +648 +649 +650 +651 +652 +653 +654 +655 +656 +657 +658 +659 +660 +661 +662 +663 +664 +665 +666 +667 +668 +669 +670 +671 +672 +673 +674 +675 +676 +677 +678 +679 +680 +681 +682 +683 +684 +685 +686 +687 +688 +689 +690 +691 +692 +693 +694 +695 +696 +697 +698 +699 +700 +701 +702 +703 +704 +705 +706 +707 +708 +709 +710 +711 +712 +713 +714 +715 +716 +717 +718 +719 +720 +721 +722 +723 +724 +725 +726 +727 +728 +729 +730 +731 +732 +733 +734 +735 +736 +737 +738 +739 +740 +741 +742 +743 +744 +745 +746 +747 +748 +749 +750 +751 +752 +753 +754 +755 +756 +757 +758 +759 +760 +761 +762 +763 +764 +765 +766 +767 +768 +769 +770 +771 +772 +773 +774 +775 +776 +777 +778 +779 +780 +781 +782 +783 +784 +785 +786 +787 +788 +789 +790 +791 +792 +793 +794 +795 +796 +797 +798 +799 +800 +801 +802 +803 +804 +805 +806 +807 +808 +809 +810 +811 +812 +813 +814 +815 +816 +817 +818 +819 +820 +821 
+822 +823 +824 +825 +826 +827 +828 +829 +830 +831 +832 +833 +834 +835 +836 +837 +838 +839 +840 +841 +842 +843 +844 +845 +846 +847 +848 +849 +850 +851 +852 +853 +854 +855 +856 +857 +858 +859 +860 +861 +862 +863 +864 +865 +866 +867 +868 +869 +870 +871 +872 +873 +874 +875 +876 +877 +878 +879 +880 +881 +882 +883 +884 +885 +886 +887 +888 +889 +890 +891 +892 +893 +894 +895 +896 +897 +898 +899 +900 +901 +902 +903 +904 +905 +906 +907 +908 +909 +910 +911 +912 +913 +914 +915 +916 +917 +918 +919 +920 +921 +922 +923 +924 +925 +926 +927 +928 +929 +930 +931 +932 +933 +934 +935 +936 +937 +938 +939 +940 +941 +942 +943 +944 +945 +946 +947 +948 +949 +950 +951 +952 +953 +954 +955 +956 +957 +958 +959 +960 +961 +962 +963 +964 +965 +966 +967 +968 +969 +970 +971 +972 +973 +974 +975 +976 +977 +978 +979 +980 +981 +982 +983 +984 +985 +986 +987 +988 +989 +990 +991 +992 +993 +994 +995 +996 +
/*!
+Provides architecture independent implementations of `memchr` and friends.
+
+The main types in this module are [`One`], [`Two`] and [`Three`]. They are for
+searching for one, two or three distinct bytes, respectively, in a haystack.
+Each type also has corresponding double ended iterators. These searchers
+are typically slower than hand-coded vector routines accomplishing the same
+task, but are also typically faster than naive scalar code. These routines
+effectively work by treating a `usize` as a vector of 8-bit lanes, and thus
+achieve some level of data parallelism even without explicit vector support.
+
+The `One` searcher also provides a [`One::count`] routine for efficiently
+counting the number of times a single byte occurs in a haystack. This is
+useful, for example, for counting the number of lines in a haystack. This
+routine exists because it is usually faster, especially with a high match
+count, than using [`One::find`] repeatedly. ([`OneIter`] specializes its
+`Iterator::count` implementation to use this routine.)
+
+Only one, two and three bytes are supported because three bytes is about
+the point where one sees diminishing returns. Beyond this point, it's
+probably (but not necessarily) better to just use a simple `[bool; 256]` array
+or similar. However, it depends mightily on the specific work-load and the
+expected match frequency.
+*/
+
+use crate::{arch::generic::memchr as generic, ext::Pointer};
+
+/// The number of bytes in a single `usize` value, i.e. how many haystack
+/// bytes one word-sized load covers.
+const USIZE_BYTES: usize = (usize::BITS / 8) as usize;
+/// The bits that must be zero for a `*const usize` to be properly aligned.
+///
+/// Used as a mask: `addr & USIZE_ALIGN` is how far `addr` lies past the
+/// previous `usize`-aligned address.
+const USIZE_ALIGN: usize = USIZE_BYTES - 1;
+
+/// Finds all occurrences of a single byte in a haystack.
+#[derive(Clone, Copy, Debug)]
+pub struct One {
+ /// The needle byte; compared against individual haystack bytes when
+ /// confirming a match.
+ s1: u8,
+ /// The result of `splat(needle)`; XORed with each word-sized chunk of
+ /// the haystack to test whether that chunk may contain the needle.
+ v1: usize,
+}
+
+impl One {
+    /// The number of bytes we examine per each iteration of our search loop.
+    const LOOP_BYTES: usize = 2 * USIZE_BYTES;
+
+    /// Create a new searcher that finds occurrences of the byte given.
+    #[inline]
+    pub fn new(needle: u8) -> One {
+        One { s1: needle, v1: splat(needle) }
+    }
+
+    /// A test-only routine so that we can bundle a bunch of quickcheck
+    /// properties into a single macro. Basically, this provides a constructor
+    /// that makes it identical to most other memchr implementations, which
+    /// have fallible constructors.
+    #[cfg(test)]
+    pub(crate) fn try_new(needle: u8) -> Option<One> {
+        Some(One::new(needle))
+    }
+
+    /// Return the first occurrence of the needle in the given haystack. If no
+    /// such occurrence exists, then `None` is returned.
+    ///
+    /// The occurrence is reported as an offset into `haystack`. Its maximum
+    /// value for a non-empty haystack is `haystack.len() - 1`.
+    #[inline]
+    pub fn find(&self, haystack: &[u8]) -> Option<usize> {
+        // SAFETY: `find_raw` guarantees that if a pointer is returned, it
+        // falls within the bounds of the start and end pointers.
+        unsafe {
+            generic::search_slice_with_raw(haystack, |s, e| {
+                self.find_raw(s, e)
+            })
+        }
+    }
+
+    /// Return the last occurrence of the needle in the given haystack. If no
+    /// such occurrence exists, then `None` is returned.
+    ///
+    /// The occurrence is reported as an offset into `haystack`. Its maximum
+    /// value for a non-empty haystack is `haystack.len() - 1`.
+    #[inline]
+    pub fn rfind(&self, haystack: &[u8]) -> Option<usize> {
+        // SAFETY: `rfind_raw` guarantees that if a pointer is returned, it
+        // falls within the bounds of the start and end pointers.
+        unsafe {
+            generic::search_slice_with_raw(haystack, |s, e| {
+                self.rfind_raw(s, e)
+            })
+        }
+    }
+
+    /// Counts all occurrences of this byte in the given haystack.
+    #[inline]
+    pub fn count(&self, haystack: &[u8]) -> usize {
+        // SAFETY: All of our pointers are derived directly from a borrowed
+        // slice, which is guaranteed to be valid.
+        unsafe {
+            let start = haystack.as_ptr();
+            let end = start.add(haystack.len());
+            self.count_raw(start, end)
+        }
+    }
+
+    /// Like `find`, but accepts and returns raw pointers.
+    ///
+    /// When a match is found, the pointer returned is guaranteed to be
+    /// `>= start` and `< end`.
+    ///
+    /// This routine is useful if you're already using raw pointers and would
+    /// like to avoid converting back to a slice before executing a search.
+    ///
+    /// # Safety
+    ///
+    /// * Both `start` and `end` must be valid for reads.
+    /// * Both `start` and `end` must point to an initialized value.
+    /// * Both `start` and `end` must point to the same allocated object and
+    /// must either be in bounds or at most one byte past the end of the
+    /// allocated object.
+    /// * Both `start` and `end` must be _derived from_ a pointer to the same
+    /// object.
+    /// * The distance between `start` and `end` must not overflow `isize`.
+    /// * The distance being in bounds must not rely on "wrapping around" the
+    /// address space.
+    ///
+    /// Note that callers may pass a pair of pointers such that `start >= end`.
+    /// In that case, `None` will always be returned.
+    #[inline]
+    pub unsafe fn find_raw(
+        &self,
+        start: *const u8,
+        end: *const u8,
+    ) -> Option<*const u8> {
+        if start >= end {
+            return None;
+        }
+        let confirm = |b| self.confirm(b);
+        let len = end.distance(start);
+        // Too short for even one word-sized load: scan byte by byte.
+        if len < USIZE_BYTES {
+            return generic::fwd_byte_by_byte(start, end, confirm);
+        }
+
+        // The start of the search may not be aligned to `*const usize`,
+        // so we do an unaligned load here.
+        let chunk = start.cast::<usize>().read_unaligned();
+        if self.has_needle(chunk) {
+            return generic::fwd_byte_by_byte(start, end, confirm);
+        }
+
+        // And now we start our search at a guaranteed aligned position.
+        // The first iteration of the loop below will overlap with the
+        // unaligned chunk above in cases where the search starts at an
+        // unaligned offset, but that's okay as we're only here if that
+        // above didn't find a match.
+        let mut cur =
+            start.add(USIZE_BYTES - (start.as_usize() & USIZE_ALIGN));
+        debug_assert!(cur > start);
+        if len <= One::LOOP_BYTES {
+            return generic::fwd_byte_by_byte(cur, end, confirm);
+        }
+        debug_assert!(end.sub(One::LOOP_BYTES) >= start);
+        // Examine two aligned words per iteration for as long as a full
+        // `LOOP_BYTES` window fits before `end`.
+        while cur <= end.sub(One::LOOP_BYTES) {
+            debug_assert_eq!(0, cur.as_usize() % USIZE_BYTES);
+
+            let a = cur.cast::<usize>().read();
+            let b = cur.add(USIZE_BYTES).cast::<usize>().read();
+            if self.has_needle(a) || self.has_needle(b) {
+                break;
+            }
+            cur = cur.add(One::LOOP_BYTES);
+        }
+        // Either a word flagged a candidate or fewer than `LOOP_BYTES`
+        // bytes remain; in both cases finish with a byte-wise scan.
+        generic::fwd_byte_by_byte(cur, end, confirm)
+    }
+
+    /// Like `rfind`, but accepts and returns raw pointers.
+    ///
+    /// When a match is found, the pointer returned is guaranteed to be
+    /// `>= start` and `< end`.
+    ///
+    /// This routine is useful if you're already using raw pointers and would
+    /// like to avoid converting back to a slice before executing a search.
+    ///
+    /// # Safety
+    ///
+    /// * Both `start` and `end` must be valid for reads.
+    /// * Both `start` and `end` must point to an initialized value.
+    /// * Both `start` and `end` must point to the same allocated object and
+    /// must either be in bounds or at most one byte past the end of the
+    /// allocated object.
+    /// * Both `start` and `end` must be _derived from_ a pointer to the same
+    /// object.
+    /// * The distance between `start` and `end` must not overflow `isize`.
+    /// * The distance being in bounds must not rely on "wrapping around" the
+    /// address space.
+    ///
+    /// Note that callers may pass a pair of pointers such that `start >= end`.
+    /// In that case, `None` will always be returned.
+    #[inline]
+    pub unsafe fn rfind_raw(
+        &self,
+        start: *const u8,
+        end: *const u8,
+    ) -> Option<*const u8> {
+        if start >= end {
+            return None;
+        }
+        let confirm = |b| self.confirm(b);
+        let len = end.distance(start);
+        // Too short for even one word-sized load: scan byte by byte.
+        if len < USIZE_BYTES {
+            return generic::rev_byte_by_byte(start, end, confirm);
+        }
+
+        // The end of the search may not be aligned to `*const usize`, so
+        // the last word of the haystack is read with an unaligned load.
+        let chunk = end.sub(USIZE_BYTES).cast::<usize>().read_unaligned();
+        if self.has_needle(chunk) {
+            return generic::rev_byte_by_byte(start, end, confirm);
+        }
+
+        // Round `end` down to an aligned address. The bytes in `cur..end`
+        // were all covered by the unaligned load above, which did not flag
+        // a candidate, so the remaining search may stop at `cur`.
+        let mut cur = end.sub(end.as_usize() & USIZE_ALIGN);
+        debug_assert!(start <= cur && cur <= end);
+        if len <= One::LOOP_BYTES {
+            return generic::rev_byte_by_byte(start, cur, confirm);
+        }
+        // Examine two aligned words per iteration for as long as a full
+        // `LOOP_BYTES` window fits after `start`.
+        while cur >= start.add(One::LOOP_BYTES) {
+            debug_assert_eq!(0, cur.as_usize() % USIZE_BYTES);
+
+            let a = cur.sub(2 * USIZE_BYTES).cast::<usize>().read();
+            let b = cur.sub(1 * USIZE_BYTES).cast::<usize>().read();
+            if self.has_needle(a) || self.has_needle(b) {
+                break;
+            }
+            cur = cur.sub(One::LOOP_BYTES);
+        }
+        generic::rev_byte_by_byte(start, cur, confirm)
+    }
+
+    /// Counts all occurrences of this byte in the given haystack represented
+    /// by raw pointers.
+    ///
+    /// This routine is useful if you're already using raw pointers and would
+    /// like to avoid converting back to a slice before executing a search.
+    ///
+    /// # Safety
+    ///
+    /// * Both `start` and `end` must be valid for reads.
+    /// * Both `start` and `end` must point to an initialized value.
+    /// * Both `start` and `end` must point to the same allocated object and
+    /// must either be in bounds or at most one byte past the end of the
+    /// allocated object.
+    /// * Both `start` and `end` must be _derived from_ a pointer to the same
+    /// object.
+    /// * The distance between `start` and `end` must not overflow `isize`.
+    /// * The distance being in bounds must not rely on "wrapping around" the
+    /// address space.
+    ///
+    /// Note that callers may pass a pair of pointers such that `start >= end`.
+    /// In that case, `0` will always be returned.
+    #[inline]
+    pub unsafe fn count_raw(&self, start: *const u8, end: *const u8) -> usize {
+        if start >= end {
+            return 0;
+        }
+        // Sadly I couldn't get the SWAR approach to work here, so we just do
+        // one byte at a time for now. PRs to improve this are welcome.
+        let mut ptr = start;
+        let mut count = 0;
+        while ptr < end {
+            count += (ptr.read() == self.s1) as usize;
+            // Only ever steps forward, so use the unsigned `add` rather
+            // than the signed `offset`.
+            ptr = ptr.add(1);
+        }
+        count
+    }
+
+    /// Returns an iterator over all occurrences of the needle byte in the
+    /// given haystack.
+    ///
+    /// The iterator returned implements `DoubleEndedIterator`. This means it
+    /// can also be used to find occurrences in reverse order.
+    pub fn iter<'a, 'h>(&'a self, haystack: &'h [u8]) -> OneIter<'a, 'h> {
+        OneIter { searcher: self, it: generic::Iter::new(haystack) }
+    }
+
+    /// Word-level test: XORs `chunk` with the splatted needle and asks
+    /// `has_zero_byte` about the result. Callers follow a positive answer
+    /// with a byte-wise scan using `confirm`.
+    #[inline(always)]
+    fn has_needle(&self, chunk: usize) -> bool {
+        has_zero_byte(self.v1 ^ chunk)
+    }
+
+    /// Byte-level test: returns true when `haystack_byte` is the needle.
+    #[inline(always)]
+    fn confirm(&self, haystack_byte: u8) -> bool {
+        self.s1 == haystack_byte
+    }
+}
+
+/// An iterator over all occurrences of a single byte in a haystack.
+///
+/// This iterator implements `DoubleEndedIterator`, which means it can also be
+/// used to find occurrences in reverse order.
+///
+/// This iterator is created by the [`One::iter`] method.
+///
+/// The lifetime parameters are as follows:
+///
+/// * `'a` refers to the lifetime of the underlying [`One`] searcher.
+/// * `'h` refers to the lifetime of the haystack being searched.
+#[derive(Clone, Debug)]
+pub struct OneIter<'a, 'h> {
+ /// The underlying memchr searcher whose raw routines drive the iteration.
+ searcher: &'a One,
+ /// Generic iterator implementation, constructed over the haystack by
+ /// [`One::iter`].
+ it: generic::Iter<'h>,
+}
+
+impl<'a, 'h> Iterator for OneIter<'a, 'h> {
+    type Item = usize;
+
+    /// Advances from the front by delegating to the generic iterator with
+    /// the searcher's forward raw search.
+    #[inline]
+    fn next(&mut self) -> Option<usize> {
+        let searcher = self.searcher;
+        // SAFETY: The generic iterator is trusted to hand us valid start
+        // and end pointers, and in turn 'find_raw' only ever returns a
+        // pointer that lies within those two bounds.
+        unsafe { self.it.next(|start, end| searcher.find_raw(start, end)) }
+    }
+
+    /// Counts the remaining matches by delegating to the searcher's raw
+    /// counting routine.
+    #[inline]
+    fn count(self) -> usize {
+        let OneIter { searcher, it } = self;
+        it.count(|start, end| {
+            // SAFETY: The generic iterator is trusted to hand us valid
+            // start and end pointers.
+            unsafe { searcher.count_raw(start, end) }
+        })
+    }
+
+    /// Forwards the size hint of the generic iterator unchanged.
+    #[inline]
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        self.it.size_hint()
+    }
+}
+
+impl<'a, 'h> DoubleEndedIterator for OneIter<'a, 'h> {
+    /// Advances from the back by delegating to the generic iterator with
+    /// the searcher's reverse raw search.
+    #[inline]
+    fn next_back(&mut self) -> Option<usize> {
+        let searcher = self.searcher;
+        // SAFETY: The generic iterator is trusted to hand us valid start
+        // and end pointers, and in turn 'rfind_raw' only ever returns a
+        // pointer that lies within those two bounds.
+        unsafe {
+            self.it.next_back(|start, end| searcher.rfind_raw(start, end))
+        }
+    }
+}
+
+/// Finds all occurrences of two bytes in a haystack.
+///
+/// That is, this reports matches of one of two possible bytes. For example,
+/// searching for `a` or `b` in `afoobar` would report matches at offsets `0`,
+/// `4` and `5`.
+#[derive(Clone, Copy, Debug)]
+pub struct Two {
+ /// The first needle byte, compared against individual haystack bytes.
+ s1: u8,
+ /// The second needle byte, compared against individual haystack bytes.
+ s2: u8,
+ /// The result of `splat(needle1)`, XORed with word-sized chunks.
+ v1: usize,
+ /// The result of `splat(needle2)`, XORed with word-sized chunks.
+ v2: usize,
+}
+
+impl Two {
+ /// Create a new searcher that finds occurrences of the two needle bytes
+ /// given.
+ #[inline]
+ pub fn new(needle1: u8, needle2: u8) -> Two {
+ Two {
+ s1: needle1,
+ s2: needle2,
+ v1: splat(needle1),
+ v2: splat(needle2),
+ }
+ }
+
+ /// A test-only routine so that we can bundle a bunch of quickcheck
+ /// properties into a single macro. Basically, this provides a constructor
+ /// that makes it identical to most other memchr implementations, which
+ /// have fallible constructors.
+ #[cfg(test)]
+ pub(crate) fn try_new(needle1: u8, needle2: u8) -> Option<Two> {
+ Some(Two::new(needle1, needle2))
+ }
+
+ /// Return the first occurrence of one of the needle bytes in the given
+ /// haystack. If no such occurrence exists, then `None` is returned.
+ ///
+ /// The occurrence is reported as an offset into `haystack`. Its maximum
+ /// value for a non-empty haystack is `haystack.len() - 1`.
+ #[inline]
+ pub fn find(&self, haystack: &[u8]) -> Option<usize> {
+ // SAFETY: `find_raw` guarantees that if a pointer is returned, it
+ // falls within the bounds of the start and end pointers.
+ unsafe {
+ generic::search_slice_with_raw(haystack, |s, e| {
+ self.find_raw(s, e)
+ })
+ }
+ }
+
+ /// Return the last occurrence of one of the needle bytes in the given
+ /// haystack. If no such occurrence exists, then `None` is returned.
+ ///
+ /// The occurrence is reported as an offset into `haystack`. Its maximum
+ /// value for a non-empty haystack is `haystack.len() - 1`.
+ #[inline]
+ pub fn rfind(&self, haystack: &[u8]) -> Option<usize> {
+ // SAFETY: `rfind_raw` guarantees that if a pointer is returned, it
+ // falls within the bounds of the start and end pointers.
+ unsafe {
+ generic::search_slice_with_raw(haystack, |s, e| {
+ self.rfind_raw(s, e)
+ })
+ }
+ }
+
+ /// Like `find`, but accepts and returns raw pointers.
+ ///
+ /// When a match is found, the pointer returned is guaranteed to be
+ /// `>= start` and `< end`.
+ ///
+ /// This routine is useful if you're already using raw pointers and would
+ /// like to avoid converting back to a slice before executing a search.
+ ///
+ /// # Safety
+ ///
+ /// * Both `start` and `end` must be valid for reads.
+ /// * Both `start` and `end` must point to an initialized value.
+ /// * Both `start` and `end` must point to the same allocated object and
+ /// must either be in bounds or at most one byte past the end of the
+ /// allocated object.
+ /// * Both `start` and `end` must be _derived from_ a pointer to the same
+ /// object.
+ /// * The distance between `start` and `end` must not overflow `isize`.
+ /// * The distance being in bounds must not rely on "wrapping around" the
+ /// address space.
+ ///
+ /// Note that callers may pass a pair of pointers such that `start >= end`.
+ /// In that case, `None` will always be returned.
+ #[inline]
+ pub unsafe fn find_raw(
+ &self,
+ start: *const u8,
+ end: *const u8,
+ ) -> Option<*const u8> {
+ if start >= end {
+ return None;
+ }
+ let confirm = |b| self.confirm(b);
+ let len = end.distance(start);
+ // Too short for even one word-sized load: scan byte by byte.
+ if len < USIZE_BYTES {
+ return generic::fwd_byte_by_byte(start, end, confirm);
+ }
+
+ // The start of the search may not be aligned to `*const usize`,
+ // so we do an unaligned load here.
+ let chunk = start.cast::<usize>().read_unaligned();
+ if self.has_needle(chunk) {
+ return generic::fwd_byte_by_byte(start, end, confirm);
+ }
+
+ // And now we start our search at a guaranteed aligned position.
+ // The first iteration of the loop below will overlap with the
+ // unaligned chunk above in cases where the search starts at an
+ // unaligned offset, but that's okay as we're only here if that
+ // above didn't find a match.
+ let mut cur =
+ start.add(USIZE_BYTES - (start.as_usize() & USIZE_ALIGN));
+ debug_assert!(cur > start);
+ debug_assert!(end.sub(USIZE_BYTES) >= start);
+ // One aligned word per iteration, for as long as a whole word fits
+ // before `end`.
+ while cur <= end.sub(USIZE_BYTES) {
+ debug_assert_eq!(0, cur.as_usize() % USIZE_BYTES);
+
+ let chunk = cur.cast::<usize>().read();
+ if self.has_needle(chunk) {
+ break;
+ }
+ cur = cur.add(USIZE_BYTES);
+ }
+ generic::fwd_byte_by_byte(cur, end, confirm)
+ }
+
+ /// Like `rfind`, but accepts and returns raw pointers.
+ ///
+ /// When a match is found, the pointer returned is guaranteed to be
+ /// `>= start` and `< end`.
+ ///
+ /// This routine is useful if you're already using raw pointers and would
+ /// like to avoid converting back to a slice before executing a search.
+ ///
+ /// # Safety
+ ///
+ /// * Both `start` and `end` must be valid for reads.
+ /// * Both `start` and `end` must point to an initialized value.
+ /// * Both `start` and `end` must point to the same allocated object and
+ /// must either be in bounds or at most one byte past the end of the
+ /// allocated object.
+ /// * Both `start` and `end` must be _derived from_ a pointer to the same
+ /// object.
+ /// * The distance between `start` and `end` must not overflow `isize`.
+ /// * The distance being in bounds must not rely on "wrapping around" the
+ /// address space.
+ ///
+ /// Note that callers may pass a pair of pointers such that `start >= end`.
+ /// In that case, `None` will always be returned.
+ #[inline]
+ pub unsafe fn rfind_raw(
+ &self,
+ start: *const u8,
+ end: *const u8,
+ ) -> Option<*const u8> {
+ if start >= end {
+ return None;
+ }
+ let confirm = |b| self.confirm(b);
+ let len = end.distance(start);
+ // Too short for even one word-sized load: scan byte by byte.
+ if len < USIZE_BYTES {
+ return generic::rev_byte_by_byte(start, end, confirm);
+ }
+
+ // The end of the search may not be aligned to `*const usize`, so
+ // the last word of the haystack is read with an unaligned load.
+ let chunk = end.sub(USIZE_BYTES).cast::<usize>().read_unaligned();
+ if self.has_needle(chunk) {
+ return generic::rev_byte_by_byte(start, end, confirm);
+ }
+
+ // Round `end` down to an aligned address. The bytes in `cur..end`
+ // were all covered by the unaligned load above, which did not flag
+ // a candidate, so the remaining search may stop at `cur`.
+ let mut cur = end.sub(end.as_usize() & USIZE_ALIGN);
+ debug_assert!(start <= cur && cur <= end);
+ while cur >= start.add(USIZE_BYTES) {
+ debug_assert_eq!(0, cur.as_usize() % USIZE_BYTES);
+
+ let chunk = cur.sub(USIZE_BYTES).cast::<usize>().read();
+ if self.has_needle(chunk) {
+ break;
+ }
+ cur = cur.sub(USIZE_BYTES);
+ }
+ generic::rev_byte_by_byte(start, cur, confirm)
+ }
+
+ /// Returns an iterator over all occurrences of one of the needle bytes in
+ /// the given haystack.
+ ///
+ /// The iterator returned implements `DoubleEndedIterator`. This means it
+ /// can also be used to find occurrences in reverse order.
+ pub fn iter<'a, 'h>(&'a self, haystack: &'h [u8]) -> TwoIter<'a, 'h> {
+ TwoIter { searcher: self, it: generic::Iter::new(haystack) }
+ }
+
+ /// Word-level test: XORs `chunk` with each splatted needle and asks
+ /// `has_zero_byte` about either result. Callers follow a positive answer
+ /// with a byte-wise scan using `confirm`.
+ #[inline(always)]
+ fn has_needle(&self, chunk: usize) -> bool {
+ has_zero_byte(self.v1 ^ chunk) || has_zero_byte(self.v2 ^ chunk)
+ }
+
+ /// Byte-level test: returns true when `haystack_byte` is either needle.
+ #[inline(always)]
+ fn confirm(&self, haystack_byte: u8) -> bool {
+ self.s1 == haystack_byte || self.s2 == haystack_byte
+ }
+}
+
+/// An iterator over all occurrences of two possible bytes in a haystack.
+///
+/// This iterator implements `DoubleEndedIterator`, which means it can also be
+/// used to find occurrences in reverse order.
+///
+/// This iterator is created by the [`Two::iter`] method.
+///
+/// The lifetime parameters are as follows:
+///
+/// * `'a` refers to the lifetime of the underlying [`Two`] searcher.
+/// * `'h` refers to the lifetime of the haystack being searched.
+#[derive(Clone, Debug)]
+pub struct TwoIter<'a, 'h> {
+ /// The underlying memchr searcher.
+ searcher: &'a Two,
+ /// Generic iterator implementation.
+ it: generic::Iter<'h>,
+}
+
+impl<'a, 'h> Iterator for TwoIter<'a, 'h> {
+ type Item = usize;
+
+ #[inline]
+ fn next(&mut self) -> Option<usize> {
+ // SAFETY: We rely on the generic iterator to provide valid start
+ // and end pointers, but we guarantee that any pointer returned by
+ // 'find_raw' falls within the bounds of the start and end pointer.
+ unsafe { self.it.next(|s, e| self.searcher.find_raw(s, e)) }
+ }
+
+ #[inline]
+ fn size_hint(&self) -> (usize, Option<usize>) {
+ self.it.size_hint()
+ }
+}
+
+impl<'a, 'h> DoubleEndedIterator for TwoIter<'a, 'h> {
+ #[inline]
+ fn next_back(&mut self) -> Option<usize> {
+ // SAFETY: We rely on the generic iterator to provide valid start
+ // and end pointers, but we guarantee that any pointer returned by
+ // 'rfind_raw' falls within the bounds of the start and end pointer.
+ unsafe { self.it.next_back(|s, e| self.searcher.rfind_raw(s, e)) }
+ }
+}
+
/// A searcher for any one of three bytes in a haystack.
///
/// A match is reported wherever the haystack holds one of the three needle
/// bytes. For example, looking for `a`, `b` or `o` in `afoobar` reports
/// matches at offsets `0`, `2`, `3`, `4` and `5`.
#[derive(Clone, Copy, Debug)]
pub struct Three {
    // The needle bytes themselves, used for exact per-byte confirmation.
    s1: u8,
    s2: u8,
    s3: u8,
    // Each needle repeated in every byte of a word (see `Three::new`),
    // used for the word-at-a-time scan.
    v1: usize,
    v2: usize,
    v3: usize,
}
+
+impl Three {
+ /// Create a new searcher that finds occurrences of the three needle bytes
+ /// given.
+ #[inline]
+ pub fn new(needle1: u8, needle2: u8, needle3: u8) -> Three {
+ Three {
+ s1: needle1,
+ s2: needle2,
+ s3: needle3,
+ v1: splat(needle1),
+ v2: splat(needle2),
+ v3: splat(needle3),
+ }
+ }
+
+ /// A test-only routine so that we can bundle a bunch of quickcheck
+ /// properties into a single macro. Basically, this provides a constructor
+ /// that makes it identical to most other memchr implementations, which
+ /// have fallible constructors.
+ #[cfg(test)]
+ pub(crate) fn try_new(
+ needle1: u8,
+ needle2: u8,
+ needle3: u8,
+ ) -> Option<Three> {
+ Some(Three::new(needle1, needle2, needle3))
+ }
+
+ /// Return the first occurrence of one of the needle bytes in the given
+ /// haystack. If no such occurrence exists, then `None` is returned.
+ ///
+ /// The occurrence is reported as an offset into `haystack`. Its maximum
+ /// value for a non-empty haystack is `haystack.len() - 1`.
+ #[inline]
+ pub fn find(&self, haystack: &[u8]) -> Option<usize> {
+ // SAFETY: `find_raw` guarantees that if a pointer is returned, it
+ // falls within the bounds of the start and end pointers.
+ unsafe {
+ generic::search_slice_with_raw(haystack, |s, e| {
+ self.find_raw(s, e)
+ })
+ }
+ }
+
+ /// Return the last occurrence of one of the needle bytes in the given
+ /// haystack. If no such occurrence exists, then `None` is returned.
+ ///
+ /// The occurrence is reported as an offset into `haystack`. Its maximum
+ /// value for a non-empty haystack is `haystack.len() - 1`.
+ #[inline]
+ pub fn rfind(&self, haystack: &[u8]) -> Option<usize> {
+ // SAFETY: `find_raw` guarantees that if a pointer is returned, it
+ // falls within the bounds of the start and end pointers.
+ unsafe {
+ generic::search_slice_with_raw(haystack, |s, e| {
+ self.rfind_raw(s, e)
+ })
+ }
+ }
+
+ /// Like `find`, but accepts and returns raw pointers.
+ ///
+ /// When a match is found, the pointer returned is guaranteed to be
+ /// `>= start` and `< end`.
+ ///
+ /// This routine is useful if you're already using raw pointers and would
+ /// like to avoid converting back to a slice before executing a search.
+ ///
+ /// # Safety
+ ///
+ /// * Both `start` and `end` must be valid for reads.
+ /// * Both `start` and `end` must point to an initialized value.
+ /// * Both `start` and `end` must point to the same allocated object and
+ /// must either be in bounds or at most one byte past the end of the
+ /// allocated object.
+ /// * Both `start` and `end` must be _derived from_ a pointer to the same
+ /// object.
+ /// * The distance between `start` and `end` must not overflow `isize`.
+ /// * The distance being in bounds must not rely on "wrapping around" the
+ /// address space.
+ ///
+ /// Note that callers may pass a pair of pointers such that `start >= end`.
+ /// In that case, `None` will always be returned.
+ #[inline]
+ pub unsafe fn find_raw(
+ &self,
+ start: *const u8,
+ end: *const u8,
+ ) -> Option<*const u8> {
+ if start >= end {
+ return None;
+ }
+ let confirm = |b| self.confirm(b);
+ let len = end.distance(start);
+ if len < USIZE_BYTES {
+ return generic::fwd_byte_by_byte(start, end, confirm);
+ }
+
+ // The start of the search may not be aligned to `*const usize`,
+ // so we do an unaligned load here.
+ let chunk = start.cast::<usize>().read_unaligned();
+ if self.has_needle(chunk) {
+ return generic::fwd_byte_by_byte(start, end, confirm);
+ }
+
+ // And now we start our search at a guaranteed aligned position.
+ // The first iteration of the loop below will overlap with the the
+ // unaligned chunk above in cases where the search starts at an
+ // unaligned offset, but that's okay as we're only here if that
+ // above didn't find a match.
+ let mut cur =
+ start.add(USIZE_BYTES - (start.as_usize() & USIZE_ALIGN));
+ debug_assert!(cur > start);
+ debug_assert!(end.sub(USIZE_BYTES) >= start);
+ while cur <= end.sub(USIZE_BYTES) {
+ debug_assert_eq!(0, cur.as_usize() % USIZE_BYTES);
+
+ let chunk = cur.cast::<usize>().read();
+ if self.has_needle(chunk) {
+ break;
+ }
+ cur = cur.add(USIZE_BYTES);
+ }
+ generic::fwd_byte_by_byte(cur, end, confirm)
+ }
+
+ /// Like `rfind`, but accepts and returns raw pointers.
+ ///
+ /// When a match is found, the pointer returned is guaranteed to be
+ /// `>= start` and `< end`.
+ ///
+ /// This routine is useful if you're already using raw pointers and would
+ /// like to avoid converting back to a slice before executing a search.
+ ///
+ /// # Safety
+ ///
+ /// * Both `start` and `end` must be valid for reads.
+ /// * Both `start` and `end` must point to an initialized value.
+ /// * Both `start` and `end` must point to the same allocated object and
+ /// must either be in bounds or at most one byte past the end of the
+ /// allocated object.
+ /// * Both `start` and `end` must be _derived from_ a pointer to the same
+ /// object.
+ /// * The distance between `start` and `end` must not overflow `isize`.
+ /// * The distance being in bounds must not rely on "wrapping around" the
+ /// address space.
+ ///
+ /// Note that callers may pass a pair of pointers such that `start >= end`.
+ /// In that case, `None` will always be returned.
+ #[inline]
+ pub unsafe fn rfind_raw(
+ &self,
+ start: *const u8,
+ end: *const u8,
+ ) -> Option<*const u8> {
+ if start >= end {
+ return None;
+ }
+ let confirm = |b| self.confirm(b);
+ let len = end.distance(start);
+ if len < USIZE_BYTES {
+ return generic::rev_byte_by_byte(start, end, confirm);
+ }
+
+ let chunk = end.sub(USIZE_BYTES).cast::<usize>().read_unaligned();
+ if self.has_needle(chunk) {
+ return generic::rev_byte_by_byte(start, end, confirm);
+ }
+
+ let mut cur = end.sub(end.as_usize() & USIZE_ALIGN);
+ debug_assert!(start <= cur && cur <= end);
+ while cur >= start.add(USIZE_BYTES) {
+ debug_assert_eq!(0, cur.as_usize() % USIZE_BYTES);
+
+ let chunk = cur.sub(USIZE_BYTES).cast::<usize>().read();
+ if self.has_needle(chunk) {
+ break;
+ }
+ cur = cur.sub(USIZE_BYTES);
+ }
+ generic::rev_byte_by_byte(start, cur, confirm)
+ }
+
+ /// Returns an iterator over all occurrences of one of the needle bytes in
+ /// the given haystack.
+ ///
+ /// The iterator returned implements `DoubleEndedIterator`. This means it
+ /// can also be used to find occurrences in reverse order.
+ pub fn iter<'a, 'h>(&'a self, haystack: &'h [u8]) -> ThreeIter<'a, 'h> {
+ ThreeIter { searcher: self, it: generic::Iter::new(haystack) }
+ }
+
+ #[inline(always)]
+ fn has_needle(&self, chunk: usize) -> bool {
+ has_zero_byte(self.v1 ^ chunk)
+ || has_zero_byte(self.v2 ^ chunk)
+ || has_zero_byte(self.v3 ^ chunk)
+ }
+
+ #[inline(always)]
+ fn confirm(&self, haystack_byte: u8) -> bool {
+ self.s1 == haystack_byte
+ || self.s2 == haystack_byte
+ || self.s3 == haystack_byte
+ }
+}
+
+/// An iterator over all occurrences of three possible bytes in a haystack.
+///
+/// This iterator implements `DoubleEndedIterator`, which means it can also be
+/// used to find occurrences in reverse order.
+///
+/// This iterator is created by the [`Three::iter`] method.
+///
+/// The lifetime parameters are as follows:
+///
+/// * `'a` refers to the lifetime of the underlying [`Three`] searcher.
+/// * `'h` refers to the lifetime of the haystack being searched.
+#[derive(Clone, Debug)]
+pub struct ThreeIter<'a, 'h> {
+ /// The underlying memchr searcher.
+ searcher: &'a Three,
+ /// Generic iterator implementation.
+ it: generic::Iter<'h>,
+}
+
+impl<'a, 'h> Iterator for ThreeIter<'a, 'h> {
+ type Item = usize;
+
+ #[inline]
+ fn next(&mut self) -> Option<usize> {
+ // SAFETY: We rely on the generic iterator to provide valid start
+ // and end pointers, but we guarantee that any pointer returned by
+ // 'find_raw' falls within the bounds of the start and end pointer.
+ unsafe { self.it.next(|s, e| self.searcher.find_raw(s, e)) }
+ }
+
+ #[inline]
+ fn size_hint(&self) -> (usize, Option<usize>) {
+ self.it.size_hint()
+ }
+}
+
+impl<'a, 'h> DoubleEndedIterator for ThreeIter<'a, 'h> {
+ #[inline]
+ fn next_back(&mut self) -> Option<usize> {
+ // SAFETY: We rely on the generic iterator to provide valid start
+ // and end pointers, but we guarantee that any pointer returned by
+ // 'rfind_raw' falls within the bounds of the start and end pointer.
+ unsafe { self.it.next_back(|s, e| self.searcher.rfind_raw(s, e)) }
+ }
+}
+
/// Report whether any byte of `x` is zero.
///
/// `x` is viewed as a row of 8-bit lanes; the result is `true` as soon as
/// one of those lanes is `0`.
///
/// From "Matters Computational" by J. Arndt.
#[inline(always)]
fn has_zero_byte(x: usize) -> bool {
    // 0x01 repeated in every byte of a word.
    const ONES: usize = usize::MAX / 255;
    // 0x80 repeated in every byte of a word.
    const HIGH_BITS: usize = ONES << 7;

    // "The idea is to subtract one from each of the bytes and then look for
    // bytes where the borrow propagated all the way to the most significant
    // bit."
    let borrowed = x.wrapping_sub(ONES);
    (borrowed & !x & HIGH_BITS) != 0
}
+
/// Broadcast `b` into every byte of a word-sized integer.
///
/// For instance, with `b` equal to `\x4E` (`01001110` in binary), the value
/// returned on a 32-bit system is
/// `01001110_01001110_01001110_01001110`.
#[inline(always)]
const fn splat(b: u8) -> usize {
    // The word with 0x01 in each byte; multiplying it by a byte value copies
    // that value into every lane.
    const ONE_PER_LANE: usize = usize::MAX / 255;
    // TODO: use `usize::from` once it can be used in const context.
    ONE_PER_LANE * (b as usize)
}
+
#[cfg(test)]
mod tests {
    use super::*;

    define_memchr_quickcheck!(super, try_new);

    #[test]
    fn forward_one() {
        crate::tests::memchr::Runner::new(1).forward_iter(|hay, bytes| {
            let searcher = One::new(bytes[0]);
            Some(searcher.iter(hay).collect())
        })
    }

    #[test]
    fn reverse_one() {
        crate::tests::memchr::Runner::new(1).reverse_iter(|hay, bytes| {
            let searcher = One::new(bytes[0]);
            Some(searcher.iter(hay).rev().collect())
        })
    }

    #[test]
    fn count_one() {
        crate::tests::memchr::Runner::new(1).count_iter(|hay, bytes| {
            let searcher = One::new(bytes[0]);
            Some(searcher.iter(hay).count())
        })
    }

    #[test]
    fn forward_two() {
        crate::tests::memchr::Runner::new(2).forward_iter(|hay, bytes| {
            let b1 = bytes.get(0).copied()?;
            let b2 = bytes.get(1).copied()?;
            let searcher = Two::new(b1, b2);
            Some(searcher.iter(hay).collect())
        })
    }

    #[test]
    fn reverse_two() {
        crate::tests::memchr::Runner::new(2).reverse_iter(|hay, bytes| {
            let b1 = bytes.get(0).copied()?;
            let b2 = bytes.get(1).copied()?;
            let searcher = Two::new(b1, b2);
            Some(searcher.iter(hay).rev().collect())
        })
    }

    #[test]
    fn forward_three() {
        crate::tests::memchr::Runner::new(3).forward_iter(|hay, bytes| {
            let b1 = bytes.get(0).copied()?;
            let b2 = bytes.get(1).copied()?;
            let b3 = bytes.get(2).copied()?;
            let searcher = Three::new(b1, b2, b3);
            Some(searcher.iter(hay).collect())
        })
    }

    #[test]
    fn reverse_three() {
        crate::tests::memchr::Runner::new(3).reverse_iter(|hay, bytes| {
            let b1 = bytes.get(0).copied()?;
            let b2 = bytes.get(1).copied()?;
            let b3 = bytes.get(2).copied()?;
            let searcher = Three::new(b1, b2, b3);
            Some(searcher.iter(hay).rev().collect())
        })
    }

    // Quickcheck turned this case up while the crate was being refactored
    // after memchr 2.5.0: once the only match has been taken from the
    // front, nothing may be left to take from the back.
    #[test]
    fn regression_double_ended_iterator() {
        let haystack = "a";
        let finder = One::new(b'a');
        let mut matches = finder.iter(haystack.as_bytes());
        assert_eq!(Some(0), matches.next());
        assert_eq!(None, matches.next_back());
    }

    // ripgrep's test suite caught this on i686 during the upgrade to memchr
    // 2.6: something about the \x0B bytes in this haystack tripped up the
    // SWAR counting approach that was in use. As a result that approach was
    // dropped in favor of a plain byte-at-a-time loop.
    #[test]
    fn regression_count_new_lines() {
        let haystack = "01234567\x0b\n\x0b\n\x0b\n\x0b\nx";
        let newlines = One::new(b'\n').count(haystack.as_bytes());
        assert_eq!(4, newlines);
    }
}
+
1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 +95 +96 +97 +98 +99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 +234 +235 +236 +
/*!
+Contains architecture independent routines.
+
+These routines are often used as a "fallback" implementation when the more
+specialized architecture dependent routines are unavailable.
+*/
+
+pub mod memchr;
+pub mod packedpair;
+pub mod rabinkarp;
+#[cfg(feature = "alloc")]
+pub mod shiftor;
+pub mod twoway;
+
+/// Returns true if and only if `needle` is a prefix of `haystack`.
+///
+/// This uses a latency optimized variant of `memcmp` internally which *might*
+/// make this faster for very short strings.
+///
+/// # Inlining
+///
+/// This routine is marked `inline(always)`. If you want to call this function
+/// in a way that is not always inlined, you'll need to wrap a call to it in
+/// another function that is marked as `inline(never)` or just `inline`.
+#[inline(always)]
+pub fn is_prefix(haystack: &[u8], needle: &[u8]) -> bool {
+ needle.len() <= haystack.len()
+ && is_equal(&haystack[..needle.len()], needle)
+}
+
+/// Returns true if and only if `needle` is a suffix of `haystack`.
+///
+/// This uses a latency optimized variant of `memcmp` internally which *might*
+/// make this faster for very short strings.
+///
+/// # Inlining
+///
+/// This routine is marked `inline(always)`. If you want to call this function
+/// in a way that is not always inlined, you'll need to wrap a call to it in
+/// another function that is marked as `inline(never)` or just `inline`.
+#[inline(always)]
+pub fn is_suffix(haystack: &[u8], needle: &[u8]) -> bool {
+ needle.len() <= haystack.len()
+ && is_equal(&haystack[haystack.len() - needle.len()..], needle)
+}
+
+/// Compare corresponding bytes in `x` and `y` for equality.
+///
+/// That is, this returns true if and only if `x.len() == y.len()` and
+/// `x[i] == y[i]` for all `0 <= i < x.len()`.
+///
+/// # Inlining
+///
+/// This routine is marked `inline(always)`. If you want to call this function
+/// in a way that is not always inlined, you'll need to wrap a call to it in
+/// another function that is marked as `inline(never)` or just `inline`.
+///
+/// # Motivation
+///
+/// Why not use slice equality instead? Well, slice equality usually results in
+/// a call out to the current platform's `libc` which might not be inlineable
+/// or have other overhead. This routine isn't guaranteed to be a win, but it
+/// might be in some cases.
+#[inline(always)]
+pub fn is_equal(x: &[u8], y: &[u8]) -> bool {
+ if x.len() != y.len() {
+ return false;
+ }
+ // SAFETY: Our pointers are derived directly from borrowed slices which
+ // uphold all of our safety guarantees except for length. We account for
+ // length with the check above.
+ unsafe { is_equal_raw(x.as_ptr(), y.as_ptr(), x.len()) }
+}
+
/// Tests the `n` bytes behind each of two pointers for equality.
///
/// The result is true if and only if `*x.add(i) == *y.add(i)` for every
/// `0 <= i < n`.
///
/// # Inlining
///
/// This routine is marked `inline(always)`. To call it without forcing
/// inlining, wrap the call in another function that is marked
/// `inline(never)` or just `inline`.
///
/// # Motivation
///
/// Why not plain slice equality? Slice equality usually ends up calling
/// into the platform's `libc`, which might not be inlineable or might carry
/// other overhead. This routine is not guaranteed to be a win, but it might
/// be in some cases.
///
/// # Safety
///
/// * Both `x` and `y` must be valid for reads of up to `n` bytes.
/// * Both `x` and `y` must point to an initialized value.
/// * Both `x` and `y` must each point to an allocated object and
///   must either be in bounds or at most one byte past the end of the
///   allocated object. `x` and `y` do not need to point to the same allocated
///   object, but they may.
/// * Both `x` and `y` must be _derived from_ a pointer to their respective
///   allocated objects.
/// * The distance between `x` and `x+n` must not overflow `isize`. Similarly
///   for `y` and `y+n`.
/// * The distance being in bounds must not rely on "wrapping around" the
///   address space.
#[inline(always)]
pub unsafe fn is_equal_raw(x: *const u8, y: *const u8, n: usize) -> bool {
    // Lengths too short for a 4-byte load each get a dedicated comparison.
    // A byte-at-a-time loop used to live here and turned out to be quite a
    // bit slower for the memmem/pathological/defeat-simple-vector-alphabet
    // benchmark.
    match n {
        0 => return true,
        1 => return x.read() == y.read(),
        2 => {
            let (vx, vy) = (
                x.cast::<u16>().read_unaligned(),
                y.cast::<u16>().read_unaligned(),
            );
            return vx == vy;
        }
        // copy_nonoverlapping was also tried here and the codegen looked
        // the same.
        3 => return x.cast::<[u8; 3]>().read() == y.cast::<[u8; 3]>().read(),
        _ => {}
    }
    // From here on there are at least 4 bytes, compared 4 at a time with
    // unaligned loads.
    //
    // Why 4-byte loads and not, say, 8-byte loads? This memcmp variant is
    // likely to be called with tiny needles, so with 8-byte loads a larger
    // share of calls would land in the slower special cases above. That is
    // a hypothesis only loosely supported by benchmarks, and there is
    // likely room for improvement. The main goal is to optimize for
    // latency, not throughput.

    // SAFETY: The caller guarantees both pointers are valid and readable
    // for at least `n` bytes. Every load below starts at an offset of at
    // most `n - 4` and is 4 bytes wide, and all loads are unaligned, so no
    // alignment needs to be established. (This is justified by this routine
    // being specifically for short strings.)
    let last = n - 4;
    let mut offset = 0;
    while offset < last {
        let vx = x.add(offset).cast::<u32>().read_unaligned();
        let vy = y.add(offset).cast::<u32>().read_unaligned();
        if vx != vy {
            return false;
        }
        offset += 4;
    }
    // The final load covers the last 4 bytes and may overlap the previous
    // one when `n` is not a multiple of 4.
    let vx = x.add(last).cast::<u32>().read_unaligned();
    let vy = y.add(last).cast::<u32>().read_unaligned();
    vx == vy
}
+
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn equals_different_lengths() {
        let pairs = [(&b""[..], &b"a"[..]), (&b"ab"[..], &b"a"[..])];
        for (x, y) in pairs {
            assert!(!is_equal(x, y));
            assert!(!is_equal(y, x));
        }
    }

    #[test]
    fn equals_mismatch() {
        // For every length from 1 to 14, compare a prefix of the alphabet
        // against the same prefix with its final byte replaced by `x`.
        let base = *b"abcdefghijklmn";
        for n in 1..=base.len() {
            let mut altered = base;
            altered[n - 1] = b'x';
            let (x, y) = (&base[..n], &altered[..n]);
            assert_eq!(x.len(), y.len(), "lengths should match");
            assert!(!is_equal(x, y));
            assert!(!is_equal(y, x));
        }
    }

    #[test]
    fn equals_yes() {
        // Every prefix of the alphabet, from empty up to nine bytes, must
        // equal a separate copy of itself.
        let base = *b"abcdefghi";
        let twin = base;
        for n in 0..=base.len() {
            assert!(is_equal(&base[..n], &twin[..n]));
        }
    }

    #[test]
    fn prefix() {
        let yes = [
            (&b""[..], &b""[..]),
            (&b"a"[..], &b""[..]),
            (&b"ab"[..], &b""[..]),
            (&b"foo"[..], &b"foo"[..]),
            (&b"foobar"[..], &b"foo"[..]),
        ];
        for (haystack, needle) in yes {
            assert!(is_prefix(haystack, needle));
        }

        let no = [(&b"foo"[..], &b"fob"[..]), (&b"foobar"[..], &b"fob"[..])];
        for (haystack, needle) in no {
            assert!(!is_prefix(haystack, needle));
        }
    }

    #[test]
    fn suffix() {
        let yes = [
            (&b""[..], &b""[..]),
            (&b"a"[..], &b""[..]),
            (&b"ab"[..], &b""[..]),
            (&b"foo"[..], &b"foo"[..]),
            (&b"foobar"[..], &b"bar"[..]),
        ];
        for (haystack, needle) in yes {
            assert!(is_suffix(haystack, needle));
        }

        let no = [(&b"foo"[..], &b"goo"[..]), (&b"foobar"[..], &b"gar"[..])];
        for (haystack, needle) in no {
            assert!(!is_suffix(haystack, needle));
        }
    }
}
+
1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 +95 +96 +97 +98 +99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 +247 +248 +249 +250 +251 +252 +253 +254 +255 +256 +257 +258 +
/// A 256-entry table holding one `u8` rank per possible byte value, indexed
/// by the byte itself. The trailing comment on each entry names the byte
/// that the entry belongs to.
///
/// Every byte from `0xC0` upward shares the maximum rank of 255.
///
/// NOTE(review): presumably this is the default background byte-frequency
/// ranking consulted by the packed pair heuristic, with larger values
/// meaning "more common" (e.g. `' '` and `'e'` rank near the top). The code
/// that reads this table is not visible here, so confirm before relying on
/// the direction.
pub(crate) const RANK: [u8; 256] = [
+ 55, // '\x00'
+ 52, // '\x01'
+ 51, // '\x02'
+ 50, // '\x03'
+ 49, // '\x04'
+ 48, // '\x05'
+ 47, // '\x06'
+ 46, // '\x07'
+ 45, // '\x08'
+ 103, // '\t'
+ 242, // '\n'
+ 66, // '\x0b'
+ 67, // '\x0c'
+ 229, // '\r'
+ 44, // '\x0e'
+ 43, // '\x0f'
+ 42, // '\x10'
+ 41, // '\x11'
+ 40, // '\x12'
+ 39, // '\x13'
+ 38, // '\x14'
+ 37, // '\x15'
+ 36, // '\x16'
+ 35, // '\x17'
+ 34, // '\x18'
+ 33, // '\x19'
+ 56, // '\x1a'
+ 32, // '\x1b'
+ 31, // '\x1c'
+ 30, // '\x1d'
+ 29, // '\x1e'
+ 28, // '\x1f'
+ 255, // ' '
+ 148, // '!'
+ 164, // '"'
+ 149, // '#'
+ 136, // '$'
+ 160, // '%'
+ 155, // '&'
+ 173, // "'"
+ 221, // '('
+ 222, // ')'
+ 134, // '*'
+ 122, // '+'
+ 232, // ','
+ 202, // '-'
+ 215, // '.'
+ 224, // '/'
+ 208, // '0'
+ 220, // '1'
+ 204, // '2'
+ 187, // '3'
+ 183, // '4'
+ 179, // '5'
+ 177, // '6'
+ 168, // '7'
+ 178, // '8'
+ 200, // '9'
+ 226, // ':'
+ 195, // ';'
+ 154, // '<'
+ 184, // '='
+ 174, // '>'
+ 126, // '?'
+ 120, // '@'
+ 191, // 'A'
+ 157, // 'B'
+ 194, // 'C'
+ 170, // 'D'
+ 189, // 'E'
+ 162, // 'F'
+ 161, // 'G'
+ 150, // 'H'
+ 193, // 'I'
+ 142, // 'J'
+ 137, // 'K'
+ 171, // 'L'
+ 176, // 'M'
+ 185, // 'N'
+ 167, // 'O'
+ 186, // 'P'
+ 112, // 'Q'
+ 175, // 'R'
+ 192, // 'S'
+ 188, // 'T'
+ 156, // 'U'
+ 140, // 'V'
+ 143, // 'W'
+ 123, // 'X'
+ 133, // 'Y'
+ 128, // 'Z'
+ 147, // '['
+ 138, // '\\'
+ 146, // ']'
+ 114, // '^'
+ 223, // '_'
+ 151, // '`'
+ 249, // 'a'
+ 216, // 'b'
+ 238, // 'c'
+ 236, // 'd'
+ 253, // 'e'
+ 227, // 'f'
+ 218, // 'g'
+ 230, // 'h'
+ 247, // 'i'
+ 135, // 'j'
+ 180, // 'k'
+ 241, // 'l'
+ 233, // 'm'
+ 246, // 'n'
+ 244, // 'o'
+ 231, // 'p'
+ 139, // 'q'
+ 245, // 'r'
+ 243, // 's'
+ 251, // 't'
+ 235, // 'u'
+ 201, // 'v'
+ 196, // 'w'
+ 240, // 'x'
+ 214, // 'y'
+ 152, // 'z'
+ 182, // '{'
+ 205, // '|'
+ 181, // '}'
+ 127, // '~'
+ 27, // '\x7f'
+ 212, // '\x80'
+ 211, // '\x81'
+ 210, // '\x82'
+ 213, // '\x83'
+ 228, // '\x84'
+ 197, // '\x85'
+ 169, // '\x86'
+ 159, // '\x87'
+ 131, // '\x88'
+ 172, // '\x89'
+ 105, // '\x8a'
+ 80, // '\x8b'
+ 98, // '\x8c'
+ 96, // '\x8d'
+ 97, // '\x8e'
+ 81, // '\x8f'
+ 207, // '\x90'
+ 145, // '\x91'
+ 116, // '\x92'
+ 115, // '\x93'
+ 144, // '\x94'
+ 130, // '\x95'
+ 153, // '\x96'
+ 121, // '\x97'
+ 107, // '\x98'
+ 132, // '\x99'
+ 109, // '\x9a'
+ 110, // '\x9b'
+ 124, // '\x9c'
+ 111, // '\x9d'
+ 82, // '\x9e'
+ 108, // '\x9f'
+ 118, // '\xa0'
+ 141, // '¡'
+ 113, // '¢'
+ 129, // '£'
+ 119, // '¤'
+ 125, // '¥'
+ 165, // '¦'
+ 117, // '§'
+ 92, // '¨'
+ 106, // '©'
+ 83, // 'ª'
+ 72, // '«'
+ 99, // '¬'
+ 93, // '\xad'
+ 65, // '®'
+ 79, // '¯'
+ 166, // '°'
+ 237, // '±'
+ 163, // '²'
+ 199, // '³'
+ 190, // '´'
+ 225, // 'µ'
+ 209, // '¶'
+ 203, // '·'
+ 198, // '¸'
+ 217, // '¹'
+ 219, // 'º'
+ 206, // '»'
+ 234, // '¼'
+ 248, // '½'
+ 158, // '¾'
+ 239, // '¿'
+ 255, // 'À'
+ 255, // 'Á'
+ 255, // 'Â'
+ 255, // 'Ã'
+ 255, // 'Ä'
+ 255, // 'Å'
+ 255, // 'Æ'
+ 255, // 'Ç'
+ 255, // 'È'
+ 255, // 'É'
+ 255, // 'Ê'
+ 255, // 'Ë'
+ 255, // 'Ì'
+ 255, // 'Í'
+ 255, // 'Î'
+ 255, // 'Ï'
+ 255, // 'Ð'
+ 255, // 'Ñ'
+ 255, // 'Ò'
+ 255, // 'Ó'
+ 255, // 'Ô'
+ 255, // 'Õ'
+ 255, // 'Ö'
+ 255, // '×'
+ 255, // 'Ø'
+ 255, // 'Ù'
+ 255, // 'Ú'
+ 255, // 'Û'
+ 255, // 'Ü'
+ 255, // 'Ý'
+ 255, // 'Þ'
+ 255, // 'ß'
+ 255, // 'à'
+ 255, // 'á'
+ 255, // 'â'
+ 255, // 'ã'
+ 255, // 'ä'
+ 255, // 'å'
+ 255, // 'æ'
+ 255, // 'ç'
+ 255, // 'è'
+ 255, // 'é'
+ 255, // 'ê'
+ 255, // 'ë'
+ 255, // 'ì'
+ 255, // 'í'
+ 255, // 'î'
+ 255, // 'ï'
+ 255, // 'ð'
+ 255, // 'ñ'
+ 255, // 'ò'
+ 255, // 'ó'
+ 255, // 'ô'
+ 255, // 'õ'
+ 255, // 'ö'
+ 255, // '÷'
+ 255, // 'ø'
+ 255, // 'ù'
+ 255, // 'ú'
+ 255, // 'û'
+ 255, // 'ü'
+ 255, // 'ý'
+ 255, // 'þ'
+ 255, // 'ÿ'
+];
+
1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 +95 +96 +97 +98 +99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 +247 +248 +249 +250 +251 +252 +253 +254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 +265 +266 +267 +268 +269 +270 +271 +272 +273 +274 +275 +276 +277 +278 +279 +280 +281 +282 +283 +284 +285 +286 +287 +288 +289 +290 +291 +292 +293 +294 +295 +296 +297 +298 +299 +300 +301 +302 +303 +304 +305 +306 +307 +308 +309 +310 +311 +312 +313 +314 +315 +316 +317 +318 +319 +320 +321 +322 +323 +324 +325 +326 +327 +328 +329 +330 +331 +332 +333 +334 +335 +336 +337 +338 +339 +340 +341 +342 +343 +344 +345 +346 +347 +348 +349 +350 +351 +352 +353 +354 +355 +356 +357 +358 +359 +
/*!
+Provides an architecture independent implementation of the "packed pair"
+algorithm.
+
+The "packed pair" algorithm is based on the [generic SIMD] algorithm. The main
+difference is that it (by default) uses a background distribution of byte
+frequencies to heuristically select the pair of bytes to search for. Note that
+this module provides an architecture independent version that does not do as
+good a job of keeping the search for candidates inside a SIMD hot path. It
+can, however, be good enough in many circumstances.
+
+[generic SIMD]: http://0x80.pl/articles/simd-strfind.html#first-and-last
+*/
+
+use crate::memchr;
+
+mod default_rank;
+
+/// An architecture independent "packed pair" finder.
+///
+/// This finder picks two bytes that it believes have high predictive power for
+/// indicating an overall match of a needle. At search time, it reports offsets
+/// where the needle could match based on whether the pair of bytes it chose
+/// match.
+///
+/// This is architecture independent because it utilizes `memchr` to find the
+/// occurrence of one of the bytes in the pair, and then checks whether the
+/// second byte matches. If it does, in the case of [`Finder::find_prefilter`],
+/// the location at which the needle could match is returned.
+///
+/// It is generally preferred to use architecture specific routines for a
+/// "packed pair" prefilter, but this can be a useful fallback when the
+/// architecture independent routines are unavailable.
+#[derive(Clone, Copy, Debug)]
+pub struct Finder {
+ pair: Pair,
+ byte1: u8,
+ byte2: u8,
+}
+
+impl Finder {
+ /// Create a new prefilter that reports possible locations where the given
+ /// needle matches.
+ #[inline]
+ pub fn new(needle: &[u8]) -> Option<Finder> {
+ Finder::with_pair(needle, Pair::new(needle)?)
+ }
+
+ /// Create a new prefilter using the pair given.
+ ///
+ /// If the prefilter could not be constructed, then `None` is returned.
+ ///
+ /// This constructor permits callers to control precisely which pair of
+ /// bytes is used as a predicate.
+ #[inline]
+ pub fn with_pair(needle: &[u8], pair: Pair) -> Option<Finder> {
+ let byte1 = needle[usize::from(pair.index1())];
+ let byte2 = needle[usize::from(pair.index2())];
+ // Currently this can never fail so we could just return a Finder,
+ // but it's conceivable this could change.
+ Some(Finder { pair, byte1, byte2 })
+ }
+
+ /// Run this finder on the given haystack as a prefilter.
+ ///
+ /// If a candidate match is found, then an offset where the needle *could*
+ /// begin in the haystack is returned.
+ #[inline]
+ pub fn find_prefilter(&self, haystack: &[u8]) -> Option<usize> {
+ let mut i = 0;
+ let index1 = usize::from(self.pair.index1());
+ let index2 = usize::from(self.pair.index2());
+ loop {
+ // Use a fast vectorized implementation to skip to the next
+ // occurrence of the rarest byte (heuristically chosen) in the
+ // needle.
+ i += memchr(self.byte1, &haystack[i..])?;
+ let found = i;
+ i += 1;
+
+ // If we can't align our first byte match with the haystack, then a
+ // match is impossible.
+ let aligned1 = match found.checked_sub(index1) {
+ None => continue,
+ Some(aligned1) => aligned1,
+ };
+
+ // Now align the second byte match with the haystack. A mismatch
+ // means that a match is impossible.
+ let aligned2 = match aligned1.checked_add(index2) {
+ None => continue,
+ Some(aligned_index2) => aligned_index2,
+ };
+ if haystack.get(aligned2).map_or(true, |&b| b != self.byte2) {
+ continue;
+ }
+
+ // We've done what we can. There might be a match here.
+ return Some(aligned1);
+ }
+ }
+
+ /// Returns the pair of offsets (into the needle) used to check as a
+ /// predicate before confirming whether a needle exists at a particular
+ /// position.
+ #[inline]
+ pub fn pair(&self) -> &Pair {
+ &self.pair
+ }
+}
+
+/// A pair of byte offsets into a needle to use as a predicate.
+///
+/// This pair is used as a predicate to quickly filter out positions in a
+/// haystack in which a needle cannot match. In some cases, this pair can even
+/// be used in vector algorithms such that the vector algorithm only switches
+/// over to scalar code once this pair has been found.
+///
+/// A pair of offsets can be used in both substring search implementations and
+/// in prefilters. The former will report matches of a needle in a haystack
+/// where as the latter will only report possible matches of a needle.
+///
+/// The offsets are limited each to a maximum of 255 to keep memory usage low.
+/// Moreover, it's rarely advantageous to create a predicate using offsets
+/// greater than 255 anyway.
+///
+/// The only guarantee enforced on the pair of offsets is that they are not
+/// equivalent. It is not necessarily the case that `index1 < index2` for
+/// example. By convention, `index1` corresponds to the byte in the needle
+/// that is believed to be most the predictive. Note also that because of the
+/// requirement that the indices be both valid for the needle used to build
+/// the pair and not equal, it follows that a pair can only be constructed for
+/// needles with length at least 2.
+#[derive(Clone, Copy, Debug)]
+pub struct Pair {
+ index1: u8,
+ index2: u8,
+}
+
+impl Pair {
+ /// Create a new pair of offsets from the given needle.
+ ///
+ /// If a pair could not be created (for example, if the needle is too
+ /// short), then `None` is returned.
+ ///
+ /// This chooses the pair in the needle that is believed to be as
+ /// predictive of an overall match of the needle as possible.
+ #[inline]
+ pub fn new(needle: &[u8]) -> Option<Pair> {
+ Pair::with_ranker(needle, DefaultFrequencyRank)
+ }
+
+ /// Create a new pair of offsets from the given needle and ranker.
+ ///
+ /// This permits the caller to choose a background frequency distribution
+ /// with which bytes are selected. The idea is to select a pair of bytes
+ /// that is believed to strongly predict a match in the haystack. This
+ /// usually means selecting bytes that occur rarely in a haystack.
+ ///
+ /// If a pair could not be created (for example, if the needle is too
+ /// short), then `None` is returned.
+ #[inline]
+ pub fn with_ranker<R: HeuristicFrequencyRank>(
+ needle: &[u8],
+ ranker: R,
+ ) -> Option<Pair> {
+ if needle.len() <= 1 {
+ return None;
+ }
+ // Find the rarest two bytes. We make them distinct indices by
+ // construction. (The actual byte value may be the same in degenerate
+ // cases, but that's OK.)
+ let (mut rare1, mut index1) = (needle[0], 0);
+ let (mut rare2, mut index2) = (needle[1], 1);
+ if ranker.rank(rare2) < ranker.rank(rare1) {
+ core::mem::swap(&mut rare1, &mut rare2);
+ core::mem::swap(&mut index1, &mut index2);
+ }
+ let max = usize::from(core::u8::MAX);
+ for (i, &b) in needle.iter().enumerate().take(max).skip(2) {
+ if ranker.rank(b) < ranker.rank(rare1) {
+ rare2 = rare1;
+ index2 = index1;
+ rare1 = b;
+ index1 = u8::try_from(i).unwrap();
+ } else if b != rare1 && ranker.rank(b) < ranker.rank(rare2) {
+ rare2 = b;
+ index2 = u8::try_from(i).unwrap();
+ }
+ }
+ // While not strictly required for how a Pair is normally used, we
+ // really don't want these to be equivalent. If they were, it would
+ // reduce the effectiveness of candidate searching using these rare
+ // bytes by increasing the rate of false positives.
+ assert_ne!(index1, index2);
+ Some(Pair { index1, index2 })
+ }
+
+ /// Create a new pair using the offsets given for the needle given.
+ ///
+ /// This bypasses any sort of heuristic process for choosing the offsets
+ /// and permits the caller to choose the offsets themselves.
+ ///
+ /// Indices are limited to valid `u8` values so that a `Pair` uses less
+ /// memory. It is not possible to create a `Pair` with offsets bigger than
+ /// `u8::MAX`. It's likely that such a thing is not needed, but if it is,
+ /// it's suggested to build your own bespoke algorithm because you're
+ /// likely working on a very niche case. (File an issue if this suggestion
+ /// does not make sense to you.)
+ ///
+ /// If a pair could not be created (for example, if the needle is too
+ /// short), then `None` is returned.
+ #[inline]
+ pub fn with_indices(
+ needle: &[u8],
+ index1: u8,
+ index2: u8,
+ ) -> Option<Pair> {
+ // While not strictly required for how a Pair is normally used, we
+ // really don't want these to be equivalent. If they were, it would
+ // reduce the effectiveness of candidate searching using these rare
+ // bytes by increasing the rate of false positives.
+ if index1 == index2 {
+ return None;
+ }
+ // Similarly, invalid indices means the Pair is invalid too.
+ if usize::from(index1) >= needle.len() {
+ return None;
+ }
+ if usize::from(index2) >= needle.len() {
+ return None;
+ }
+ Some(Pair { index1, index2 })
+ }
+
+ /// Returns the first offset of the pair.
+ #[inline]
+ pub fn index1(&self) -> u8 {
+ self.index1
+ }
+
+ /// Returns the second offset of the pair.
+ #[inline]
+ pub fn index2(&self) -> u8 {
+ self.index2
+ }
+}
+
/// This trait allows the user to customize the heuristic used to determine the
/// relative frequency of a given byte in the dataset being searched.
///
/// The use of this trait can have a dramatic impact on performance depending
/// on the type of data being searched. The details of why are explained in the
/// docs of [`crate::memmem::Prefilter`]. To summarize, the core algorithm uses
/// a prefilter to quickly identify candidate matches that are later verified
/// more slowly. This prefilter is implemented in terms of trying to find
/// `rare` bytes at specific offsets that will occur less frequently in the
/// dataset. While the concept of a `rare` byte is similar for most datasets,
/// there are some specific datasets (like binary executables) that have
/// dramatically different byte distributions. For these datasets customizing
/// the byte frequency heuristic can have a massive impact on performance, and
/// might even need to be done at runtime.
///
/// The default implementation of `HeuristicFrequencyRank` reads from the
/// static `RANK` table defined in this module's `default_rank` submodule.
/// This is optimal for most inputs, so if you are unsure of the impact of
/// using a custom `HeuristicFrequencyRank` you should probably just use the
/// default.
///
/// # Example
///
/// ```
/// use memchr::{
///     arch::all::packedpair::HeuristicFrequencyRank,
///     memmem::FinderBuilder,
/// };
///
/// /// A byte-frequency table that is good for scanning binary executables.
/// struct Binary;
///
/// impl HeuristicFrequencyRank for Binary {
///     fn rank(&self, byte: u8) -> u8 {
///         const TABLE: [u8; 256] = [
///             255, 128, 61, 43, 50, 41, 27, 28, 57, 15, 21, 13, 24, 17, 17,
///             89, 58, 16, 11, 7, 14, 23, 7, 6, 24, 9, 6, 5, 9, 4, 7, 16,
///             68, 11, 9, 6, 88, 7, 4, 4, 23, 9, 4, 8, 8, 5, 10, 4, 30, 11,
///             9, 24, 11, 5, 5, 5, 19, 11, 6, 17, 9, 9, 6, 8,
///             48, 58, 11, 14, 53, 40, 9, 9, 254, 35, 3, 6, 52, 23, 6, 6, 27,
///             4, 7, 11, 14, 13, 10, 11, 11, 5, 2, 10, 16, 12, 6, 19,
///             19, 20, 5, 14, 16, 31, 19, 7, 14, 20, 4, 4, 19, 8, 18, 20, 24,
///             1, 25, 19, 58, 29, 10, 5, 15, 20, 2, 2, 9, 4, 3, 5,
///             51, 11, 4, 53, 23, 39, 6, 4, 13, 81, 4, 186, 5, 67, 3, 2, 15,
///             0, 0, 1, 3, 2, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0,
///             12, 2, 1, 1, 3, 1, 1, 1, 6, 1, 2, 1, 3, 1, 1, 2, 9, 1, 1, 0,
///             2, 2, 4, 4, 11, 6, 7, 3, 6, 9, 4, 5,
///             46, 18, 8, 18, 17, 3, 8, 20, 16, 10, 3, 7, 175, 4, 6, 7, 13,
///             3, 7, 3, 3, 1, 3, 3, 10, 3, 1, 5, 2, 0, 1, 2,
///             16, 3, 5, 1, 6, 1, 1, 2, 58, 20, 3, 14, 12, 2, 1, 3, 16, 3, 5,
///             8, 3, 1, 8, 6, 17, 6, 5, 3, 8, 6, 13, 175,
///         ];
///         TABLE[byte as usize]
///     }
/// }
/// // Create a new finder with the custom heuristic.
/// let finder = FinderBuilder::new()
///     .build_forward_with_ranker(Binary, b"\x00\x00\xdd\xdd");
/// // Find needle with custom heuristic.
/// assert!(finder.find(b"\x00\x00\x00\xdd\xdd").is_some());
/// ```
pub trait HeuristicFrequencyRank {
    /// Return the heuristic frequency rank of the given byte. A lower rank
    /// means the byte is believed to occur less frequently in the haystack.
    ///
    /// Some uses of this heuristic may treat arbitrary absolute rank values as
    /// significant. For example, an implementation detail in this crate may
    /// determine that heuristic prefilters are inappropriate if every byte in
    /// the needle has a "high" rank.
    fn rank(&self, byte: u8) -> u8;
}
+
+/// The default byte frequency heuristic that is good for most haystacks.
+pub(crate) struct DefaultFrequencyRank;
+
+impl HeuristicFrequencyRank for DefaultFrequencyRank {
+ fn rank(&self, byte: u8) -> u8 {
+ self::default_rank::RANK[usize::from(byte)]
+ }
+}
+
+/// This permits passing any implementation of `HeuristicFrequencyRank` as a
+/// borrowed version of itself.
+impl<'a, R> HeuristicFrequencyRank for &'a R
+where
+ R: HeuristicFrequencyRank,
+{
+ fn rank(&self, byte: u8) -> u8 {
+ (**self).rank(byte)
+ }
+}
+
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the shared packed-pair test suite against the architecture
    /// independent prefilter.
    #[test]
    fn forward_packedpair() {
        // The offsets requested by the runner are deliberately ignored:
        // honoring them winds up making this test too slow overall.
        fn find(
            haystack: &[u8],
            needle: &[u8],
            _index1: u8,
            _index2: u8,
        ) -> Option<Option<usize>> {
            Finder::new(needle).map(|finder| finder.find_prefilter(haystack))
        }
        crate::tests::packedpair::Runner::new().fwd(find).run()
    }
}
+
1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 +95 +96 +97 +98 +99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 +247 +248 +249 +250 +251 +252 +253 +254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 +265 +266 +267 +268 +269 +270 +271 +272 +273 +274 +275 +276 +277 +278 +279 +280 +281 +282 +283 +284 +285 +286 +287 +288 +289 +290 +291 +292 +293 +294 +295 +296 +297 +298 +299 +300 +301 +302 +303 +304 +305 +306 +307 +308 +309 +310 +311 +312 +313 +314 +315 +316 +317 +318 +319 +320 +321 +322 +323 +324 +325 +326 +327 +328 +329 +330 +331 +332 +333 +334 +335 +336 +337 +338 +339 +340 +341 +342 +343 +344 +345 +346 +347 +348 +349 +350 +351 +352 +353 +354 +355 +356 +357 +358 +359 +360 +361 +362 +363 +364 +365 +366 +367 +368 +369 +370 +371 +372 +373 +374 +375 +376 +377 +378 +379 +380 +381 +382 +383 +384 +385 +386 +387 +388 +389 +390 +
/*!
+An implementation of the [Rabin-Karp substring search algorithm][rabinkarp].
+
+Rabin-Karp works by creating a hash of the needle provided and then computing
+a rolling hash for each needle sized window in the haystack. When the rolling
+hash matches the hash of the needle, a byte-wise comparison is done to check
+if a match exists. The worst case time complexity of Rabin-Karp is `O(m *
+n)` where `m ~ len(needle)` and `n ~ len(haystack)`. Its worst case space
+complexity is constant.
+
+The main utility of Rabin-Karp is that the searcher can be constructed very
+quickly with very little memory. This makes it especially useful when searching
+for small needles in small haystacks, as it might finish its search before a
+beefier algorithm (like Two-Way) even starts.
+
+[rabinkarp]: https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm
+*/
+
+/*
+(This was the comment I wrote for this module originally when it was not
+exposed. The comment still looks useful, but it's a bit in the weeds, so it's
+not public itself.)
+
+This module implements the classical Rabin-Karp substring search algorithm,
+with no extra frills. While its use would seem to break our time complexity
+guarantee of O(m+n) (RK's time complexity is O(mn)), we are careful to only
+ever use RK on a constant subset of haystacks. The main point here is that
+RK has good latency properties for small needles/haystacks. It's very quick
+to compute a needle hash and zip through the haystack when compared to
+initializing Two-Way, for example. And this is especially useful for cases
+where the haystack is just too short for vector instructions to do much good.
+
+The hashing function used here is the same one recommended by ESMAJ.
+
+Another choice instead of Rabin-Karp would be Shift-Or. But its latency
+isn't quite as good since its preprocessing time is a bit more expensive
+(both in practice and in theory). However, perhaps Shift-Or has a place
+somewhere else for short patterns. I think the main problem is that it
+requires space proportional to the alphabet and the needle. If we, for
+example, supported needles up to length 16, then the total table size would be
+len(alphabet)*size_of::<u16>()==512 bytes. Which isn't exactly small, and it's
+probably bad to put that on the stack. So ideally, we'd throw it on the heap,
+but we'd really like to write as much code without using alloc/std as possible.
+But maybe it's worth the special casing. It's a TODO to benchmark.
+
+Wikipedia has a decent explanation, if a bit heavy on the theory:
+https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm
+
+But ESMAJ provides something a bit more concrete:
+http://www-igm.univ-mlv.fr/~lecroq/string/node5.html
+
+Finally, aho-corasick uses Rabin-Karp for multiple pattern match in some cases:
+https://github.com/BurntSushi/aho-corasick/blob/3852632f10587db0ff72ef29e88d58bf305a0946/src/packed/rabinkarp.rs
+*/
+
+use crate::ext::Pointer;
+
+/// A forward substring searcher using the Rabin-Karp algorithm.
+///
+/// Note that, as a lower level API, a `Finder` does not have access to the
+/// needle it was constructed with. For this reason, executing a search
+/// with a `Finder` requires passing both the needle and the haystack,
+/// where the needle is exactly equivalent to the one given to the `Finder`
+/// at construction time. This design was chosen so that callers can have
+/// more precise control over where and how many times a needle is stored.
+/// For example, in cases where Rabin-Karp is just one of several possible
+/// substring search algorithms.
+#[derive(Clone, Debug)]
+pub struct Finder {
+ /// The actual hash.
+ hash: Hash,
+ /// The factor needed to multiply a byte by in order to subtract it from
+ /// the hash. It is defined to be 2^(n-1) (using wrapping exponentiation),
+ /// where n is the length of the needle. This is how we "remove" a byte
+ /// from the hash once the hash window rolls past it.
+ hash_2pow: u32,
+}
+
+impl Finder {
+ /// Create a new Rabin-Karp forward searcher for the given `needle`.
+ ///
+ /// The needle may be empty. The empty needle matches at every byte offset.
+ ///
+ /// Note that callers must pass the same needle to all search calls using
+ /// this `Finder`.
+ #[inline]
+ pub fn new(needle: &[u8]) -> Finder {
+ let mut s = Finder { hash: Hash::new(), hash_2pow: 1 };
+ let first_byte = match needle.get(0) {
+ None => return s,
+ Some(&first_byte) => first_byte,
+ };
+ s.hash.add(first_byte);
+ for b in needle.iter().copied().skip(1) {
+ s.hash.add(b);
+ s.hash_2pow = s.hash_2pow.wrapping_shl(1);
+ }
+ s
+ }
+
+ /// Return the first occurrence of the `needle` in the `haystack`
+ /// given. If no such occurrence exists, then `None` is returned.
+ ///
+ /// The `needle` provided must match the needle given to this finder at
+ /// construction time.
+ ///
+ /// The maximum value this can return is `haystack.len()`, which can only
+ /// occur when the needle and haystack both have length zero. Otherwise,
+ /// for non-empty haystacks, the maximum value is `haystack.len() - 1`.
+ #[inline]
+ pub fn find(&self, haystack: &[u8], needle: &[u8]) -> Option<usize> {
+ unsafe {
+ let hstart = haystack.as_ptr();
+ let hend = hstart.add(haystack.len());
+ let nstart = needle.as_ptr();
+ let nend = nstart.add(needle.len());
+ let found = self.find_raw(hstart, hend, nstart, nend)?;
+ Some(found.distance(hstart))
+ }
+ }
+
+ /// Like `find`, but accepts and returns raw pointers.
+ ///
+ /// When a match is found, the pointer returned is guaranteed to be
+ /// `>= start` and `<= end`. The pointer returned is only ever equivalent
+ /// to `end` when both the needle and haystack are empty. (That is, the
+ /// empty string matches the empty string.)
+ ///
+ /// This routine is useful if you're already using raw pointers and would
+ /// like to avoid converting back to a slice before executing a search.
+ ///
+ /// # Safety
+ ///
+ /// Note that `start` and `end` below refer to both pairs of pointers given
+ /// to this routine. That is, the conditions apply to both `hstart`/`hend`
+ /// and `nstart`/`nend`.
+ ///
+ /// * Both `start` and `end` must be valid for reads.
+ /// * Both `start` and `end` must point to an initialized value.
+ /// * Both `start` and `end` must point to the same allocated object and
+ /// must either be in bounds or at most one byte past the end of the
+ /// allocated object.
+ /// * Both `start` and `end` must be _derived from_ a pointer to the same
+ /// object.
+ /// * The distance between `start` and `end` must not overflow `isize`.
+ /// * The distance being in bounds must not rely on "wrapping around" the
+ /// address space.
+ /// * It must be the case that `start <= end`.
+ #[inline]
+ pub unsafe fn find_raw(
+ &self,
+ hstart: *const u8,
+ hend: *const u8,
+ nstart: *const u8,
+ nend: *const u8,
+ ) -> Option<*const u8> {
+ let hlen = hend.distance(hstart);
+ let nlen = nend.distance(nstart);
+ if nlen > hlen {
+ return None;
+ }
+ let mut cur = hstart;
+ let end = hend.sub(nlen);
+ let mut hash = Hash::forward(cur, cur.add(nlen));
+ loop {
+ if self.hash == hash && is_equal_raw(cur, nstart, nlen) {
+ return Some(cur);
+ }
+ if cur >= end {
+ return None;
+ }
+ hash.roll(self, cur.read(), cur.add(nlen).read());
+ cur = cur.add(1);
+ }
+ }
+}
+
+/// A reverse substring searcher using the Rabin-Karp algorithm.
+#[derive(Clone, Debug)]
+pub struct FinderRev(Finder);
+
+impl FinderRev {
+ /// Create a new Rabin-Karp reverse searcher for the given `needle`.
+ #[inline]
+ pub fn new(needle: &[u8]) -> FinderRev {
+ let mut s = FinderRev(Finder { hash: Hash::new(), hash_2pow: 1 });
+ let last_byte = match needle.last() {
+ None => return s,
+ Some(&last_byte) => last_byte,
+ };
+ s.0.hash.add(last_byte);
+ for b in needle.iter().rev().copied().skip(1) {
+ s.0.hash.add(b);
+ s.0.hash_2pow = s.0.hash_2pow.wrapping_shl(1);
+ }
+ s
+ }
+
+ /// Return the last occurrence of the `needle` in the `haystack`
+ /// given. If no such occurrence exists, then `None` is returned.
+ ///
+ /// The `needle` provided must match the needle given to this finder at
+ /// construction time.
+ ///
+ /// The maximum value this can return is `haystack.len()`, which can only
+ /// occur when the needle and haystack both have length zero. Otherwise,
+ /// for non-empty haystacks, the maximum value is `haystack.len() - 1`.
+ #[inline]
+ pub fn rfind(&self, haystack: &[u8], needle: &[u8]) -> Option<usize> {
+ unsafe {
+ let hstart = haystack.as_ptr();
+ let hend = hstart.add(haystack.len());
+ let nstart = needle.as_ptr();
+ let nend = nstart.add(needle.len());
+ let found = self.rfind_raw(hstart, hend, nstart, nend)?;
+ Some(found.distance(hstart))
+ }
+ }
+
+ /// Like `rfind`, but accepts and returns raw pointers.
+ ///
+ /// When a match is found, the pointer returned is guaranteed to be
+ /// `>= start` and `<= end`. The pointer returned is only ever equivalent
+ /// to `end` when both the needle and haystack are empty. (That is, the
+ /// empty string matches the empty string.)
+ ///
+ /// This routine is useful if you're already using raw pointers and would
+ /// like to avoid converting back to a slice before executing a search.
+ ///
+ /// # Safety
+ ///
+ /// Note that `start` and `end` below refer to both pairs of pointers given
+ /// to this routine. That is, the conditions apply to both `hstart`/`hend`
+ /// and `nstart`/`nend`.
+ ///
+ /// * Both `start` and `end` must be valid for reads.
+ /// * Both `start` and `end` must point to an initialized value.
+ /// * Both `start` and `end` must point to the same allocated object and
+ /// must either be in bounds or at most one byte past the end of the
+ /// allocated object.
+ /// * Both `start` and `end` must be _derived from_ a pointer to the same
+ /// object.
+ /// * The distance between `start` and `end` must not overflow `isize`.
+ /// * The distance being in bounds must not rely on "wrapping around" the
+ /// address space.
+ /// * It must be the case that `start <= end`.
+ #[inline]
+ pub unsafe fn rfind_raw(
+ &self,
+ hstart: *const u8,
+ hend: *const u8,
+ nstart: *const u8,
+ nend: *const u8,
+ ) -> Option<*const u8> {
+ let hlen = hend.distance(hstart);
+ let nlen = nend.distance(nstart);
+ if nlen > hlen {
+ return None;
+ }
+ let mut cur = hend.sub(nlen);
+ let start = hstart;
+ let mut hash = Hash::reverse(cur, cur.add(nlen));
+ loop {
+ if self.0.hash == hash && is_equal_raw(cur, nstart, nlen) {
+ return Some(cur);
+ }
+ if cur <= start {
+ return None;
+ }
+ cur = cur.sub(1);
+ hash.roll(&self.0, cur.add(nlen).read(), cur.read());
+ }
+ }
+}
+
/// Reports whether Rabin-Karp is believed to be a very fast choice for the
/// given needle/haystack combination.
///
/// The decision currently depends only on the haystack being short. The
/// needle is accepted but not inspected.
#[inline]
pub(crate) fn is_fast(haystack: &[u8], _needle: &[u8]) -> bool {
    /// Haystacks of this length or longer are not considered "fast".
    const HAYSTACK_LEN_LIMIT: usize = 16;
    haystack.len() < HAYSTACK_LEN_LIMIT
}
+
+/// A Rabin-Karp hash. This might represent the hash of a needle, or the hash
+/// of a rolling window in the haystack.
+#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
+struct Hash(u32);
+
+impl Hash {
+ /// Create a new hash that represents the empty string.
+ #[inline(always)]
+ fn new() -> Hash {
+ Hash(0)
+ }
+
+ /// Create a new hash from the bytes given for use in forward searches.
+ ///
+ /// # Safety
+ ///
+ /// The given pointers must be valid to read from within their range.
+ #[inline(always)]
+ unsafe fn forward(mut start: *const u8, end: *const u8) -> Hash {
+ let mut hash = Hash::new();
+ while start < end {
+ hash.add(start.read());
+ start = start.add(1);
+ }
+ hash
+ }
+
+ /// Create a new hash from the bytes given for use in reverse searches.
+ ///
+ /// # Safety
+ ///
+ /// The given pointers must be valid to read from within their range.
+ #[inline(always)]
+ unsafe fn reverse(start: *const u8, mut end: *const u8) -> Hash {
+ let mut hash = Hash::new();
+ while start < end {
+ end = end.sub(1);
+ hash.add(end.read());
+ }
+ hash
+ }
+
+ /// Add 'new' and remove 'old' from this hash. The given needle hash should
+ /// correspond to the hash computed for the needle being searched for.
+ ///
+ /// This is meant to be used when the rolling window of the haystack is
+ /// advanced.
+ #[inline(always)]
+ fn roll(&mut self, finder: &Finder, old: u8, new: u8) {
+ self.del(finder, old);
+ self.add(new);
+ }
+
+ /// Add a byte to this hash.
+ #[inline(always)]
+ fn add(&mut self, byte: u8) {
+ self.0 = self.0.wrapping_shl(1).wrapping_add(u32::from(byte));
+ }
+
+ /// Remove a byte from this hash. The given needle hash should correspond
+ /// to the hash computed for the needle being searched for.
+ #[inline(always)]
+ fn del(&mut self, finder: &Finder, byte: u8) {
+ let factor = finder.hash_2pow;
+ self.0 = self.0.wrapping_sub(u32::from(byte).wrapping_mul(factor));
+ }
+}
+
+/// Returns true when `x[i] == y[i]` for all `0 <= i < n`.
+///
+/// We forcefully don't inline this to hint at the compiler that it is unlikely
+/// to be called. This causes the inner rabinkarp loop above to be a bit
+/// tighter and leads to some performance improvement. See the
+/// memmem/krate/prebuilt/sliceslice-words/words benchmark.
+///
+/// # Safety
+///
+/// Same as `crate::arch::all::is_equal_raw`.
+#[cold]
+#[inline(never)]
+unsafe fn is_equal_raw(x: *const u8, y: *const u8, n: usize) -> bool {
+ crate::arch::all::is_equal_raw(x, y, n)
+}
+
#[cfg(test)]
mod tests {
    use super::*;

    // Property tests: each closure builds a fresh searcher for the needle and
    // reports its result for the haystack.
    define_substring_forward_quickcheck!(|h, n| Some(
        Finder::new(n).find(h, n)
    ));
    define_substring_reverse_quickcheck!(|h, n| Some(
        FinderRev::new(n).rfind(h, n)
    ));

    /// Runs the shared substring test suite against the forward searcher.
    #[test]
    fn forward() {
        let runner = crate::tests::substring::Runner::new()
            .fwd(|h, n| Some(Finder::new(n).find(h, n)));
        runner.run();
    }

    /// Runs the shared substring test suite against the reverse searcher.
    #[test]
    fn reverse() {
        let runner = crate::tests::substring::Runner::new()
            .rev(|h, n| Some(FinderRev::new(n).rfind(h, n)));
        runner.run();
    }
}
+
1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +
/*!
+An implementation of the [Shift-Or substring search algorithm][shiftor].
+
+[shiftor]: https://en.wikipedia.org/wiki/Bitap_algorithm
+*/
+
+use alloc::boxed::Box;
+
/// The type of our mask.
///
/// While we don't expose any way to configure this in the public API, if one
/// really needs less memory usage or support for longer needles, then it is
/// suggested to copy the code from this module and modify it to fit your
/// needs. The code below is written to be correct regardless of whether Mask
/// is a u8, u16, u32, u64 or u128.
type Mask = u16;

/// A forward substring searcher using the Shift-Or algorithm.
#[derive(Clone, Debug)]
pub struct Finder {
    /// One mask per possible byte value. Bit `i` of `masks[b]` is clear
    /// exactly when byte `b` occurs at offset `i` of the needle.
    masks: Box<[Mask; 256]>,
    /// The length of the needle this finder was built for.
    needle_len: usize,
}

impl Finder {
    /// The longest needle supported. A match is signalled through bit
    /// `needle_len` of the search state, so the needle must be strictly
    /// shorter than the number of bits in `Mask`.
    const MAX_NEEDLE_LEN: usize = (Mask::BITS - 1) as usize;

    /// Create a new Shift-Or forward searcher for the given `needle`.
    ///
    /// The needle may be empty. The empty needle matches at every byte offset.
    ///
    /// If the needle is longer than the mask type supports (one less than
    /// the number of bits in `Mask`), then `None` is returned.
    #[inline]
    pub fn new(needle: &[u8]) -> Option<Finder> {
        let needle_len = needle.len();
        if needle_len > Finder::MAX_NEEDLE_LEN {
            // A match is found when bit `needle_len` is clear in 'result' in
            // the search routine below, so that bit must exist in `Mask`.
            // Bigger needles could be permitted by using a wider integer
            // type for the mask entries.
            return None;
        }
        let mut searcher = Finder { masks: Box::from([!0; 256]), needle_len };
        for (i, &byte) in needle.iter().enumerate() {
            // Record that `byte` is acceptable at needle offset `i`.
            searcher.masks[usize::from(byte)] &= !(1 << i);
        }
        Some(searcher)
    }

    /// Return the first occurrence of the needle given to `Finder::new` in
    /// the `haystack` given. If no such occurrence exists, then `None` is
    /// returned.
    ///
    /// Unlike most other substring search implementations in this crate, this
    /// finder does not require passing the needle at search time. A match can
    /// be determined without the needle at all since the required information
    /// is already encoded into this finder at construction time.
    ///
    /// The maximum value this can return is `haystack.len()`, which can only
    /// occur when the needle and haystack both have length zero. Otherwise,
    /// for non-empty haystacks, the maximum value is `haystack.len() - 1`.
    #[inline]
    pub fn find(&self, haystack: &[u8]) -> Option<usize> {
        if self.needle_len == 0 {
            return Some(0);
        }
        // Bit `k` of `result` is clear when the last `k` haystack bytes seen
        // equal the first `k` needle bytes. Bit 0 (the empty prefix) starts
        // out clear and stays clear because the shift brings in a zero.
        let mut result: Mask = !1;
        for (i, &byte) in haystack.iter().enumerate() {
            result |= self.masks[usize::from(byte)];
            result <<= 1;
            if result & (1 << self.needle_len) == 0 {
                // The match ends at `i`, so it starts `needle_len - 1` bytes
                // earlier.
                return Some(i + 1 - self.needle_len);
            }
        }
        None
    }
}
+
#[cfg(test)]
mod tests {
    use super::*;

    // Property test. Needles that the finder cannot be built for make the
    // closure return `None` via `?`.
    define_substring_forward_quickcheck!(|h, n| Some(Finder::new(n)?.find(h)));

    /// Runs the shared substring test suite against the Shift-Or searcher.
    #[test]
    fn forward() {
        let runner = crate::tests::substring::Runner::new()
            .fwd(|h, n| Some(Finder::new(n)?.find(h)));
        runner.run();
    }
}
+
1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 +95 +96 +97 +98 +99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 +247 +248 +249 +250 +251 +252 +253 +254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 +265 +266 +267 +268 +269 +270 +271 +272 +273 +274 +275 +276 +277 +278 +279 +280 +281 +282 +283 +284 +285 +286 +287 +288 +289 +290 +291 +292 +293 +294 +295 +296 +297 +298 +299 +300 +301 +302 +303 +304 +305 +306 +307 +308 +309 +310 +311 +312 +313 +314 +315 +316 +317 +318 +319 +320 +321 +322 +323 +324 +325 +326 +327 +328 +329 +330 +331 +332 +333 +334 +335 +336 +337 +338 +339 +340 +341 +342 +343 +344 +345 +346 +347 +348 +349 +350 +351 +352 +353 +354 +355 +356 +357 +358 +359 +360 +361 +362 +363 +364 +365 +366 +367 +368 +369 +370 +371 +372 +373 +374 +375 +376 +377 +378 +379 +380 +381 +382 +383 +384 +385 +386 +387 +388 +389 +390 +391 +392 +393 +394 +395 +396 +397 +398 +399 +400 +401 +402 +403 +404 +405 +406 +407 +408 +409 +410 +411 +412 +413 +414 +415 +416 +417 +418 +419 +420 +421 
+422 +423 +424 +425 +426 +427 +428 +429 +430 +431 +432 +433 +434 +435 +436 +437 +438 +439 +440 +441 +442 +443 +444 +445 +446 +447 +448 +449 +450 +451 +452 +453 +454 +455 +456 +457 +458 +459 +460 +461 +462 +463 +464 +465 +466 +467 +468 +469 +470 +471 +472 +473 +474 +475 +476 +477 +478 +479 +480 +481 +482 +483 +484 +485 +486 +487 +488 +489 +490 +491 +492 +493 +494 +495 +496 +497 +498 +499 +500 +501 +502 +503 +504 +505 +506 +507 +508 +509 +510 +511 +512 +513 +514 +515 +516 +517 +518 +519 +520 +521 +522 +523 +524 +525 +526 +527 +528 +529 +530 +531 +532 +533 +534 +535 +536 +537 +538 +539 +540 +541 +542 +543 +544 +545 +546 +547 +548 +549 +550 +551 +552 +553 +554 +555 +556 +557 +558 +559 +560 +561 +562 +563 +564 +565 +566 +567 +568 +569 +570 +571 +572 +573 +574 +575 +576 +577 +578 +579 +580 +581 +582 +583 +584 +585 +586 +587 +588 +589 +590 +591 +592 +593 +594 +595 +596 +597 +598 +599 +600 +601 +602 +603 +604 +605 +606 +607 +608 +609 +610 +611 +612 +613 +614 +615 +616 +617 +618 +619 +620 +621 +622 +623 +624 +625 +626 +627 +628 +629 +630 +631 +632 +633 +634 +635 +636 +637 +638 +639 +640 +641 +642 +643 +644 +645 +646 +647 +648 +649 +650 +651 +652 +653 +654 +655 +656 +657 +658 +659 +660 +661 +662 +663 +664 +665 +666 +667 +668 +669 +670 +671 +672 +673 +674 +675 +676 +677 +678 +679 +680 +681 +682 +683 +684 +685 +686 +687 +688 +689 +690 +691 +692 +693 +694 +695 +696 +697 +698 +699 +700 +701 +702 +703 +704 +705 +706 +707 +708 +709 +710 +711 +712 +713 +714 +715 +716 +717 +718 +719 +720 +721 +722 +723 +724 +725 +726 +727 +728 +729 +730 +731 +732 +733 +734 +735 +736 +737 +738 +739 +740 +741 +742 +743 +744 +745 +746 +747 +748 +749 +750 +751 +752 +753 +754 +755 +756 +757 +758 +759 +760 +761 +762 +763 +764 +765 +766 +767 +768 +769 +770 +771 +772 +773 +774 +775 +776 +777 +778 +779 +780 +781 +782 +783 +784 +785 +786 +787 +788 +789 +790 +791 +792 +793 +794 +795 +796 +797 +798 +799 +800 +801 +802 +803 +804 +805 +806 +807 +808 +809 +810 +811 +812 +813 +814 +815 +816 +817 +818 +819 +820 +821 
+822 +823 +824 +825 +826 +827 +828 +829 +830 +831 +832 +833 +834 +835 +836 +837 +838 +839 +840 +841 +842 +843 +844 +845 +846 +847 +848 +849 +850 +851 +852 +853 +854 +855 +856 +857 +858 +859 +860 +861 +862 +863 +864 +865 +866 +867 +868 +869 +870 +871 +872 +873 +874 +875 +876 +877 +
/*!
+An implementation of the [Two-Way substring search algorithm][two-way].
+
+[`Finder`] can be built for forward searches, while [`FinderRev`] can be built
+for reverse searches.
+
+Two-Way makes for a nice general purpose substring search algorithm because of
+its time and space complexity properties. It also performs well in practice.
+Namely, with `m = len(needle)` and `n = len(haystack)`, Two-Way takes `O(m)`
+time to create a finder, `O(1)` space and `O(n)` search time. In other words,
+the preprocessing step is quick, doesn't require any heap memory and the worst
+case search time is guaranteed to be linear in the haystack regardless of the
+size of the needle.
+
+While vector algorithms will usually beat Two-Way handily, vector algorithms
+also usually have pathological or edge cases that are better handled by Two-Way.
+Moreover, not all targets support vector algorithms or implementations for them
+simply may not exist yet.
+
+Two-Way can be found in the `memmem` implementations in at least [GNU libc] and
+[musl].
+
+[two-way]: https://en.wikipedia.org/wiki/Two-way_string-matching_algorithm
+[GNU libc]: https://www.gnu.org/software/libc/
+[musl]: https://www.musl-libc.org/
+*/
+
+use core::cmp;
+
+use crate::{
+ arch::all::{is_prefix, is_suffix},
+ memmem::Pre,
+};
+
+/// A forward substring searcher (first occurrence) that uses the Two-Way algorithm.
+#[derive(Clone, Copy, Debug)]
+pub struct Finder(TwoWay); // newtype over the shared `TwoWay` state
+
+/// A reverse substring searcher (last occurrence) that uses the Two-Way algorithm.
+#[derive(Clone, Copy, Debug)]
+pub struct FinderRev(TwoWay); // newtype over the shared `TwoWay` state
+
+/// An implementation of the TwoWay substring search algorithm.
+///
+/// This searcher supports forward and reverse search, although not
+/// simultaneously. It runs in `O(n + m)` time and `O(1)` space, where
+/// `n ~ len(haystack)` and `m ~ len(needle)`.
+///
+/// The implementation here roughly matches that which was developed by
+/// Crochemore and Perrin in their 1991 paper "Two-way string-matching." The
+/// changes in this implementation are 1) the use of zero-based indices, 2) a
+/// heuristic skip table based on the last byte (borrowed from Rust's standard
+/// library) and 3) the addition of heuristics for a fast skip loop. For (3),
+/// callers can pass any kind of prefilter they want, but usually it's one
+/// based on a heuristic that uses an approximate background frequency of bytes
+/// to choose rare bytes to quickly look for candidate match positions. Note
+/// though that currently, this prefilter functionality is not exposed directly
+/// in the public API. (File an issue if you want it and provide a use case
+/// please.)
+///
+/// The heuristic for fast skipping is automatically shut off if it's
+/// detected to be ineffective at search time. Generally, this only occurs in
+/// pathological cases. But this is generally necessary in order to preserve
+/// an `O(n + m)` time bound.
+///
+/// The code below is fairly complex and not obviously correct at all. It's
+/// likely necessary to read the Two-Way paper cited above in order to fully
+/// grok this code. The essence of it is:
+///
+/// 1. Do something to detect a "critical" position in the needle.
+/// 2. For the current position in the haystack, look if `needle[critical..]`
+/// matches at that position.
+/// 3. If so, look if `needle[..critical]` matches.
+/// 4. If a mismatch occurs, shift the search by some amount based on the
+/// critical position and a pre-computed shift.
+///
+/// This type is wrapped in the forward and reverse finders that expose
+/// consistent forward or reverse APIs.
+#[derive(Clone, Copy, Debug)]
+struct TwoWay {
+ /// A small bitset used as a quick prefilter (in addition to any prefilter
+ /// given by the caller). Namely, bit `i` is set if and only if `b % 64 == i`
+ /// for some byte `b` that occurs in the needle.
+ ///
+ /// When used as a prefilter, if the last byte at the current candidate
+ /// position is NOT in this set, then we can skip that entire candidate
+ /// position (the length of the needle). This is essentially the shift
+ /// trick found in Boyer-Moore, but only applied to bytes that don't appear
+ /// in the needle.
+ ///
+ /// N.B. This trick was inspired by something similar in std's
+ /// implementation of Two-Way.
+ byteset: ApproximateByteSet,
+ /// A critical position in needle. Specifically, this position corresponds
+ /// to the beginning of either the minimal or maximal suffix in needle. (N.B.
+ /// See SuffixKind below for why "minimal" isn't quite the correct word
+ /// here.)
+ ///
+ /// This is the position at which every search begins. Namely, search
+ /// starts by scanning text to the right of this position, and only if
+ /// there's a match does the text to the left of this position get scanned.
+ critical_pos: usize,
+ /// The amount we shift by in the Two-Way search algorithm. This
+ /// corresponds to the "small period" and "large period" cases.
+ shift: Shift,
+}
+
+impl Finder {
+ /// Build a forward searcher for the given `needle`.
+ ///
+ /// With an empty `needle`, a search matches at every position of a
+ /// haystack, `haystack.len()` included.
+ #[inline]
+ pub fn new(needle: &[u8]) -> Finder {
+ let minimal = Suffix::forward(needle, SuffixKind::Minimal);
+ let maximal = Suffix::forward(needle, SuffixKind::Maximal);
+ // The critical position is the right-most of the two suffix starts; on
+ // a tie the maximal suffix is the one that is kept.
+ let chosen = if minimal.pos > maximal.pos { minimal } else { maximal };
+ let Suffix { pos: critical_pos, period: period_lower_bound } = chosen;
+ let shift = Shift::forward(needle, period_lower_bound, critical_pos);
+ let byteset = ApproximateByteSet::new(needle);
+ Finder(TwoWay { byteset, critical_pos, shift })
+ }
+
+ /// Returns the first occurrence of `needle` in the given `haystack`, or
+ /// `None` if no such occurrence could be found.
+ ///
+ /// The `needle` given must be the same as the `needle` provided to
+ /// [`Finder::new`].
+ ///
+ /// An empty `needle` results in a match at every position in a haystack,
+ /// including at `haystack.len()`.
+ #[inline]
+ pub fn find(&self, haystack: &[u8], needle: &[u8]) -> Option<usize> {
+ self.find_with_prefilter(None, haystack, needle) // `None`: search without a prefilter
+ }
+
+ /// Same search as [`Finder::find`], except that an optional prefilter may
+ /// be supplied to accelerate it.
+ ///
+ /// This is deliberately kept out of the public API: at the time of
+ /// writing, no thought had gone into how (or whether) to expose the
+ /// prefilter infrastructure. If you have a compelling use case for this
+ /// routine, please create an issue. Do *not* open a PR that just exposes
+ /// `Pre` and friends. Exporting this routine will require API design.
+ #[inline(always)]
+ pub(crate) fn find_with_prefilter(
+ &self,
+ pre: Option<Pre<'_>>,
+ haystack: &[u8],
+ needle: &[u8],
+ ) -> Option<usize> {
+ // Dispatch on the shift kind that was computed when the finder was built.
+ let twoway = &self.0;
+ match twoway.shift {
+ Shift::Large { shift } => {
+ self.find_large_imp(pre, haystack, needle, shift)
+ }
+ Shift::Small { period } => {
+ self.find_small_imp(pre, haystack, needle, period)
+ }
+ }
+ }
+
+ // Each of the two search implementations below can be accelerated by a
+ // prefilter, but it is not always enabled. To avoid its overhead when
+ // it's disabled, we explicitly inline each search implementation based on
+ // whether a prefilter will be used or not. The decision on which to use
+ // is made in the parent meta searcher.
+
+ #[inline(always)]
+ fn find_small_imp( // forward search used for the `Shift::Small` (small period) case
+ &self,
+ mut pre: Option<Pre<'_>>,
+ haystack: &[u8],
+ needle: &[u8],
+ period: usize,
+ ) -> Option<usize> {
+ let mut pos = 0; // start of the current candidate window in `haystack`
+ let mut shift = 0; // NOTE(review): looks like the Two-Way "memory" (prefix already matched) - confirm
+ let last_byte_pos = match needle.len().checked_sub(1) {
+ None => return Some(pos), // empty needle: report a match at offset 0
+ Some(last_byte) => last_byte,
+ };
+ while pos + needle.len() <= haystack.len() {
+ let mut i = cmp::max(self.0.critical_pos, shift);
+ if let Some(pre) = pre.as_mut() {
+ if pre.is_effective() {
+ pos += pre.find(&haystack[pos..])?; // `?`: the prefilter found no candidate, so return `None`
+ shift = 0;
+ i = self.0.critical_pos;
+ if pos + needle.len() > haystack.len() {
+ return None;
+ }
+ }
+ }
+ if !self.0.byteset.contains(haystack[pos + last_byte_pos]) {
+ pos += needle.len(); // the window's last byte is not in the needle: skip the whole window
+ shift = 0;
+ continue;
+ }
+ while i < needle.len() && needle[i] == haystack[pos + i] { // scan rightwards from `i`
+ i += 1;
+ }
+ if i < needle.len() {
+ pos += i - self.0.critical_pos + 1; // mismatch in the right part
+ shift = 0;
+ } else {
+ let mut j = self.0.critical_pos; // right part matched; now scan leftwards down to `shift`
+ while j > shift && needle[j] == haystack[pos + j] {
+ j -= 1;
+ }
+ if j <= shift && needle[shift] == haystack[pos + shift] {
+ return Some(pos);
+ }
+ pos += period; // left part mismatched: advance by the period
+ shift = needle.len() - period;
+ }
+ }
+ None
+ }
+
+ #[inline(always)]
+ fn find_large_imp( // forward search used for the `Shift::Large` (large period) case
+ &self,
+ mut pre: Option<Pre<'_>>,
+ haystack: &[u8],
+ needle: &[u8],
+ shift: usize,
+ ) -> Option<usize> {
+ let mut pos = 0; // start of the current candidate window in `haystack`
+ let last_byte_pos = match needle.len().checked_sub(1) {
+ None => return Some(pos), // empty needle: report a match at offset 0
+ Some(last_byte) => last_byte,
+ };
+ 'outer: while pos + needle.len() <= haystack.len() {
+ if let Some(pre) = pre.as_mut() {
+ if pre.is_effective() {
+ pos += pre.find(&haystack[pos..])?; // `?`: the prefilter found no candidate, so return `None`
+ if pos + needle.len() > haystack.len() {
+ return None;
+ }
+ }
+ }
+
+ if !self.0.byteset.contains(haystack[pos + last_byte_pos]) {
+ pos += needle.len(); // the window's last byte is not in the needle: skip the whole window
+ continue;
+ }
+ let mut i = self.0.critical_pos; // scan rightwards from the critical position
+ while i < needle.len() && needle[i] == haystack[pos + i] {
+ i += 1;
+ }
+ if i < needle.len() {
+ pos += i - self.0.critical_pos + 1; // mismatch in the right part
+ } else {
+ for j in (0..self.0.critical_pos).rev() { // right part matched; verify the left part
+ if needle[j] != haystack[pos + j] {
+ pos += shift; // left part mismatched: advance by the precomputed shift
+ continue 'outer;
+ }
+ }
+ return Some(pos);
+ }
+ }
+ None
+ }
+}
+
+impl FinderRev {
+ /// Build a reverse searcher for the given `needle`.
+ ///
+ /// With an empty `needle`, a search matches at every position of a
+ /// haystack, `haystack.len()` included.
+ #[inline]
+ pub fn new(needle: &[u8]) -> FinderRev {
+ let minimal = Suffix::reverse(needle, SuffixKind::Minimal);
+ let maximal = Suffix::reverse(needle, SuffixKind::Maximal);
+ // The critical position is the left-most of the two suffix positions;
+ // on a tie the maximal suffix is the one that is kept.
+ let chosen = if minimal.pos < maximal.pos { minimal } else { maximal };
+ let Suffix { pos: critical_pos, period: period_lower_bound } = chosen;
+ let shift = Shift::reverse(needle, period_lower_bound, critical_pos);
+ let byteset = ApproximateByteSet::new(needle);
+ FinderRev(TwoWay { byteset, critical_pos, shift })
+ }
+
+ /// Finds the last occurrence of `needle` in `haystack`, returning `None`
+ /// when there is no occurrence at all.
+ ///
+ /// The `needle` passed here must be the same `needle` that was given to
+ /// [`FinderRev::new`].
+ ///
+ /// With an empty `needle`, a search matches at every position of a
+ /// haystack, `haystack.len()` included.
+ #[inline]
+ pub fn rfind(&self, haystack: &[u8], needle: &[u8]) -> Option<usize> {
+ // No prefilter is used in the reverse direction. It might be worth
+ // having one, but it needs a lot of additional code and the payoff
+ // is unclear. If you have a really compelling use case for this,
+ // please file an issue.
+ let FinderRev(twoway) = self;
+ match twoway.shift {
+ Shift::Large { shift } => {
+ self.rfind_large_imp(haystack, needle, shift)
+ }
+ Shift::Small { period } => {
+ self.rfind_small_imp(haystack, needle, period)
+ }
+ }
+ }
+
+ #[inline(always)]
+ fn rfind_small_imp( // reverse search used for the `Shift::Small` (small period) case
+ &self,
+ haystack: &[u8],
+ needle: &[u8],
+ period: usize,
+ ) -> Option<usize> {
+ let nlen = needle.len();
+ let mut pos = haystack.len(); // exclusive end of the candidate window `haystack[pos - nlen..pos]`
+ let mut shift = nlen; // NOTE(review): looks like the mirrored Two-Way "memory" bound - confirm
+ let first_byte = match needle.get(0) {
+ None => return Some(pos), // empty needle: report a match at `haystack.len()`
+ Some(&first_byte) => first_byte,
+ };
+ while pos >= nlen {
+ if !self.0.byteset.contains(haystack[pos - nlen]) {
+ pos -= nlen; // the window's first byte is not in the needle: skip the whole window
+ shift = nlen;
+ continue;
+ }
+ let mut i = cmp::min(self.0.critical_pos, shift);
+ while i > 0 && needle[i - 1] == haystack[pos - nlen + i - 1] { // scan leftwards from `i`
+ i -= 1;
+ }
+ if i > 0 || first_byte != haystack[pos - nlen] {
+ pos -= self.0.critical_pos - i + 1; // mismatch in the left part
+ shift = nlen;
+ } else {
+ let mut j = self.0.critical_pos; // left part matched; now scan rightwards up to `shift`
+ while j < shift && needle[j] == haystack[pos - nlen + j] {
+ j += 1;
+ }
+ if j >= shift {
+ return Some(pos - nlen); // start offset of the match
+ }
+ pos -= period; // right part mismatched: move back by the period
+ shift = period;
+ }
+ }
+ None
+ }
+
+ #[inline(always)]
+ fn rfind_large_imp( // reverse search used for the `Shift::Large` (large period) case
+ &self,
+ haystack: &[u8],
+ needle: &[u8],
+ shift: usize,
+ ) -> Option<usize> {
+ let nlen = needle.len();
+ let mut pos = haystack.len(); // exclusive end of the candidate window `haystack[pos - nlen..pos]`
+ let first_byte = match needle.get(0) {
+ None => return Some(pos), // empty needle: report a match at `haystack.len()`
+ Some(&first_byte) => first_byte,
+ };
+ while pos >= nlen {
+ if !self.0.byteset.contains(haystack[pos - nlen]) {
+ pos -= nlen; // the window's first byte is not in the needle: skip the whole window
+ continue;
+ }
+ let mut i = self.0.critical_pos; // scan leftwards from the critical position
+ while i > 0 && needle[i - 1] == haystack[pos - nlen + i - 1] {
+ i -= 1;
+ }
+ if i > 0 || first_byte != haystack[pos - nlen] {
+ pos -= self.0.critical_pos - i + 1; // mismatch in the left part
+ } else {
+ let mut j = self.0.critical_pos; // left part matched; verify the right part
+ while j < nlen && needle[j] == haystack[pos - nlen + j] {
+ j += 1;
+ }
+ if j == nlen {
+ return Some(pos - nlen); // start offset of the match
+ }
+ pos -= shift; // right part mismatched: move back by the precomputed shift
+ }
+ }
+ None
+ }
+}
+
+/// A representation of the amount we're allowed to shift by during Two-Way
+/// search.
+///
+/// When computing a critical factorization of the needle, we find the position
+/// of the critical factorization by finding the needle's maximal (or minimal)
+/// suffix, along with the period of that suffix. It turns out that the period
+/// of that suffix is a lower bound on the period of the needle itself.
+///
+/// This lower bound is equivalent to the actual period of the needle in
+/// some cases. To describe that case, we denote the needle as `x` where
+/// `x = uv` and `v` is the lexicographic maximal suffix of `x`. The lower
+/// bound given here is always the period of `v`, which is `<= period(x)`. The
+/// case where `period(v) == period(x)` occurs when `len(u) < (len(x) / 2)` and
+/// where `u` is a suffix of `v[0..period(v)]`.
+///
+/// This case is important because the search algorithm for when the
+/// periods are equivalent is slightly different than the search algorithm
+/// for when the periods are not equivalent. In particular, when they aren't
+/// equivalent, we know that the period of the needle is no less than half its
+/// length. In this case, we shift by an amount less than or equal to the
+/// period of the needle (determined by the maximum length of the components
+/// of the critical factorization of `x`, i.e., `max(len(u), len(v))`).
+///
+/// The above two cases are represented by the variants below. Each entails
+/// a different instantiation of the Two-Way search algorithm.
+///
+/// N.B. If we could find a way to compute the exact period in all cases,
+/// then we could collapse this case analysis and simplify the algorithm. The
+/// Two-Way paper suggests this is possible, but more reading is required to
+/// grok why the authors didn't pursue that path.
+#[derive(Clone, Copy, Debug)]
+enum Shift {
+ Small { period: usize }, // the suffix period is used directly as the needle's period
+ Large { shift: usize }, // fallback shift of `max(len(u), len(v))`
+}
+
+impl Shift {
+ /// Determine the forward-search shift for `needle`.
+ ///
+ /// The caller supplies a lower bound on the needle's period together with
+ /// a critical position. Both come from extracting the minimal and maximal
+ /// lexicographic suffixes and keeping the one that starts right-most; the
+ /// period of that suffix is the lower bound.
+ fn forward(
+ needle: &[u8],
+ period_lower_bound: usize,
+ critical_pos: usize,
+ ) -> Shift {
+ let fallback = Shift::Large {
+ shift: cmp::max(critical_pos, needle.len() - critical_pos),
+ };
+ // The left factor must be shorter than half the needle for the small
+ // period case to be possible at all.
+ if critical_pos * 2 >= needle.len() {
+ return fallback;
+ }
+
+ let (left, right) = needle.split_at(critical_pos);
+ if is_suffix(&right[..period_lower_bound], left) {
+ Shift::Small { period: period_lower_bound }
+ } else {
+ fallback
+ }
+ }
+
+ /// Determine the reverse-search shift for `needle`.
+ ///
+ /// The caller supplies a lower bound on the needle's period together with
+ /// a critical position. Both come from extracting the minimal and maximal
+ /// lexicographic suffixes and keeping the one that starts left-most; the
+ /// period of that suffix is the lower bound.
+ fn reverse(
+ needle: &[u8],
+ period_lower_bound: usize,
+ critical_pos: usize,
+ ) -> Shift {
+ let right_len = needle.len() - critical_pos;
+ let fallback = Shift::Large { shift: cmp::max(critical_pos, right_len) };
+ // The right factor must be shorter than half the needle for the small
+ // period case to be possible at all.
+ if right_len * 2 >= needle.len() {
+ return fallback;
+ }
+
+ let (left, right) = needle.split_at(critical_pos);
+ if is_prefix(&left[left.len() - period_lower_bound..], right) {
+ Shift::Small { period: period_lower_bound }
+ } else {
+ fallback
+ }
+ }
+}
+
+/// A suffix extracted from a needle along with its period.
+#[derive(Debug)]
+struct Suffix {
+ /// The starting position of this suffix.
+ ///
+ /// If this is a forward suffix, then `&bytes[pos..]` can be used. If this
+ /// is a reverse suffix, then `&bytes[..pos]` can be used. That is, for
+ /// forward suffixes, this is an inclusive starting position, whereas for
+ /// reverse suffixes, this is an exclusive ending position.
+ pos: usize,
+ /// The period of this suffix.
+ ///
+ /// Note that this is NOT necessarily the period of the string from which
+ /// this suffix comes. (It is always less than or equal to the period
+ /// of the original string.)
+ period: usize,
+}
+
+impl Suffix {
+ fn forward(needle: &[u8], kind: SuffixKind) -> Suffix { // maximal/minimal suffix of `needle`, per `kind`
+ // suffix represents our maximal (or minimal) suffix, along with
+ // its period.
+ let mut suffix = Suffix { pos: 0, period: 1 };
+ // The start of a suffix in `needle` that we are considering as a
+ // more maximal (or minimal) suffix than what's in `suffix`.
+ let mut candidate_start = 1;
+ // The current offset of our suffixes that we're comparing.
+ //
+ // When the characters at this offset are the same, then we push on
+ // to the next position since no decision is possible. When the
+ // candidate's character is greater (or lesser) than the corresponding
+ // character of our current maximal (or minimal) suffix, then the
+ // current suffix is changed over to the candidate and we restart our
+ // search. Otherwise, the candidate suffix is no good and we restart
+ // our search on the next candidate.
+ //
+ // The three cases above correspond to the three cases in the loop
+ // below.
+ let mut offset = 0;
+
+ while candidate_start + offset < needle.len() {
+ let current = needle[suffix.pos + offset];
+ let candidate = needle[candidate_start + offset];
+ match kind.cmp(current, candidate) {
+ SuffixOrdering::Accept => { // the candidate replaces the current suffix
+ suffix = Suffix { pos: candidate_start, period: 1 };
+ candidate_start += 1;
+ offset = 0;
+ }
+ SuffixOrdering::Skip => { // the candidate is rejected; the period grows to span it
+ candidate_start += offset + 1;
+ offset = 0;
+ suffix.period = candidate_start - suffix.pos;
+ }
+ SuffixOrdering::Push => { // bytes are equal: keep comparing
+ if offset + 1 == suffix.period {
+ candidate_start += suffix.period; // a whole period matched: jump ahead by it
+ offset = 0;
+ } else {
+ offset += 1;
+ }
+ }
+ }
+ }
+ suffix
+ }
+
+ fn reverse(needle: &[u8], kind: SuffixKind) -> Suffix { // mirror of `forward`, scanning from the end
+ // See the comments in `forward` for how this works.
+ let mut suffix = Suffix { pos: needle.len(), period: 1 };
+ if needle.len() == 1 {
+ return suffix; // a one-byte needle has no other candidate
+ }
+ let mut candidate_start = match needle.len().checked_sub(1) {
+ None => return suffix, // empty needle: `pos` is 0 and `period` is 1
+ Some(candidate_start) => candidate_start,
+ };
+ let mut offset = 0;
+
+ while offset < candidate_start {
+ let current = needle[suffix.pos - offset - 1];
+ let candidate = needle[candidate_start - offset - 1];
+ match kind.cmp(current, candidate) {
+ SuffixOrdering::Accept => { // the candidate replaces the current suffix
+ suffix = Suffix { pos: candidate_start, period: 1 };
+ candidate_start -= 1;
+ offset = 0;
+ }
+ SuffixOrdering::Skip => { // the candidate is rejected; the period grows to span it
+ candidate_start -= offset + 1;
+ offset = 0;
+ suffix.period = suffix.pos - candidate_start;
+ }
+ SuffixOrdering::Push => { // bytes are equal: keep comparing
+ if offset + 1 == suffix.period {
+ candidate_start -= suffix.period; // a whole period matched: jump back by it
+ offset = 0;
+ } else {
+ offset += 1;
+ }
+ }
+ }
+ }
+ suffix
+ }
+}
+
+/// The kind of suffix to extract.
+#[derive(Clone, Copy, Debug)]
+enum SuffixKind {
+ /// Extract the smallest lexicographic suffix from a string.
+ ///
+ /// Technically, this doesn't actually pick the smallest lexicographic
+ /// suffix. E.g., given the choice between `a` and `aa`, this will choose
+ /// the latter over the former, even though `a < aa`. The reasoning for
+ /// this isn't clear from the paper, but it still smells like a minimal
+ /// suffix.
+ Minimal,
+ /// Extract the largest lexicographic suffix from a string.
+ ///
+ /// Unlike `Minimal`, this really does pick the maximum suffix. E.g., given
+ /// the choice between `z` and `zz`, this will choose the latter over the
+ /// former.
+ Maximal,
+}
+
+/// The result of comparing corresponding bytes between two suffixes (see `SuffixKind::cmp`).
+#[derive(Clone, Copy, Debug)]
+enum SuffixOrdering {
+ /// This occurs when the given candidate byte indicates that the candidate
+ /// suffix is better than the current maximal (or minimal) suffix. That is,
+ /// the current candidate suffix should supplant the current maximal (or
+ /// minimal) suffix.
+ Accept,
+ /// This occurs when the given candidate byte excludes the candidate suffix
+ /// from being better than the current maximal (or minimal) suffix. That
+ /// is, the current candidate suffix should be dropped and the next one
+ /// should be considered.
+ Skip,
+ /// This occurs when no decision to accept or skip the candidate suffix
+ /// can be made, e.g., when corresponding bytes are equivalent. In this
+ /// case, the next corresponding bytes should be compared.
+ Push,
+}
+
+impl SuffixKind {
+ /// Classify how `candidate` compares to `current` for this kind of
+ /// suffix: `Accept` when the candidate byte wins (smaller for `Minimal`,
+ /// larger for `Maximal`), `Skip` when it loses, and `Push` when the two
+ /// bytes are equal and no decision can be made yet.
+ fn cmp(self, current: u8, candidate: u8) -> SuffixOrdering {
+ use self::SuffixOrdering::*;
+
+ if candidate == current {
+ return Push;
+ }
+ let candidate_is_larger = candidate > current;
+ match self {
+ SuffixKind::Minimal => {
+ if candidate_is_larger {
+ Skip
+ } else {
+ Accept
+ }
+ }
+ SuffixKind::Maximal => {
+ if candidate_is_larger {
+ Accept
+ } else {
+ Skip
+ }
+ }
+ }
+ }
+}
+
+/// A 64-bit approximate membership filter over the bytes of a needle.
+///
+/// Bit `i` is set exactly when some byte `b` of the needle satisfies
+/// `b % 64 == i`. When a haystack byte is NOT in this set, it cannot occur in
+/// the needle either, so the search may advance by needle.len() bytes.
+#[derive(Clone, Copy, Debug)]
+struct ApproximateByteSet(u64);
+
+impl ApproximateByteSet {
+ /// Build the set from every byte of the given needle.
+ fn new(needle: &[u8]) -> ApproximateByteSet {
+ let bits = needle.iter().fold(0u64, |acc, &b| acc | (1u64 << (b % 64)));
+ ApproximateByteSet(bits)
+ }
+
+ /// Report whether the given byte might be in this set. A `true` answer
+ /// can be a false positive; a `false` answer is always correct.
+ #[inline(always)]
+ fn contains(&self, byte: u8) -> bool {
+ let mask = 1u64 << (byte % 64);
+ (self.0 & mask) != 0
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use alloc::vec::Vec;
+
+ use super::*;
+
+ /// Test helper: run `Suffix::forward` and hand back the suffix bytes
+ /// (`needle[pos..]`) together with the reported period.
+ fn get_suffix_forward(needle: &[u8], kind: SuffixKind) -> (&[u8], usize) {
+ let Suffix { pos, period } = Suffix::forward(needle, kind);
+ (&needle[pos..], period)
+ }
+
+ /// Test helper: run `Suffix::reverse` and hand back the reverse suffix
+ /// bytes (`needle[..pos]`) together with the reported period.
+ fn get_suffix_reverse(needle: &[u8], kind: SuffixKind) -> (&[u8], usize) {
+ let Suffix { pos, period } = Suffix::reverse(needle, kind);
+ (&needle[..pos], period)
+ }
+
+ /// Collect every non-empty suffix of `bytes`, longest first.
+ fn suffixes(bytes: &[u8]) -> Vec<&[u8]> {
+ let mut all = Vec::with_capacity(bytes.len());
+ for start in 0..bytes.len() {
+ all.push(&bytes[start..]);
+ }
+ all
+ }
+
+ /// Brute-force reference: the lexicographically greatest suffix of
+ /// `needle`. Panics when `needle` is empty.
+ fn naive_maximal_suffix_forward(needle: &[u8]) -> &[u8] {
+ suffixes(needle).into_iter().max().unwrap()
+ }
+
+ /// Brute-force reference for the reverse direction: take the maximal
+ /// suffix of the reversed byte string, then reverse that result back.
+ fn naive_maximal_suffix_reverse(needle: &[u8]) -> Vec<u8> {
+ let flipped: Vec<u8> = needle.iter().rev().copied().collect();
+ naive_maximal_suffix_forward(&flipped).iter().rev().copied().collect()
+ }
+
+ define_substring_forward_quickcheck!(|h, n| Some( // macro defined elsewhere; presumably generates forward-search properties
+ Finder::new(n).find(h, n)
+ ));
+ define_substring_reverse_quickcheck!(|h, n| Some( // macro defined elsewhere; presumably generates reverse-search properties
+ FinderRev::new(n).rfind(h, n)
+ ));
+
+ #[test]
+ fn forward() { // runs the crate's shared substring test runner against `Finder::find`
+ crate::tests::substring::Runner::new()
+ .fwd(|h, n| Some(Finder::new(n).find(h, n)))
+ .run();
+ }
+
+ #[test]
+ fn reverse() { // runs the crate's shared substring test runner against `FinderRev::rfind`
+ crate::tests::substring::Runner::new()
+ .rev(|h, n| Some(FinderRev::new(n).rfind(h, n)))
+ .run();
+ }
+
+ #[test]
+ fn suffix_forward() { // hand-checked (suffix, period) pairs for `Suffix::forward`
+ macro_rules! assert_suffix_min { // args: input, expected minimal suffix, expected period
+ ($given:expr, $expected:expr, $period:expr) => {
+ let (got_suffix, got_period) =
+ get_suffix_forward($given.as_bytes(), SuffixKind::Minimal);
+ let got_suffix = core::str::from_utf8(got_suffix).unwrap();
+ assert_eq!(($expected, $period), (got_suffix, got_period));
+ };
+ }
+
+ macro_rules! assert_suffix_max { // args: input, expected maximal suffix, expected period
+ ($given:expr, $expected:expr, $period:expr) => {
+ let (got_suffix, got_period) =
+ get_suffix_forward($given.as_bytes(), SuffixKind::Maximal);
+ let got_suffix = core::str::from_utf8(got_suffix).unwrap();
+ assert_eq!(($expected, $period), (got_suffix, got_period));
+ };
+ }
+
+ assert_suffix_min!("a", "a", 1);
+ assert_suffix_max!("a", "a", 1);
+
+ assert_suffix_min!("ab", "ab", 2);
+ assert_suffix_max!("ab", "b", 1);
+
+ assert_suffix_min!("ba", "a", 1);
+ assert_suffix_max!("ba", "ba", 2);
+
+ assert_suffix_min!("abc", "abc", 3);
+ assert_suffix_max!("abc", "c", 1);
+
+ assert_suffix_min!("acb", "acb", 3);
+ assert_suffix_max!("acb", "cb", 2);
+
+ assert_suffix_min!("cba", "a", 1);
+ assert_suffix_max!("cba", "cba", 3);
+
+ assert_suffix_min!("abcabc", "abcabc", 3);
+ assert_suffix_max!("abcabc", "cabc", 3);
+
+ assert_suffix_min!("abcabcabc", "abcabcabc", 3);
+ assert_suffix_max!("abcabcabc", "cabcabc", 3);
+
+ assert_suffix_min!("abczz", "abczz", 5);
+ assert_suffix_max!("abczz", "zz", 1);
+
+ assert_suffix_min!("zzabc", "abc", 3);
+ assert_suffix_max!("zzabc", "zzabc", 5);
+
+ assert_suffix_min!("aaa", "aaa", 1);
+ assert_suffix_max!("aaa", "aaa", 1);
+
+ assert_suffix_min!("foobar", "ar", 2);
+ assert_suffix_max!("foobar", "r", 1);
+ }
+
+ #[test]
+ fn suffix_reverse() { // hand-checked (suffix, period) pairs for `Suffix::reverse`
+ macro_rules! assert_suffix_min { // args: input, expected minimal reverse suffix, expected period
+ ($given:expr, $expected:expr, $period:expr) => {
+ let (got_suffix, got_period) =
+ get_suffix_reverse($given.as_bytes(), SuffixKind::Minimal);
+ let got_suffix = core::str::from_utf8(got_suffix).unwrap();
+ assert_eq!(($expected, $period), (got_suffix, got_period));
+ };
+ }
+
+ macro_rules! assert_suffix_max { // args: input, expected maximal reverse suffix, expected period
+ ($given:expr, $expected:expr, $period:expr) => {
+ let (got_suffix, got_period) =
+ get_suffix_reverse($given.as_bytes(), SuffixKind::Maximal);
+ let got_suffix = core::str::from_utf8(got_suffix).unwrap();
+ assert_eq!(($expected, $period), (got_suffix, got_period));
+ };
+ }
+
+ assert_suffix_min!("a", "a", 1);
+ assert_suffix_max!("a", "a", 1);
+
+ assert_suffix_min!("ab", "a", 1);
+ assert_suffix_max!("ab", "ab", 2);
+
+ assert_suffix_min!("ba", "ba", 2);
+ assert_suffix_max!("ba", "b", 1);
+
+ assert_suffix_min!("abc", "a", 1);
+ assert_suffix_max!("abc", "abc", 3);
+
+ assert_suffix_min!("acb", "a", 1);
+ assert_suffix_max!("acb", "ac", 2);
+
+ assert_suffix_min!("cba", "cba", 3);
+ assert_suffix_max!("cba", "c", 1);
+
+ assert_suffix_min!("abcabc", "abca", 3);
+ assert_suffix_max!("abcabc", "abcabc", 3);
+
+ assert_suffix_min!("abcabcabc", "abcabca", 3);
+ assert_suffix_max!("abcabcabc", "abcabcabc", 3);
+
+ assert_suffix_min!("abczz", "a", 1);
+ assert_suffix_max!("abczz", "abczz", 5);
+
+ assert_suffix_min!("zzabc", "zza", 3);
+ assert_suffix_max!("zzabc", "zz", 1);
+
+ assert_suffix_min!("aaa", "aaa", 1);
+ assert_suffix_max!("aaa", "aaa", 1);
+ }
+
+ #[cfg(not(miri))]
+ quickcheck::quickcheck! {
+ fn qc_suffix_forward_maximal(bytes: Vec<u8>) -> bool { // forward maximal suffix vs. the naive reference
+ if bytes.is_empty() {
+ return true; // the naive reference panics on empty input
+ }
+
+ let (got, _) = get_suffix_forward(&bytes, SuffixKind::Maximal);
+ let expected = naive_maximal_suffix_forward(&bytes);
+ got == expected
+ }
+
+ fn qc_suffix_reverse_maximal(bytes: Vec<u8>) -> bool { // reverse maximal suffix vs. the naive reference
+ if bytes.is_empty() {
+ return true; // the naive reference panics on empty input
+ }
+
+ let (got, _) = get_suffix_reverse(&bytes, SuffixKind::Maximal);
+ let expected = naive_maximal_suffix_reverse(&bytes);
+ expected == got
+ }
+ }
+
+ // This is a regression test caught by quickcheck that exercised a bug in
+ // the reverse small period handling. The bug was that we were using 'if j
+ // == shift' to determine if a match occurred, but the correct guard is 'if
+ // j >= shift', which matches the corresponding guard in the forward impl.
+ #[test]
+ fn regression_rev_small_period() {
+ let rfind = |h, n| FinderRev::new(n).rfind(h, n);
+ let haystack = "ababaz";
+ let needle = "abab";
+ assert_eq!(Some(0), rfind(haystack.as_bytes(), needle.as_bytes())); // the only occurrence starts at offset 0
+ }
+}
+
1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 +95 +96 +97 +98 +99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 +247 +248 +249 +250 +251 +252 +253 +254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 +265 +266 +267 +268 +269 +270 +271 +272 +273 +274 +275 +276 +277 +278 +279 +280 +281 +282 +283 +284 +285 +286 +287 +288 +289 +290 +291 +292 +293 +294 +295 +296 +297 +298 +299 +300 +301 +302 +303 +304 +305 +306 +307 +308 +309 +310 +311 +312 +313 +314 +315 +316 +317 +318 +319 +320 +321 +322 +323 +324 +325 +326 +327 +328 +329 +330 +331 +332 +333 +334 +335 +336 +337 +338 +339 +340 +341 +342 +343 +344 +345 +346 +347 +348 +349 +350 +351 +352 +353 +354 +355 +356 +357 +358 +359 +360 +361 +362 +363 +364 +365 +366 +367 +368 +369 +370 +371 +372 +373 +374 +375 +376 +377 +378 +379 +380 +381 +382 +383 +384 +385 +386 +387 +388 +389 +390 +391 +392 +393 +394 +395 +396 +397 +398 +399 +400 +401 +402 +403 +404 +405 +406 +407 +408 +409 +410 +411 +412 +413 +414 +415 +416 +417 +418 +419 +420 +421 
+422 +423 +424 +425 +426 +427 +428 +429 +430 +431 +432 +433 +434 +435 +436 +437 +438 +439 +440 +441 +442 +443 +444 +445 +446 +447 +448 +449 +450 +451 +452 +453 +454 +455 +456 +457 +458 +459 +460 +461 +462 +463 +464 +465 +466 +467 +468 +469 +470 +471 +472 +473 +474 +475 +476 +477 +478 +479 +480 +481 +482 +483 +484 +485 +486 +487 +488 +489 +490 +491 +492 +493 +494 +495 +496 +497 +498 +499 +500 +501 +502 +503 +504 +505 +506 +507 +508 +509 +510 +511 +512 +513 +514 +515 +516 +517 +518 +519 +520 +521 +522 +523 +524 +525 +526 +527 +528 +529 +530 +531 +532 +533 +534 +535 +536 +537 +538 +539 +540 +541 +542 +543 +544 +545 +546 +547 +548 +549 +550 +551 +552 +553 +554 +555 +556 +557 +558 +559 +560 +561 +562 +563 +564 +565 +566 +567 +568 +569 +570 +571 +572 +573 +574 +575 +576 +577 +578 +579 +580 +581 +582 +583 +584 +585 +586 +587 +588 +589 +590 +591 +592 +593 +594 +595 +596 +597 +598 +599 +600 +601 +602 +603 +604 +605 +606 +607 +608 +609 +610 +611 +612 +613 +614 +615 +616 +617 +618 +619 +620 +621 +622 +623 +624 +625 +626 +627 +628 +629 +630 +631 +632 +633 +634 +635 +636 +637 +638 +639 +640 +641 +642 +643 +644 +645 +646 +647 +648 +649 +650 +651 +652 +653 +654 +655 +656 +657 +658 +659 +660 +661 +662 +663 +664 +665 +666 +667 +668 +669 +670 +671 +672 +673 +674 +675 +676 +677 +678 +679 +680 +681 +682 +683 +684 +685 +686 +687 +688 +689 +690 +691 +692 +693 +694 +695 +696 +697 +698 +699 +700 +701 +702 +703 +704 +705 +706 +707 +708 +709 +710 +711 +712 +713 +714 +715 +716 +717 +718 +719 +720 +721 +722 +723 +724 +725 +726 +727 +728 +729 +730 +731 +732 +733 +734 +735 +736 +737 +738 +739 +740 +741 +742 +743 +744 +745 +746 +747 +748 +749 +750 +751 +752 +753 +754 +755 +756 +757 +758 +759 +760 +761 +762 +763 +764 +765 +766 +767 +768 +769 +770 +771 +772 +773 +774 +775 +776 +777 +778 +779 +780 +781 +782 +783 +784 +785 +786 +787 +788 +789 +790 +791 +792 +793 +794 +795 +796 +797 +798 +799 +800 +801 +802 +803 +804 +805 +806 +807 +808 +809 +810 +811 +812 +813 +814 +815 +816 +817 +818 +819 +820 +821 
+822 +823 +824 +825 +826 +827 +828 +829 +830 +831 +832 +833 +834 +835 +836 +837 +838 +839 +840 +841 +842 +843 +844 +845 +846 +847 +848 +849 +850 +851 +852 +853 +854 +855 +856 +857 +858 +859 +860 +861 +862 +863 +864 +865 +866 +867 +868 +869 +870 +871 +872 +873 +874 +875 +876 +877 +878 +879 +880 +881 +882 +883 +884 +885 +886 +887 +888 +889 +890 +891 +892 +893 +894 +895 +896 +897 +898 +899 +900 +901 +902 +903 +904 +905 +906 +907 +908 +909 +910 +911 +912 +913 +914 +915 +916 +917 +918 +919 +920 +921 +922 +923 +924 +925 +926 +927 +928 +929 +930 +931 +932 +933 +934 +935 +936 +937 +938 +939 +940 +941 +942 +943 +944 +945 +946 +947 +948 +949 +950 +951 +952 +953 +954 +955 +956 +957 +958 +959 +960 +961 +962 +963 +964 +965 +966 +967 +968 +969 +970 +971 +972 +973 +974 +975 +976 +977 +978 +979 +980 +981 +982 +983 +984 +985 +986 +987 +988 +989 +990 +991 +992 +993 +994 +995 +996 +997 +998 +999 +1000 +1001 +1002 +1003 +1004 +1005 +1006 +1007 +1008 +1009 +1010 +1011 +1012 +1013 +1014 +1015 +1016 +1017 +1018 +1019 +1020 +1021 +1022 +1023 +1024 +1025 +1026 +1027 +1028 +1029 +1030 +1031 +1032 +1033 +1034 +1035 +1036 +1037 +1038 +1039 +1040 +1041 +1042 +1043 +1044 +1045 +1046 +1047 +1048 +1049 +1050 +1051 +1052 +1053 +1054 +1055 +1056 +1057 +1058 +1059 +1060 +1061 +1062 +1063 +1064 +1065 +1066 +1067 +1068 +1069 +1070 +1071 +1072 +1073 +1074 +1075 +1076 +1077 +1078 +1079 +1080 +1081 +1082 +1083 +1084 +1085 +1086 +1087 +1088 +1089 +1090 +1091 +1092 +1093 +1094 +1095 +1096 +1097 +1098 +1099 +1100 +1101 +1102 +1103 +1104 +1105 +1106 +1107 +1108 +1109 +1110 +1111 +1112 +1113 +1114 +1115 +1116 +1117 +1118 +1119 +1120 +1121 +1122 +1123 +1124 +1125 +1126 +1127 +1128 +1129 +1130 +1131 +1132 +1133 +1134 +1135 +1136 +1137 +1138 +1139 +1140 +1141 +1142 +1143 +1144 +1145 +1146 +1147 +1148 +1149 +1150 +1151 +1152 +1153 +1154 +1155 +1156 +1157 +1158 +1159 +1160 +1161 +1162 +1163 +1164 +1165 +1166 +1167 +1168 +1169 +1170 +1171 +1172 +1173 +1174 +1175 +1176 +1177 +1178 +1179 +1180 +1181 +1182 +1183 +1184 
+1185 +1186 +1187 +1188 +1189 +1190 +1191 +1192 +1193 +1194 +1195 +1196 +1197 +1198 +1199 +1200 +1201 +1202 +1203 +1204 +1205 +1206 +1207 +1208 +1209 +1210 +1211 +1212 +1213 +1214 +
/*!
+Generic crate-internal routines for the `memchr` family of functions.
+*/
+
+// What follows is a vector algorithm generic over the specific vector
+// type to detect the position of one, two or three needles in a haystack.
+// From what I know, this is a "classic" algorithm, although I don't
+// believe it has been published in any peer reviewed journal. I believe
+// it can be found in places like glibc and Go's standard library. It
+// appears to be well known and is elaborated on in more detail here:
+// https://gms.tf/stdfind-and-memchr-optimizations.html
+//
+// While the routine below is fairly long and perhaps intimidating, the basic
+// idea is actually very simple and can be expressed straight-forwardly in
+// pseudo code. The pseudo code below is written for 128 bit vectors, but the
+// actual code below works for anything that implements the Vector trait.
+//
+// needle = (n1 << 15) | (n1 << 14) | ... | (n1 << 1) | n1
+// // Note: shift amount is in bytes
+//
+// while i <= haystack.len() - 16:
+// // A 16 byte vector. Each byte in chunk corresponds to a byte in
+// // the haystack.
+// chunk = haystack[i:i+16]
+// // Compare bytes in needle with bytes in chunk. The result is a 16
+// // byte chunk where each byte is 0xFF if the corresponding bytes
+// // in needle and chunk were equal, or 0x00 otherwise.
+// eqs = cmpeq(needle, chunk)
+// // Return a 32 bit integer where the most significant 16 bits
+// // are always 0 and the lower 16 bits correspond to whether the
+// // most significant bit in the corresponding byte in `eqs` is set.
+// // In other words, `mask as u16` has bit i set if and only if
+// // needle[i] == chunk[i].
+// mask = movemask(eqs)
+//
+// // Mask is 0 if there is no match, and non-zero otherwise.
+// if mask != 0:
+// // trailing_zeros tells us the position of the least significant
+// // bit that is set.
+// return i + trailing_zeros(mask)
+//
+// // haystack length may not be a multiple of 16, so search the rest.
+// while i < haystack.len():
+// if haystack[i] == n1:
+// return i
+//
+// // No match found.
+// return NULL
+//
+// In fact, we could loosely translate the above code to Rust line-for-line
+// and it would be a pretty fast algorithm. But, we pull out all the stops
+// to go as fast as possible:
+//
+// 1. We use aligned loads. That is, we do some finagling to make sure our
+// primary loop not only proceeds in increments of 16 bytes, but that
+// the address of haystack's pointer that we dereference is aligned to
+// 16 bytes. 16 is a magic number here because it is the size of an SSE2
+// 128-bit vector. (For the AVX2 algorithm, 32 is the magic number.)
+// Therefore, to get aligned loads, our pointer's address must be evenly
+// divisible by 16.
+// 2. Our primary loop proceeds 64 bytes at a time instead of 16. It's
+// kind of like loop unrolling, but we combine the equality comparisons
+// using a vector OR such that we only need to extract a single mask to
+// determine whether a match exists or not. If so, then we do some
+// book-keeping to determine the precise location but otherwise mush on.
+// 3. We use our "chunk" comparison routine in as many places as possible,
+// even if it means using unaligned loads. In particular, if haystack
+// starts with an unaligned address, then we do an unaligned load to
+// search the first 16 bytes. We then start our primary loop at the
+// smallest subsequent aligned address, which will actually overlap with
+// previously searched bytes. But we're OK with that. We do a similar
+// dance at the end of our primary loop. Finally, to avoid a
+// byte-at-a-time loop at the end, we do a final 16 byte unaligned load
+// that may overlap with a previous load. This is OK because it converts
+// a loop into a small number of very fast vector instructions. The overlap
+// is OK because we know the place where the overlap occurs does not
+// contain a match.
+//
+// And that's pretty much all there is to it. Note that since the below is
+// generic and since it's meant to be inlined into routines with a
+// `#[target_feature(enable = "...")]` annotation, we must mark all routines as
+// both unsafe and `#[inline(always)]`.
+//
+// The fact that the code below is generic does somewhat inhibit us. For
+// example, I've noticed that introducing a non-inlined `#[cold]` function to
+// handle the match case in the loop generates tighter assembly, but there is
+// no way to do this in the generic code below because the generic code doesn't
+// know what `target_feature` annotation to apply to the non-inlined function.
+// We could make such functions part of the `Vector` trait, but we instead live
+// with the slightly sub-optimal codegen for now since it doesn't seem to have
+// a noticeable perf difference.
+
+use crate::{
+ ext::Pointer,
+ vector::{MoveMask, Vector},
+};
+
+/// Finds all occurrences of a single byte in a haystack.
+///
+/// `V` is the vector type used to perform the search. The search routines on
+/// this type are only available when `V: Vector`.
+#[derive(Clone, Copy, Debug)]
+pub(crate) struct One<V> {
+ /// The needle byte, exactly as it was given to `One::new`.
+ s1: u8,
+ /// The result of `V::splat` on the needle byte (presumably the needle
+ /// repeated in every lane of the vector). Chunks of the haystack are
+ /// compared against this with `cmpeq`.
+ v1: V,
+}
+
+impl<V: Vector> One<V> {
+ /// The number of bytes we examine per each iteration of our search loop.
+ ///
+ /// This is four vectors' worth of bytes, matching the four aligned loads
+ /// done per iteration of the unrolled loops in `find_raw`, `rfind_raw` and
+ /// `count_raw`.
+ const LOOP_SIZE: usize = 4 * V::BYTES;
+
+ /// Create a new searcher that finds occurrences of the byte given.
+ ///
+ /// The needle is stored both as a scalar and as a vector built with
+ /// `V::splat`.
+ ///
+ /// # Safety
+ ///
+ /// This routine is `unsafe` because, as explained in the comment at the
+ /// top of this file, everything here is meant to be inlined into callers
+ /// carrying a `#[target_feature(enable = "...")]` annotation.
+ /// NOTE(review): presumably the caller must guarantee that the vector type
+ /// `V` is usable on the current CPU -- confirm against the `Vector` trait.
+ #[inline(always)]
+ pub(crate) unsafe fn new(needle: u8) -> One<V> {
+ One { s1: needle, v1: V::splat(needle) }
+ }
+
+ /// Returns the needle given to `One::new`.
+ ///
+ /// This is the scalar form of the needle; `count_raw` uses it to confirm
+ /// matches when counting byte-by-byte.
+ #[inline(always)]
+ pub(crate) fn needle1(&self) -> u8 {
+ self.s1
+ }
+
+ /// Return a pointer to the first occurrence of the needle in the given
+ /// haystack. If no such occurrence exists, then `None` is returned.
+ ///
+ /// The haystack is the sequence of bytes in the half-open range
+ /// `[start, end)`.
+ ///
+ /// When a match is found, the pointer returned is guaranteed to be
+ /// `>= start` and `< end`.
+ ///
+ /// # Safety
+ ///
+ /// * It must be the case that `start < end` and that the distance between
+ /// them is at least equal to `V::BYTES`. That is, it must always be valid
+ /// to do at least an unaligned load of `V` at `start`.
+ /// * Both `start` and `end` must be valid for reads.
+ /// * Both `start` and `end` must point to an initialized value.
+ /// * Both `start` and `end` must point to the same allocated object and
+ /// must either be in bounds or at most one byte past the end of the
+ /// allocated object.
+ /// * Both `start` and `end` must be _derived from_ a pointer to the same
+ /// object.
+ /// * The distance between `start` and `end` must not overflow `isize`.
+ /// * The distance being in bounds must not rely on "wrapping around" the
+ /// address space.
+ #[inline(always)]
+ pub(crate) unsafe fn find_raw(
+ &self,
+ start: *const u8,
+ end: *const u8,
+ ) -> Option<*const u8> {
+ // If we want to support vectors bigger than 256 bits, we probably
+ // need to move up to using a u64 for the masks used below. Currently
+ // they are 32 bits, which means we're SOL for vectors that need masks
+ // bigger than 32 bits. Overall unclear until there's a use case.
+ debug_assert!(V::BYTES <= 32, "vector cannot be bigger than 32 bytes");
+
+ // `topos` converts a match mask into an offset within a chunk. This is
+ // a forward search, so we use `first_offset`.
+ let topos = V::Mask::first_offset;
+ let len = end.distance(start);
+ debug_assert!(
+ len >= V::BYTES,
+ "haystack has length {}, but must be at least {}",
+ len,
+ V::BYTES
+ );
+
+ // Search a possibly unaligned chunk at `start`. This covers any part
+ // of the haystack prior to where aligned loads can start.
+ if let Some(cur) = self.search_chunk(start, topos) {
+ return Some(cur);
+ }
+ // Set `cur` to the first V-aligned pointer greater than `start`.
+ //
+ // NOTE(review): this treats `V::ALIGN` as a bit mask (presumably
+ // `V::BYTES - 1`) -- confirm against the `Vector` trait. When `start`
+ // is already aligned, `cur` ends up at `start + V::BYTES`, which is
+ // fine because those first `V::BYTES` bytes were searched just above.
+ let mut cur = start.add(V::BYTES - (start.as_usize() & V::ALIGN));
+ debug_assert!(cur > start && end.sub(V::BYTES) >= start);
+ if len >= Self::LOOP_SIZE {
+ // Main unrolled loop: four aligned vectors per iteration, for as
+ // long as a full `LOOP_SIZE` bytes remain before `end`.
+ while cur <= end.sub(Self::LOOP_SIZE) {
+ debug_assert_eq!(0, cur.as_usize() % V::BYTES);
+
+ let a = V::load_aligned(cur);
+ let b = V::load_aligned(cur.add(1 * V::BYTES));
+ let c = V::load_aligned(cur.add(2 * V::BYTES));
+ let d = V::load_aligned(cur.add(3 * V::BYTES));
+ let eqa = self.v1.cmpeq(a);
+ let eqb = self.v1.cmpeq(b);
+ let eqc = self.v1.cmpeq(c);
+ let eqd = self.v1.cmpeq(d);
+ // Combine all four comparison results so that a single test
+ // tells us whether any of the `LOOP_SIZE` bytes matched.
+ let or1 = eqa.or(eqb);
+ let or2 = eqc.or(eqd);
+ let or3 = or1.or(or2);
+ if or3.movemask_will_have_non_zero() {
+ // There is a match somewhere in these four vectors. Check
+ // them in increasing address order so that the first match
+ // is the one reported.
+ let mask = eqa.movemask();
+ if mask.has_non_zero() {
+ return Some(cur.add(topos(mask)));
+ }
+
+ let mask = eqb.movemask();
+ if mask.has_non_zero() {
+ return Some(cur.add(1 * V::BYTES).add(topos(mask)));
+ }
+
+ let mask = eqc.movemask();
+ if mask.has_non_zero() {
+ return Some(cur.add(2 * V::BYTES).add(topos(mask)));
+ }
+
+ // `a`, `b` and `c` had no match, so it must be in `d`.
+ let mask = eqd.movemask();
+ debug_assert!(mask.has_non_zero());
+ return Some(cur.add(3 * V::BYTES).add(topos(mask)));
+ }
+ cur = cur.add(Self::LOOP_SIZE);
+ }
+ }
+ // Handle any leftovers after the aligned loop above. We use unaligned
+ // loads here, but I believe we are guaranteed that they are aligned
+ // since `cur` is aligned.
+ while cur <= end.sub(V::BYTES) {
+ debug_assert!(end.distance(cur) >= V::BYTES);
+ if let Some(cur) = self.search_chunk(cur, topos) {
+ return Some(cur);
+ }
+ cur = cur.add(V::BYTES);
+ }
+ // Finally handle any remaining bytes less than the size of V. In this
+ // case, our pointer may indeed be unaligned and the load may overlap
+ // with the previous one. But that's okay since we know the previous
+ // load didn't lead to a match (otherwise we wouldn't be here).
+ if cur < end {
+ debug_assert!(end.distance(cur) < V::BYTES);
+ // Back `cur` up so that exactly `V::BYTES` bytes remain until `end`.
+ cur = cur.sub(V::BYTES - end.distance(cur));
+ debug_assert_eq!(end.distance(cur), V::BYTES);
+ return self.search_chunk(cur, topos);
+ }
+ None
+ }
+
+ /// Return a pointer to the last occurrence of the needle in the given
+ /// haystack. If no such occurrence exists, then `None` is returned.
+ ///
+ /// The haystack is the sequence of bytes in the half-open range
+ /// `[start, end)`. It is searched from `end` backwards towards `start`.
+ ///
+ /// When a match is found, the pointer returned is guaranteed to be
+ /// `>= start` and `< end`.
+ ///
+ /// # Safety
+ ///
+ /// * It must be the case that `start < end` and that the distance between
+ /// them is at least equal to `V::BYTES`. That is, it must always be valid
+ /// to do at least an unaligned load of `V` at `start`.
+ /// * Both `start` and `end` must be valid for reads.
+ /// * Both `start` and `end` must point to an initialized value.
+ /// * Both `start` and `end` must point to the same allocated object and
+ /// must either be in bounds or at most one byte past the end of the
+ /// allocated object.
+ /// * Both `start` and `end` must be _derived from_ a pointer to the same
+ /// object.
+ /// * The distance between `start` and `end` must not overflow `isize`.
+ /// * The distance being in bounds must not rely on "wrapping around" the
+ /// address space.
+ #[inline(always)]
+ pub(crate) unsafe fn rfind_raw(
+ &self,
+ start: *const u8,
+ end: *const u8,
+ ) -> Option<*const u8> {
+ // If we want to support vectors bigger than 256 bits, we probably
+ // need to move up to using a u64 for the masks used below. Currently
+ // they are 32 bits, which means we're SOL for vectors that need masks
+ // bigger than 32 bits. Overall unclear until there's a use case.
+ debug_assert!(V::BYTES <= 32, "vector cannot be bigger than 32 bytes");
+
+ // `topos` converts a match mask into an offset within a chunk. This is
+ // a reverse search, so we use `last_offset`.
+ let topos = V::Mask::last_offset;
+ let len = end.distance(start);
+ debug_assert!(
+ len >= V::BYTES,
+ "haystack has length {}, but must be at least {}",
+ len,
+ V::BYTES
+ );
+
+ // Search the possibly unaligned chunk made up of the last `V::BYTES`
+ // bytes of the haystack. This covers any part of the haystack after
+ // the point where aligned loads can be used.
+ if let Some(cur) = self.search_chunk(end.sub(V::BYTES), topos) {
+ return Some(cur);
+ }
+ // Round `end` down to a V-aligned address. From here on, `cur` is the
+ // exclusive upper bound of the region that still has to be searched.
+ //
+ // NOTE(review): this treats `V::ALIGN` as a bit mask (presumably
+ // `V::BYTES - 1`) -- confirm against the `Vector` trait.
+ let mut cur = end.sub(end.as_usize() & V::ALIGN);
+ debug_assert!(start <= cur && cur <= end);
+ if len >= Self::LOOP_SIZE {
+ // Main unrolled loop: four aligned vectors per iteration, for as
+ // long as a full `LOOP_SIZE` bytes remain after `start`.
+ while cur >= start.add(Self::LOOP_SIZE) {
+ debug_assert_eq!(0, cur.as_usize() % V::BYTES);
+
+ // Step back first, then load the `LOOP_SIZE` bytes at `cur`.
+ cur = cur.sub(Self::LOOP_SIZE);
+ let a = V::load_aligned(cur);
+ let b = V::load_aligned(cur.add(1 * V::BYTES));
+ let c = V::load_aligned(cur.add(2 * V::BYTES));
+ let d = V::load_aligned(cur.add(3 * V::BYTES));
+ let eqa = self.v1.cmpeq(a);
+ let eqb = self.v1.cmpeq(b);
+ let eqc = self.v1.cmpeq(c);
+ let eqd = self.v1.cmpeq(d);
+ // Combine all four comparison results so that a single test
+ // tells us whether any of the `LOOP_SIZE` bytes matched.
+ let or1 = eqa.or(eqb);
+ let or2 = eqc.or(eqd);
+ let or3 = or1.or(or2);
+ if or3.movemask_will_have_non_zero() {
+ // There is a match somewhere in these four vectors. Check
+ // them in decreasing address order so that the last match
+ // is the one reported.
+ let mask = eqd.movemask();
+ if mask.has_non_zero() {
+ return Some(cur.add(3 * V::BYTES).add(topos(mask)));
+ }
+
+ let mask = eqc.movemask();
+ if mask.has_non_zero() {
+ return Some(cur.add(2 * V::BYTES).add(topos(mask)));
+ }
+
+ let mask = eqb.movemask();
+ if mask.has_non_zero() {
+ return Some(cur.add(1 * V::BYTES).add(topos(mask)));
+ }
+
+ // `d`, `c` and `b` had no match, so it must be in `a`.
+ let mask = eqa.movemask();
+ debug_assert!(mask.has_non_zero());
+ return Some(cur.add(topos(mask)));
+ }
+ }
+ }
+ // Handle any leftovers before the region covered by the loop above,
+ // one vector at a time.
+ while cur >= start.add(V::BYTES) {
+ debug_assert!(cur.distance(start) >= V::BYTES);
+ cur = cur.sub(V::BYTES);
+ if let Some(cur) = self.search_chunk(cur, topos) {
+ return Some(cur);
+ }
+ }
+ // Finally handle any remaining bytes less than the size of V at the
+ // front of the haystack. The load at `start` may be unaligned and may
+ // overlap with bytes searched above, but that's okay since we know
+ // those bytes did not contain a match (otherwise we wouldn't be here).
+ if cur > start {
+ debug_assert!(cur.distance(start) < V::BYTES);
+ return self.search_chunk(start, topos);
+ }
+ None
+ }
+
+ /// Return a count of all matching bytes in the given haystack.
+ ///
+ /// The haystack is the sequence of bytes in the half-open range
+ /// `[start, end)`.
+ ///
+ /// Unlike `find_raw` and `rfind_raw`, this routine never uses overlapping
+ /// vector loads: the bytes before the first aligned address and the bytes
+ /// after the last full vector are counted one at a time instead, so no
+ /// byte is counted twice.
+ ///
+ /// # Safety
+ ///
+ /// * It must be the case that `start < end` and that the distance between
+ /// them is at least equal to `V::BYTES`. That is, it must always be valid
+ /// to do at least an unaligned load of `V` at `start`.
+ /// * Both `start` and `end` must be valid for reads.
+ /// * Both `start` and `end` must point to an initialized value.
+ /// * Both `start` and `end` must point to the same allocated object and
+ /// must either be in bounds or at most one byte past the end of the
+ /// allocated object.
+ /// * Both `start` and `end` must be _derived from_ a pointer to the same
+ /// object.
+ /// * The distance between `start` and `end` must not overflow `isize`.
+ /// * The distance being in bounds must not rely on "wrapping around" the
+ /// address space.
+ #[inline(always)]
+ pub(crate) unsafe fn count_raw(
+ &self,
+ start: *const u8,
+ end: *const u8,
+ ) -> usize {
+ debug_assert!(V::BYTES <= 32, "vector cannot be bigger than 32 bytes");
+
+ // Scalar predicate used for the byte-at-a-time portions below.
+ let confirm = |b| b == self.needle1();
+ let len = end.distance(start);
+ debug_assert!(
+ len >= V::BYTES,
+ "haystack has length {}, but must be at least {}",
+ len,
+ V::BYTES
+ );
+
+ // Set `cur` to the first V-aligned pointer greater than `start`.
+ //
+ // NOTE(review): this treats `V::ALIGN` as a bit mask (presumably
+ // `V::BYTES - 1`) -- confirm against the `Vector` trait.
+ let mut cur = start.add(V::BYTES - (start.as_usize() & V::ALIGN));
+ // Count any matching bytes before we start our aligned loop.
+ // (`count_byte_by_byte` is defined outside of this impl; presumably it
+ // counts the bytes in `[start, cur)` for which `confirm` is true.)
+ let mut count = count_byte_by_byte(start, cur, confirm);
+ debug_assert!(cur > start && end.sub(V::BYTES) >= start);
+ if len >= Self::LOOP_SIZE {
+ // Main unrolled loop: four aligned vectors per iteration, for as
+ // long as a full `LOOP_SIZE` bytes remain before `end`.
+ while cur <= end.sub(Self::LOOP_SIZE) {
+ debug_assert_eq!(0, cur.as_usize() % V::BYTES);
+
+ let a = V::load_aligned(cur);
+ let b = V::load_aligned(cur.add(1 * V::BYTES));
+ let c = V::load_aligned(cur.add(2 * V::BYTES));
+ let d = V::load_aligned(cur.add(3 * V::BYTES));
+ let eqa = self.v1.cmpeq(a);
+ let eqb = self.v1.cmpeq(b);
+ let eqc = self.v1.cmpeq(c);
+ let eqd = self.v1.cmpeq(d);
+ // Add up the matches reported by each vector's mask.
+ count += eqa.movemask().count_ones();
+ count += eqb.movemask().count_ones();
+ count += eqc.movemask().count_ones();
+ count += eqd.movemask().count_ones();
+ cur = cur.add(Self::LOOP_SIZE);
+ }
+ }
+ // Handle any leftovers after the aligned loop above. We use unaligned
+ // loads here, but I believe we are guaranteed that they are aligned
+ // since `cur` is aligned.
+ while cur <= end.sub(V::BYTES) {
+ debug_assert!(end.distance(cur) >= V::BYTES);
+ let chunk = V::load_unaligned(cur);
+ count += self.v1.cmpeq(chunk).movemask().count_ones();
+ cur = cur.add(V::BYTES);
+ }
+ // And finally count any leftovers that weren't caught above.
+ count += count_byte_by_byte(cur, end, confirm);
+ count
+ }
+
+ /// Search `V::BYTES` starting at `cur` via an unaligned load.
+ ///
+ /// `mask_to_offset` should be a function that converts a `movemask` to
+ /// an offset such that `cur.add(offset)` corresponds to a pointer to the
+ /// match location if one is found. The callers in this impl pass either
+ /// `V::Mask::first_offset` or `V::Mask::last_offset`, depending on whether
+ /// they implement a forward or reverse search, respectively.
+ ///
+ /// Returns `None` when no byte in the chunk is equal to the needle.
+ ///
+ /// # Safety
+ ///
+ /// `cur` must be a valid pointer and it must be valid to do an unaligned
+ /// load of size `V::BYTES` at `cur`.
+ #[inline(always)]
+ unsafe fn search_chunk(
+ &self,
+ cur: *const u8,
+ mask_to_offset: impl Fn(V::Mask) -> usize,
+ ) -> Option<*const u8> {
+ let chunk = V::load_unaligned(cur);
+ let mask = self.v1.cmpeq(chunk).movemask();
+ if mask.has_non_zero() {
+ Some(cur.add(mask_to_offset(mask)))
+ } else {
+ None
+ }
+ }
+}
+
+/// Finds all occurrences of two bytes in a haystack.
+///
+/// That is, this reports matches of one of two possible bytes. For example,
+/// searching for `a` or `b` in `afoobar` would report matches at offsets `0`,
+/// `4` and `5`.
+///
+/// `V` is the vector type used to perform the search. The search routines on
+/// this type are only available when `V: Vector`.
+#[derive(Clone, Copy, Debug)]
+pub(crate) struct Two<V> {
+ /// The first needle byte, exactly as it was given to `Two::new`.
+ s1: u8,
+ /// The second needle byte, exactly as it was given to `Two::new`.
+ s2: u8,
+ /// The result of `V::splat` on the first needle byte.
+ v1: V,
+ /// The result of `V::splat` on the second needle byte.
+ v2: V,
+}
+
+impl<V: Vector> Two<V> {
+ /// The number of bytes we examine per each iteration of our search loop.
+ ///
+ /// This is two vectors' worth of bytes, matching the two aligned loads
+ /// done per iteration of the unrolled loops in `find_raw` and `rfind_raw`.
+ const LOOP_SIZE: usize = 2 * V::BYTES;
+
+ /// Create a new searcher that finds occurrences of either of the two
+ /// bytes given.
+ ///
+ /// Each needle is stored both as a scalar and as a vector built with
+ /// `V::splat`.
+ ///
+ /// # Safety
+ ///
+ /// This routine is `unsafe` because, as explained in the comment at the
+ /// top of this file, everything here is meant to be inlined into callers
+ /// carrying a `#[target_feature(enable = "...")]` annotation.
+ /// NOTE(review): presumably the caller must guarantee that the vector type
+ /// `V` is usable on the current CPU -- confirm against the `Vector` trait.
+ #[inline(always)]
+ pub(crate) unsafe fn new(needle1: u8, needle2: u8) -> Two<V> {
+ Two {
+ s1: needle1,
+ s2: needle2,
+ v1: V::splat(needle1),
+ v2: V::splat(needle2),
+ }
+ }
+
+ /// Returns the first needle given to `Two::new`.
+ ///
+ /// This is the scalar form of the needle, not its vector form.
+ #[inline(always)]
+ pub(crate) fn needle1(&self) -> u8 {
+ self.s1
+ }
+
+ /// Returns the second needle given to `Two::new`.
+ ///
+ /// This is the scalar form of the needle, not its vector form.
+ #[inline(always)]
+ pub(crate) fn needle2(&self) -> u8 {
+ self.s2
+ }
+
+ /// Return a pointer to the first occurrence of one of the needles in the
+ /// given haystack. If no such occurrence exists, then `None` is returned.
+ ///
+ /// The haystack is the sequence of bytes in the half-open range
+ /// `[start, end)`.
+ ///
+ /// When a match is found, the pointer returned is guaranteed to be
+ /// `>= start` and `< end`.
+ ///
+ /// # Safety
+ ///
+ /// * It must be the case that `start < end` and that the distance between
+ /// them is at least equal to `V::BYTES`. That is, it must always be valid
+ /// to do at least an unaligned load of `V` at `start`.
+ /// * Both `start` and `end` must be valid for reads.
+ /// * Both `start` and `end` must point to an initialized value.
+ /// * Both `start` and `end` must point to the same allocated object and
+ /// must either be in bounds or at most one byte past the end of the
+ /// allocated object.
+ /// * Both `start` and `end` must be _derived from_ a pointer to the same
+ /// object.
+ /// * The distance between `start` and `end` must not overflow `isize`.
+ /// * The distance being in bounds must not rely on "wrapping around" the
+ /// address space.
+ #[inline(always)]
+ pub(crate) unsafe fn find_raw(
+ &self,
+ start: *const u8,
+ end: *const u8,
+ ) -> Option<*const u8> {
+ // If we want to support vectors bigger than 256 bits, we probably
+ // need to move up to using a u64 for the masks used below. Currently
+ // they are 32 bits, which means we're SOL for vectors that need masks
+ // bigger than 32 bits. Overall unclear until there's a use case.
+ debug_assert!(V::BYTES <= 32, "vector cannot be bigger than 32 bytes");
+
+ // `topos` converts a match mask into an offset within a chunk. This is
+ // a forward search, so we use `first_offset`.
+ let topos = V::Mask::first_offset;
+ let len = end.distance(start);
+ debug_assert!(
+ len >= V::BYTES,
+ "haystack has length {}, but must be at least {}",
+ len,
+ V::BYTES
+ );
+
+ // Search a possibly unaligned chunk at `start`. This covers any part
+ // of the haystack prior to where aligned loads can start.
+ if let Some(cur) = self.search_chunk(start, topos) {
+ return Some(cur);
+ }
+ // Set `cur` to the first V-aligned pointer greater than `start`.
+ //
+ // NOTE(review): this treats `V::ALIGN` as a bit mask (presumably
+ // `V::BYTES - 1`) -- confirm against the `Vector` trait. When `start`
+ // is already aligned, `cur` ends up at `start + V::BYTES`, which is
+ // fine because those first `V::BYTES` bytes were searched just above.
+ let mut cur = start.add(V::BYTES - (start.as_usize() & V::ALIGN));
+ debug_assert!(cur > start && end.sub(V::BYTES) >= start);
+ if len >= Self::LOOP_SIZE {
+ // Main unrolled loop: two aligned vectors per iteration, for as
+ // long as a full `LOOP_SIZE` bytes remain before `end`.
+ while cur <= end.sub(Self::LOOP_SIZE) {
+ debug_assert_eq!(0, cur.as_usize() % V::BYTES);
+
+ let a = V::load_aligned(cur);
+ let b = V::load_aligned(cur.add(V::BYTES));
+ // Compare both vectors against both needles.
+ let eqa1 = self.v1.cmpeq(a);
+ let eqb1 = self.v1.cmpeq(b);
+ let eqa2 = self.v2.cmpeq(a);
+ let eqb2 = self.v2.cmpeq(b);
+ // Combine all comparison results so that a single test tells
+ // us whether any of the `LOOP_SIZE` bytes matched a needle.
+ let or1 = eqa1.or(eqb1);
+ let or2 = eqa2.or(eqb2);
+ let or3 = or1.or(or2);
+ if or3.movemask_will_have_non_zero() {
+ // Check `a` before `b` so that the first match is the one
+ // reported. Within a vector, a match on either needle
+ // counts, hence the OR of the two masks.
+ let mask = eqa1.movemask().or(eqa2.movemask());
+ if mask.has_non_zero() {
+ return Some(cur.add(topos(mask)));
+ }
+
+ // `a` had no match, so it must be in `b`.
+ let mask = eqb1.movemask().or(eqb2.movemask());
+ debug_assert!(mask.has_non_zero());
+ return Some(cur.add(V::BYTES).add(topos(mask)));
+ }
+ cur = cur.add(Self::LOOP_SIZE);
+ }
+ }
+ // Handle any leftovers after the aligned loop above. We use unaligned
+ // loads here, but I believe we are guaranteed that they are aligned
+ // since `cur` is aligned.
+ while cur <= end.sub(V::BYTES) {
+ debug_assert!(end.distance(cur) >= V::BYTES);
+ if let Some(cur) = self.search_chunk(cur, topos) {
+ return Some(cur);
+ }
+ cur = cur.add(V::BYTES);
+ }
+ // Finally handle any remaining bytes less than the size of V. In this
+ // case, our pointer may indeed be unaligned and the load may overlap
+ // with the previous one. But that's okay since we know the previous
+ // load didn't lead to a match (otherwise we wouldn't be here).
+ if cur < end {
+ debug_assert!(end.distance(cur) < V::BYTES);
+ // Back `cur` up so that exactly `V::BYTES` bytes remain until `end`.
+ cur = cur.sub(V::BYTES - end.distance(cur));
+ debug_assert_eq!(end.distance(cur), V::BYTES);
+ return self.search_chunk(cur, topos);
+ }
+ None
+ }
+
+ /// Return a pointer to the last occurrence of one of the needles in the
+ /// given haystack. If no such occurrence exists, then `None` is returned.
+ ///
+ /// The haystack is the sequence of bytes in the half-open range
+ /// `[start, end)`. It is searched from `end` backwards towards `start`.
+ ///
+ /// When a match is found, the pointer returned is guaranteed to be
+ /// `>= start` and `< end`.
+ ///
+ /// # Safety
+ ///
+ /// * It must be the case that `start < end` and that the distance between
+ /// them is at least equal to `V::BYTES`. That is, it must always be valid
+ /// to do at least an unaligned load of `V` at `start`.
+ /// * Both `start` and `end` must be valid for reads.
+ /// * Both `start` and `end` must point to an initialized value.
+ /// * Both `start` and `end` must point to the same allocated object and
+ /// must either be in bounds or at most one byte past the end of the
+ /// allocated object.
+ /// * Both `start` and `end` must be _derived from_ a pointer to the same
+ /// object.
+ /// * The distance between `start` and `end` must not overflow `isize`.
+ /// * The distance being in bounds must not rely on "wrapping around" the
+ /// address space.
+ #[inline(always)]
+ pub(crate) unsafe fn rfind_raw(
+ &self,
+ start: *const u8,
+ end: *const u8,
+ ) -> Option<*const u8> {
+ // If we want to support vectors bigger than 256 bits, we probably
+ // need to move up to using a u64 for the masks used below. Currently
+ // they are 32 bits, which means we're SOL for vectors that need masks
+ // bigger than 32 bits. Overall unclear until there's a use case.
+ debug_assert!(V::BYTES <= 32, "vector cannot be bigger than 32 bytes");
+
+ // `topos` converts a match mask into an offset within a chunk. This is
+ // a reverse search, so we use `last_offset`.
+ let topos = V::Mask::last_offset;
+ let len = end.distance(start);
+ debug_assert!(
+ len >= V::BYTES,
+ "haystack has length {}, but must be at least {}",
+ len,
+ V::BYTES
+ );
+
+ // Search the possibly unaligned chunk made up of the last `V::BYTES`
+ // bytes of the haystack. This covers any part of the haystack after
+ // the point where aligned loads can be used.
+ if let Some(cur) = self.search_chunk(end.sub(V::BYTES), topos) {
+ return Some(cur);
+ }
+ // Round `end` down to a V-aligned address. From here on, `cur` is the
+ // exclusive upper bound of the region that still has to be searched.
+ //
+ // NOTE(review): this treats `V::ALIGN` as a bit mask (presumably
+ // `V::BYTES - 1`) -- confirm against the `Vector` trait.
+ let mut cur = end.sub(end.as_usize() & V::ALIGN);
+ debug_assert!(start <= cur && cur <= end);
+ if len >= Self::LOOP_SIZE {
+ // Main unrolled loop: two aligned vectors per iteration, for as
+ // long as a full `LOOP_SIZE` bytes remain after `start`.
+ while cur >= start.add(Self::LOOP_SIZE) {
+ debug_assert_eq!(0, cur.as_usize() % V::BYTES);
+
+ // Step back first, then load the `LOOP_SIZE` bytes at `cur`.
+ cur = cur.sub(Self::LOOP_SIZE);
+ let a = V::load_aligned(cur);
+ let b = V::load_aligned(cur.add(V::BYTES));
+ // Compare both vectors against both needles.
+ let eqa1 = self.v1.cmpeq(a);
+ let eqb1 = self.v1.cmpeq(b);
+ let eqa2 = self.v2.cmpeq(a);
+ let eqb2 = self.v2.cmpeq(b);
+ // Combine all comparison results so that a single test tells
+ // us whether any of the `LOOP_SIZE` bytes matched a needle.
+ let or1 = eqa1.or(eqb1);
+ let or2 = eqa2.or(eqb2);
+ let or3 = or1.or(or2);
+ if or3.movemask_will_have_non_zero() {
+ // Check `b` before `a` so that the last match is the one
+ // reported.
+ let mask = eqb1.movemask().or(eqb2.movemask());
+ if mask.has_non_zero() {
+ return Some(cur.add(V::BYTES).add(topos(mask)));
+ }
+
+ // `b` had no match, so it must be in `a`.
+ let mask = eqa1.movemask().or(eqa2.movemask());
+ debug_assert!(mask.has_non_zero());
+ return Some(cur.add(topos(mask)));
+ }
+ }
+ }
+ // Handle any leftovers before the region covered by the loop above,
+ // one vector at a time.
+ while cur >= start.add(V::BYTES) {
+ debug_assert!(cur.distance(start) >= V::BYTES);
+ cur = cur.sub(V::BYTES);
+ if let Some(cur) = self.search_chunk(cur, topos) {
+ return Some(cur);
+ }
+ }
+ // Finally handle any remaining bytes less than the size of V at the
+ // front of the haystack. The load at `start` may be unaligned and may
+ // overlap with bytes searched above, but that's okay since we know
+ // those bytes did not contain a match (otherwise we wouldn't be here).
+ if cur > start {
+ debug_assert!(cur.distance(start) < V::BYTES);
+ return self.search_chunk(start, topos);
+ }
+ None
+ }
+
+ /// Search `V::BYTES` starting at `cur` via an unaligned load.
+ ///
+ /// `mask_to_offset` should be a function that converts a `movemask` to
+ /// an offset such that `cur.add(offset)` corresponds to a pointer to the
+ /// match location if one is found. The callers in this impl pass either
+ /// `V::Mask::first_offset` or `V::Mask::last_offset`, depending on whether
+ /// they implement a forward or reverse search, respectively.
+ ///
+ /// Returns `None` when no byte in the chunk is equal to either needle.
+ ///
+ /// # Safety
+ ///
+ /// `cur` must be a valid pointer and it must be valid to do an unaligned
+ /// load of size `V::BYTES` at `cur`.
+ #[inline(always)]
+ unsafe fn search_chunk(
+ &self,
+ cur: *const u8,
+ mask_to_offset: impl Fn(V::Mask) -> usize,
+ ) -> Option<*const u8> {
+ let chunk = V::load_unaligned(cur);
+ let eq1 = self.v1.cmpeq(chunk);
+ let eq2 = self.v2.cmpeq(chunk);
+ // This combined mask is only used to test whether anything matched.
+ let mask = eq1.or(eq2).movemask();
+ if mask.has_non_zero() {
+ // The offset itself is computed from the OR of the two per-needle
+ // masks, so a match on either needle is considered.
+ let mask1 = eq1.movemask();
+ let mask2 = eq2.movemask();
+ Some(cur.add(mask_to_offset(mask1.or(mask2))))
+ } else {
+ None
+ }
+ }
+}
+
+/// Finds all occurrences of three bytes in a haystack.
+///
+/// That is, this reports matches of one of three possible bytes. For example,
+/// searching for `a`, `b` or `o` in `afoobar` would report matches at offsets
+/// `0`, `2`, `3`, `4` and `5`.
+///
+/// `V` is the vector type used to perform the search. The search routines on
+/// this type are only available when `V: Vector`.
+#[derive(Clone, Copy, Debug)]
+pub(crate) struct Three<V> {
+ /// The first needle byte, exactly as it was given to `Three::new`.
+ s1: u8,
+ /// The second needle byte, exactly as it was given to `Three::new`.
+ s2: u8,
+ /// The third needle byte, exactly as it was given to `Three::new`.
+ s3: u8,
+ /// The result of `V::splat` on the first needle byte.
+ v1: V,
+ /// The result of `V::splat` on the second needle byte.
+ v2: V,
+ /// The result of `V::splat` on the third needle byte.
+ v3: V,
+}
+
+impl<V: Vector> Three<V> {
+ /// The number of bytes we examine per each iteration of our search loop.
+ ///
+ /// This is two vectors' worth of bytes, matching the two aligned loads
+ /// done per iteration of the unrolled loop in `find_raw`.
+ const LOOP_SIZE: usize = 2 * V::BYTES;
+
+ /// Create a new searcher that finds occurrences of any of the three
+ /// bytes given.
+ ///
+ /// Each needle is stored both as a scalar and as a vector built with
+ /// `V::splat`.
+ ///
+ /// # Safety
+ ///
+ /// This routine is `unsafe` because, as explained in the comment at the
+ /// top of this file, everything here is meant to be inlined into callers
+ /// carrying a `#[target_feature(enable = "...")]` annotation.
+ /// NOTE(review): presumably the caller must guarantee that the vector type
+ /// `V` is usable on the current CPU -- confirm against the `Vector` trait.
+ #[inline(always)]
+ pub(crate) unsafe fn new(
+ needle1: u8,
+ needle2: u8,
+ needle3: u8,
+ ) -> Three<V> {
+ Three {
+ s1: needle1,
+ s2: needle2,
+ s3: needle3,
+ v1: V::splat(needle1),
+ v2: V::splat(needle2),
+ v3: V::splat(needle3),
+ }
+ }
+
+ /// Returns the first needle given to `Three::new`.
+ ///
+ /// This is the scalar form of the needle, not its vector form.
+ #[inline(always)]
+ pub(crate) fn needle1(&self) -> u8 {
+ self.s1
+ }
+
+ /// Returns the second needle given to `Three::new`.
+ ///
+ /// This is the scalar form of the needle, not its vector form.
+ #[inline(always)]
+ pub(crate) fn needle2(&self) -> u8 {
+ self.s2
+ }
+
+ /// Returns the third needle given to `Three::new`.
+ ///
+ /// This is the scalar form of the needle, not its vector form.
+ #[inline(always)]
+ pub(crate) fn needle3(&self) -> u8 {
+ self.s3
+ }
+
+ /// Return a pointer to the first occurrence of one of the needles in the
+ /// given haystack. If no such occurrence exists, then `None` is returned.
+ ///
+ /// When a match is found, the pointer returned is guaranteed to be
+ /// `>= start` and `< end`.
+ ///
+ /// # Safety
+ ///
+ /// * It must be the case that `start < end` and that the distance between
+ /// them is at least equal to `V::BYTES`. That is, it must always be valid
+ /// to do at least an unaligned load of `V` at `start`.
+ /// * Both `start` and `end` must be valid for reads.
+ /// * Both `start` and `end` must point to an initialized value.
+ /// * Both `start` and `end` must point to the same allocated object and
+ /// must either be in bounds or at most one byte past the end of the
+ /// allocated object.
+ /// * Both `start` and `end` must be _derived from_ a pointer to the same
+ /// object.
+ /// * The distance between `start` and `end` must not overflow `isize`.
+ /// * The distance being in bounds must not rely on "wrapping around" the
+ /// address space.
+ #[inline(always)]
+ pub(crate) unsafe fn find_raw(
+ &self,
+ start: *const u8,
+ end: *const u8,
+ ) -> Option<*const u8> {
+ // If we want to support vectors bigger than 256 bits, we probably
+ // need to move up to using a u64 for the masks used below. Currently
+ // they are 32 bits, which means we're SOL for vectors that need masks
+ // bigger than 32 bits. Overall unclear until there's a use case.
+ debug_assert!(V::BYTES <= 32, "vector cannot be bigger than 32 bytes");
+
+ let topos = V::Mask::first_offset; // mask -> offset converter used for every hit in this forward search
+ let len = end.distance(start);
+ debug_assert!(
+ len >= V::BYTES,
+ "haystack has length {}, but must be at least {}",
+ len,
+ V::BYTES
+ );
+
+ // Search a possibly unaligned chunk at `start`. This covers any part
+ // of the haystack prior to where aligned loads can start.
+ if let Some(cur) = self.search_chunk(start, topos) {
+ return Some(cur);
+ }
+ // Set `cur` to the first V-aligned pointer greater than `start`.
+ let mut cur = start.add(V::BYTES - (start.as_usize() & V::ALIGN)); // NOTE(review): uses `V::ALIGN` as a low-bits mask (presumably `V::BYTES - 1`) — confirm
+ debug_assert!(cur > start && end.sub(V::BYTES) >= start);
+ if len >= Self::LOOP_SIZE { // unrolled path: each iteration inspects `a` at `cur` and `b` at `cur + V::BYTES`
+ while cur <= end.sub(Self::LOOP_SIZE) {
+ debug_assert_eq!(0, cur.as_usize() % V::BYTES);
+
+ let a = V::load_aligned(cur);
+ let b = V::load_aligned(cur.add(V::BYTES));
+ let eqa1 = self.v1.cmpeq(a);
+ let eqb1 = self.v1.cmpeq(b);
+ let eqa2 = self.v2.cmpeq(a);
+ let eqb2 = self.v2.cmpeq(b);
+ let eqa3 = self.v3.cmpeq(a);
+ let eqb3 = self.v3.cmpeq(b);
+ let or1 = eqa1.or(eqb1);
+ let or2 = eqa2.or(eqb2);
+ let or3 = eqa3.or(eqb3);
+ let or4 = or1.or(or2);
+ let or5 = or3.or(or4);
+ if or5.movemask_will_have_non_zero() { // single combined test over all six comparisons before building per-vector masks
+ let mask = eqa1
+ .movemask()
+ .or(eqa2.movemask())
+ .or(eqa3.movemask());
+ if mask.has_non_zero() { // `a` is the earlier vector, so a hit there wins going forward
+ return Some(cur.add(topos(mask)));
+ }
+
+ let mask = eqb1
+ .movemask()
+ .or(eqb2.movemask())
+ .or(eqb3.movemask());
+ debug_assert!(mask.has_non_zero()); // `a` had no hit, so the hit must be in `b`
+ return Some(cur.add(V::BYTES).add(topos(mask)));
+ }
+ cur = cur.add(Self::LOOP_SIZE);
+ }
+ }
+ // Handle any leftovers after the aligned loop above. We use unaligned
+ // loads here, but I believe we are guaranteed that they are aligned
+ // since `cur` is aligned.
+ while cur <= end.sub(V::BYTES) {
+ debug_assert!(end.distance(cur) >= V::BYTES);
+ if let Some(cur) = self.search_chunk(cur, topos) {
+ return Some(cur);
+ }
+ cur = cur.add(V::BYTES);
+ }
+ // Finally handle any remaining bytes less than the size of V. In this
+ // case, our pointer may indeed be unaligned and the load may overlap
+ // with the previous one. But that's okay since we know the previous
+ // load didn't lead to a match (otherwise we wouldn't be here).
+ if cur < end {
+ debug_assert!(end.distance(cur) < V::BYTES);
+ cur = cur.sub(V::BYTES - end.distance(cur)); // back up so the final load ends exactly at `end`
+ debug_assert_eq!(end.distance(cur), V::BYTES);
+ return self.search_chunk(cur, topos);
+ }
+ None
+ }
+
+ /// Return a pointer to the last occurrence of one of the needles in the
+ /// given haystack. If no such occurrence exists, then `None` is returned.
+ ///
+ /// When a match is found, the pointer returned is guaranteed to be
+ /// `>= start` and `< end`.
+ ///
+ /// # Safety
+ ///
+ /// * It must be the case that `start < end` and that the distance between
+ /// them is at least equal to `V::BYTES`. That is, it must always be valid
+ /// to do at least an unaligned load of `V` at `start`.
+ /// * Both `start` and `end` must be valid for reads.
+ /// * Both `start` and `end` must point to an initialized value.
+ /// * Both `start` and `end` must point to the same allocated object and
+ /// must either be in bounds or at most one byte past the end of the
+ /// allocated object.
+ /// * Both `start` and `end` must be _derived from_ a pointer to the same
+ /// object.
+ /// * The distance between `start` and `end` must not overflow `isize`.
+ /// * The distance being in bounds must not rely on "wrapping around" the
+ /// address space.
+ #[inline(always)]
+ pub(crate) unsafe fn rfind_raw(
+ &self,
+ start: *const u8,
+ end: *const u8,
+ ) -> Option<*const u8> {
+ // If we want to support vectors bigger than 256 bits, we probably
+ // need to move up to using a u64 for the masks used below. Currently
+ // they are 32 bits, which means we're SOL for vectors that need masks
+ // bigger than 32 bits. Overall unclear until there's a use case.
+ debug_assert!(V::BYTES <= 32, "vector cannot be bigger than 32 bytes");
+
+ let topos = V::Mask::last_offset; // mask -> offset converter used for every hit in this reverse search
+ let len = end.distance(start);
+ debug_assert!(
+ len >= V::BYTES,
+ "haystack has length {}, but must be at least {}",
+ len,
+ V::BYTES
+ );
+
+ if let Some(cur) = self.search_chunk(end.sub(V::BYTES), topos) { // possibly unaligned chunk that ends exactly at `end`
+ return Some(cur);
+ }
+ let mut cur = end.sub(end.as_usize() & V::ALIGN); // NOTE(review): rounds `end` down using `V::ALIGN` as a low-bits mask — confirm
+ debug_assert!(start <= cur && cur <= end);
+ if len >= Self::LOOP_SIZE {
+ while cur >= start.add(Self::LOOP_SIZE) {
+ debug_assert_eq!(0, cur.as_usize() % V::BYTES);
+
+ cur = cur.sub(Self::LOOP_SIZE); // step back first, then load `a` (lower) and `b` (upper)
+ let a = V::load_aligned(cur);
+ let b = V::load_aligned(cur.add(V::BYTES));
+ let eqa1 = self.v1.cmpeq(a);
+ let eqb1 = self.v1.cmpeq(b);
+ let eqa2 = self.v2.cmpeq(a);
+ let eqb2 = self.v2.cmpeq(b);
+ let eqa3 = self.v3.cmpeq(a);
+ let eqb3 = self.v3.cmpeq(b);
+ let or1 = eqa1.or(eqb1);
+ let or2 = eqa2.or(eqb2);
+ let or3 = eqa3.or(eqb3);
+ let or4 = or1.or(or2);
+ let or5 = or3.or(or4);
+ if or5.movemask_will_have_non_zero() {
+ let mask = eqb1
+ .movemask()
+ .or(eqb2.movemask())
+ .or(eqb3.movemask());
+ if mask.has_non_zero() { // `b` is the later vector, so a hit there wins going backwards
+ return Some(cur.add(V::BYTES).add(topos(mask)));
+ }
+
+ let mask = eqa1
+ .movemask()
+ .or(eqa2.movemask())
+ .or(eqa3.movemask());
+ debug_assert!(mask.has_non_zero()); // `b` had no hit, so the hit must be in `a`
+ return Some(cur.add(topos(mask)));
+ }
+ }
+ }
+ while cur >= start.add(V::BYTES) {
+ debug_assert!(cur.distance(start) >= V::BYTES);
+ cur = cur.sub(V::BYTES);
+ if let Some(cur) = self.search_chunk(cur, topos) {
+ return Some(cur);
+ }
+ }
+ if cur > start {
+ debug_assert!(cur.distance(start) < V::BYTES);
+ return self.search_chunk(start, topos); // overlaps bytes already searched above, which are known to hold no match
+ }
+ None
+ }
+
+ /// Search `V::BYTES` starting at `cur` via an unaligned load.
+ ///
+ /// `mask_to_offset` should be a function that converts a `movemask` to
+ /// an offset such that `cur.add(offset)` corresponds to a pointer to the
+ /// match location if one is found. Generally it is expected to use either
+ /// `V::Mask::first_offset` or `V::Mask::last_offset`, depending on whether
+ /// one is implementing a forward or reverse search, respectively.
+ ///
+ /// # Safety
+ ///
+ /// `cur` must be a valid pointer and it must be valid to do an unaligned
+ /// load of size `V::BYTES` at `cur`.
+ #[inline(always)]
+ unsafe fn search_chunk(
+ &self,
+ cur: *const u8,
+ mask_to_offset: impl Fn(V::Mask) -> usize,
+ ) -> Option<*const u8> {
+ let chunk = V::load_unaligned(cur);
+ let eq1 = self.v1.cmpeq(chunk);
+ let eq2 = self.v2.cmpeq(chunk);
+ let eq3 = self.v3.cmpeq(chunk);
+ let mask = eq1.or(eq2).or(eq3).movemask(); // one combined mask decides hit or miss for the whole chunk
+ if mask.has_non_zero() {
+ let mask1 = eq1.movemask();
+ let mask2 = eq2.movemask();
+ let mask3 = eq3.movemask();
+ Some(cur.add(mask_to_offset(mask1.or(mask2).or(mask3)))) // offset is taken from the OR of the three per-needle masks
+ } else {
+ None
+ }
+ }
+}
+
+/// An iterator over all occurrences of a set of bytes in a haystack.
+///
+/// This iterator implements the routines necessary to provide a
+/// `DoubleEndedIterator` impl, which means it can also be used to find
+/// occurrences in reverse order.
+///
+/// The lifetime parameters are as follows:
+///
+/// * `'h` refers to the lifetime of the haystack being searched.
+///
+/// This type is intended to be used to implement all iterators for the
+/// `memchr` family of functions. It handles a tiny bit of marginally tricky
+/// raw pointer math, but otherwise expects the caller to provide `find_raw`
+/// and `rfind_raw` routines for each call of `next` and `next_back`,
+/// respectively.
+#[derive(Clone, Debug)]
+pub(crate) struct Iter<'h> {
+ /// The original starting point into the haystack. We use this to convert
+ /// pointers to offsets.
+ original_start: *const u8,
+ /// The current starting point into the haystack. That is, where the next
+ /// search will begin.
+ start: *const u8,
+ /// The current ending point into the haystack. That is, where the next
+ /// reverse search will begin.
+ end: *const u8,
+ /// A marker for tracking the lifetime of the original_start/start/end
+ /// pointers above, which all point into the haystack.
+ haystack: core::marker::PhantomData<&'h [u8]>,
+}
+
+// SAFETY: Iter contains no shared references to anything that performs any
+// interior mutations. Also, the lifetime guarantees that Iter will not outlive
+// the haystack.
+unsafe impl<'h> Send for Iter<'h> {}
+
+// SAFETY: Iter performs no interior mutations, therefore no explicit
+// synchronization is necessary. Also, the lifetime guarantees that Iter will
+// not outlive the haystack.
+unsafe impl<'h> Sync for Iter<'h> {}
+
+impl<'h> Iter<'h> {
+ /// Create a new generic memchr iterator.
+ #[inline(always)]
+ pub(crate) fn new(haystack: &'h [u8]) -> Iter<'h> {
+ Iter {
+ original_start: haystack.as_ptr(),
+ start: haystack.as_ptr(),
+ end: haystack.as_ptr().wrapping_add(haystack.len()), // one past the last byte of `haystack`
+ haystack: core::marker::PhantomData,
+ }
+ }
+
+ /// Returns the next occurrence in the forward direction.
+ ///
+ /// # Safety
+ ///
+ /// Callers must ensure that if a pointer is returned from the closure
+ /// provided, then it must be greater than or equal to the start pointer
+ /// and less than the end pointer.
+ #[inline(always)]
+ pub(crate) unsafe fn next(
+ &mut self,
+ mut find_raw: impl FnMut(*const u8, *const u8) -> Option<*const u8>,
+ ) -> Option<usize> {
+ // SAFETY: Pointers are derived directly from the same &[u8] haystack.
+ // We only ever modify start/end corresponding to a matching offset
+ // found between start and end. Thus all changes to start/end maintain
+ // our safety requirements.
+ //
+ // The only other assumption we rely on is that the pointer returned
+ // by `find_raw` satisfies `self.start <= found < self.end`, and that
+ // safety contract is forwarded to the caller.
+ let found = find_raw(self.start, self.end)?;
+ let result = found.distance(self.original_start); // offset relative to the original haystack start, not the current `start`
+ self.start = found.add(1); // the next forward search resumes just past this match
+ Some(result)
+ }
+
+ /// Returns the number of remaining elements in this iterator.
+ #[inline(always)]
+ pub(crate) fn count(
+ self,
+ mut count_raw: impl FnMut(*const u8, *const u8) -> usize,
+ ) -> usize {
+ // Nothing unsafe happens in this function itself: the current
+ // `start` and `end` pointers, both derived from the same &[u8]
+ // haystack, are simply handed to `count_raw`, and whatever it returns
+ // is passed straight back to the caller.
+ count_raw(self.start, self.end)
+ }
+
+ /// Returns the next occurrence in reverse.
+ ///
+ /// # Safety
+ ///
+ /// Callers must ensure that if a pointer is returned from the closure
+ /// provided, then it must be greater than or equal to the start pointer
+ /// and less than the end pointer.
+ #[inline(always)]
+ pub(crate) unsafe fn next_back(
+ &mut self,
+ mut rfind_raw: impl FnMut(*const u8, *const u8) -> Option<*const u8>,
+ ) -> Option<usize> {
+ // SAFETY: Pointers are derived directly from the same &[u8] haystack.
+ // We only ever modify start/end corresponding to a matching offset
+ // found between start and end. Thus all changes to start/end maintain
+ // our safety requirements.
+ //
+ // The only other assumption we rely on is that the pointer returned
+ // by `rfind_raw` satisfies `self.start <= found < self.end`, and that
+ // safety contract is forwarded to the caller.
+ let found = rfind_raw(self.start, self.end)?;
+ let result = found.distance(self.original_start); // offset relative to the original haystack start
+ self.end = found; // `end` is passed as the end pointer of later searches, so this match lies outside them
+ Some(result)
+ }
+
+ /// Provides an implementation of `Iterator::size_hint`.
+ #[inline(always)]
+ pub(crate) fn size_hint(&self) -> (usize, Option<usize>) {
+ (0, Some(self.end.as_usize().saturating_sub(self.start.as_usize()))) // upper bound: every byte still between `start` and `end` could match
+ }
+}
+
+/// Search a slice using a function that operates on raw pointers.
+///
+/// Given a function to search a contiguous sequence of memory for the location
+/// of a non-empty set of bytes, this will execute that search on a slice of
+/// bytes. The pointer returned by the given function will be converted to an
+/// offset relative to the starting point of the given slice. That is, if a
+/// match is found, the offset returned by this routine is guaranteed to be a
+/// valid index into `haystack`.
+///
+/// Callers may use this for a forward or reverse search.
+///
+/// # Safety
+///
+/// Callers must ensure that if a pointer is returned by `find_raw`, then the
+/// pointer must be greater than or equal to the starting pointer and less than
+/// the end pointer.
+#[inline(always)]
+pub(crate) unsafe fn search_slice_with_raw(
+ haystack: &[u8],
+ mut find_raw: impl FnMut(*const u8, *const u8) -> Option<*const u8>,
+) -> Option<usize> {
+ // SAFETY: We rely on `find_raw` to return a correct and valid pointer, but
+ // otherwise, `start` and `end` are valid due to the guarantees provided by
+ // a &[u8].
+ let start = haystack.as_ptr();
+ let end = start.add(haystack.len()); // one past the last byte of `haystack`
+ let found = find_raw(start, end)?;
+ Some(found.distance(start)) // convert the matched pointer back into an index into `haystack`
+}
+
+/// Performs a forward byte-at-a-time loop over `start..end` until either the
+/// end is reached or `confirm(*ptr)` returns `true`. If the former occurs, then
+/// `None` is returned. If the latter occurs, then the pointer at which
+/// `confirm` returns `true` is returned.
+///
+/// # Safety
+///
+/// Callers must provide valid pointers that satisfy `start <= end`, and every
+/// byte in `start..end` must be readable since each one may be dereferenced.
+#[inline(always)]
+pub(crate) unsafe fn fwd_byte_by_byte<F: Fn(u8) -> bool>(
+ start: *const u8,
+ end: *const u8,
+ confirm: F,
+) -> Option<*const u8> {
+ debug_assert!(start <= end);
+ let mut ptr = start;
+ while ptr < end {
+ if confirm(*ptr) {
+ return Some(ptr);
+ }
+ ptr = ptr.offset(1);
+ }
+ None
+}
+
+/// Performs a reverse byte-at-a-time loop from `end` down to `start` until
+/// either `start` is reached or `confirm(*ptr)` returns `true`. If the former
+/// occurs, then `None` is returned. If the latter occurs, then the pointer at
+/// which `confirm` returns `true` is returned.
+///
+/// # Safety
+///
+/// Callers must provide valid pointers that satisfy `start <= end`, and every
+/// byte in `start..end` must be readable since each one may be dereferenced.
+#[inline(always)]
+pub(crate) unsafe fn rev_byte_by_byte<F: Fn(u8) -> bool>(
+ start: *const u8,
+ end: *const u8,
+ confirm: F,
+) -> Option<*const u8> {
+ debug_assert!(start <= end);
+
+ let mut ptr = end;
+ while ptr > start {
+ ptr = ptr.offset(-1); // step back before reading: `end` itself is never dereferenced
+ if confirm(*ptr) {
+ return Some(ptr);
+ }
+ }
+ None
+}
+
+/// Performs a forward byte-at-a-time loop over every byte in `start..end` and
+/// returns the number of times `confirm(*ptr)` returns `true`.
+///
+/// # Safety
+///
+/// Callers must provide valid pointers that satisfy `start <= end`, and every
+/// byte in `start..end` must be readable since each one is dereferenced.
+#[inline(always)]
+pub(crate) unsafe fn count_byte_by_byte<F: Fn(u8) -> bool>(
+ start: *const u8,
+ end: *const u8,
+ confirm: F,
+) -> usize {
+ debug_assert!(start <= end);
+ let mut ptr = start;
+ let mut count = 0;
+ while ptr < end {
+ if confirm(*ptr) {
+ count += 1;
+ }
+ ptr = ptr.offset(1);
+ }
+ count
+}
+
/*!
+This module defines "generic" routines that can be specialized to specific
+architectures.
+
+We don't expose this module primarily because it would require exposing all
+of the internal infrastructure required to write these generic routines.
+That infrastructure should be treated as an implementation detail so that
+it is allowed to evolve. Instead, what we expose are architecture specific
+instantiations of these generic implementations. The generic code just lets us
+write the code once (usually).
+*/
+
+pub(crate) mod memchr;
+pub(crate) mod packedpair;
+
1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 +95 +96 +97 +98 +99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 +247 +248 +249 +250 +251 +252 +253 +254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 +265 +266 +267 +268 +269 +270 +271 +272 +273 +274 +275 +276 +277 +278 +279 +280 +281 +282 +283 +284 +285 +286 +287 +288 +289 +290 +291 +292 +293 +294 +295 +296 +297 +298 +299 +300 +301 +302 +303 +304 +305 +306 +307 +308 +309 +310 +311 +312 +313 +314 +315 +316 +317 +
/*!
+Generic crate-internal routines for the "packed pair" SIMD algorithm.
+
+The "packed pair" algorithm is based on the [generic SIMD] algorithm. The main
+difference is that it (by default) uses a background distribution of byte
+frequencies to heuristically select the pair of bytes to search for.
+
+[generic SIMD]: http://0x80.pl/articles/simd-strfind.html#first-and-last
+*/
+
+use crate::{
+ arch::all::{is_equal_raw, packedpair::Pair},
+ ext::Pointer,
+ vector::{MoveMask, Vector},
+};
+
+/// A generic architecture dependent "packed pair" finder.
+///
+/// This finder picks two bytes that it believes have high predictive power
+/// for indicating an overall match of a needle. Depending on whether
+/// `Finder::find` or `Finder::find_prefilter` is used, it reports offsets
+/// where the needle matches or could match. In the prefilter case, candidates
+/// are reported whenever the [`Pair`] of bytes given matches.
+///
+/// This is architecture dependent because it uses specific vector operations
+/// to look for occurrences of the pair of bytes.
+///
+/// This type is not meant to be exported and is instead meant to be used as
+/// the implementation for architecture specific facades. Why? Because it's a
+/// bit of a quirky API that requires `inline(always)` annotations. And pretty
+/// much everything has safety obligations due (at least) to the caller needing
+/// to inline calls into routines marked with
+/// `#[target_feature(enable = "...")]`.
+#[derive(Clone, Copy, Debug)]
+pub(crate) struct Finder<V> {
+ pair: Pair, // the two needle offsets (`index1`, `index2`) whose bytes are compared
+ v1: V, // splat of the needle byte at `pair.index1()`
+ v2: V, // splat of the needle byte at `pair.index2()`
+ min_haystack_len: usize, // max(needle.len(), max(index1, index2) + V::BYTES), computed in `new`
+}
+
+impl<V: Vector> Finder<V> {
+ /// Create a new pair searcher. The searcher returned can either report
+ /// exact matches of `needle` or act as a prefilter and report candidate
+ /// positions of `needle`.
+ ///
+ /// # Safety
+ ///
+ /// Callers must ensure that whatever vector type this routine is called
+ /// with is supported by the current environment.
+ ///
+ /// Callers must also ensure that `needle.len() >= 2`.
+ #[inline(always)]
+ pub(crate) unsafe fn new(needle: &[u8], pair: Pair) -> Finder<V> {
+ let max_index = pair.index1().max(pair.index2());
+ let min_haystack_len =
+ core::cmp::max(needle.len(), usize::from(max_index) + V::BYTES);
+ let v1 = V::splat(needle[usize::from(pair.index1())]); // slice indexing: panics if the pair index is out of bounds for `needle`
+ let v2 = V::splat(needle[usize::from(pair.index2())]);
+ Finder { pair, v1, v2, min_haystack_len }
+ }
+
+ /// Searches the given haystack for the given needle. The needle given
+ /// should be the same as the needle that this finder was initialized
+ /// with.
+ ///
+ /// # Panics
+ ///
+ /// When `haystack.len()` is less than [`Finder::min_haystack_len`].
+ ///
+ /// # Safety
+ ///
+ /// Since this is meant to be used with vector functions, callers need to
+ /// specialize this inside of a function with a `target_feature` attribute.
+ /// Therefore, callers must ensure that whatever target feature is being
+ /// used supports the vector functions that this function is specialized
+ /// for. (For the specific vector functions used, see the Vector trait
+ /// implementations.)
+ #[inline(always)]
+ pub(crate) unsafe fn find(
+ &self,
+ haystack: &[u8],
+ needle: &[u8],
+ ) -> Option<usize> {
+ assert!(
+ haystack.len() >= self.min_haystack_len,
+ "haystack too small, should be at least {} but got {}",
+ self.min_haystack_len,
+ haystack.len(),
+ );
+
+ let all = V::Mask::all_zeros_except_least_significant(0); // N = 0: nothing is masked out, every chunk position is a candidate
+ let start = haystack.as_ptr();
+ let end = start.add(haystack.len());
+ let max = end.sub(self.min_haystack_len); // last `cur` that still leaves `min_haystack_len` bytes before `end`
+ let mut cur = start;
+
+ // N.B. I did experiment with unrolling the loop to deal with size(V)
+ // bytes at a time and 2*size(V) bytes at a time. The double unroll
+ // was marginally faster while the quadruple unroll was unambiguously
+ // slower. In the end, I decided the complexity from unrolling wasn't
+ // worth it. I used the memmem/krate/prebuilt/huge-en/ benchmarks to
+ // compare.
+ while cur <= max {
+ if let Some(chunki) = self.find_in_chunk(needle, cur, end, all) {
+ return Some(matched(start, cur, chunki));
+ }
+ cur = cur.add(V::BYTES);
+ }
+ if cur < end {
+ let remaining = end.distance(cur);
+ debug_assert!(
+ remaining < self.min_haystack_len,
+ "remaining bytes should be smaller than the minimum haystack \
+ length of {}, but there are {} bytes remaining",
+ self.min_haystack_len,
+ remaining,
+ );
+ if remaining < needle.len() { // too few unsearched bytes are left for the needle to fit
+ return None;
+ }
+ debug_assert!(
+ max < cur,
+ "after main loop, cur should have exceeded max",
+ );
+ let overlap = cur.distance(max);
+ debug_assert!(
+ overlap > 0,
+ "overlap ({}) must always be non-zero",
+ overlap,
+ );
+ debug_assert!(
+ overlap < V::BYTES,
+ "overlap ({}) cannot possibly be >= than a vector ({})",
+ overlap,
+ V::BYTES,
+ );
+ // The mask has all of its bits set except for the first N least
+ // significant bits, where N=overlap. This way, any matches that
+ // occur in find_in_chunk within the overlap are automatically
+ // ignored.
+ let mask = V::Mask::all_zeros_except_least_significant(overlap);
+ cur = max; // rewind to the last position where a full search still fits
+ let m = self.find_in_chunk(needle, cur, end, mask);
+ if let Some(chunki) = m {
+ return Some(matched(start, cur, chunki));
+ }
+ }
+ None
+ }
+
+ /// Searches the given haystack for offsets that represent candidate
+ /// matches of the `needle` given to this finder's constructor. The offsets
+ /// returned, if they are a match, correspond to the starting offset of
+ /// `needle` in the given `haystack`.
+ ///
+ /// # Panics
+ ///
+ /// When `haystack.len()` is less than [`Finder::min_haystack_len`].
+ ///
+ /// # Safety
+ ///
+ /// Since this is meant to be used with vector functions, callers need to
+ /// specialize this inside of a function with a `target_feature` attribute.
+ /// Therefore, callers must ensure that whatever target feature is being
+ /// used supports the vector functions that this function is specialized
+ /// for. (For the specific vector functions used, see the Vector trait
+ /// implementations.)
+ #[inline(always)]
+ pub(crate) unsafe fn find_prefilter(
+ &self,
+ haystack: &[u8],
+ ) -> Option<usize> {
+ assert!(
+ haystack.len() >= self.min_haystack_len,
+ "haystack too small, should be at least {} but got {}",
+ self.min_haystack_len,
+ haystack.len(),
+ );
+
+ let start = haystack.as_ptr();
+ let end = start.add(haystack.len());
+ let max = end.sub(self.min_haystack_len); // last `cur` that still leaves `min_haystack_len` bytes before `end`
+ let mut cur = start;
+
+ // N.B. I did experiment with unrolling the loop to deal with size(V)
+ // bytes at a time and 2*size(V) bytes at a time. The double unroll
+ // was marginally faster while the quadruple unroll was unambiguously
+ // slower. In the end, I decided the complexity from unrolling wasn't
+ // worth it. I used the memmem/krate/prebuilt/huge-en/ benchmarks to
+ // compare.
+ while cur <= max {
+ if let Some(chunki) = self.find_prefilter_in_chunk(cur) {
+ return Some(matched(start, cur, chunki));
+ }
+ cur = cur.add(V::BYTES);
+ }
+ if cur < end {
+ // This routine immediately quits if a candidate match is found.
+ // That means that if we're here, no candidate matches have been
+ // found at or before `cur`. Thus, we don't need to mask anything
+ // out even though we might technically search part of the haystack
+ // that we've already searched (because we know it can't match).
+ cur = max;
+ if let Some(chunki) = self.find_prefilter_in_chunk(cur) {
+ return Some(matched(start, cur, chunki));
+ }
+ }
+ None
+ }
+
+ /// Search for an occurrence of our byte pair from the needle in the chunk
+ /// pointed to by cur, with the end of the haystack pointed to by end.
+ /// When an occurrence is found, memcmp is run to check if a match occurs
+ /// at the corresponding position.
+ ///
+ /// `mask` should have bits set corresponding to the positions in the chunk
+ /// in which matches are considered. This is only used for the last vector
+ /// load where the beginning of the vector might have overlapped with the
+ /// last load in the main loop. The mask lets us avoid visiting positions
+ /// that have already been discarded as matches.
+ ///
+ /// # Safety
+ ///
+ /// It must be safe to do an unaligned read of size(V) bytes starting at
+ /// both (cur + self.pair.index1()) and (cur + self.pair.index2()). It must
+ /// also be safe to do unaligned loads on cur up to (end - needle.len()).
+ #[inline(always)]
+ unsafe fn find_in_chunk(
+ &self,
+ needle: &[u8],
+ cur: *const u8,
+ end: *const u8,
+ mask: V::Mask,
+ ) -> Option<usize> {
+ let index1 = usize::from(self.pair.index1());
+ let index2 = usize::from(self.pair.index2());
+ let chunk1 = V::load_unaligned(cur.add(index1));
+ let chunk2 = V::load_unaligned(cur.add(index2));
+ let eq1 = chunk1.cmpeq(self.v1);
+ let eq2 = chunk2.cmpeq(self.v2);
+
+ let mut offsets = eq1.and(eq2).movemask().and(mask); // candidates: both pair bytes match and the position is allowed by `mask`
+ while offsets.has_non_zero() {
+ let offset = offsets.first_offset();
+ let cur = cur.add(offset);
+ if end.sub(needle.len()) < cur { // the needle no longer fits before `end` at this candidate
+ return None;
+ }
+ if is_equal_raw(needle.as_ptr(), cur, needle.len()) {
+ return Some(offset);
+ }
+ offsets = offsets.clear_least_significant_bit();
+ }
+ None
+ }
+
+ /// Search for an occurrence of our byte pair from the needle in the chunk
+ /// pointed to by cur. When an occurrence is found, its chunk-relative
+ /// offset is returned as a candidate without any further confirmation:
+ /// unlike `find_in_chunk`, no comparison against the full needle is done.
+ ///
+ /// # Safety
+ ///
+ /// It must be safe to do an unaligned read of size(V) bytes starting at
+ /// both (cur + self.pair.index1()) and (cur + self.pair.index2()). Those
+ /// two loads are the only memory accesses this routine performs.
+ #[inline(always)]
+ unsafe fn find_prefilter_in_chunk(&self, cur: *const u8) -> Option<usize> {
+ let index1 = usize::from(self.pair.index1());
+ let index2 = usize::from(self.pair.index2());
+ let chunk1 = V::load_unaligned(cur.add(index1));
+ let chunk2 = V::load_unaligned(cur.add(index2));
+ let eq1 = chunk1.cmpeq(self.v1);
+ let eq2 = chunk2.cmpeq(self.v2);
+
+ let offsets = eq1.and(eq2).movemask();
+ if !offsets.has_non_zero() {
+ return None;
+ }
+ Some(offsets.first_offset())
+ }
+
+ /// Returns the pair of offsets (into the needle) used to check as a
+ /// predicate before confirming whether a needle exists at a particular
+ /// position.
+ #[inline]
+ pub(crate) fn pair(&self) -> &Pair {
+ &self.pair
+ }
+
+ /// Returns the minimum haystack length that this `Finder` can search.
+ ///
+ /// Providing a haystack to this `Finder` shorter than this length is
+ /// guaranteed to result in a panic.
+ #[inline(always)]
+ pub(crate) fn min_haystack_len(&self) -> usize {
+ self.min_haystack_len
+ }
+}
+
+/// Accepts a chunk-relative offset and returns a haystack relative offset.
+///
+/// This used to be marked `#[cold]` and `#[inline(never)]`, but I couldn't
+/// observe a consistent measurable difference between that and just inlining
+/// it. So we go with inlining it.
+///
+/// # Safety
+///
+/// Same as `ptr::offset_from` in addition to `cur >= start`.
+#[inline(always)]
+unsafe fn matched(start: *const u8, cur: *const u8, chunki: usize) -> usize {
+ cur.distance(start) + chunki
+}
+
+// If you're looking for tests, those are run for each instantiation of the
+// above code. So for example, see arch::x86_64::sse2::packedpair.
+
/*!
+A module with low-level architecture dependent routines.
+
+These routines are useful as primitives for tasks not covered by the higher
+level crate API.
+*/
+
+pub mod all;
+pub(crate) mod generic;
+
+#[cfg(target_arch = "aarch64")]
+pub mod aarch64;
+#[cfg(target_arch = "wasm32")]
+pub mod wasm32;
+#[cfg(target_arch = "x86_64")]
+pub mod x86_64;
+
1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 +95 +96 +97 +98 +99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 +247 +248 +249 +250 +251 +252 +253 +254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 +265 +266 +267 +268 +269 +270 +271 +272 +273 +274 +275 +276 +277 +278 +279 +280 +281 +282 +283 +284 +285 +286 +287 +288 +289 +290 +291 +292 +293 +294 +295 +296 +297 +298 +299 +300 +301 +302 +303 +304 +305 +306 +307 +308 +309 +310 +311 +312 +313 +314 +315 +316 +317 +318 +319 +320 +321 +322 +323 +324 +325 +326 +327 +328 +329 +330 +331 +332 +333 +334 +335 +336 +337 +338 +339 +340 +341 +342 +343 +344 +345 +346 +347 +348 +349 +350 +351 +352 +353 +354 +355 +356 +357 +358 +359 +360 +361 +362 +363 +364 +365 +366 +367 +368 +369 +370 +371 +372 +373 +374 +375 +376 +377 +378 +379 +380 +381 +382 +383 +384 +385 +386 +387 +388 +389 +390 +391 +392 +393 +394 +395 +396 +397 +398 +399 +400 +401 +402 +403 +404 +405 +406 +407 +408 +409 +410 +411 +412 +413 +414 +415 +416 +417 +418 +419 +420 +421 
+422 +423 +424 +425 +426 +427 +428 +429 +430 +431 +432 +433 +434 +435 +436 +437 +438 +439 +440 +441 +442 +443 +444 +445 +446 +447 +448 +449 +450 +451 +452 +453 +454 +455 +456 +457 +458 +459 +460 +461 +462 +463 +464 +465 +466 +467 +468 +469 +470 +471 +472 +473 +474 +475 +476 +477 +478 +479 +480 +481 +482 +483 +484 +485 +486 +487 +488 +489 +490 +491 +492 +493 +494 +495 +496 +497 +498 +499 +500 +501 +502 +503 +504 +505 +506 +507 +508 +509 +510 +511 +512 +513 +514 +515 +516 +517 +518 +519 +520 +521 +522 +523 +524 +525 +526 +527 +528 +529 +530 +531 +532 +533 +534 +535 +536 +537 +538 +539 +540 +541 +542 +543 +544 +545 +546 +547 +548 +549 +550 +551 +552 +553 +554 +555 +556 +557 +558 +559 +560 +561 +562 +563 +564 +565 +566 +567 +568 +569 +570 +571 +572 +573 +574 +575 +576 +577 +578 +579 +580 +581 +582 +583 +584 +585 +586 +587 +588 +589 +590 +591 +592 +593 +594 +595 +596 +597 +598 +599 +600 +601 +602 +603 +604 +605 +606 +607 +608 +609 +610 +611 +612 +613 +614 +615 +616 +617 +618 +619 +620 +621 +622 +623 +624 +625 +626 +627 +628 +629 +630 +631 +632 +633 +634 +635 +636 +637 +638 +639 +640 +641 +642 +643 +644 +645 +646 +647 +648 +649 +650 +651 +652 +653 +654 +655 +656 +657 +658 +659 +660 +661 +662 +663 +664 +665 +666 +667 +668 +669 +670 +671 +672 +673 +674 +675 +676 +677 +678 +679 +680 +681 +682 +683 +684 +685 +686 +687 +688 +689 +690 +691 +692 +693 +694 +695 +696 +697 +698 +699 +700 +701 +702 +703 +704 +705 +706 +707 +708 +709 +710 +711 +712 +713 +714 +715 +716 +717 +718 +719 +720 +721 +722 +723 +724 +725 +726 +727 +728 +729 +730 +731 +732 +733 +734 +735 +736 +737 +738 +739 +740 +741 +742 +743 +744 +745 +746 +747 +748 +749 +750 +751 +752 +753 +754 +755 +756 +757 +758 +759 +760 +761 +762 +763 +764 +765 +766 +767 +768 +769 +770 +771 +772 +773 +774 +775 +776 +777 +778 +779 +780 +781 +782 +783 +784 +785 +786 +787 +788 +789 +790 +791 +792 +793 +794 +795 +796 +797 +798 +799 +800 +801 +802 +803 +804 +805 +806 +807 +808 +809 +810 +811 +812 +813 +814 +815 +816 +817 +818 +819 +820 +821 
+822 +823 +824 +825 +826 +827 +828 +829 +830 +831 +832 +833 +834 +835 +836 +837 +838 +839 +840 +841 +842 +843 +844 +845 +846 +847 +848 +849 +850 +851 +852 +853 +854 +855 +856 +857 +858 +859 +860 +861 +862 +863 +864 +865 +866 +867 +868 +869 +870 +871 +872 +873 +874 +875 +876 +877 +878 +879 +880 +881 +882 +883 +884 +885 +886 +887 +888 +889 +890 +891 +892 +893 +894 +895 +896 +897 +898 +899 +900 +901 +902 +903 +904 +905 +906 +907 +908 +909 +910 +911 +912 +913 +914 +915 +916 +917 +918 +919 +920 +921 +922 +923 +924 +925 +926 +927 +928 +929 +930 +931 +932 +933 +934 +935 +936 +937 +938 +939 +940 +941 +942 +943 +944 +945 +946 +947 +948 +949 +950 +951 +952 +953 +954 +955 +956 +957 +958 +959 +960 +961 +962 +963 +964 +965 +966 +967 +968 +969 +970 +971 +972 +973 +974 +975 +976 +977 +978 +979 +980 +981 +982 +983 +984 +985 +986 +987 +988 +989 +990 +991 +992 +993 +994 +995 +996 +997 +998 +999 +1000 +1001 +1002 +1003 +1004 +1005 +1006 +1007 +1008 +1009 +1010 +1011 +1012 +1013 +1014 +1015 +1016 +1017 +1018 +1019 +1020 +1021 +1022 +1023 +1024 +1025 +1026 +1027 +1028 +1029 +1030 +1031 +1032 +1033 +1034 +1035 +1036 +1037 +1038 +1039 +1040 +1041 +1042 +1043 +1044 +1045 +1046 +1047 +1048 +1049 +1050 +1051 +1052 +1053 +1054 +1055 +1056 +1057 +1058 +1059 +1060 +1061 +1062 +1063 +1064 +1065 +1066 +1067 +1068 +1069 +1070 +1071 +1072 +1073 +1074 +1075 +1076 +1077 +1078 +1079 +1080 +1081 +1082 +1083 +1084 +1085 +1086 +1087 +1088 +1089 +1090 +1091 +1092 +1093 +1094 +1095 +1096 +1097 +1098 +1099 +1100 +1101 +1102 +1103 +1104 +1105 +1106 +1107 +1108 +1109 +1110 +1111 +1112 +1113 +1114 +1115 +1116 +1117 +1118 +1119 +1120 +1121 +1122 +1123 +1124 +1125 +1126 +1127 +1128 +1129 +1130 +1131 +1132 +1133 +1134 +1135 +1136 +1137 +1138 +1139 +1140 +1141 +1142 +1143 +1144 +1145 +1146 +1147 +1148 +1149 +1150 +1151 +1152 +1153 +1154 +1155 +1156 +1157 +1158 +1159 +1160 +1161 +1162 +1163 +1164 +1165 +1166 +1167 +1168 +1169 +1170 +1171 +1172 +1173 +1174 +1175 +1176 +1177 +1178 +1179 +1180 +1181 +1182 +1183 +1184 
+1185 +1186 +1187 +1188 +1189 +1190 +1191 +1192 +1193 +1194 +1195 +1196 +1197 +1198 +1199 +1200 +1201 +1202 +1203 +1204 +1205 +1206 +1207 +1208 +1209 +1210 +1211 +1212 +1213 +1214 +1215 +1216 +1217 +1218 +1219 +1220 +1221 +1222 +1223 +1224 +1225 +1226 +1227 +1228 +1229 +1230 +1231 +1232 +1233 +1234 +1235 +1236 +1237 +1238 +1239 +1240 +1241 +1242 +1243 +1244 +1245 +1246 +1247 +1248 +1249 +1250 +1251 +1252 +1253 +1254 +1255 +1256 +1257 +1258 +1259 +1260 +1261 +1262 +1263 +1264 +1265 +1266 +1267 +1268 +1269 +1270 +1271 +1272 +1273 +1274 +1275 +1276 +1277 +1278 +1279 +1280 +1281 +1282 +1283 +1284 +1285 +1286 +1287 +1288 +1289 +1290 +1291 +1292 +1293 +1294 +1295 +1296 +1297 +1298 +1299 +1300 +1301 +1302 +1303 +1304 +1305 +1306 +1307 +1308 +1309 +1310 +1311 +1312 +1313 +1314 +1315 +1316 +1317 +1318 +1319 +1320 +1321 +1322 +1323 +1324 +1325 +1326 +1327 +1328 +1329 +1330 +1331 +1332 +1333 +1334 +1335 +1336 +1337 +1338 +1339 +1340 +1341 +1342 +1343 +1344 +1345 +1346 +1347 +1348 +1349 +1350 +1351 +1352 +
/*!
+This module defines 256-bit vector implementations of `memchr` and friends.
+
+The main types in this module are [`One`], [`Two`] and [`Three`]. They are for
+searching for one, two or three distinct bytes, respectively, in a haystack.
+Each type also has corresponding double ended iterators. These searchers are
+typically much faster than scalar routines accomplishing the same task.
+
+The `One` searcher also provides a [`One::count`] routine for efficiently
+counting the number of times a single byte occurs in a haystack. This is
+useful, for example, for counting the number of lines in a haystack. This
+routine exists because it is usually faster, especially with a high match
+count, than using [`One::find`] repeatedly. ([`OneIter`] specializes its
+`Iterator::count` implementation to use this routine.)
+
+Only one, two and three bytes are supported because three bytes is about
+the point where one sees diminishing returns. Beyond this point, it's
+probably (but not necessarily) better to just use a simple `[bool; 256]` array
+or similar. However, it depends mightily on the specific work-load and the
+expected match frequency.
+*/
+
+use core::arch::x86_64::{__m128i, __m256i};
+
+use crate::{arch::generic::memchr as generic, ext::Pointer, vector::Vector};
+
+/// Finds all occurrences of a single byte in a haystack.
+#[derive(Clone, Copy, Debug)]
+pub struct One {
+ /// Used for haystacks of at least 16 but fewer than 32 bytes.
+ sse2: generic::One<__m128i>,
+ /// Used for haystacks of 32 bytes or more.
+ avx2: generic::One<__m256i>,
+}
+
+impl One {
+ /// Create a new searcher that finds occurrences of the needle byte given.
+ ///
+ /// This particular searcher is specialized to use AVX2 vector instructions
+ /// that typically make it quite fast. (SSE2 is used for haystacks that
+ /// are too short to accommodate an AVX2 vector.)
+ ///
+ /// If either SSE2 or AVX2 is unavailable in the current environment, then
+ /// `None` is returned.
+ #[inline]
+ pub fn new(needle: u8) -> Option<One> {
+ if One::is_available() {
+ // SAFETY: we check that sse2 and avx2 are available above.
+ unsafe { Some(One::new_unchecked(needle)) }
+ } else {
+ None
+ }
+ }
+
+ /// Create a new finder specific to AVX2 vectors and routines without
+ /// checking that either SSE2 or AVX2 is available.
+ ///
+ /// # Safety
+ ///
+ /// Callers must guarantee that it is safe to execute both `sse2` and
+ /// `avx2` instructions in the current environment.
+ ///
+ /// Note that it is a common misconception that if one compiles for an
+ /// `x86_64` target, then they therefore automatically have access to SSE2
+ /// instructions. While this is almost always the case, it isn't true in
+ /// 100% of cases.
+ #[target_feature(enable = "sse2", enable = "avx2")]
+ #[inline]
+ pub unsafe fn new_unchecked(needle: u8) -> One {
+ One {
+ sse2: generic::One::new(needle),
+ avx2: generic::One::new(needle),
+ }
+ }
+
+ /// Returns true when this implementation is available in the current
+ /// environment.
+ ///
+ /// When this is true, it is guaranteed that [`One::new`] will return
+ /// a `Some` value. Similarly, when it is false, it is guaranteed that
+ /// `One::new` will return a `None` value.
+ ///
+ /// Note also that for the lifetime of a single program, if this returns
+ /// true then it will always return true.
+ #[inline]
+ pub fn is_available() -> bool {
+ #[cfg(not(target_feature = "sse2"))]
+ {
+ false
+ }
+ #[cfg(target_feature = "sse2")]
+ {
+ #[cfg(target_feature = "avx2")]
+ {
+ true
+ }
+ #[cfg(not(target_feature = "avx2"))]
+ {
+ #[cfg(feature = "std")]
+ {
+ std::is_x86_feature_detected!("avx2")
+ }
+ #[cfg(not(feature = "std"))]
+ {
+ false
+ }
+ }
+ }
+ }
+
+ /// Return the first occurrence of the needle byte in the given haystack.
+ /// If no such occurrence exists, then `None` is returned.
+ ///
+ /// The occurrence is reported as an offset into `haystack`. Its maximum
+ /// value is `haystack.len() - 1`.
+ #[inline]
+ pub fn find(&self, haystack: &[u8]) -> Option<usize> {
+ // SAFETY: `find_raw` guarantees that if a pointer is returned, it
+ // falls within the bounds of the start and end pointers.
+ unsafe {
+ generic::search_slice_with_raw(haystack, |s, e| {
+ self.find_raw(s, e)
+ })
+ }
+ }
+
+ /// Return the last occurrence of the needle byte in the given haystack.
+ /// If no such occurrence exists, then `None` is returned.
+ ///
+ /// The occurrence is reported as an offset into `haystack`. Its maximum
+ /// value is `haystack.len() - 1`.
+ #[inline]
+ pub fn rfind(&self, haystack: &[u8]) -> Option<usize> {
+ // SAFETY: `rfind_raw` guarantees that if a pointer is returned, it
+ // falls within the bounds of the start and end pointers.
+ unsafe {
+ generic::search_slice_with_raw(haystack, |s, e| {
+ self.rfind_raw(s, e)
+ })
+ }
+ }
+
+ /// Counts all occurrences of this byte in the given haystack.
+ #[inline]
+ pub fn count(&self, haystack: &[u8]) -> usize {
+ // SAFETY: All of our pointers are derived directly from a borrowed
+ // slice, which is guaranteed to be valid.
+ unsafe {
+ let start = haystack.as_ptr();
+ let end = start.add(haystack.len());
+ self.count_raw(start, end)
+ }
+ }
+
+ /// Like `find`, but accepts and returns raw pointers.
+ ///
+ /// When a match is found, the pointer returned is guaranteed to be
+ /// `>= start` and `< end`.
+ ///
+ /// This routine is useful if you're already using raw pointers and would
+ /// like to avoid converting back to a slice before executing a search.
+ ///
+ /// # Safety
+ ///
+ /// * Both `start` and `end` must be valid for reads.
+ /// * Both `start` and `end` must point to an initialized value.
+ /// * Both `start` and `end` must point to the same allocated object and
+ /// must either be in bounds or at most one byte past the end of the
+ /// allocated object.
+ /// * Both `start` and `end` must be _derived from_ a pointer to the same
+ /// object.
+ /// * The distance between `start` and `end` must not overflow `isize`.
+ /// * The distance being in bounds must not rely on "wrapping around" the
+ /// address space.
+ ///
+ /// Note that callers may pass a pair of pointers such that `start >= end`.
+ /// In that case, `None` will always be returned.
+ #[inline]
+ pub unsafe fn find_raw(
+ &self,
+ start: *const u8,
+ end: *const u8,
+ ) -> Option<*const u8> {
+ if start >= end {
+ return None;
+ }
+ let len = end.distance(start);
+ if len < __m256i::BYTES {
+ return if len < __m128i::BYTES {
+ // SAFETY: We require the caller to pass valid start/end
+ // pointers.
+ generic::fwd_byte_by_byte(start, end, |b| {
+ b == self.sse2.needle1()
+ })
+ } else {
+ // SAFETY: We require the caller to pass valid start/end
+ // pointers.
+ self.find_raw_sse2(start, end)
+ };
+ }
+ // SAFETY: Building a `One` means it's safe to call both 'sse2' and
+ // 'avx2' routines. Also, we've checked that our haystack is big
+ // enough to run on the vector routine. Pointer validity is caller's
+ // responsibility.
+ //
+ // Note that we could call `self.avx2.find_raw` directly here. But that
+ // means we'd have to annotate this routine with `target_feature`.
+ // Which is fine, because this routine is `unsafe` anyway and the
+ // `target_feature` obligation is met by virtue of building a `One`.
+ // The real problem is that a routine with a `target_feature`
+ // annotation generally can't be inlined into caller code unless
+ // the caller code has the same target feature annotations. Namely,
+ // the common case (at time of writing) is for calling code to not
+ // have the `avx2` target feature enabled *at compile time*. Without
+ // `target_feature` on this routine, it can be inlined which will
+ // handle some of the short-haystack cases above without touching the
+ // architecture specific code.
+ self.find_raw_avx2(start, end)
+ }
+
+ /// Like `rfind`, but accepts and returns raw pointers.
+ ///
+ /// When a match is found, the pointer returned is guaranteed to be
+ /// `>= start` and `< end`.
+ ///
+ /// This routine is useful if you're already using raw pointers and would
+ /// like to avoid converting back to a slice before executing a search.
+ ///
+ /// # Safety
+ ///
+ /// * Both `start` and `end` must be valid for reads.
+ /// * Both `start` and `end` must point to an initialized value.
+ /// * Both `start` and `end` must point to the same allocated object and
+ /// must either be in bounds or at most one byte past the end of the
+ /// allocated object.
+ /// * Both `start` and `end` must be _derived from_ a pointer to the same
+ /// object.
+ /// * The distance between `start` and `end` must not overflow `isize`.
+ /// * The distance being in bounds must not rely on "wrapping around" the
+ /// address space.
+ ///
+ /// Note that callers may pass a pair of pointers such that `start >= end`.
+ /// In that case, `None` will always be returned.
+ #[inline]
+ pub unsafe fn rfind_raw(
+ &self,
+ start: *const u8,
+ end: *const u8,
+ ) -> Option<*const u8> {
+ if start >= end {
+ return None;
+ }
+ let len = end.distance(start);
+ if len < __m256i::BYTES {
+ return if len < __m128i::BYTES {
+ // SAFETY: We require the caller to pass valid start/end
+ // pointers.
+ generic::rev_byte_by_byte(start, end, |b| {
+ b == self.sse2.needle1()
+ })
+ } else {
+ // SAFETY: We require the caller to pass valid start/end
+ // pointers.
+ self.rfind_raw_sse2(start, end)
+ };
+ }
+ // SAFETY: Building a `One` means it's safe to call both 'sse2' and
+ // 'avx2' routines. Also, we've checked that our haystack is big
+ // enough to run on the vector routine. Pointer validity is caller's
+ // responsibility.
+ //
+ // See note in forward routine above for why we don't just call
+ // `self.avx2.rfind_raw` directly here.
+ self.rfind_raw_avx2(start, end)
+ }
+
+ /// Counts all occurrences of this byte in the given haystack represented
+ /// by raw pointers.
+ ///
+ /// This routine is useful if you're already using raw pointers and would
+ /// like to avoid converting back to a slice before executing a search.
+ ///
+ /// # Safety
+ ///
+ /// * Both `start` and `end` must be valid for reads.
+ /// * Both `start` and `end` must point to an initialized value.
+ /// * Both `start` and `end` must point to the same allocated object and
+ /// must either be in bounds or at most one byte past the end of the
+ /// allocated object.
+ /// * Both `start` and `end` must be _derived from_ a pointer to the same
+ /// object.
+ /// * The distance between `start` and `end` must not overflow `isize`.
+ /// * The distance being in bounds must not rely on "wrapping around" the
+ /// address space.
+ ///
+ /// Note that callers may pass a pair of pointers such that `start >= end`.
+ /// In that case, `0` will always be returned.
+ #[inline]
+ pub unsafe fn count_raw(&self, start: *const u8, end: *const u8) -> usize {
+ if start >= end {
+ return 0;
+ }
+ let len = end.distance(start);
+ if len < __m256i::BYTES {
+ return if len < __m128i::BYTES {
+ // SAFETY: We require the caller to pass valid start/end
+ // pointers.
+ generic::count_byte_by_byte(start, end, |b| {
+ b == self.sse2.needle1()
+ })
+ } else {
+ // SAFETY: We require the caller to pass valid start/end
+ // pointers.
+ self.count_raw_sse2(start, end)
+ };
+ }
+ // SAFETY: Building a `One` means it's safe to call both 'sse2' and
+ // 'avx2' routines. Also, we've checked that our haystack is big
+ // enough to run on the vector routine. Pointer validity is caller's
+ // responsibility.
+ self.count_raw_avx2(start, end)
+ }
+
+ /// Execute a search using SSE2 vectors and routines.
+ ///
+ /// # Safety
+ ///
+ /// Same as [`One::find_raw`], except the distance between `start` and
+ /// `end` must be at least the size of an SSE2 vector (in bytes).
+ ///
+ /// (The target feature safety obligation is automatically fulfilled by
+ /// virtue of being a method on `One`, which can only be constructed
+ /// when it is safe to call `sse2`/`avx2` routines.)
+ #[target_feature(enable = "sse2")]
+ #[inline]
+ unsafe fn find_raw_sse2(
+ &self,
+ start: *const u8,
+ end: *const u8,
+ ) -> Option<*const u8> {
+ self.sse2.find_raw(start, end)
+ }
+
+ /// Execute a reverse search using SSE2 vectors and routines.
+ ///
+ /// # Safety
+ ///
+ /// Same as [`One::rfind_raw`], except the distance between `start` and
+ /// `end` must be at least the size of an SSE2 vector (in bytes).
+ ///
+ /// (The target feature safety obligation is automatically fulfilled by
+ /// virtue of being a method on `One`, which can only be constructed
+ /// when it is safe to call `sse2`/`avx2` routines.)
+ #[target_feature(enable = "sse2")]
+ #[inline]
+ unsafe fn rfind_raw_sse2(
+ &self,
+ start: *const u8,
+ end: *const u8,
+ ) -> Option<*const u8> {
+ self.sse2.rfind_raw(start, end)
+ }
+
+ /// Execute a count using SSE2 vectors and routines.
+ ///
+ /// # Safety
+ ///
+ /// Same as [`One::count_raw`], except the distance between `start` and
+ /// `end` must be at least the size of an SSE2 vector (in bytes).
+ ///
+ /// (The target feature safety obligation is automatically fulfilled by
+ /// virtue of being a method on `One`, which can only be constructed
+ /// when it is safe to call `sse2`/`avx2` routines.)
+ #[target_feature(enable = "sse2")]
+ #[inline]
+ unsafe fn count_raw_sse2(
+ &self,
+ start: *const u8,
+ end: *const u8,
+ ) -> usize {
+ self.sse2.count_raw(start, end)
+ }
+
+ /// Execute a search using AVX2 vectors and routines.
+ ///
+ /// # Safety
+ ///
+ /// Same as [`One::find_raw`], except the distance between `start` and
+ /// `end` must be at least the size of an AVX2 vector (in bytes).
+ ///
+ /// (The target feature safety obligation is automatically fulfilled by
+ /// virtue of being a method on `One`, which can only be constructed
+ /// when it is safe to call `sse2`/`avx2` routines.)
+ #[target_feature(enable = "avx2")]
+ #[inline]
+ unsafe fn find_raw_avx2(
+ &self,
+ start: *const u8,
+ end: *const u8,
+ ) -> Option<*const u8> {
+ self.avx2.find_raw(start, end)
+ }
+
+ /// Execute a reverse search using AVX2 vectors and routines.
+ ///
+ /// # Safety
+ ///
+ /// Same as [`One::rfind_raw`], except the distance between `start` and
+ /// `end` must be at least the size of an AVX2 vector (in bytes).
+ ///
+ /// (The target feature safety obligation is automatically fulfilled by
+ /// virtue of being a method on `One`, which can only be constructed
+ /// when it is safe to call `sse2`/`avx2` routines.)
+ #[target_feature(enable = "avx2")]
+ #[inline]
+ unsafe fn rfind_raw_avx2(
+ &self,
+ start: *const u8,
+ end: *const u8,
+ ) -> Option<*const u8> {
+ self.avx2.rfind_raw(start, end)
+ }
+
+ /// Execute a count using AVX2 vectors and routines.
+ ///
+ /// # Safety
+ ///
+ /// Same as [`One::count_raw`], except the distance between `start` and
+ /// `end` must be at least the size of an AVX2 vector (in bytes).
+ ///
+ /// (The target feature safety obligation is automatically fulfilled by
+ /// virtue of being a method on `One`, which can only be constructed
+ /// when it is safe to call `sse2`/`avx2` routines.)
+ #[target_feature(enable = "avx2")]
+ #[inline]
+ unsafe fn count_raw_avx2(
+ &self,
+ start: *const u8,
+ end: *const u8,
+ ) -> usize {
+ self.avx2.count_raw(start, end)
+ }
+
+ /// Returns an iterator over all occurrences of the needle byte in the
+ /// given haystack.
+ ///
+ /// The iterator returned implements `DoubleEndedIterator`. This means it
+ /// can also be used to find occurrences in reverse order.
+ #[inline]
+ pub fn iter<'a, 'h>(&'a self, haystack: &'h [u8]) -> OneIter<'a, 'h> {
+ OneIter { searcher: self, it: generic::Iter::new(haystack) }
+ }
+}
+
+/// An iterator over all occurrences of a single byte in a haystack.
+///
+/// This iterator implements `DoubleEndedIterator` and `FusedIterator`, so it
+/// can also be used to find occurrences in reverse order.
+///
+/// This iterator is created by the [`One::iter`] method.
+///
+/// The lifetime parameters are as follows:
+///
+/// * `'a` refers to the lifetime of the underlying [`One`] searcher.
+/// * `'h` refers to the lifetime of the haystack being searched.
+#[derive(Clone, Debug)]
+pub struct OneIter<'a, 'h> {
+ searcher: &'a One,
+ it: generic::Iter<'h>,
+}
+
+impl<'a, 'h> Iterator for OneIter<'a, 'h> {
+ type Item = usize;
+
+ #[inline]
+ fn next(&mut self) -> Option<usize> {
+ // SAFETY: We rely on the generic iterator to provide valid start
+ // and end pointers, but we guarantee that any pointer returned by
+ // 'find_raw' falls within the bounds of the start and end pointer.
+ unsafe { self.it.next(|s, e| self.searcher.find_raw(s, e)) }
+ }
+
+ #[inline]
+ fn count(self) -> usize {
+ self.it.count(|s, e| {
+ // SAFETY: We rely on our generic iterator to return valid start
+ // and end pointers, which is all that `count_raw` requires.
+ unsafe { self.searcher.count_raw(s, e) }
+ })
+ }
+
+ #[inline]
+ fn size_hint(&self) -> (usize, Option<usize>) {
+ self.it.size_hint()
+ }
+}
+
+impl<'a, 'h> DoubleEndedIterator for OneIter<'a, 'h> {
+ #[inline]
+ fn next_back(&mut self) -> Option<usize> {
+ // SAFETY: We rely on the generic iterator to provide valid start
+ // and end pointers, but we guarantee that any pointer returned by
+ // 'rfind_raw' falls within the bounds of the start and end pointer.
+ unsafe { self.it.next_back(|s, e| self.searcher.rfind_raw(s, e)) }
+ }
+}
+
+impl<'a, 'h> core::iter::FusedIterator for OneIter<'a, 'h> {}
+
+/// Finds all occurrences of two bytes in a haystack.
+///
+/// That is, this reports matches of one of two possible bytes. For example,
+/// searching for `a` or `b` in `afoobar` would report matches at offsets `0`,
+/// `4` and `5`.
+#[derive(Clone, Copy, Debug)]
+pub struct Two {
+ /// Used for haystacks of at least 16 but fewer than 32 bytes.
+ sse2: generic::Two<__m128i>,
+ /// Used for haystacks of 32 bytes or more.
+ avx2: generic::Two<__m256i>,
+}
+
+impl Two {
+ /// Create a new searcher that finds occurrences of the needle bytes given.
+ ///
+ /// This particular searcher is specialized to use AVX2 vector instructions
+ /// that typically make it quite fast. (SSE2 is used for haystacks that
+ /// are too short to accommodate an AVX2 vector.)
+ ///
+ /// If either SSE2 or AVX2 is unavailable in the current environment, then
+ /// `None` is returned.
+ #[inline]
+ pub fn new(needle1: u8, needle2: u8) -> Option<Two> {
+ if Two::is_available() {
+ // SAFETY: we check that sse2 and avx2 are available above.
+ unsafe { Some(Two::new_unchecked(needle1, needle2)) }
+ } else {
+ None
+ }
+ }
+
+ /// Create a new finder specific to AVX2 vectors and routines without
+ /// checking that either SSE2 or AVX2 is available.
+ ///
+ /// # Safety
+ ///
+ /// Callers must guarantee that it is safe to execute both `sse2` and
+ /// `avx2` instructions in the current environment.
+ ///
+ /// Note that it is a common misconception that if one compiles for an
+ /// `x86_64` target, then they therefore automatically have access to SSE2
+ /// instructions. While this is almost always the case, it isn't true in
+ /// 100% of cases.
+ #[target_feature(enable = "sse2", enable = "avx2")]
+ #[inline]
+ pub unsafe fn new_unchecked(needle1: u8, needle2: u8) -> Two {
+ Two {
+ sse2: generic::Two::new(needle1, needle2),
+ avx2: generic::Two::new(needle1, needle2),
+ }
+ }
+
+ /// Returns true when this implementation is available in the current
+ /// environment.
+ ///
+ /// When this is true, it is guaranteed that [`Two::new`] will return
+ /// a `Some` value. Similarly, when it is false, it is guaranteed that
+ /// `Two::new` will return a `None` value.
+ ///
+ /// Note also that for the lifetime of a single program, if this returns
+ /// true then it will always return true.
+ #[inline]
+ pub fn is_available() -> bool {
+ #[cfg(not(target_feature = "sse2"))]
+ {
+ false
+ }
+ #[cfg(target_feature = "sse2")]
+ {
+ #[cfg(target_feature = "avx2")]
+ {
+ true
+ }
+ #[cfg(not(target_feature = "avx2"))]
+ {
+ #[cfg(feature = "std")]
+ {
+ std::is_x86_feature_detected!("avx2")
+ }
+ #[cfg(not(feature = "std"))]
+ {
+ false
+ }
+ }
+ }
+ }
+
+ /// Return the first occurrence of one of the needle bytes in the given
+ /// haystack. If no such occurrence exists, then `None` is returned.
+ ///
+ /// The occurrence is reported as an offset into `haystack`. Its maximum
+ /// value is `haystack.len() - 1`.
+ #[inline]
+ pub fn find(&self, haystack: &[u8]) -> Option<usize> {
+ // SAFETY: `find_raw` guarantees that if a pointer is returned, it
+ // falls within the bounds of the start and end pointers.
+ unsafe {
+ generic::search_slice_with_raw(haystack, |s, e| {
+ self.find_raw(s, e)
+ })
+ }
+ }
+
+ /// Return the last occurrence of one of the needle bytes in the given
+ /// haystack. If no such occurrence exists, then `None` is returned.
+ ///
+ /// The occurrence is reported as an offset into `haystack`. Its maximum
+ /// value is `haystack.len() - 1`.
+ #[inline]
+ pub fn rfind(&self, haystack: &[u8]) -> Option<usize> {
+ // SAFETY: `rfind_raw` guarantees that if a pointer is returned, it
+ // falls within the bounds of the start and end pointers.
+ unsafe {
+ generic::search_slice_with_raw(haystack, |s, e| {
+ self.rfind_raw(s, e)
+ })
+ }
+ }
+
+ /// Like `find`, but accepts and returns raw pointers.
+ ///
+ /// When a match is found, the pointer returned is guaranteed to be
+ /// `>= start` and `< end`.
+ ///
+ /// This routine is useful if you're already using raw pointers and would
+ /// like to avoid converting back to a slice before executing a search.
+ ///
+ /// # Safety
+ ///
+ /// * Both `start` and `end` must be valid for reads.
+ /// * Both `start` and `end` must point to an initialized value.
+ /// * Both `start` and `end` must point to the same allocated object and
+ /// must either be in bounds or at most one byte past the end of the
+ /// allocated object.
+ /// * Both `start` and `end` must be _derived from_ a pointer to the same
+ /// object.
+ /// * The distance between `start` and `end` must not overflow `isize`.
+ /// * The distance being in bounds must not rely on "wrapping around" the
+ /// address space.
+ ///
+ /// Note that callers may pass a pair of pointers such that `start >= end`.
+ /// In that case, `None` will always be returned.
+ #[inline]
+ pub unsafe fn find_raw(
+ &self,
+ start: *const u8,
+ end: *const u8,
+ ) -> Option<*const u8> {
+ if start >= end {
+ return None;
+ }
+ let len = end.distance(start);
+ if len < __m256i::BYTES {
+ return if len < __m128i::BYTES {
+ // SAFETY: We require the caller to pass valid start/end
+ // pointers.
+ generic::fwd_byte_by_byte(start, end, |b| {
+ b == self.sse2.needle1() || b == self.sse2.needle2()
+ })
+ } else {
+ // SAFETY: We require the caller to pass valid start/end
+ // pointers.
+ self.find_raw_sse2(start, end)
+ };
+ }
+ // SAFETY: Building a `Two` means it's safe to call both 'sse2' and
+ // 'avx2' routines. Also, we've checked that our haystack is big
+ // enough to run on the vector routine. Pointer validity is caller's
+ // responsibility.
+ //
+ // Note that we could call `self.avx2.find_raw` directly here. But that
+ // means we'd have to annotate this routine with `target_feature`.
+ // Which is fine, because this routine is `unsafe` anyway and the
+ // `target_feature` obligation is met by virtue of building a `Two`.
+ // The real problem is that a routine with a `target_feature`
+ // annotation generally can't be inlined into caller code unless
+ // the caller code has the same target feature annotations. Namely,
+ // the common case (at time of writing) is for calling code to not
+ // have the `avx2` target feature enabled *at compile time*. Without
+ // `target_feature` on this routine, it can be inlined which will
+ // handle some of the short-haystack cases above without touching the
+ // architecture specific code.
+ self.find_raw_avx2(start, end)
+ }
+
+ /// Like `rfind`, but accepts and returns raw pointers.
+ ///
+ /// When a match is found, the pointer returned is guaranteed to be
+ /// `>= start` and `< end`.
+ ///
+ /// This routine is useful if you're already using raw pointers and would
+ /// like to avoid converting back to a slice before executing a search.
+ ///
+ /// # Safety
+ ///
+ /// * Both `start` and `end` must be valid for reads.
+ /// * Both `start` and `end` must point to an initialized value.
+ /// * Both `start` and `end` must point to the same allocated object and
+ /// must either be in bounds or at most one byte past the end of the
+ /// allocated object.
+ /// * Both `start` and `end` must be _derived from_ a pointer to the same
+ /// object.
+ /// * The distance between `start` and `end` must not overflow `isize`.
+ /// * The distance being in bounds must not rely on "wrapping around" the
+ /// address space.
+ ///
+ /// Note that callers may pass a pair of pointers such that `start >= end`.
+ /// In that case, `None` will always be returned.
+ #[inline]
+ pub unsafe fn rfind_raw(
+ &self,
+ start: *const u8,
+ end: *const u8,
+ ) -> Option<*const u8> {
+ if start >= end {
+ return None;
+ }
+ let len = end.distance(start);
+ if len < __m256i::BYTES {
+ return if len < __m128i::BYTES {
+ // SAFETY: We require the caller to pass valid start/end
+ // pointers.
+ generic::rev_byte_by_byte(start, end, |b| {
+ b == self.sse2.needle1() || b == self.sse2.needle2()
+ })
+ } else {
+ // SAFETY: We require the caller to pass valid start/end
+ // pointers.
+ self.rfind_raw_sse2(start, end)
+ };
+ }
+ // SAFETY: Building a `Two` means it's safe to call both 'sse2' and
+ // 'avx2' routines. Also, we've checked that our haystack is big
+ // enough to run on the vector routine. Pointer validity is caller's
+ // responsibility.
+ //
+ // See note in forward routine above for why we don't just call
+ // `self.avx2.rfind_raw` directly here.
+ self.rfind_raw_avx2(start, end)
+ }
+
+ /// Execute a search using SSE2 vectors and routines.
+ ///
+ /// # Safety
+ ///
+ /// Same as [`Two::find_raw`], except the distance between `start` and
+ /// `end` must be at least the size of an SSE2 vector (in bytes).
+ ///
+ /// (The target feature safety obligation is automatically fulfilled by
+ /// virtue of being a method on `Two`, which can only be constructed
+ /// when it is safe to call `sse2`/`avx2` routines.)
+ #[target_feature(enable = "sse2")]
+ #[inline]
+ unsafe fn find_raw_sse2(
+ &self,
+ start: *const u8,
+ end: *const u8,
+ ) -> Option<*const u8> {
+ self.sse2.find_raw(start, end)
+ }
+
+ /// Execute a reverse search using SSE2 vectors and routines.
+ ///
+ /// # Safety
+ ///
+ /// Same as [`Two::rfind_raw`], except the distance between `start` and
+ /// `end` must be at least the size of an SSE2 vector (in bytes).
+ ///
+ /// (The target feature safety obligation is automatically fulfilled by
+ /// virtue of being a method on `Two`, which can only be constructed
+ /// when it is safe to call `sse2`/`avx2` routines.)
+ #[target_feature(enable = "sse2")]
+ #[inline]
+ unsafe fn rfind_raw_sse2(
+ &self,
+ start: *const u8,
+ end: *const u8,
+ ) -> Option<*const u8> {
+ self.sse2.rfind_raw(start, end)
+ }
+
+ /// Execute a search using AVX2 vectors and routines.
+ ///
+ /// # Safety
+ ///
+ /// Same as [`Two::find_raw`], except the distance between `start` and
+ /// `end` must be at least the size of an AVX2 vector (in bytes).
+ ///
+ /// (The target feature safety obligation is automatically fulfilled by
+ /// virtue of being a method on `Two`, which can only be constructed
+ /// when it is safe to call `sse2`/`avx2` routines.)
+ #[target_feature(enable = "avx2")]
+ #[inline]
+ unsafe fn find_raw_avx2(
+ &self,
+ start: *const u8,
+ end: *const u8,
+ ) -> Option<*const u8> {
+ self.avx2.find_raw(start, end)
+ }
+
+ /// Execute a reverse search using AVX2 vectors and routines.
+ ///
+ /// # Safety
+ ///
+ /// Same as [`Two::rfind_raw`], except the distance between `start` and
+ /// `end` must be at least the size of an AVX2 vector (in bytes).
+ ///
+ /// (The target feature safety obligation is automatically fulfilled by
+ /// virtue of being a method on `Two`, which can only be constructed
+ /// when it is safe to call `sse2`/`avx2` routines.)
+ #[target_feature(enable = "avx2")]
+ #[inline]
+ unsafe fn rfind_raw_avx2(
+ &self,
+ start: *const u8,
+ end: *const u8,
+ ) -> Option<*const u8> {
+ self.avx2.rfind_raw(start, end)
+ }
+
+ /// Returns an iterator over all occurrences of the needle bytes in the
+ /// given haystack.
+ ///
+ /// The iterator returned implements `DoubleEndedIterator`. This means it
+ /// can also be used to find occurrences in reverse order.
+ #[inline]
+ pub fn iter<'a, 'h>(&'a self, haystack: &'h [u8]) -> TwoIter<'a, 'h> {
+ TwoIter { searcher: self, it: generic::Iter::new(haystack) }
+ }
+}
+
+/// An iterator over all occurrences of two possible bytes in a haystack.
+///
+/// This iterator implements `DoubleEndedIterator`, which means it can also be
+/// used to find occurrences in reverse order.
+///
+/// This iterator is created by the [`Two::iter`] method.
+///
+/// The lifetime parameters are as follows:
+///
+/// * `'a` refers to the lifetime of the underlying [`Two`] searcher.
+/// * `'h` refers to the lifetime of the haystack being searched.
+#[derive(Clone, Debug)]
+pub struct TwoIter<'a, 'h> {
+ searcher: &'a Two, // searcher whose `find_raw`/`rfind_raw` locate each match
+ it: generic::Iter<'h>, // generic iteration state created from the haystack
+}
+
+impl<'a, 'h> Iterator for TwoIter<'a, 'h> {
+ type Item = usize; // presumably a haystack offset, as with `find` -- confirm in `generic::Iter`
+
+ #[inline]
+ fn next(&mut self) -> Option<usize> {
+ // SAFETY: We rely on the generic iterator to provide valid start
+ // and end pointers, but we guarantee that any pointer returned by
+ // 'find_raw' falls within the bounds of the start and end pointer.
+ unsafe { self.it.next(|s, e| self.searcher.find_raw(s, e)) }
+ }
+
+ #[inline]
+ fn size_hint(&self) -> (usize, Option<usize>) {
+ self.it.size_hint() // bounds come entirely from the generic iterator
+ }
+}
+
+impl<'a, 'h> DoubleEndedIterator for TwoIter<'a, 'h> {
+ #[inline]
+ fn next_back(&mut self) -> Option<usize> {
+ // SAFETY: We rely on the generic iterator to provide valid start
+ // and end pointers, but we guarantee that any pointer returned by
+ // 'rfind_raw' falls within the bounds of the start and end pointer.
+ unsafe { self.it.next_back(|s, e| self.searcher.rfind_raw(s, e)) } // reverse counterpart of `next`
+ }
+}
+
+impl<'a, 'h> core::iter::FusedIterator for TwoIter<'a, 'h> {} // NOTE(review): relies on `generic::Iter` continuing to yield `None` once exhausted -- confirm
+
+/// Finds all occurrences of three bytes in a haystack.
+///
+/// That is, this reports matches of one of three possible bytes. For example,
+/// searching for `a`, `b` or `o` in `afoobar` would report matches at offsets
+/// `0`, `2`, `3`, `4` and `5`.
+#[derive(Clone, Copy, Debug)]
+pub struct Three {
+ /// Used for haystacks that hold at least one 128-bit vector but less than one 256-bit vector.
+ sse2: generic::Three<__m128i>,
+ /// Used for haystacks that hold at least one 256-bit vector (32 bytes or more).
+ avx2: generic::Three<__m256i>,
+}
+
+impl Three {
+ /// Create a new searcher that finds occurrences of the needle bytes given.
+ ///
+ /// This particular searcher is specialized to use AVX2 vector instructions
+ /// that typically make it quite fast. (SSE2 is used for haystacks that
+ /// are too short to accommodate an AVX2 vector, and haystacks shorter than an SSE2 vector are searched one byte at a time.)
+ ///
+ /// If either SSE2 or AVX2 is unavailable in the current environment, then
+ /// `None` is returned.
+ #[inline]
+ pub fn new(needle1: u8, needle2: u8, needle3: u8) -> Option<Three> {
+ if Three::is_available() {
+ // SAFETY: `is_available` returned true, so both sse2 and avx2 can be executed here.
+ unsafe { Some(Three::new_unchecked(needle1, needle2, needle3)) }
+ } else {
+ None
+ }
+ }
+
+ /// Create a new finder specific to AVX2 vectors and routines without
+ /// checking that either SSE2 or AVX2 is available.
+ ///
+ /// # Safety
+ ///
+ /// Callers must guarantee that it is safe to execute both `sse2` and
+ /// `avx2` instructions in the current environment.
+ ///
+ /// Note that it is a common misconception that if one compiles for an
+ /// `x86_64` target, then they therefore automatically have access to SSE2
+ /// instructions. While this is almost always the case, it isn't true in
+ /// 100% of cases.
+ #[target_feature(enable = "sse2", enable = "avx2")]
+ #[inline]
+ pub unsafe fn new_unchecked(
+ needle1: u8,
+ needle2: u8,
+ needle3: u8,
+ ) -> Three {
+ Three {
+ sse2: generic::Three::new(needle1, needle2, needle3), // `__m128i` searcher (see the field type)
+ avx2: generic::Three::new(needle1, needle2, needle3), // `__m256i` searcher for the same needles
+ }
+ }
+
+ /// Returns true when this implementation is available in the current
+ /// environment.
+ ///
+ /// When this is true, it is guaranteed that [`Three::new`] will return
+ /// a `Some` value. Similarly, when it is false, it is guaranteed that
+ /// `Three::new` will return a `None` value.
+ ///
+ /// Note also that for the lifetime of a single program, if this returns
+ /// true then it will always return true.
+ #[inline]
+ pub fn is_available() -> bool {
+ #[cfg(not(target_feature = "sse2"))] // built without sse2: never available
+ {
+ false
+ }
+ #[cfg(target_feature = "sse2")]
+ {
+ #[cfg(target_feature = "avx2")] // avx2 enabled at compile time: no runtime check needed
+ {
+ true
+ }
+ #[cfg(not(target_feature = "avx2"))]
+ {
+ #[cfg(feature = "std")] // with `std`, ask the CPU at runtime
+ {
+ std::is_x86_feature_detected!("avx2")
+ }
+ #[cfg(not(feature = "std"))] // without `std`, no runtime detection is attempted here
+ {
+ false
+ }
+ }
+ }
+ }
+
+ /// Return the first occurrence of one of the needle bytes in the given
+ /// haystack. If no such occurrence exists, then `None` is returned.
+ ///
+ /// The occurrence is reported as an offset into `haystack`. Its maximum
+ /// value is `haystack.len() - 1`.
+ #[inline]
+ pub fn find(&self, haystack: &[u8]) -> Option<usize> {
+ // SAFETY: `find_raw` guarantees that if a pointer is returned, it
+ // falls within the bounds of the start and end pointers.
+ unsafe {
+ generic::search_slice_with_raw(haystack, |s, e| {
+ self.find_raw(s, e) // the helper supplies the raw start/end pointers for the slice
+ })
+ }
+ }
+
+ /// Return the last occurrence of one of the needle bytes in the given
+ /// haystack. If no such occurrence exists, then `None` is returned.
+ ///
+ /// The occurrence is reported as an offset into `haystack`. Its maximum
+ /// value is `haystack.len() - 1`.
+ #[inline]
+ pub fn rfind(&self, haystack: &[u8]) -> Option<usize> {
+ // SAFETY: `rfind_raw` guarantees that if a pointer is returned, it
+ // falls within the bounds of the start and end pointers.
+ unsafe {
+ generic::search_slice_with_raw(haystack, |s, e| {
+ self.rfind_raw(s, e)
+ })
+ }
+ }
+
+ /// Like `find`, but accepts and returns raw pointers.
+ ///
+ /// When a match is found, the pointer returned is guaranteed to be
+ /// `>= start` and `< end`.
+ ///
+ /// This routine is useful if you're already using raw pointers and would
+ /// like to avoid converting back to a slice before executing a search.
+ ///
+ /// # Safety
+ ///
+ /// * Both `start` and `end` must be valid for reads.
+ /// * Both `start` and `end` must point to an initialized value.
+ /// * Both `start` and `end` must point to the same allocated object and
+ /// must either be in bounds or at most one byte past the end of the
+ /// allocated object.
+ /// * Both `start` and `end` must be _derived from_ a pointer to the same
+ /// object.
+ /// * The distance between `start` and `end` must not overflow `isize`.
+ /// * The distance being in bounds must not rely on "wrapping around" the
+ /// address space.
+ ///
+ /// Note that callers may pass a pair of pointers such that `start >= end`.
+ /// In that case, `None` will always be returned.
+ #[inline]
+ pub unsafe fn find_raw(
+ &self,
+ start: *const u8,
+ end: *const u8,
+ ) -> Option<*const u8> {
+ if start >= end { // empty or inverted range: nothing to search
+ return None;
+ }
+ let len = end.distance(start); // haystack length, compared against the vector sizes below
+ if len < __m256i::BYTES { // too short for an AVX2 vector
+ return if len < __m128i::BYTES { // too short even for an SSE2 vector
+ // SAFETY: We require the caller to pass valid start/end
+ // pointers.
+ generic::fwd_byte_by_byte(start, end, |b| {
+ b == self.sse2.needle1()
+ || b == self.sse2.needle2()
+ || b == self.sse2.needle3()
+ })
+ } else {
+ // SAFETY: We require the caller to pass valid start/end
+ // pointers.
+ self.find_raw_sse2(start, end)
+ };
+ }
+ // SAFETY: Building a `Three` means it's safe to call both 'sse2' and
+ // 'avx2' routines. Also, we've checked that our haystack is big
+ // enough to run on the vector routine. Pointer validity is caller's
+ // responsibility.
+ //
+ // Note that we could call `self.avx2.find_raw` directly here. But that
+ // means we'd have to annotate this routine with `target_feature`.
+ // Which is fine, because this routine is `unsafe` anyway and the
+ // `target_feature` obligation is met by virtue of building a `Three`.
+ // The real problem is that a routine with a `target_feature`
+ // annotation generally can't be inlined into caller code unless
+ // the caller code has the same target feature annotations. Namely,
+ // the common case (at time of writing) is for calling code to not
+ // have the `avx2` target feature enabled *at compile time*. Without
+ // `target_feature` on this routine, it can be inlined which will
+ // handle some of the short-haystack cases above without touching the
+ // architecture specific code.
+ self.find_raw_avx2(start, end)
+ }
+
+ /// Like `rfind`, but accepts and returns raw pointers.
+ ///
+ /// When a match is found, the pointer returned is guaranteed to be
+ /// `>= start` and `< end`.
+ ///
+ /// This routine is useful if you're already using raw pointers and would
+ /// like to avoid converting back to a slice before executing a search.
+ ///
+ /// # Safety
+ ///
+ /// * Both `start` and `end` must be valid for reads.
+ /// * Both `start` and `end` must point to an initialized value.
+ /// * Both `start` and `end` must point to the same allocated object and
+ /// must either be in bounds or at most one byte past the end of the
+ /// allocated object.
+ /// * Both `start` and `end` must be _derived from_ a pointer to the same
+ /// object.
+ /// * The distance between `start` and `end` must not overflow `isize`.
+ /// * The distance being in bounds must not rely on "wrapping around" the
+ /// address space.
+ ///
+ /// Note that callers may pass a pair of pointers such that `start >= end`.
+ /// In that case, `None` will always be returned.
+ #[inline]
+ pub unsafe fn rfind_raw(
+ &self,
+ start: *const u8,
+ end: *const u8,
+ ) -> Option<*const u8> {
+ if start >= end { // empty or inverted range: nothing to search
+ return None;
+ }
+ let len = end.distance(start); // haystack length, compared against the vector sizes below
+ if len < __m256i::BYTES { // too short for an AVX2 vector
+ return if len < __m128i::BYTES { // too short even for an SSE2 vector
+ // SAFETY: We require the caller to pass valid start/end
+ // pointers.
+ generic::rev_byte_by_byte(start, end, |b| {
+ b == self.sse2.needle1()
+ || b == self.sse2.needle2()
+ || b == self.sse2.needle3()
+ })
+ } else {
+ // SAFETY: We require the caller to pass valid start/end
+ // pointers.
+ self.rfind_raw_sse2(start, end)
+ };
+ }
+ // SAFETY: Building a `Three` means it's safe to call both 'sse2' and
+ // 'avx2' routines. Also, we've checked that our haystack is big
+ // enough to run on the vector routine. Pointer validity is caller's
+ // responsibility.
+ //
+ // See the note in `find_raw` for why we don't just call
+ // `self.avx2.rfind_raw` directly here.
+ self.rfind_raw_avx2(start, end)
+ }
+
+ /// Execute a forward search using SSE2 vectors and routines.
+ ///
+ /// # Safety
+ ///
+ /// Same as [`Three::find_raw`], except the distance between `start` and
+ /// `end` must be at least the size of an SSE2 vector (in bytes).
+ ///
+ /// (The target feature safety obligation is automatically fulfilled by
+ /// virtue of being a method on `Three`, which can only be constructed
+ /// when it is safe to call `sse2`/`avx2` routines.)
+ #[target_feature(enable = "sse2")]
+ #[inline]
+ unsafe fn find_raw_sse2(
+ &self,
+ start: *const u8,
+ end: *const u8,
+ ) -> Option<*const u8> {
+ self.sse2.find_raw(start, end) // delegate to the `__m128i` generic searcher
+ }
+
+ /// Execute a reverse search using SSE2 vectors and routines.
+ ///
+ /// # Safety
+ ///
+ /// Same as [`Three::rfind_raw`], except the distance between `start` and
+ /// `end` must be at least the size of an SSE2 vector (in bytes).
+ ///
+ /// (The target feature safety obligation is automatically fulfilled by
+ /// virtue of being a method on `Three`, which can only be constructed
+ /// when it is safe to call `sse2`/`avx2` routines.)
+ #[target_feature(enable = "sse2")]
+ #[inline]
+ unsafe fn rfind_raw_sse2(
+ &self,
+ start: *const u8,
+ end: *const u8,
+ ) -> Option<*const u8> {
+ self.sse2.rfind_raw(start, end) // delegate to the `__m128i` generic searcher
+ }
+
+ /// Execute a forward search using AVX2 vectors and routines.
+ ///
+ /// # Safety
+ ///
+ /// Same as [`Three::find_raw`], except the distance between `start` and
+ /// `end` must be at least the size of an AVX2 vector (in bytes).
+ ///
+ /// (The target feature safety obligation is automatically fulfilled by
+ /// virtue of being a method on `Three`, which can only be constructed
+ /// when it is safe to call `sse2`/`avx2` routines.)
+ #[target_feature(enable = "avx2")]
+ #[inline]
+ unsafe fn find_raw_avx2(
+ &self,
+ start: *const u8,
+ end: *const u8,
+ ) -> Option<*const u8> {
+ self.avx2.find_raw(start, end) // delegate to the `__m256i` generic searcher
+ }
+
+ /// Execute a reverse search using AVX2 vectors and routines.
+ ///
+ /// # Safety
+ ///
+ /// Same as [`Three::rfind_raw`], except the distance between `start` and
+ /// `end` must be at least the size of an AVX2 vector (in bytes).
+ ///
+ /// (The target feature safety obligation is automatically fulfilled by
+ /// virtue of being a method on `Three`, which can only be constructed
+ /// when it is safe to call `sse2`/`avx2` routines.)
+ #[target_feature(enable = "avx2")]
+ #[inline]
+ unsafe fn rfind_raw_avx2(
+ &self,
+ start: *const u8,
+ end: *const u8,
+ ) -> Option<*const u8> {
+ self.avx2.rfind_raw(start, end) // delegate to the `__m256i` generic searcher
+ }
+
+ /// Returns an iterator over all occurrences of the needle bytes in the
+ /// given haystack. It borrows this searcher (`'a`) and the haystack (`'h`).
+ ///
+ /// The iterator returned implements `DoubleEndedIterator`. This means it
+ /// can also be used to find occurrences in reverse order.
+ #[inline]
+ pub fn iter<'a, 'h>(&'a self, haystack: &'h [u8]) -> ThreeIter<'a, 'h> {
+ ThreeIter { searcher: self, it: generic::Iter::new(haystack) }
+ }
+}
+
+/// An iterator over all occurrences of three possible bytes in a haystack.
+///
+/// This iterator implements `DoubleEndedIterator`, which means it can also be
+/// used to find occurrences in reverse order.
+///
+/// This iterator is created by the [`Three::iter`] method.
+///
+/// The lifetime parameters are as follows:
+///
+/// * `'a` refers to the lifetime of the underlying [`Three`] searcher.
+/// * `'h` refers to the lifetime of the haystack being searched.
+#[derive(Clone, Debug)]
+pub struct ThreeIter<'a, 'h> {
+ searcher: &'a Three, // searcher whose `find_raw`/`rfind_raw` locate each match
+ it: generic::Iter<'h>, // generic iteration state created from the haystack
+}
+
+impl<'a, 'h> Iterator for ThreeIter<'a, 'h> {
+ type Item = usize; // presumably a haystack offset, as with `Three::find` -- confirm in `generic::Iter`
+
+ #[inline]
+ fn next(&mut self) -> Option<usize> {
+ // SAFETY: We rely on the generic iterator to provide valid start
+ // and end pointers, but we guarantee that any pointer returned by
+ // 'find_raw' falls within the bounds of the start and end pointer.
+ unsafe { self.it.next(|s, e| self.searcher.find_raw(s, e)) }
+ }
+
+ #[inline]
+ fn size_hint(&self) -> (usize, Option<usize>) {
+ self.it.size_hint() // bounds come entirely from the generic iterator
+ }
+}
+
+impl<'a, 'h> DoubleEndedIterator for ThreeIter<'a, 'h> {
+ #[inline]
+ fn next_back(&mut self) -> Option<usize> {
+ // SAFETY: We rely on the generic iterator to provide valid start
+ // and end pointers, but we guarantee that any pointer returned by
+ // 'rfind_raw' falls within the bounds of the start and end pointer.
+ unsafe { self.it.next_back(|s, e| self.searcher.rfind_raw(s, e)) } // reverse counterpart of `next`
+ }
+}
+
+impl<'a, 'h> core::iter::FusedIterator for ThreeIter<'a, 'h> {} // NOTE(review): relies on `generic::Iter` continuing to yield `None` once exhausted -- confirm
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ define_memchr_quickcheck!(super); // NOTE(review): presumably generates quickcheck property tests for this module -- confirm
+
+ #[test]
+ fn forward_one() {
+ crate::tests::memchr::Runner::new(1).forward_iter(
+ |haystack, needles| {
+ Some(One::new(needles[0])?.iter(haystack).collect()) // `?` yields `None` when the searcher cannot be built
+ },
+ )
+ }
+
+ #[test]
+ fn reverse_one() {
+ crate::tests::memchr::Runner::new(1).reverse_iter(
+ |haystack, needles| {
+ Some(One::new(needles[0])?.iter(haystack).rev().collect()) // `.rev()` exercises `next_back`
+ },
+ )
+ }
+
+ #[test]
+ fn count_one() {
+ crate::tests::memchr::Runner::new(1).count_iter(|haystack, needles| {
+ Some(One::new(needles[0])?.iter(haystack).count())
+ })
+ }
+
+ #[test]
+ fn forward_two() {
+ crate::tests::memchr::Runner::new(2).forward_iter(
+ |haystack, needles| {
+ let n1 = needles.get(0).copied()?; // a missing needle makes the closure return `None`
+ let n2 = needles.get(1).copied()?;
+ Some(Two::new(n1, n2)?.iter(haystack).collect())
+ },
+ )
+ }
+
+ #[test]
+ fn reverse_two() {
+ crate::tests::memchr::Runner::new(2).reverse_iter(
+ |haystack, needles| {
+ let n1 = needles.get(0).copied()?;
+ let n2 = needles.get(1).copied()?;
+ Some(Two::new(n1, n2)?.iter(haystack).rev().collect())
+ },
+ )
+ }
+
+ #[test]
+ fn forward_three() {
+ crate::tests::memchr::Runner::new(3).forward_iter(
+ |haystack, needles| {
+ let n1 = needles.get(0).copied()?;
+ let n2 = needles.get(1).copied()?;
+ let n3 = needles.get(2).copied()?;
+ Some(Three::new(n1, n2, n3)?.iter(haystack).collect()) // `Three::new` is `None` without sse2/avx2
+ },
+ )
+ }
+
+ #[test]
+ fn reverse_three() {
+ crate::tests::memchr::Runner::new(3).reverse_iter(
+ |haystack, needles| {
+ let n1 = needles.get(0).copied()?;
+ let n2 = needles.get(1).copied()?;
+ let n3 = needles.get(2).copied()?;
+ Some(Three::new(n1, n2, n3)?.iter(haystack).rev().collect())
+ },
+ )
+ }
+}
+
1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 +95 +96 +97 +98 +99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 +247 +248 +249 +250 +251 +252 +253 +254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 +265 +266 +267 +268 +269 +270 +271 +272 +
/*!
+A 256-bit vector implementation of the "packed pair" SIMD algorithm.
+
+The "packed pair" algorithm is based on the [generic SIMD] algorithm. The main
+difference is that it (by default) uses a background distribution of byte
+frequencies to heuristically select the pair of bytes to search for.
+
+[generic SIMD]: http://0x80.pl/articles/simd-strfind.html#first-and-last
+*/
+
+use core::arch::x86_64::{__m128i, __m256i};
+
+use crate::arch::{all::packedpair::Pair, generic::packedpair};
+
+/// A "packed pair" finder that uses 256-bit vector operations.
+///
+/// This finder picks two bytes that it believes have high predictive power
+/// for indicating an overall match of a needle. Depending on whether
+/// `Finder::find` or `Finder::find_prefilter` is used, it reports offsets
+/// where the needle matches or could match. In the prefilter case, candidates
+/// are reported whenever the [`Pair`] of bytes given matches.
+#[derive(Clone, Copy, Debug)]
+pub struct Finder {
+ sse2: packedpair::Finder<__m128i>, // used when the haystack is shorter than the AVX2 minimum length
+ avx2: packedpair::Finder<__m256i>, // used for every other haystack
+}
+
+impl Finder {
+ /// Create a new pair searcher. The searcher returned can either report
+ /// exact matches of `needle` or act as a prefilter and report candidate
+ /// positions of `needle`.
+ ///
+ /// If AVX2 is unavailable in the current environment or if a [`Pair`]
+ /// could not be constructed from the needle given, then `None` is
+ /// returned.
+ #[inline]
+ pub fn new(needle: &[u8]) -> Option<Finder> {
+ Finder::with_pair(needle, Pair::new(needle)?) // `?` returns `None` early when no `Pair` can be built
+ }
+
+ /// Create a new "packed pair" finder using the pair of bytes given.
+ ///
+ /// This constructor permits callers to control precisely which pair of
+ /// bytes is used as a predicate.
+ ///
+ /// If SSE2 or AVX2 is unavailable in the current environment (see [`Finder::is_available`]), then `None` is
+ /// returned.
+ #[inline]
+ pub fn with_pair(needle: &[u8], pair: Pair) -> Option<Finder> {
+ if Finder::is_available() {
+ // SAFETY: we check that sse2/avx2 is available above. We are also
+ // guaranteed to have needle.len() > 1 because we have a valid
+ // Pair.
+ unsafe { Some(Finder::with_pair_impl(needle, pair)) }
+ } else {
+ None
+ }
+ }
+
+ /// Create a new `Finder` holding both the SSE2 and the AVX2 packed pair searchers.
+ ///
+ /// # Safety
+ ///
+ /// Same as the safety for `packedpair::Finder::new`, and callers must also
+ /// ensure that both SSE2 and AVX2 are available.
+ #[target_feature(enable = "sse2", enable = "avx2")]
+ #[inline]
+ unsafe fn with_pair_impl(needle: &[u8], pair: Pair) -> Finder {
+ let sse2 = packedpair::Finder::<__m128i>::new(needle, pair); // both searchers share the same `pair`
+ let avx2 = packedpair::Finder::<__m256i>::new(needle, pair);
+ Finder { sse2, avx2 }
+ }
+
+ /// Returns true when this implementation is available in the current
+ /// environment.
+ ///
+ /// When this is true, it is guaranteed that [`Finder::with_pair`] will
+ /// return a `Some` value. Similarly, when it is false, it is guaranteed
+ /// that `Finder::with_pair` will return a `None` value. Notice that this
+ /// does not guarantee that [`Finder::new`] will return a `Finder`. Namely,
+ /// even when `Finder::is_available` is true, it is not guaranteed that a
+ /// valid [`Pair`] can be found from the needle given.
+ ///
+ /// Note also that for the lifetime of a single program, if this returns
+ /// true then it will always return true.
+ #[inline]
+ pub fn is_available() -> bool {
+ #[cfg(not(target_feature = "sse2"))] // built without sse2: never available
+ {
+ false
+ }
+ #[cfg(target_feature = "sse2")]
+ {
+ #[cfg(target_feature = "avx2")] // avx2 enabled at compile time: no runtime check needed
+ {
+ true
+ }
+ #[cfg(not(target_feature = "avx2"))]
+ {
+ #[cfg(feature = "std")] // with `std`, ask the CPU at runtime
+ {
+ std::is_x86_feature_detected!("avx2")
+ }
+ #[cfg(not(feature = "std"))] // without `std`, no runtime detection is attempted here
+ {
+ false
+ }
+ }
+ }
+ }
+
+ /// Execute a search for `needle` in `haystack`, using AVX2 when the haystack is long enough and SSE2 otherwise.
+ ///
+ /// # Panics
+ ///
+ /// When `haystack.len()` is less than [`Finder::min_haystack_len`].
+ #[inline]
+ pub fn find(&self, haystack: &[u8], needle: &[u8]) -> Option<usize> {
+ // SAFETY: Building a `Finder` means it's safe to call 'sse2' and 'avx2' routines.
+ unsafe { self.find_impl(haystack, needle) }
+ }
+
+ /// Run this finder on the given haystack as a prefilter.
+ ///
+ /// If a candidate match is found, then an offset where the needle *could*
+ /// begin in the haystack is returned.
+ ///
+ /// # Panics
+ ///
+ /// When `haystack.len()` is less than [`Finder::min_haystack_len`].
+ #[inline]
+ pub fn find_prefilter(&self, haystack: &[u8]) -> Option<usize> {
+ // SAFETY: Building a `Finder` means it's safe to call 'sse2' and 'avx2' routines.
+ unsafe { self.find_prefilter_impl(haystack) }
+ }
+
+ /// Execute a search using AVX2 vectors and routines, falling back to SSE2 when the haystack is shorter than the AVX2 minimum.
+ ///
+ /// # Panics
+ ///
+ /// When `haystack.len()` is less than [`Finder::min_haystack_len`].
+ ///
+ /// # Safety
+ ///
+ /// (The target feature safety obligation is automatically fulfilled by
+ /// virtue of being a method on `Finder`, which can only be constructed
+ /// when it is safe to call `sse2` and `avx2` routines.)
+ #[target_feature(enable = "sse2", enable = "avx2")]
+ #[inline]
+ unsafe fn find_impl(
+ &self,
+ haystack: &[u8],
+ needle: &[u8],
+ ) -> Option<usize> {
+ if haystack.len() < self.avx2.min_haystack_len() { // too short for the AVX2 searcher
+ self.sse2.find(haystack, needle)
+ } else {
+ self.avx2.find(haystack, needle)
+ }
+ }
+
+ /// Execute a prefilter search using AVX2 vectors and routines, falling back to SSE2 when the haystack is shorter than the AVX2 minimum.
+ ///
+ /// # Panics
+ ///
+ /// When `haystack.len()` is less than [`Finder::min_haystack_len`].
+ ///
+ /// # Safety
+ ///
+ /// (The target feature safety obligation is automatically fulfilled by
+ /// virtue of being a method on `Finder`, which can only be constructed
+ /// when it is safe to call `sse2` and `avx2` routines.)
+ #[target_feature(enable = "sse2", enable = "avx2")]
+ #[inline]
+ unsafe fn find_prefilter_impl(&self, haystack: &[u8]) -> Option<usize> {
+ if haystack.len() < self.avx2.min_haystack_len() { // too short for the AVX2 searcher
+ self.sse2.find_prefilter(haystack)
+ } else {
+ self.avx2.find_prefilter(haystack)
+ }
+ }
+
+ /// Returns the pair of offsets (into the needle) used to check as a
+ /// predicate before confirming whether a needle exists at a particular
+ /// position.
+ #[inline]
+ pub fn pair(&self) -> &Pair {
+ self.avx2.pair() // both inner searchers are built from the same `Pair` in `with_pair_impl`
+ }
+
+ /// Returns the minimum haystack length that this `Finder` can search.
+ ///
+ /// Using a haystack with length smaller than this in [`Finder::find`] or [`Finder::find_prefilter`] will result
+ /// in a panic. The reason for this restriction is that this finder is
+ /// meant to be a low-level component that is part of a larger substring
+ /// strategy. In that sense, it avoids trying to handle all cases and
+ /// instead only handles the cases that it can handle very well.
+ #[inline]
+ pub fn min_haystack_len(&self) -> usize {
+ // The caller doesn't need to care about AVX2's min_haystack_len
+ // since this implementation will automatically switch to the SSE2
+ // implementation if the haystack is too short for AVX2. Therefore, the
+ // caller only needs to care about SSE2's min_haystack_len.
+ //
+ // This does assume that SSE2's min_haystack_len is less than or
+ // equal to AVX2's min_haystack_len. In practice, this is true and
+ // there is no way it could be false based on how this Finder is
+ // implemented. Namely, both SSE2 and AVX2 use the same `Pair`. If
+ // they used different pairs, then it's possible (although perhaps
+ // pathological) for SSE2's min_haystack_len to be bigger than AVX2's.
+ self.sse2.min_haystack_len()
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ fn find(haystack: &[u8], needle: &[u8]) -> Option<Option<usize>> { // outer `None`: this finder cannot run the case
+ let f = Finder::new(needle)?;
+ if haystack.len() < f.min_haystack_len() { // searching would panic, so skip instead
+ return None;
+ }
+ Some(f.find(haystack, needle))
+ }
+
+ define_substring_forward_quickcheck!(find); // NOTE(review): presumably generates quickcheck property tests around `find` -- confirm
+
+ #[test]
+ fn forward_substring() {
+ crate::tests::substring::Runner::new().fwd(find).run()
+ }
+
+ #[test]
+ fn forward_packedpair() {
+ fn find(
+ haystack: &[u8],
+ needle: &[u8],
+ index1: u8,
+ index2: u8,
+ ) -> Option<Option<usize>> {
+ let pair = Pair::with_indices(needle, index1, index2)?; // use the caller-chosen pair instead of the heuristic one
+ let f = Finder::with_pair(needle, pair)?;
+ if haystack.len() < f.min_haystack_len() {
+ return None;
+ }
+ Some(f.find(haystack, needle))
+ }
+ crate::tests::packedpair::Runner::new().fwd(find).run()
+ }
+
+ #[test]
+ fn forward_packedpair_prefilter() {
+ fn find(
+ haystack: &[u8],
+ needle: &[u8],
+ index1: u8,
+ index2: u8,
+ ) -> Option<Option<usize>> {
+ if !cfg!(target_feature = "sse2") { // NOTE(review): `forward_packedpair` has no such guard; `Finder::with_pair` already returns `None` without sse2
+ return None;
+ }
+ let pair = Pair::with_indices(needle, index1, index2)?;
+ let f = Finder::with_pair(needle, pair)?;
+ if haystack.len() < f.min_haystack_len() {
+ return None;
+ }
+ Some(f.find_prefilter(haystack)) // candidate offsets only; the needle is not confirmed here
+ }
+ crate::tests::packedpair::Runner::new().fwd(find).run()
+ }
+}
+
1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 +95 +96 +97 +98 +99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 +247 +248 +249 +250 +251 +252 +253 +254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 +265 +266 +267 +268 +269 +270 +271 +272 +273 +274 +275 +276 +277 +278 +279 +280 +281 +282 +283 +284 +285 +286 +287 +288 +289 +290 +291 +292 +293 +294 +295 +296 +297 +298 +299 +300 +301 +302 +303 +304 +305 +306 +307 +308 +309 +310 +311 +312 +313 +314 +315 +316 +317 +318 +319 +320 +321 +322 +323 +324 +325 +326 +327 +328 +329 +330 +331 +332 +333 +334 +335 +
/*!
+Wrapper routines for `memchr` and friends.
+
+These routines efficiently dispatch to the best implementation based on what
+the CPU supports.
+*/
+
+/// Provides a way to run a memchr-like function while amortizing the cost of
+/// runtime CPU feature detection.
+///
+/// This works by loading a function pointer from an atomic global. Initially,
+/// this global is set to a function that does CPU feature detection. For
+/// example, if AVX2 is enabled, then the AVX2 implementation is used.
+/// Otherwise, at least on x86_64, the SSE2 implementation is used. (And
+/// in some niche cases, if SSE2 isn't available, then the architecture
+/// independent fallback implementation is used.)
+///
+/// After the first call to this function, the atomic global is replaced with
+/// the specific AVX2, SSE2 or fallback routine chosen. Subsequent calls then
+/// will directly call the chosen routine instead of needing to go through the
+/// CPU feature detection branching again.
+///
+/// This particular macro is specifically written to provide the implementation
+/// of functions with the following signature:
+///
+/// ```ignore
+/// fn memchr(needle1: u8, start: *const u8, end: *const u8) -> Option<*const u8>;
+/// ```
+///
+/// Where you can also have `memchr2` and `memchr3`, but with `needle2` and
+/// `needle3`, respectively. The `start` and `end` parameters correspond to the
+/// start and end of the haystack, respectively.
+///
+/// We use raw pointers here instead of the more obvious `haystack: &[u8]` so
+/// that the function is compatible with our lower level iterator logic that
+/// operates on raw pointers. We use this macro to implement "raw" memchr
+/// routines with the signature above, and then define memchr routines using
+/// regular slices on top of them.
+///
+/// Note that we use `#[cfg(target_feature = "sse2")]` below even though
+/// it shouldn't be strictly necessary because without it, it seems to
+/// cause the compiler to blow up. I guess it can't handle a function
+/// pointer being created with a sse target feature? Dunno. See the
+/// `build-for-x86-64-but-non-sse-target` CI job if you want to experiment with
+/// this.
+///
+/// # Safety
+///
+/// Primarily callers must ensure that `$fnty` is a correct function pointer type and
+/// not something else.
+///
+/// Callers must also ensure that `$memchrty::$memchrfind` corresponds to a
+/// routine that returns a valid pointer when a match is found. That
+/// is, a pointer that is `>= start` and `< end`.
+///
+/// Callers must also ensure that the `$hay_start` and `$hay_end` identifiers
+/// correspond to valid pointers.
+macro_rules! unsafe_ifunc {
+ (
+ $memchrty:ident,
+ $memchrfind:ident,
+ $fnty:ty,
+ $retty:ty,
+ $hay_start:ident,
+ $hay_end:ident,
+ $($needle:ident),+
+ ) => {{
+ #![allow(unused_unsafe)]
+
+ use core::sync::atomic::{AtomicPtr, Ordering};
+
+ type Fn = *mut (); // type-erased form in which the chosen routine is stored
+ type RealFn = $fnty; // the caller-supplied function pointer type
+ static FN: AtomicPtr<()> = AtomicPtr::new(detect as Fn); // starts as `detect`; overwritten on the first call
+
+ #[cfg(target_feature = "sse2")]
+ #[target_feature(enable = "sse2", enable = "avx2")]
+ unsafe fn find_avx2(
+ $($needle: u8),+,
+ $hay_start: *const u8,
+ $hay_end: *const u8,
+ ) -> $retty {
+ use crate::arch::x86_64::avx2::memchr::$memchrty;
+ $memchrty::new_unchecked($($needle),+)
+ .$memchrfind($hay_start, $hay_end)
+ }
+
+ #[cfg(target_feature = "sse2")]
+ #[target_feature(enable = "sse2")]
+ unsafe fn find_sse2(
+ $($needle: u8),+,
+ $hay_start: *const u8,
+ $hay_end: *const u8,
+ ) -> $retty {
+ use crate::arch::x86_64::sse2::memchr::$memchrty;
+ $memchrty::new_unchecked($($needle),+)
+ .$memchrfind($hay_start, $hay_end)
+ }
+
+ unsafe fn find_fallback(
+ $($needle: u8),+,
+ $hay_start: *const u8,
+ $hay_end: *const u8,
+ ) -> $retty {
+ use crate::arch::all::memchr::$memchrty;
+ $memchrty::new($($needle),+).$memchrfind($hay_start, $hay_end)
+ }
+
+ unsafe fn detect(
+ $($needle: u8),+,
+ $hay_start: *const u8,
+ $hay_end: *const u8,
+ ) -> $retty {
+ let fun = {
+ #[cfg(not(target_feature = "sse2"))]
+ {
+ debug!(
+ "no sse2 feature available, using fallback for {}",
+ stringify!($memchrty),
+ );
+ find_fallback as RealFn
+ }
+ #[cfg(target_feature = "sse2")]
+ {
+ use crate::arch::x86_64::{sse2, avx2};
+ if avx2::memchr::$memchrty::is_available() { // prefer AVX2, then SSE2, then the fallback
+ debug!("chose AVX2 for {}", stringify!($memchrty));
+ find_avx2 as RealFn
+ } else if sse2::memchr::$memchrty::is_available() {
+ debug!("chose SSE2 for {}", stringify!($memchrty));
+ find_sse2 as RealFn
+ } else {
+ debug!("chose fallback for {}", stringify!($memchrty));
+ find_fallback as RealFn
+ }
+ }
+ };
+ FN.store(fun as Fn, Ordering::Relaxed); // cache the choice so later calls skip detection
+ // SAFETY: The only thing we need to uphold here is the
+ // `#[target_feature]` requirements. Since we check is_available
+ // above before using the corresponding implementation, we are
+ // guaranteed to only call code that is supported on the current
+ // CPU.
+ fun($($needle),+, $hay_start, $hay_end)
+ }
+
+ // SAFETY: By virtue of the caller contract, RealFn is a function
+ // pointer, which is always safe to transmute with a *mut (). Also,
+ // since we use $memchrty::is_available, it is guaranteed to be safe
+ // to call $memchrty::$memchrfind.
+ unsafe {
+ let fun = FN.load(Ordering::Relaxed); // either `detect` or the routine it cached
+ core::mem::transmute::<Fn, RealFn>(fun)(
+ $($needle),+,
+ $hay_start,
+ $hay_end,
+ )
+ }
+ }};
+}
+
+// The routines below dispatch to AVX2, SSE2 or a fallback routine based on
+// what's available in the current environment. The secret sauce here is that
+// we only check for which one to use approximately once, and then "cache" that
+// choice into a global function pointer. Subsequent invocations then just call
+// the appropriate function directly.
+
+/// memchr, but using raw pointers to represent the haystack.
+///
+/// # Safety
+///
+/// Pointers must be valid. See `One::find_raw`.
+#[inline(always)]
+pub(crate) fn memchr_raw(
+ n1: u8,
+ start: *const u8,
+ end: *const u8,
+) -> Option<*const u8> {
+ // SAFETY: We provide a valid function pointer type.
+ unsafe_ifunc!(
+ One,
+ find_raw,
+ unsafe fn(u8, *const u8, *const u8) -> Option<*const u8>,
+ Option<*const u8>,
+ start,
+ end,
+ n1
+ )
+}
+
+/// memrchr, but using raw pointers to represent the haystack.
+///
+/// # Safety
+///
+/// Pointers must be valid. See `One::rfind_raw`.
+#[inline(always)]
+pub(crate) fn memrchr_raw(
+ n1: u8,
+ start: *const u8,
+ end: *const u8,
+) -> Option<*const u8> {
+ // SAFETY: We provide a valid function pointer type.
+ unsafe_ifunc!(
+ One,
+ rfind_raw,
+ unsafe fn(u8, *const u8, *const u8) -> Option<*const u8>,
+ Option<*const u8>,
+ start,
+ end,
+ n1
+ )
+}
+
+/// memchr2, but using raw pointers to represent the haystack.
+///
+/// # Safety
+///
+/// Pointers must be valid. See `Two::find_raw`.
+#[inline(always)]
+pub(crate) fn memchr2_raw(
+ n1: u8,
+ n2: u8,
+ start: *const u8,
+ end: *const u8,
+) -> Option<*const u8> {
+ // SAFETY: We provide a valid function pointer type.
+ unsafe_ifunc!(
+ Two,
+ find_raw,
+ unsafe fn(u8, u8, *const u8, *const u8) -> Option<*const u8>,
+ Option<*const u8>,
+ start,
+ end,
+ n1,
+ n2
+ )
+}
+
+/// memrchr2, but using raw pointers to represent the haystack.
+///
+/// # Safety
+///
+/// Pointers must be valid. See `Two::rfind_raw`.
+#[inline(always)]
+pub(crate) fn memrchr2_raw(
+ n1: u8,
+ n2: u8,
+ start: *const u8,
+ end: *const u8,
+) -> Option<*const u8> {
+ // SAFETY: We provide a valid function pointer type.
+ unsafe_ifunc!(
+ Two,
+ rfind_raw,
+ unsafe fn(u8, u8, *const u8, *const u8) -> Option<*const u8>,
+ Option<*const u8>,
+ start,
+ end,
+ n1,
+ n2
+ )
+}
+
+/// memchr3, but using raw pointers to represent the haystack.
+///
+/// # Safety
+///
+/// Pointers must be valid. See `Three::find_raw`.
+#[inline(always)]
+pub(crate) fn memchr3_raw(
+ n1: u8,
+ n2: u8,
+ n3: u8,
+ start: *const u8,
+ end: *const u8,
+) -> Option<*const u8> {
+ // SAFETY: We provide a valid function pointer type.
+ unsafe_ifunc!(
+ Three,
+ find_raw,
+ unsafe fn(u8, u8, u8, *const u8, *const u8) -> Option<*const u8>,
+ Option<*const u8>,
+ start,
+ end,
+ n1,
+ n2,
+ n3
+ )
+}
+
+/// memrchr3, but using raw pointers to represent the haystack.
+///
+/// # Safety
+///
+/// Pointers must be valid. See `Three::rfind_raw`.
+#[inline(always)]
+pub(crate) fn memrchr3_raw(
+ n1: u8,
+ n2: u8,
+ n3: u8,
+ start: *const u8,
+ end: *const u8,
+) -> Option<*const u8> {
+ // SAFETY: We provide a valid function pointer type.
+ unsafe_ifunc!(
+ Three,
+ rfind_raw,
+ unsafe fn(u8, u8, u8, *const u8, *const u8) -> Option<*const u8>,
+ Option<*const u8>,
+ start,
+ end,
+ n1,
+ n2,
+ n3
+ )
+}
+
+/// Count all matching bytes, but using raw pointers to represent the haystack.
+///
+/// # Safety
+///
+/// Pointers must be valid. See `One::count_raw`.
+#[inline(always)]
+pub(crate) fn count_raw(n1: u8, start: *const u8, end: *const u8) -> usize {
+ // SAFETY: We provide a valid function pointer type.
+ unsafe_ifunc!(
+ One,
+ count_raw,
+ unsafe fn(u8, *const u8, *const u8) -> usize,
+ usize,
+ start,
+ end,
+ n1
+ )
+}
+
1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 +95 +96 +97 +98 +99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 +247 +248 +249 +250 +251 +252 +253 +254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 +265 +266 +267 +268 +269 +270 +271 +272 +273 +274 +275 +276 +277 +278 +279 +280 +281 +282 +283 +284 +285 +286 +287 +288 +289 +290 +291 +292 +293 +294 +295 +296 +297 +298 +299 +300 +301 +302 +303 +304 +305 +306 +307 +308 +309 +310 +311 +312 +313 +314 +315 +316 +317 +318 +319 +320 +321 +322 +323 +324 +325 +326 +327 +328 +329 +330 +331 +332 +333 +334 +335 +336 +337 +338 +339 +340 +341 +342 +343 +344 +345 +346 +347 +348 +349 +350 +351 +352 +353 +354 +355 +356 +357 +358 +359 +360 +361 +362 +363 +364 +365 +366 +367 +368 +369 +370 +371 +372 +373 +374 +375 +376 +377 +378 +379 +380 +381 +382 +383 +384 +385 +386 +387 +388 +389 +390 +391 +392 +393 +394 +395 +396 +397 +398 +399 +400 +401 +402 +403 +404 +405 +406 +407 +408 +409 +410 +411 +412 +413 +414 +415 +416 +417 +418 +419 +420 +421 
+422 +423 +424 +425 +426 +427 +428 +429 +430 +431 +432 +433 +434 +435 +436 +437 +438 +439 +440 +441 +442 +443 +444 +445 +446 +447 +448 +449 +450 +451 +452 +453 +454 +455 +456 +457 +458 +459 +460 +461 +462 +463 +464 +465 +466 +467 +468 +469 +470 +471 +472 +473 +474 +475 +476 +477 +478 +479 +480 +481 +482 +483 +484 +485 +486 +487 +488 +489 +490 +491 +492 +493 +494 +495 +496 +497 +498 +499 +500 +501 +502 +503 +504 +505 +506 +507 +508 +509 +510 +511 +512 +513 +514 +515 +516 +517 +518 +519 +520 +521 +522 +523 +524 +525 +526 +527 +528 +529 +530 +531 +532 +533 +534 +535 +536 +537 +538 +539 +540 +541 +542 +543 +544 +545 +546 +547 +548 +549 +550 +551 +552 +553 +554 +555 +556 +557 +558 +559 +560 +561 +562 +563 +564 +565 +566 +567 +568 +569 +570 +571 +572 +573 +574 +575 +576 +577 +578 +579 +580 +581 +582 +583 +584 +585 +586 +587 +588 +589 +590 +591 +592 +593 +594 +595 +596 +597 +598 +599 +600 +601 +602 +603 +604 +605 +606 +607 +608 +609 +610 +611 +612 +613 +614 +615 +616 +617 +618 +619 +620 +621 +622 +623 +624 +625 +626 +627 +628 +629 +630 +631 +632 +633 +634 +635 +636 +637 +638 +639 +640 +641 +642 +643 +644 +645 +646 +647 +648 +649 +650 +651 +652 +653 +654 +655 +656 +657 +658 +659 +660 +661 +662 +663 +664 +665 +666 +667 +668 +669 +670 +671 +672 +673 +674 +675 +676 +677 +678 +679 +680 +681 +682 +683 +684 +685 +686 +687 +688 +689 +690 +691 +692 +693 +694 +695 +696 +697 +698 +699 +700 +701 +702 +703 +704 +705 +706 +707 +708 +709 +710 +711 +712 +713 +714 +715 +716 +717 +718 +719 +720 +721 +722 +723 +724 +725 +726 +727 +728 +729 +730 +731 +732 +733 +734 +735 +736 +737 +738 +739 +740 +741 +742 +743 +744 +745 +746 +747 +748 +749 +750 +751 +752 +753 +754 +755 +756 +757 +758 +759 +760 +761 +762 +763 +764 +765 +766 +767 +768 +769 +770 +771 +772 +773 +774 +775 +776 +777 +778 +779 +780 +781 +782 +783 +784 +785 +786 +787 +788 +789 +790 +791 +792 +793 +794 +795 +796 +797 +798 +799 +800 +801 +802 +803 +804 +805 +806 +807 +808 +809 +810 +811 +812 +813 +814 +815 +816 +817 +818 +819 +820 +821 
+822 +823 +824 +825 +826 +827 +828 +829 +830 +831 +832 +833 +834 +835 +836 +837 +838 +839 +840 +841 +842 +843 +844 +845 +846 +847 +848 +849 +850 +851 +852 +853 +854 +855 +856 +857 +858 +859 +860 +861 +862 +863 +864 +865 +866 +867 +868 +869 +870 +871 +872 +873 +874 +875 +876 +877 +878 +879 +880 +881 +882 +883 +884 +885 +886 +887 +888 +889 +890 +891 +892 +893 +894 +895 +896 +897 +898 +899 +900 +901 +902 +903 +904 +905 +906 +907 +908 +909 +910 +911 +912 +913 +914 +915 +916 +917 +918 +919 +920 +921 +922 +923 +924 +925 +926 +927 +928 +929 +930 +931 +932 +933 +934 +935 +936 +937 +938 +939 +940 +941 +942 +943 +944 +945 +946 +947 +948 +949 +950 +951 +952 +953 +954 +955 +956 +957 +958 +959 +960 +961 +962 +963 +964 +965 +966 +967 +968 +969 +970 +971 +972 +973 +974 +975 +976 +977 +978 +979 +980 +981 +982 +983 +984 +985 +986 +987 +988 +989 +990 +991 +992 +993 +994 +995 +996 +997 +998 +999 +1000 +1001 +1002 +1003 +1004 +1005 +1006 +1007 +1008 +1009 +1010 +1011 +1012 +1013 +1014 +1015 +1016 +1017 +1018 +1019 +1020 +1021 +1022 +1023 +1024 +1025 +1026 +1027 +1028 +1029 +1030 +1031 +1032 +1033 +1034 +1035 +1036 +1037 +1038 +1039 +1040 +1041 +1042 +1043 +1044 +1045 +1046 +1047 +1048 +1049 +1050 +1051 +1052 +1053 +1054 +1055 +1056 +1057 +1058 +1059 +1060 +1061 +1062 +1063 +1064 +1065 +1066 +1067 +1068 +1069 +1070 +1071 +1072 +1073 +1074 +1075 +1076 +1077 +
/*!
+This module defines 128-bit vector implementations of `memchr` and friends.
+
+The main types in this module are [`One`], [`Two`] and [`Three`]. They are for
+searching for one, two or three distinct bytes, respectively, in a haystack.
+Each type also has corresponding double ended iterators. These searchers are
+typically much faster than scalar routines accomplishing the same task.
+
+The `One` searcher also provides a [`One::count`] routine for efficiently
+counting the number of times a single byte occurs in a haystack. This is
+useful, for example, for counting the number of lines in a haystack. This
+routine exists because it is usually faster, especially with a high match
+count, than using [`One::find`] repeatedly. ([`OneIter`] specializes its
+`Iterator::count` implementation to use this routine.)
+
+Only one, two and three bytes are supported because three bytes is about
+the point where one sees diminishing returns. Beyond this point, it's
+probably (but not necessarily) better to just use a simple `[bool; 256]` array
+or similar. However, it depends mightily on the specific work-load and the
+expected match frequency.
+*/
+
+use core::arch::x86_64::__m128i;
+
+use crate::{arch::generic::memchr as generic, ext::Pointer, vector::Vector};
+
+/// Finds all occurrences of a single byte in a haystack.
+#[derive(Clone, Copy, Debug)]
+pub struct One(generic::One<__m128i>);
+
+impl One {
+ /// Create a new searcher that finds occurrences of the needle byte given.
+ ///
+ /// This particular searcher is specialized to use SSE2 vector instructions
+ /// that typically make it quite fast.
+ ///
+ /// If SSE2 is unavailable in the current environment, then `None` is
+ /// returned.
+ #[inline]
+ pub fn new(needle: u8) -> Option<One> {
+ if One::is_available() {
+ // SAFETY: we check that sse2 is available above.
+ unsafe { Some(One::new_unchecked(needle)) }
+ } else {
+ None
+ }
+ }
+
+ /// Create a new finder specific to SSE2 vectors and routines without
+ /// checking that SSE2 is available.
+ ///
+ /// # Safety
+ ///
+ /// Callers must guarantee that it is safe to execute `sse2` instructions
+ /// in the current environment.
+ ///
+ /// Note that it is a common misconception that if one compiles for an
+ /// `x86_64` target, then they therefore automatically have access to SSE2
+ /// instructions. While this is almost always the case, it isn't true in
+ /// 100% of cases.
+ #[target_feature(enable = "sse2")]
+ #[inline]
+ pub unsafe fn new_unchecked(needle: u8) -> One {
+ One(generic::One::new(needle))
+ }
+
+ /// Returns true when this implementation is available in the current
+ /// environment.
+ ///
+ /// When this is true, it is guaranteed that [`One::new`] will return
+ /// a `Some` value. Similarly, when it is false, it is guaranteed that
+ /// `One::new` will return a `None` value.
+ ///
+ /// Note also that for the lifetime of a single program, if this returns
+ /// true then it will always return true.
+ #[inline]
+ pub fn is_available() -> bool {
+ #[cfg(target_feature = "sse2")]
+ {
+ true
+ }
+ #[cfg(not(target_feature = "sse2"))]
+ {
+ false
+ }
+ }
+
+ /// Return the first occurrence of the needle byte in the given haystack.
+ /// If no such occurrence exists, then `None` is returned.
+ ///
+ /// The occurrence is reported as an offset into `haystack`. Its maximum
+ /// value is `haystack.len() - 1`.
+ #[inline]
+ pub fn find(&self, haystack: &[u8]) -> Option<usize> {
+ // SAFETY: `find_raw` guarantees that if a pointer is returned, it
+ // falls within the bounds of the start and end pointers.
+ unsafe {
+ generic::search_slice_with_raw(haystack, |s, e| {
+ self.find_raw(s, e)
+ })
+ }
+ }
+
+ /// Return the last occurrence of the needle byte in the given haystack.
+ /// If no such occurrence exists, then `None` is returned.
+ ///
+ /// The occurrence is reported as an offset into `haystack`. Its maximum
+ /// value is `haystack.len() - 1`.
+ #[inline]
+ pub fn rfind(&self, haystack: &[u8]) -> Option<usize> {
+ // SAFETY: `rfind_raw` guarantees that if a pointer is returned, it
+ // falls within the bounds of the start and end pointers.
+ unsafe {
+ generic::search_slice_with_raw(haystack, |s, e| {
+ self.rfind_raw(s, e)
+ })
+ }
+ }
+
+ /// Counts all occurrences of this byte in the given haystack.
+ #[inline]
+ pub fn count(&self, haystack: &[u8]) -> usize {
+ // SAFETY: All of our pointers are derived directly from a borrowed
+ // slice, which is guaranteed to be valid.
+ unsafe {
+ let start = haystack.as_ptr();
+ let end = start.add(haystack.len());
+ self.count_raw(start, end)
+ }
+ }
+
+ /// Like `find`, but accepts and returns raw pointers.
+ ///
+ /// When a match is found, the pointer returned is guaranteed to be
+ /// `>= start` and `< end`.
+ ///
+ /// This routine is useful if you're already using raw pointers and would
+ /// like to avoid converting back to a slice before executing a search.
+ ///
+ /// # Safety
+ ///
+ /// * Both `start` and `end` must be valid for reads.
+ /// * Both `start` and `end` must point to an initialized value.
+ /// * Both `start` and `end` must point to the same allocated object and
+ /// must either be in bounds or at most one byte past the end of the
+ /// allocated object.
+ /// * Both `start` and `end` must be _derived from_ a pointer to the same
+ /// object.
+ /// * The distance between `start` and `end` must not overflow `isize`.
+ /// * The distance being in bounds must not rely on "wrapping around" the
+ /// address space.
+ ///
+ /// Note that callers may pass a pair of pointers such that `start >= end`.
+ /// In that case, `None` will always be returned.
+ #[inline]
+ pub unsafe fn find_raw(
+ &self,
+ start: *const u8,
+ end: *const u8,
+ ) -> Option<*const u8> {
+ if start >= end {
+ return None;
+ }
+ if end.distance(start) < __m128i::BYTES {
+ // SAFETY: We require the caller to pass valid start/end pointers.
+ return generic::fwd_byte_by_byte(start, end, |b| {
+ b == self.0.needle1()
+ });
+ }
+ // SAFETY: Building a `One` means it's safe to call 'sse2' routines.
+ // Also, we've checked that our haystack is big enough to run on the
+ // vector routine. Pointer validity is caller's responsibility.
+ //
+ // Note that we could call `self.0.find_raw` directly here. But that
+ // means we'd have to annotate this routine with `target_feature`.
+ // Which is fine, because this routine is `unsafe` anyway and the
+ // `target_feature` obligation is met by virtue of building a `One`.
+ // The real problem is that a routine with a `target_feature`
+ // annotation generally can't be inlined into caller code unless the
+ // caller code has the same target feature annotations. Which is maybe
+ // okay for SSE2, but we do the same thing for AVX2 where caller code
+ // probably usually doesn't have AVX2 enabled. Keeping the annotation off
+ // this routine means it can be inlined, which lets some short-haystack
+ // cases above be handled without touching architecture specific code.
+ self.find_raw_impl(start, end)
+ }
+
+ /// Like `rfind`, but accepts and returns raw pointers.
+ ///
+ /// When a match is found, the pointer returned is guaranteed to be
+ /// `>= start` and `< end`.
+ ///
+ /// This routine is useful if you're already using raw pointers and would
+ /// like to avoid converting back to a slice before executing a search.
+ ///
+ /// # Safety
+ ///
+ /// * Both `start` and `end` must be valid for reads.
+ /// * Both `start` and `end` must point to an initialized value.
+ /// * Both `start` and `end` must point to the same allocated object and
+ /// must either be in bounds or at most one byte past the end of the
+ /// allocated object.
+ /// * Both `start` and `end` must be _derived from_ a pointer to the same
+ /// object.
+ /// * The distance between `start` and `end` must not overflow `isize`.
+ /// * The distance being in bounds must not rely on "wrapping around" the
+ /// address space.
+ ///
+ /// Note that callers may pass a pair of pointers such that `start >= end`.
+ /// In that case, `None` will always be returned.
+ #[inline]
+ pub unsafe fn rfind_raw(
+ &self,
+ start: *const u8,
+ end: *const u8,
+ ) -> Option<*const u8> {
+ if start >= end {
+ return None;
+ }
+ if end.distance(start) < __m128i::BYTES {
+ // SAFETY: We require the caller to pass valid start/end pointers.
+ return generic::rev_byte_by_byte(start, end, |b| {
+ b == self.0.needle1()
+ });
+ }
+ // SAFETY: Building a `One` means it's safe to call 'sse2' routines.
+ // Also, we've checked that our haystack is big enough to run on the
+ // vector routine. Pointer validity is caller's responsibility.
+ //
+ // See note in forward routine above for why we don't just call
+ // `self.0.rfind_raw` directly here.
+ self.rfind_raw_impl(start, end)
+ }
+
+ /// Counts all occurrences of this byte in the given haystack represented
+ /// by raw pointers.
+ ///
+ /// This routine is useful if you're already using raw pointers and would
+ /// like to avoid converting back to a slice before executing a search.
+ ///
+ /// # Safety
+ ///
+ /// * Both `start` and `end` must be valid for reads.
+ /// * Both `start` and `end` must point to an initialized value.
+ /// * Both `start` and `end` must point to the same allocated object and
+ /// must either be in bounds or at most one byte past the end of the
+ /// allocated object.
+ /// * Both `start` and `end` must be _derived from_ a pointer to the same
+ /// object.
+ /// * The distance between `start` and `end` must not overflow `isize`.
+ /// * The distance being in bounds must not rely on "wrapping around" the
+ /// address space.
+ ///
+ /// Note that callers may pass a pair of pointers such that `start >= end`.
+ /// In that case, `0` will always be returned.
+ #[inline]
+ pub unsafe fn count_raw(&self, start: *const u8, end: *const u8) -> usize {
+ if start >= end {
+ return 0;
+ }
+ if end.distance(start) < __m128i::BYTES {
+ // SAFETY: We require the caller to pass valid start/end pointers.
+ return generic::count_byte_by_byte(start, end, |b| {
+ b == self.0.needle1()
+ });
+ }
+ // SAFETY: Building a `One` means it's safe to call 'sse2' routines.
+ // Also, we've checked that our haystack is big enough to run on the
+ // vector routine. Pointer validity is caller's responsibility.
+ self.count_raw_impl(start, end)
+ }
+
+ /// Execute a search using SSE2 vectors and routines.
+ ///
+ /// # Safety
+ ///
+ /// Same as [`One::find_raw`], except the distance between `start` and
+ /// `end` must be at least the size of an SSE2 vector (in bytes).
+ ///
+ /// (The target feature safety obligation is automatically fulfilled by
+ /// virtue of being a method on `One`, which can only be constructed
+ /// when it is safe to call `sse2` routines.)
+ #[target_feature(enable = "sse2")]
+ #[inline]
+ unsafe fn find_raw_impl(
+ &self,
+ start: *const u8,
+ end: *const u8,
+ ) -> Option<*const u8> {
+ self.0.find_raw(start, end)
+ }
+
+ /// Execute a search using SSE2 vectors and routines.
+ ///
+ /// # Safety
+ ///
+ /// Same as [`One::rfind_raw`], except the distance between `start` and
+ /// `end` must be at least the size of an SSE2 vector (in bytes).
+ ///
+ /// (The target feature safety obligation is automatically fulfilled by
+ /// virtue of being a method on `One`, which can only be constructed
+ /// when it is safe to call `sse2` routines.)
+ #[target_feature(enable = "sse2")]
+ #[inline]
+ unsafe fn rfind_raw_impl(
+ &self,
+ start: *const u8,
+ end: *const u8,
+ ) -> Option<*const u8> {
+ self.0.rfind_raw(start, end)
+ }
+
+ /// Execute a count using SSE2 vectors and routines.
+ ///
+ /// # Safety
+ ///
+ /// Same as [`One::count_raw`], except the distance between `start` and
+ /// `end` must be at least the size of an SSE2 vector (in bytes).
+ ///
+ /// (The target feature safety obligation is automatically fulfilled by
+ /// virtue of being a method on `One`, which can only be constructed
+ /// when it is safe to call `sse2` routines.)
+ #[target_feature(enable = "sse2")]
+ #[inline]
+ unsafe fn count_raw_impl(
+ &self,
+ start: *const u8,
+ end: *const u8,
+ ) -> usize {
+ self.0.count_raw(start, end)
+ }
+
+ /// Returns an iterator over all occurrences of the needle byte in the
+ /// given haystack.
+ ///
+ /// The iterator returned implements `DoubleEndedIterator`. This means it
+ /// can also be used to find occurrences in reverse order.
+ #[inline]
+ pub fn iter<'a, 'h>(&'a self, haystack: &'h [u8]) -> OneIter<'a, 'h> {
+ OneIter { searcher: self, it: generic::Iter::new(haystack) }
+ }
+}
+
+/// An iterator over all occurrences of a single byte in a haystack.
+///
+/// This iterator implements `DoubleEndedIterator`, which means it can also be
+/// used to find occurrences in reverse order.
+///
+/// This iterator is created by the [`One::iter`] method.
+///
+/// The lifetime parameters are as follows:
+///
+/// * `'a` refers to the lifetime of the underlying [`One`] searcher.
+/// * `'h` refers to the lifetime of the haystack being searched.
+#[derive(Clone, Debug)]
+pub struct OneIter<'a, 'h> {
+ searcher: &'a One,
+ it: generic::Iter<'h>,
+}
+
+impl<'a, 'h> Iterator for OneIter<'a, 'h> {
+ type Item = usize;
+
+ #[inline]
+ fn next(&mut self) -> Option<usize> {
+ // SAFETY: We rely on the generic iterator to provide valid start
+ // and end pointers, but we guarantee that any pointer returned by
+ // 'find_raw' falls within the bounds of the start and end pointer.
+ unsafe { self.it.next(|s, e| self.searcher.find_raw(s, e)) }
+ }
+
+ #[inline]
+ fn count(self) -> usize {
+ self.it.count(|s, e| {
+ // SAFETY: We rely on our generic iterator to return valid start
+ // and end pointers.
+ unsafe { self.searcher.count_raw(s, e) }
+ })
+ }
+
+ #[inline]
+ fn size_hint(&self) -> (usize, Option<usize>) {
+ self.it.size_hint()
+ }
+}
+
+impl<'a, 'h> DoubleEndedIterator for OneIter<'a, 'h> {
+ #[inline]
+ fn next_back(&mut self) -> Option<usize> {
+ // SAFETY: We rely on the generic iterator to provide valid start
+ // and end pointers, but we guarantee that any pointer returned by
+ // 'rfind_raw' falls within the bounds of the start and end pointer.
+ unsafe { self.it.next_back(|s, e| self.searcher.rfind_raw(s, e)) }
+ }
+}
+
+impl<'a, 'h> core::iter::FusedIterator for OneIter<'a, 'h> {}
+
+/// Finds all occurrences of two bytes in a haystack.
+///
+/// That is, this reports matches of one of two possible bytes. For example,
+/// searching for `a` or `b` in `afoobar` would report matches at offsets `0`,
+/// `4` and `5`.
+#[derive(Clone, Copy, Debug)]
+pub struct Two(generic::Two<__m128i>);
+
+impl Two {
+ /// Create a new searcher that finds occurrences of the needle bytes given.
+ ///
+ /// This particular searcher is specialized to use SSE2 vector instructions
+ /// that typically make it quite fast.
+ ///
+ /// If SSE2 is unavailable in the current environment, then `None` is
+ /// returned.
+ #[inline]
+ pub fn new(needle1: u8, needle2: u8) -> Option<Two> {
+ if Two::is_available() {
+ // SAFETY: we check that sse2 is available above.
+ unsafe { Some(Two::new_unchecked(needle1, needle2)) }
+ } else {
+ None
+ }
+ }
+
+ /// Create a new finder specific to SSE2 vectors and routines without
+ /// checking that SSE2 is available.
+ ///
+ /// # Safety
+ ///
+ /// Callers must guarantee that it is safe to execute `sse2` instructions
+ /// in the current environment.
+ ///
+ /// Note that it is a common misconception that if one compiles for an
+ /// `x86_64` target, then they therefore automatically have access to SSE2
+ /// instructions. While this is almost always the case, it isn't true in
+ /// 100% of cases.
+ #[target_feature(enable = "sse2")]
+ #[inline]
+ pub unsafe fn new_unchecked(needle1: u8, needle2: u8) -> Two {
+ Two(generic::Two::new(needle1, needle2))
+ }
+
+ /// Returns true when this implementation is available in the current
+ /// environment.
+ ///
+ /// When this is true, it is guaranteed that [`Two::new`] will return
+ /// a `Some` value. Similarly, when it is false, it is guaranteed that
+ /// `Two::new` will return a `None` value.
+ ///
+ /// Note also that for the lifetime of a single program, if this returns
+ /// true then it will always return true.
+ #[inline]
+ pub fn is_available() -> bool {
+ #[cfg(target_feature = "sse2")]
+ {
+ true
+ }
+ #[cfg(not(target_feature = "sse2"))]
+ {
+ false
+ }
+ }
+
+ /// Return the first occurrence of one of the needle bytes in the given
+ /// haystack. If no such occurrence exists, then `None` is returned.
+ ///
+ /// The occurrence is reported as an offset into `haystack`. Its maximum
+ /// value is `haystack.len() - 1`.
+ #[inline]
+ pub fn find(&self, haystack: &[u8]) -> Option<usize> {
+ // SAFETY: `find_raw` guarantees that if a pointer is returned, it
+ // falls within the bounds of the start and end pointers.
+ unsafe {
+ generic::search_slice_with_raw(haystack, |s, e| {
+ self.find_raw(s, e)
+ })
+ }
+ }
+
+ /// Return the last occurrence of one of the needle bytes in the given
+ /// haystack. If no such occurrence exists, then `None` is returned.
+ ///
+ /// The occurrence is reported as an offset into `haystack`. Its maximum
+ /// value is `haystack.len() - 1`.
+ #[inline]
+ pub fn rfind(&self, haystack: &[u8]) -> Option<usize> {
+ // SAFETY: `rfind_raw` guarantees that if a pointer is returned, it
+ // falls within the bounds of the start and end pointers.
+ unsafe {
+ generic::search_slice_with_raw(haystack, |s, e| {
+ self.rfind_raw(s, e)
+ })
+ }
+ }
+
+ /// Like `find`, but accepts and returns raw pointers.
+ ///
+ /// When a match is found, the pointer returned is guaranteed to be
+ /// `>= start` and `< end`.
+ ///
+ /// This routine is useful if you're already using raw pointers and would
+ /// like to avoid converting back to a slice before executing a search.
+ ///
+ /// # Safety
+ ///
+ /// * Both `start` and `end` must be valid for reads.
+ /// * Both `start` and `end` must point to an initialized value.
+ /// * Both `start` and `end` must point to the same allocated object and
+ /// must either be in bounds or at most one byte past the end of the
+ /// allocated object.
+ /// * Both `start` and `end` must be _derived from_ a pointer to the same
+ /// object.
+ /// * The distance between `start` and `end` must not overflow `isize`.
+ /// * The distance being in bounds must not rely on "wrapping around" the
+ /// address space.
+ ///
+ /// Note that callers may pass a pair of pointers such that `start >= end`.
+ /// In that case, `None` will always be returned.
+ #[inline]
+ pub unsafe fn find_raw(
+ &self,
+ start: *const u8,
+ end: *const u8,
+ ) -> Option<*const u8> {
+ if start >= end {
+ return None;
+ }
+ if end.distance(start) < __m128i::BYTES {
+ // SAFETY: We require the caller to pass valid start/end pointers.
+ return generic::fwd_byte_by_byte(start, end, |b| {
+ b == self.0.needle1() || b == self.0.needle2()
+ });
+ }
+ // SAFETY: Building a `Two` means it's safe to call 'sse2' routines.
+ // Also, we've checked that our haystack is big enough to run on the
+ // vector routine. Pointer validity is caller's responsibility.
+ //
+ // Note that we could call `self.0.find_raw` directly here. But that
+ // means we'd have to annotate this routine with `target_feature`.
+ // Which is fine, because this routine is `unsafe` anyway and the
+ // `target_feature` obligation is met by virtue of building a `Two`.
+ // The real problem is that a routine with a `target_feature`
+ // annotation generally can't be inlined into caller code unless the
+ // caller code has the same target feature annotations. Which is maybe
+ // okay for SSE2, but we do the same thing for AVX2 where caller code
+ // probably usually doesn't have AVX2 enabled. Keeping the annotation off
+ // this routine means it can be inlined, which lets some short-haystack
+ // cases above be handled without touching architecture specific code.
+ self.find_raw_impl(start, end)
+ }
+
+ /// Like `rfind`, but accepts and returns raw pointers.
+ ///
+ /// When a match is found, the pointer returned is guaranteed to be
+ /// `>= start` and `< end`.
+ ///
+ /// This routine is useful if you're already using raw pointers and would
+ /// like to avoid converting back to a slice before executing a search.
+ ///
+ /// # Safety
+ ///
+ /// * Both `start` and `end` must be valid for reads.
+ /// * Both `start` and `end` must point to an initialized value.
+ /// * Both `start` and `end` must point to the same allocated object and
+ /// must either be in bounds or at most one byte past the end of the
+ /// allocated object.
+ /// * Both `start` and `end` must be _derived from_ a pointer to the same
+ /// object.
+ /// * The distance between `start` and `end` must not overflow `isize`.
+ /// * The distance being in bounds must not rely on "wrapping around" the
+ /// address space.
+ ///
+ /// Note that callers may pass a pair of pointers such that `start >= end`.
+ /// In that case, `None` will always be returned.
+ #[inline]
+ pub unsafe fn rfind_raw(
+ &self,
+ start: *const u8,
+ end: *const u8,
+ ) -> Option<*const u8> {
+ if start >= end {
+ return None;
+ }
+ if end.distance(start) < __m128i::BYTES {
+ // SAFETY: We require the caller to pass valid start/end pointers.
+ return generic::rev_byte_by_byte(start, end, |b| {
+ b == self.0.needle1() || b == self.0.needle2()
+ });
+ }
+ // SAFETY: Building a `Two` means it's safe to call 'sse2' routines.
+ // Also, we've checked that our haystack is big enough to run on the
+ // vector routine. Pointer validity is caller's responsibility.
+ //
+ // See note in forward routine above for why we don't just call
+ // `self.0.rfind_raw` directly here.
+ self.rfind_raw_impl(start, end)
+ }
+
+ /// Execute a search using SSE2 vectors and routines.
+ ///
+ /// # Safety
+ ///
+ /// Same as [`Two::find_raw`], except the distance between `start` and
+ /// `end` must be at least the size of an SSE2 vector (in bytes).
+ ///
+ /// (The target feature safety obligation is automatically fulfilled by
+ /// virtue of being a method on `Two`, which can only be constructed
+ /// when it is safe to call `sse2` routines.)
+ #[target_feature(enable = "sse2")]
+ #[inline]
+ unsafe fn find_raw_impl(
+ &self,
+ start: *const u8,
+ end: *const u8,
+ ) -> Option<*const u8> {
+ self.0.find_raw(start, end)
+ }
+
+ /// Run the reverse search on the SSE2 vector implementation.
+ ///
+ /// # Safety
+ ///
+ /// The obligations of [`Two::rfind_raw`] apply. In addition, `start` and
+ /// `end` must be at least one SSE2 vector (in bytes) apart.
+ ///
+ /// (No separate target feature proof is needed from the caller: a `Two`
+ /// can only be constructed when it is safe to call `sse2` routines, so
+ /// being a method on `Two` already fulfills that obligation.)
+ #[target_feature(enable = "sse2")]
+ #[inline]
+ unsafe fn rfind_raw_impl(
+ &self,
+ start: *const u8,
+ end: *const u8,
+ ) -> Option<*const u8> {
+ let Two(inner) = self;
+ inner.rfind_raw(start, end)
+ }
+
+ /// Builds an iterator over every position in `haystack` that holds one
+ /// of the needle bytes.
+ ///
+ /// The returned iterator is a `DoubleEndedIterator`, so it can also be
+ /// driven from the back to report occurrences in reverse order.
+ #[inline]
+ pub fn iter<'a, 'h>(&'a self, haystack: &'h [u8]) -> TwoIter<'a, 'h> {
+ let it = generic::Iter::new(haystack);
+ TwoIter { searcher: self, it }
+ }
+}
+
+/// Iterates over every offset in a haystack at which one of two bytes occurs.
+///
+/// `DoubleEndedIterator` is implemented, so occurrences can be pulled from
+/// the back as well as from the front.
+///
+/// Values of this type are produced by [`Two::iter`].
+///
+/// Lifetime parameters:
+///
+/// * `'a` is the lifetime of the borrowed [`Two`] searcher.
+/// * `'h` is the lifetime of the haystack being searched.
+#[derive(Clone, Debug)]
+pub struct TwoIter<'a, 'h> {
+ searcher: &'a Two,
+ it: generic::Iter<'h>,
+}
+
+impl<'a, 'h> Iterator for TwoIter<'a, 'h> {
+ type Item = usize;
+
+ #[inline]
+ fn next(&mut self) -> Option<Self::Item> {
+ let searcher = self.searcher;
+ // SAFETY: The generic iterator is relied upon to hand us valid start
+ // and end pointers, and 'find_raw' guarantees that any pointer it
+ // returns lies within the bounds of those two pointers.
+ unsafe { self.it.next(|start, end| searcher.find_raw(start, end)) }
+ }
+
+ #[inline]
+ fn size_hint(&self) -> (usize, Option<usize>) {
+ self.it.size_hint()
+ }
+}
+
+impl<'a, 'h> DoubleEndedIterator for TwoIter<'a, 'h> {
+ #[inline]
+ fn next_back(&mut self) -> Option<Self::Item> {
+ let searcher = self.searcher;
+ // SAFETY: The generic iterator is relied upon to hand us valid start
+ // and end pointers, and 'rfind_raw' guarantees that any pointer it
+ // returns lies within the bounds of those two pointers.
+ unsafe {
+ self.it.next_back(|start, end| searcher.rfind_raw(start, end))
+ }
+ }
+}
+
+impl<'a, 'h> core::iter::FusedIterator for TwoIter<'a, 'h> {}
+
+/// Finds all occurrences of three bytes in a haystack.
+///
+/// That is, this reports matches of one of three possible bytes. For example,
+/// searching for `a`, `b` or `o` in `afoobar` would report matches at offsets
+/// `0`, `2`, `3`, `4` and `5`.
+///
+/// This is a thin wrapper around the generic three-byte searcher
+/// instantiated with the `__m128i` vector type.
+#[derive(Clone, Copy, Debug)]
+pub struct Three(generic::Three<__m128i>);
+
+impl Three {
+ /// Construct a searcher that reports occurrences of any of the three
+ /// needle bytes given.
+ ///
+ /// This searcher is specialized to SSE2 vector instructions, which
+ /// typically makes it quite fast.
+ ///
+ /// Returns `None` when SSE2 is unavailable in the current environment.
+ #[inline]
+ pub fn new(needle1: u8, needle2: u8, needle3: u8) -> Option<Three> {
+ if !Three::is_available() {
+ return None;
+ }
+ // SAFETY: we checked just above that sse2 is available.
+ let searcher =
+ unsafe { Three::new_unchecked(needle1, needle2, needle3) };
+ Some(searcher)
+ }
+
+ /// Construct a searcher for SSE2 vectors and routines, skipping the check
+ /// that SSE2 is actually available.
+ ///
+ /// # Safety
+ ///
+ /// The caller must guarantee that executing `sse2` instructions is safe
+ /// in the current environment.
+ ///
+ /// A common misconception is that compiling for an `x86_64` target
+ /// automatically implies access to SSE2 instructions. That is almost
+ /// always the case, but it is not true in 100% of cases.
+ #[target_feature(enable = "sse2")]
+ #[inline]
+ pub unsafe fn new_unchecked(
+ needle1: u8,
+ needle2: u8,
+ needle3: u8,
+ ) -> Three {
+ let inner = generic::Three::new(needle1, needle2, needle3);
+ Three(inner)
+ }
+
+ /// Reports whether this implementation can be used in the current
+ /// environment.
+ ///
+ /// A `true` result guarantees that [`Three::new`] returns a `Some` value,
+ /// and a `false` result guarantees that `Three::new` returns `None`.
+ ///
+ /// Within the lifetime of a single program, once this returns true it
+ /// always returns true.
+ #[inline]
+ pub fn is_available() -> bool {
+ // Decided at compile time from the enabled target features.
+ cfg!(target_feature = "sse2")
+ }
+
+ /// Find the first position in `haystack` that holds one of the needle
+ /// bytes, or `None` if there is no such position.
+ ///
+ /// The result is an offset into `haystack`; it is never larger than
+ /// `haystack.len() - 1`.
+ #[inline]
+ pub fn find(&self, haystack: &[u8]) -> Option<usize> {
+ // SAFETY: When `find_raw` returns a pointer, it guarantees the
+ // pointer lies within the bounds of the start and end pointers.
+ unsafe {
+ generic::search_slice_with_raw(haystack, |start, end| {
+ self.find_raw(start, end)
+ })
+ }
+ }
+
+ /// Find the last position in `haystack` that holds one of the needle
+ /// bytes, or `None` if there is no such position.
+ ///
+ /// The result is an offset into `haystack`; it is never larger than
+ /// `haystack.len() - 1`.
+ #[inline]
+ pub fn rfind(&self, haystack: &[u8]) -> Option<usize> {
+ // SAFETY: When `rfind_raw` returns a pointer, it guarantees the
+ // pointer lies within the bounds of the start and end pointers.
+ unsafe {
+ generic::search_slice_with_raw(haystack, |start, end| {
+ self.rfind_raw(start, end)
+ })
+ }
+ }
+
+ /// Raw-pointer counterpart of `find`.
+ ///
+ /// Any pointer returned for a match is guaranteed to be `>= start` and
+ /// `< end`.
+ ///
+ /// Use this when you are already working with raw pointers and want to
+ /// avoid converting back to a slice just to run a search.
+ ///
+ /// # Safety
+ ///
+ /// * Both `start` and `end` must be valid for reads.
+ /// * Both `start` and `end` must point to an initialized value.
+ /// * Both `start` and `end` must point to the same allocated object and
+ /// must either be in bounds or at most one byte past the end of the
+ /// allocated object.
+ /// * Both `start` and `end` must be _derived from_ a pointer to the same
+ /// object.
+ /// * The distance between `start` and `end` must not overflow `isize`.
+ /// * The distance being in bounds must not rely on "wrapping around" the
+ /// address space.
+ ///
+ /// Passing pointers with `start >= end` is permitted; `None` is always
+ /// returned in that case.
+ #[inline]
+ pub unsafe fn find_raw(
+ &self,
+ start: *const u8,
+ end: *const u8,
+ ) -> Option<*const u8> {
+ if start >= end {
+ return None;
+ }
+ let len = end.distance(start);
+ if len >= __m128i::BYTES {
+ // SAFETY: Building a `Three` means it's safe to call 'sse2'
+ // routines, and the check above shows the haystack is big enough
+ // for the vector routine. Pointer validity is the caller's
+ // responsibility.
+ //
+ // `self.0.find_raw` could be called directly, but then this
+ // routine would need a `target_feature` annotation. That would be
+ // sound (this routine is `unsafe` and building a `Three` meets
+ // the obligation), but a routine with a `target_feature`
+ // annotation generally can't be inlined into callers lacking the
+ // same annotation. That is maybe okay for SSE2, but the same
+ // pattern is used for AVX2, where callers usually don't have AVX2
+ // enabled. Going through `find_raw_impl` keeps this routine
+ // inlinable, so the short-haystack path below never touches the
+ // architecture specific code.
+ return self.find_raw_impl(start, end);
+ }
+ // SAFETY: We require the caller to pass valid start/end pointers.
+ generic::fwd_byte_by_byte(start, end, |byte| {
+ byte == self.0.needle1()
+ || byte == self.0.needle2()
+ || byte == self.0.needle3()
+ })
+ }
+
+ /// Raw-pointer counterpart of `rfind`.
+ ///
+ /// Any pointer returned for a match is guaranteed to be `>= start` and
+ /// `< end`.
+ ///
+ /// Use this when you are already working with raw pointers and want to
+ /// avoid converting back to a slice just to run a search.
+ ///
+ /// # Safety
+ ///
+ /// * Both `start` and `end` must be valid for reads.
+ /// * Both `start` and `end` must point to an initialized value.
+ /// * Both `start` and `end` must point to the same allocated object and
+ /// must either be in bounds or at most one byte past the end of the
+ /// allocated object.
+ /// * Both `start` and `end` must be _derived from_ a pointer to the same
+ /// object.
+ /// * The distance between `start` and `end` must not overflow `isize`.
+ /// * The distance being in bounds must not rely on "wrapping around" the
+ /// address space.
+ ///
+ /// Passing pointers with `start >= end` is permitted; `None` is always
+ /// returned in that case.
+ #[inline]
+ pub unsafe fn rfind_raw(
+ &self,
+ start: *const u8,
+ end: *const u8,
+ ) -> Option<*const u8> {
+ if start >= end {
+ return None;
+ }
+ let len = end.distance(start);
+ if len >= __m128i::BYTES {
+ // SAFETY: Building a `Three` means it's safe to call 'sse2'
+ // routines, and the check above shows the haystack is big enough
+ // for the vector routine. Pointer validity is the caller's
+ // responsibility.
+ //
+ // `self.0.rfind_raw` is deliberately not called directly here, so
+ // that this routine needs no `target_feature` annotation and
+ // stays inlinable into callers.
+ return self.rfind_raw_impl(start, end);
+ }
+ // SAFETY: We require the caller to pass valid start/end pointers.
+ generic::rev_byte_by_byte(start, end, |byte| {
+ byte == self.0.needle1()
+ || byte == self.0.needle2()
+ || byte == self.0.needle3()
+ })
+ }
+
+ /// Run the forward search on the SSE2 vector implementation.
+ ///
+ /// # Safety
+ ///
+ /// The obligations of [`Three::find_raw`] apply. In addition, `start` and
+ /// `end` must be at least one SSE2 vector (in bytes) apart.
+ ///
+ /// (No separate target feature proof is needed from the caller: a `Three`
+ /// can only be constructed when it is safe to call `sse2` routines, so
+ /// being a method on `Three` already fulfills that obligation.)
+ #[target_feature(enable = "sse2")]
+ #[inline]
+ unsafe fn find_raw_impl(
+ &self,
+ start: *const u8,
+ end: *const u8,
+ ) -> Option<*const u8> {
+ let Three(inner) = self;
+ inner.find_raw(start, end)
+ }
+
+ /// Run the reverse search on the SSE2 vector implementation.
+ ///
+ /// # Safety
+ ///
+ /// The obligations of [`Three::rfind_raw`] apply. In addition, `start`
+ /// and `end` must be at least one SSE2 vector (in bytes) apart.
+ ///
+ /// (No separate target feature proof is needed from the caller: a `Three`
+ /// can only be constructed when it is safe to call `sse2` routines, so
+ /// being a method on `Three` already fulfills that obligation.)
+ #[target_feature(enable = "sse2")]
+ #[inline]
+ unsafe fn rfind_raw_impl(
+ &self,
+ start: *const u8,
+ end: *const u8,
+ ) -> Option<*const u8> {
+ let Three(inner) = self;
+ inner.rfind_raw(start, end)
+ }
+
+ /// Builds an iterator over every position in `haystack` that holds one
+ /// of the three needle bytes.
+ ///
+ /// The returned iterator is a `DoubleEndedIterator`, so it can also be
+ /// driven from the back to report occurrences in reverse order.
+ #[inline]
+ pub fn iter<'a, 'h>(&'a self, haystack: &'h [u8]) -> ThreeIter<'a, 'h> {
+ let it = generic::Iter::new(haystack);
+ ThreeIter { searcher: self, it }
+ }
+}
+
+/// Iterates over every offset in a haystack at which one of three bytes
+/// occurs.
+///
+/// `DoubleEndedIterator` is implemented, so occurrences can be pulled from
+/// the back as well as from the front.
+///
+/// Values of this type are produced by [`Three::iter`].
+///
+/// Lifetime parameters:
+///
+/// * `'a` is the lifetime of the borrowed [`Three`] searcher.
+/// * `'h` is the lifetime of the haystack being searched.
+#[derive(Clone, Debug)]
+pub struct ThreeIter<'a, 'h> {
+ searcher: &'a Three,
+ it: generic::Iter<'h>,
+}
+
+impl<'a, 'h> Iterator for ThreeIter<'a, 'h> {
+ type Item = usize;
+
+ #[inline]
+ fn next(&mut self) -> Option<Self::Item> {
+ let searcher = self.searcher;
+ // SAFETY: The generic iterator is relied upon to hand us valid start
+ // and end pointers, and 'find_raw' guarantees that any pointer it
+ // returns lies within the bounds of those two pointers.
+ unsafe { self.it.next(|start, end| searcher.find_raw(start, end)) }
+ }
+
+ #[inline]
+ fn size_hint(&self) -> (usize, Option<usize>) {
+ self.it.size_hint()
+ }
+}
+
+impl<'a, 'h> DoubleEndedIterator for ThreeIter<'a, 'h> {
+ #[inline]
+ fn next_back(&mut self) -> Option<Self::Item> {
+ let searcher = self.searcher;
+ // SAFETY: The generic iterator is relied upon to hand us valid start
+ // and end pointers, and 'rfind_raw' guarantees that any pointer it
+ // returns lies within the bounds of those two pointers.
+ unsafe {
+ self.it.next_back(|start, end| searcher.rfind_raw(start, end))
+ }
+ }
+}
+
+impl<'a, 'h> core::iter::FusedIterator for ThreeIter<'a, 'h> {}
+
+// Tests for the SSE2 one/two/three byte searchers. Each test hands the shared
+// `crate::tests::memchr::Runner` a closure that builds a searcher from the
+// supplied needles and collects (or counts) the offsets its iterator yields.
+// The closure returns `None` (via `?`) when a searcher cannot be built or a
+// needle is missing.
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ define_memchr_quickcheck!(super);
+
+ // Forward iteration with a single needle byte.
+ #[test]
+ fn forward_one() {
+ crate::tests::memchr::Runner::new(1).forward_iter(
+ |haystack, needles| {
+ Some(One::new(needles[0])?.iter(haystack).collect())
+ },
+ )
+ }
+
+ // Reverse iteration (`.rev()`) with a single needle byte.
+ #[test]
+ fn reverse_one() {
+ crate::tests::memchr::Runner::new(1).reverse_iter(
+ |haystack, needles| {
+ Some(One::new(needles[0])?.iter(haystack).rev().collect())
+ },
+ )
+ }
+
+ // Counting matches with a single needle byte.
+ #[test]
+ fn count_one() {
+ crate::tests::memchr::Runner::new(1).count_iter(|haystack, needles| {
+ Some(One::new(needles[0])?.iter(haystack).count())
+ })
+ }
+
+ // Forward iteration with two needle bytes.
+ #[test]
+ fn forward_two() {
+ crate::tests::memchr::Runner::new(2).forward_iter(
+ |haystack, needles| {
+ let n1 = needles.get(0).copied()?;
+ let n2 = needles.get(1).copied()?;
+ Some(Two::new(n1, n2)?.iter(haystack).collect())
+ },
+ )
+ }
+
+ // Reverse iteration with two needle bytes.
+ #[test]
+ fn reverse_two() {
+ crate::tests::memchr::Runner::new(2).reverse_iter(
+ |haystack, needles| {
+ let n1 = needles.get(0).copied()?;
+ let n2 = needles.get(1).copied()?;
+ Some(Two::new(n1, n2)?.iter(haystack).rev().collect())
+ },
+ )
+ }
+
+ // Forward iteration with three needle bytes.
+ #[test]
+ fn forward_three() {
+ crate::tests::memchr::Runner::new(3).forward_iter(
+ |haystack, needles| {
+ let n1 = needles.get(0).copied()?;
+ let n2 = needles.get(1).copied()?;
+ let n3 = needles.get(2).copied()?;
+ Some(Three::new(n1, n2, n3)?.iter(haystack).collect())
+ },
+ )
+ }
+
+ // Reverse iteration with three needle bytes.
+ #[test]
+ fn reverse_three() {
+ crate::tests::memchr::Runner::new(3).reverse_iter(
+ |haystack, needles| {
+ let n1 = needles.get(0).copied()?;
+ let n2 = needles.get(1).copied()?;
+ let n3 = needles.get(2).copied()?;
+ Some(Three::new(n1, n2, n3)?.iter(haystack).rev().collect())
+ },
+ )
+ }
+}
+
1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 +95 +96 +97 +98 +99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +
/*!
+A 128-bit vector implementation of the "packed pair" SIMD algorithm.
+
+The "packed pair" algorithm is based on the [generic SIMD] algorithm. The main
+difference is that it (by default) uses a background distribution of byte
+frequencies to heuristically select the pair of bytes to search for.
+
+[generic SIMD]: http://0x80.pl/articles/simd-strfind.html#first-and-last
+*/
+
+use core::arch::x86_64::__m128i;
+
+use crate::arch::{all::packedpair::Pair, generic::packedpair};
+
+/// A "packed pair" finder that uses 128-bit vector operations.
+///
+/// This finder picks two bytes that it believes have high predictive power
+/// for indicating an overall match of a needle. Depending on whether
+/// `Finder::find` or `Finder::find_prefilter` is used, it reports offsets
+/// where the needle matches or could match. In the prefilter case, candidates
+/// are reported whenever the [`Pair`] of bytes given matches.
+///
+/// This is a thin wrapper around the generic packed pair finder instantiated
+/// with the `__m128i` vector type.
+#[derive(Clone, Copy, Debug)]
+pub struct Finder(packedpair::Finder<__m128i>);
+
+impl Finder {
+ /// Construct a pair searcher for `needle`. The result can either report
+ /// exact matches of `needle` or serve as a prefilter that reports
+ /// candidate positions of `needle`.
+ ///
+ /// Returns `None` if SSE2 is unavailable in the current environment or
+ /// if no [`Pair`] could be constructed from the needle given.
+ #[inline]
+ pub fn new(needle: &[u8]) -> Option<Finder> {
+ let pair = Pair::new(needle)?;
+ Finder::with_pair(needle, pair)
+ }
+
+ /// Construct a "packed pair" finder that uses exactly the pair of bytes
+ /// given.
+ ///
+ /// Use this constructor to control precisely which pair of bytes acts as
+ /// the predicate.
+ ///
+ /// Returns `None` if SSE2 is unavailable in the current environment.
+ #[inline]
+ pub fn with_pair(needle: &[u8], pair: Pair) -> Option<Finder> {
+ if !Finder::is_available() {
+ return None;
+ }
+ // SAFETY: we checked just above that sse2 is available. We are also
+ // guaranteed to have needle.len() > 1 because we have a valid Pair.
+ let finder = unsafe { Finder::with_pair_impl(needle, pair) };
+ Some(finder)
+ }
+
+ /// Construct a `Finder` for SSE2 vectors and routines.
+ ///
+ /// # Safety
+ ///
+ /// The safety requirements of `packedpair::Finder::new` apply, and the
+ /// caller must additionally ensure that SSE2 is available.
+ #[target_feature(enable = "sse2")]
+ #[inline]
+ unsafe fn with_pair_impl(needle: &[u8], pair: Pair) -> Finder {
+ Finder(packedpair::Finder::<__m128i>::new(needle, pair))
+ }
+
+ /// Reports whether this implementation can be used in the current
+ /// environment.
+ ///
+ /// A `true` result guarantees that [`Finder::with_pair`] returns a `Some`
+ /// value, and a `false` result guarantees that `Finder::with_pair`
+ /// returns `None`. This says nothing about whether [`Finder::new`]
+ /// returns a `Finder`: even when `Finder::is_available` is true, a valid
+ /// [`Pair`] is not guaranteed to be found for the needle given.
+ ///
+ /// Within the lifetime of a single program, once this returns true it
+ /// always returns true.
+ #[inline]
+ pub fn is_available() -> bool {
+ // Decided at compile time from the enabled target features.
+ cfg!(target_feature = "sse2")
+ }
+
+ /// Search `haystack` for `needle` using SSE2 vectors and routines.
+ ///
+ /// # Panics
+ ///
+ /// When `haystack.len()` is less than [`Finder::min_haystack_len`].
+ #[inline]
+ pub fn find(&self, haystack: &[u8], needle: &[u8]) -> Option<usize> {
+ // SAFETY: Building a `Finder` means it's safe to call 'sse2' routines.
+ let found = unsafe { self.find_impl(haystack, needle) };
+ found
+ }
+
+ /// Use this finder as a prefilter over the given haystack.
+ ///
+ /// When a candidate match is found, the returned offset is a position in
+ /// the haystack where the needle *could* begin.
+ ///
+ /// # Panics
+ ///
+ /// When `haystack.len()` is less than [`Finder::min_haystack_len`].
+ #[inline]
+ pub fn find_prefilter(&self, haystack: &[u8]) -> Option<usize> {
+ // SAFETY: Building a `Finder` means it's safe to call 'sse2' routines.
+ let candidate = unsafe { self.find_prefilter_impl(haystack) };
+ candidate
+ }
+
+ /// Run the search on the SSE2 vector implementation.
+ ///
+ /// # Panics
+ ///
+ /// When `haystack.len()` is less than [`Finder::min_haystack_len`].
+ ///
+ /// # Safety
+ ///
+ /// (No separate target feature proof is needed from the caller: a
+ /// `Finder` can only be constructed when it is safe to call `sse2`
+ /// routines, so being a method on `Finder` already fulfills that
+ /// obligation.)
+ #[target_feature(enable = "sse2")]
+ #[inline]
+ unsafe fn find_impl(
+ &self,
+ haystack: &[u8],
+ needle: &[u8],
+ ) -> Option<usize> {
+ let Finder(inner) = self;
+ inner.find(haystack, needle)
+ }
+
+ /// Run the prefilter search on the SSE2 vector implementation.
+ ///
+ /// # Panics
+ ///
+ /// When `haystack.len()` is less than [`Finder::min_haystack_len`].
+ ///
+ /// # Safety
+ ///
+ /// (No separate target feature proof is needed from the caller: a
+ /// `Finder` can only be constructed when it is safe to call `sse2`
+ /// routines, so being a method on `Finder` already fulfills that
+ /// obligation.)
+ #[target_feature(enable = "sse2")]
+ #[inline]
+ unsafe fn find_prefilter_impl(&self, haystack: &[u8]) -> Option<usize> {
+ let Finder(inner) = self;
+ inner.find_prefilter(haystack)
+ }
+
+ /// Returns the pair of needle offsets that is checked as a predicate
+ /// before confirming whether the needle exists at a particular position.
+ #[inline]
+ pub fn pair(&self) -> &Pair {
+ let Finder(inner) = self;
+ inner.pair()
+ }
+
+ /// Returns the smallest haystack length this `Finder` is able to search.
+ ///
+ /// Searching a haystack shorter than this results in a panic. The
+ /// restriction exists because this finder is meant to be a low-level
+ /// component of a larger substring strategy: rather than handling every
+ /// case, it only handles the cases that it can handle very well.
+ #[inline]
+ pub fn min_haystack_len(&self) -> usize {
+ let Finder(inner) = self;
+ inner.min_haystack_len()
+ }
+}
+
+// Tests for the SSE2 packed pair finder. Every helper below returns `None`
+// when a `Finder` (or `Pair`) cannot be built, or when the haystack is
+// shorter than `min_haystack_len` (searching it would panic); otherwise it
+// returns `Some(search result)`.
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ // Exact substring search with a heuristically chosen pair.
+ fn find(haystack: &[u8], needle: &[u8]) -> Option<Option<usize>> {
+ let f = Finder::new(needle)?;
+ if haystack.len() < f.min_haystack_len() {
+ return None;
+ }
+ Some(f.find(haystack, needle))
+ }
+
+ define_substring_forward_quickcheck!(find);
+
+ #[test]
+ fn forward_substring() {
+ crate::tests::substring::Runner::new().fwd(find).run()
+ }
+
+ // Exact search where the test supplies the pair indices explicitly.
+ #[test]
+ fn forward_packedpair() {
+ fn find(
+ haystack: &[u8],
+ needle: &[u8],
+ index1: u8,
+ index2: u8,
+ ) -> Option<Option<usize>> {
+ let pair = Pair::with_indices(needle, index1, index2)?;
+ let f = Finder::with_pair(needle, pair)?;
+ if haystack.len() < f.min_haystack_len() {
+ return None;
+ }
+ Some(f.find(haystack, needle))
+ }
+ crate::tests::packedpair::Runner::new().fwd(find).run()
+ }
+
+ // Same as above, but exercising the prefilter entry point.
+ #[test]
+ fn forward_packedpair_prefilter() {
+ fn find(
+ haystack: &[u8],
+ needle: &[u8],
+ index1: u8,
+ index2: u8,
+ ) -> Option<Option<usize>> {
+ let pair = Pair::with_indices(needle, index1, index2)?;
+ let f = Finder::with_pair(needle, pair)?;
+ if haystack.len() < f.min_haystack_len() {
+ return None;
+ }
+ Some(f.find_prefilter(haystack))
+ }
+ crate::tests::packedpair::Runner::new().fwd(find).run()
+ }
+}
+
1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 +95 +96 +97 +98 +99 +100 +101 +102 +103 +104 +105 +106 +107 +
use core::ops;
+
+/// A specialized copy-on-write byte string.
+///
+/// The purpose of this type is to permit usage of a "borrowed or owned
+/// byte string" in a way that keeps std/no-std compatibility. That is, in
+/// no-std/alloc mode, this type devolves into a simple &[u8] with no owned
+/// variant available. We can't just use a plain Cow because Cow is not in
+/// core.
+///
+/// The representation lives in the private `Imp` type, whose definition is
+/// selected by the `alloc` feature.
+#[derive(Clone, Debug)]
+pub struct CowBytes<'a>(Imp<'a>);
+
+// N.B. We don't use alloc::borrow::Cow here since we can get away with a
+// Box<[u8]> for our use case, which is 1/3 smaller than the Vec<u8> that
+// a Cow<[u8]> would use.
+//
+// With the `alloc` feature: either a borrowed slice or an owned boxed slice.
+#[cfg(feature = "alloc")]
+#[derive(Clone, Debug)]
+enum Imp<'a> {
+ Borrowed(&'a [u8]),
+ Owned(alloc::boxed::Box<[u8]>),
+}
+
+// Without the `alloc` feature: only a borrowed slice can be represented.
+#[cfg(not(feature = "alloc"))]
+#[derive(Clone, Debug)]
+struct Imp<'a>(&'a [u8]);
+
+/// Dereferencing a `CowBytes` yields its bytes as a plain slice.
+impl<'a> ops::Deref for CowBytes<'a> {
+ type Target = [u8];
+
+ #[inline(always)]
+ fn deref(&self) -> &Self::Target {
+ CowBytes::as_slice(self)
+ }
+}
+
+impl<'a> CowBytes<'a> {
+ /// Wrap anything viewable as bytes in a borrowed `CowBytes`.
+ #[inline(always)]
+ pub(crate) fn new<B: ?Sized + AsRef<[u8]>>(bytes: &'a B) -> CowBytes<'a> {
+ let slice: &'a [u8] = bytes.as_ref();
+ CowBytes(Imp::new(slice))
+ }
+
+ /// Wrap an already boxed slice in an owned `CowBytes`.
+ #[cfg(feature = "alloc")]
+ #[inline(always)]
+ fn new_owned(bytes: alloc::boxed::Box<[u8]>) -> CowBytes<'static> {
+ let imp = Imp::Owned(bytes);
+ CowBytes(imp)
+ }
+
+ /// View the bytes as a slice, whether they are borrowed or owned
+ /// internally.
+ #[inline(always)]
+ pub(crate) fn as_slice(&self) -> &[u8] {
+ let CowBytes(imp) = self;
+ imp.as_slice()
+ }
+
+ /// Convert into a copy-on-write byte string that owns its bytes.
+ ///
+ /// An internally owned byte string is reused as-is. A borrowed one is
+ /// copied into a new boxed slice.
+ #[cfg(feature = "alloc")]
+ #[inline(always)]
+ pub(crate) fn into_owned(self) -> CowBytes<'static> {
+ let boxed: alloc::boxed::Box<[u8]> = match self.0 {
+ Imp::Borrowed(b) => alloc::boxed::Box::from(b),
+ Imp::Owned(b) => b,
+ };
+ CowBytes::new_owned(boxed)
+ }
+}
+
+impl<'a> Imp<'a> {
+ /// Wrap `bytes` as a borrowed byte string.
+ ///
+ /// With the `alloc` feature this builds the `Borrowed` variant; without
+ /// it, `Imp` is a plain wrapper around the slice.
+ #[inline(always)]
+ pub fn new(bytes: &'a [u8]) -> Imp<'a> {
+ #[cfg(feature = "alloc")]
+ {
+ Imp::Borrowed(bytes)
+ }
+ #[cfg(not(feature = "alloc"))]
+ {
+ Imp(bytes)
+ }
+ }
+
+ /// View the bytes as a slice, whichever representation is in use.
+ ///
+ /// This is one method with a per-feature body (the same shape as `new`).
+ /// Exactly one of the two blocks below is compiled in.
+ #[inline(always)]
+ pub fn as_slice(&self) -> &[u8] {
+ #[cfg(feature = "alloc")]
+ {
+ // Both arms bind a reference that coerces to `&[u8]`.
+ match self {
+ Imp::Owned(x) => x,
+ Imp::Borrowed(x) => x,
+ }
+ }
+ #[cfg(not(feature = "alloc"))]
+ {
+ self.0
+ }
+ }
+}
+
1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +
/// A trait for adding some helper routines to pointers.
+pub(crate) trait Pointer {
+ /// Returns the distance, in units of `T`, between `self` and `origin`.
+ ///
+ /// # Safety
+ ///
+ /// Same as `ptr::offset_from` in addition to `self >= origin`.
+ ///
+ /// (The `self >= origin` requirement is what allows the result to be
+ /// reported as an unsigned `usize`.)
+ unsafe fn distance(self, origin: Self) -> usize;
+
+ /// Casts this pointer to `usize`.
+ ///
+ /// Callers should not convert the `usize` back to a pointer if at all
+ /// possible. (And if you believe it's necessary, open an issue to discuss
+ /// why. Otherwise, it has the potential to violate pointer provenance.)
+ /// The purpose of this function is just to be able to do arithmetic, i.e.,
+ /// computing offsets or alignments.
+ fn as_usize(self) -> usize;
+}
+
+impl<T> Pointer for *const T {
+ /// Element distance from `origin` up to `self`; see the trait method for
+ /// the safety contract.
+ unsafe fn distance(self, origin: *const T) -> usize {
+ // TODO: Replace with `ptr::sub_ptr` once stabilized.
+ let delta: isize = self.offset_from(origin);
+ // The contract requires `self >= origin`, so `delta` is non-negative
+ // and the conversion cannot fail.
+ usize::try_from(delta).unwrap_unchecked()
+ }
+
+ /// Plain address cast, intended for arithmetic only.
+ fn as_usize(self) -> usize {
+ let addr = self as usize;
+ addr
+ }
+}
+
+impl<T> Pointer for *mut T {
+ /// Delegates to the `*const T` implementation.
+ unsafe fn distance(self, origin: *mut T) -> usize {
+ let (this, base) = (self as *const T, origin as *const T);
+ this.distance(base)
+ }
+
+ /// Delegates to the `*const T` implementation.
+ fn as_usize(self) -> usize {
+ let this = self as *const T;
+ this.as_usize()
+ }
+}
+
+/// Helper routines for raw bytes.
+pub(crate) trait Byte {
+ /// Returns this byte as a `char` when it is ASCII, and panics when it is
+ /// not.
+ fn to_char(self) -> char;
+}
+
+impl Byte for u8 {
+ fn to_char(self) -> char {
+ assert!(self.is_ascii());
+ self as char
+ }
+}
+
1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 +95 +96 +97 +98 +99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +
/*!
+This library provides heavily optimized routines for string search primitives.
+
+# Overview
+
+This section gives a brief high level overview of what this crate offers.
+
+* The top-level module provides routines for searching for 1, 2 or 3 bytes
+ in the forward or reverse direction. When searching for more than one byte,
+ positions are considered a match if the byte at that position matches any
+ of the bytes.
+* The [`memmem`] sub-module provides forward and reverse substring search
+ routines.
+
+In all such cases, routines operate on `&[u8]` without regard to encoding. This
+is exactly what you want when searching either UTF-8 or arbitrary bytes.
+
+# Example: using `memchr`
+
+This example shows how to use `memchr` to find the first occurrence of `z` in
+a haystack:
+
+```
+use memchr::memchr;
+
+let haystack = b"foo bar baz quuz";
+assert_eq!(Some(10), memchr(b'z', haystack));
+```
+
+# Example: matching one of three possible bytes
+
+This example shows how to use `memchr3_iter` in reverse to find occurrences of
+`a`, `b` or `c`, starting at the end of the haystack.
+
+```
+use memchr::memchr3_iter;
+
+let haystack = b"xyzaxyzbxyzc";
+
+let mut it = memchr3_iter(b'a', b'b', b'c', haystack).rev();
+assert_eq!(Some(11), it.next());
+assert_eq!(Some(7), it.next());
+assert_eq!(Some(3), it.next());
+assert_eq!(None, it.next());
+```
+
+# Example: iterating over substring matches
+
+This example shows how to use the [`memmem`] sub-module to find occurrences of
+a substring in a haystack.
+
+```
+use memchr::memmem;
+
+let haystack = b"foo bar foo baz foo";
+
+let mut it = memmem::find_iter(haystack, "foo");
+assert_eq!(Some(0), it.next());
+assert_eq!(Some(8), it.next());
+assert_eq!(Some(16), it.next());
+assert_eq!(None, it.next());
+```
+
+# Example: repeating a search for the same needle
+
+It may be possible for the overhead of constructing a substring searcher to be
+measurable in some workloads. In cases where the same needle is used to search
+many haystacks, it is possible to do construction once and thus to avoid it for
+subsequent searches. This can be done with a [`memmem::Finder`]:
+
+```
+use memchr::memmem;
+
+let finder = memmem::Finder::new("foo");
+
+assert_eq!(Some(4), finder.find(b"baz foo quux"));
+assert_eq!(None, finder.find(b"quux baz bar"));
+```
+
+# Why use this crate?
+
+At first glance, the APIs provided by this crate might seem weird. Why provide
+a dedicated routine like `memchr` for something that could be implemented
+clearly and trivially in one line:
+
+```
+fn memchr(needle: u8, haystack: &[u8]) -> Option<usize> {
+ haystack.iter().position(|&b| b == needle)
+}
+```
+
+Or similarly, why does this crate provide substring search routines when Rust's
+core library already provides them?
+
+```
+fn search(haystack: &str, needle: &str) -> Option<usize> {
+ haystack.find(needle)
+}
+```
+
+The primary reason for both of them to exist is performance. When it comes to
+performance, at a high level at least, there are two primary ways to look at
+it:
+
+* **Throughput**: For this, think about it as, "given some very large haystack
+ and a byte that never occurs in that haystack, how long does it take to
+ search through it and determine that it, in fact, does not occur?"
+* **Latency**: For this, think about it as, "given a tiny haystack---just a
+ few bytes---how long does it take to determine if a byte is in it?"
+
+The `memchr` routine in this crate has _slightly_ worse latency than the
+solution presented above; however, its throughput can easily be over an
+order of magnitude higher. This is a good general-purpose trade-off to make.
+You rarely lose, but often gain big.
+
+**NOTE:** The name `memchr` comes from the corresponding routine in `libc`. A
+key advantage of using this library is that its performance is not tied to the
+quality of the implementation in the `libc` you happen to be using, which can
+vary greatly from platform to platform.
+
+But what about substring search? This one is a bit more complicated. The
+primary reason for its existence is still indeed performance, but it's also
+useful because Rust's core library doesn't actually expose any substring
+search routine on arbitrary bytes. The only substring search routine that
+exists works exclusively on valid UTF-8.
+
+So if you have valid UTF-8, is there a reason to use this over the standard
+library substring search routine? Yes. This routine is faster on almost every
+metric, including latency. The natural question then, is why isn't this
+implementation in the standard library, even if only for searching on UTF-8?
+The reason is that the implementation details for using SIMD in the standard
+library haven't quite been worked out yet.
+
+**NOTE:** Currently, only `x86_64`, `wasm32` and `aarch64` targets have vector
+accelerated implementations of `memchr` (and friends) and `memmem`.
+
+# Crate features
+
+* **std** - When enabled (the default), this will permit features specific to
+the standard library. Currently, the only thing used from the standard library
+is runtime SIMD CPU feature detection. This means that this feature must be
+enabled to get AVX2 accelerated routines on `x86_64` targets without enabling
+the `avx2` feature at compile time, for example. When `std` is not enabled,
+this crate will still attempt to use SSE2 accelerated routines on `x86_64`. It
+will also use AVX2 accelerated routines when the `avx2` feature is enabled at
+compile time. In general, enable this feature if you can.
+* **alloc** - When enabled (the default), APIs in this crate requiring some
+kind of allocation will become available. For example, the
+[`memmem::Finder::into_owned`](crate::memmem::Finder::into_owned) API and the
+[`arch::all::shiftor`](crate::arch::all::shiftor) substring search
+implementation. Otherwise, this crate is designed from the ground up to be
+usable in core-only contexts, so the `alloc` feature doesn't add much
+currently. Notably, disabling `std` but enabling `alloc` will **not** result
+in the use of AVX2 on `x86_64` targets unless the `avx2` feature is enabled
+at compile time. (With `std` enabled, AVX2 can be used even without the `avx2`
+feature enabled at compile time by way of runtime CPU feature detection.)
+* **logging** - When enabled (disabled by default), the `log` crate is used
+to emit log messages about what kinds of `memchr` and `memmem` algorithms
+are used. Namely, both `memchr` and `memmem` have a number of different
+implementation choices depending on the target and CPU, and the log messages
+can help show what specific implementations are being used. Generally, this is
+useful for debugging performance issues.
+* **libc** - **DEPRECATED**. Previously, this enabled the use of the target's
+`memchr` function from whatever `libc` was linked into the program. This
+feature is now a no-op because this crate's implementation of `memchr` should
+now be sufficiently fast on a number of platforms that `libc` should no longer
+be needed. (This feature is somewhat of a holdover from this crate's origins.
+Originally, this crate was literally just a safe wrapper function around the
+`memchr` function from `libc`.)
+*/
+
+#![deny(missing_docs)]
+#![no_std]
+// It's just not worth trying to squash all dead code warnings. Pretty
+// unfortunate IMO. Not really sure how to fix this other than to either
+// live with it or sprinkle a whole mess of `cfg` annotations everywhere.
+#![cfg_attr(
+ not(any(
+ all(target_arch = "x86_64", target_feature = "sse2"),
+ target_arch = "wasm32",
+ target_arch = "aarch64",
+ )),
+ allow(dead_code)
+)]
+// Same deal for miri.
+#![cfg_attr(miri, allow(dead_code, unused_macros))]
+
+// Supporting 8-bit (or others) would be fine. If you need it, please submit a
+// bug report at https://github.com/BurntSushi/memchr
+#[cfg(not(any(
+ target_pointer_width = "16",
+ target_pointer_width = "32",
+ target_pointer_width = "64"
+)))]
+compile_error!("memchr currently not supported on non-{16,32,64}");
+
+#[cfg(any(test, feature = "std"))]
+extern crate std;
+
+#[cfg(any(test, feature = "alloc"))]
+extern crate alloc;
+
+pub use crate::memchr::{
+ memchr, memchr2, memchr2_iter, memchr3, memchr3_iter, memchr_iter,
+ memrchr, memrchr2, memrchr2_iter, memrchr3, memrchr3_iter, memrchr_iter,
+ Memchr, Memchr2, Memchr3,
+};
+
+#[macro_use]
+mod macros;
+
+#[cfg(test)]
+#[macro_use]
+mod tests;
+
+pub mod arch;
+mod cow;
+mod ext;
+mod memchr;
+pub mod memmem;
+mod vector;
+
// Some feature combinations result in some of these macros never being used.
+// Which is fine. Just squash the warnings.
+#![allow(unused_macros)]
+
+// Expands to the given tokens wrapped in a block that is only compiled when
+// the `logging` feature is enabled; otherwise the `cfg` below removes it.
+macro_rules! log {
+ ($($tt:tt)*) => {
+ #[cfg(feature = "logging")]
+ {
+ $($tt)*
+ }
+ }
+}
+
+// Forwards its arguments to `log::debug!`, routed through `log!` so that the
+// call is only compiled when the `logging` feature is enabled.
+macro_rules! debug {
+ ($($tt:tt)*) => { log!(log::debug!($($tt)*)) }
+}
+
+// Forwards its arguments to `log::trace!`, routed through `log!` so that the
+// call is only compiled when the `logging` feature is enabled.
+macro_rules! trace {
+ ($($tt:tt)*) => { log!(log::trace!($($tt)*)) }
+}
+
1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 +95 +96 +97 +98 +99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 +247 +248 +249 +250 +251 +252 +253 +254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 +265 +266 +267 +268 +269 +270 +271 +272 +273 +274 +275 +276 +277 +278 +279 +280 +281 +282 +283 +284 +285 +286 +287 +288 +289 +290 +291 +292 +293 +294 +295 +296 +297 +298 +299 +300 +301 +302 +303 +304 +305 +306 +307 +308 +309 +310 +311 +312 +313 +314 +315 +316 +317 +318 +319 +320 +321 +322 +323 +324 +325 +326 +327 +328 +329 +330 +331 +332 +333 +334 +335 +336 +337 +338 +339 +340 +341 +342 +343 +344 +345 +346 +347 +348 +349 +350 +351 +352 +353 +354 +355 +356 +357 +358 +359 +360 +361 +362 +363 +364 +365 +366 +367 +368 +369 +370 +371 +372 +373 +374 +375 +376 +377 +378 +379 +380 +381 +382 +383 +384 +385 +386 +387 +388 +389 +390 +391 +392 +393 +394 +395 +396 +397 +398 +399 +400 +401 +402 +403 +404 +405 +406 +407 +408 +409 +410 +411 +412 +413 +414 +415 +416 +417 +418 +419 +420 +421 
+422 +423 +424 +425 +426 +427 +428 +429 +430 +431 +432 +433 +434 +435 +436 +437 +438 +439 +440 +441 +442 +443 +444 +445 +446 +447 +448 +449 +450 +451 +452 +453 +454 +455 +456 +457 +458 +459 +460 +461 +462 +463 +464 +465 +466 +467 +468 +469 +470 +471 +472 +473 +474 +475 +476 +477 +478 +479 +480 +481 +482 +483 +484 +485 +486 +487 +488 +489 +490 +491 +492 +493 +494 +495 +496 +497 +498 +499 +500 +501 +502 +503 +504 +505 +506 +507 +508 +509 +510 +511 +512 +513 +514 +515 +516 +517 +518 +519 +520 +521 +522 +523 +524 +525 +526 +527 +528 +529 +530 +531 +532 +533 +534 +535 +536 +537 +538 +539 +540 +541 +542 +543 +544 +545 +546 +547 +548 +549 +550 +551 +552 +553 +554 +555 +556 +557 +558 +559 +560 +561 +562 +563 +564 +565 +566 +567 +568 +569 +570 +571 +572 +573 +574 +575 +576 +577 +578 +579 +580 +581 +582 +583 +584 +585 +586 +587 +588 +589 +590 +591 +592 +593 +594 +595 +596 +597 +598 +599 +600 +601 +602 +603 +604 +605 +606 +607 +608 +609 +610 +611 +612 +613 +614 +615 +616 +617 +618 +619 +620 +621 +622 +623 +624 +625 +626 +627 +628 +629 +630 +631 +632 +633 +634 +635 +636 +637 +638 +639 +640 +641 +642 +643 +644 +645 +646 +647 +648 +649 +650 +651 +652 +653 +654 +655 +656 +657 +658 +659 +660 +661 +662 +663 +664 +665 +666 +667 +668 +669 +670 +671 +672 +673 +674 +675 +676 +677 +678 +679 +680 +681 +682 +683 +684 +685 +686 +687 +688 +689 +690 +691 +692 +693 +694 +695 +696 +697 +698 +699 +700 +701 +702 +703 +704 +705 +706 +707 +708 +709 +710 +711 +712 +713 +714 +715 +716 +717 +718 +719 +720 +721 +722 +723 +724 +725 +726 +727 +728 +729 +730 +731 +732 +733 +734 +735 +736 +737 +738 +739 +740 +741 +742 +743 +744 +745 +746 +747 +748 +749 +750 +751 +752 +753 +754 +755 +756 +757 +758 +759 +760 +761 +762 +763 +764 +765 +766 +767 +768 +769 +770 +771 +772 +773 +774 +775 +776 +777 +778 +779 +780 +781 +782 +783 +784 +785 +786 +787 +788 +789 +790 +791 +792 +793 +794 +795 +796 +797 +798 +799 +800 +801 +802 +803 +804 +805 +806 +807 +808 +809 +810 +811 +812 +813 +814 +815 +816 +817 +818 +819 +820 +821 
+822 +823 +824 +825 +826 +827 +828 +829 +830 +831 +832 +833 +834 +835 +836 +837 +838 +839 +840 +841 +842 +843 +844 +845 +846 +847 +848 +849 +850 +851 +852 +853 +854 +855 +856 +857 +858 +859 +860 +861 +862 +863 +864 +865 +866 +867 +868 +869 +870 +871 +872 +873 +874 +875 +876 +877 +878 +879 +880 +881 +882 +883 +884 +885 +886 +887 +888 +889 +890 +891 +892 +893 +894 +895 +896 +897 +898 +899 +900 +901 +902 +903 +
use core::iter::Rev;
+
+use crate::arch::generic::memchr as generic;
+
+/// Search for the first occurrence of a byte in a slice.
+///
+/// This returns the index corresponding to the first occurrence of `needle` in
+/// `haystack`, or `None` if one is not found. If an index is returned, it is
+/// guaranteed to be less than `haystack.len()`.
+///
+/// While this is semantically the same as something like
+/// `haystack.iter().position(|&b| b == needle)`, this routine will attempt to
+/// use highly optimized vector operations that can be an order of magnitude
+/// faster (or more).
+///
+/// # Example
+///
+/// This shows how to find the first position of a byte in a byte string.
+///
+/// ```
+/// use memchr::memchr;
+///
+/// let haystack = b"the quick brown fox";
+/// assert_eq!(memchr(b'k', haystack), Some(8));
+/// ```
+#[inline]
+pub fn memchr(needle: u8, haystack: &[u8]) -> Option<usize> {
+ // SAFETY: whenever memchr_raw reports a match, the pointer it returns is
+ // valid and lies between the start and end pointers it was given.
+ unsafe {
+ generic::search_slice_with_raw(haystack, |start, end| {
+ memchr_raw(needle, start, end)
+ })
+ }
+}
+
+/// Search for the last occurrence of a byte in a slice.
+///
+/// This returns the index corresponding to the last occurrence of `needle` in
+/// `haystack`, or `None` if one is not found. If an index is returned, it is
+/// guaranteed to be less than `haystack.len()`.
+///
+/// While this is semantically the same as something like
+/// `haystack.iter().rposition(|&b| b == needle)`, this routine will attempt to
+/// use highly optimized vector operations that can be an order of magnitude
+/// faster (or more).
+///
+/// # Example
+///
+/// This shows how to find the last position of a byte in a byte string.
+///
+/// ```
+/// use memchr::memrchr;
+///
+/// let haystack = b"the quick brown fox";
+/// assert_eq!(memrchr(b'o', haystack), Some(17));
+/// ```
+#[inline]
+pub fn memrchr(needle: u8, haystack: &[u8]) -> Option<usize> {
+ // SAFETY: whenever memrchr_raw reports a match, the pointer it returns is
+ // valid and lies between the start and end pointers it was given.
+ unsafe {
+ generic::search_slice_with_raw(haystack, |start, end| {
+ memrchr_raw(needle, start, end)
+ })
+ }
+}
+
+/// Search for the first occurrence of two possible bytes in a haystack.
+///
+/// This returns the index corresponding to the first occurrence of one of the
+/// needle bytes in `haystack`, or `None` if one is not found. If an index is
+/// returned, it is guaranteed to be less than `haystack.len()`.
+///
+/// While this is semantically the same as something like
+/// `haystack.iter().position(|&b| b == needle1 || b == needle2)`, this routine
+/// will attempt to use highly optimized vector operations that can be an order
+/// of magnitude faster (or more).
+///
+/// # Example
+///
+/// This shows how to find the first position of one of two possible bytes in a
+/// haystack.
+///
+/// ```
+/// use memchr::memchr2;
+///
+/// let haystack = b"the quick brown fox";
+/// assert_eq!(memchr2(b'k', b'q', haystack), Some(4));
+/// ```
+#[inline]
+pub fn memchr2(needle1: u8, needle2: u8, haystack: &[u8]) -> Option<usize> {
+ // SAFETY: whenever memchr2_raw reports a match, the pointer it returns is
+ // valid and lies between the start and end pointers it was given.
+ unsafe {
+ generic::search_slice_with_raw(haystack, |start, end| {
+ memchr2_raw(needle1, needle2, start, end)
+ })
+ }
+}
+
+/// Search for the last occurrence of two possible bytes in a haystack.
+///
+/// This returns the index corresponding to the last occurrence of one of the
+/// needle bytes in `haystack`, or `None` if one is not found. If an index is
+/// returned, it is guaranteed to be less than `haystack.len()`.
+///
+/// While this is semantically the same as something like
+/// `haystack.iter().rposition(|&b| b == needle1 || b == needle2)`, this
+/// routine will attempt to use highly optimized vector operations that can be
+/// an order of magnitude faster (or more).
+///
+/// # Example
+///
+/// This shows how to find the last position of one of two possible bytes in a
+/// haystack.
+///
+/// ```
+/// use memchr::memrchr2;
+///
+/// let haystack = b"the quick brown fox";
+/// assert_eq!(memrchr2(b'k', b'o', haystack), Some(17));
+/// ```
+#[inline]
+pub fn memrchr2(needle1: u8, needle2: u8, haystack: &[u8]) -> Option<usize> {
+ // SAFETY: whenever memrchr2_raw reports a match, the pointer it returns is
+ // valid and lies between the start and end pointers it was given.
+ unsafe {
+ generic::search_slice_with_raw(haystack, |start, end| {
+ memrchr2_raw(needle1, needle2, start, end)
+ })
+ }
+}
+
+/// Search for the first occurrence of three possible bytes in a haystack.
+///
+/// This returns the index corresponding to the first occurrence of one of the
+/// needle bytes in `haystack`, or `None` if one is not found. If an index is
+/// returned, it is guaranteed to be less than `haystack.len()`.
+///
+/// While this is semantically the same as something like
+/// `haystack.iter().position(|&b| b == needle1 || b == needle2 || b == needle3)`,
+/// this routine will attempt to use highly optimized vector operations that
+/// can be an order of magnitude faster (or more).
+///
+/// # Example
+///
+/// This shows how to find the first position of one of three possible bytes in
+/// a haystack.
+///
+/// ```
+/// use memchr::memchr3;
+///
+/// let haystack = b"the quick brown fox";
+/// assert_eq!(memchr3(b'k', b'q', b'u', haystack), Some(4));
+/// ```
+#[inline]
+pub fn memchr3(
+ needle1: u8,
+ needle2: u8,
+ needle3: u8,
+ haystack: &[u8],
+) -> Option<usize> {
+ // SAFETY: whenever memchr3_raw reports a match, the pointer it returns is
+ // valid and lies between the start and end pointers it was given.
+ unsafe {
+ generic::search_slice_with_raw(haystack, |start, end| {
+ memchr3_raw(needle1, needle2, needle3, start, end)
+ })
+ }
+}
+
+/// Search for the last occurrence of three possible bytes in a haystack.
+///
+/// This returns the index corresponding to the last occurrence of one of the
+/// needle bytes in `haystack`, or `None` if one is not found. If an index is
+/// returned, it is guaranteed to be less than `haystack.len()`.
+///
+/// While this is semantically the same as something like
+/// `haystack.iter().rposition(|&b| b == needle1 || b == needle2 || b == needle3)`,
+/// this routine will attempt to use highly optimized vector operations that
+/// can be an order of magnitude faster (or more).
+///
+/// # Example
+///
+/// This shows how to find the last position of one of three possible bytes in
+/// a haystack.
+///
+/// ```
+/// use memchr::memrchr3;
+///
+/// let haystack = b"the quick brown fox";
+/// assert_eq!(memrchr3(b'k', b'o', b'n', haystack), Some(17));
+/// ```
+#[inline]
+pub fn memrchr3(
+ needle1: u8,
+ needle2: u8,
+ needle3: u8,
+ haystack: &[u8],
+) -> Option<usize> {
+ // SAFETY: whenever memrchr3_raw reports a match, the pointer it returns is
+ // valid and lies between the start and end pointers it was given.
+ unsafe {
+ generic::search_slice_with_raw(haystack, |start, end| {
+ memrchr3_raw(needle1, needle2, needle3, start, end)
+ })
+ }
+}
+
+/// Returns an iterator over the index of each occurrence of the needle.
+///
+/// The iterator returned implements `DoubleEndedIterator`, so it can also be
+/// used to find occurrences in reverse order.
+#[inline]
+pub fn memchr_iter<'h>(needle: u8, haystack: &'h [u8]) -> Memchr<'h> {
+ Memchr::new(needle, haystack)
+}
+
+/// Returns an iterator over the index of each occurrence of the needle in a
+/// haystack, yielded in reverse order (it is a reversed [`Memchr`]).
+#[inline]
+pub fn memrchr_iter(needle: u8, haystack: &[u8]) -> Rev<Memchr<'_>> {
+ Memchr::new(needle, haystack).rev()
+}
+
+/// Returns an iterator over the index of each occurrence of either needle.
+///
+/// The iterator returned implements `DoubleEndedIterator`, so it can also be
+/// used to find occurrences in reverse order.
+#[inline]
+pub fn memchr2_iter<'h>(
+ needle1: u8,
+ needle2: u8,
+ haystack: &'h [u8],
+) -> Memchr2<'h> {
+ Memchr2::new(needle1, needle2, haystack)
+}
+
+/// Returns an iterator over the index of each occurrence of either needle in
+/// a haystack, yielded in reverse order (it is a reversed [`Memchr2`]).
+#[inline]
+pub fn memrchr2_iter(
+ needle1: u8,
+ needle2: u8,
+ haystack: &[u8],
+) -> Rev<Memchr2<'_>> {
+ Memchr2::new(needle1, needle2, haystack).rev()
+}
+
+/// Returns an iterator over the index of each occurrence of any needle.
+///
+/// The iterator returned implements `DoubleEndedIterator`, so it can also be
+/// used to find occurrences in reverse order.
+#[inline]
+pub fn memchr3_iter<'h>(
+ needle1: u8,
+ needle2: u8,
+ needle3: u8,
+ haystack: &'h [u8],
+) -> Memchr3<'h> {
+ Memchr3::new(needle1, needle2, needle3, haystack)
+}
+
+/// Returns an iterator over the index of each occurrence of any needle in a
+/// haystack, yielded in reverse order (it is a reversed [`Memchr3`]).
+#[inline]
+pub fn memrchr3_iter(
+ needle1: u8,
+ needle2: u8,
+ needle3: u8,
+ haystack: &[u8],
+) -> Rev<Memchr3<'_>> {
+ Memchr3::new(needle1, needle2, needle3, haystack).rev()
+}
+
+/// An iterator over all occurrences of a single byte in a haystack.
+///
+/// This iterator implements `DoubleEndedIterator`, which means it can also be
+/// used to find occurrences in reverse order.
+///
+/// This iterator is created by the [`memchr_iter`] or [`memrchr_iter`]
+/// functions. It can also be created with the [`Memchr::new`] method.
+///
+/// The lifetime parameter `'h` refers to the lifetime of the haystack being
+/// searched.
+#[derive(Clone, Debug)]
+pub struct Memchr<'h> {
+ needle1: u8,
+ it: crate::arch::generic::memchr::Iter<'h>,
+}
+
+impl<'h> Memchr<'h> {
+ /// Returns an iterator over all occurrences of the needle byte in the
+ /// given haystack.
+ ///
+ /// The iterator returned implements `DoubleEndedIterator`. This means it
+ /// can also be used to find occurrences in reverse order.
+ #[inline]
+ pub fn new(needle1: u8, haystack: &'h [u8]) -> Memchr<'h> {
+ Memchr {
+ needle1,
+ it: crate::arch::generic::memchr::Iter::new(haystack),
+ }
+ }
+}
+
+impl<'h> Iterator for Memchr<'h> {
+ type Item = usize;
+
+ #[inline]
+ fn next(&mut self) -> Option<usize> {
+ // SAFETY: All of our implementations of memchr ensure that any
+ // pointers returned will fall within the start and end bounds, and this
+ // upholds the safety contract of `self.it.next`.
+ unsafe {
+ // NOTE: I attempted to define an enum of previously created
+ // searchers and then switch on those here instead of just
+ // calling `memchr_raw` (or `One::new(..).find_raw(..)`). But
+ // that turned out to have a fair bit of extra overhead when
+ // searching very small haystacks.
+ self.it.next(|s, e| memchr_raw(self.needle1, s, e))
+ }
+ }
+
+ #[inline]
+ fn count(self) -> usize {
+ self.it.count(|s, e| {
+ // SAFETY: We rely on our generic iterator to return valid start
+ // and end pointers.
+ unsafe { count_raw(self.needle1, s, e) }
+ })
+ }
+
+ #[inline]
+ fn size_hint(&self) -> (usize, Option<usize>) {
+ self.it.size_hint()
+ }
+}
+
+impl<'h> DoubleEndedIterator for Memchr<'h> {
+ #[inline]
+ fn next_back(&mut self) -> Option<usize> {
+ // SAFETY: All of our implementations of memchr ensure that any
+ // pointers returned will fall within the start and end bounds, and this
+ // upholds the safety contract of `self.it.next_back`.
+ unsafe { self.it.next_back(|s, e| memrchr_raw(self.needle1, s, e)) }
+ }
+}
+
+impl<'h> core::iter::FusedIterator for Memchr<'h> {}
+
+/// An iterator over all occurrences of two possible bytes in a haystack.
+///
+/// This iterator implements `DoubleEndedIterator`, which means it can also be
+/// used to find occurrences in reverse order.
+///
+/// This iterator is created by the [`memchr2_iter`] or [`memrchr2_iter`]
+/// functions. It can also be created with the [`Memchr2::new`] method.
+///
+/// The lifetime parameter `'h` refers to the lifetime of the haystack being
+/// searched.
+#[derive(Clone, Debug)]
+pub struct Memchr2<'h> {
+ needle1: u8,
+ needle2: u8,
+ it: crate::arch::generic::memchr::Iter<'h>,
+}
+
+impl<'h> Memchr2<'h> {
+ /// Returns an iterator over all occurrences of the needle bytes in the
+ /// given haystack.
+ ///
+ /// The iterator returned implements `DoubleEndedIterator`. This means it
+ /// can also be used to find occurrences in reverse order.
+ #[inline]
+ pub fn new(needle1: u8, needle2: u8, haystack: &'h [u8]) -> Memchr2<'h> {
+ Memchr2 {
+ needle1,
+ needle2,
+ it: crate::arch::generic::memchr::Iter::new(haystack),
+ }
+ }
+}
+
+impl<'h> Iterator for Memchr2<'h> {
+ type Item = usize;
+
+ #[inline]
+ fn next(&mut self) -> Option<usize> {
+ // SAFETY: All of our implementations of memchr ensure that any
+ // pointers returned will fall within the start and end bounds, and this
+ // upholds the safety contract of `self.it.next`.
+ unsafe {
+ self.it.next(|s, e| memchr2_raw(self.needle1, self.needle2, s, e))
+ }
+ }
+
+ #[inline]
+ fn size_hint(&self) -> (usize, Option<usize>) {
+ self.it.size_hint()
+ }
+}
+
+impl<'h> DoubleEndedIterator for Memchr2<'h> {
+ #[inline]
+ fn next_back(&mut self) -> Option<usize> {
+ // SAFETY: All of our implementations of memchr ensure that any
+ // pointers returned will fall within the start and end bounds, and this
+ // upholds the safety contract of `self.it.next_back`.
+ unsafe {
+ self.it.next_back(|s, e| {
+ memrchr2_raw(self.needle1, self.needle2, s, e)
+ })
+ }
+ }
+}
+
+impl<'h> core::iter::FusedIterator for Memchr2<'h> {}
+
+/// An iterator over all occurrences of three possible bytes in a haystack.
+///
+/// This iterator implements `DoubleEndedIterator`, which means it can also be
+/// used to find occurrences in reverse order.
+///
+/// This iterator is created by the [`memchr3_iter`] or [`memrchr3_iter`]
+/// functions. It can also be created with the [`Memchr3::new`] method.
+///
+/// The lifetime parameter `'h` refers to the lifetime of the haystack being
+/// searched.
+#[derive(Clone, Debug)]
+pub struct Memchr3<'h> {
+ needle1: u8,
+ needle2: u8,
+ needle3: u8,
+ it: crate::arch::generic::memchr::Iter<'h>,
+}
+
+impl<'h> Memchr3<'h> {
+ /// Returns an iterator over all occurrences of the needle bytes in the
+ /// given haystack.
+ ///
+ /// The iterator returned implements `DoubleEndedIterator`. This means it
+ /// can also be used to find occurrences in reverse order.
+ #[inline]
+ pub fn new(
+ needle1: u8,
+ needle2: u8,
+ needle3: u8,
+ haystack: &'h [u8],
+ ) -> Memchr3<'h> {
+ Memchr3 {
+ needle1,
+ needle2,
+ needle3,
+ it: crate::arch::generic::memchr::Iter::new(haystack),
+ }
+ }
+}
+
+impl<'h> Iterator for Memchr3<'h> {
+ type Item = usize;
+
+ #[inline]
+ fn next(&mut self) -> Option<usize> {
+ // SAFETY: All of our implementations of memchr ensure that any
+ // pointers returned will fall within the start and end bounds, and this
+ // upholds the safety contract of `self.it.next`.
+ unsafe {
+ self.it.next(|s, e| {
+ memchr3_raw(self.needle1, self.needle2, self.needle3, s, e)
+ })
+ }
+ }
+
+ #[inline]
+ fn size_hint(&self) -> (usize, Option<usize>) {
+ self.it.size_hint()
+ }
+}
+
+impl<'h> DoubleEndedIterator for Memchr3<'h> {
+ #[inline]
+ fn next_back(&mut self) -> Option<usize> {
+ // SAFETY: All of our implementations of memchr ensure that any
+ // pointers returned will fall within the start and end bounds, and this
+ // upholds the safety contract of `self.it.next_back`.
+ unsafe {
+ self.it.next_back(|s, e| {
+ memrchr3_raw(self.needle1, self.needle2, self.needle3, s, e)
+ })
+ }
+ }
+}
+
+impl<'h> core::iter::FusedIterator for Memchr3<'h> {}
+
+/// Like `memchr`, but the haystack is given as raw `start` and `end` pointers.
+///
+/// # Safety
+///
+/// Both pointers must be valid. See `One::find_raw` for the requirements.
+#[inline]
+unsafe fn memchr_raw(
+ needle: u8,
+ start: *const u8,
+ end: *const u8,
+) -> Option<*const u8> {
+ #[cfg(target_arch = "x86_64")]
+ {
+ // x86_64 does CPU feature detection at runtime in order to use AVX2
+ // instructions even when the `avx2` feature isn't enabled at compile
+ // time. This function also handles using a fallback if neither AVX2
+ // nor SSE2 (unusual) are available.
+ crate::arch::x86_64::memchr::memchr_raw(needle, start, end)
+ }
+ #[cfg(target_arch = "wasm32")]
+ {
+ crate::arch::wasm32::memchr::memchr_raw(needle, start, end)
+ }
+ #[cfg(target_arch = "aarch64")]
+ {
+ crate::arch::aarch64::memchr::memchr_raw(needle, start, end)
+ }
+ #[cfg(not(any(
+ target_arch = "x86_64",
+ target_arch = "wasm32",
+ target_arch = "aarch64"
+ )))]
+ {
+ crate::arch::all::memchr::One::new(needle).find_raw(start, end)
+ }
+}
+
+/// Like `memrchr`, but the haystack is given as raw `start` and `end` pointers.
+///
+/// # Safety
+///
+/// Both pointers must be valid. See `One::rfind_raw` for the requirements.
+#[inline]
+unsafe fn memrchr_raw(
+ needle: u8,
+ start: *const u8,
+ end: *const u8,
+) -> Option<*const u8> {
+ #[cfg(target_arch = "x86_64")]
+ {
+ crate::arch::x86_64::memchr::memrchr_raw(needle, start, end)
+ }
+ #[cfg(target_arch = "wasm32")]
+ {
+ crate::arch::wasm32::memchr::memrchr_raw(needle, start, end)
+ }
+ #[cfg(target_arch = "aarch64")]
+ {
+ crate::arch::aarch64::memchr::memrchr_raw(needle, start, end)
+ }
+ #[cfg(not(any(
+ target_arch = "x86_64",
+ target_arch = "wasm32",
+ target_arch = "aarch64"
+ )))]
+ {
+ crate::arch::all::memchr::One::new(needle).rfind_raw(start, end)
+ }
+}
+
+/// Like `memchr2`, but the haystack is given as raw `start` and `end` pointers.
+///
+/// # Safety
+///
+/// Both pointers must be valid. See `Two::find_raw` for the requirements.
+#[inline]
+unsafe fn memchr2_raw(
+ needle1: u8,
+ needle2: u8,
+ start: *const u8,
+ end: *const u8,
+) -> Option<*const u8> {
+ #[cfg(target_arch = "x86_64")]
+ {
+ crate::arch::x86_64::memchr::memchr2_raw(needle1, needle2, start, end)
+ }
+ #[cfg(target_arch = "wasm32")]
+ {
+ crate::arch::wasm32::memchr::memchr2_raw(needle1, needle2, start, end)
+ }
+ #[cfg(target_arch = "aarch64")]
+ {
+ crate::arch::aarch64::memchr::memchr2_raw(needle1, needle2, start, end)
+ }
+ #[cfg(not(any(
+ target_arch = "x86_64",
+ target_arch = "wasm32",
+ target_arch = "aarch64"
+ )))]
+ {
+ crate::arch::all::memchr::Two::new(needle1, needle2)
+ .find_raw(start, end)
+ }
+}
+
+/// Like `memrchr2`, but the haystack is given as raw `start`/`end` pointers.
+///
+/// # Safety
+///
+/// Both pointers must be valid. See `Two::rfind_raw` for the requirements.
+#[inline]
+unsafe fn memrchr2_raw(
+ needle1: u8,
+ needle2: u8,
+ start: *const u8,
+ end: *const u8,
+) -> Option<*const u8> {
+ #[cfg(target_arch = "x86_64")]
+ {
+ crate::arch::x86_64::memchr::memrchr2_raw(needle1, needle2, start, end)
+ }
+ #[cfg(target_arch = "wasm32")]
+ {
+ crate::arch::wasm32::memchr::memrchr2_raw(needle1, needle2, start, end)
+ }
+ #[cfg(target_arch = "aarch64")]
+ {
+ crate::arch::aarch64::memchr::memrchr2_raw(
+ needle1, needle2, start, end,
+ )
+ }
+ #[cfg(not(any(
+ target_arch = "x86_64",
+ target_arch = "wasm32",
+ target_arch = "aarch64"
+ )))]
+ {
+ crate::arch::all::memchr::Two::new(needle1, needle2)
+ .rfind_raw(start, end)
+ }
+}
+
+/// Like `memchr3`, but the haystack is given as raw `start` and `end` pointers.
+///
+/// # Safety
+///
+/// Both pointers must be valid. See `Three::find_raw` for the requirements.
+#[inline]
+unsafe fn memchr3_raw(
+ needle1: u8,
+ needle2: u8,
+ needle3: u8,
+ start: *const u8,
+ end: *const u8,
+) -> Option<*const u8> {
+ #[cfg(target_arch = "x86_64")]
+ {
+ crate::arch::x86_64::memchr::memchr3_raw(
+ needle1, needle2, needle3, start, end,
+ )
+ }
+ #[cfg(target_arch = "wasm32")]
+ {
+ crate::arch::wasm32::memchr::memchr3_raw(
+ needle1, needle2, needle3, start, end,
+ )
+ }
+ #[cfg(target_arch = "aarch64")]
+ {
+ crate::arch::aarch64::memchr::memchr3_raw(
+ needle1, needle2, needle3, start, end,
+ )
+ }
+ #[cfg(not(any(
+ target_arch = "x86_64",
+ target_arch = "wasm32",
+ target_arch = "aarch64"
+ )))]
+ {
+ crate::arch::all::memchr::Three::new(needle1, needle2, needle3)
+ .find_raw(start, end)
+ }
+}
+
+/// Like `memrchr3`, but the haystack is given as raw `start`/`end` pointers.
+///
+/// # Safety
+///
+/// Both pointers must be valid. See `Three::rfind_raw` for the requirements.
+#[inline]
+unsafe fn memrchr3_raw(
+ needle1: u8,
+ needle2: u8,
+ needle3: u8,
+ start: *const u8,
+ end: *const u8,
+) -> Option<*const u8> {
+ #[cfg(target_arch = "x86_64")]
+ {
+ crate::arch::x86_64::memchr::memrchr3_raw(
+ needle1, needle2, needle3, start, end,
+ )
+ }
+ #[cfg(target_arch = "wasm32")]
+ {
+ crate::arch::wasm32::memchr::memrchr3_raw(
+ needle1, needle2, needle3, start, end,
+ )
+ }
+ #[cfg(target_arch = "aarch64")]
+ {
+ crate::arch::aarch64::memchr::memrchr3_raw(
+ needle1, needle2, needle3, start, end,
+ )
+ }
+ #[cfg(not(any(
+ target_arch = "x86_64",
+ target_arch = "wasm32",
+ target_arch = "aarch64"
+ )))]
+ {
+ crate::arch::all::memchr::Three::new(needle1, needle2, needle3)
+ .rfind_raw(start, end)
+ }
+}
+
+/// Counts all bytes matching `needle`, with the haystack given as raw pointers.
+///
+/// # Safety
+///
+/// Both pointers must be valid. See `One::count_raw` for the requirements.
+#[inline]
+unsafe fn count_raw(needle: u8, start: *const u8, end: *const u8) -> usize {
+ #[cfg(target_arch = "x86_64")]
+ {
+ crate::arch::x86_64::memchr::count_raw(needle, start, end)
+ }
+ #[cfg(target_arch = "wasm32")]
+ {
+ crate::arch::wasm32::memchr::count_raw(needle, start, end)
+ }
+ #[cfg(target_arch = "aarch64")]
+ {
+ crate::arch::aarch64::memchr::count_raw(needle, start, end)
+ }
+ #[cfg(not(any(
+ target_arch = "x86_64",
+ target_arch = "wasm32",
+ target_arch = "aarch64"
+ )))]
+ {
+ crate::arch::all::memchr::One::new(needle).count_raw(start, end)
+ }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::tests::memchr::Runner;
+
+    // Each test hands one of this module's public routines to the shared
+    // `Runner`. A closure returns `None` when the runner supplies fewer
+    // needles than the routine under test takes.
+    // NOTE(review): the argument to `Runner::new` is presumably the number
+    // of needles per case -- confirm against `crate::tests::memchr`.
+
+    #[test]
+    fn forward1_iter() {
+        Runner::new(1)
+            .forward_iter(|hay, ns| Some(memchr_iter(ns[0], hay).collect()))
+    }
+
+    #[test]
+    fn forward1_oneshot() {
+        Runner::new(1).forward_oneshot(|hay, ns| Some(memchr(ns[0], hay)))
+    }
+
+    #[test]
+    fn reverse1_iter() {
+        Runner::new(1)
+            .reverse_iter(|hay, ns| Some(memrchr_iter(ns[0], hay).collect()))
+    }
+
+    #[test]
+    fn reverse1_oneshot() {
+        Runner::new(1).reverse_oneshot(|hay, ns| Some(memrchr(ns[0], hay)))
+    }
+
+    #[test]
+    fn count1_iter() {
+        Runner::new(1)
+            .count_iter(|hay, ns| Some(memchr_iter(ns[0], hay).count()))
+    }
+
+    #[test]
+    fn forward2_iter() {
+        Runner::new(2).forward_iter(|hay, ns| {
+            let (a, b) = (*ns.get(0)?, *ns.get(1)?);
+            Some(memchr2_iter(a, b, hay).collect())
+        })
+    }
+
+    #[test]
+    fn forward2_oneshot() {
+        Runner::new(2).forward_oneshot(|hay, ns| {
+            let (a, b) = (*ns.get(0)?, *ns.get(1)?);
+            Some(memchr2(a, b, hay))
+        })
+    }
+
+    #[test]
+    fn reverse2_iter() {
+        Runner::new(2).reverse_iter(|hay, ns| {
+            let (a, b) = (*ns.get(0)?, *ns.get(1)?);
+            Some(memrchr2_iter(a, b, hay).collect())
+        })
+    }
+
+    #[test]
+    fn reverse2_oneshot() {
+        Runner::new(2).reverse_oneshot(|hay, ns| {
+            let (a, b) = (*ns.get(0)?, *ns.get(1)?);
+            Some(memrchr2(a, b, hay))
+        })
+    }
+
+    #[test]
+    fn forward3_iter() {
+        Runner::new(3).forward_iter(|hay, ns| {
+            let (a, b, c) = (*ns.get(0)?, *ns.get(1)?, *ns.get(2)?);
+            Some(memchr3_iter(a, b, c, hay).collect())
+        })
+    }
+
+    #[test]
+    fn forward3_oneshot() {
+        Runner::new(3).forward_oneshot(|hay, ns| {
+            let (a, b, c) = (*ns.get(0)?, *ns.get(1)?, *ns.get(2)?);
+            Some(memchr3(a, b, c, hay))
+        })
+    }
+
+    #[test]
+    fn reverse3_iter() {
+        Runner::new(3).reverse_iter(|hay, ns| {
+            let (a, b, c) = (*ns.get(0)?, *ns.get(1)?, *ns.get(2)?);
+            Some(memrchr3_iter(a, b, c, hay).collect())
+        })
+    }
+
+    #[test]
+    fn reverse3_oneshot() {
+        Runner::new(3).reverse_oneshot(|hay, ns| {
+            let (a, b, c) = (*ns.get(0)?, *ns.get(1)?, *ns.get(2)?);
+            Some(memrchr3(a, b, c, hay))
+        })
+    }
+
+    // Regression guard for the auto traits of the iterator types. Before
+    // memchr 2.6 the iterators were Send and Sync; 2.6 switched them to raw
+    // pointers internally without adding explicit Send/Sync impls, which
+    // regressed the public API. This test fails to compile if that ever
+    // happens again.
+    //
+    // See: https://github.com/BurntSushi/memchr/issues/133
+    #[test]
+    fn sync_regression() {
+        use core::panic::{RefUnwindSafe, UnwindSafe};
+
+        fn assert_send_sync<T>()
+        where
+            T: Send + Sync + UnwindSafe + RefUnwindSafe,
+        {
+        }
+
+        assert_send_sync::<Memchr>();
+        assert_send_sync::<Memchr2>();
+        assert_send_sync::<Memchr3>();
+    }
+}
+
1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 +95 +96 +97 +98 +99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 +247 +248 +249 +250 +251 +252 +253 +254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 +265 +266 +267 +268 +269 +270 +271 +272 +273 +274 +275 +276 +277 +278 +279 +280 +281 +282 +283 +284 +285 +286 +287 +288 +289 +290 +291 +292 +293 +294 +295 +296 +297 +298 +299 +300 +301 +302 +303 +304 +305 +306 +307 +308 +309 +310 +311 +312 +313 +314 +315 +316 +317 +318 +319 +320 +321 +322 +323 +324 +325 +326 +327 +328 +329 +330 +331 +332 +333 +334 +335 +336 +337 +338 +339 +340 +341 +342 +343 +344 +345 +346 +347 +348 +349 +350 +351 +352 +353 +354 +355 +356 +357 +358 +359 +360 +361 +362 +363 +364 +365 +366 +367 +368 +369 +370 +371 +372 +373 +374 +375 +376 +377 +378 +379 +380 +381 +382 +383 +384 +385 +386 +387 +388 +389 +390 +391 +392 +393 +394 +395 +396 +397 +398 +399 +400 +401 +402 +403 +404 +405 +406 +407 +408 +409 +410 +411 +412 +413 +414 +415 +416 +417 +418 +419 +420 +421 
+422 +423 +424 +425 +426 +427 +428 +429 +430 +431 +432 +433 +434 +435 +436 +437 +438 +439 +440 +441 +442 +443 +444 +445 +446 +447 +448 +449 +450 +451 +452 +453 +454 +455 +456 +457 +458 +459 +460 +461 +462 +463 +464 +465 +466 +467 +468 +469 +470 +471 +472 +473 +474 +475 +476 +477 +478 +479 +480 +481 +482 +483 +484 +485 +486 +487 +488 +489 +490 +491 +492 +493 +494 +495 +496 +497 +498 +499 +500 +501 +502 +503 +504 +505 +506 +507 +508 +509 +510 +511 +512 +513 +514 +515 +516 +517 +518 +519 +520 +521 +522 +523 +524 +525 +526 +527 +528 +529 +530 +531 +532 +533 +534 +535 +536 +537 +538 +539 +540 +541 +542 +543 +544 +545 +546 +547 +548 +549 +550 +551 +552 +553 +554 +555 +556 +557 +558 +559 +560 +561 +562 +563 +564 +565 +566 +567 +568 +569 +570 +571 +572 +573 +574 +575 +576 +577 +578 +579 +580 +581 +582 +583 +584 +585 +586 +587 +588 +589 +590 +591 +592 +593 +594 +595 +596 +597 +598 +599 +600 +601 +602 +603 +604 +605 +606 +607 +608 +609 +610 +611 +612 +613 +614 +615 +616 +617 +618 +619 +620 +621 +622 +623 +624 +625 +626 +627 +628 +629 +630 +631 +632 +633 +634 +635 +636 +637 +638 +639 +640 +641 +642 +643 +644 +645 +646 +647 +648 +649 +650 +651 +652 +653 +654 +655 +656 +657 +658 +659 +660 +661 +662 +663 +664 +665 +666 +667 +668 +669 +670 +671 +672 +673 +674 +675 +676 +677 +678 +679 +680 +681 +682 +683 +684 +685 +686 +687 +688 +689 +690 +691 +692 +693 +694 +695 +696 +697 +698 +699 +700 +701 +702 +703 +704 +705 +706 +707 +708 +709 +710 +711 +712 +713 +714 +715 +716 +717 +718 +719 +720 +721 +722 +723 +724 +725 +726 +727 +728 +729 +730 +731 +732 +733 +734 +735 +736 +737 +
/*!
+This module provides forward and reverse substring search routines.
+
+Unlike the standard library's substring search routines, these work on
+arbitrary bytes. For all non-empty needles, these routines will report exactly
+the same values as the corresponding routines in the standard library. For
+the empty needle, the standard library reports matches only at valid UTF-8
+boundaries, whereas these routines will report matches at every position.
+
+Other than being able to work on arbitrary bytes, the primary reason to prefer
+these routines over the standard library routines is that these will generally
+be faster. In some cases, significantly so.
+
+# Example: iterating over substring matches
+
+This example shows how to use [`find_iter`] to find occurrences of a substring
+in a haystack.
+
+```
+use memchr::memmem;
+
+let haystack = b"foo bar foo baz foo";
+
+let mut it = memmem::find_iter(haystack, "foo");
+assert_eq!(Some(0), it.next());
+assert_eq!(Some(8), it.next());
+assert_eq!(Some(16), it.next());
+assert_eq!(None, it.next());
+```
+
+# Example: iterating over substring matches in reverse
+
+This example shows how to use [`rfind_iter`] to find occurrences of a substring
+in a haystack starting from the end of the haystack.
+
+**NOTE:** This module does not implement double ended iterators, so reverse
+searches aren't done by calling `rev` on a forward iterator.
+
+```
+use memchr::memmem;
+
+let haystack = b"foo bar foo baz foo";
+
+let mut it = memmem::rfind_iter(haystack, "foo");
+assert_eq!(Some(16), it.next());
+assert_eq!(Some(8), it.next());
+assert_eq!(Some(0), it.next());
+assert_eq!(None, it.next());
+```
+
+# Example: repeating a search for the same needle
+
+It may be possible for the overhead of constructing a substring searcher to be
+measurable in some workloads. In cases where the same needle is used to search
+many haystacks, it is possible to do construction once and thus to avoid it for
+subsequent searches. This can be done with a [`Finder`] (or a [`FinderRev`] for
+reverse searches).
+
+```
+use memchr::memmem;
+
+let finder = memmem::Finder::new("foo");
+
+assert_eq!(Some(4), finder.find(b"baz foo quux"));
+assert_eq!(None, finder.find(b"quux baz bar"));
+```
+*/
+
+pub use crate::memmem::searcher::PrefilterConfig as Prefilter;
+
+// This is exported here for use in the crate::arch::all::twoway
+// implementation. This is essentially an abstraction breaker. Namely, the
+// public API of twoway doesn't support providing a prefilter, but its crate
+// internal API does. The main reason for this is that I didn't want to do the
+// API design required to support it without a concrete use case.
+pub(crate) use crate::memmem::searcher::Pre;
+
+use crate::{
+ arch::all::{
+ packedpair::{DefaultFrequencyRank, HeuristicFrequencyRank},
+ rabinkarp,
+ },
+ cow::CowBytes,
+ memmem::searcher::{PrefilterState, Searcher, SearcherRev},
+};
+
+mod searcher;
+
+/// Creates an iterator that yields the starting offset of every
+/// non-overlapping occurrence of `needle` in `haystack`, from front to back.
+///
+/// # Complexity
+///
+/// Worst case time is linear in both the needle and the haystack, i.e.
+/// `O(needle.len() + haystack.len())`.
+///
+/// Worst case space is constant.
+///
+/// # Examples
+///
+/// Basic usage:
+///
+/// ```
+/// use memchr::memmem;
+///
+/// let haystack = b"foo bar foo baz foo";
+/// let mut it = memmem::find_iter(haystack, b"foo");
+/// assert_eq!(Some(0), it.next());
+/// assert_eq!(Some(8), it.next());
+/// assert_eq!(Some(16), it.next());
+/// assert_eq!(None, it.next());
+/// ```
+#[inline]
+pub fn find_iter<'h, 'n, N: 'n + ?Sized + AsRef<[u8]>>(
+    haystack: &'h [u8],
+    needle: &'n N,
+) -> FindIter<'h, 'n> {
+    let finder = Finder::new(needle);
+    FindIter::new(haystack, finder)
+}
+
+/// Creates an iterator that yields the starting offset of every
+/// non-overlapping occurrence of `needle` in `haystack`, from back to front.
+///
+/// # Complexity
+///
+/// Worst case time is linear in both the needle and the haystack, i.e.
+/// `O(needle.len() + haystack.len())`.
+///
+/// Worst case space is constant.
+///
+/// # Examples
+///
+/// Basic usage:
+///
+/// ```
+/// use memchr::memmem;
+///
+/// let haystack = b"foo bar foo baz foo";
+/// let mut it = memmem::rfind_iter(haystack, b"foo");
+/// assert_eq!(Some(16), it.next());
+/// assert_eq!(Some(8), it.next());
+/// assert_eq!(Some(0), it.next());
+/// assert_eq!(None, it.next());
+/// ```
+#[inline]
+pub fn rfind_iter<'h, 'n, N: 'n + ?Sized + AsRef<[u8]>>(
+    haystack: &'h [u8],
+    needle: &'n N,
+) -> FindRevIter<'h, 'n> {
+    let finder = FinderRev::new(needle);
+    FindRevIter::new(haystack, finder)
+}
+
+/// Finds the first occurrence of `needle` in `haystack` and returns the
+/// offset at which it starts, or `None` if there is no occurrence.
+///
+/// If you are searching for the same needle in many different small
+/// haystacks, it may be faster to build a [`Finder`] once and reuse it for
+/// every search.
+///
+/// # Complexity
+///
+/// Worst case time is linear in both the needle and the haystack, i.e.
+/// `O(needle.len() + haystack.len())`.
+///
+/// Worst case space is constant.
+///
+/// # Examples
+///
+/// Basic usage:
+///
+/// ```
+/// use memchr::memmem;
+///
+/// let haystack = b"foo bar baz";
+/// assert_eq!(Some(0), memmem::find(haystack, b"foo"));
+/// assert_eq!(Some(4), memmem::find(haystack, b"bar"));
+/// assert_eq!(None, memmem::find(haystack, b"quux"));
+/// ```
+#[inline]
+pub fn find(haystack: &[u8], needle: &[u8]) -> Option<usize> {
+    // Haystacks of 64 bytes or more get the full `Finder`; anything shorter
+    // is handed straight to Rabin-Karp.
+    if haystack.len() >= 64 {
+        return Finder::new(needle).find(haystack);
+    }
+    rabinkarp::Finder::new(needle).find(haystack, needle)
+}
+
+/// Finds the last occurrence of `needle` in `haystack` and returns the
+/// offset at which it starts, or `None` if there is no occurrence.
+///
+/// If you are searching for the same needle in many different small
+/// haystacks, it may be faster to build a [`FinderRev`] once and reuse it
+/// for every search.
+///
+/// # Complexity
+///
+/// Worst case time is linear in both the needle and the haystack, i.e.
+/// `O(needle.len() + haystack.len())`.
+///
+/// Worst case space is constant.
+///
+/// # Examples
+///
+/// Basic usage:
+///
+/// ```
+/// use memchr::memmem;
+///
+/// let haystack = b"foo bar baz";
+/// assert_eq!(Some(0), memmem::rfind(haystack, b"foo"));
+/// assert_eq!(Some(4), memmem::rfind(haystack, b"bar"));
+/// assert_eq!(Some(8), memmem::rfind(haystack, b"ba"));
+/// assert_eq!(None, memmem::rfind(haystack, b"quux"));
+/// ```
+#[inline]
+pub fn rfind(haystack: &[u8], needle: &[u8]) -> Option<usize> {
+    // Haystacks of 64 bytes or more get the full `FinderRev`; anything
+    // shorter is handed straight to reverse Rabin-Karp.
+    if haystack.len() >= 64 {
+        return FinderRev::new(needle).rfind(haystack);
+    }
+    rabinkarp::FinderRev::new(needle).rfind(haystack, needle)
+}
+
+/// An iterator over non-overlapping substring matches, front to back.
+///
+/// Matches are reported by the byte offset at which they begin.
+///
+/// `'h` is the lifetime of the haystack while `'n` is the lifetime of the
+/// needle.
+#[derive(Debug)]
+pub struct FindIter<'h, 'n> {
+ haystack: &'h [u8], // the whole haystack; `pos` is an offset into it
+ prestate: PrefilterState, // passed mutably to the searcher on every `next` call
+ finder: Finder<'n>, // supplies both the needle and the searcher
+ pos: usize, // offset in `haystack` at which the next search starts
+}
+
+impl<'h, 'n> FindIter<'h, 'n> {
+    /// Builds an iterator over `haystack` that starts searching at offset 0
+    /// with freshly initialized prefilter state.
+    #[inline(always)]
+    pub(crate) fn new(
+        haystack: &'h [u8],
+        finder: Finder<'n>,
+    ) -> FindIter<'h, 'n> {
+        FindIter { haystack, prestate: PrefilterState::new(), finder, pos: 0 }
+    }
+
+    /// Turns this iterator into one that owns its finder's needle, so that
+    /// it no longer borrows the needle.
+    ///
+    /// For an iterator that already owns its needle this is a no-op;
+    /// otherwise the needle is copied. The haystack is still borrowed.
+    ///
+    /// This is only available when the `alloc` feature is enabled.
+    #[cfg(feature = "alloc")]
+    #[inline]
+    pub fn into_owned(self) -> FindIter<'h, 'static> {
+        let FindIter { haystack, prestate, finder, pos } = self;
+        FindIter { haystack, prestate, finder: finder.into_owned(), pos }
+    }
+}
+
+impl<'h, 'n> Iterator for FindIter<'h, 'n> {
+    type Item = usize;
+
+    /// Returns the start offset of the next match at or after the current
+    /// position, or `None` when the position is past the end of the haystack
+    /// or no further match exists.
+    fn next(&mut self) -> Option<usize> {
+        let needle = self.finder.needle();
+        let rest = self.haystack.get(self.pos..)?;
+        let found =
+            self.finder.searcher.find(&mut self.prestate, rest, needle)?;
+
+        // `found` is relative to `rest`; convert it to a haystack offset.
+        let start = self.pos + found;
+        // Skip past the match. An empty needle still advances by one byte so
+        // that the iterator makes progress.
+        let step = if needle.is_empty() { 1 } else { needle.len() };
+        self.pos = start + step;
+
+        Some(start)
+    }
+
+    /// Bounds the number of matches still to come: non-overlapping matches
+    /// cannot outnumber `remaining / needle_len`, while an empty needle
+    /// matches at every remaining position, including the very end.
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        let remaining = match self.haystack.len().checked_sub(self.pos) {
+            Some(n) => n,
+            // The position has moved past the end: nothing left to yield.
+            None => return (0, Some(0)),
+        };
+        let needle_len = self.finder.needle().len();
+        if needle_len == 0 {
+            return (remaining.saturating_add(1), remaining.checked_add(1));
+        }
+        (0, Some(remaining / needle_len))
+    }
+}
+
+/// An iterator over non-overlapping substring matches in reverse, back to front.
+///
+/// Matches are reported by the byte offset at which they begin.
+///
+/// `'h` is the lifetime of the haystack while `'n` is the lifetime of the
+/// needle.
+#[derive(Debug)]
+pub struct FindRevIter<'h, 'n> {
+ haystack: &'h [u8], // the whole haystack; each search looks at the prefix `..pos`
+ finder: FinderRev<'n>,
+ /// Exclusive end of the haystack prefix that the next search covers. When searching with an empty needle, this gets set to `None` after
+ /// we've yielded the last element at `0`.
+ pos: Option<usize>,
+}
+
+impl<'h, 'n> FindRevIter<'h, 'n> {
+    /// Builds a reverse iterator over `haystack` whose first search covers
+    /// the entire haystack.
+    #[inline(always)]
+    pub(crate) fn new(
+        haystack: &'h [u8],
+        finder: FinderRev<'n>,
+    ) -> FindRevIter<'h, 'n> {
+        let pos = Some(haystack.len());
+        FindRevIter { haystack, finder, pos }
+    }
+
+    /// Convert this iterator into its owned variant, such that it no longer
+    /// borrows the finder and needle.
+    ///
+    /// If this is already an owned iterator, then this is a no-op. Otherwise,
+    /// this copies the needle.
+    ///
+    /// This is only available when the `alloc` feature is enabled.
+    #[cfg(feature = "alloc")]
+    #[inline]
+    pub fn into_owned(self) -> FindRevIter<'h, 'static> {
+        let FindRevIter { haystack, finder, pos } = self;
+        FindRevIter { haystack, finder: finder.into_owned(), pos }
+    }
+}
+
+impl<'h, 'n> Iterator for FindRevIter<'h, 'n> {
+    type Item = usize;
+
+    /// Returns the start offset of the last match inside the not yet
+    /// searched prefix of the haystack, or `None` once the iterator is
+    /// exhausted or that prefix holds no match.
+    fn next(&mut self) -> Option<usize> {
+        let end = self.pos?;
+        let start = self.finder.rfind(&self.haystack[..end])?;
+        // A match that starts exactly at `end` has zero length, so step back
+        // one byte to make progress; `checked_sub` yields `None` after the
+        // match at offset 0, which ends the iteration. Any other match
+        // simply becomes the end of the next search window.
+        self.pos = if start == end { end.checked_sub(1) } else { Some(start) };
+        Some(start)
+    }
+}
+
+/// A single substring searcher fixed to a particular needle.
+///
+/// The purpose of this type is to permit callers to construct a substring
+/// searcher that can be used to search haystacks without the overhead of
+/// constructing the searcher in the first place. This is a somewhat niche
+/// concern when it's necessary to re-use the same needle to search multiple
+/// different haystacks with as little overhead as possible. In general, using
+/// [`find`] is good enough, but `Finder` is useful when you can meaningfully
+/// observe searcher construction time in a profile.
+///
+/// When the `alloc` feature is enabled, then this type has an `into_owned`
+/// version which permits building a `Finder` that is not connected to
+/// the lifetime of its needle.
+#[derive(Clone, Debug)]
+pub struct Finder<'n> {
+ needle: CowBytes<'n>, // the needle, either borrowed for `'n` or owned (see `into_owned`)
+ searcher: Searcher, // built from the needle when the finder is constructed
+}
+
+impl<'n> Finder<'n> {
+    /// Create a new finder for the given needle.
+    ///
+    /// This uses the default [`FinderBuilder`] settings.
+    #[inline]
+    pub fn new<B: ?Sized + AsRef<[u8]>>(needle: &'n B) -> Finder<'n> {
+        FinderBuilder::new().build_forward(needle)
+    }
+
+    /// Returns the index of the first occurrence of this needle in the given
+    /// haystack.
+    ///
+    /// # Complexity
+    ///
+    /// This routine is guaranteed to have worst case linear time complexity
+    /// with respect to both the needle and the haystack. That is, this runs
+    /// in `O(needle.len() + haystack.len())` time.
+    ///
+    /// This routine is also guaranteed to have worst case constant space
+    /// complexity.
+    ///
+    /// # Examples
+    ///
+    /// Basic usage:
+    ///
+    /// ```
+    /// use memchr::memmem::Finder;
+    ///
+    /// let haystack = b"foo bar baz";
+    /// assert_eq!(Some(0), Finder::new("foo").find(haystack));
+    /// assert_eq!(Some(4), Finder::new("bar").find(haystack));
+    /// assert_eq!(None, Finder::new("quux").find(haystack));
+    /// ```
+    #[inline]
+    pub fn find(&self, haystack: &[u8]) -> Option<usize> {
+        // Every one-shot search starts from fresh prefilter state.
+        let mut prestate = PrefilterState::new();
+        let needle = self.needle.as_slice();
+        self.searcher.find(&mut prestate, haystack, needle)
+    }
+
+    /// Returns an iterator over all occurrences of a substring in a haystack.
+    ///
+    /// # Complexity
+    ///
+    /// This routine is guaranteed to have worst case linear time complexity
+    /// with respect to both the needle and the haystack. That is, this runs
+    /// in `O(needle.len() + haystack.len())` time.
+    ///
+    /// This routine is also guaranteed to have worst case constant space
+    /// complexity.
+    ///
+    /// # Examples
+    ///
+    /// Basic usage:
+    ///
+    /// ```
+    /// use memchr::memmem::Finder;
+    ///
+    /// let haystack = b"foo bar foo baz foo";
+    /// let finder = Finder::new(b"foo");
+    /// let mut it = finder.find_iter(haystack);
+    /// assert_eq!(Some(0), it.next());
+    /// assert_eq!(Some(8), it.next());
+    /// assert_eq!(Some(16), it.next());
+    /// assert_eq!(None, it.next());
+    /// ```
+    #[inline]
+    pub fn find_iter<'a, 'h>(
+        &'a self,
+        haystack: &'h [u8],
+    ) -> FindIter<'h, 'a> {
+        FindIter::new(haystack, self.as_ref())
+    }
+
+    /// Convert this finder into its owned variant, such that it no longer
+    /// borrows the needle.
+    ///
+    /// If this is already an owned finder, then this is a no-op. Otherwise,
+    /// this copies the needle.
+    ///
+    /// This is only available when the `alloc` feature is enabled.
+    #[cfg(feature = "alloc")]
+    #[inline]
+    pub fn into_owned(self) -> Finder<'static> {
+        // `self` is consumed, so the searcher is moved rather than cloned.
+        let Finder { needle, searcher } = self;
+        Finder { needle: needle.into_owned(), searcher }
+    }
+
+    /// Convert this finder into its borrowed variant.
+    ///
+    /// This is primarily useful if your finder is owned and you'd like to
+    /// store its borrowed variant in some intermediate data structure.
+    ///
+    /// Note that the lifetime parameter of the returned finder is tied to the
+    /// lifetime of `self`, and may be shorter than the `'n` lifetime of the
+    /// needle itself. Namely, a finder's needle can be either borrowed or
+    /// owned, so the lifetime of the needle returned must necessarily be the
+    /// shorter of the two.
+    #[inline]
+    pub fn as_ref(&self) -> Finder<'_> {
+        Finder {
+            needle: CowBytes::new(self.needle()),
+            // Only `&self` is available here, so the searcher has to be
+            // cloned.
+            searcher: self.searcher.clone(),
+        }
+    }
+
+    /// Returns the needle that this finder searches for.
+    ///
+    /// Note that the lifetime of the needle returned is tied to the lifetime
+    /// of the finder, and may be shorter than the `'n` lifetime. Namely, a
+    /// finder's needle can be either borrowed or owned, so the lifetime of the
+    /// needle returned must necessarily be the shorter of the two.
+    #[inline]
+    pub fn needle(&self) -> &[u8] {
+        self.needle.as_slice()
+    }
+}
+
+/// A single substring reverse searcher fixed to a particular needle.
+///
+/// The purpose of this type is to permit callers to construct a substring
+/// searcher that can be used to search haystacks without the overhead of
+/// constructing the searcher in the first place. This is a somewhat niche
+/// concern when it's necessary to re-use the same needle to search multiple
+/// different haystacks with as little overhead as possible. In general,
+/// using [`rfind`] is good enough, but `FinderRev` is useful when you can
+/// meaningfully observe searcher construction time in a profile.
+///
+/// When the `alloc` feature is enabled, then this type has an `into_owned`
+/// version which permits building a `FinderRev` that is not connected to
+/// the lifetime of its needle.
+#[derive(Clone, Debug)]
+pub struct FinderRev<'n> {
+ needle: CowBytes<'n>, // the needle, either borrowed for `'n` or owned (see `into_owned`)
+ searcher: SearcherRev, // built from the needle when the finder is constructed
+}
+
+impl<'n> FinderRev<'n> {
+    /// Create a new reverse finder for the given needle.
+    ///
+    /// This uses the default [`FinderBuilder`] settings.
+    #[inline]
+    pub fn new<B: ?Sized + AsRef<[u8]>>(needle: &'n B) -> FinderRev<'n> {
+        FinderBuilder::new().build_reverse(needle)
+    }
+
+    /// Returns the index of the last occurrence of this needle in the given
+    /// haystack.
+    ///
+    /// The haystack may be any type that can be cheaply converted into a
+    /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`.
+    ///
+    /// # Complexity
+    ///
+    /// This routine is guaranteed to have worst case linear time complexity
+    /// with respect to both the needle and the haystack. That is, this runs
+    /// in `O(needle.len() + haystack.len())` time.
+    ///
+    /// This routine is also guaranteed to have worst case constant space
+    /// complexity.
+    ///
+    /// # Examples
+    ///
+    /// Basic usage:
+    ///
+    /// ```
+    /// use memchr::memmem::FinderRev;
+    ///
+    /// let haystack = b"foo bar baz";
+    /// assert_eq!(Some(0), FinderRev::new("foo").rfind(haystack));
+    /// assert_eq!(Some(4), FinderRev::new("bar").rfind(haystack));
+    /// assert_eq!(None, FinderRev::new("quux").rfind(haystack));
+    /// ```
+    pub fn rfind<B: AsRef<[u8]>>(&self, haystack: B) -> Option<usize> {
+        self.searcher.rfind(haystack.as_ref(), self.needle.as_slice())
+    }
+
+    /// Returns a reverse iterator over all occurrences of a substring in a
+    /// haystack.
+    ///
+    /// # Complexity
+    ///
+    /// This routine is guaranteed to have worst case linear time complexity
+    /// with respect to both the needle and the haystack. That is, this runs
+    /// in `O(needle.len() + haystack.len())` time.
+    ///
+    /// This routine is also guaranteed to have worst case constant space
+    /// complexity.
+    ///
+    /// # Examples
+    ///
+    /// Basic usage:
+    ///
+    /// ```
+    /// use memchr::memmem::FinderRev;
+    ///
+    /// let haystack = b"foo bar foo baz foo";
+    /// let finder = FinderRev::new(b"foo");
+    /// let mut it = finder.rfind_iter(haystack);
+    /// assert_eq!(Some(16), it.next());
+    /// assert_eq!(Some(8), it.next());
+    /// assert_eq!(Some(0), it.next());
+    /// assert_eq!(None, it.next());
+    /// ```
+    #[inline]
+    pub fn rfind_iter<'a, 'h>(
+        &'a self,
+        haystack: &'h [u8],
+    ) -> FindRevIter<'h, 'a> {
+        FindRevIter::new(haystack, self.as_ref())
+    }
+
+    /// Convert this finder into its owned variant, such that it no longer
+    /// borrows the needle.
+    ///
+    /// If this is already an owned finder, then this is a no-op. Otherwise,
+    /// this copies the needle.
+    ///
+    /// This is only available when the `alloc` feature is enabled.
+    #[cfg(feature = "alloc")]
+    #[inline]
+    pub fn into_owned(self) -> FinderRev<'static> {
+        // `self` is consumed, so the searcher is moved rather than cloned.
+        let FinderRev { needle, searcher } = self;
+        FinderRev { needle: needle.into_owned(), searcher }
+    }
+
+    /// Convert this finder into its borrowed variant.
+    ///
+    /// This is primarily useful if your finder is owned and you'd like to
+    /// store its borrowed variant in some intermediate data structure.
+    ///
+    /// Note that the lifetime parameter of the returned finder is tied to the
+    /// lifetime of `self`, and may be shorter than the `'n` lifetime of the
+    /// needle itself. Namely, a finder's needle can be either borrowed or
+    /// owned, so the lifetime of the needle returned must necessarily be the
+    /// shorter of the two.
+    #[inline]
+    pub fn as_ref(&self) -> FinderRev<'_> {
+        FinderRev {
+            needle: CowBytes::new(self.needle()),
+            // Only `&self` is available here, so the searcher has to be
+            // cloned.
+            searcher: self.searcher.clone(),
+        }
+    }
+
+    /// Returns the needle that this finder searches for.
+    ///
+    /// Note that the lifetime of the needle returned is tied to the lifetime
+    /// of the finder, and may be shorter than the `'n` lifetime. Namely, a
+    /// finder's needle can be either borrowed or owned, so the lifetime of the
+    /// needle returned must necessarily be the shorter of the two.
+    #[inline]
+    pub fn needle(&self) -> &[u8] {
+        self.needle.as_slice()
+    }
+}
+
+/// A builder for constructing non-default forward or reverse memmem finders.
+///
+/// A builder is primarily useful for configuring a substring searcher.
+/// Currently, the only configuration exposed is the ability to disable
+/// heuristic prefilters used to speed up certain searches.
+#[derive(Clone, Debug, Default)]
+pub struct FinderBuilder {
+ prefilter: Prefilter, // handed to `Searcher::new` by the forward builders; `build_reverse` does not read it
+}
+
+impl FinderBuilder {
+    /// Creates a builder whose settings are all at their defaults.
+    pub fn new() -> FinderBuilder {
+        Default::default()
+    }
+
+    /// Builds a forward finder for `needle` from the current settings,
+    /// ranking bytes with [`DefaultFrequencyRank`].
+    pub fn build_forward<'n, B: ?Sized + AsRef<[u8]>>(
+        &self,
+        needle: &'n B,
+    ) -> Finder<'n> {
+        let ranker = DefaultFrequencyRank;
+        self.build_forward_with_ranker(ranker, needle)
+    }
+
+    /// Builds a forward finder for `needle` from the current settings, using
+    /// a caller supplied heuristic for how frequently each byte occurs in
+    /// the dataset. See [`HeuristicFrequencyRank`] for more details.
+    pub fn build_forward_with_ranker<
+        'n,
+        R: HeuristicFrequencyRank,
+        B: ?Sized + AsRef<[u8]>,
+    >(
+        &self,
+        ranker: R,
+        needle: &'n B,
+    ) -> Finder<'n> {
+        let bytes = needle.as_ref();
+        let borrowed = CowBytes::new(bytes);
+        let searcher = Searcher::new(self.prefilter, ranker, bytes);
+        Finder { needle: borrowed, searcher }
+    }
+
+    /// Builds a reverse finder for `needle`.
+    ///
+    /// The reverse searcher is constructed from the needle alone; the
+    /// prefilter setting is not consulted here.
+    pub fn build_reverse<'n, B: ?Sized + AsRef<[u8]>>(
+        &self,
+        needle: &'n B,
+    ) -> FinderRev<'n> {
+        let bytes = needle.as_ref();
+        let borrowed = CowBytes::new(bytes);
+        let searcher = SearcherRev::new(bytes);
+        FinderRev { needle: borrowed, searcher }
+    }
+
+    /// Sets the prefilter configuration used by finders built afterwards and
+    /// returns the builder so calls can be chained.
+    ///
+    /// See the documentation for [`Prefilter`] for more discussion on why
+    /// you might want to configure this.
+    pub fn prefilter(&mut self, prefilter: Prefilter) -> &mut FinderBuilder {
+        self.prefilter = prefilter;
+        self
+    }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ define_substring_forward_quickcheck!(|h, n| Some(Finder::new(n).find(h))); // NOTE(review): crate macro, presumably expands to property tests for forward search -- confirm
+ define_substring_reverse_quickcheck!(|h, n| Some(
+ FinderRev::new(n).rfind(h)
+ ));
+
+ #[test]
+ fn forward() { // runs the shared substring runner against a freshly built `Finder`
+ crate::tests::substring::Runner::new()
+ .fwd(|h, n| Some(Finder::new(n).find(h)))
+ .run();
+ }
+
+ #[test]
+ fn reverse() { // runs the shared substring runner against a freshly built `FinderRev`
+ crate::tests::substring::Runner::new()
+ .rev(|h, n| Some(FinderRev::new(n).rfind(h)))
+ .run();
+ }
+}
+
1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 +95 +96 +97 +98 +99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 +247 +248 +249 +250 +251 +252 +253 +254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 +265 +266 +267 +268 +269 +270 +271 +272 +273 +274 +275 +276 +277 +278 +279 +280 +281 +282 +283 +284 +285 +286 +287 +288 +289 +290 +291 +292 +293 +294 +295 +296 +297 +298 +299 +300 +301 +302 +303 +304 +305 +306 +307 +308 +309 +310 +311 +312 +313 +314 +315 +316 +317 +318 +319 +320 +321 +322 +323 +324 +325 +326 +327 +328 +329 +330 +331 +332 +333 +334 +335 +336 +337 +338 +339 +340 +341 +342 +343 +344 +345 +346 +347 +348 +349 +350 +351 +352 +353 +354 +355 +356 +357 +358 +359 +360 +361 +362 +363 +364 +365 +366 +367 +368 +369 +370 +371 +372 +373 +374 +375 +376 +377 +378 +379 +380 +381 +382 +383 +384 +385 +386 +387 +388 +389 +390 +391 +392 +393 +394 +395 +396 +397 +398 +399 +400 +401 +402 +403 +404 +405 +406 +407 +408 +409 +410 +411 +412 +413 +414 +415 +416 +417 +418 +419 +420 +421 
+422 +423 +424 +425 +426 +427 +428 +429 +430 +431 +432 +433 +434 +435 +436 +437 +438 +439 +440 +441 +442 +443 +444 +445 +446 +447 +448 +449 +450 +451 +452 +453 +454 +455 +456 +457 +458 +459 +460 +461 +462 +463 +464 +465 +466 +467 +468 +469 +470 +471 +472 +473 +474 +475 +476 +477 +478 +479 +480 +481 +482 +483 +484 +485 +486 +487 +488 +489 +490 +491 +492 +493 +494 +495 +496 +497 +498 +499 +500 +501 +502 +503 +504 +505 +506 +507 +508 +509 +510 +511 +512 +513 +514 +515 +516 +517 +518 +519 +520 +521 +522 +523 +524 +525 +526 +527 +528 +529 +530 +531 +532 +533 +534 +535 +536 +537 +538 +539 +540 +541 +542 +543 +544 +545 +546 +547 +548 +549 +550 +551 +552 +553 +554 +555 +556 +557 +558 +559 +560 +561 +562 +563 +564 +565 +566 +567 +568 +569 +570 +571 +572 +573 +574 +575 +576 +577 +578 +579 +580 +581 +582 +583 +584 +585 +586 +587 +588 +589 +590 +591 +592 +593 +594 +595 +596 +597 +598 +599 +600 +601 +602 +603 +604 +605 +606 +607 +608 +609 +610 +611 +612 +613 +614 +615 +616 +617 +618 +619 +620 +621 +622 +623 +624 +625 +626 +627 +628 +629 +630 +631 +632 +633 +634 +635 +636 +637 +638 +639 +640 +641 +642 +643 +644 +645 +646 +647 +648 +649 +650 +651 +652 +653 +654 +655 +656 +657 +658 +659 +660 +661 +662 +663 +664 +665 +666 +667 +668 +669 +670 +671 +672 +673 +674 +675 +676 +677 +678 +679 +680 +681 +682 +683 +684 +685 +686 +687 +688 +689 +690 +691 +692 +693 +694 +695 +696 +697 +698 +699 +700 +701 +702 +703 +704 +705 +706 +707 +708 +709 +710 +711 +712 +713 +714 +715 +716 +717 +718 +719 +720 +721 +722 +723 +724 +725 +726 +727 +728 +729 +730 +731 +732 +733 +734 +735 +736 +737 +738 +739 +740 +741 +742 +743 +744 +745 +746 +747 +748 +749 +750 +751 +752 +753 +754 +755 +756 +757 +758 +759 +760 +761 +762 +763 +764 +765 +766 +767 +768 +769 +770 +771 +772 +773 +774 +775 +776 +777 +778 +779 +780 +781 +782 +783 +784 +785 +786 +787 +788 +789 +790 +791 +792 +793 +794 +795 +796 +797 +798 +799 +800 +801 +802 +803 +804 +805 +806 +807 +808 +809 +810 +811 +812 +813 +814 +815 +816 +817 +818 +819 +820 +821 
+822 +823 +824 +825 +826 +827 +828 +829 +830 +831 +832 +833 +834 +835 +836 +837 +838 +839 +840 +841 +842 +843 +844 +845 +846 +847 +848 +849 +850 +851 +852 +853 +854 +855 +856 +857 +858 +859 +860 +861 +862 +863 +864 +865 +866 +867 +868 +869 +870 +871 +872 +873 +874 +875 +876 +877 +878 +879 +880 +881 +882 +883 +884 +885 +886 +887 +888 +889 +890 +891 +892 +893 +894 +895 +896 +897 +898 +899 +900 +901 +902 +903 +904 +905 +906 +907 +908 +909 +910 +911 +912 +913 +914 +915 +916 +917 +918 +919 +920 +921 +922 +923 +924 +925 +926 +927 +928 +929 +930 +931 +932 +933 +934 +935 +936 +937 +938 +939 +940 +941 +942 +943 +944 +945 +946 +947 +948 +949 +950 +951 +952 +953 +954 +955 +956 +957 +958 +959 +960 +961 +962 +963 +964 +965 +966 +967 +968 +969 +970 +971 +972 +973 +974 +975 +976 +977 +978 +979 +980 +981 +982 +983 +984 +985 +986 +987 +988 +989 +990 +991 +992 +993 +994 +995 +996 +997 +998 +999 +1000 +1001 +1002 +1003 +1004 +1005 +1006 +1007 +1008 +1009 +1010 +1011 +1012 +1013 +1014 +1015 +1016 +1017 +1018 +1019 +1020 +1021 +1022 +1023 +1024 +1025 +1026 +1027 +1028 +1029 +1030 +
use crate::arch::all::{
+ packedpair::{HeuristicFrequencyRank, Pair},
+ rabinkarp, twoway,
+};
+
+#[cfg(target_arch = "aarch64")]
+use crate::arch::aarch64::neon::packedpair as neon;
+#[cfg(target_arch = "wasm32")]
+use crate::arch::wasm32::simd128::packedpair as simd128;
+#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
+use crate::arch::x86_64::{
+ avx2::packedpair as avx2, sse2::packedpair as sse2,
+};
+
+/// A "meta" substring searcher.
+///
+/// To a first approximation, this chooses what it believes to be the "best"
+/// substring search implemnetation based on the needle at construction time.
+/// Then, every call to `find` will execute that particular implementation. To
+/// a second approximation, multiple substring search algorithms may be used,
+/// depending on the haystack. For example, for supremely short haystacks,
+/// Rabin-Karp is typically used.
+///
+/// See the documentation on `Prefilter` for an explanation of the dispatching
+/// mechanism. The quick summary is that an enum has too much overhead and
+/// we can't use dynamic dispatch via traits because we need to work in a
+/// core-only environment. (Dynamic dispatch works in core-only, but you
+/// need `&dyn Trait` and we really need a `Box<dyn Trait>` here. The latter
+/// requires `alloc`.) So instead, we use a union and an appropriately paired
+/// free function to read from the correct field on the union and execute the
+/// chosen substring search implementation.
+#[derive(Clone)]
+pub(crate) struct Searcher {
+ call: SearcherKindFn,
+ kind: SearcherKind,
+ rabinkarp: rabinkarp::Finder,
+}
+
+impl Searcher {
+ /// Creates a new "meta" substring searcher that attempts to choose the
+ /// best algorithm based on the needle, heuristics and what the current
+ /// target supports.
+ #[inline]
+ pub(crate) fn new<R: HeuristicFrequencyRank>(
+ prefilter: PrefilterConfig,
+ ranker: R,
+ needle: &[u8],
+ ) -> Searcher {
+ let rabinkarp = rabinkarp::Finder::new(needle);
+ if needle.len() <= 1 {
+ return if needle.is_empty() {
+ trace!("building empty substring searcher");
+ Searcher {
+ call: searcher_kind_empty,
+ kind: SearcherKind { empty: () },
+ rabinkarp,
+ }
+ } else {
+ trace!("building one-byte substring searcher");
+ debug_assert_eq!(1, needle.len());
+ Searcher {
+ call: searcher_kind_one_byte,
+ kind: SearcherKind { one_byte: needle[0] },
+ rabinkarp,
+ }
+ };
+ }
+ let pair = match Pair::with_ranker(needle, &ranker) {
+ Some(pair) => pair,
+ None => return Searcher::twoway(needle, rabinkarp, None),
+ };
+ debug_assert_ne!(
+ pair.index1(),
+ pair.index2(),
+ "pair offsets should not be equivalent"
+ );
+ #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
+ {
+ if let Some(pp) = avx2::Finder::with_pair(needle, pair) {
+ if do_packed_search(needle) {
+ trace!("building x86_64 AVX2 substring searcher");
+ let kind = SearcherKind { avx2: pp };
+ Searcher { call: searcher_kind_avx2, kind, rabinkarp }
+ } else if prefilter.is_none() {
+ Searcher::twoway(needle, rabinkarp, None)
+ } else {
+ let prestrat = Prefilter::avx2(pp, needle);
+ Searcher::twoway(needle, rabinkarp, Some(prestrat))
+ }
+ } else if let Some(pp) = sse2::Finder::with_pair(needle, pair) {
+ if do_packed_search(needle) {
+ trace!("building x86_64 SSE2 substring searcher");
+ let kind = SearcherKind { sse2: pp };
+ Searcher { call: searcher_kind_sse2, kind, rabinkarp }
+ } else if prefilter.is_none() {
+ Searcher::twoway(needle, rabinkarp, None)
+ } else {
+ let prestrat = Prefilter::sse2(pp, needle);
+ Searcher::twoway(needle, rabinkarp, Some(prestrat))
+ }
+ } else if prefilter.is_none() {
+ Searcher::twoway(needle, rabinkarp, None)
+ } else {
+ // We're pretty unlikely to get to this point, but it is
+ // possible to be running on x86_64 without SSE2. Namely, it's
+ // really up to the OS whether it wants to support vector
+ // registers or not.
+ let prestrat = Prefilter::fallback(ranker, pair, needle);
+ Searcher::twoway(needle, rabinkarp, prestrat)
+ }
+ }
+ #[cfg(target_arch = "wasm32")]
+ {
+ if let Some(pp) = simd128::Finder::with_pair(needle, pair) {
+ if do_packed_search(needle) {
+ trace!("building wasm32 simd128 substring searcher");
+ let kind = SearcherKind { simd128: pp };
+ Searcher { call: searcher_kind_simd128, kind, rabinkarp }
+ } else if prefilter.is_none() {
+ Searcher::twoway(needle, rabinkarp, None)
+ } else {
+ let prestrat = Prefilter::simd128(pp, needle);
+ Searcher::twoway(needle, rabinkarp, Some(prestrat))
+ }
+ } else if prefilter.is_none() {
+ Searcher::twoway(needle, rabinkarp, None)
+ } else {
+ let prestrat = Prefilter::fallback(ranker, pair, needle);
+ Searcher::twoway(needle, rabinkarp, prestrat)
+ }
+ }
+ #[cfg(target_arch = "aarch64")]
+ {
+ if let Some(pp) = neon::Finder::with_pair(needle, pair) {
+ if do_packed_search(needle) {
+ trace!("building aarch64 neon substring searcher");
+ let kind = SearcherKind { neon: pp };
+ Searcher { call: searcher_kind_neon, kind, rabinkarp }
+ } else if prefilter.is_none() {
+ Searcher::twoway(needle, rabinkarp, None)
+ } else {
+ let prestrat = Prefilter::neon(pp, needle);
+ Searcher::twoway(needle, rabinkarp, Some(prestrat))
+ }
+ } else if prefilter.is_none() {
+ Searcher::twoway(needle, rabinkarp, None)
+ } else {
+ let prestrat = Prefilter::fallback(ranker, pair, needle);
+ Searcher::twoway(needle, rabinkarp, prestrat)
+ }
+ }
+ #[cfg(not(any(
+ all(target_arch = "x86_64", target_feature = "sse2"),
+ target_arch = "wasm32",
+ target_arch = "aarch64"
+ )))]
+ {
+ if prefilter.is_none() {
+ Searcher::twoway(needle, rabinkarp, None)
+ } else {
+ let prestrat = Prefilter::fallback(ranker, pair, needle);
+ Searcher::twoway(needle, rabinkarp, prestrat)
+ }
+ }
+ }
+
+ /// Creates a new searcher that always uses the Two-Way algorithm. This is
+ /// typically used when vector algorithms are unavailable or inappropriate.
+ /// (For example, when the needle is "too long.")
+ ///
+ /// If a prefilter is given, then the searcher returned will be accelerated
+ /// by the prefilter.
+ #[inline]
+ fn twoway(
+ needle: &[u8],
+ rabinkarp: rabinkarp::Finder,
+ prestrat: Option<Prefilter>,
+ ) -> Searcher {
+ let finder = twoway::Finder::new(needle);
+ match prestrat {
+ None => {
+ trace!("building scalar two-way substring searcher");
+ let kind = SearcherKind { two_way: finder };
+ Searcher { call: searcher_kind_two_way, kind, rabinkarp }
+ }
+ Some(prestrat) => {
+ trace!(
+ "building scalar two-way \
+ substring searcher with a prefilter"
+ );
+ let two_way_with_prefilter =
+ TwoWayWithPrefilter { finder, prestrat };
+ let kind = SearcherKind { two_way_with_prefilter };
+ Searcher {
+ call: searcher_kind_two_way_with_prefilter,
+ kind,
+ rabinkarp,
+ }
+ }
+ }
+ }
+
+ /// Searches the given haystack for the given needle. The needle given
+ /// should be the same as the needle that this finder was initialized
+ /// with.
+ ///
+ /// Inlining this can lead to big wins for latency, and #[inline] doesn't
+ /// seem to be enough in some cases.
+ #[inline(always)]
+ pub(crate) fn find(
+ &self,
+ prestate: &mut PrefilterState,
+ haystack: &[u8],
+ needle: &[u8],
+ ) -> Option<usize> {
+ if haystack.len() < needle.len() {
+ None
+ } else {
+ // SAFETY: By construction, we've ensured that the function
+ // in `self.call` is properly paired with the union used in
+ // `self.kind`.
+ unsafe { (self.call)(self, prestate, haystack, needle) }
+ }
+ }
+}
+
+impl core::fmt::Debug for Searcher {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ f.debug_struct("Searcher")
+ .field("call", &"<searcher function>")
+ .field("kind", &"<searcher kind union>")
+ .field("rabinkarp", &self.rabinkarp)
+ .finish()
+ }
+}
+
+/// A union indicating one of several possible substring search implementations
+/// that are in active use.
+///
+/// This union should only be read by one of the functions prefixed with
+/// `searcher_kind_`. Namely, the correct function is meant to be paired with
+/// the union by the caller, such that the function always reads from the
+/// designated union field.
+#[derive(Clone, Copy)]
+union SearcherKind {
+ empty: (),
+ one_byte: u8,
+ two_way: twoway::Finder,
+ two_way_with_prefilter: TwoWayWithPrefilter,
+ #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
+ sse2: crate::arch::x86_64::sse2::packedpair::Finder,
+ #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
+ avx2: crate::arch::x86_64::avx2::packedpair::Finder,
+ #[cfg(target_arch = "wasm32")]
+ simd128: crate::arch::wasm32::simd128::packedpair::Finder,
+ #[cfg(target_arch = "aarch64")]
+ neon: crate::arch::aarch64::neon::packedpair::Finder,
+}
+
+/// A two-way substring searcher with a prefilter.
+#[derive(Copy, Clone, Debug)]
+struct TwoWayWithPrefilter {
+ finder: twoway::Finder,
+ prestrat: Prefilter,
+}
+
+/// The type of a substring search function.
+///
+/// # Safety
+///
+/// When using a function of this type, callers must ensure that the correct
+/// function is paired with the value populated in `SearcherKind` union.
+type SearcherKindFn = unsafe fn(
+ searcher: &Searcher,
+ prestate: &mut PrefilterState,
+ haystack: &[u8],
+ needle: &[u8],
+) -> Option<usize>;
+
+/// Reads from the `empty` field of `SearcherKind` to handle the case of
+/// searching for the empty needle. Works on all platforms.
+///
+/// # Safety
+///
+/// Callers must ensure that the `searcher.kind.empty` union field is set.
+unsafe fn searcher_kind_empty(
+ _searcher: &Searcher,
+ _prestate: &mut PrefilterState,
+ _haystack: &[u8],
+ _needle: &[u8],
+) -> Option<usize> {
+ Some(0)
+}
+
+/// Reads from the `one_byte` field of `SearcherKind` to handle the case of
+/// searching for a single byte needle. Works on all platforms.
+///
+/// # Safety
+///
+/// Callers must ensure that the `searcher.kind.one_byte` union field is set.
+unsafe fn searcher_kind_one_byte(
+ searcher: &Searcher,
+ _prestate: &mut PrefilterState,
+ haystack: &[u8],
+ _needle: &[u8],
+) -> Option<usize> {
+ let needle = searcher.kind.one_byte;
+ crate::memchr(needle, haystack)
+}
+
+/// Reads from the `two_way` field of `SearcherKind` to handle the case of
+/// searching for an arbitrary needle without prefilter acceleration. Works on
+/// all platforms.
+///
+/// # Safety
+///
+/// Callers must ensure that the `searcher.kind.two_way` union field is set.
+unsafe fn searcher_kind_two_way(
+ searcher: &Searcher,
+ _prestate: &mut PrefilterState,
+ haystack: &[u8],
+ needle: &[u8],
+) -> Option<usize> {
+ if rabinkarp::is_fast(haystack, needle) {
+ searcher.rabinkarp.find(haystack, needle)
+ } else {
+ searcher.kind.two_way.find(haystack, needle)
+ }
+}
+
+/// Reads from the `two_way_with_prefilter` field of `SearcherKind` to handle
+/// the case of searching for an arbitrary needle with prefilter acceleration.
+/// Works on all platforms.
+///
+/// # Safety
+///
+/// Callers must ensure that the `searcher.kind.two_way_with_prefilter` union
+/// field is set.
+unsafe fn searcher_kind_two_way_with_prefilter(
+ searcher: &Searcher,
+ prestate: &mut PrefilterState,
+ haystack: &[u8],
+ needle: &[u8],
+) -> Option<usize> {
+ if rabinkarp::is_fast(haystack, needle) {
+ searcher.rabinkarp.find(haystack, needle)
+ } else {
+ let TwoWayWithPrefilter { ref finder, ref prestrat } =
+ searcher.kind.two_way_with_prefilter;
+ let pre = Pre { prestate, prestrat };
+ finder.find_with_prefilter(Some(pre), haystack, needle)
+ }
+}
+
+/// Reads from the `sse2` field of `SearcherKind` to execute the x86_64 SSE2
+/// vectorized substring search implementation.
+///
+/// # Safety
+///
+/// Callers must ensure that the `searcher.kind.sse2` union field is set.
+#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
+unsafe fn searcher_kind_sse2(
+ searcher: &Searcher,
+ _prestate: &mut PrefilterState,
+ haystack: &[u8],
+ needle: &[u8],
+) -> Option<usize> {
+ let finder = &searcher.kind.sse2;
+ if haystack.len() < finder.min_haystack_len() {
+ searcher.rabinkarp.find(haystack, needle)
+ } else {
+ finder.find(haystack, needle)
+ }
+}
+
+/// Reads from the `avx2` field of `SearcherKind` to execute the x86_64 AVX2
+/// vectorized substring search implementation.
+///
+/// # Safety
+///
+/// Callers must ensure that the `searcher.kind.avx2` union field is set.
+#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
+unsafe fn searcher_kind_avx2(
+ searcher: &Searcher,
+ _prestate: &mut PrefilterState,
+ haystack: &[u8],
+ needle: &[u8],
+) -> Option<usize> {
+ let finder = &searcher.kind.avx2;
+ if haystack.len() < finder.min_haystack_len() {
+ searcher.rabinkarp.find(haystack, needle)
+ } else {
+ finder.find(haystack, needle)
+ }
+}
+
/// Runs the wasm32 simd128 vectorized substring search stored in the
/// `simd128` field of `SearcherKind`. Haystacks shorter than the finder's
/// minimum haystack length are searched with Rabin-Karp instead.
///
/// # Safety
///
/// Callers must ensure that the `searcher.kind.simd128` union field is set.
#[cfg(target_arch = "wasm32")]
unsafe fn searcher_kind_simd128(
    searcher: &Searcher,
    _prestate: &mut PrefilterState,
    haystack: &[u8],
    needle: &[u8],
) -> Option<usize> {
    let finder = &searcher.kind.simd128;
    if haystack.len() >= finder.min_haystack_len() {
        finder.find(haystack, needle)
    } else {
        searcher.rabinkarp.find(haystack, needle)
    }
}
+
/// Runs the aarch64 neon vectorized substring search stored in the `neon`
/// field of `SearcherKind`. Haystacks shorter than the finder's minimum
/// haystack length are searched with Rabin-Karp instead.
///
/// # Safety
///
/// Callers must ensure that the `searcher.kind.neon` union field is set.
#[cfg(target_arch = "aarch64")]
unsafe fn searcher_kind_neon(
    searcher: &Searcher,
    _prestate: &mut PrefilterState,
    haystack: &[u8],
    needle: &[u8],
) -> Option<usize> {
    let finder = &searcher.kind.neon;
    if haystack.len() >= finder.min_haystack_len() {
        finder.find(haystack, needle)
    } else {
        searcher.rabinkarp.find(haystack, needle)
    }
}
+
+/// A reverse substring searcher.
+#[derive(Clone, Debug)]
+pub(crate) struct SearcherRev {
+ kind: SearcherRevKind,
+ rabinkarp: rabinkarp::FinderRev,
+}
+
+/// The kind of the reverse searcher.
+///
+/// For the reverse case, we don't do any SIMD acceleration or prefilters.
+/// There is no specific technical reason why we don't, but rather don't do it
+/// because it's not clear it's worth the extra code to do so. If you have a
+/// use case for it, please file an issue.
+///
+/// We also don't do the union trick as we do with the forward case and
+/// prefilters. Basically for the same reason we don't have prefilters or
+/// vector algorithms for reverse searching: it's not clear it's worth doing.
+/// Please file an issue if you have a compelling use case for fast reverse
+/// substring search.
+#[derive(Clone, Debug)]
+enum SearcherRevKind {
+ Empty,
+ OneByte { needle: u8 },
+ TwoWay { finder: twoway::FinderRev },
+}
+
+impl SearcherRev {
+ /// Creates a new searcher for finding occurrences of the given needle in
+ /// reverse. That is, it reports the last (instead of the first) occurrence
+ /// of a needle in a haystack.
+ #[inline]
+ pub(crate) fn new(needle: &[u8]) -> SearcherRev {
+ let kind = if needle.len() <= 1 {
+ if needle.is_empty() {
+ trace!("building empty reverse substring searcher");
+ SearcherRevKind::Empty
+ } else {
+ trace!("building one-byte reverse substring searcher");
+ debug_assert_eq!(1, needle.len());
+ SearcherRevKind::OneByte { needle: needle[0] }
+ }
+ } else {
+ trace!("building scalar two-way reverse substring searcher");
+ let finder = twoway::FinderRev::new(needle);
+ SearcherRevKind::TwoWay { finder }
+ };
+ let rabinkarp = rabinkarp::FinderRev::new(needle);
+ SearcherRev { kind, rabinkarp }
+ }
+
+ /// Searches the given haystack for the last occurrence of the given
+ /// needle. The needle given should be the same as the needle that this
+ /// finder was initialized with.
+ #[inline]
+ pub(crate) fn rfind(
+ &self,
+ haystack: &[u8],
+ needle: &[u8],
+ ) -> Option<usize> {
+ if haystack.len() < needle.len() {
+ return None;
+ }
+ match self.kind {
+ SearcherRevKind::Empty => Some(haystack.len()),
+ SearcherRevKind::OneByte { needle } => {
+ crate::memrchr(needle, haystack)
+ }
+ SearcherRevKind::TwoWay { ref finder } => {
+ if rabinkarp::is_fast(haystack, needle) {
+ self.rabinkarp.rfind(haystack, needle)
+ } else {
+ finder.rfind(haystack, needle)
+ }
+ }
+ }
+ }
+}
+
/// Prefilter controls whether heuristics are used to accelerate searching.
///
/// A prefilter quickly detects candidate matches, which are then confirmed
/// (or rejected) as full matches. This can be quite effective because
/// looking for candidates is often a lot faster than running a complete
/// substring search over the entire input. Namely, looking for candidates
/// can be done with extremely fast vectorized code.
///
/// One downside is that a prefilter assumes false positives (candidates that
/// turn out not to be matches) are somewhat rare relative to full matches.
/// If a lot of false positives are generated, search time can be worse than
/// if the prefilter had not been enabled in the first place.
///
/// Another downside is highly variable performance: some cases are
/// extraordinarily fast and others aren't. Typically that isn't a problem,
/// but it may be for your use case.
///
/// This implementation uses a heuristic to detect when a prefilter might not
/// be carrying its weight and dynamically disables it. Nevertheless, this
/// option lets callers disable prefilters outright when they know they won't
/// be useful.
#[derive(Clone, Copy, Debug)]
#[non_exhaustive]
pub enum PrefilterConfig {
    /// Never use a prefilter in substring search.
    None,
    /// Automatically detect whether a heuristic prefilter should be used. If
    /// it is used, then heuristics will be used to dynamically disable the
    /// prefilter if it is believed to not be carrying its weight.
    Auto,
}

impl Default for PrefilterConfig {
    /// The default configuration is `PrefilterConfig::Auto`.
    fn default() -> PrefilterConfig {
        Self::Auto
    }
}

impl PrefilterConfig {
    /// Returns true when this configuration is the `None` variant.
    fn is_none(&self) -> bool {
        match *self {
            PrefilterConfig::None => true,
            PrefilterConfig::Auto => false,
        }
    }
}
+
+/// The implementation of a prefilter.
+///
+/// This type encapsulates dispatch to one of several possible choices for a
+/// prefilter. Generally speaking, all prefilters have the same approximate
+/// algorithm: they choose a couple of bytes from the needle that are believed
+/// to be rare, use a fast vector algorithm to look for those bytes and return
+/// positions as candidates for some substring search algorithm (currently only
+/// Two-Way) to confirm as a match or not.
+///
+/// The differences between the algorithms are actually at the vector
+/// implementation level. Namely, we need different routines based on both
+/// which target architecture we're on and what CPU features are supported.
+///
+/// The straight-forwardly obvious approach here is to use an enum, and make
+/// `Prefilter::find` do case analysis to determine which algorithm was
+/// selected and invoke it. However, I've observed that this leads to poor
+/// codegen in some cases, especially in latency sensitive benchmarks. That is,
+/// this approach comes with overhead that I wasn't able to eliminate.
+///
+/// The second obvious approach is to use dynamic dispatch with traits. Doing
+/// that in this context where `Prefilter` owns the selection generally
+/// requires heap allocation, and this code is designed to run in core-only
+/// environments.
+///
+/// So we settle on using a union (that's `PrefilterKind`) and a function
+/// pointer (that's `PrefilterKindFn`). We select the right function pointer
+/// based on which field in the union we set, and that function in turn
+/// knows which field of the union to access. The downside of this approach
+/// is that it forces us to think about safety, but the upside is that
+/// there are some nice latency improvements to benchmarks. (Especially the
+/// `memmem/sliceslice/short` benchmark.)
+///
+/// In cases where we've selected a vector algorithm and the haystack given
+/// is too short, we fallback to the scalar version of `memchr` on the
+/// `rarest_byte`. (The scalar version of `memchr` is still better than a naive
+/// byte-at-a-time loop because it will read in `usize`-sized chunks at a
+/// time.)
+#[derive(Clone, Copy)]
+struct Prefilter {
+ call: PrefilterKindFn,
+ kind: PrefilterKind,
+ rarest_byte: u8,
+ rarest_offset: u8,
+}
+
+impl Prefilter {
+ /// Return a "fallback" prefilter, but only if it is believed to be
+ /// effective.
+ #[inline]
+ fn fallback<R: HeuristicFrequencyRank>(
+ ranker: R,
+ pair: Pair,
+ needle: &[u8],
+ ) -> Option<Prefilter> {
+ /// The maximum frequency rank permitted for the fallback prefilter.
+ /// If the rarest byte in the needle has a frequency rank above this
+ /// value, then no prefilter is used if the fallback prefilter would
+ /// otherwise be selected.
+ const MAX_FALLBACK_RANK: u8 = 250;
+
+ trace!("building fallback prefilter");
+ let rarest_offset = pair.index1();
+ let rarest_byte = needle[usize::from(rarest_offset)];
+ let rarest_rank = ranker.rank(rarest_byte);
+ if rarest_rank > MAX_FALLBACK_RANK {
+ None
+ } else {
+ let finder = crate::arch::all::packedpair::Finder::with_pair(
+ needle,
+ pair.clone(),
+ )?;
+ let call = prefilter_kind_fallback;
+ let kind = PrefilterKind { fallback: finder };
+ Some(Prefilter { call, kind, rarest_byte, rarest_offset })
+ }
+ }
+
+ /// Return a prefilter using a x86_64 SSE2 vector algorithm.
+ #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
+ #[inline]
+ fn sse2(finder: sse2::Finder, needle: &[u8]) -> Prefilter {
+ trace!("building x86_64 SSE2 prefilter");
+ let rarest_offset = finder.pair().index1();
+ let rarest_byte = needle[usize::from(rarest_offset)];
+ Prefilter {
+ call: prefilter_kind_sse2,
+ kind: PrefilterKind { sse2: finder },
+ rarest_byte,
+ rarest_offset,
+ }
+ }
+
+ /// Return a prefilter using a x86_64 AVX2 vector algorithm.
+ #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
+ #[inline]
+ fn avx2(finder: avx2::Finder, needle: &[u8]) -> Prefilter {
+ trace!("building x86_64 AVX2 prefilter");
+ let rarest_offset = finder.pair().index1();
+ let rarest_byte = needle[usize::from(rarest_offset)];
+ Prefilter {
+ call: prefilter_kind_avx2,
+ kind: PrefilterKind { avx2: finder },
+ rarest_byte,
+ rarest_offset,
+ }
+ }
+
+ /// Return a prefilter using a wasm32 simd128 vector algorithm.
+ #[cfg(target_arch = "wasm32")]
+ #[inline]
+ fn simd128(finder: simd128::Finder, needle: &[u8]) -> Prefilter {
+ trace!("building wasm32 simd128 prefilter");
+ let rarest_offset = finder.pair().index1();
+ let rarest_byte = needle[usize::from(rarest_offset)];
+ Prefilter {
+ call: prefilter_kind_simd128,
+ kind: PrefilterKind { simd128: finder },
+ rarest_byte,
+ rarest_offset,
+ }
+ }
+
+ /// Return a prefilter using a aarch64 neon vector algorithm.
+ #[cfg(target_arch = "aarch64")]
+ #[inline]
+ fn neon(finder: neon::Finder, needle: &[u8]) -> Prefilter {
+ trace!("building aarch64 neon prefilter");
+ let rarest_offset = finder.pair().index1();
+ let rarest_byte = needle[usize::from(rarest_offset)];
+ Prefilter {
+ call: prefilter_kind_neon,
+ kind: PrefilterKind { neon: finder },
+ rarest_byte,
+ rarest_offset,
+ }
+ }
+
+ /// Return a *candidate* position for a match.
+ ///
+ /// When this returns an offset, it implies that a match could begin at
+ /// that offset, but it may not. That is, it is possible for a false
+ /// positive to be returned.
+ ///
+ /// When `None` is returned, then it is guaranteed that there are no
+ /// matches for the needle in the given haystack. That is, it is impossible
+ /// for a false negative to be returned.
+ ///
+ /// The purpose of this routine is to look for candidate matching positions
+ /// as quickly as possible before running a (likely) slower confirmation
+ /// step.
+ #[inline]
+ fn find(&self, haystack: &[u8]) -> Option<usize> {
+ // SAFETY: By construction, we've ensured that the function in
+ // `self.call` is properly paired with the union used in `self.kind`.
+ unsafe { (self.call)(self, haystack) }
+ }
+
+ /// A "simple" prefilter that just looks for the occurrence of the rarest
+ /// byte from the needle. This is generally only used for very small
+ /// haystacks.
+ #[inline]
+ fn find_simple(&self, haystack: &[u8]) -> Option<usize> {
+ // We don't use crate::memchr here because the haystack should be small
+ // enough that memchr won't be able to use vector routines anyway. So
+ // we just skip straight to the fallback implementation which is likely
+ // faster. (A byte-at-a-time loop is only used when the haystack is
+ // smaller than `size_of::<usize>()`.)
+ crate::arch::all::memchr::One::new(self.rarest_byte)
+ .find(haystack)
+ .map(|i| i.saturating_sub(usize::from(self.rarest_offset)))
+ }
+}
+
+impl core::fmt::Debug for Prefilter {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ f.debug_struct("Prefilter")
+ .field("call", &"<prefilter function>")
+ .field("kind", &"<prefilter kind union>")
+ .field("rarest_byte", &self.rarest_byte)
+ .field("rarest_offset", &self.rarest_offset)
+ .finish()
+ }
+}
+
+/// A union indicating one of several possible prefilters that are in active
+/// use.
+///
+/// This union should only be read by one of the functions prefixed with
+/// `prefilter_kind_`. Namely, the correct function is meant to be paired with
+/// the union by the caller, such that the function always reads from the
+/// designated union field.
+#[derive(Clone, Copy)]
+union PrefilterKind {
+ fallback: crate::arch::all::packedpair::Finder,
+ #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
+ sse2: crate::arch::x86_64::sse2::packedpair::Finder,
+ #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
+ avx2: crate::arch::x86_64::avx2::packedpair::Finder,
+ #[cfg(target_arch = "wasm32")]
+ simd128: crate::arch::wasm32::simd128::packedpair::Finder,
+ #[cfg(target_arch = "aarch64")]
+ neon: crate::arch::aarch64::neon::packedpair::Finder,
+}
+
+/// The type of a prefilter function.
+///
+/// # Safety
+///
+/// When using a function of this type, callers must ensure that the correct
+/// function is paired with the value populated in `PrefilterKind` union.
+type PrefilterKindFn =
+ unsafe fn(strat: &Prefilter, haystack: &[u8]) -> Option<usize>;
+
+/// Reads from the `fallback` field of `PrefilterKind` to execute the fallback
+/// prefilter. Works on all platforms.
+///
+/// # Safety
+///
+/// Callers must ensure that the `strat.kind.fallback` union field is set.
+unsafe fn prefilter_kind_fallback(
+ strat: &Prefilter,
+ haystack: &[u8],
+) -> Option<usize> {
+ strat.kind.fallback.find_prefilter(haystack)
+}
+
+/// Reads from the `sse2` field of `PrefilterKind` to execute the x86_64 SSE2
+/// prefilter.
+///
+/// # Safety
+///
+/// Callers must ensure that the `strat.kind.sse2` union field is set.
+#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
+unsafe fn prefilter_kind_sse2(
+ strat: &Prefilter,
+ haystack: &[u8],
+) -> Option<usize> {
+ let finder = &strat.kind.sse2;
+ if haystack.len() < finder.min_haystack_len() {
+ strat.find_simple(haystack)
+ } else {
+ finder.find_prefilter(haystack)
+ }
+}
+
+/// Reads from the `avx2` field of `PrefilterKind` to execute the x86_64 AVX2
+/// prefilter.
+///
+/// # Safety
+///
+/// Callers must ensure that the `strat.kind.avx2` union field is set.
+#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
+unsafe fn prefilter_kind_avx2(
+ strat: &Prefilter,
+ haystack: &[u8],
+) -> Option<usize> {
+ let finder = &strat.kind.avx2;
+ if haystack.len() < finder.min_haystack_len() {
+ strat.find_simple(haystack)
+ } else {
+ finder.find_prefilter(haystack)
+ }
+}
+
/// Runs the wasm32 simd128 prefilter stored in the `simd128` field of
/// `PrefilterKind`. Haystacks shorter than the finder's minimum haystack
/// length are handled by `Prefilter::find_simple` instead.
///
/// # Safety
///
/// Callers must ensure that the `strat.kind.simd128` union field is set.
#[cfg(target_arch = "wasm32")]
unsafe fn prefilter_kind_simd128(
    strat: &Prefilter,
    haystack: &[u8],
) -> Option<usize> {
    let finder = &strat.kind.simd128;
    if haystack.len() >= finder.min_haystack_len() {
        finder.find_prefilter(haystack)
    } else {
        strat.find_simple(haystack)
    }
}
+
/// Runs the aarch64 neon prefilter stored in the `neon` field of
/// `PrefilterKind`. Haystacks shorter than the finder's minimum haystack
/// length are handled by `Prefilter::find_simple` instead.
///
/// # Safety
///
/// Callers must ensure that the `strat.kind.neon` union field is set.
#[cfg(target_arch = "aarch64")]
unsafe fn prefilter_kind_neon(
    strat: &Prefilter,
    haystack: &[u8],
) -> Option<usize> {
    let finder = &strat.kind.neon;
    if haystack.len() >= finder.min_haystack_len() {
        finder.find_prefilter(haystack)
    } else {
        strat.find_simple(haystack)
    }
}
+
/// PrefilterState tracks state associated with the effectiveness of a
/// prefilter. It is used to track how many bytes, on average, are skipped by
/// the prefilter. If this average dips below a certain threshold over time,
/// then the state renders the prefilter inert and stops using it.
///
/// A prefilter state should be created for each search. (Where creating an
/// iterator is treated as a single search.)
#[derive(Clone, Copy, Debug)]
pub(crate) struct PrefilterState {
    /// The number of skips that has been executed. This is always 1 greater
    /// than the actual number of skips. The special sentinel value of 0
    /// indicates that the prefilter is inert. This is useful to avoid
    /// additional checks to determine whether the prefilter is still
    /// "effective." Once a prefilter becomes inert, it should no longer be
    /// used (according to our heuristics).
    skips: u32,
    /// The total number of bytes that have been skipped. Saturates at
    /// `u32::MAX`.
    skipped: u32,
}

impl PrefilterState {
    /// The minimum number of skip attempts to try before considering whether
    /// a prefilter is effective or not.
    const MIN_SKIPS: u32 = 50;

    /// The minimum amount of bytes that skipping must average.
    ///
    /// This value was chosen based on varying it and checking
    /// the microbenchmarks. In particular, this can impact the
    /// pathological/repeated-{huge,small} benchmarks quite a bit if it's set
    /// too low.
    const MIN_SKIP_BYTES: u32 = 8;

    /// Create a fresh prefilter state: not inert, zero skips recorded and
    /// zero bytes skipped.
    #[inline]
    pub(crate) fn new() -> PrefilterState {
        PrefilterState { skips: 1, skipped: 0 }
    }

    /// Update this state with the number of bytes skipped on the last
    /// invocation of the prefilter.
    ///
    /// Both counters saturate instead of overflowing.
    #[inline]
    fn update(&mut self, skipped: usize) {
        self.skips = self.skips.saturating_add(1);
        // We need to do this dance since it's technically possible for
        // `skipped` to overflow a `u32`. (And we use a `u32` to reduce the
        // size of a prefilter state.)
        self.skipped = match u32::try_from(skipped) {
            Err(_) => u32::MAX,
            Ok(skipped) => self.skipped.saturating_add(skipped),
        };
    }

    /// Return true if and only if this state indicates that a prefilter is
    /// still effective.
    ///
    /// A prefilter is considered effective until at least `MIN_SKIPS` skips
    /// have been recorded, and afterwards for as long as the total number of
    /// bytes skipped is at least `MIN_SKIP_BYTES` per skip. The first time
    /// that stops holding, the state is marked inert and `false` is returned
    /// from then on.
    #[inline]
    fn is_effective(&mut self) -> bool {
        if self.is_inert() {
            return false;
        }
        let skips = self.skips();
        if skips < PrefilterState::MIN_SKIPS {
            return true;
        }
        // `skips` saturates at `u32::MAX`, so a plain multiplication here
        // could overflow (a panic when overflow checks are enabled, and a
        // meaninglessly small wrapped threshold otherwise). Saturating keeps
        // the threshold monotonic: once both counters are saturated the
        // prefilter is still treated as effective.
        let required = PrefilterState::MIN_SKIP_BYTES.saturating_mul(skips);
        if self.skipped >= required {
            return true;
        }

        // We're inert.
        self.skips = 0;
        false
    }

    /// Returns true if the prefilter this state represents should no longer
    /// be used.
    #[inline]
    fn is_inert(&self) -> bool {
        self.skips == 0
    }

    /// Returns the total number of times the prefilter has been used.
    #[inline]
    fn skips(&self) -> u32 {
        // Remember, `0` is a sentinel value indicating inertness, so we
        // always need to subtract `1` to get our actual number of skips.
        self.skips.saturating_sub(1)
    }
}
+
+/// A combination of prefilter effectiveness state and the prefilter itself.
+#[derive(Debug)]
+pub(crate) struct Pre<'a> {
+ /// State that tracks the effectiveness of a prefilter.
+ prestate: &'a mut PrefilterState,
+ /// The actual prefilter.
+ prestrat: &'a Prefilter,
+}
+
+impl<'a> Pre<'a> {
+ /// Call this prefilter on the given haystack with the given needle.
+ #[inline]
+ pub(crate) fn find(&mut self, haystack: &[u8]) -> Option<usize> {
+ let result = self.prestrat.find(haystack);
+ self.prestate.update(result.unwrap_or(haystack.len()));
+ result
+ }
+
+ /// Return true if and only if this prefilter should be used.
+ #[inline]
+ pub(crate) fn is_effective(&mut self) -> bool {
+ self.prestate.is_effective()
+ }
+}
+
/// Decides whether a needle is suited to being searched for entirely by a
/// vector algorithm.
///
/// Vector algorithms can serve as prefilters for other substring searchers
/// (such as Two-Way), but they can also perform the whole search themselves.
/// In that mode they find candidate positions quickly, exactly as a prefilter
/// would, and then confirm each candidate themselves with `memcmp` instead of
/// handing it back. That is a good trade for short needles, but it degrades
/// when long needles produce many false candidates. Vector algorithms are
/// therefore only allowed to own the search for needles whose length lies in
/// a fixed range.
#[inline]
fn do_packed_search(needle: &[u8]) -> bool {
    /// Shortest needle accepted. A needle of length 1 should simply use
    /// memchr, and a needle of length 0 is not a case this searcher handles.
    const MIN_LEN: usize = 2;

    /// Longest needle accepted.
    ///
    /// This is not a hard limit of the implementation, which copes with
    /// needles of any length. (Perhaps that suggests there are missing
    /// optimizations.) It is both a heuristic and a bound that preserves
    /// linear time complexity.
    ///
    /// Heuristic: every candidate triggers a memcmp, and for very large
    /// needles with lots of false positives that memcmp can make the search
    /// quite slow.
    ///
    /// Bound: the worst case with memcmp is multiplicative in the sizes of
    /// needle and haystack, whereas we want it to stay additive. Capping the
    /// needle length at a constant keeps that guarantee in theory. This is
    /// not bad faith: memcmp on tiny needles is fast enough that even the
    /// pathological vector benchmarks are as fast or faster in practice.
    ///
    /// The concrete value came from tweaking and benchmarking. For example,
    /// rare-medium-needle is about 5% faster with this algorithm than with a
    /// prefilter-accelerated Two-Way. There is also a theoretical wish to
    /// keep the number fairly low to limit the damage of pathological cases.
    /// With 64, some benchmarks improved a little while others (the
    /// pathological ones in particular) got a lot worse. So... 32 it is?
    const MAX_LEN: usize = 32;

    (MIN_LEN..=MAX_LEN).contains(&needle.len())
}
+
1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 +95 +96 +97 +98 +99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 +247 +248 +249 +250 +251 +252 +253 +254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 +265 +266 +267 +268 +269 +270 +271 +272 +273 +274 +275 +276 +277 +278 +279 +280 +281 +282 +283 +284 +285 +286 +287 +288 +289 +290 +291 +292 +293 +294 +295 +296 +297 +298 +299 +300 +301 +302 +303 +304 +305 +306 +307 +308 +309 +310 +311 +312 +313 +314 +315 +316 +317 +318 +319 +320 +321 +322 +323 +324 +325 +326 +327 +328 +329 +330 +331 +332 +333 +334 +335 +336 +337 +338 +339 +340 +341 +342 +343 +344 +345 +346 +347 +348 +349 +350 +351 +352 +353 +354 +355 +356 +357 +358 +359 +360 +361 +362 +363 +364 +365 +366 +367 +368 +369 +370 +371 +372 +373 +374 +375 +376 +377 +378 +379 +380 +381 +382 +383 +384 +385 +386 +387 +388 +389 +390 +391 +392 +393 +394 +395 +396 +397 +398 +399 +400 +401 +402 +403 +404 +405 +406 +407 +408 +409 +410 +411 +412 +413 +414 +415 +416 +417 +418 +419 +420 +421 
+422 +423 +424 +425 +426 +427 +428 +429 +430 +431 +432 +433 +434 +435 +436 +437 +438 +439 +440 +441 +442 +443 +444 +445 +446 +447 +448 +449 +450 +451 +452 +453 +454 +455 +456 +457 +458 +459 +460 +461 +462 +463 +464 +465 +466 +467 +468 +469 +470 +471 +472 +473 +474 +475 +476 +477 +478 +479 +480 +481 +482 +483 +484 +485 +486 +487 +488 +489 +490 +491 +492 +493 +494 +495 +496 +497 +498 +499 +500 +501 +502 +503 +504 +505 +506 +507 +508 +509 +510 +511 +512 +513 +514 +515 +
/// A trait for describing vector operations used by vectorized searchers.
+///
+/// The trait is highly constrained to low level vector operations needed.
+/// In general, it was invented mostly to be generic over x86's __m128i and
+/// __m256i types. At time of writing, it also supports wasm and aarch64
+/// 128-bit vector types as well.
+///
+/// # Safety
+///
+/// All methods are not safe since they are intended to be implemented using
+/// vendor intrinsics, which are also not safe. Callers must ensure that the
+/// appropriate target features are enabled in the calling function, and that
+/// the current CPU supports them. All implementations should avoid marking the
+/// routines with #[target_feature] and instead mark them as #[inline(always)]
+/// to ensure they get appropriately inlined. (inline(always) cannot be used
+/// with target_feature.)
+pub(crate) trait Vector: Copy + core::fmt::Debug {
+ /// The number of bits in the vector.
+ const BITS: usize;
+ /// The number of bytes in the vector. That is, this is the size of the
+ /// vector in memory.
+ const BYTES: usize;
+ /// The bits that must be zero in order for a `*const u8` pointer to be
+ /// correctly aligned to read vector values.
+ const ALIGN: usize;
+
+ /// The type of the value returned by `Vector::movemask`.
+ ///
+ /// This supports abstracting over the specific representation used in
+ /// order to accommodate different representations in different ISAs.
+ type Mask: MoveMask;
+
+ /// Create a vector with 8-bit lanes with the given byte repeated into each
+ /// lane.
+ unsafe fn splat(byte: u8) -> Self;
+
+ /// Read a vector-size number of bytes from the given pointer. The pointer
+ /// must be aligned to the size of the vector.
+ ///
+ /// # Safety
+ ///
+ /// Callers must guarantee that at least `BYTES` bytes are readable from
+ /// `data` and that `data` is aligned to a `BYTES` boundary.
+ unsafe fn load_aligned(data: *const u8) -> Self;
+
+ /// Read a vector-size number of bytes from the given pointer. The pointer
+ /// does not need to be aligned.
+ ///
+ /// # Safety
+ ///
+ /// Callers must guarantee that at least `BYTES` bytes are readable from
+ /// `data`.
+ unsafe fn load_unaligned(data: *const u8) -> Self;
+
+ /// _mm_movemask_epi8 or _mm256_movemask_epi8
+ unsafe fn movemask(self) -> Self::Mask;
+ /// _mm_cmpeq_epi8 or _mm256_cmpeq_epi8
+ unsafe fn cmpeq(self, vector2: Self) -> Self;
+ /// _mm_and_si128 or _mm256_and_si256
+ unsafe fn and(self, vector2: Self) -> Self;
+ /// _mm_or or _mm256_or_si256
+ unsafe fn or(self, vector2: Self) -> Self;
+ /// Returns true if and only if `Self::movemask` would return a mask that
+ /// contains at least one non-zero bit.
+ unsafe fn movemask_will_have_non_zero(self) -> bool {
+ self.movemask().has_non_zero()
+ }
+}
+
/// Abstraction over the vector-to-scalar operation known as "move mask."
///
/// On x86-64, this is `_mm_movemask_epi8` for SSE2 and `_mm256_movemask_epi8`
/// for AVX2. It takes a vector of `u8` lanes and returns a scalar in which
/// the `i`th bit is set if and only if the most significant bit of the `i`th
/// lane of the vector is set. The simd128 ISA for wasm32 supports this exact
/// operation natively as well.
///
/// ... But aarch64 doesn't. So there it has to be emulated with more
/// instructions and a slightly different representation. Unifying the
/// representations would be possible, but it would add cost to the hot paths
/// of `memchr` and `packedpair`. So instead, this trait abstracts over the
/// specific representation and defines only the operations that are actually
/// needed.
pub(crate) trait MoveMask: Copy + core::fmt::Debug {
    /// Return the mask used to discard the least significant `n` lanes of a
    /// corresponding vector.
    ///
    /// In the `SensibleMoveMask` implementation the result has its `n` least
    /// significant bits cleared and every other bit set, so `and`-ing another
    /// mask with it drops that mask's first `n` lanes.
    fn all_zeros_except_least_significant(n: usize) -> Self;

    /// Returns true if and only if this mask has a non-zero bit anywhere.
    fn has_non_zero(self) -> bool;

    /// Returns the number of bits set to 1 in this mask.
    fn count_ones(self) -> usize;

    /// Does a bitwise `and` operation between `self` and `other`.
    fn and(self, other: Self) -> Self;

    /// Does a bitwise `or` operation between `self` and `other`.
    fn or(self, other: Self) -> Self;

    /// Returns a mask that is equivalent to `self` but with the least
    /// significant 1-bit set to 0.
    fn clear_least_significant_bit(self) -> Self;

    /// Returns the offset of the first non-zero lane this mask represents.
    fn first_offset(self) -> usize;

    /// Returns the offset of the last non-zero lane this mask represents.
    fn last_offset(self) -> usize;
}
+
/// A "sensible" move mask: each bit records whether the most significant bit
/// is set in the corresponding lane of a vector. This form is used on x86-64
/// and wasm. Obtaining such a mask is more expensive on aarch64, so a
/// slightly different representation is used there.
///
/// It is called "sensible" because it is what the native sse/avx movemask
/// instructions produce. Neon has no native equivalent.
#[derive(Clone, Copy, Debug)]
pub(crate) struct SensibleMoveMask(u32);

impl SensibleMoveMask {
    /// Get the mask in a form suitable for computing offsets.
    ///
    /// This normalizes to little endian: the value is returned untouched on
    /// little endian targets and byte-swapped on big endian targets.
    #[inline(always)]
    fn get_for_offset(self) -> u32 {
        // `to_le` is a no-op on little endian and a byte swap on big endian.
        self.0.to_le()
    }
}
+
+impl MoveMask for SensibleMoveMask {
+ #[inline(always)]
+ fn all_zeros_except_least_significant(n: usize) -> SensibleMoveMask {
+ debug_assert!(n < 32);
+ SensibleMoveMask(!((1 << n) - 1))
+ }
+
+ #[inline(always)]
+ fn has_non_zero(self) -> bool {
+ self.0 != 0
+ }
+
+ #[inline(always)]
+ fn count_ones(self) -> usize {
+ self.0.count_ones() as usize
+ }
+
+ #[inline(always)]
+ fn and(self, other: SensibleMoveMask) -> SensibleMoveMask {
+ SensibleMoveMask(self.0 & other.0)
+ }
+
+ #[inline(always)]
+ fn or(self, other: SensibleMoveMask) -> SensibleMoveMask {
+ SensibleMoveMask(self.0 | other.0)
+ }
+
+ #[inline(always)]
+ fn clear_least_significant_bit(self) -> SensibleMoveMask {
+ SensibleMoveMask(self.0 & (self.0 - 1))
+ }
+
+ #[inline(always)]
+ fn first_offset(self) -> usize {
+ // We are dealing with little endian here (and if we aren't, we swap
+ // the bytes so we are in practice), where the most significant byte
+ // is at a higher address. That means the least significant bit that
+ // is set corresponds to the position of our first matching byte.
+ // That position corresponds to the number of zeros after the least
+ // significant bit.
+ self.get_for_offset().trailing_zeros() as usize
+ }
+
+ #[inline(always)]
+ fn last_offset(self) -> usize {
+ // We are dealing with little endian here (and if we aren't, we swap
+ // the bytes so we are in practice), where the most significant byte is
+ // at a higher address. That means the most significant bit that is set
+ // corresponds to the position of our last matching byte. The position
+ // from the end of the mask is therefore the number of leading zeros
+ // in a 32 bit integer, and the position from the start of the mask is
+ // therefore 32 - (leading zeros) - 1.
+ 32 - self.get_for_offset().leading_zeros() as usize - 1
+ }
+}
+
+#[cfg(target_arch = "x86_64")]
+mod x86sse2 {
+ use core::arch::x86_64::*;
+
+ use super::{SensibleMoveMask, Vector};
+
+ impl Vector for __m128i {
+ const BITS: usize = 128;
+ const BYTES: usize = 16;
+ const ALIGN: usize = Self::BYTES - 1;
+
+ type Mask = SensibleMoveMask;
+
+ #[inline(always)]
+ unsafe fn splat(byte: u8) -> __m128i {
+ _mm_set1_epi8(byte as i8)
+ }
+
+ #[inline(always)]
+ unsafe fn load_aligned(data: *const u8) -> __m128i {
+ _mm_load_si128(data as *const __m128i)
+ }
+
+ #[inline(always)]
+ unsafe fn load_unaligned(data: *const u8) -> __m128i {
+ _mm_loadu_si128(data as *const __m128i)
+ }
+
+ #[inline(always)]
+ unsafe fn movemask(self) -> SensibleMoveMask {
+ SensibleMoveMask(_mm_movemask_epi8(self) as u32)
+ }
+
+ #[inline(always)]
+ unsafe fn cmpeq(self, vector2: Self) -> __m128i {
+ _mm_cmpeq_epi8(self, vector2)
+ }
+
+ #[inline(always)]
+ unsafe fn and(self, vector2: Self) -> __m128i {
+ _mm_and_si128(self, vector2)
+ }
+
+ #[inline(always)]
+ unsafe fn or(self, vector2: Self) -> __m128i {
+ _mm_or_si128(self, vector2)
+ }
+ }
+}
+
+#[cfg(target_arch = "x86_64")]
+mod x86avx2 {
+ use core::arch::x86_64::*;
+
+ use super::{SensibleMoveMask, Vector};
+
+ impl Vector for __m256i {
+ const BITS: usize = 256;
+ const BYTES: usize = 32;
+ const ALIGN: usize = Self::BYTES - 1;
+
+ type Mask = SensibleMoveMask;
+
+ #[inline(always)]
+ unsafe fn splat(byte: u8) -> __m256i {
+ _mm256_set1_epi8(byte as i8)
+ }
+
+ #[inline(always)]
+ unsafe fn load_aligned(data: *const u8) -> __m256i {
+ _mm256_load_si256(data as *const __m256i)
+ }
+
+ #[inline(always)]
+ unsafe fn load_unaligned(data: *const u8) -> __m256i {
+ _mm256_loadu_si256(data as *const __m256i)
+ }
+
+ #[inline(always)]
+ unsafe fn movemask(self) -> SensibleMoveMask {
+ SensibleMoveMask(_mm256_movemask_epi8(self) as u32)
+ }
+
+ #[inline(always)]
+ unsafe fn cmpeq(self, vector2: Self) -> __m256i {
+ _mm256_cmpeq_epi8(self, vector2)
+ }
+
+ #[inline(always)]
+ unsafe fn and(self, vector2: Self) -> __m256i {
+ _mm256_and_si256(self, vector2)
+ }
+
+ #[inline(always)]
+ unsafe fn or(self, vector2: Self) -> __m256i {
+ _mm256_or_si256(self, vector2)
+ }
+ }
+}
+
#[cfg(target_arch = "aarch64")]
mod aarch64neon {
    use core::arch::aarch64::*;

    use super::{MoveMask, Vector};

    impl Vector for uint8x16_t {
        const BITS: usize = 128;
        const BYTES: usize = 16;
        const ALIGN: usize = Self::BYTES - 1;

        type Mask = NeonMoveMask;

        #[inline(always)]
        unsafe fn splat(byte: u8) -> uint8x16_t {
            vdupq_n_u8(byte)
        }

        #[inline(always)]
        unsafe fn load_aligned(data: *const u8) -> uint8x16_t {
            // I've tried `data.cast::<uint8x16_t>().read()` instead, but
            // couldn't observe any benchmark differences.
            Self::load_unaligned(data)
        }

        #[inline(always)]
        unsafe fn load_unaligned(data: *const u8) -> uint8x16_t {
            vld1q_u8(data)
        }

        /// Emulates a move mask with a "shift right narrow": the vector is
        /// narrowed to 64 bits, 4 bits per original lane, and only the top
        /// bit of every 4-bit group is kept.
        #[inline(always)]
        unsafe fn movemask(self) -> NeonMoveMask {
            let asu16s = vreinterpretq_u16_u8(self);
            let mask = vshrn_n_u16(asu16s, 4);
            let asu64 = vreinterpret_u64_u8(mask);
            let scalar64 = vget_lane_u64(asu64, 0);
            NeonMoveMask(scalar64 & 0x8888888888888888)
        }

        #[inline(always)]
        unsafe fn cmpeq(self, vector2: Self) -> uint8x16_t {
            vceqq_u8(self, vector2)
        }

        #[inline(always)]
        unsafe fn and(self, vector2: Self) -> uint8x16_t {
            vandq_u8(self, vector2)
        }

        #[inline(always)]
        unsafe fn or(self, vector2: Self) -> uint8x16_t {
            vorrq_u8(self, vector2)
        }

        /// This is the only interesting implementation of this routine.
        /// Basically, instead of doing the "shift right narrow" dance, we use
        /// adjacent folding max to determine whether there are any non-zero
        /// bytes in our mask. If there are, *then* we'll do the "shift right
        /// narrow" dance. In benchmarks, this does lead to slightly better
        /// throughput, but the win doesn't appear huge.
        #[inline(always)]
        unsafe fn movemask_will_have_non_zero(self) -> bool {
            let low = vreinterpretq_u64_u8(vpmaxq_u8(self, self));
            vgetq_lane_u64(low, 0) != 0
        }
    }

    /// Neon doesn't have a `movemask` that works like the one in x86-64, so we
    /// wind up using a different method[1]. The different method also produces
    /// a mask, but 4 bits are set in the neon case instead of a single bit set
    /// in the x86-64 case. We do an extra step to zero out 3 of the 4 bits,
    /// but we still wind up with at least 3 zeroes between each set bit. This
    /// generally means that we need to do some division by 4 before extracting
    /// offsets.
    ///
    /// In fact, the existence of this type is the entire reason that we have
    /// the `MoveMask` trait in the first place. This basically lets us keep
    /// the different representations of masks without being forced to unify
    /// them into a single representation, which could result in extra and
    /// unnecessary work.
    ///
    /// [1]: https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
    #[derive(Clone, Copy, Debug)]
    pub(crate) struct NeonMoveMask(u64);

    impl NeonMoveMask {
        /// Get the mask in a form suitable for computing offsets.
        ///
        /// Basically, this normalizes to little endian. On big endian, this
        /// swaps the bytes.
        #[inline(always)]
        fn get_for_offset(self) -> u64 {
            #[cfg(target_endian = "big")]
            {
                self.0.swap_bytes()
            }
            #[cfg(target_endian = "little")]
            {
                self.0
            }
        }
    }

    impl MoveMask for NeonMoveMask {
        /// Builds a mask in which the `n` least significant lanes are
        /// cleared and every remaining bit is set.
        ///
        /// `n` must be less than 16 (checked in debug builds only).
        #[inline(always)]
        fn all_zeros_except_least_significant(n: usize) -> NeonMoveMask {
            debug_assert!(n < 16);
            // Every lane occupies 4 bits in this representation, so dropping
            // `n` lanes means clearing the low `4 * n` bits. With `n < 16`
            // the shift amount stays below 64.
            NeonMoveMask(!((1u64 << (n << 2)) - 1))
        }

        #[inline(always)]
        fn has_non_zero(self) -> bool {
            self.0 != 0
        }

        #[inline(always)]
        fn count_ones(self) -> usize {
            self.0.count_ones() as usize
        }

        #[inline(always)]
        fn and(self, other: NeonMoveMask) -> NeonMoveMask {
            NeonMoveMask(self.0 & other.0)
        }

        #[inline(always)]
        fn or(self, other: NeonMoveMask) -> NeonMoveMask {
            NeonMoveMask(self.0 | other.0)
        }

        /// Returns this mask with its lowest set bit cleared. An empty mask
        /// is returned unchanged.
        #[inline(always)]
        fn clear_least_significant_bit(self) -> NeonMoveMask {
            // `wrapping_sub` keeps `x & (x - 1)` well defined for an empty
            // mask; a plain `- 1` would panic on overflow in debug builds.
            NeonMoveMask(self.0 & self.0.wrapping_sub(1))
        }

        #[inline(always)]
        fn first_offset(self) -> usize {
            // We are dealing with little endian here (and if we aren't,
            // we swap the bytes so we are in practice), where the most
            // significant byte is at a higher address. That means the least
            // significant bit that is set corresponds to the position of our
            // first matching byte. That position corresponds to the number of
            // zeros after the least significant bit.
            //
            // Note that unlike `SensibleMoveMask`, this mask has its bits
            // spread out over 64 bits instead of 16 bits (for a 128 bit
            // vector). Namely, whereas x86-64 will turn
            //
            //   0x00 0xFF 0x00 0x00 0xFF
            //
            // into 10010, our neon approach will turn it into
            //
            //   10000000000010000000
            //
            // And this happens because neon doesn't have a native `movemask`
            // instruction, so we kind of fake it[1]. Thus, we divide the
            // number of trailing zeros by 4 to get the "real" offset.
            //
            // [1]: https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
            (self.get_for_offset().trailing_zeros() >> 2) as usize
        }

        /// The mask must be non-zero: for an empty mask the arithmetic below
        /// underflows.
        #[inline(always)]
        fn last_offset(self) -> usize {
            // See comment in `first_offset` above. This is basically the same,
            // but coming from the other direction.
            16 - (self.get_for_offset().leading_zeros() >> 2) as usize - 1
        }
    }
}
+
#[cfg(target_arch = "wasm32")]
mod wasm_simd128 {
    use core::arch::wasm32::*;

    use super::{SensibleMoveMask, Vector};

    /// `Vector` for the 128-bit wasm32 vector type.
    impl Vector for v128 {
        const BITS: usize = 128;
        const BYTES: usize = 16;
        const ALIGN: usize = Self::BYTES - 1;

        type Mask = SensibleMoveMask;

        #[inline(always)]
        unsafe fn splat(byte: u8) -> Self {
            u8x16_splat(byte)
        }

        #[inline(always)]
        unsafe fn load_aligned(data: *const u8) -> Self {
            // A plain typed read; the caller guarantees the alignment.
            data.cast::<v128>().read()
        }

        #[inline(always)]
        unsafe fn load_unaligned(data: *const u8) -> Self {
            v128_load(data.cast::<v128>())
        }

        #[inline(always)]
        unsafe fn movemask(self) -> Self::Mask {
            let bits = u8x16_bitmask(self);
            SensibleMoveMask(u32::from(bits))
        }

        #[inline(always)]
        unsafe fn cmpeq(self, vector2: Self) -> Self {
            u8x16_eq(self, vector2)
        }

        #[inline(always)]
        unsafe fn and(self, vector2: Self) -> Self {
            v128_and(self, vector2)
        }

        #[inline(always)]
        unsafe fn or(self, vector2: Self) -> Self {
            v128_or(self, vector2)
        }
    }
}
+
mod lowercase;
+mod titlecase;
+mod uppercase;
+
+pub use lowercase::make_ascii_lowercase;
+pub use titlecase::make_ascii_titlecase;
+pub use uppercase::make_ascii_uppercase;
+
+#[cfg(feature = "alloc")]
+pub use lowercase::to_ascii_lowercase;
+#[cfg(feature = "alloc")]
+pub use titlecase::to_ascii_titlecase;
+#[cfg(feature = "alloc")]
+pub use uppercase::to_ascii_uppercase;
+
1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +
#[cfg(feature = "alloc")]
+use alloc::vec::Vec;
+
/// Lowercases the ASCII letters of the given slice in place.
///
/// ASCII letters 'A' to 'Z' become 'a' to 'z'; every other byte, including
/// non-ASCII letters, is left unchanged.
///
/// This function can be used to implement [`String#downcase!`] for ASCII
/// strings in Ruby.
///
#[cfg_attr(
    feature = "alloc",
    doc = "To return a new lowercased value without modifying the existing one, use [`to_ascii_lowercase`]."
)]
/// This function is analogous to [`<[u8]>::make_ascii_lowercase`][slice-primitive].
///
/// # Examples
///
/// ```
/// # use roe::make_ascii_lowercase;
/// let mut buf = *b"ABCxyz";
/// make_ascii_lowercase(&mut buf);
/// assert_eq!(buf, *b"abcxyz");
///
/// let mut buf = *b"1234%&*";
/// make_ascii_lowercase(&mut buf);
/// assert_eq!(buf, *b"1234%&*");
/// ```
///
/// [`String#downcase!`]: https://ruby-doc.org/core-3.1.2/String.html#method-i-downcase-21
/// [slice-primitive]: https://doc.rust-lang.org/std/primitive.slice.html#method.make_ascii_lowercase
#[inline]
#[allow(clippy::module_name_repetitions)]
pub fn make_ascii_lowercase<T: AsMut<[u8]>>(slice: &mut T) {
    let bytes: &mut [u8] = slice.as_mut();
    for byte in bytes {
        byte.make_ascii_lowercase();
    }
}
+
/// Builds a new vector holding the bytes of the given slice with every ASCII
/// letter lowercased.
///
/// ASCII letters 'A' to 'Z' become 'a' to 'z'; every other byte, including
/// non-ASCII letters, is copied unchanged.
///
/// This function can be used to implement [`String#downcase`] and
/// [`Symbol#downcase`] for ASCII strings in Ruby.
///
/// To lowercase the value in-place, use [`make_ascii_lowercase`]. This function
/// is analogous to [`<[u8]>::to_ascii_lowercase`][slice-primitive].
///
/// # Examples
///
/// ```
/// # use roe::to_ascii_lowercase;
/// assert_eq!(to_ascii_lowercase("ABCxyz"), &b"abcxyz"[..]);
/// assert_eq!(to_ascii_lowercase("1234%&*"), &b"1234%&*"[..]);
/// ```
///
/// [`String#downcase`]: https://ruby-doc.org/core-3.1.2/String.html#method-i-downcase
/// [`Symbol#downcase`]: https://ruby-doc.org/core-3.1.2/Symbol.html#method-i-downcase
/// [slice-primitive]: https://doc.rust-lang.org/std/primitive.slice.html#method.to_ascii_lowercase
#[inline]
#[cfg(feature = "alloc")]
#[cfg_attr(docsrs, doc(cfg(feature = "alloc")))]
#[allow(clippy::module_name_repetitions)]
pub fn to_ascii_lowercase<T: AsRef<[u8]>>(slice: T) -> Vec<u8> {
    slice.as_ref().iter().map(u8::to_ascii_lowercase).collect()
}
+
#[cfg(test)]
mod tests {
    /// Lowercasing an empty buffer in place leaves it empty.
    #[test]
    fn make_ascii_lowercase_empty() {
        let mut buf: [u8; 0] = [];
        super::make_ascii_lowercase(&mut buf);
        assert!(buf.is_empty());
    }

    /// Lowercasing an empty input yields an empty vector.
    #[test]
    #[cfg(feature = "alloc")]
    fn to_ascii_lowercase_empty() {
        let lowered = super::to_ascii_lowercase("");
        assert!(lowered.is_empty());
    }
}
+
1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 +95 +96 +97 +98 +99 +100 +101 +102 +103 +104 +105 +106 +
#[cfg(feature = "alloc")]
+use alloc::vec::Vec;
+
/// Title-cases the ASCII letters of the given slice in place.
///
/// In the first byte, ASCII letters 'a' to 'z' become 'A' to 'Z'. In every
/// later byte, ASCII letters 'A' to 'Z' become 'a' to 'z'. Non-ASCII letters
/// are unchanged.
///
/// This function can be used to implement [`String#capitalize!`] for ASCII
/// strings in Ruby.
///
#[cfg_attr(
    feature = "alloc",
    doc = "To return a new titlecased value without modifying the existing one, use [`to_ascii_titlecase`]."
)]
///
/// # Examples
///
/// ```
/// # use roe::make_ascii_titlecase;
/// let mut buf = *b"ABCxyz";
/// make_ascii_titlecase(&mut buf);
/// assert_eq!(buf, *b"Abcxyz");
///
/// let mut buf = *b"1234%&*";
/// make_ascii_titlecase(&mut buf);
/// assert_eq!(buf, *b"1234%&*");
///
/// let mut buf = *b"ABC1234%&*";
/// make_ascii_titlecase(&mut buf);
/// assert_eq!(buf, *b"Abc1234%&*");
///
/// let mut buf = *b"1234%&*abcXYZ";
/// make_ascii_titlecase(&mut buf);
/// assert_eq!(buf, *b"1234%&*abcxyz");
///
/// let mut buf = *b"ABC, XYZ";
/// make_ascii_titlecase(&mut buf);
/// assert_eq!(buf, *b"Abc, xyz");
/// ```
///
/// [`String#capitalize!`]: https://ruby-doc.org/core-3.1.2/String.html#method-i-capitalize-21
#[inline]
#[allow(clippy::module_name_repetitions)]
pub fn make_ascii_titlecase<T: AsMut<[u8]>>(slice: &mut T) {
    let bytes: &mut [u8] = slice.as_mut();
    match bytes {
        // Nothing to do for an empty slice.
        [] => {}
        [first, rest @ ..] => {
            first.make_ascii_uppercase();
            rest.make_ascii_lowercase();
        }
    }
}
+
/// Builds a new vector holding the bytes of the given slice in ASCII title
/// case.
///
/// In the first byte, ASCII letters 'a' to 'z' become 'A' to 'Z'. In every
/// later byte, ASCII letters 'A' to 'Z' become 'a' to 'z'. Non-ASCII letters
/// are copied unchanged.
///
/// This function can be used to implement [`String#capitalize`] and
/// [`Symbol#capitalize`] for ASCII strings in Ruby.
///
/// To titlecase the value in-place, use [`make_ascii_titlecase`].
///
/// # Examples
///
/// ```
/// # use roe::to_ascii_titlecase;
/// assert_eq!(to_ascii_titlecase("ABCxyz"), &b"Abcxyz"[..]);
/// assert_eq!(to_ascii_titlecase("1234%&*"), &b"1234%&*"[..]);
/// assert_eq!(to_ascii_titlecase("ABC1234%&*"), &b"Abc1234%&*"[..]);
/// assert_eq!(to_ascii_titlecase("1234%&*abcXYZ"), &b"1234%&*abcxyz"[..]);
/// assert_eq!(to_ascii_titlecase("ABC, XYZ"), &b"Abc, xyz"[..]);
/// ```
///
/// [`String#capitalize`]: https://ruby-doc.org/core-3.1.2/String.html#method-i-capitalize
/// [`Symbol#capitalize`]: https://ruby-doc.org/core-3.1.2/Symbol.html#method-i-capitalize
#[inline]
#[cfg(feature = "alloc")]
#[cfg_attr(docsrs, doc(cfg(feature = "alloc")))]
#[allow(clippy::module_name_repetitions)]
pub fn to_ascii_titlecase<T: AsRef<[u8]>>(slice: T) -> Vec<u8> {
    let bytes = slice.as_ref();
    let mut out = Vec::with_capacity(bytes.len());
    let mut rest = bytes.iter();
    // The first byte is uppercased, everything after it is lowercased.
    if let Some(first) = rest.next() {
        out.push(first.to_ascii_uppercase());
    }
    out.extend(rest.map(u8::to_ascii_lowercase));
    out
}
+
#[cfg(test)]
mod tests {
    /// Title-casing an empty buffer in place leaves it empty.
    #[test]
    fn make_ascii_titlecase_empty() {
        let mut buf: [u8; 0] = [];
        super::make_ascii_titlecase(&mut buf);
        assert!(buf.is_empty());
    }

    /// Title-casing an empty input yields an empty vector.
    #[test]
    #[cfg(feature = "alloc")]
    fn to_ascii_titlecase_empty() {
        let titlecased = super::to_ascii_titlecase("");
        assert!(titlecased.is_empty());
    }
}
+
1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +
#[cfg(feature = "alloc")]
+use alloc::vec::Vec;
+
/// Converts the given slice to its ASCII upper case equivalent in-place.
///
/// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z', but non-ASCII letters are
/// unchanged.
///
/// This function can be used to implement [`String#upcase!`] for ASCII strings
/// in Ruby.
///
#[cfg_attr(
    feature = "alloc",
    doc = "To return a new uppercased value without modifying the existing one, use [`to_ascii_uppercase`]."
)]
/// This function is analogous to [`<[u8]>::make_ascii_uppercase`][slice-primitive].
///
/// # Examples
///
/// ```
/// # use roe::make_ascii_uppercase;
/// let mut buf = *b"ABCxyz";
/// make_ascii_uppercase(&mut buf);
/// assert_eq!(buf, *b"ABCXYZ");
///
/// let mut buf = *b"1234%&*";
/// make_ascii_uppercase(&mut buf);
/// assert_eq!(buf, *b"1234%&*");
/// ```
///
/// [`String#upcase!`]: https://ruby-doc.org/core-3.1.2/String.html#method-i-upcase-21
/// [slice-primitive]: https://doc.rust-lang.org/std/primitive.slice.html#method.make_ascii_uppercase
#[inline]
#[allow(clippy::module_name_repetitions)]
pub fn make_ascii_uppercase<T: AsMut<[u8]>>(slice: &mut T) {
    let slice = slice.as_mut();
    slice.make_ascii_uppercase();
}
+
/// Builds a new vector holding the bytes of the given slice with every ASCII
/// letter uppercased.
///
/// ASCII letters 'a' to 'z' become 'A' to 'Z'; every other byte, including
/// non-ASCII letters, is copied unchanged.
///
/// This function can be used to implement [`String#upcase`] and
/// [`Symbol#upcase`] for ASCII strings in Ruby.
///
/// To uppercase the value in-place, use [`make_ascii_uppercase`]. This function
/// is analogous to [`<[u8]>::to_ascii_uppercase`][slice-primitive].
///
/// # Examples
///
/// ```
/// # use roe::to_ascii_uppercase;
/// assert_eq!(to_ascii_uppercase("ABCxyz"), &b"ABCXYZ"[..]);
/// assert_eq!(to_ascii_uppercase("1234%&*"), &b"1234%&*"[..]);
/// ```
///
/// [`String#upcase`]: https://ruby-doc.org/core-3.1.2/String.html#method-i-upcase
/// [`Symbol#upcase`]: https://ruby-doc.org/core-3.1.2/Symbol.html#method-i-upcase
/// [slice-primitive]: https://doc.rust-lang.org/std/primitive.slice.html#method.to_ascii_uppercase
#[inline]
#[cfg(feature = "alloc")]
#[cfg_attr(docsrs, doc(cfg(feature = "alloc")))]
#[allow(clippy::module_name_repetitions)]
pub fn to_ascii_uppercase<T: AsRef<[u8]>>(slice: T) -> Vec<u8> {
    slice.as_ref().iter().map(u8::to_ascii_uppercase).collect()
}
+
#[cfg(test)]
mod tests {
    /// Uppercasing an empty buffer in place leaves it empty.
    #[test]
    fn make_ascii_uppercase_empty() {
        let mut buf: [u8; 0] = [];
        super::make_ascii_uppercase(&mut buf);
        assert!(buf.is_empty());
    }

    /// Uppercasing an empty input yields an empty vector.
    #[test]
    #[cfg(feature = "alloc")]
    fn to_ascii_uppercase_empty() {
        let uppercased = super::to_ascii_uppercase("");
        assert!(uppercased.is_empty());
    }
}
+
1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 +95 +96 +97 +98 +99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 +247 +248 +249 +250 +251 +252 +253 +254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 +265 +266 +267 +268 +269 +270 +271 +272 +273 +274 +275 +276 +277 +278 +279 +280 +281 +282 +283 +284 +285 +286 +287 +288 +289 +290 +291 +292 +293 +294 +295 +296 +297 +298 +299 +300 +301 +302 +303 +304 +305 +306 +307 +308 +309 +310 +311 +312 +313 +314 +315 +316 +317 +318 +319 +320 +321 +322 +323 +324 +325 +326 +327 +328 +329 +330 +331 +332 +333 +334 +335 +336 +337 +338 +339 +340 +341 +342 +343 +344 +345 +346 +347 +348 +349 +350 +351 +352 +353 +354 +355 +356 +357 +358 +359 +360 +361 +362 +363 +364 +365 +366 +367 +368 +369 +370 +371 +372 +373 +374 +375 +376 +377 +378 +379 +380 +381 +382 +383 +384 +385 +386 +387 +388 +389 +390 +391 +392 +393 +394 +395 +396 +397 +398 +399 +400 +401 +402 +403 +404 +405 +406 +407 +408 +409 +410 +411 +412 +413 +414 +415 +416 +417 +418 +419 +420 +421 
+422 +423 +424 +425 +426 +427 +428 +429 +430 +431 +432 +433 +434 +435 +436 +437 +438 +439 +440 +441 +442 +443 +444 +445 +446 +447 +448 +449 +450 +451 +452 +453 +454 +455 +456 +457 +458 +459 +460 +461 +462 +463 +464 +465 +466 +467 +468 +469 +470 +471 +472 +473 +474 +
#![warn(clippy::all)]
+#![warn(clippy::pedantic)]
+#![cfg_attr(test, allow(clippy::non_ascii_literal))]
+#![cfg_attr(test, allow(clippy::shadow_unrelated))]
+#![warn(clippy::cargo)]
+#![allow(unknown_lints)]
+#![warn(missing_copy_implementations)]
+#![warn(missing_debug_implementations)]
+#![warn(missing_docs)]
+#![warn(rust_2018_idioms)]
+#![warn(trivial_casts, trivial_numeric_casts)]
+#![warn(unused_qualifications)]
+#![warn(variant_size_differences)]
+#![forbid(unsafe_code)]
+// Enable feature callouts in generated documentation:
+// https://doc.rust-lang.org/beta/unstable-book/language-features/doc-cfg.html
+//
+// This approach is borrowed from tokio.
+#![cfg_attr(docsrs, feature(doc_cfg))]
+#![cfg_attr(docsrs, feature(doc_alias))]
+
+//! This crate provides [Unicode case mapping] routines and iterators for
+//! [conventionally UTF-8 binary strings].
+//!
+//! Unicode case mapping or case conversion can be used to transform the
+//! characters in a string. To quote the Unicode FAQ:
+//!
+//! > Case mapping or case conversion is a process whereby strings are converted
+//! > to a particular form—uppercase, lowercase, or titlecase—possibly for
+//! > display to the user.
+//!
+//! This crate is currently a *work in progress*. When the API is complete, Roe
+//! will support lowercase, uppercase, titlecase, and case folding iterators for
+//! conventionally UTF-8 byte slices.
+//!
+//! Roe will implement support for full, Turkic, ASCII, and case folding
+//! transforms.
+//!
+//! # Usage
+//!
+//! You can convert case like:
+//!
+//! ```
+//! # use roe::{LowercaseMode, UppercaseMode};
+//! assert_eq!(
+//! roe::lowercase(b"Artichoke Ruby", LowercaseMode::Ascii).collect::<Vec<_>>(),
+//! b"artichoke ruby"
+//! );
+//! assert_eq!(
+//! roe::uppercase("Αύριο".as_bytes(), UppercaseMode::Full).collect::<Vec<_>>(),
+//! "ΑΎΡΙΟ".as_bytes()
+//! );
+//! ```
+//!
+//!
+//! Roe provides fast path routines that assume the byte slice is ASCII-only.
+//!
+//! # Crate Features
+//!
+//! Roe is `no_std` compatible with an optional dependency on the [`alloc`]
+//! crate.
+//!
+//! Roe has several Cargo features, all of which are enabled by default:
+//!
+//! - **std** - Adds a dependency on [`std`], the Rust Standard Library. This
+//! feature enables [`std::error::Error`] implementations on error types in
+//! this crate. Enabling the **std** feature also enables the **alloc**
+//! feature.
+//! - **alloc** - Adds a dependency on [`alloc`], the Rust allocation and
+//! collections library. This feature enables APIs that allocate [`String`] or
+//! [`Vec`].
+//!
+#![cfg_attr(
+ not(feature = "std"),
+ doc = "[`std`]: https://doc.rust-lang.org/std/index.html"
+)]
+#![cfg_attr(
+ not(feature = "std"),
+ doc = "[`std::error::Error`]: https://doc.rust-lang.org/std/error/trait.Error.html"
+)]
+#![cfg_attr(
+ not(feature = "alloc"),
+ doc = "[`alloc`]: https://doc.rust-lang.org/alloc/index.html"
+)]
+#![cfg_attr(feature = "alloc", doc = "[`String`]: alloc::string::String")]
+#![cfg_attr(
+ not(feature = "alloc"),
+ doc = "[`String`]: https://doc.rust-lang.org/alloc/string/struct.String.html"
+)]
+#![cfg_attr(feature = "alloc", doc = "[`Vec`]: alloc::vec::Vec")]
+#![cfg_attr(
+ not(feature = "alloc"),
+ doc = "[`Vec`]: https://doc.rust-lang.org/alloc/vec/struct.Vec.html"
+)]
+//! [Unicode case mapping]: https://unicode.org/faq/casemap_charprop.html#casemap
+//! [conventionally UTF-8 binary strings]: https://docs.rs/bstr/1.*/bstr/#when-should-i-use-byte-strings
+
+#![no_std]
+#![doc(html_root_url = "https://docs.rs/roe/0.0.5")]
+
+#[cfg(any(feature = "alloc", test))]
+extern crate alloc;
+
+#[cfg(feature = "std")]
+extern crate std;
+
+use core::convert::{TryFrom, TryInto};
+use core::fmt;
+use core::str::FromStr;
+
+mod ascii;
+mod lowercase;
+mod uppercase;
+
+pub use ascii::{make_ascii_lowercase, make_ascii_titlecase, make_ascii_uppercase};
+#[cfg(feature = "alloc")]
+pub use ascii::{to_ascii_lowercase, to_ascii_titlecase, to_ascii_uppercase};
+pub use lowercase::Lowercase;
+pub use uppercase::Uppercase;
+
+/// Error that indicates a failure to parse a [`LowercaseMode`] or
+/// [`UppercaseMode`].
+///
+/// This error corresponds to the [Ruby `ArgumentError` Exception class].
+///
+/// The error carries no data: every instance compares equal and reports the
+/// same message.
+///
+/// # Examples
+///
+/// ```
+/// # use core::convert::TryInto;
+/// # use roe::{InvalidCaseMappingMode, LowercaseMode};
+/// let err = InvalidCaseMappingMode::new();
+/// assert_eq!(err.message(), "invalid option");
+///
+/// // `"full"` is not one of the recognized option names, so parsing fails.
+/// let mode: Result<LowercaseMode, InvalidCaseMappingMode> = "full".try_into();
+/// assert_eq!(mode, Err(err));
+/// ```
+///
+/// [Ruby `ArgumentError` Exception class]: https://ruby-doc.org/core-3.1.2/ArgumentError.html
+#[derive(Default, Debug, Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)]
+pub struct InvalidCaseMappingMode {
+ // Private unit field: keeps the struct from being built with a struct
+ // literal outside this module, so values come from `new` or `Default`.
+ _private: (),
+}
+
+impl InvalidCaseMappingMode {
+    /// Creates an `InvalidCaseMappingMode` error value.
+    ///
+    /// The constructor is `const`, so it can be used to initialize constants.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # use roe::InvalidCaseMappingMode;
+    /// const ERR: InvalidCaseMappingMode = InvalidCaseMappingMode::new();
+    /// assert_eq!(ERR.message(), "invalid option");
+    /// ```
+    #[must_use]
+    pub const fn new() -> Self {
+        InvalidCaseMappingMode { _private: () }
+    }
+
+    /// Returns the static message that describes this error.
+    ///
+    /// The message is the same for every `InvalidCaseMappingMode` value.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # use roe::InvalidCaseMappingMode;
+    /// const MESSAGE: &str = InvalidCaseMappingMode::new().message();
+    /// assert_eq!(MESSAGE, "invalid option");
+    /// ```
+    #[must_use]
+    #[allow(clippy::unused_self)]
+    pub const fn message(self) -> &'static str {
+        const INVALID_OPTION: &str = "invalid option";
+        INVALID_OPTION
+    }
+}
+
+impl fmt::Display for InvalidCaseMappingMode {
+    /// Writes the fixed error message to the formatter.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        // The type is `Copy`, so `message` can be called through the reference.
+        f.write_str(self.message())
+    }
+}
+
+// Only compiled with the `std` feature. The impl body is empty, so all trait
+// methods use their default implementations.
+#[cfg(feature = "std")]
+impl std::error::Error for InvalidCaseMappingMode {}
+
+/// Options to configure the behavior of [`lowercase`].
+///
+/// Which letters exactly are replaced, and by which other letters, depends on
+/// the given options.
+///
+/// See individual variants for a description of the available behaviors.
+///
+/// If you're not sure which mode to choose, [`LowercaseMode::Full`] is a good
+/// default.
+///
+/// [`lowercase`]: crate::lowercase()
+#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)]
+pub enum LowercaseMode {
+ /// Full Unicode case mapping, suitable for most languages.
+ ///
+ /// See the [Turkic] and [Lithuanian] variants for exceptions.
+ ///
+ /// Context-dependent case mapping as described in Table 3-14 of the Unicode
+ /// standard is currently not supported.
+ ///
+ /// [Turkic]: Self::Turkic
+ /// [Lithuanian]: Self::Lithuanian
+ Full,
+ /// Only the ASCII region, i.e. the characters `'A'..='Z'` and `'a'..='z'`,
+ /// are affected.
+ ///
+ /// This option cannot be combined with any other option.
+ Ascii,
+ /// Full Unicode case mapping, adapted for Turkic languages (Turkish,
+ /// Azerbaijani, …).
+ ///
+ /// This means that upper case I is mapped to lower case dotless i, and so
+ /// on.
+ Turkic,
+ /// Currently, just [full Unicode case mapping].
+ ///
+ /// In the future, full Unicode case mapping adapted for Lithuanian (keeping
+ /// the dot on the lower case i even if there is an accent on top).
+ ///
+ /// [full Unicode case mapping]: Self::Full
+ Lithuanian,
+ /// Unicode case **folding**, which is more far-reaching than Unicode case
+ /// mapping.
+ ///
+ /// This option currently cannot be combined with any other option (i.e.
+ /// there is currently no variant for Turkic languages).
+ Fold,
+}
+
+impl Default for LowercaseMode {
+    /// The default lowercasing mode is full Unicode case mapping.
+    fn default() -> Self {
+        LowercaseMode::Full
+    }
+}
+
+impl TryFrom<&str> for LowercaseMode {
+    type Error = InvalidCaseMappingMode;
+
+    /// Parses an option name by delegating to the byte slice conversion.
+    #[inline]
+    fn try_from(value: &str) -> Result<Self, Self::Error> {
+        let name = value.as_bytes();
+        Self::try_from(name)
+    }
+}
+
+impl TryFrom<Option<&str>> for LowercaseMode {
+    type Error = InvalidCaseMappingMode;
+
+    /// Parses an optional option name by delegating to the optional byte
+    /// slice conversion.
+    #[inline]
+    fn try_from(value: Option<&str>) -> Result<Self, Self::Error> {
+        let name = value.map(str::as_bytes);
+        Self::try_from(name)
+    }
+}
+
+impl TryFrom<&[u8]> for LowercaseMode {
+    type Error = InvalidCaseMappingMode;
+
+    /// Parses an option name given as bytes.
+    ///
+    /// The recognized names are `ascii`, `turkic`, `lithuanian`, and `fold`.
+    /// Any other input produces an [`InvalidCaseMappingMode`] error.
+    #[inline]
+    fn try_from(value: &[u8]) -> Result<Self, Self::Error> {
+        let mode = match value {
+            b"ascii" => Self::Ascii,
+            b"turkic" => Self::Turkic,
+            b"lithuanian" => Self::Lithuanian,
+            b"fold" => Self::Fold,
+            _ => return Err(InvalidCaseMappingMode::new()),
+        };
+        Ok(mode)
+    }
+}
+
+impl TryFrom<Option<&[u8]>> for LowercaseMode {
+    type Error = InvalidCaseMappingMode;
+
+    /// Parses an optional option name given as bytes.
+    ///
+    /// `None` selects [`LowercaseMode::Full`]. `Some` accepts the names
+    /// `ascii`, `turkic`, `lithuanian`, and `fold`; any other name produces
+    /// an [`InvalidCaseMappingMode`] error.
+    #[inline]
+    fn try_from(value: Option<&[u8]>) -> Result<Self, Self::Error> {
+        let Some(name) = value.map(|bytes| bytes) else_marker
+    }
+}
+
+impl FromStr for LowercaseMode {
+    type Err = InvalidCaseMappingMode;
+
+    /// Parses an option name by delegating to the `TryFrom<&str>` conversion.
+    #[inline]
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        TryInto::try_into(s)
+    }
+}
+
+/// Creates an iterator over a lowercased copy of the bytes in the given slice.
+///
+/// The slice is treated as a [conventionally UTF-8 string]: UTF-8 byte
+/// sequences are converted to their Unicode lowercase equivalents and invalid
+/// UTF-8 byte sequences are yielded as is.
+///
+/// The given [`LowercaseMode`] selects the case mapping mode. See its
+/// documentation for details on the available case mapping modes.
+///
+/// # Panics
+///
+/// Not all [`LowercaseMode`]s are currently implemented. This function will
+/// panic if the caller supplies [Turkic] or [case folding] lowercasing mode.
+///
+/// [conventionally UTF-8 string]: https://docs.rs/bstr/1.*/bstr/#when-should-i-use-byte-strings
+/// [Turkic]: LowercaseMode::Turkic
+/// [case folding]: LowercaseMode::Fold
+// TODO: make this const once we're no longer panicking.
+pub fn lowercase(slice: &[u8], options: LowercaseMode) -> Lowercase<'_> {
+    use LowercaseMode::{Ascii, Fold, Full, Lithuanian, Turkic};
+
+    match options {
+        Ascii => Lowercase::with_ascii_slice(slice),
+        // Lithuanian currently shares the full Unicode case mapping.
+        Full | Lithuanian => Lowercase::with_slice(slice),
+        // TODO: implement `turkic` and `fold` modes.
+        Turkic => panic!("lowercase Turkic mode is not yet implemented"),
+        Fold => panic!("lowercase case folding mode is not yet implemented"),
+    }
+}
+
+/// Options to configure the behavior of [`uppercase`].
+///
+/// Which letters exactly are replaced, and by which other letters, depends on
+/// the given options.
+///
+/// See individual variants for a description of the available behaviors.
+///
+/// If you're not sure which mode to choose, [`UppercaseMode::Full`] is a good
+/// default.
+///
+/// [`uppercase`]: crate::uppercase()
+#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)]
+pub enum UppercaseMode {
+ /// Full Unicode case mapping, suitable for most languages.
+ ///
+ /// See the [Turkic] and [Lithuanian] variants for exceptions.
+ ///
+ /// Context-dependent case mapping as described in Table 3-14 of the Unicode
+ /// standard is currently not supported.
+ ///
+ /// [Turkic]: Self::Turkic
+ /// [Lithuanian]: Self::Lithuanian
+ Full,
+ /// Only the ASCII region, i.e. the characters `'A'..='Z'` and `'a'..='z'`,
+ /// are affected.
+ ///
+ /// This option cannot be combined with any other option.
+ Ascii,
+ // NOTE(review): the sentence below describes the lowercasing direction and
+ // matches the `LowercaseMode::Turkic` text verbatim; confirm whether it
+ // should describe the uppercase mapping instead.
+ /// Full Unicode case mapping, adapted for Turkic languages (Turkish,
+ /// Azerbaijani, …).
+ ///
+ /// This means that upper case I is mapped to lower case dotless i, and so
+ /// on.
+ Turkic,
+ /// Currently, just [full Unicode case mapping].
+ ///
+ /// In the future, full Unicode case mapping adapted for Lithuanian (keeping
+ /// the dot on the lower case i even if there is an accent on top).
+ ///
+ /// [full Unicode case mapping]: Self::Full
+ Lithuanian,
+}
+
+impl Default for UppercaseMode {
+    /// The default uppercasing mode is full Unicode case mapping.
+    fn default() -> Self {
+        UppercaseMode::Full
+    }
+}
+
+impl TryFrom<&str> for UppercaseMode {
+    type Error = InvalidCaseMappingMode;
+
+    /// Parses an option name by delegating to the byte slice conversion.
+    #[inline]
+    fn try_from(value: &str) -> Result<Self, Self::Error> {
+        let name = value.as_bytes();
+        Self::try_from(name)
+    }
+}
+
+impl TryFrom<Option<&str>> for UppercaseMode {
+    type Error = InvalidCaseMappingMode;
+
+    /// Parses an optional option name by delegating to the optional byte
+    /// slice conversion.
+    #[inline]
+    fn try_from(value: Option<&str>) -> Result<Self, Self::Error> {
+        let name = value.map(str::as_bytes);
+        Self::try_from(name)
+    }
+}
+
+impl TryFrom<&[u8]> for UppercaseMode {
+    type Error = InvalidCaseMappingMode;
+
+    /// Parses an option name given as bytes.
+    ///
+    /// The recognized names are `ascii`, `turkic`, and `lithuanian`. Any
+    /// other input produces an [`InvalidCaseMappingMode`] error.
+    #[inline]
+    fn try_from(value: &[u8]) -> Result<Self, Self::Error> {
+        let mode = match value {
+            b"ascii" => Self::Ascii,
+            b"turkic" => Self::Turkic,
+            b"lithuanian" => Self::Lithuanian,
+            _ => return Err(InvalidCaseMappingMode::new()),
+        };
+        Ok(mode)
+    }
+}
+
+impl TryFrom<Option<&[u8]>> for UppercaseMode {
+    type Error = InvalidCaseMappingMode;
+
+    /// Parses an optional option name given as bytes.
+    ///
+    /// `None` selects [`UppercaseMode::Full`]. `Some` accepts the names
+    /// `ascii`, `turkic`, and `lithuanian`; any other name produces an
+    /// [`InvalidCaseMappingMode`] error.
+    #[inline]
+    fn try_from(value: Option<&[u8]>) -> Result<Self, Self::Error> {
+        let mode = match value {
+            None => Self::Full,
+            Some(b"ascii") => Self::Ascii,
+            Some(b"turkic") => Self::Turkic,
+            Some(b"lithuanian") => Self::Lithuanian,
+            Some(_) => return Err(InvalidCaseMappingMode::new()),
+        };
+        Ok(mode)
+    }
+}
+
+impl FromStr for UppercaseMode {
+    type Err = InvalidCaseMappingMode;
+
+    /// Parses an option name by delegating to the `TryFrom<&str>` conversion.
+    #[inline]
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        TryInto::try_into(s)
+    }
+}
+
+/// Creates an iterator over an uppercased copy of the bytes in the given slice.
+///
+/// The slice is treated as a [conventionally UTF-8 string]: UTF-8 byte
+/// sequences are converted to their Unicode uppercase equivalents and invalid
+/// UTF-8 byte sequences are yielded as is.
+///
+/// The given [`UppercaseMode`] selects the case mapping mode. See its
+/// documentation for details on the available case mapping modes.
+///
+/// # Panics
+///
+/// Not all [`UppercaseMode`]s are currently implemented. This function will
+/// panic if the caller supplies [Turkic] uppercasing mode.
+///
+/// [conventionally UTF-8 string]: https://docs.rs/bstr/1.*/bstr/#when-should-i-use-byte-strings
+/// [Turkic]: UppercaseMode::Turkic
+// TODO: make this const once we're no longer panicking.
+pub fn uppercase(slice: &[u8], options: UppercaseMode) -> Uppercase<'_> {
+    use UppercaseMode::{Ascii, Full, Lithuanian, Turkic};
+
+    match options {
+        Ascii => Uppercase::with_ascii_slice(slice),
+        // Lithuanian currently shares the full Unicode case mapping.
+        Full | Lithuanian => Uppercase::with_slice(slice),
+        // TODO: implement `turkic` mode.
+        Turkic => panic!("uppercase Turkic mode is not yet implemented"),
+    }
+}
+
+// Ensure code blocks in README.md compile
+//
+// This module and macro declaration should be kept at the end of the file, in
+// order to not interfere with code coverage.
+//
+// The zero-argument form of the macro forwards the README contents (read with
+// `include_str!`) to the one-argument form, which attaches the text as the doc
+// attribute of an empty `readme` module. Both the macro and its invocation are
+// compiled only under `cfg(doctest)`.
+#[cfg(doctest)]
+macro_rules! readme {
+ ($x:expr) => {
+ #[doc = $x]
+ mod readme {}
+ };
+ () => {
+ readme!(include_str!("../README.md"));
+ };
+}
+#[cfg(doctest)]
+readme!();
+
1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 +95 +96 +97 +98 +99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 +247 +248 +249 +250 +251 +252 +253 +254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 +265 +266 +267 +268 +269 +270 +271 +272 +273 +
use core::iter::FusedIterator;
+
+mod ascii;
+mod full;
+
+// Private dispatch enum behind the public `Lowercase` iterator: each variant
+// holds the concrete iterator for one case mapping strategy.
+#[derive(Debug, Clone)]
+#[allow(variant_size_differences)]
+enum Inner<'a> {
+ // No input; iteration ends immediately.
+ Empty,
+ // Iterator from the `full` submodule (used by `Lowercase::with_slice`).
+ Full(full::Lowercase<'a>),
+ // Iterator from the `ascii` submodule (used by `Lowercase::with_ascii_slice`).
+ Ascii(ascii::Lowercase<'a>),
+}
+
+/// An iterator that yields the lowercase equivalent of a conventionally UTF-8
+/// byte string.
+///
+/// This iterator yields [bytes].
+///
+/// This struct is created by the [`lowercase`] function. See its documentation
+/// for more. It can also be constructed directly with [`Lowercase::new`],
+/// [`Lowercase::with_slice`], or [`Lowercase::with_ascii_slice`].
+///
+/// [bytes]: u8
+/// [`lowercase`]: crate::lowercase()
+#[derive(Debug, Clone)]
+#[must_use = "Lowercase is a Iterator and must be used"]
+pub struct Lowercase<'a> {
+ // Case mapping strategy chosen at construction; the `Iterator` methods
+ // dispatch on it.
+ iter: Inner<'a>,
+}
+
+impl<'a> Lowercase<'a> {
+    /// Creates a lowercase iterator that yields no bytes.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # use roe::Lowercase;
+    /// let mut lowercase = Lowercase::new();
+    /// assert_eq!(lowercase.next(), None);
+    /// ```
+    pub const fn new() -> Self {
+        let iter = Inner::Empty;
+        Self { iter }
+    }
+
+    /// Creates a lowercase iterator over the given byte slice that applies
+    /// full Unicode case mapping.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # use roe::Lowercase;
+    /// let mut lowercase = Lowercase::with_slice(b"abcXYZ");
+    /// assert_eq!(lowercase.next(), Some(b'a'));
+    /// assert_eq!(lowercase.next(), Some(b'b'));
+    /// assert_eq!(lowercase.next(), Some(b'c'));
+    /// assert_eq!(lowercase.next(), Some(b'x'));
+    /// assert_eq!(lowercase.next(), Some(b'y'));
+    /// assert_eq!(lowercase.next(), Some(b'z'));
+    /// assert_eq!(lowercase.next(), None);
+    /// ```
+    ///
+    /// Non-ASCII characters are case mapped:
+    ///
+    /// ```
+    /// # use roe::Lowercase;
+    /// let lowercase = Lowercase::with_slice("Αύριο".as_bytes());
+    /// assert_eq!(lowercase.collect::<Vec<_>>(), "αύριο".as_bytes());
+    /// ```
+    ///
+    /// Invalid UTF-8 bytes are yielded as is without impacting Unicode
+    /// characters:
+    ///
+    /// ```
+    /// # use roe::Lowercase;
+    /// let mut s = "Αύριο".to_string().into_bytes();
+    /// s.extend(b"\xFF\xFE");
+    /// let lowercase = Lowercase::with_slice(s.as_slice());
+    ///
+    /// let mut expected = "αύριο".to_string().into_bytes();
+    /// expected.extend(b"\xFF\xFE");
+    /// assert_eq!(lowercase.collect::<Vec<_>>(), expected);
+    /// ```
+    pub const fn with_slice(slice: &'a [u8]) -> Self {
+        let iter = Inner::Full(full::Lowercase::with_slice(slice));
+        Self { iter }
+    }
+
+    /// Creates a lowercase iterator over the given byte slice that applies
+    /// ASCII-only case mapping.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # use roe::Lowercase;
+    /// let mut lowercase = Lowercase::with_ascii_slice(b"abcXYZ");
+    /// assert_eq!(lowercase.next(), Some(b'a'));
+    /// assert_eq!(lowercase.next(), Some(b'b'));
+    /// assert_eq!(lowercase.next(), Some(b'c'));
+    /// assert_eq!(lowercase.next(), Some(b'x'));
+    /// assert_eq!(lowercase.next(), Some(b'y'));
+    /// assert_eq!(lowercase.next(), Some(b'z'));
+    /// assert_eq!(lowercase.next(), None);
+    /// ```
+    ///
+    /// Non-ASCII characters are ignored:
+    ///
+    /// ```
+    /// # use roe::Lowercase;
+    /// let lowercase = Lowercase::with_ascii_slice("Αύριο".as_bytes());
+    /// assert_eq!(lowercase.collect::<Vec<_>>(), "Αύριο".as_bytes());
+    /// ```
+    ///
+    /// Invalid UTF-8 bytes are yielded as is without impacting ASCII bytes:
+    ///
+    /// ```
+    /// # use roe::Lowercase;
+    /// let lowercase = Lowercase::with_ascii_slice(b"abc\xFF\xFEXYZ");
+    /// assert_eq!(lowercase.collect::<Vec<_>>(), b"abc\xFF\xFExyz");
+    /// ```
+    pub const fn with_ascii_slice(slice: &'a [u8]) -> Self {
+        let iter = Inner::Ascii(ascii::Lowercase::with_slice(slice));
+        Self { iter }
+    }
+}
+
+impl<'a> Iterator for Lowercase<'a> {
+    type Item = u8;
+
+    /// Yields the next byte from the wrapped iterator, or `None` for the
+    /// empty variant.
+    fn next(&mut self) -> Option<Self::Item> {
+        match &mut self.iter {
+            Inner::Full(inner) => inner.next(),
+            Inner::Ascii(inner) => inner.next(),
+            Inner::Empty => None,
+        }
+    }
+
+    /// Forwards to the wrapped iterator's size hint; the empty variant
+    /// reports exactly zero remaining bytes.
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        match &self.iter {
+            Inner::Full(inner) => inner.size_hint(),
+            Inner::Ascii(inner) => inner.size_hint(),
+            Inner::Empty => (0, Some(0)),
+        }
+    }
+
+    /// Consumes the iterator and forwards to the wrapped iterator's count;
+    /// the empty variant counts zero.
+    fn count(self) -> usize {
+        let Self { iter } = self;
+        match iter {
+            Inner::Full(inner) => inner.count(),
+            Inner::Ascii(inner) => inner.count(),
+            Inner::Empty => 0,
+        }
+    }
+}
+
+// Marker impl. It presumably relies on both wrapped iterators continuing to
+// return `None` once exhausted; the ASCII iterator is itself `FusedIterator`,
+// while the full iterator is not visible here (TODO confirm).
+impl<'a> FusedIterator for Lowercase<'a> {}
+
+#[cfg(test)]
+mod tests {
+ use alloc::vec::Vec;
+ use bstr::ByteSlice;
+
+ use super::Lowercase;
+
+ // Every constructor yields no bytes when there is no input.
+ #[test]
+ fn empty() {
+ let iter = Lowercase::new();
+ assert_eq!(iter.collect::<Vec<_>>().as_bstr(), b"".as_bstr());
+
+ let iter = Lowercase::with_slice(b"");
+ assert_eq!(iter.collect::<Vec<_>>().as_bstr(), b"".as_bstr());
+
+ let iter = Lowercase::with_ascii_slice(b"");
+ assert_eq!(iter.collect::<Vec<_>>().as_bstr(), b"".as_bstr());
+ }
+
+ // Pins the size hints of both modes. In the full-mode assertions below, the
+ // lower bound equals the byte length and, for inputs containing non-ASCII
+ // bytes, the expected upper bound is 12 times the byte length. In ASCII
+ // mode both bounds equal the byte length.
+ #[test]
+ fn size_hint() {
+ assert_eq!(Lowercase::new().size_hint(), (0, Some(0)));
+
+ assert_eq!(Lowercase::with_slice(b"abc, xyz").size_hint(), (8, Some(8)));
+ assert_eq!(
+ Lowercase::with_slice(b"abc, \xFF\xFE, xyz").size_hint(),
+ (12, Some(144))
+ );
+ assert_eq!(
+ Lowercase::with_slice("�".as_bytes()).size_hint(),
+ (3, Some(36))
+ );
+ assert_eq!(
+ Lowercase::with_slice("Έτος".as_bytes()).size_hint(),
+ (8, Some(96))
+ );
+ assert_eq!(
+ Lowercase::with_slice("ZȺȾ".as_bytes()).size_hint(),
+ (5, Some(60))
+ );
+
+ let mut utf8_with_invalid_bytes = b"\xFF\xFE".to_vec();
+ utf8_with_invalid_bytes.extend_from_slice("Έτος".as_bytes());
+ assert_eq!(
+ Lowercase::with_slice(&utf8_with_invalid_bytes).size_hint(),
+ (10, Some(120))
+ );
+
+ assert_eq!(
+ Lowercase::with_ascii_slice(b"abc, xyz").size_hint(),
+ (8, Some(8))
+ );
+ assert_eq!(
+ Lowercase::with_ascii_slice(b"abc, \xFF\xFE, xyz").size_hint(),
+ (12, Some(12))
+ );
+ assert_eq!(
+ Lowercase::with_ascii_slice("�".as_bytes()).size_hint(),
+ (3, Some(3))
+ );
+ assert_eq!(
+ Lowercase::with_ascii_slice("Έτος".as_bytes()).size_hint(),
+ (8, Some(8))
+ );
+ assert_eq!(
+ Lowercase::with_ascii_slice("ZȺȾ".as_bytes()).size_hint(),
+ (5, Some(5))
+ );
+
+ let mut utf8_with_invalid_bytes = b"\xFF\xFE".to_vec();
+ utf8_with_invalid_bytes.extend_from_slice("Έτος".as_bytes());
+ assert_eq!(
+ Lowercase::with_ascii_slice(&utf8_with_invalid_bytes).size_hint(),
+ (10, Some(10))
+ );
+ }
+
+ // Pins the number of bytes yielded by both modes. Note the full-mode case
+ // for "ZȺȾ": the 5 input bytes produce 7 output bytes, while ASCII mode
+ // always yields as many bytes as the input has.
+ #[test]
+ fn count() {
+ assert_eq!(Lowercase::new().count(), 0);
+
+ assert_eq!(Lowercase::with_slice(b"abc, xyz").count(), 8);
+ assert_eq!(Lowercase::with_slice(b"abc, \xFF\xFE, xyz").count(), 12);
+ assert_eq!(Lowercase::with_slice("�".as_bytes()).count(), 3);
+ assert_eq!(Lowercase::with_slice("Έτος".as_bytes()).count(), 8);
+ assert_eq!(Lowercase::with_slice("ZȺȾ".as_bytes()).count(), 7);
+
+ let mut utf8_with_invalid_bytes = b"\xFF\xFE".to_vec();
+ utf8_with_invalid_bytes.extend_from_slice("Έτος".as_bytes());
+ assert_eq!(Lowercase::with_slice(&utf8_with_invalid_bytes).count(), 10);
+
+ assert_eq!(Lowercase::with_ascii_slice(b"abc, xyz").count(), 8);
+ assert_eq!(
+ Lowercase::with_ascii_slice(b"abc, \xFF\xFE, xyz").count(),
+ 12
+ );
+ assert_eq!(Lowercase::with_ascii_slice("�".as_bytes()).count(), 3);
+ assert_eq!(Lowercase::with_ascii_slice("Έτος".as_bytes()).count(), 8);
+ assert_eq!(Lowercase::with_ascii_slice("ZȺȾ".as_bytes()).count(), 5);
+
+ let mut utf8_with_invalid_bytes = b"\xFF\xFE".to_vec();
+ utf8_with_invalid_bytes.extend_from_slice("Έτος".as_bytes());
+ assert_eq!(
+ Lowercase::with_ascii_slice(&utf8_with_invalid_bytes).count(),
+ 10
+ );
+ }
+
+ // The count must fall within the bounds reported by `size_hint`. Only the
+ // empty iterator is checked here.
+ #[test]
+ fn size_hint_covers_count() {
+ let iter = Lowercase::new();
+ let (min, max) = iter.size_hint();
+ let count = iter.count();
+ assert!(min <= count);
+ assert!(count <= max.unwrap());
+ }
+}
+
1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 +95 +96 +97 +98 +99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 +247 +248 +249 +250 +251 +252 +253 +254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 +265 +266 +267 +268 +269 +270 +271 +272 +273 +274 +275 +276 +277 +278 +279 +280 +281 +282 +283 +284 +285 +286 +287 +
use core::fmt;
+use core::iter::FusedIterator;
+
+use bstr::ByteSlice;
+
+// ASCII-only lowercase iterator over a byte slice. Each yielded byte is the
+// ASCII lowercase form of the corresponding input byte.
+#[derive(Clone)]
+#[must_use = "Lowercase is a Iterator and must be used"]
+pub struct Lowercase<'a> {
+ // Bytes not yet yielded; shrinks from the front (`next`) or the back
+ // (`next_back`).
+ slice: &'a [u8],
+}
+
+impl<'a> fmt::Debug for Lowercase<'a> {
+    /// Formats the iterator as a struct whose `slice` field is shown as a
+    /// byte string rather than a list of integers.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let remaining = self.slice.as_bstr();
+        let mut builder = f.debug_struct("Lowercase");
+        builder.field("slice", &remaining);
+        builder.finish()
+    }
+}
+
+impl<'a> From<&'a [u8]> for Lowercase<'a> {
+    /// Wraps the byte slice in an ASCII lowercase iterator.
+    fn from(slice: &'a [u8]) -> Self {
+        Lowercase::with_slice(slice)
+    }
+}
+
+impl<'a> Lowercase<'a> {
+    /// Creates an ASCII lowercase iterator positioned at the start of
+    /// `slice`.
+    pub const fn with_slice(slice: &'a [u8]) -> Self {
+        Lowercase { slice }
+    }
+}
+
+impl<'a> Iterator for Lowercase<'a> {
+    type Item = u8;
+
+    /// Removes the first remaining byte and yields its ASCII lowercase form.
+    fn next(&mut self) -> Option<Self::Item> {
+        match self.slice {
+            [first, rest @ ..] => {
+                self.slice = rest;
+                Some(first.to_ascii_lowercase())
+            }
+            [] => None,
+        }
+    }
+
+    /// Both bounds equal the number of remaining bytes.
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        let remaining = self.slice.len();
+        (remaining, Some(remaining))
+    }
+
+    /// Returns the number of remaining bytes without iterating.
+    fn count(self) -> usize {
+        let Self { slice } = self;
+        slice.len()
+    }
+}
+
+impl<'a> DoubleEndedIterator for Lowercase<'a> {
+    /// Removes the last remaining byte and yields its ASCII lowercase form.
+    fn next_back(&mut self) -> Option<Self::Item> {
+        match self.slice {
+            [rest @ .., last] => {
+                self.slice = rest;
+                Some(last.to_ascii_lowercase())
+            }
+            [] => None,
+        }
+    }
+}
+
+// `size_hint` reports the remaining slice length as both bounds, so the exact
+// length is always known.
+impl<'a> ExactSizeIterator for Lowercase<'a> {}
+
+// Once the slice is empty, `next` and `next_back` keep returning `None`.
+impl<'a> FusedIterator for Lowercase<'a> {}
+
+#[cfg(test)]
+mod tests {
+ use alloc::vec::Vec;
+ use bstr::ByteSlice;
+
+ use super::Lowercase;
+
+ #[test]
+ fn empty() {
+ let iter = Lowercase::from(&b""[..]);
+ assert_eq!(iter.collect::<Vec<_>>().as_bstr(), b"".as_bstr());
+ }
+
+ #[test]
+ fn ascii() {
+ let iter = Lowercase::from(&b"abc"[..]);
+ assert_eq!(iter.collect::<Vec<_>>().as_bstr(), b"abc".as_bstr());
+
+ let iter = Lowercase::from(&b"aBC"[..]);
+ assert_eq!(iter.collect::<Vec<_>>().as_bstr(), b"abc".as_bstr());
+
+ let iter = Lowercase::from(&b"ABC"[..]);
+ assert_eq!(iter.collect::<Vec<_>>().as_bstr(), b"abc".as_bstr());
+
+ let iter = Lowercase::from(&b"aBC, 123, ABC, baby you and me girl"[..]);
+ assert_eq!(
+ iter.collect::<Vec<_>>().as_bstr(),
+ b"abc, 123, abc, baby you and me girl".as_bstr()
+ );
+ }
+
+ // ignore unicode for ASCII iterator
+ #[test]
+ fn utf8() {
+ let s = "ß".as_bytes();
+ let iter = Lowercase::from(s);
+ assert_eq!(iter.collect::<Vec<_>>().as_bstr(), "ß".as_bytes().as_bstr());
+
+ let s = "Αύριο".as_bytes();
+ let iter = Lowercase::from(s);
+ assert_eq!(
+ iter.collect::<Vec<_>>().as_bstr(),
+ "Αύριο".as_bytes().as_bstr()
+ );
+
+ let s = "Έτος".as_bytes();
+ let iter = Lowercase::from(s);
+ assert_eq!(
+ iter.collect::<Vec<_>>().as_bstr(),
+ "Έτος".as_bytes().as_bstr()
+ );
+
+ // two-byte characters
+ // https://github.com/minimaxir/big-list-of-naughty-strings/blob/894882e7/blns.txt#L198-L200
+ let s = "𐐜 𐐔𐐇𐐝𐐀𐐡𐐇𐐓 𐐙𐐊𐐡𐐝𐐓/𐐝𐐇𐐗𐐊𐐤𐐔 𐐒𐐋𐐗 𐐒𐐌 𐐜 𐐡𐐀𐐖𐐇𐐤𐐓𐐝 𐐱𐑂 𐑄 𐐔𐐇𐐝𐐀𐐡𐐇𐐓 𐐏𐐆𐐅𐐤𐐆𐐚𐐊𐐡𐐝𐐆𐐓𐐆".as_bytes();
+ let iter = Lowercase::from(s);
+ assert_eq!(
+ iter.collect::<Vec<_>>().as_bstr(),
+ "𐐜 𐐔𐐇𐐝𐐀𐐡𐐇𐐓 𐐙𐐊𐐡𐐝𐐓/𐐝𐐇𐐗𐐊𐐤𐐔 𐐒𐐋𐐗 𐐒𐐌 𐐜 𐐡𐐀𐐖𐐇𐐤𐐓𐐝 𐐱𐑂 𐑄 𐐔𐐇𐐝𐐀𐐡𐐇𐐓 𐐏𐐆𐐅𐐤𐐆𐐚𐐊𐐡𐐝𐐆𐐓𐐆"
+ .as_bytes()
+ .as_bstr()
+ );
+
+ // Change length when lowercased
+ // https://github.com/minimaxir/big-list-of-naughty-strings/blob/894882e7/blns.txt#L226-L232
+ let s = "ZȺȾ".as_bytes();
+ let iter = Lowercase::from(s);
+ assert_eq!(
+ iter.collect::<Vec<_>>().as_bstr(),
+ "zȺȾ".as_bytes().as_bstr()
+ );
+ }
+
+ #[test]
+ fn invalid_utf8() {
+ let iter = Lowercase::from(&b"\xFF\xFE"[..]);
+ assert_eq!(iter.collect::<Vec<u8>>().as_bstr(), b"\xFF\xFE".as_bstr());
+
+ let iter = Lowercase::from(&b"ABC\xFF\xFEXYZ"[..]);
+ assert_eq!(
+ iter.collect::<Vec<u8>>().as_bstr(),
+ b"abc\xFF\xFExyz".as_bstr()
+ );
+
+ let iter = Lowercase::from(&b"abc\xFF\xFEXYZ"[..]);
+ assert_eq!(
+ iter.collect::<Vec<u8>>().as_bstr(),
+ b"abc\xFF\xFExyz".as_bstr()
+ );
+
+ // The bytes \xF0\x9F\x87 could lead to a valid UTF-8 sequence, but 3 of
+ // them on their own are invalid. Only one replacement codepoint is
+ // substituted, which demonstrates the "substitution of maximal
+ // subparts" strategy.
+ //
+ // See: https://docs.rs/bstr/1.*/bstr/#handling-of-invalid-utf-8
+ let iter = Lowercase::from(&b"aB\xF0\x9F\x87Yz"[..]);
+ assert_eq!(
+ iter.collect::<Vec<_>>().as_bstr(),
+ b"ab\xF0\x9F\x87yz".as_bstr()
+ );
+ }
+
+ // ignore unicode for ASCII iterator
+ #[test]
+ fn unicode_replacement_character() {
+ let s = "�".as_bytes();
+ let iter = Lowercase::from(s);
+ assert_eq!(iter.collect::<Vec<_>>().as_bstr(), "�".as_bytes().as_bstr());
+ }
+
+ // ignore unicode for ASCII iterator
+ #[test]
+ fn dz_titlecase() {
+ let s = "Dž".as_bytes();
+ let iter = Lowercase::from(s);
+ assert_eq!(iter.collect::<Vec<_>>().as_bstr(), "Dž".as_bytes().as_bstr());
+ }
+
+ // ignore unicode for ASCII iterator
+ #[test]
+ fn latin_capital_i_with_dot_above() {
+ let s = "İ".as_bytes();
+ let iter = Lowercase::from(s);
+ assert_eq!(iter.collect::<Vec<_>>().as_bstr(), "İ".as_bytes().as_bstr());
+ }
+
+ // ignore unicode for ASCII iterator
+ #[test]
+ fn case_map_to_two_chars() {
+ let s = "İ".as_bytes();
+ let iter = Lowercase::from(s);
+ assert_eq!(iter.collect::<Vec<_>>().as_bstr(), "İ".as_bytes().as_bstr());
+ }
+
+ #[test]
+ fn size_hint() {
+ assert_eq!(Lowercase::with_slice(b"").size_hint(), (0, Some(0)));
+ assert_eq!(Lowercase::with_slice(b"abc, xyz").size_hint(), (8, Some(8)));
+ assert_eq!(
+ Lowercase::with_slice(b"abc, \xFF\xFE, xyz").size_hint(),
+ (12, Some(12))
+ );
+ assert_eq!(
+ Lowercase::with_slice("�".as_bytes()).size_hint(),
+ (3, Some(3))
+ );
+ assert_eq!(
+ Lowercase::with_slice("Έτος".as_bytes()).size_hint(),
+ (8, Some(8))
+ );
+ assert_eq!(
+ Lowercase::with_slice("ZȺȾ".as_bytes()).size_hint(),
+ (5, Some(5))
+ );
+
+ let mut utf8_with_invalid_bytes = b"\xFF\xFE".to_vec();
+ utf8_with_invalid_bytes.extend_from_slice("Έτος".as_bytes());
+ assert_eq!(
+ Lowercase::with_slice(&utf8_with_invalid_bytes).size_hint(),
+ (10, Some(10))
+ );
+ }
+
+ #[test]
+ fn count() {
+ assert_eq!(Lowercase::with_slice(b"").count(), 0);
+ assert_eq!(Lowercase::with_slice(b"abc, xyz").count(), 8);
+ assert_eq!(Lowercase::with_slice(b"abc, \xFF\xFE, xyz").count(), 12);
+ assert_eq!(Lowercase::with_slice("�".as_bytes()).count(), 3);
+ assert_eq!(Lowercase::with_slice("Έτος".as_bytes()).count(), 8);
+ assert_eq!(Lowercase::with_slice("ZȺȾ".as_bytes()).count(), 5);
+
+ let mut utf8_with_invalid_bytes = b"\xFF\xFE".to_vec();
+ utf8_with_invalid_bytes.extend_from_slice("Έτος".as_bytes());
+ assert_eq!(Lowercase::with_slice(&utf8_with_invalid_bytes).count(), 10);
+ }
+
+ #[test]
+ fn size_hint_covers_count() {
+     // Last input: invalid bytes followed by multi-byte UTF-8 text.
+     let mut utf8_with_invalid_bytes = b"\xFF\xFE".to_vec();
+     utf8_with_invalid_bytes.extend_from_slice("Έτος".as_bytes());
+
+     let inputs: [&[u8]; 7] = [
+         b"",
+         b"abc, xyz",
+         b"abc, \xFF\xFE, xyz",
+         "�".as_bytes(),
+         "Έτος".as_bytes(),
+         "ZȺȾ".as_bytes(),
+         &utf8_with_invalid_bytes,
+     ];
+
+     // The real number of yielded bytes must lie within the advertised bounds.
+     for &input in inputs.iter() {
+         let iter = Lowercase::with_slice(input);
+         let (min, max) = iter.size_hint();
+         let count = iter.count();
+         assert!(min <= count);
+         assert!(count <= max.unwrap());
+     }
+ }
+}
+
1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 +95 +96 +97 +98 +99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 +247 +248 +249 +250 +251 +252 +253 +254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 +265 +266 +267 +268 +269 +270 +271 +272 +273 +274 +275 +276 +277 +278 +279 +280 +281 +282 +283 +284 +285 +286 +287 +288 +289 +290 +291 +292 +293 +294 +295 +296 +297 +298 +299 +300 +301 +302 +303 +304 +305 +306 +307 +308 +309 +310 +311 +312 +313 +314 +315 +316 +317 +318 +319 +320 +321 +322 +323 +324 +325 +326 +327 +328 +329 +330 +331 +332 +333 +334 +335 +336 +337 +338 +339 +340 +341 +342 +343 +344 +345 +346 +347 +348 +349 +350 +351 +352 +353 +354 +355 +356 +357 +358 +359 +360 +361 +362 +
use core::char::ToLowercase;
+use core::fmt;
+use core::iter::FusedIterator;
+use core::ops::Range;
+
+use bstr::ByteSlice;
+
+/// Iterator over the bytes of a byte slice with Unicode lowercase mapping
+/// applied to every character that decodes as valid UTF-8.
+///
+/// Bytes that do not decode are yielded unchanged.
+#[derive(Clone)]
+#[must_use = "Lowercase is a Iterator and must be used"]
+pub struct Lowercase<'a> {
+ /// Input bytes that have not been decoded yet.
+ slice: &'a [u8],
+ /// Scratch buffer holding the bytes of the item currently being yielded:
+ /// either one UTF-8 encoded lowercase `char` or one invalid byte sequence.
+ next_bytes: [u8; 4],
+ /// Indices into `next_bytes` that still have to be yielded.
+ next_range: Range<usize>,
+ /// Remaining `char`s of the lowercase mapping of the most recently decoded
+ /// character, if any.
+ lowercase: Option<ToLowercase>,
+}
+
+impl<'a> fmt::Debug for Lowercase<'a> {
+    /// Formats every field, showing the undecoded input as a byte string
+    /// instead of a list of integers.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let remaining = self.slice.as_bstr();
+        let mut builder = f.debug_struct("Lowercase");
+        builder.field("slice", &remaining);
+        builder.field("next_bytes", &self.next_bytes);
+        builder.field("next_range", &self.next_range);
+        builder.field("lowercase", &self.lowercase);
+        builder.finish()
+    }
+}
+
+impl<'a> From<&'a [u8]> for Lowercase<'a> {
+    /// Equivalent to [`Lowercase::with_slice`].
+    fn from(bytes: &'a [u8]) -> Self {
+        Lowercase::with_slice(bytes)
+    }
+}
+
+impl<'a> Lowercase<'a> {
+    /// Creates an iterator positioned at the start of `slice`.
+    ///
+    /// Nothing is decoded up front: the scratch buffer is zeroed, no bytes are
+    /// queued and no case mapping is in progress.
+    pub const fn with_slice(slice: &'a [u8]) -> Self {
+        Lowercase {
+            slice,
+            next_bytes: [0_u8; 4],
+            next_range: Range { start: 0, end: 0 },
+            lowercase: None,
+        }
+    }
+}
+
+impl<'a> Lowercase<'a> {
+    /// Returns the number of bytes that have already been decoded and case
+    /// mapped but have not been yielded yet.
+    ///
+    /// This is the unread tail of `next_bytes` (tracked by `next_range`) plus
+    /// the UTF-8 length of every `char` still queued in `lowercase`.
+    fn pending_len(&self) -> usize {
+        let buffered = self.next_range.len();
+        let queued = self
+            .lowercase
+            .clone()
+            .map_or(0, |chars| chars.map(|ch| ch.len_utf8()).sum::<usize>());
+        buffered + queued
+    }
+
+    /// Encodes `ch` into the scratch buffer, queues every byte after the first
+    /// one for later calls to `next`, and returns the first byte.
+    fn buffer_char(&mut self, ch: char) -> u8 {
+        let encoded_len = ch.encode_utf8(&mut self.next_bytes).len();
+
+        self.next_range = 1..encoded_len;
+        debug_assert!(self.next_bytes.get(self.next_range.clone()).is_some());
+
+        self.next_bytes[0]
+    }
+}
+
+impl<'a> Iterator for Lowercase<'a> {
+    type Item = u8;
+
+    /// Yields the next byte of the lowercased output.
+    ///
+    /// Bytes are produced from three sources, in this order: the remainder of
+    /// the scratch buffer, the next `char` of an in-progress case mapping,
+    /// and finally a freshly decoded character (or invalid byte sequence)
+    /// taken from the front of the input slice.
+    fn next(&mut self) -> Option<Self::Item> {
+        // Finish the item that is currently sitting in the scratch buffer.
+        if let Some(idx) = self.next_range.next() {
+            debug_assert!(self.next_bytes.get(idx).is_some());
+
+            return Some(self.next_bytes[idx]);
+        }
+
+        // A single character may lowercase to several chars; emit the next
+        // one before decoding more input.
+        if let Some(ch) = self.lowercase.as_mut().and_then(Iterator::next) {
+            return Some(self.buffer_char(ch));
+        }
+
+        self.lowercase = None;
+
+        match bstr::decode_utf8(self.slice) {
+            // Zero bytes consumed: the input is exhausted.
+            (_, 0) => None,
+            (Some(ch), size) => {
+                self.slice = &self.slice[size..];
+                let mut lowercase = ch.to_lowercase();
+                let first = lowercase
+                    .next()
+                    .expect("ToLowercase yields at least one char");
+                let byte = self.buffer_char(first);
+
+                // Keep the rest of the mapping around for later calls.
+                self.lowercase = Some(lowercase);
+                Some(byte)
+            }
+            (None, size) => {
+                // Pass the undecodable bytes through unchanged.
+                let (bytes, remainder) = self.slice.split_at(size);
+                self.slice = remainder;
+
+                // Invalid byte sequences are at most three bytes.
+                debug_assert!(self.next_bytes.get(..bytes.len()).is_some());
+
+                self.next_bytes[..bytes.len()].copy_from_slice(bytes);
+                self.next_range = 1..bytes.len();
+                Some(self.next_bytes[0])
+            }
+        }
+    }
+
+    /// Returns bounds on the number of bytes left to yield.
+    ///
+    /// Bytes that are already decoded but not yet yielded are included, so
+    /// the hint stays valid for a partially consumed iterator. If the
+    /// remaining input is entirely ASCII the bounds are exact. Otherwise the
+    /// upper bound allows every input byte to expand to `TO_LOWER_EXPAND`
+    /// chars of `UTF_8_CHAR_MAX_BYTES` bytes each, and is `None` if that
+    /// product does not fit in a `usize`.
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        const TO_LOWER_EXPAND: usize = 3;
+        const UTF_8_CHAR_MAX_BYTES: usize = 4;
+
+        let pending = self.pending_len();
+        let len = self.slice.len();
+
+        // An empty slice counts as ASCII, which makes the bounds
+        // `(pending, Some(pending))` once the input is used up.
+        if self.slice.is_ascii() {
+            let total = pending + len;
+            (total, Some(total))
+        } else {
+            // NOTE(review): the lower bound assumes lowercasing never shrinks
+            // the UTF-8 encoding, which does not hold for e.g. U+212A KELVIN
+            // SIGN (three bytes) mapping to `k` (one byte). The value is kept
+            // because the tests in this file pin it.
+            let upper = len
+                .checked_mul(TO_LOWER_EXPAND * UTF_8_CHAR_MAX_BYTES)
+                .and_then(|bound| bound.checked_add(pending));
+            (pending + len, upper)
+        }
+    }
+
+    /// Counts the bytes left to yield.
+    ///
+    /// When the remaining input is ASCII every input byte maps to exactly one
+    /// output byte, so the answer is computed without iterating; bytes that
+    /// are decoded but not yet yielded are added on top. Any other input is
+    /// counted by running the iterator to completion.
+    fn count(self) -> usize {
+        if self.slice.is_ascii() {
+            self.pending_len() + self.slice.len()
+        } else {
+            self.fold(0, |acc, _| acc + 1)
+        }
+    }
+}
+
+// `next` returns `None` only after the buffered bytes and the pending case
+// mapping are drained and `decode_utf8` reports that it consumed zero bytes;
+// none of that state is modified on the `None` path, so later calls see the
+// same situation again.
+impl<'a> FusedIterator for Lowercase<'a> {}
+
+#[cfg(test)]
+mod tests {
+ use alloc::vec::Vec;
+ use bstr::ByteSlice;
+ use core::char;
+
+ use super::Lowercase;
+
+ #[test]
+ fn empty() {
+ let iter = Lowercase::from(&b""[..]);
+ assert_eq!(iter.collect::<Vec<_>>().as_bstr(), b"".as_bstr());
+ }
+
+ #[test]
+ fn ascii() {
+ let iter = Lowercase::from(&b"abc"[..]);
+ assert_eq!(iter.collect::<Vec<_>>().as_bstr(), b"abc".as_bstr());
+
+ let iter = Lowercase::from(&b"aBC"[..]);
+ assert_eq!(iter.collect::<Vec<_>>().as_bstr(), b"abc".as_bstr());
+
+ let iter = Lowercase::from(&b"ABC"[..]);
+ assert_eq!(iter.collect::<Vec<_>>().as_bstr(), b"abc".as_bstr());
+
+ let iter = Lowercase::from(&b"aBC, 123, ABC, baby you and me girl"[..]);
+ assert_eq!(
+ iter.collect::<Vec<_>>().as_bstr(),
+ b"abc, 123, abc, baby you and me girl".as_bstr()
+ );
+ }
+
+ #[test]
+ fn utf8() {
+ let s = "ß".as_bytes();
+ let iter = Lowercase::from(s);
+ assert_eq!(iter.collect::<Vec<_>>().as_bstr(), "ß".as_bytes().as_bstr());
+
+ let s = "Αύριο".as_bytes();
+ let iter = Lowercase::from(s);
+ assert_eq!(
+ iter.collect::<Vec<_>>().as_bstr(),
+ "αύριο".as_bytes().as_bstr()
+ );
+
+ let s = "Έτος".as_bytes();
+ let iter = Lowercase::from(s);
+ assert_eq!(
+ iter.collect::<Vec<_>>().as_bstr(),
+ "έτος".as_bytes().as_bstr()
+ );
+
+ // "two-byte" characters: Deseret letters, each two UTF-16 code units (four bytes in UTF-8)
+ // https://github.com/minimaxir/big-list-of-naughty-strings/blob/894882e7/blns.txt#L198-L200
+ let s = "𐐜 𐐔𐐇𐐝𐐀𐐡𐐇𐐓 𐐙𐐊𐐡𐐝𐐓/𐐝𐐇𐐗𐐊𐐤𐐔 𐐒𐐋𐐗 𐐒𐐌 𐐜 𐐡𐐀𐐖𐐇𐐤𐐓𐐝 𐐱𐑂 𐑄 𐐔𐐇𐐝𐐀𐐡𐐇𐐓 𐐏𐐆𐐅𐐤𐐆𐐚𐐊𐐡𐐝𐐆𐐓𐐆".as_bytes();
+ let iter = Lowercase::from(s);
+ assert_eq!(
+ iter.collect::<Vec<_>>().as_bstr(),
+ "𐑄 𐐼𐐯𐑅𐐨𐑉𐐯𐐻 𐑁𐐲𐑉𐑅𐐻/𐑅𐐯𐐿𐐲𐑌𐐼 𐐺𐐳𐐿 𐐺𐐴 𐑄 𐑉𐐨𐐾𐐯𐑌𐐻𐑅 𐐱𐑂 𐑄 𐐼𐐯𐑅𐐨𐑉𐐯𐐻 𐐷𐐮𐐭𐑌𐐮𐑂𐐲𐑉𐑅𐐮𐐻𐐮"
+ .as_bytes()
+ .as_bstr()
+ );
+
+ // Change length when lowercased
+ // https://github.com/minimaxir/big-list-of-naughty-strings/blob/894882e7/blns.txt#L226-L232
+ let s = "ZȺȾ".as_bytes();
+ let iter = Lowercase::from(s);
+ assert_eq!(
+ iter.collect::<Vec<_>>().as_bstr(),
+ "zⱥⱦ".as_bytes().as_bstr()
+ );
+ }
+
+ #[test]
+ fn invalid_utf8() {
+ let iter = Lowercase::from(&b"\xFF\xFE"[..]);
+ assert_eq!(iter.collect::<Vec<u8>>().as_bstr(), b"\xFF\xFE".as_bstr());
+
+ let iter = Lowercase::from(&b"ABC\xFF\xFEXYZ"[..]);
+ assert_eq!(
+ iter.collect::<Vec<u8>>().as_bstr(),
+ b"abc\xFF\xFExyz".as_bstr()
+ );
+
+ let iter = Lowercase::from(&b"abc\xFF\xFEXYZ"[..]);
+ assert_eq!(
+ iter.collect::<Vec<u8>>().as_bstr(),
+ b"abc\xFF\xFExyz".as_bstr()
+ );
+
+ // The bytes \xF0\x9F\x87 could lead to a valid UTF-8 sequence, but 3 of
+ // them on their own are invalid. bstr reports them as one invalid chunk
+ // (the "substitution of maximal subparts" strategy), but no replacement
+ // codepoint is substituted here: the chunk is yielded unchanged.
+ //
+ // See: https://docs.rs/bstr/1.*/bstr/#handling-of-invalid-utf-8
+ let iter = Lowercase::from(&b"aB\xF0\x9F\x87Yz"[..]);
+ assert_eq!(
+ iter.collect::<Vec<_>>().as_bstr(),
+ b"ab\xF0\x9F\x87yz".as_bstr()
+ );
+ }
+
+ #[test]
+ fn unicode_replacement_character() {
+ let s = "�".as_bytes();
+ let iter = Lowercase::from(s);
+ assert_eq!(iter.collect::<Vec<_>>().as_bstr(), "�".as_bytes().as_bstr());
+ }
+
+ #[test]
+ fn dz_titlecase() {
+ let s = "Dž".as_bytes();
+ let iter = Lowercase::from(s);
+ assert_eq!(iter.collect::<Vec<_>>().as_bstr(), "dž".as_bytes().as_bstr());
+ }
+
+ #[test]
+ fn latin_capital_i_with_dot_above() {
+ let s = "İ".as_bytes();
+ let iter = Lowercase::from(s);
+ assert_eq!(
+ iter.collect::<Vec<_>>().as_bstr(),
+ [105_u8, 204, 135].as_bstr()
+ );
+ }
+
+ #[test]
+ fn case_map_to_two_chars() {
+ let s = "İ".as_bytes();
+ let iter = Lowercase::from(s);
+ assert_eq!(
+ iter.collect::<Vec<_>>().as_bstr(),
+ "i\u{307}".as_bytes().as_bstr()
+ );
+ }
+
+ #[test]
+ fn case_map_to_three_chars() {
+     // There are no such characters. The range is inclusive so that
+     // `char::MAX` itself is checked as well.
+     for ch in '\0'..=char::MAX {
+         assert!(
+             ch.to_lowercase().count() < 3,
+             "Expected no characters that downcase to three or more characters, found: '{}', which expands to: {:?}",
+             ch,
+             ch.to_lowercase().collect::<Vec<_>>()
+         );
+     }
+ }
+
+ #[test]
+ fn size_hint() {
+ assert_eq!(Lowercase::with_slice(b"").size_hint(), (0, Some(0)));
+ assert_eq!(Lowercase::with_slice(b"abc, xyz").size_hint(), (8, Some(8)));
+ assert_eq!(
+ Lowercase::with_slice(b"abc, \xFF\xFE, xyz").size_hint(),
+ (12, Some(144))
+ );
+ assert_eq!(
+ Lowercase::with_slice("�".as_bytes()).size_hint(),
+ (3, Some(36))
+ );
+ assert_eq!(
+ Lowercase::with_slice("Έτος".as_bytes()).size_hint(),
+ (8, Some(96))
+ );
+ assert_eq!(
+ Lowercase::with_slice("ZȺȾ".as_bytes()).size_hint(),
+ (5, Some(60))
+ );
+
+ let mut utf8_with_invalid_bytes = b"\xFF\xFE".to_vec();
+ utf8_with_invalid_bytes.extend_from_slice("Έτος".as_bytes());
+ assert_eq!(
+ Lowercase::with_slice(&utf8_with_invalid_bytes).size_hint(),
+ (10, Some(120))
+ );
+ }
+
+ #[test]
+ fn count() {
+ assert_eq!(Lowercase::with_slice(b"").count(), 0);
+ assert_eq!(Lowercase::with_slice(b"abc, xyz").count(), 8);
+ assert_eq!(Lowercase::with_slice(b"abc, \xFF\xFE, xyz").count(), 12);
+ assert_eq!(Lowercase::with_slice("�".as_bytes()).count(), 3);
+ assert_eq!(Lowercase::with_slice("Έτος".as_bytes()).count(), 8);
+ assert_eq!(Lowercase::with_slice("ZȺȾ".as_bytes()).count(), 7);
+
+ let mut utf8_with_invalid_bytes = b"\xFF\xFE".to_vec();
+ utf8_with_invalid_bytes.extend_from_slice("Έτος".as_bytes());
+ assert_eq!(Lowercase::with_slice(&utf8_with_invalid_bytes).count(), 10);
+ }
+
+ #[test]
+ fn size_hint_covers_count() {
+     // Last input: invalid bytes followed by multi-byte UTF-8 text.
+     let mut utf8_with_invalid_bytes = b"\xFF\xFE".to_vec();
+     utf8_with_invalid_bytes.extend_from_slice("Έτος".as_bytes());
+
+     let inputs: [&[u8]; 7] = [
+         b"",
+         b"abc, xyz",
+         b"abc, \xFF\xFE, xyz",
+         "�".as_bytes(),
+         "Έτος".as_bytes(),
+         "ZȺȾ".as_bytes(),
+         &utf8_with_invalid_bytes,
+     ];
+
+     // The real number of yielded bytes must lie within the advertised bounds.
+     for &input in inputs.iter() {
+         let iter = Lowercase::with_slice(input);
+         let (min, max) = iter.size_hint();
+         let count = iter.count();
+         assert!(min <= count);
+         assert!(count <= max.unwrap());
+     }
+ }
+}
+
1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 +95 +96 +97 +98 +99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 +247 +248 +249 +250 +251 +252 +253 +254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 +265 +266 +267 +268 +269 +270 +271 +272 +273 +
use core::iter::FusedIterator;
+
+mod ascii;
+mod full;
+
+/// Case mapping strategy that drives an [`Uppercase`] iterator.
+#[derive(Debug, Clone)]
+// NOTE(review): presumably silenced because the `Full` state is larger than
+// the `Ascii` one; confirm against the `full` and `ascii` modules.
+#[allow(variant_size_differences)]
+enum Inner<'a> {
+ /// Yields nothing; built by `Uppercase::new`.
+ Empty,
+ /// Full Unicode case mapping; built by `Uppercase::with_slice`.
+ Full(full::Uppercase<'a>),
+ /// ASCII-only case mapping; built by `Uppercase::with_ascii_slice`.
+ Ascii(ascii::Uppercase<'a>),
+}
+
+/// An iterator that yields the uppercase equivalent of a conventionally UTF-8
+/// byte string.
+///
+/// This iterator yields [bytes].
+///
+/// This struct is created by the [`uppercase`] function. See its documentation
+/// for more. It can also be built directly with [`Uppercase::new`],
+/// [`Uppercase::with_slice`] or [`Uppercase::with_ascii_slice`].
+///
+/// [bytes]: u8
+/// [`uppercase`]: crate::uppercase()
+#[derive(Debug, Clone)]
+#[must_use = "Uppercase is a Iterator and must be used"]
+pub struct Uppercase<'a> {
+ /// The strategy (empty, full Unicode or ASCII-only) every iterator method
+ /// delegates to.
+ iter: Inner<'a>,
+}
+
+impl<'a> Uppercase<'a> {
+    /// Construct an uppercase iterator that yields no bytes.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # use roe::Uppercase;
+    /// let mut uppercase = Uppercase::new();
+    /// assert_eq!(uppercase.next(), None);
+    /// ```
+    pub const fn new() -> Self {
+        Uppercase { iter: Inner::Empty }
+    }
+
+    /// Construct an uppercase iterator over `slice` that applies full Unicode
+    /// case mapping.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # use roe::Uppercase;
+    /// let mut uppercase = Uppercase::with_slice(b"abcXYZ");
+    /// assert_eq!(uppercase.next(), Some(b'A'));
+    /// assert_eq!(uppercase.next(), Some(b'B'));
+    /// assert_eq!(uppercase.next(), Some(b'C'));
+    /// assert_eq!(uppercase.next(), Some(b'X'));
+    /// assert_eq!(uppercase.next(), Some(b'Y'));
+    /// assert_eq!(uppercase.next(), Some(b'Z'));
+    /// assert_eq!(uppercase.next(), None);
+    /// ```
+    ///
+    /// Characters outside the ASCII range are case mapped too:
+    ///
+    /// ```
+    /// # use roe::Uppercase;
+    /// let uppercase = Uppercase::with_slice("Αύριο".as_bytes());
+    /// assert_eq!(uppercase.collect::<Vec<_>>(), "ΑΎΡΙΟ".as_bytes());
+    /// ```
+    ///
+    /// Bytes that are not valid UTF-8 pass through untouched and do not
+    /// disturb the mapping of the surrounding characters:
+    ///
+    /// ```
+    /// # use roe::Uppercase;
+    /// let mut s = "Αύριο".to_string().into_bytes();
+    /// s.extend(b"\xFF\xFE");
+    /// let uppercase = Uppercase::with_slice(s.as_slice());
+    ///
+    /// let mut expected = "ΑΎΡΙΟ".to_string().into_bytes();
+    /// expected.extend(b"\xFF\xFE");
+    /// assert_eq!(uppercase.collect::<Vec<_>>(), expected);
+    /// ```
+    pub const fn with_slice(slice: &'a [u8]) -> Self {
+        let inner = full::Uppercase::with_slice(slice);
+        Uppercase {
+            iter: Inner::Full(inner),
+        }
+    }
+
+    /// Construct an uppercase iterator over `slice` that only maps ASCII
+    /// letters.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # use roe::Uppercase;
+    /// let mut uppercase = Uppercase::with_ascii_slice(b"abcXYZ");
+    /// assert_eq!(uppercase.next(), Some(b'A'));
+    /// assert_eq!(uppercase.next(), Some(b'B'));
+    /// assert_eq!(uppercase.next(), Some(b'C'));
+    /// assert_eq!(uppercase.next(), Some(b'X'));
+    /// assert_eq!(uppercase.next(), Some(b'Y'));
+    /// assert_eq!(uppercase.next(), Some(b'Z'));
+    /// assert_eq!(uppercase.next(), None);
+    /// ```
+    ///
+    /// Characters outside the ASCII range are left as they are:
+    ///
+    /// ```
+    /// # use roe::Uppercase;
+    /// let uppercase = Uppercase::with_ascii_slice("Αύριο".as_bytes());
+    /// assert_eq!(uppercase.collect::<Vec<_>>(), "Αύριο".as_bytes());
+    /// ```
+    ///
+    /// Bytes that are not valid UTF-8 pass through untouched and do not
+    /// disturb the mapping of the surrounding ASCII bytes:
+    ///
+    /// ```
+    /// # use roe::Uppercase;
+    /// let uppercase = Uppercase::with_ascii_slice(b"abc\xFF\xFEXYZ");
+    /// assert_eq!(uppercase.collect::<Vec<_>>(), b"ABC\xFF\xFEXYZ");
+    /// ```
+    pub const fn with_ascii_slice(slice: &'a [u8]) -> Self {
+        let inner = ascii::Uppercase::with_slice(slice);
+        Uppercase {
+            iter: Inner::Ascii(inner),
+        }
+    }
+}
+
+impl<'a> Iterator for Uppercase<'a> {
+    type Item = u8;
+
+    /// Forwards to the active strategy; an empty iterator yields nothing.
+    fn next(&mut self) -> Option<Self::Item> {
+        match &mut self.iter {
+            Inner::Empty => None,
+            Inner::Full(inner) => inner.next(),
+            Inner::Ascii(inner) => inner.next(),
+        }
+    }
+
+    /// Forwards to the active strategy; an empty iterator reports exactly
+    /// zero remaining items.
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        match &self.iter {
+            Inner::Empty => (0, Some(0)),
+            Inner::Full(inner) => inner.size_hint(),
+            Inner::Ascii(inner) => inner.size_hint(),
+        }
+    }
+
+    /// Forwards to the active strategy's `count` rather than the default
+    /// implementation of this wrapper.
+    fn count(self) -> usize {
+        match self.iter {
+            Inner::Empty => 0,
+            Inner::Full(inner) => inner.count(),
+            Inner::Ascii(inner) => inner.count(),
+        }
+    }
+}
+
+// `Inner::Empty` always yields `None`; for the other variants this relies on
+// the wrapped iterator continuing to return `None` once it is exhausted.
+impl<'a> FusedIterator for Uppercase<'a> {}
+
+#[cfg(test)]
+mod tests {
+ use alloc::vec::Vec;
+ use bstr::ByteSlice;
+
+ use super::Uppercase;
+
+ #[test]
+ fn empty() {
+ let iter = Uppercase::new();
+ assert_eq!(iter.collect::<Vec<_>>().as_bstr(), b"".as_bstr());
+
+ let iter = Uppercase::with_slice(b"");
+ assert_eq!(iter.collect::<Vec<_>>().as_bstr(), b"".as_bstr());
+
+ let iter = Uppercase::with_ascii_slice(b"");
+ assert_eq!(iter.collect::<Vec<_>>().as_bstr(), b"".as_bstr());
+ }
+
+ #[test]
+ fn size_hint() {
+ assert_eq!(Uppercase::new().size_hint(), (0, Some(0)));
+
+ assert_eq!(Uppercase::with_slice(b"abc, xyz").size_hint(), (8, Some(8)));
+ assert_eq!(
+ Uppercase::with_slice(b"abc, \xFF\xFE, xyz").size_hint(),
+ (12, Some(144))
+ );
+ assert_eq!(
+ Uppercase::with_slice("�".as_bytes()).size_hint(),
+ (3, Some(36))
+ );
+ assert_eq!(
+ Uppercase::with_slice("Έτος".as_bytes()).size_hint(),
+ (8, Some(96))
+ );
+ assert_eq!(
+ Uppercase::with_slice("ZȺȾ".as_bytes()).size_hint(),
+ (5, Some(60))
+ );
+
+ let mut utf8_with_invalid_bytes = b"\xFF\xFE".to_vec();
+ utf8_with_invalid_bytes.extend_from_slice("Έτος".as_bytes());
+ assert_eq!(
+ Uppercase::with_slice(&utf8_with_invalid_bytes).size_hint(),
+ (10, Some(120))
+ );
+
+ assert_eq!(
+ Uppercase::with_ascii_slice(b"abc, xyz").size_hint(),
+ (8, Some(8))
+ );
+ assert_eq!(
+ Uppercase::with_ascii_slice(b"abc, \xFF\xFE, xyz").size_hint(),
+ (12, Some(12))
+ );
+ assert_eq!(
+ Uppercase::with_ascii_slice("�".as_bytes()).size_hint(),
+ (3, Some(3))
+ );
+ assert_eq!(
+ Uppercase::with_ascii_slice("Έτος".as_bytes()).size_hint(),
+ (8, Some(8))
+ );
+ assert_eq!(
+ Uppercase::with_ascii_slice("ZȺȾ".as_bytes()).size_hint(),
+ (5, Some(5))
+ );
+
+ let mut utf8_with_invalid_bytes = b"\xFF\xFE".to_vec();
+ utf8_with_invalid_bytes.extend_from_slice("Έτος".as_bytes());
+ assert_eq!(
+ Uppercase::with_ascii_slice(&utf8_with_invalid_bytes).size_hint(),
+ (10, Some(10))
+ );
+ }
+
+ #[test]
+ fn count() {
+ assert_eq!(Uppercase::new().count(), 0);
+
+ assert_eq!(Uppercase::with_slice(b"abc, xyz").count(), 8);
+ assert_eq!(Uppercase::with_slice(b"abc, \xFF\xFE, xyz").count(), 12);
+ assert_eq!(Uppercase::with_slice("�".as_bytes()).count(), 3);
+ assert_eq!(Uppercase::with_slice("Έτος".as_bytes()).count(), 8);
+ assert_eq!(Uppercase::with_slice("zⱥⱦ".as_bytes()).count(), 5);
+
+ let mut utf8_with_invalid_bytes = b"\xFF\xFE".to_vec();
+ utf8_with_invalid_bytes.extend_from_slice("Έτος".as_bytes());
+ assert_eq!(Uppercase::with_slice(&utf8_with_invalid_bytes).count(), 10);
+
+ assert_eq!(Uppercase::with_ascii_slice(b"abc, xyz").count(), 8);
+ assert_eq!(
+ Uppercase::with_ascii_slice(b"abc, \xFF\xFE, xyz").count(),
+ 12
+ );
+ assert_eq!(Uppercase::with_ascii_slice("�".as_bytes()).count(), 3);
+ assert_eq!(Uppercase::with_ascii_slice("Έτος".as_bytes()).count(), 8);
+ assert_eq!(Uppercase::with_ascii_slice("ZȺȾ".as_bytes()).count(), 5);
+
+ let mut utf8_with_invalid_bytes = b"\xFF\xFE".to_vec();
+ utf8_with_invalid_bytes.extend_from_slice("Έτος".as_bytes());
+ assert_eq!(
+ Uppercase::with_ascii_slice(&utf8_with_invalid_bytes).count(),
+ 10
+ );
+ }
+
+ #[test]
+ fn size_hint_covers_count() {
+ let iter = Uppercase::new();
+ let (min, max) = iter.size_hint();
+ let count = iter.count();
+ assert!(min <= count);
+ assert!(count <= max.unwrap());
+ }
+}
+
1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 +95 +96 +97 +98 +99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 +247 +248 +249 +250 +251 +252 +253 +254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 +265 +266 +267 +268 +269 +270 +271 +272 +273 +274 +275 +276 +277 +278 +279 +280 +281 +282 +283 +284 +285 +286 +287 +
use core::fmt;
+use core::iter::FusedIterator;
+
+use bstr::ByteSlice;
+
+/// Iterator that yields the bytes of a slice with ASCII letters uppercased.
+///
+/// Each byte is mapped on its own with `u8::to_ascii_uppercase`; nothing is
+/// decoded, so non-ASCII bytes are yielded unchanged.
+#[derive(Clone)]
+#[must_use = "Uppercase is a Iterator and must be used"]
+pub struct Uppercase<'a> {
+ /// Bytes not yet yielded; shrinks from the front on `next` and from the
+ /// back on `next_back`.
+ slice: &'a [u8],
+}
+
+impl<'a> fmt::Debug for Uppercase<'a> {
+    /// Shows the remaining input as a byte string instead of a list of
+    /// integers.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let remaining = self.slice.as_bstr();
+        let mut builder = f.debug_struct("Uppercase");
+        builder.field("slice", &remaining);
+        builder.finish()
+    }
+}
+
+impl<'a> From<&'a [u8]> for Uppercase<'a> {
+    /// Equivalent to [`Uppercase::with_slice`].
+    fn from(bytes: &'a [u8]) -> Self {
+        Uppercase::with_slice(bytes)
+    }
+}
+
+impl<'a> Uppercase<'a> {
+    /// Creates an iterator that will yield every byte of `slice`, uppercasing
+    /// the ASCII letters.
+    pub const fn with_slice(slice: &'a [u8]) -> Self {
+        Uppercase { slice }
+    }
+}
+
+impl<'a> Iterator for Uppercase<'a> {
+    type Item = u8;
+
+    /// Removes the first remaining byte and returns its ASCII-uppercase form,
+    /// or `None` once the slice is empty.
+    fn next(&mut self) -> Option<Self::Item> {
+        let head = *self.slice.first()?;
+        self.slice = &self.slice[1..];
+        Some(head.to_ascii_uppercase())
+    }
+
+    /// Exact bounds: one byte is yielded per remaining input byte.
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        let remaining = self.slice.len();
+        (remaining, Some(remaining))
+    }
+
+    /// Number of bytes left, obtained from the slice length without
+    /// iterating.
+    fn count(self) -> usize {
+        let Uppercase { slice } = self;
+        slice.len()
+    }
+}
+
+impl<'a> DoubleEndedIterator for Uppercase<'a> {
+    /// Removes the last remaining byte and returns its ASCII-uppercase form,
+    /// or `None` once the slice is empty.
+    fn next_back(&mut self) -> Option<Self::Item> {
+        let tail = *self.slice.last()?;
+        let keep = self.slice.len() - 1;
+        self.slice = &self.slice[..keep];
+        Some(tail.to_ascii_uppercase())
+    }
+}
+
+// `size_hint` returns the remaining slice length as both bounds, so the
+// length is always known exactly.
+impl<'a> ExactSizeIterator for Uppercase<'a> {}
+
+// Once the slice is empty `next` and `next_back` return `None` without
+// changing any state, so exhaustion is permanent.
+impl<'a> FusedIterator for Uppercase<'a> {}
+
+#[cfg(test)]
+mod tests {
+ use alloc::vec::Vec;
+ use bstr::ByteSlice;
+
+ use super::Uppercase;
+
+ #[test]
+ fn empty() {
+ let iter = Uppercase::from(&b""[..]);
+ assert_eq!(iter.collect::<Vec<_>>().as_bstr(), b"".as_bstr());
+ }
+
+ #[test]
+ fn ascii() {
+ let iter = Uppercase::from(&b"abc"[..]);
+ assert_eq!(iter.collect::<Vec<_>>().as_bstr(), b"ABC".as_bstr());
+
+ let iter = Uppercase::from(&b"aBC"[..]);
+ assert_eq!(iter.collect::<Vec<_>>().as_bstr(), b"ABC".as_bstr());
+
+ let iter = Uppercase::from(&b"ABC"[..]);
+ assert_eq!(iter.collect::<Vec<_>>().as_bstr(), b"ABC".as_bstr());
+
+ let iter = Uppercase::from(&b"aBC, 123, ABC, baby you and me girl"[..]);
+ assert_eq!(
+ iter.collect::<Vec<_>>().as_bstr(),
+ b"ABC, 123, ABC, BABY YOU AND ME GIRL".as_bstr()
+ );
+ }
+
+ // ignore unicode for ASCII iterator
+ #[test]
+ fn utf8() {
+ let s = "ß".as_bytes();
+ let iter = Uppercase::from(s);
+ assert_eq!(iter.collect::<Vec<_>>().as_bstr(), "ß".as_bytes().as_bstr());
+
+ let s = "Αύριο".as_bytes();
+ let iter = Uppercase::from(s);
+ assert_eq!(
+ iter.collect::<Vec<_>>().as_bstr(),
+ "Αύριο".as_bytes().as_bstr()
+ );
+
+ let s = "Έτος".as_bytes();
+ let iter = Uppercase::from(s);
+ assert_eq!(
+ iter.collect::<Vec<_>>().as_bstr(),
+ "Έτος".as_bytes().as_bstr()
+ );
+
+ // "two-byte" characters: Deseret letters, each two UTF-16 code units (four bytes in UTF-8)
+ // https://github.com/minimaxir/big-list-of-naughty-strings/blob/894882e7/blns.txt#L198-L200
+ let s = "𐐜 𐐔𐐇𐐝𐐀𐐡𐐇𐐓 𐐙𐐊𐐡𐐝𐐓/𐐝𐐇𐐗𐐊𐐤𐐔 𐐒𐐋𐐗 𐐒𐐌 𐐜 𐐡𐐀𐐖𐐇𐐤𐐓𐐝 𐐱𐑂 𐑄 𐐔𐐇𐐝𐐀𐐡𐐇𐐓 𐐏𐐆𐐅𐐤𐐆𐐚𐐊𐐡𐐝𐐆𐐓𐐆".as_bytes();
+ let iter = Uppercase::from(s);
+ assert_eq!(
+ iter.collect::<Vec<_>>().as_bstr(),
+ "𐐜 𐐔𐐇𐐝𐐀𐐡𐐇𐐓 𐐙𐐊𐐡𐐝𐐓/𐐝𐐇𐐗𐐊𐐤𐐔 𐐒𐐋𐐗 𐐒𐐌 𐐜 𐐡𐐀𐐖𐐇𐐤𐐓𐐝 𐐱𐑂 𐑄 𐐔𐐇𐐝𐐀𐐡𐐇𐐓 𐐏𐐆𐐅𐐤𐐆𐐚𐐊𐐡𐐝𐐆𐐓𐐆"
+ .as_bytes()
+ .as_bstr()
+ );
+
+ // Change length when lowercased
+ // https://github.com/minimaxir/big-list-of-naughty-strings/blob/894882e7/blns.txt#L226-L232
+ let s = "zȺȾ".as_bytes();
+ let iter = Uppercase::from(s);
+ assert_eq!(
+ iter.collect::<Vec<_>>().as_bstr(),
+ "ZȺȾ".as_bytes().as_bstr()
+ );
+ }
+
+ #[test]
+ fn invalid_utf8() {
+ let iter = Uppercase::from(&b"\xFF\xFE"[..]);
+ assert_eq!(iter.collect::<Vec<u8>>().as_bstr(), b"\xFF\xFE".as_bstr());
+
+ let iter = Uppercase::from(&b"ABC\xFF\xFEXYZ"[..]);
+ assert_eq!(
+ iter.collect::<Vec<u8>>().as_bstr(),
+ b"ABC\xFF\xFEXYZ".as_bstr()
+ );
+
+ let iter = Uppercase::from(&b"abc\xFF\xFEXYZ"[..]);
+ assert_eq!(
+ iter.collect::<Vec<u8>>().as_bstr(),
+ b"ABC\xFF\xFEXYZ".as_bstr()
+ );
+
+ // The bytes \xF0\x9F\x87 could lead to a valid UTF-8 sequence, but 3 of
+ // them on their own are invalid. The ASCII iterator maps one byte at a
+ // time and never decodes, so no replacement codepoint is substituted: the
+ // truncated sequence is yielded unchanged.
+ //
+ // See: https://docs.rs/bstr/1.*/bstr/#handling-of-invalid-utf-8
+ let iter = Uppercase::from(&b"aB\xF0\x9F\x87Yz"[..]);
+ assert_eq!(
+ iter.collect::<Vec<_>>().as_bstr(),
+ b"AB\xF0\x9F\x87YZ".as_bstr()
+ );
+ }
+
+ // ignore unicode for ASCII iterator
+ #[test]
+ fn unicode_replacement_character() {
+ let s = "�".as_bytes();
+ let iter = Uppercase::from(s);
+ assert_eq!(iter.collect::<Vec<_>>().as_bstr(), "�".as_bytes().as_bstr());
+ }
+
+ // ignore unicode for ASCII iterator
+ #[test]
+ fn dz_titlecase() {
+ let s = "Dž".as_bytes();
+ let iter = Uppercase::from(s);
+ assert_eq!(iter.collect::<Vec<_>>().as_bstr(), "Dž".as_bytes().as_bstr());
+ }
+
+ // ignore unicode for ASCII iterator
+ #[test]
+ fn latin_capital_i_with_dot_above() {
+ let s = "İ".as_bytes();
+ let iter = Uppercase::from(s);
+ assert_eq!(iter.collect::<Vec<_>>().as_bstr(), "İ".as_bytes().as_bstr());
+ }
+
+ // ignore unicode for ASCII iterator
+ #[test]
+ fn case_map_to_two_chars() {
+ let s = "İ".as_bytes();
+ let iter = Uppercase::from(s);
+ assert_eq!(iter.collect::<Vec<_>>().as_bstr(), "İ".as_bytes().as_bstr());
+ }
+
+ #[test]
+ fn size_hint() {
+ assert_eq!(Uppercase::with_slice(b"").size_hint(), (0, Some(0)));
+ assert_eq!(Uppercase::with_slice(b"abc, xyz").size_hint(), (8, Some(8)));
+ assert_eq!(
+ Uppercase::with_slice(b"abc, \xFF\xFE, xyz").size_hint(),
+ (12, Some(12))
+ );
+ assert_eq!(
+ Uppercase::with_slice("�".as_bytes()).size_hint(),
+ (3, Some(3))
+ );
+ assert_eq!(
+ Uppercase::with_slice("Έτος".as_bytes()).size_hint(),
+ (8, Some(8))
+ );
+ assert_eq!(
+ Uppercase::with_slice("ZȺȾ".as_bytes()).size_hint(),
+ (5, Some(5))
+ );
+
+ let mut utf8_with_invalid_bytes = b"\xFF\xFE".to_vec();
+ utf8_with_invalid_bytes.extend_from_slice("Έτος".as_bytes());
+ assert_eq!(
+ Uppercase::with_slice(&utf8_with_invalid_bytes).size_hint(),
+ (10, Some(10))
+ );
+ }
+
+ #[test]
+ fn count() {
+ assert_eq!(Uppercase::with_slice(b"").count(), 0);
+ assert_eq!(Uppercase::with_slice(b"abc, xyz").count(), 8);
+ assert_eq!(Uppercase::with_slice(b"abc, \xFF\xFE, xyz").count(), 12);
+ assert_eq!(Uppercase::with_slice("�".as_bytes()).count(), 3);
+ assert_eq!(Uppercase::with_slice("Έτος".as_bytes()).count(), 8);
+ assert_eq!(Uppercase::with_slice("ZȺȾ".as_bytes()).count(), 5);
+
+ let mut utf8_with_invalid_bytes = b"\xFF\xFE".to_vec();
+ utf8_with_invalid_bytes.extend_from_slice("Έτος".as_bytes());
+ assert_eq!(Uppercase::with_slice(&utf8_with_invalid_bytes).count(), 10);
+ }
+
+ #[test]
+ fn size_hint_covers_count() {
+     // Last input: invalid bytes followed by multi-byte UTF-8 text.
+     let mut utf8_with_invalid_bytes = b"\xFF\xFE".to_vec();
+     utf8_with_invalid_bytes.extend_from_slice("Έτος".as_bytes());
+
+     let inputs: [&[u8]; 7] = [
+         b"",
+         b"abc, xyz",
+         b"abc, \xFF\xFE, xyz",
+         "�".as_bytes(),
+         "Έτος".as_bytes(),
+         "ZȺȾ".as_bytes(),
+         &utf8_with_invalid_bytes,
+     ];
+
+     // The real number of yielded bytes must lie within the advertised bounds.
+     for &input in inputs.iter() {
+         let iter = Uppercase::with_slice(input);
+         let (min, max) = iter.size_hint();
+         let count = iter.count();
+         assert!(min <= count);
+         assert!(count <= max.unwrap());
+     }
+ }
+}
+
1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 +95 +96 +97 +98 +99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 +247 +248 +249 +250 +251 +252 +253 +254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 +265 +266 +267 +268 +269 +270 +271 +272 +273 +274 +275 +276 +277 +278 +279 +280 +281 +282 +283 +284 +285 +286 +287 +288 +289 +290 +291 +292 +293 +294 +295 +296 +297 +298 +299 +300 +301 +302 +303 +304 +305 +306 +307 +308 +309 +310 +311 +312 +313 +314 +315 +316 +317 +318 +319 +320 +321 +322 +323 +324 +325 +326 +327 +328 +329 +330 +331 +332 +333 +334 +335 +336 +337 +338 +339 +340 +341 +342 +343 +344 +345 +346 +347 +348 +349 +350 +351 +352 +353 +354 +355 +356 +357 +358 +359 +360 +361 +362 +363 +364 +365 +366 +367 +368 +369 +370 +371 +372 +373 +374 +375 +376 +377 +378 +379 +380 +381 +382 +383 +384 +385 +386 +387 +388 +389 +390 +391 +392 +393 +
use core::char::ToUppercase;
+use core::fmt;
+use core::iter::FusedIterator;
+use core::ops::Range;
+
+use bstr::ByteSlice;
+
/// An iterator over the bytes of a byte slice after Unicode uppercasing.
///
/// The input is decoded one UTF-8 sequence at a time. Each decoded `char` is
/// mapped with [`char::to_uppercase`] and the result is re-encoded as UTF-8
/// and yielded byte by byte. Byte sequences that are not valid UTF-8 are
/// yielded unchanged.
#[derive(Clone)]
#[must_use = "Uppercase is an Iterator and must be used"]
pub struct Uppercase<'a> {
    /// Input bytes that have not been decoded yet.
    slice: &'a [u8],
    /// Scratch buffer holding the bytes of the sequence currently being
    /// yielded (one encoded `char` or one invalid byte sequence).
    next_bytes: [u8; 4],
    /// Indices into `next_bytes` that still have to be yielded.
    next_range: Range<usize>,
    /// Remaining `char`s of the uppercase mapping of the most recently
    /// decoded `char`; a single `char` may uppercase to several.
    uppercase: Option<ToUppercase>,
}
+
+impl<'a> fmt::Debug for Uppercase<'a> {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ f.debug_struct("Uppercase")
+ .field("slice", &self.slice.as_bstr())
+ .field("next_bytes", &self.next_bytes)
+ .field("next_range", &self.next_range)
+ .field("uppercase", &self.uppercase)
+ .finish()
+ }
+}
+
+impl<'a> From<&'a [u8]> for Uppercase<'a> {
+ fn from(slice: &'a [u8]) -> Self {
+ Self::with_slice(slice)
+ }
+}
+
+impl<'a> Uppercase<'a> {
+ pub const fn with_slice(slice: &'a [u8]) -> Self {
+ Self {
+ slice,
+ next_bytes: [0; 4],
+ next_range: 0..0,
+ uppercase: None,
+ }
+ }
+}
+
+impl<'a> Iterator for Uppercase<'a> {
+ type Item = u8;
+
+ fn next(&mut self) -> Option<Self::Item> {
+ if let Some(idx) = self.next_range.next() {
+ debug_assert!(self.next_bytes.get(idx).is_some());
+
+ return Some(self.next_bytes[idx]);
+ }
+
+ if let Some(ch) = self.uppercase.as_mut().and_then(Iterator::next) {
+ let enc = ch.encode_utf8(&mut self.next_bytes);
+
+ self.next_range = 1..enc.len();
+ debug_assert!(self.next_bytes.get(self.next_range.clone()).is_some());
+
+ return Some(self.next_bytes[0]);
+ }
+
+ self.uppercase = None;
+
+ match bstr::decode_utf8(self.slice) {
+ (_, 0) => None,
+ (Some(ch), size) => {
+ self.slice = &self.slice[size..];
+ let mut uppercase = ch.to_uppercase();
+ let ch = uppercase
+ .next()
+ .expect("ToUppercase yields at least one char");
+ let enc = ch.encode_utf8(&mut self.next_bytes);
+
+ self.next_range = 1..enc.len();
+ debug_assert!(self.next_bytes.get(self.next_range.clone()).is_some());
+
+ self.uppercase = Some(uppercase);
+ Some(self.next_bytes[0])
+ }
+ (None, size) => {
+ let (bytes, remainder) = self.slice.split_at(size);
+ self.slice = remainder;
+
+ // Invalid byte sequences are at most three bytes.
+ debug_assert!(self.next_bytes.get(..bytes.len()).is_some());
+
+ self.next_bytes[..bytes.len()].copy_from_slice(bytes);
+ self.next_range = 1..bytes.len();
+ Some(self.next_bytes[0])
+ }
+ }
+ }
+
+ fn size_hint(&self) -> (usize, Option<usize>) {
+ const TO_UPPER_EXPAND: usize = 3;
+ const UTF_8_CHAR_MAX_BYTES: usize = 4;
+ if self.slice.is_empty() {
+ (0, Some(0))
+ } else if self.slice.is_ascii() {
+ let len = self.slice.len();
+ (len, Some(len))
+ } else {
+ let len = self.slice.len();
+ (len, Some(len * TO_UPPER_EXPAND * UTF_8_CHAR_MAX_BYTES))
+ }
+ }
+
+ fn count(self) -> usize {
+ if self.slice.is_empty() {
+ 0
+ } else if self.slice.is_ascii() {
+ self.slice.len()
+ } else {
+ self.fold(0, |acc, _| acc + 1)
+ }
+ }
+}
+
+impl<'a> FusedIterator for Uppercase<'a> {}
+
+#[cfg(test)]
+mod tests {
+ use alloc::vec::Vec;
+ use bstr::ByteSlice;
+
+ use super::Uppercase;
+
+ #[test]
+ fn empty() {
+ let iter = Uppercase::from(&b""[..]);
+ assert_eq!(iter.collect::<Vec<_>>().as_bstr(), b"".as_bstr());
+ }
+
+ #[test]
+ fn ascii() {
+ let iter = Uppercase::from(&b"abc"[..]);
+ assert_eq!(iter.collect::<Vec<_>>().as_bstr(), b"ABC".as_bstr());
+
+ let iter = Uppercase::from(&b"aBC"[..]);
+ assert_eq!(iter.collect::<Vec<_>>().as_bstr(), b"ABC".as_bstr());
+
+ let iter = Uppercase::from(&b"ABC"[..]);
+ assert_eq!(iter.collect::<Vec<_>>().as_bstr(), b"ABC".as_bstr());
+
+ let iter = Uppercase::from(&b"aBC, 123, ABC, baby you and me girl"[..]);
+ assert_eq!(
+ iter.collect::<Vec<_>>().as_bstr(),
+ b"ABC, 123, ABC, BABY YOU AND ME GIRL".as_bstr()
+ );
+ }
+
+ #[test]
+ fn utf8() {
+ let s = "ß".as_bytes();
+ let iter = Uppercase::from(s);
+ assert_eq!(
+ iter.collect::<Vec<_>>().as_bstr(),
+ "SS".as_bytes().as_bstr()
+ );
+
+ let s = "Αύριο".as_bytes();
+ let iter = Uppercase::from(s);
+ assert_eq!(
+ iter.collect::<Vec<_>>().as_bstr(),
+ "ΑΎΡΙΟ".as_bytes().as_bstr()
+ );
+
+ let s = "Έτος".as_bytes();
+ let iter = Uppercase::from(s);
+ assert_eq!(
+ iter.collect::<Vec<_>>().as_bstr(),
+ "ΈΤΟΣ".as_bytes().as_bstr()
+ );
+
+ // two-byte characters
+ // https://github.com/minimaxir/big-list-of-naughty-strings/blob/894882e7/blns.txt#L198-L200
+ let s = "𐑄 𐐼𐐯𐑅𐐨𐑉𐐯𐐻 𐑁𐐲𐑉𐑅𐐻/𐑅𐐯𐐿𐐲𐑌𐐼 𐐺𐐳𐐿 𐐺𐐴 𐑄 𐑉𐐨𐐾𐐯𐑌𐐻𐑅 𐐱𐑂 𐑄 𐐼𐐯𐑅𐐨𐑉𐐯𐐻 𐐷𐐮𐐭𐑌𐐮𐑂𐐲𐑉𐑅𐐮𐐻𐐮".as_bytes();
+ let iter = Uppercase::from(s);
+ assert_eq!(
+ iter.collect::<Vec<_>>().as_bstr(),
+ "𐐜 𐐔𐐇𐐝𐐀𐐡𐐇𐐓 𐐙𐐊𐐡𐐝𐐓/𐐝𐐇𐐗𐐊𐐤𐐔 𐐒𐐋𐐗 𐐒𐐌 𐐜 𐐡𐐀𐐖𐐇𐐤𐐓𐐝 𐐉𐐚 𐐜 𐐔𐐇𐐝𐐀𐐡𐐇𐐓 𐐏𐐆𐐅𐐤𐐆𐐚𐐊𐐡𐐝𐐆𐐓𐐆"
+ .as_bytes()
+ .as_bstr()
+ );
+
+ // Change length when uppercased
+ // https://github.com/minimaxir/big-list-of-naughty-strings/blob/894882e7/blns.txt#L226-L232
+ let s = "zⱥⱦ".as_bytes();
+ let iter = Uppercase::from(s);
+ assert_eq!(
+ iter.collect::<Vec<_>>().as_bstr(),
+ "ZȺȾ".as_bytes().as_bstr()
+ );
+ }
+
+ #[test]
+ fn invalid_utf8() {
+ let iter = Uppercase::from(&b"\xFF\xFE"[..]);
+ assert_eq!(iter.collect::<Vec<u8>>().as_bstr(), b"\xFF\xFE".as_bstr());
+
+ let iter = Uppercase::from(&b"abc\xFF\xFExyz"[..]);
+ assert_eq!(
+ iter.collect::<Vec<u8>>().as_bstr(),
+ b"ABC\xFF\xFEXYZ".as_bstr()
+ );
+
+ let iter = Uppercase::from(&b"abc\xFF\xFEXYZ"[..]);
+ assert_eq!(
+ iter.collect::<Vec<u8>>().as_bstr(),
+ b"ABC\xFF\xFEXYZ".as_bstr()
+ );
+
+ // The bytes \xF0\x9F\x87 could lead to a valid UTF-8 sequence, but 3 of
+ // them on their own are invalid. Only one replacement codepoint is
+ // substituted, which demonstrates the "substitution of maximal
+ // subparts" strategy.
+ //
+ // See: https://docs.rs/bstr/1.*/bstr/#handling-of-invalid-utf-8
+ let iter = Uppercase::from(&b"aB\xF0\x9F\x87Yz"[..]);
+ assert_eq!(
+ iter.collect::<Vec<_>>().as_bstr(),
+ b"AB\xF0\x9F\x87YZ".as_bstr()
+ );
+ }
+
+ #[test]
+ fn unicode_replacement_character() {
+ let s = "�".as_bytes();
+ let iter = Uppercase::from(s);
+ assert_eq!(iter.collect::<Vec<_>>().as_bstr(), "�".as_bytes().as_bstr());
+ }
+
+ #[test]
+ fn dz_titlecase() {
+ let s = "Dž".as_bytes();
+ let iter = Uppercase::from(s);
+ assert_eq!(iter.collect::<Vec<_>>().as_bstr(), "DŽ".as_bytes().as_bstr());
+ }
+
+ #[test]
+ fn latin_small_i_with_dot_above() {
+ let s = "i̇".as_bytes();
+ let iter = Uppercase::from(s);
+ assert_eq!(
+ iter.collect::<Vec<_>>().as_bstr(),
+ [73_u8, 204, 135].as_bstr()
+ );
+ }
+
+ #[test]
+ fn case_map_to_two_chars() {
+ let s = "և".as_bytes();
+ let iter = Uppercase::from(s);
+ assert_eq!(
+ iter.collect::<Vec<_>>().as_bstr(),
+ "ԵՒ".as_bytes().as_bstr()
+ );
+
+ let s = "ẙ".as_bytes();
+ let iter = Uppercase::from(s);
+ assert_eq!(
+ iter.collect::<Vec<_>>().as_bstr(),
+ "Y\u{30a}".as_bytes().as_bstr()
+ );
+
+ let s = "ᾂ".as_bytes();
+ let iter = Uppercase::from(s);
+ assert_eq!(
+ iter.collect::<Vec<_>>().as_bstr(),
+ "ἊΙ".as_bytes().as_bstr()
+ );
+
+ let s = "ﬗ".as_bytes();
+ let iter = Uppercase::from(s);
+ assert_eq!(
+ iter.collect::<Vec<_>>().as_bstr(),
+ "ՄԽ".as_bytes().as_bstr()
+ );
+ }
+
+ #[test]
+ fn case_map_to_three_chars() {
+ let s = "ffi".as_bytes();
+ let iter = Uppercase::from(s);
+ assert_eq!(iter.collect::<Vec<_>>().as_bstr(), b"FFI".as_bstr());
+
+ let s = "ὖ".as_bytes();
+ let iter = Uppercase::from(s);
+ assert_eq!(
+ iter.collect::<Vec<_>>().as_bstr(),
+ "Υ\u{313}\u{342}".as_bytes().as_bstr()
+ );
+
+ let s = "ῷ".as_bytes();
+ let iter = Uppercase::from(s);
+ assert_eq!(
+ iter.collect::<Vec<_>>().as_bstr(),
+ "Ω\u{342}Ι".as_bytes().as_bstr()
+ );
+ }
+
+ #[test]
+ fn size_hint() {
+ assert_eq!(Uppercase::with_slice(b"").size_hint(), (0, Some(0)));
+ assert_eq!(Uppercase::with_slice(b"abc, xyz").size_hint(), (8, Some(8)));
+ assert_eq!(
+ Uppercase::with_slice(b"abc, \xFF\xFE, xyz").size_hint(),
+ (12, Some(144))
+ );
+ assert_eq!(
+ Uppercase::with_slice("�".as_bytes()).size_hint(),
+ (3, Some(36))
+ );
+ assert_eq!(
+ Uppercase::with_slice("Έτος".as_bytes()).size_hint(),
+ (8, Some(96))
+ );
+ assert_eq!(
+ Uppercase::with_slice("ZȺȾ".as_bytes()).size_hint(),
+ (5, Some(60))
+ );
+
+ let mut utf8_with_invalid_bytes = b"\xFF\xFE".to_vec();
+ utf8_with_invalid_bytes.extend_from_slice("Έτος".as_bytes());
+ assert_eq!(
+ Uppercase::with_slice(&utf8_with_invalid_bytes).size_hint(),
+ (10, Some(120))
+ );
+ }
+
+ #[test]
+ fn count() {
+ assert_eq!(Uppercase::with_slice(b"").count(), 0);
+ assert_eq!(Uppercase::with_slice(b"abc, xyz").count(), 8);
+ assert_eq!(Uppercase::with_slice(b"abc, \xFF\xFE, xyz").count(), 12);
+ assert_eq!(Uppercase::with_slice("�".as_bytes()).count(), 3);
+ assert_eq!(Uppercase::with_slice("Έτος".as_bytes()).count(), 8);
+ assert_eq!(Uppercase::with_slice("zⱥⱦ".as_bytes()).count(), 5);
+
+ let mut utf8_with_invalid_bytes = b"\xFF\xFE".to_vec();
+ utf8_with_invalid_bytes.extend_from_slice("Έτος".as_bytes());
+ assert_eq!(Uppercase::with_slice(&utf8_with_invalid_bytes).count(), 10);
+ }
+
+ #[test]
+ fn size_hint_covers_count() {
+ let iter = Uppercase::with_slice(b"");
+ let (min, max) = iter.size_hint();
+ let count = iter.count();
+ assert!(min <= count);
+ assert!(count <= max.unwrap());
+
+ let iter = Uppercase::with_slice(b"abc, xyz");
+ let (min, max) = iter.size_hint();
+ let count = iter.count();
+ assert!(min <= count);
+ assert!(count <= max.unwrap());
+
+ let iter = Uppercase::with_slice(b"abc, \xFF\xFE, xyz");
+ let (min, max) = iter.size_hint();
+ let count = iter.count();
+ assert!(min <= count);
+ assert!(count <= max.unwrap());
+
+ let iter = Uppercase::with_slice("�".as_bytes());
+ let (min, max) = iter.size_hint();
+ let count = iter.count();
+ assert!(min <= count);
+ assert!(count <= max.unwrap());
+
+ let iter = Uppercase::with_slice("Έτος".as_bytes());
+ let (min, max) = iter.size_hint();
+ let count = iter.count();
+ assert!(min <= count);
+ assert!(count <= max.unwrap());
+
+ let iter = Uppercase::with_slice("ZȺȾ".as_bytes());
+ let (min, max) = iter.size_hint();
+ let count = iter.count();
+ assert!(min <= count);
+ assert!(count <= max.unwrap());
+
+ let mut utf8_with_invalid_bytes = b"\xFF\xFE".to_vec();
+ utf8_with_invalid_bytes.extend_from_slice("Έτος".as_bytes());
+ let iter = Uppercase::with_slice(&utf8_with_invalid_bytes);
+ let (min, max) = iter.size_hint();
+ let count = iter.count();
+ assert!(min <= count);
+ assert!(count <= max.unwrap());
+ }
+}
+
fn:
) to \
+ restrict the search to a given item kind.","Accepted kinds are: fn
, mod
, struct
, \
+ enum
, trait
, type
, macro
, \
+ and const
.","Search functions by type signature (e.g., vec -> usize
or \
+ -> vec
or String, enum:Cow -> bool
)","You can look for items with an exact name by putting double quotes around \
+ your request: \"string\"
","Look for functions that accept or return \
+ slices and \
+ arrays by writing \
+ square brackets (e.g., -> [u8]
or [] -> Option
)","Look for items inside another one by searching for a path: vec::Vec
",].map(x=>""+x+"
").join("");const div_infos=document.createElement("div");addClass(div_infos,"infos");div_infos.innerHTML="${value.replaceAll(" ", " ")}
`}else{error[index]=value}});output+=`