From e48367ac6673fee0bf9b1fca1be10d52097b60d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9e=20Kooi?= Date: Fri, 25 Aug 2023 10:12:48 +0200 Subject: [PATCH 1/3] Remove individual allocations for each line of source text --- src/source.rs | 147 ++++++++++++++++++++++++++++++++------------------ src/write.rs | 6 +-- 2 files changed, 98 insertions(+), 55 deletions(-) diff --git a/src/source.rs b/src/source.rs index c1f7cb6..8b86e1d 100644 --- a/src/source.rs +++ b/src/source.rs @@ -30,11 +30,12 @@ impl, Id: ?Sized> Cache for Box { } /// A type representing a single line of a [`Source`]. -#[derive(Clone, Debug, Hash, PartialEq, Eq)] +#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq)] pub struct Line { offset: usize, - len: usize, - chars: String, + char_len: usize, + byte_offset: usize, + byte_len: usize, } impl Line { @@ -42,13 +43,14 @@ impl Line { pub fn offset(&self) -> usize { self.offset } /// Get the character length of this line. - pub fn len(&self) -> usize { self.len } + pub fn len(&self) -> usize { self.char_len } /// Get the offset span of this line in the original [`Source`]. - pub fn span(&self) -> Range { self.offset..self.offset + self.len } + pub fn span(&self) -> Range { self.offset..self.offset + self.char_len } - /// Return an iterator over the characters in the line, excluding trailing whitespace. - pub fn chars(&self) -> impl Iterator + '_ { self.chars.chars() } + /// Get the byte offset span of this line in the original [`Source`]. This can be used to + /// directly slice into its source text. + fn byte_span(&self) -> Range { self.byte_offset..self.byte_offset + self.byte_len } } /// A type representing a single source that may be referred to by [`Span`]s. @@ -56,6 +58,7 @@ impl Line { /// In most cases, a source is a single input file. #[derive(Clone, Debug, Hash, PartialEq, Eq)] pub struct Source { + text: String, lines: Vec, len: usize, } @@ -65,7 +68,8 @@ impl> From for Source { /// /// Note that this function can be expensive for long strings. Use an implementor of [`Cache`] where possible. fn from(s: S) -> Self { - let mut offset = 0; + let mut char_offset = 0; + let mut byte_offset = 0; // (Last line, last line ends with CR) let mut last_line: Option<(Line, bool)> = None; let mut lines: Vec = s @@ -85,20 +89,24 @@ impl> From for Source { if let Some((last, ends_with_cr)) = last_line.as_mut() { if *ends_with_cr && line == "\n" { - last.len += 1; - offset += 1; + last.char_len += 1; + last.byte_len += 1; + char_offset += 1; + byte_offset += 1; return replace(&mut last_line, None).map(|(l, _)| l); } } - let len = line.chars().count(); + let char_len = line.chars().count(); let ends_with_cr = line.ends_with('\r'); let line = Line { - offset, - len, - chars: line.trim_end().to_owned(), + offset: char_offset, + char_len, + byte_offset, + byte_len: line.len(), }; - offset += len; + char_offset += char_len; + byte_offset += line.byte_len; replace(&mut last_line, Some((line, ends_with_cr))).map(|(l, _)| l) }) .collect(); @@ -108,8 +116,9 @@ impl> From for Source { } Self { + text: s.as_ref().to_string(), lines, - len: offset, + len: char_offset, } } } @@ -119,26 +128,30 @@ impl Source { pub fn len(&self) -> usize { self.len } /// Return an iterator over the characters in the source. - pub fn chars(&self) -> impl Iterator + '_ { - self.lines.iter().map(|l| l.chars()).flatten() - } + pub fn chars(&self) -> impl Iterator + '_ { self.text.chars() } /// Get access to a specific, zero-indexed [`Line`]. - pub fn line(&self, idx: usize) -> Option<&Line> { self.lines.get(idx) } + pub fn line(&self, idx: usize) -> Option { self.lines.get(idx).copied() } /// Return an iterator over the [`Line`]s in this source. - pub fn lines(&self) -> impl ExactSizeIterator + '_ { self.lines.iter() } + pub fn lines(&self) -> impl ExactSizeIterator + '_ { self.lines.iter().copied() } /// Get the line that the given offset appears on, and the line/column numbers of the offset. /// /// Note that the line/column numbers are zero-indexed. - pub fn get_offset_line(&self, offset: usize) -> Option<(&Line, usize, usize)> { + pub fn get_offset_line(&self, offset: usize) -> Option<(Line, usize, usize)> { if offset <= self.len { - let idx = self.lines + let idx = self + .lines .binary_search_by_key(&offset, |line| line.offset) .unwrap_or_else(|idx| idx.saturating_sub(1)); - let line = &self.lines[idx]; - assert!(offset >= line.offset, "offset = {}, line.offset = {}", offset, line.offset); + let line = self.lines[idx]; + assert!( + offset >= line.offset, + "offset = {}, line.offset = {}", + offset, + line.offset + ); Some((line, idx, offset - line.offset)) } else { None @@ -151,9 +164,16 @@ impl Source { /// [`Source::line`]). pub fn get_line_range(&self, span: &S) -> Range { let start = self.get_offset_line(span.start()).map_or(0, |(_, l, _)| l); - let end = self.get_offset_line(span.end().saturating_sub(1).max(span.start())).map_or(self.lines.len(), |(_, l, _)| l + 1); + let end = self + .get_offset_line(span.end().saturating_sub(1).max(span.start())) + .map_or(self.lines.len(), |(_, l, _)| l + 1); start..end } + + /// Get the source text for a line, excluding trailing whitespace. + pub fn get_line_text(&self, line: Line) -> Option<&'_ str> { + self.text.get(line.byte_span()).map(|text| text.trim_end()) + } } impl Cache<()> for Source { @@ -244,38 +264,61 @@ mod tests { use super::Source; - #[test] - fn source_from() { - fn test(lines: Vec<&str>) { - let source: String = lines.iter().map(|s| *s).collect(); - let source = Source::from(source); - - assert_eq!(source.lines.len(), lines.len()); - - let mut offset = 0; - for (source_line, raw_line) in zip(source.lines.into_iter(), lines.into_iter()) { - assert_eq!(source_line.offset, offset); - assert_eq!(source_line.len, raw_line.chars().count()); - assert_eq!(source_line.chars, raw_line.trim_end()); - offset += source_line.len; - } - - assert_eq!(source.len, offset); + fn test_from(lines: Vec<&str>) { + let source: String = lines.iter().map(|s| *s).collect(); + let source = Source::from(source); + + assert_eq!(source.lines.len(), lines.len()); + + let mut offset = 0; + for (source_line, raw_line) in zip(source.lines.iter().copied(), lines.into_iter()) { + // dbg!(source_line, &raw_line[source_line.byte_span()]); + assert_eq!(source_line.offset, offset); + assert_eq!(source_line.char_len, raw_line.chars().count()); + assert_eq!( + source.get_line_text(source_line).unwrap(), + raw_line.trim_end() + ); + offset += source_line.char_len; } - test(vec![]); // Empty string + assert_eq!(source.len, offset); + } - test(vec!["Single line"]); - test(vec!["Single line with LF\n"]); - test(vec!["Single line with CRLF\r\n"]); + #[test] + fn source_from_empty() { + test_from(vec![]); // Empty string + } - test(vec!["Two\r\n", "lines\n"]); - test(vec!["Some\n", "more\r\n", "lines"]); - test(vec!["\n", "\r\n", "\n", "Empty Lines"]); + #[test] + fn source_from_single() { + test_from(vec!["Single line"]); + test_from(vec!["Single line with LF\n"]); + test_from(vec!["Single line with CRLF\r\n"]); + } - test(vec!["Trailing spaces \n", "are trimmed\t"]); + #[test] + fn source_from_multi() { + test_from(vec!["Two\r\n", "lines\n"]); + test_from(vec!["Some\n", "more\r\n", "lines"]); + test_from(vec!["\n", "\r\n", "\n", "Empty Lines"]); + } + #[test] + fn source_from_trims_trailing_spaces() { + test_from(vec!["Trailing spaces \n", "are trimmed\t"]); + } + + #[test] + fn source_from_alternate_line_endings() { // Line endings other than LF or CRLF - test(vec!["CR\r", "VT\x0B", "FF\x0C", "NEL\u{0085}", "LS\u{2028}", "PS\u{2029}"]); + test_from(vec![ + "CR\r", + "VT\x0B", + "FF\x0C", + "NEL\u{0085}", + "LS\u{2028}", + "PS\u{2029}", + ]); } } diff --git a/src/write.rs b/src/write.rs index 542ec60..a4f48e5 100644 --- a/src/write.rs +++ b/src/write.rs @@ -582,7 +582,7 @@ impl Report<'_, S> { // Line if !is_ellipsis { - for (col, c) in line.chars().enumerate() { + for (col, c) in src.get_line_text(line).unwrap().chars().enumerate() { let color = if let Some(highlight) = get_highlight(col) { highlight.color } else { @@ -620,7 +620,7 @@ impl Report<'_, S> { &margin_label, )?; // Lines alternate - let mut chars = line.chars(); + let mut chars = src.get_line_text(line).unwrap().chars(); for col in 0..arrow_len { let width = chars.next().map_or(1, |c| self.config.char_width(c, col).1); @@ -674,7 +674,7 @@ impl Report<'_, S> { &margin_label, )?; // Lines - let mut chars = line.chars(); + let mut chars = src.get_line_text(line).unwrap().chars(); for col in 0..arrow_len { let width = chars.next().map_or(1, |c| self.config.char_width(c, col).1); From dcd8b72996388ac450f7236c8f5eba4077e82618 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9e=20Kooi?= Date: Fri, 29 Sep 2023 10:36:54 +0200 Subject: [PATCH 2/3] Support alternative String types in `Source` --- src/source.rs | 153 ++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 110 insertions(+), 43 deletions(-) diff --git a/src/source.rs b/src/source.rs index 8b86e1d..f7c575f 100644 --- a/src/source.rs +++ b/src/source.rs @@ -8,9 +8,16 @@ use std::{ /// A trait implemented by [`Source`] caches. pub trait Cache { + /// The type used to store the string data for this cache. + /// + /// Alternative types other than String can be used, but at the moment, the storage must be + /// contiguous. A primary use case for this is to use a reference-counted string instead of + /// copying the whole contents into a [`Source`]. + type Storage: AsRef; + /// Fetch the [`Source`] identified by the given ID, if possible. // TODO: Don't box - fn fetch(&mut self, id: &Id) -> Result<&Source, Box>; + fn fetch(&mut self, id: &Id) -> Result<&Source, Box>; /// Display the given ID. as a single inline value. /// @@ -20,12 +27,16 @@ pub trait Cache { } impl<'b, C: Cache, Id: ?Sized> Cache for &'b mut C { - fn fetch(&mut self, id: &Id) -> Result<&Source, Box> { C::fetch(self, id) } + type Storage = C::Storage; + + fn fetch(&mut self, id: &Id) -> Result<&Source, Box> { C::fetch(self, id) } fn display<'a>(&self, id: &'a Id) -> Option> { C::display(self, id) } } impl, Id: ?Sized> Cache for Box { - fn fetch(&mut self, id: &Id) -> Result<&Source, Box> { C::fetch(self, id) } + type Storage = C::Storage; + + fn fetch(&mut self, id: &Id) -> Result<&Source, Box> { C::fetch(self, id) } fn display<'a>(&self, id: &'a Id) -> Option> { C::display(self, id) } } @@ -57,22 +68,22 @@ impl Line { /// /// In most cases, a source is a single input file. #[derive(Clone, Debug, Hash, PartialEq, Eq)] -pub struct Source { - text: String, +pub struct Source> { + text: I, lines: Vec, len: usize, } -impl> From for Source { +impl> From for Source { /// Generate a [`Source`] from the given [`str`]. /// /// Note that this function can be expensive for long strings. Use an implementor of [`Cache`] where possible. - fn from(s: S) -> Self { + fn from(input: I) -> Self { let mut char_offset = 0; let mut byte_offset = 0; // (Last line, last line ends with CR) let mut last_line: Option<(Line, bool)> = None; - let mut lines: Vec = s + let mut lines: Vec = input .as_ref() .split_inclusive([ '\r', // Carriage return @@ -116,19 +127,19 @@ impl> From for Source { } Self { - text: s.as_ref().to_string(), + text: input, lines, len: char_offset, } } } -impl Source { +impl> Source { /// Get the length of the total number of characters in the source. pub fn len(&self) -> usize { self.len } /// Return an iterator over the characters in the source. - pub fn chars(&self) -> impl Iterator + '_ { self.text.chars() } + pub fn chars(&self) -> impl Iterator + '_ { self.text.as_ref().chars() } /// Get access to a specific, zero-indexed [`Line`]. pub fn line(&self, idx: usize) -> Option { self.lines.get(idx).copied() } @@ -172,18 +183,26 @@ impl Source { /// Get the source text for a line, excluding trailing whitespace. pub fn get_line_text(&self, line: Line) -> Option<&'_ str> { - self.text.get(line.byte_span()).map(|text| text.trim_end()) + self.text.as_ref().get(line.byte_span()).map(|text| text.trim_end()) } } -impl Cache<()> for Source { - fn fetch(&mut self, _: &()) -> Result<&Source, Box> { Ok(self) } +impl> Cache<()> for Source { + type Storage = I; + + fn fetch(&mut self, _: &()) -> Result<&Source, Box> { Ok(self) } fn display(&self, _: &()) -> Option> { None } } -impl Cache for (Id, Source) { - fn fetch(&mut self, id: &Id) -> Result<&Source, Box> { - if id == &self.0 { Ok(&self.1) } else { Err(Box::new(format!("Failed to fetch source '{}'", id))) } +impl, Id: fmt::Display + Eq> Cache for (Id, Source) { + type Storage = I; + + fn fetch(&mut self, id: &Id) -> Result<&Source, Box> { + if id == &self.0 { + Ok(&self.1) + } else { + Err(Box::new(format!("Failed to fetch source '{}'", id))) + } } fn display<'a>(&self, id: &'a Id) -> Option> { Some(Box::new(id)) } } @@ -191,14 +210,16 @@ impl Cache for (Id, Source) { /// A [`Cache`] that fetches [`Source`]s from the filesystem. #[derive(Default, Debug, Clone)] pub struct FileCache { - files: HashMap, + files: HashMap>, } impl Cache for FileCache { - fn fetch(&mut self, path: &Path) -> Result<&Source, Box> { + type Storage = String; + + fn fetch(&mut self, path: &Path) -> Result<&Source, Box> { Ok(match self.files.entry(path.to_path_buf()) { // TODO: Don't allocate here Entry::Occupied(entry) => entry.into_mut(), - Entry::Vacant(entry) => entry.insert(Source::from(&fs::read_to_string(path).map_err(|e| Box::new(e) as _)?)), + Entry::Vacant(entry) => entry.insert(Source::from(fs::read_to_string(path).map_err(|e| Box::new(e) as _)?)), }) } fn display<'a>(&self, path: &'a Path) -> Option> { Some(Box::new(path.display())) } @@ -206,12 +227,16 @@ impl Cache for FileCache { /// A [`Cache`] that fetches [`Source`]s using the provided function. #[derive(Debug, Clone)] -pub struct FnCache { - sources: HashMap, +pub struct FnCache + where I: AsRef +{ + sources: HashMap>, get: F, } -impl FnCache { +impl FnCache + where I: AsRef +{ /// Create a new [`FnCache`] with the given fetch function. pub fn new(get: F) -> Self { Self { @@ -221,7 +246,7 @@ impl FnCache { } /// Pre-insert a selection of [`Source`]s into this cache. - pub fn with_sources(mut self, sources: HashMap) -> Self + pub fn with_sources(mut self, sources: HashMap>) -> Self where Id: Eq + Hash { self.sources.reserve(sources.len()); @@ -232,10 +257,13 @@ impl FnCache { } } -impl Cache for FnCache - where F: for<'a> FnMut(&'a Id) -> Result> +impl Cache for FnCache + where I: AsRef, + F: for<'a> FnMut(&'a Id) -> Result>, { - fn fetch(&mut self, id: &Id) -> Result<&Source, Box> { + type Storage = I; + + fn fetch(&mut self, id: &Id) -> Result<&Source, Box> { Ok(match self.sources.entry(id.clone()) { Entry::Occupied(entry) => entry.into_mut(), Entry::Vacant(entry) => entry.insert(Source::from((self.get)(id)?)), @@ -246,25 +274,25 @@ impl Cache for FnCache< /// Create a [`Cache`] from a collection of ID/strings, where each corresponds to a [`Source`]. pub fn sources(iter: I) -> impl Cache -where - Id: fmt::Display + Hash + PartialEq + Eq + Clone + 'static, - I: IntoIterator, - S: AsRef, + where Id: fmt::Display + Hash + PartialEq + Eq + Clone + 'static, + I: IntoIterator, + S: AsRef, { FnCache::new((move |id| Err(Box::new(format!("Failed to fetch source '{}'", id)) as _)) as fn(&_) -> _) .with_sources(iter .into_iter() - .map(|(id, s)| (id, Source::from(s.as_ref()))) + .map(|(id, s)| (id, Source::from(s))) .collect()) } #[cfg(test)] mod tests { use std::iter::zip; + use std::sync::Arc; use super::Source; - fn test_from(lines: Vec<&str>) { + fn test_with_lines(lines: Vec<&str>) { let source: String = lines.iter().map(|s| *s).collect(); let source = Source::from(source); @@ -272,7 +300,6 @@ mod tests { let mut offset = 0; for (source_line, raw_line) in zip(source.lines.iter().copied(), lines.into_iter()) { - // dbg!(source_line, &raw_line[source_line.byte_span()]); assert_eq!(source_line.offset, offset); assert_eq!(source_line.char_len, raw_line.chars().count()); assert_eq!( @@ -287,32 +314,32 @@ mod tests { #[test] fn source_from_empty() { - test_from(vec![]); // Empty string + test_with_lines(vec![]); // Empty string } #[test] fn source_from_single() { - test_from(vec!["Single line"]); - test_from(vec!["Single line with LF\n"]); - test_from(vec!["Single line with CRLF\r\n"]); + test_with_lines(vec!["Single line"]); + test_with_lines(vec!["Single line with LF\n"]); + test_with_lines(vec!["Single line with CRLF\r\n"]); } #[test] fn source_from_multi() { - test_from(vec!["Two\r\n", "lines\n"]); - test_from(vec!["Some\n", "more\r\n", "lines"]); - test_from(vec!["\n", "\r\n", "\n", "Empty Lines"]); + test_with_lines(vec!["Two\r\n", "lines\n"]); + test_with_lines(vec!["Some\n", "more\r\n", "lines"]); + test_with_lines(vec!["\n", "\r\n", "\n", "Empty Lines"]); } #[test] fn source_from_trims_trailing_spaces() { - test_from(vec!["Trailing spaces \n", "are trimmed\t"]); + test_with_lines(vec!["Trailing spaces \n", "are trimmed\t"]); } #[test] fn source_from_alternate_line_endings() { // Line endings other than LF or CRLF - test_from(vec![ + test_with_lines(vec![ "CR\r", "VT\x0B", "FF\x0C", @@ -321,4 +348,44 @@ mod tests { "PS\u{2029}", ]); } + + #[test] + fn source_from_other_string_types() { + let raw = r#"A raw string + with multiple + lines behind + an Arc"#; + let arc = Arc::from(raw); + let source = Source::from(arc); + + assert_eq!(source.lines.len(), 4); + + let mut offset = 0; + for (source_line, raw_line) in zip(source.lines.iter().copied(), raw.split_inclusive('\n')) + { + assert_eq!(source_line.offset, offset); + assert_eq!(source_line.char_len, raw_line.chars().count()); + assert_eq!( + source.get_line_text(source_line).unwrap(), + raw_line.trim_end() + ); + offset += source_line.char_len; + } + + assert_eq!(source.len, offset); + } + + #[test] + fn source_from_reference() { + let raw = r#"A raw string + with multiple + lines"#; + + fn non_owning_source<'a>(input: &'a str) -> Source<&'a str> { + Source::from(input) + } + + let source = non_owning_source(raw); + assert_eq!(source.lines.len(), 3); + } } From 1996a0f8ef05f6251f49418543484e0bcbbcd643 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9e=20Kooi?= Date: Fri, 29 Sep 2023 13:15:29 +0200 Subject: [PATCH 3/3] Default Source storage type to String --- src/source.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/source.rs b/src/source.rs index f7c575f..43982cb 100644 --- a/src/source.rs +++ b/src/source.rs @@ -68,7 +68,7 @@ impl Line { /// /// In most cases, a source is a single input file. #[derive(Clone, Debug, Hash, PartialEq, Eq)] -pub struct Source> { +pub struct Source = String> { text: I, lines: Vec, len: usize, @@ -210,13 +210,13 @@ impl, Id: fmt::Display + Eq> Cache for (Id, Source) { /// A [`Cache`] that fetches [`Source`]s from the filesystem. #[derive(Default, Debug, Clone)] pub struct FileCache { - files: HashMap>, + files: HashMap, } impl Cache for FileCache { type Storage = String; - fn fetch(&mut self, path: &Path) -> Result<&Source, Box> { + fn fetch(&mut self, path: &Path) -> Result<&Source, Box> { Ok(match self.files.entry(path.to_path_buf()) { // TODO: Don't allocate here Entry::Occupied(entry) => entry.into_mut(), Entry::Vacant(entry) => entry.insert(Source::from(fs::read_to_string(path).map_err(|e| Box::new(e) as _)?)),