Skip to content

Commit

Permalink
diff: extract narrowed view type from DiffSource, add type-safe local index
Browse files Browse the repository at this point in the history

I'm going to add a Vec of precomputed hashes, and the Vec will be owned by
DiffSource.
  • Loading branch information
yuja committed Nov 19, 2024
1 parent ea2a9c3 commit 66b61c0
Showing 1 changed file with 51 additions and 32 deletions.
83 changes: 51 additions & 32 deletions lib/src/diff.rs
Original file line number Diff line number Diff line change
Expand Up @@ -219,40 +219,59 @@ impl<C: CompareBytes, S: BuildHasher> WordComparator<C, S> {
}
}

/// Index in a list of word (or token) ranges.
/// Index in a list of word (or token) ranges in `DiffSource`.
#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
struct WordPosition(usize);

/// Index in a list of word (or token) ranges in `LocalDiffSource`.
///
/// A local position is relative to the (possibly narrowed) `ranges` slice of
/// a `LocalDiffSource`. Use `LocalDiffSource::map_to_global()` to convert it
/// to a `WordPosition`.
#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
struct LocalWordPosition(usize);

#[derive(Clone, Debug)]
struct DiffSource<'input, 'aux> {
text: &'input BStr,
ranges: &'aux [Range<usize>],
/// The number of preceding word ranges excluded from the self `ranges`.
global_offset: WordPosition,
}

impl<'input, 'aux> DiffSource<'input, 'aux> {
fn new<T: AsRef<[u8]> + ?Sized>(text: &'input T, ranges: &'aux [Range<usize>]) -> Self {
DiffSource {
text: BStr::new(text),
ranges,
global_offset: WordPosition(0),
}
}

fn narrowed(&self, positions: Range<WordPosition>) -> Self {
DiffSource {
fn local(&self) -> LocalDiffSource<'input, '_> {
LocalDiffSource {
text: self.text,
ranges: &self.ranges[positions.start.0..positions.end.0],
global_offset: self.map_to_global(positions.start),
ranges: self.ranges,
global_offset: WordPosition(0),
}
}

/// Returns a copy of the word range stored at `position`.
///
/// Panics if `position` is out of bounds of `ranges`.
fn range_at(&self, position: WordPosition) -> Range<usize> {
self.ranges[position.0].clone()
}
}

/// View over a `DiffSource` that can be narrowed to a sub-slice of its word
/// ranges.
///
/// `DiffSource::local()` creates a view covering all ranges with a zero
/// offset. `narrowed()` then produces views over a sub-slice while recording
/// how many leading ranges were dropped, so that local positions can still be
/// mapped back with `map_to_global()`.
#[derive(Clone, Debug)]
struct LocalDiffSource<'input, 'aux> {
/// Source text, shared with the `DiffSource` this view was created from.
text: &'input BStr,
/// Word (or token) ranges visible through this view.
ranges: &'aux [Range<usize>],
/// The number of preceding word ranges excluded from the self `ranges`.
global_offset: WordPosition,
}

impl LocalDiffSource<'_, '_> {
/// Returns a view restricted to the word ranges in `positions`.
///
/// `positions` is a half-open range of positions local to this view. The
/// returned view's `global_offset` is advanced by `positions.start`, so
/// `map_to_global()` on the narrowed view yields the same `WordPosition`
/// that the corresponding word has in this view.
///
/// Panics if `positions` is out of bounds of `ranges` or if its start is
/// greater than its end.
fn narrowed(&self, positions: Range<LocalWordPosition>) -> Self {
LocalDiffSource {
text: self.text,
ranges: &self.ranges[positions.start.0..positions.end.0],
// Accumulate the offset so nested narrowing keeps mapping correctly.
global_offset: self.map_to_global(positions.start),
}
}

fn map_to_global(&self, position: WordPosition) -> WordPosition {
/// Converts a position local to this view into a `WordPosition` by adding
/// the number of word ranges excluded before this view (`global_offset`).
fn map_to_global(&self, position: LocalWordPosition) -> WordPosition {
WordPosition(self.global_offset.0 + position.0)
}
}
Expand All @@ -261,11 +280,11 @@ struct Histogram<'input> {
word_to_positions: HashTable<HistogramEntry<'input>>,
}

type HistogramEntry<'input> = (HashedWord<'input>, Vec<WordPosition>);
type HistogramEntry<'input> = (HashedWord<'input>, Vec<LocalWordPosition>);

impl<'input> Histogram<'input> {
fn calculate<C: CompareBytes, S: BuildHasher>(
source: &DiffSource<'input, '_>,
source: &LocalDiffSource<'input, '_>,
comp: &WordComparator<C, S>,
max_occurrences: usize,
) -> Self {
Expand All @@ -285,7 +304,7 @@ impl<'input> Histogram<'input> {
// Allow one more than max_occurrences, so we can later skip those with more
// than max_occurrences
if positions.len() <= max_occurrences {
positions.push(WordPosition(i));
positions.push(LocalWordPosition(i));
}
}
Histogram { word_to_positions }
Expand All @@ -305,7 +324,7 @@ impl<'input> Histogram<'input> {
&self,
word: HashedWord<'input>,
comp: &WordComparator<C, S>,
) -> Option<&[WordPosition]> {
) -> Option<&[LocalWordPosition]> {
let (_, positions) = self
.word_to_positions
.find(word.hash, |(w, _)| comp.eq(w.text, word.text))?;
Expand Down Expand Up @@ -373,8 +392,8 @@ fn find_lcs(input: &[usize]) -> Vec<(usize, usize)> {
/// arguments. The data between those words is ignored.
fn collect_unchanged_words<C: CompareBytes, S: BuildHasher>(
found_positions: &mut Vec<(WordPosition, WordPosition)>,
left: &DiffSource,
right: &DiffSource,
left: &LocalDiffSource,
right: &LocalDiffSource,
comp: &WordComparator<C, S>,
) {
if left.ranges.is_empty() || right.ranges.is_empty() {
Expand Down Expand Up @@ -403,23 +422,23 @@ fn collect_unchanged_words<C: CompareBytes, S: BuildHasher>(
found_positions.extend(itertools::chain(
(0..common_leading_len).map(|i| {
(
left.map_to_global(WordPosition(i)),
right.map_to_global(WordPosition(i)),
left.map_to_global(LocalWordPosition(i)),
right.map_to_global(LocalWordPosition(i)),
)
}),
(1..=common_trailing_len).rev().map(|i| {
(
left.map_to_global(WordPosition(left.ranges.len() - i)),
right.map_to_global(WordPosition(right.ranges.len() - i)),
left.map_to_global(LocalWordPosition(left.ranges.len() - i)),
right.map_to_global(LocalWordPosition(right.ranges.len() - i)),
)
}),
));
}

fn collect_unchanged_words_lcs<C: CompareBytes, S: BuildHasher>(
found_positions: &mut Vec<(WordPosition, WordPosition)>,
left: &DiffSource,
right: &DiffSource,
left: &LocalDiffSource,
right: &LocalDiffSource,
comp: &WordComparator<C, S>,
) {
let max_occurrences = 100;
Expand Down Expand Up @@ -473,8 +492,8 @@ fn collect_unchanged_words_lcs<C: CompareBytes, S: BuildHasher>(

// Produce output word positions, recursing into the modified areas between
// the elements in the LCS.
let mut previous_left_position = WordPosition(0);
let mut previous_right_position = WordPosition(0);
let mut previous_left_position = LocalWordPosition(0);
let mut previous_right_position = LocalWordPosition(0);
for (left_index, right_index) in lcs {
let (left_position, _) = left_positions[left_index];
let (right_position, _) = right_positions[right_index];
Expand All @@ -488,14 +507,14 @@ fn collect_unchanged_words_lcs<C: CompareBytes, S: BuildHasher>(
left.map_to_global(left_position),
right.map_to_global(right_position),
));
previous_left_position = WordPosition(left_position.0 + 1);
previous_right_position = WordPosition(right_position.0 + 1);
previous_left_position = LocalWordPosition(left_position.0 + 1);
previous_right_position = LocalWordPosition(right_position.0 + 1);
}
// Also recurse into range at end (after common ranges).
collect_unchanged_words(
found_positions,
&left.narrowed(previous_left_position..WordPosition(left.ranges.len())),
&right.narrowed(previous_right_position..WordPosition(right.ranges.len())),
&left.narrowed(previous_left_position..LocalWordPosition(left.ranges.len())),
&right.narrowed(previous_right_position..LocalWordPosition(right.ranges.len())),
comp,
);
}
Expand Down Expand Up @@ -630,8 +649,8 @@ impl<'input> Diff<'input> {
let mut first_positions = Vec::new();
collect_unchanged_words(
&mut first_positions,
&base_source,
first_other_source,
&base_source.local(),
&first_other_source.local(),
&comp,
);
if tail_other_sources.is_empty() {
Expand All @@ -656,8 +675,8 @@ impl<'input> Diff<'input> {
let mut new_positions = Vec::new();
collect_unchanged_words(
&mut new_positions,
&base_source,
other_source,
&base_source.local(),
&other_source.local(),
&comp,
);
intersect_unchanged_words(current_positions, &new_positions)
Expand Down Expand Up @@ -1148,7 +1167,7 @@ mod tests {
) -> Vec<(Range<usize>, Range<usize>)> {
let comp = WordComparator::new(CompareBytesExactly);
let mut positions = Vec::new();
collect_unchanged_words(&mut positions, left, right, &comp);
collect_unchanged_words(&mut positions, &left.local(), &right.local(), &comp);
positions
.into_iter()
.map(|(left_pos, right_pos)| (left.range_at(left_pos), right.range_at(right_pos)))
Expand Down

0 comments on commit 66b61c0

Please sign in to comment.