Skip to content

Commit

Permalink
Improve fuzzy match performance and fix corner case that omits results
Browse files Browse the repository at this point in the history
* Removes `max_results` from the matcher interface as this is better
dealt with in consumers once all results are known. The current
implementation was quite inefficient as it was using binary search to
find insertion points and then doing an insert which copies the entire
suffix each time.

* There was a corner case where if the binary search found a match
candidate with the same score, it was dropped. Now fixed.

* Uses of `util::extend_sorted` when merging results from worker
threads also repeatedly uses binary search and insertion which copies
the entire suffix. A followup will remove that and its usage.

* Adds `util::truncate_to_bottom_n_sorted_by` which uses quickselect +
sort to efficiently get a sorted count limited result.

* Improves interface of Matcher::match_candidates by providing the
match positions to the build function. This allows for removal of the
`Match` trait. It also fixes a bug where the Match's own Ord wasn't
being used, which seems relevant to PathMatch for cases where scores
are the same.
  • Loading branch information
mgsloan committed Dec 31, 2024
1 parent f912c54 commit d620d34
Show file tree
Hide file tree
Showing 6 changed files with 52 additions and 96 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions crates/fuzzy/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ path = "src/fuzzy.rs"
doctest = false

[dependencies]
futures.workspace = true
gpui.workspace = true
util.workspace = true
log.workspace = true
41 changes: 11 additions & 30 deletions crates/fuzzy/src/matcher.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,19 +14,13 @@ pub struct Matcher<'a> {
lowercase_query: &'a [char],
query_char_bag: CharBag,
smart_case: bool,
max_results: usize,
min_score: f64,
match_positions: Vec<usize>,
last_positions: Vec<usize>,
score_matrix: Vec<Option<f64>>,
best_position_matrix: Vec<usize>,
}

pub trait Match: Ord {
fn score(&self) -> f64;
fn set_positions(&mut self, positions: Vec<usize>);
}

pub trait MatchCandidate {
fn has_chars(&self, bag: CharBag) -> bool;
fn to_string(&self) -> Cow<'_, str>;
Expand All @@ -38,7 +32,6 @@ impl<'a> Matcher<'a> {
lowercase_query: &'a [char],
query_char_bag: CharBag,
smart_case: bool,
max_results: usize,
) -> Self {
Self {
query,
Expand All @@ -50,10 +43,11 @@ impl<'a> Matcher<'a> {
score_matrix: Vec::new(),
best_position_matrix: Vec::new(),
smart_case,
max_results,
}
}

/// Filter and score fuzzy match candidates. Results are returned unsorted, in the same order as
/// the input candidates.
pub fn match_candidates<C: MatchCandidate, R, F>(
&mut self,
prefix: &[char],
Expand All @@ -63,8 +57,7 @@ impl<'a> Matcher<'a> {
cancel_flag: &AtomicBool,
build_match: F,
) where
R: Match,
F: Fn(&C, f64) -> R,
F: Fn(&C, f64, &Vec<usize>) -> R,
{
let mut candidate_chars = Vec::new();
let mut lowercase_candidate_chars = Vec::new();
Expand Down Expand Up @@ -103,20 +96,7 @@ impl<'a> Matcher<'a> {
);

if score > 0.0 {
let mut mat = build_match(&candidate, score);
if let Err(i) = results.binary_search_by(|m| mat.cmp(m)) {
if results.len() < self.max_results {
mat.set_positions(self.match_positions.clone());
results.insert(i, mat);
} else if i < results.len() {
results.pop();
mat.set_positions(self.match_positions.clone());
results.insert(i, mat);
}
if results.len() == self.max_results {
self.min_score = results.last().unwrap().score();
}
}
results.push(build_match(&candidate, score, &self.match_positions));
}
}
}
Expand Down Expand Up @@ -325,18 +305,18 @@ mod tests {
#[test]
fn test_get_last_positions() {
let mut query: &[char] = &['d', 'c'];
let mut matcher = Matcher::new(query, query, query.into(), false, 10);
let mut matcher = Matcher::new(query, query, query.into(), false);
let result = matcher.find_last_positions(&['a', 'b', 'c'], &['b', 'd', 'e', 'f']);
assert!(!result);

query = &['c', 'd'];
let mut matcher = Matcher::new(query, query, query.into(), false, 10);
let mut matcher = Matcher::new(query, query, query.into(), false);
let result = matcher.find_last_positions(&['a', 'b', 'c'], &['b', 'd', 'e', 'f']);
assert!(result);
assert_eq!(matcher.last_positions, vec![2, 4]);

query = &['z', '/', 'z', 'f'];
let mut matcher = Matcher::new(query, query, query.into(), false, 10);
let mut matcher = Matcher::new(query, query, query.into(), false);
let result = matcher.find_last_positions(&['z', 'e', 'd', '/'], &['z', 'e', 'd', '/', 'f']);
assert!(result);
assert_eq!(matcher.last_positions, vec![0, 3, 4, 8]);
Expand Down Expand Up @@ -451,7 +431,7 @@ mod tests {
});
}

let mut matcher = Matcher::new(&query, &lowercase_query, query_chars, smart_case, 100);
let mut matcher = Matcher::new(&query, &lowercase_query, query_chars, smart_case);

let cancel_flag = AtomicBool::new(false);
let mut results = Vec::new();
Expand All @@ -462,16 +442,17 @@ mod tests {
path_entries.into_iter(),
&mut results,
&cancel_flag,
|candidate, score| PathMatch {
|candidate, score, positions| PathMatch {
score,
worktree_id: 0,
positions: Vec::new(),
positions: positions.clone(),
path: Arc::from(candidate.path),
path_prefix: "".into(),
distance_to_relative_ancestor: usize::MAX,
is_dir: false,
},
);
results.sort_by(|a, b| b.cmp(a));

results
.into_iter()
Expand Down
48 changes: 11 additions & 37 deletions crates/fuzzy/src/paths.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use std::{
};

use crate::{
matcher::{Match, MatchCandidate, Matcher},
matcher::{MatchCandidate, Matcher},
CharBag,
};

Expand Down Expand Up @@ -42,16 +42,6 @@ pub trait PathMatchCandidateSet<'a>: Send + Sync {
fn candidates(&'a self, start: usize) -> Self::Candidates;
}

impl Match for PathMatch {
fn score(&self) -> f64 {
self.score
}

fn set_positions(&mut self, positions: Vec<usize>) {
self.positions = positions;
}
}

impl<'a> MatchCandidate for PathMatchCandidate<'a> {
fn has_chars(&self, bag: CharBag) -> bool {
self.char_bag.is_superset(bag)
Expand Down Expand Up @@ -102,13 +92,7 @@ pub fn match_fixed_path_set(
let query = query.chars().collect::<Vec<_>>();
let query_char_bag = CharBag::from(&lowercase_query[..]);

let mut matcher = Matcher::new(
&query,
&lowercase_query,
query_char_bag,
smart_case,
max_results,
);
let mut matcher = Matcher::new(&query, &lowercase_query, query_char_bag, smart_case);

let mut results = Vec::new();
matcher.match_candidates(
Expand All @@ -117,16 +101,17 @@ pub fn match_fixed_path_set(
candidates.into_iter(),
&mut results,
&AtomicBool::new(false),
|candidate, score| PathMatch {
|candidate, score, positions| PathMatch {
score,
worktree_id,
positions: Vec::new(),
positions: positions.clone(),
is_dir: candidate.is_dir,
path: Arc::from(candidate.path),
path_prefix: Arc::default(),
distance_to_relative_ancestor: usize::MAX,
},
);
util::truncate_to_bottom_n_sorted_by(&mut results, max_results, &|a, b| b.cmp(a));
results
}

Expand Down Expand Up @@ -164,13 +149,8 @@ pub async fn match_path_sets<'a, Set: PathMatchCandidateSet<'a>>(
scope.spawn(async move {
let segment_start = segment_idx * segment_size;
let segment_end = segment_start + segment_size;
let mut matcher = Matcher::new(
query,
lowercase_query,
query_char_bag,
smart_case,
max_results,
);
let mut matcher =
Matcher::new(query, lowercase_query, query_char_bag, smart_case);

let mut tree_start = 0;
for candidate_set in candidate_sets {
Expand All @@ -193,10 +173,10 @@ pub async fn match_path_sets<'a, Set: PathMatchCandidateSet<'a>>(
candidates,
results,
cancel_flag,
|candidate, score| PathMatch {
|candidate, score, positions| PathMatch {
score,
worktree_id,
positions: Vec::new(),
positions: positions.clone(),
path: Arc::from(candidate.path),
is_dir: candidate.is_dir,
path_prefix: candidate_set.prefix(),
Expand All @@ -222,14 +202,8 @@ pub async fn match_path_sets<'a, Set: PathMatchCandidateSet<'a>>(
})
.await;

let mut results = Vec::new();
for segment_result in segment_results {
if results.is_empty() {
results = segment_result;
} else {
util::extend_sorted(&mut results, segment_result, max_results, |a, b| b.cmp(a));
}
}
let mut results = segment_results.concat();
util::truncate_to_bottom_n_sorted_by(&mut results, max_results, &|a, b| b.cmp(a));
results
}

Expand Down
35 changes: 7 additions & 28 deletions crates/fuzzy/src/strings.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use crate::{
matcher::{Match, MatchCandidate, Matcher},
matcher::{MatchCandidate, Matcher},
CharBag,
};
use gpui::BackgroundExecutor;
Expand Down Expand Up @@ -46,16 +46,6 @@ pub struct StringMatch {
pub string: String,
}

impl Match for StringMatch {
fn score(&self) -> f64 {
self.score
}

fn set_positions(&mut self, positions: Vec<usize>) {
self.positions = positions;
}
}

impl StringMatch {
pub fn ranges(&self) -> impl '_ + Iterator<Item = Range<usize>> {
let mut positions = self.positions.iter().peekable();
Expand Down Expand Up @@ -167,24 +157,19 @@ pub async fn match_strings(
scope.spawn(async move {
let segment_start = cmp::min(segment_idx * segment_size, candidates.len());
let segment_end = cmp::min(segment_start + segment_size, candidates.len());
let mut matcher = Matcher::new(
query,
lowercase_query,
query_char_bag,
smart_case,
max_results,
);
let mut matcher =
Matcher::new(query, lowercase_query, query_char_bag, smart_case);

matcher.match_candidates(
&[],
&[],
candidates[segment_start..segment_end].iter(),
results,
cancel_flag,
|candidate, score| StringMatch {
|candidate, score, positions| StringMatch {
candidate_id: candidate.id,
score,
positions: Vec::new(),
positions: positions.clone(),
string: candidate.string.to_string(),
},
);
Expand All @@ -193,13 +178,7 @@ pub async fn match_strings(
})
.await;

let mut results = Vec::new();
for segment_result in segment_results {
if results.is_empty() {
results = segment_result;
} else {
util::extend_sorted(&mut results, segment_result, max_results, |a, b| b.cmp(a));
}
}
let mut results = segment_results.concat();
util::truncate_to_bottom_n_sorted_by(&mut results, max_results, &|a, b| b.cmp(a));
results
}
22 changes: 21 additions & 1 deletion crates/util/src/util.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ pub mod test;

use anyhow::{anyhow, Context as _, Result};
use futures::Future;

use itertools::Either;
use regex::Regex;
use std::sync::{LazyLock, OnceLock};
Expand Down Expand Up @@ -111,6 +110,27 @@ where
}
}

pub fn truncate_to_bottom_n_sorted_by<T, F>(items: &mut Vec<T>, limit: usize, compare: &F)
where
F: Fn(&T, &T) -> Ordering,
{
if limit == 0 {
items.truncate(0);
}
if items.len() < limit {
return;
}
// When limit is near to items.len() it may be more efficient to sort the whole list and
// truncate, rather than always doing selection first as is done below. It's hard to analyze
// where the threshold for this should be since the quickselect style algorithm used by
// `select_nth_unstable_by` makes the prefix partially sorted, and so its work is not wasted -
// the expected number of comparisons needed by `sort_by` is less than it is for some arbitrary
// unsorted input.
items.select_nth_unstable_by(limit, compare);
items.truncate(limit);
items.sort_by(compare);
}

#[cfg(unix)]
pub fn load_shell_from_passwd() -> Result<()> {
let buflen = match unsafe { libc::sysconf(libc::_SC_GETPW_R_SIZE_MAX) } {
Expand Down

0 comments on commit d620d34

Please sign in to comment.