From 7b8415c921513db439044c01b52f7099609a537d Mon Sep 17 00:00:00 2001 From: dploch Date: Thu, 16 May 2024 10:27:24 -0400 Subject: [PATCH 1/2] union_find: implement a library for the Union-Find algorithm --- lib/src/lib.rs | 1 + lib/src/union_find.rs | 157 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 158 insertions(+) create mode 100644 lib/src/union_find.rs diff --git a/lib/src/lib.rs b/lib/src/lib.rs index 04adfb7d64..e96d1d1788 100644 --- a/lib/src/lib.rs +++ b/lib/src/lib.rs @@ -83,6 +83,7 @@ pub mod submodule_store; pub mod transaction; pub mod tree; pub mod tree_builder; +pub mod union_find; pub mod view; pub mod working_copy; pub mod workspace; diff --git a/lib/src/union_find.rs b/lib/src/union_find.rs new file mode 100644 index 0000000000..aca086752b --- /dev/null +++ b/lib/src/union_find.rs @@ -0,0 +1,157 @@ +// Copyright 2024 The Jujutsu Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! This module implements a UnionFind type which can be used to +//! efficiently calculate disjoint sets for any data type. + +use std::collections::HashMap; +use std::hash::Hash; + +#[derive(Clone, Copy)] +struct Node { + root: T, + size: u32, +} + +/// Implementation of the union-find algorithm: +/// https://en.wikipedia.org/wiki/Disjoint-set_data_structure +/// +/// Joins disjoint sets by size to amortize cost. +#[derive(Clone)] +pub struct UnionFind { + roots: HashMap>, +} + +impl Default for UnionFind +where + T: Copy + Eq + Hash, +{ + fn default() -> Self { + Self::new() + } +} + +impl UnionFind +where + T: Copy + Eq + Hash, +{ + /// Creates a new empty UnionFind data structure. + pub fn new() -> Self { + Self { + roots: HashMap::new(), + } + } + + /// Returns the root identifying the union this item is a part of. + pub fn find(&mut self, item: T) -> T { + self.find_node(item).root + } + + fn find_node(&mut self, item: T) -> Node { + match self.roots.get(&item) { + Some(node) => { + if node.root != item { + let new_root = self.find_node(node.root); + self.roots.insert(item, new_root); + new_root + } else { + *node + } + } + None => { + let node = Node:: { + root: item, + size: 1, + }; + self.roots.insert(item, node); + node + } + } + } + + /// Unions the disjoint sets connected to `a` and `b`. + pub fn union(&mut self, a: T, b: T) { + let a = self.find_node(a); + let b = self.find_node(b); + if a.root == b.root { + return; + } + + let new_node = Node:: { + root: if a.size < b.size { b.root } else { a.root }, + size: a.size + b.size, + }; + self.roots.insert(a.root, new_node); + self.roots.insert(b.root, new_node); + } +} + +#[cfg(test)] +mod tests { + use itertools::Itertools; + + use super::*; + + #[test] + fn test_basic() { + let mut union_find = UnionFind::::new(); + + // Everything starts as a singleton. + assert_eq!(union_find.find(1), 1); + assert_eq!(union_find.find(2), 2); + assert_eq!(union_find.find(3), 3); + + // Make two pair sets. This implicitly adds node 4. + union_find.union(1, 2); + union_find.union(3, 4); + assert_eq!(union_find.find(1), union_find.find(2)); + assert_eq!(union_find.find(3), union_find.find(4)); + assert_ne!(union_find.find(1), union_find.find(3)); + + // Unioning the pairs gives everything the same root. + union_find.union(1, 3); + assert!([ + union_find.find(1), + union_find.find(2), + union_find.find(3), + union_find.find(4), + ] + .iter() + .all_equal()); + } + + #[test] + fn test_union_by_size() { + let mut union_find = UnionFind::::new(); + + // Create a set of 3 and a set of 2. + union_find.union(1, 2); + union_find.union(2, 3); + union_find.union(4, 5); + let set3 = union_find.find(1); + let set2 = union_find.find(4); + assert_ne!(set3, set2); + + // Merging them always chooses the larger set. + let mut large_first = union_find.clone(); + large_first.union(1, 4); + assert_eq!(large_first.find(1), set3); + assert_eq!(large_first.find(4), set3); + + let mut small_first = union_find.clone(); + small_first.union(4, 1); + assert_eq!(small_first.find(1), set3); + assert_eq!(small_first.find(4), set3); + } +} From 704d03842591fdc21d6596d9a71d0cc7072d398a Mon Sep 17 00:00:00 2001 From: dploch Date: Tue, 14 May 2024 17:01:28 -0400 Subject: [PATCH 2/2] revset: implement a 'reachable(src, domain)' expression This revset correctly implements "reachability" from a set of source commits following both parent and child edges as far as they can go within a domain set. This type of 'bfs' query is currently impossible to express with existing revset functions. --- CHANGELOG.md | 3 + cli/tests/test_revset_output.rs | 4 +- docs/revsets.md | 3 + lib/src/default_index/revset_engine.rs | 33 +++++- lib/src/revset.rs | 37 ++++++ lib/tests/test_revset.rs | 158 +++++++++++++++++++++++++ 6 files changed, 235 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e0d2cbca44..59133d9a4a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -111,6 +111,9 @@ to avoid letting the user edit the immutable one. * `jj rebase -r` now accepts `--insert-after` and `--insert-before` options to customize the location of the rebased revisions. +* A new revset `reahable(srcs, domain)` will return all commits that are + reachable from `srcs` within `domain`. + ### Fixed bugs * Revsets now support `\`-escapes in string literal. diff --git a/cli/tests/test_revset_output.rs b/cli/tests/test_revset_output.rs index fde0fdc24c..ee49eacbfb 100644 --- a/cli/tests/test_revset_output.rs +++ b/cli/tests/test_revset_output.rs @@ -278,7 +278,7 @@ fn test_function_name_hint() { | ^----^ | = Function "branch" doesn't exist - Hint: Did you mean "branches"? + Hint: Did you mean "branches", "reachable"? "###); // Both builtin function and function alias should be suggested @@ -308,7 +308,7 @@ fn test_function_name_hint() { | ^----^ | = Function "branch" doesn't exist - Hint: Did you mean "branches"? + Hint: Did you mean "branches", "reachable"? "###); } diff --git a/docs/revsets.md b/docs/revsets.md index 59f0614113..d566b368d8 100644 --- a/docs/revsets.md +++ b/docs/revsets.md @@ -87,6 +87,9 @@ revsets (expressions) as arguments. * `descendants(x)`: Same as `x::`. +* `reachable(srcs, domain)`: All commits reachable from `srcs` within + `domain`, traversing all parent and child edges. + * `connected(x)`: Same as `x::x`. Useful when `x` includes several commits. * `all()`: All visible commits in the repo. diff --git a/lib/src/default_index/revset_engine.rs b/lib/src/default_index/revset_engine.rs index 41140ad8ca..b9e18e6532 100644 --- a/lib/src/default_index/revset_engine.rs +++ b/lib/src/default_index/revset_engine.rs @@ -35,8 +35,8 @@ use crate::revset::{ RevsetFilterExtensionWrapper, RevsetFilterPredicate, GENERATION_RANGE_FULL, }; use crate::revset_graph::RevsetGraphEdge; -use crate::rewrite; use crate::store::Store; +use crate::{rewrite, union_find}; type BoxedPredicateFn<'a> = Box bool + 'a>; pub(super) type BoxedRevWalk<'a> = Box + 'a>; @@ -829,6 +829,37 @@ impl<'index> EvaluationContext<'index> { Ok(Box::new(EagerRevset { positions })) } } + ResolvedExpression::Reachable { sources, domain } => { + let mut sets = union_find::UnionFind::::new(); + + // Compute all reachable subgraphs. + let domain_revset = self.evaluate(domain)?; + let domain_vec = domain_revset.positions().attach(index).collect_vec(); + let domain_set: HashSet<_> = domain_vec.iter().copied().collect(); + for pos in &domain_set { + for parent_pos in index.entry_by_pos(*pos).parent_positions() { + if domain_set.contains(&parent_pos) { + sets.union(*pos, parent_pos); + } + } + } + + // Identify disjoint sets reachable from sources. + let set_reps: HashSet<_> = intersection_by( + self.evaluate(sources)?.positions(), + EagerRevWalk::new(domain_vec.iter().copied()), + |pos1, pos2| pos1.cmp(pos2).reverse(), + ) + .attach(index) + .map(|pos| sets.find(pos)) + .collect(); + + let positions = domain_vec + .into_iter() + .filter(|pos| set_reps.contains(&sets.find(*pos))) + .collect_vec(); + Ok(Box::new(EagerRevset { positions })) + } ResolvedExpression::Heads(candidates) => { let candidate_set = self.evaluate(candidates)?; let head_positions: BTreeSet<_> = diff --git a/lib/src/revset.rs b/lib/src/revset.rs index 80611356ce..cb06cc5755 100644 --- a/lib/src/revset.rs +++ b/lib/src/revset.rs @@ -190,6 +190,11 @@ pub enum RevsetExpression { heads: Rc, // TODO: maybe add generation_from_roots/heads? }, + // Commits reachable from "sources" within "domain" + Reachable { + sources: Rc, + domain: Rc, + }, Heads(Rc), Roots(Rc), Latest { @@ -379,6 +384,18 @@ impl RevsetExpression { self.dag_range_to(self) } + /// All commits within `domain` reachable from this set of commits, by + /// traversing either parent or child edges. + pub fn reachable( + self: &Rc, + domain: &Rc, + ) -> Rc { + Rc::new(RevsetExpression::Reachable { + sources: self.clone(), + domain: domain.clone(), + }) + } + /// Commits reachable from `heads` but not from `self`. pub fn range( self: &Rc, @@ -507,6 +524,11 @@ pub enum ResolvedExpression { heads: Box, generation_from_roots: Range, }, + /// Commits reachable from `sources` within `domain`. + Reachable { + sources: Box, + domain: Box, + }, Heads(Box), Roots(Box), Latest { @@ -635,6 +657,12 @@ static BUILTIN_FUNCTION_MAP: Lazy> = Lazy: let candidates = parse_expression_rule(arg.into_inner(), state)?; Ok(candidates.connected()) }); + map.insert("reachable", |name, arguments_pair, state| { + let ([source_arg, domain_arg], []) = expect_arguments(name, arguments_pair)?; + let sources = parse_expression_rule(source_arg.into_inner(), state)?; + let domain = parse_expression_rule(domain_arg.into_inner(), state)?; + Ok(sources.reachable(&domain)) + }); map.insert("none", |name, arguments_pair, _state| { expect_no_arguments(name, arguments_pair)?; Ok(RevsetExpression::none()) @@ -960,6 +988,10 @@ fn try_transform_expression( transform_rec_pair((roots, heads), pre, post)? .map(|(roots, heads)| RevsetExpression::DagRange { roots, heads }) } + RevsetExpression::Reachable { sources, domain } => { + transform_rec_pair((sources, domain), pre, post)? + .map(|(sources, domain)| RevsetExpression::Reachable { sources, domain }) + } RevsetExpression::Heads(candidates) => { transform_rec(candidates, pre, post)?.map(RevsetExpression::Heads) } @@ -1748,6 +1780,10 @@ impl VisibilityResolutionContext<'_> { heads: self.resolve(heads).into(), generation_from_roots: GENERATION_RANGE_FULL, }, + RevsetExpression::Reachable { sources, domain } => ResolvedExpression::Reachable { + sources: self.resolve(sources).into(), + domain: self.resolve(domain).into(), + }, RevsetExpression::Heads(candidates) => { ResolvedExpression::Heads(self.resolve(candidates).into()) } @@ -1833,6 +1869,7 @@ impl VisibilityResolutionContext<'_> { | RevsetExpression::Descendants { .. } | RevsetExpression::Range { .. } | RevsetExpression::DagRange { .. } + | RevsetExpression::Reachable { .. } | RevsetExpression::Heads(_) | RevsetExpression::Roots(_) | RevsetExpression::Latest { .. } => { diff --git a/lib/tests/test_revset.rs b/lib/tests/test_revset.rs index bb372f3c7b..dcefc44cb0 100644 --- a/lib/tests/test_revset.rs +++ b/lib/tests/test_revset.rs @@ -1556,6 +1556,164 @@ fn test_evaluate_expression_connected() { ); } +#[test] +fn test_evaluate_expression_reachable() { + let settings = testutils::user_settings(); + let test_repo = TestRepo::init(); + let repo = &test_repo.repo; + + let mut tx = repo.start_transaction(&settings); + let mut_repo = tx.mut_repo(); + + // Construct 3 separate subgraphs off the root commit. + // 1 is a chain, 2 is a merge, 3 is a pyramidal monstrosity + let graph1commit1 = write_random_commit(mut_repo, &settings); + let graph1commit2 = create_random_commit(mut_repo, &settings) + .set_parents(vec![graph1commit1.id().clone()]) + .write() + .unwrap(); + let graph1commit3 = create_random_commit(mut_repo, &settings) + .set_parents(vec![graph1commit2.id().clone()]) + .write() + .unwrap(); + let graph2commit1 = write_random_commit(mut_repo, &settings); + let graph2commit2 = write_random_commit(mut_repo, &settings); + let graph2commit3 = create_random_commit(mut_repo, &settings) + .set_parents(vec![graph2commit1.id().clone(), graph2commit2.id().clone()]) + .write() + .unwrap(); + let graph3commit1 = write_random_commit(mut_repo, &settings); + let graph3commit2 = write_random_commit(mut_repo, &settings); + let graph3commit3 = write_random_commit(mut_repo, &settings); + let graph3commit4 = create_random_commit(mut_repo, &settings) + .set_parents(vec![graph3commit1.id().clone(), graph3commit2.id().clone()]) + .write() + .unwrap(); + let graph3commit5 = create_random_commit(mut_repo, &settings) + .set_parents(vec![graph3commit2.id().clone(), graph3commit3.id().clone()]) + .write() + .unwrap(); + let graph3commit6 = create_random_commit(mut_repo, &settings) + .set_parents(vec![graph3commit3.id().clone()]) + .write() + .unwrap(); + let graph3commit7 = create_random_commit(mut_repo, &settings) + .set_parents(vec![graph3commit4.id().clone(), graph3commit5.id().clone()]) + .write() + .unwrap(); + + // Domain is respected. + assert_eq!( + resolve_commit_ids( + mut_repo, + &format!( + "reachable({}, all() ~ ::{})", + graph1commit2.id().hex(), + graph1commit1.id().hex() + ) + ), + vec![graph1commit3.id().clone(), graph1commit2.id().clone(),] + ); + assert_eq!( + resolve_commit_ids( + mut_repo, + &format!( + "reachable({}, all() ~ ::{})", + graph1commit2.id().hex(), + graph1commit3.id().hex() + ) + ), + vec![] + ); + + // Each graph is identifiable from any node in it. + for (i, commit) in [&graph1commit1, &graph1commit2, &graph1commit3] + .iter() + .enumerate() + { + assert_eq!( + resolve_commit_ids( + mut_repo, + &format!("reachable({}, all() ~ root())", commit.id().hex()) + ), + vec![ + graph1commit3.id().clone(), + graph1commit2.id().clone(), + graph1commit1.id().clone(), + ], + "commit {}", + i + 1 + ); + } + + for (i, commit) in [&graph2commit1, &graph2commit2, &graph2commit3] + .iter() + .enumerate() + { + assert_eq!( + resolve_commit_ids( + mut_repo, + &format!("reachable({}, all() ~ root())", commit.id().hex()) + ), + vec![ + graph2commit3.id().clone(), + graph2commit2.id().clone(), + graph2commit1.id().clone(), + ], + "commit {}", + i + 1 + ); + } + + for (i, commit) in [ + &graph3commit1, + &graph3commit2, + &graph3commit3, + &graph3commit4, + &graph3commit5, + &graph3commit6, + &graph3commit7, + ] + .iter() + .enumerate() + { + assert_eq!( + resolve_commit_ids( + mut_repo, + &format!("reachable({}, all() ~ root())", commit.id().hex()) + ), + vec![ + graph3commit7.id().clone(), + graph3commit6.id().clone(), + graph3commit5.id().clone(), + graph3commit4.id().clone(), + graph3commit3.id().clone(), + graph3commit2.id().clone(), + graph3commit1.id().clone(), + ], + "commit {}", + i + 1 + ); + } + + // Test a split of the pyramidal monstrosity. + assert_eq!( + resolve_commit_ids( + mut_repo, + &format!( + "reachable({}, all() ~ ::{})", + graph3commit4.id().hex(), + graph3commit5.id().hex() + ) + ), + vec![ + graph3commit7.id().clone(), + graph3commit4.id().clone(), + graph3commit1.id().clone(), + ] + ); +} + #[test] fn test_evaluate_expression_descendants() { let settings = testutils::user_settings();