From 22c29f196f8a35c5c7b56e282c9d60eb0214c75e Mon Sep 17 00:00:00 2001 From: Emily Date: Fri, 28 Jun 2024 10:35:50 +0100 Subject: [PATCH] =?UTF-8?q?str=5Futil:=20support=20case=E2=80=90insensitiv?= =?UTF-8?q?e=20string=20patterns?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Partially resolve a 1.5‐year‐old TODO comment. Add opt‐in syntax for case‐insensitive matching, suffixing the pattern kind with `-i`. Not every context supports case‐insensitive patterns (e.g. Git branch fetch settings). It may make sense to make this the default in at least some contexts (e.g. the commit signature and description revsets), but it would require some thought to avoid more confusing context‐sensitivity. Make `mine()` match case‐insensitively unconditionally, since email addresses are conventionally case‐insensitive and it doesn’t take a pattern anyway. This currently only handles ASCII case folding, due to the complexities of case‐insensitive Unicode comparison and the `glob` crate’s lack of support for it. This is unlikely to matter for email addresses, which very rarely contain non‐ASCII characters, but is unfortunate for names and descriptions. However, the current matching behaviour is already seriously deficient for non‐ASCII text due to the lack of any normalization, so this hopefully shouldn’t be a blocker to adding the interface. An expository comment has been left in the code for anyone who wants to try and address this (perhaps a future version of myself). --- CHANGELOG.md | 5 + cli/src/commands/git/push.rs | 2 +- docs/revsets.md | 3 + lib/src/default_index/revset_engine.rs | 3 +- lib/src/revset.rs | 7 +- lib/src/str_util.rs | 124 ++++++++++++++++++++----- lib/tests/test_revset.rs | 27 +++++- 7 files changed, 143 insertions(+), 28 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 50438c5db7..ace2489aa8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -41,6 +41,11 @@ to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). * `jj git clone some/nested/path` now creates the full directory tree for nested destination paths if they don't exist. +* String patterns now support case‐insensitive matching by suffixing any + pattern kind with `-i`. `mine()` uses case‐insensitive matching on your email + address unconditionally. Only ASCII case folding is currently implemented, + but this will likely change in the future. + ### Fixed bugs ## [0.19.0] - 2024-07-03 diff --git a/cli/src/commands/git/push.rs b/cli/src/commands/git/push.rs index 24b87daa0b..7e317710b8 100644 --- a/cli/src/commands/git/push.rs +++ b/cli/src/commands/git/push.rs @@ -552,7 +552,7 @@ fn find_branches_targeted_by_revisions<'a>( }; let current_branches_expression = RevsetExpression::remote_branches( StringPattern::everything(), - StringPattern::Exact(remote_name.to_owned()), + StringPattern::exact(remote_name), ) .range(&RevsetExpression::commit(wc_commit_id)) .intersection(&RevsetExpression::branches(StringPattern::everything())); diff --git a/docs/revsets.md b/docs/revsets.md index e72d0f5818..8bebbac8c9 100644 --- a/docs/revsets.md +++ b/docs/revsets.md @@ -333,6 +333,9 @@ Functions that perform string matching support the following pattern syntax: * `glob:"pattern"`: Matches strings with Unix-style shell [wildcard `pattern`](https://docs.rs/glob/latest/glob/struct.Pattern.html). +You can append `-i` after the kind to match case‐insensitively (e.g. +`glob-i:"fix*jpeg*"`). + ## Aliases New symbols and functions can be defined in the config file, by using any diff --git a/lib/src/default_index/revset_engine.rs b/lib/src/default_index/revset_engine.rs index 13a93e79a0..76a495222f 100644 --- a/lib/src/default_index/revset_engine.rs +++ b/lib/src/default_index/revset_engine.rs @@ -1052,8 +1052,7 @@ fn build_predicate_fn( RevsetFilterPredicate::Author(pattern) => { let pattern = pattern.clone(); // TODO: Make these functions that take a needle to search for accept some - // syntax for specifying whether it's a regex and whether it's - // case-sensitive. + // syntax for specifying whether it's a regex. box_pure_predicate_fn(move |index, pos| { let entry = index.entry_by_pos(pos); let commit = store.get_commit(&entry.commit_id()).unwrap(); diff --git a/lib/src/revset.rs b/lib/src/revset.rs index fa9bded307..d88a584d4b 100644 --- a/lib/src/revset.rs +++ b/lib/src/revset.rs @@ -687,8 +687,11 @@ static BUILTIN_FUNCTION_MAP: Lazy> = Lazy: }); map.insert("mine", |function, context| { function.expect_no_arguments()?; + // Email address domains are inherently case‐insensitive, and the local‐parts + // are generally (although not universally) treated as case‐insensitive too, so + // we use a case‐insensitive match here. Ok(RevsetExpression::filter(RevsetFilterPredicate::Author( - StringPattern::Exact(context.user_email.to_owned()), + StringPattern::exact_i(&context.user_email), ))) }); map.insert("committer", |function, _context| { @@ -2518,7 +2521,7 @@ mod tests { assert!(parse("mine(foo)").is_err()); insta::assert_debug_snapshot!( parse("mine()").unwrap(), - @r###"Filter(Author(Exact("test.user@example.com")))"###); + @r###"Filter(Author(ExactI("test.user@example.com")))"###); insta::assert_debug_snapshot!( parse_with_workspace("empty()", &WorkspaceId::default()).unwrap(), @"NotIn(Filter(File(All)))"); diff --git a/lib/src/str_util.rs b/lib/src/str_util.rs index 2cef8bae9b..90f70d117c 100644 --- a/lib/src/str_util.rs +++ b/lib/src/str_util.rs @@ -32,16 +32,26 @@ pub enum StringPatternParseError { GlobPattern(glob::PatternError), } +fn parse_glob(src: &str) -> Result { + glob::Pattern::new(src).map_err(StringPatternParseError::GlobPattern) +} + /// Pattern to be tested against string property like commit description or /// branch name. #[derive(Clone, Debug, Eq, PartialEq)] pub enum StringPattern { - /// Matches strings exactly equal to `string`. + /// Matches strings exactly. Exact(String), - /// Unix-style shell wildcard pattern. - Glob(glob::Pattern), - /// Matches strings that contain `substring`. + /// Matches strings case‐insensitively. + ExactI(String), + /// Matches strings that contain a substring. Substring(String), + /// Matches strings that case‐insensitively contain a substring. + SubstringI(String), + /// Matches with a Unix‐style shell wildcard pattern. + Glob(glob::Pattern), + /// Matches with a case‐insensitive Unix‐style shell wildcard pattern. + GlobI(glob::Pattern), } impl StringPattern { @@ -50,11 +60,12 @@ impl StringPattern { StringPattern::Substring(String::new()) } - /// Parses the given string as a `StringPattern`. Everything before the - /// first ":" is considered the string's prefix. If the prefix is "exact:", - /// "glob:", or "substring:", a pattern of the specified kind is returned. - /// Returns an error if the string has an unrecognized prefix. Otherwise, a - /// `StringPattern::Exact` is returned. + /// Parses the given string as a [`StringPattern`]. Everything before the + /// first ":" is considered the string's prefix. If the prefix is + /// "exact[-i]:", "glob[-i]:", or "substring[-i]:", a pattern of the + /// specified kind is returned. Returns an error if the string has an + /// unrecognized prefix. Otherwise, a `StringPattern::Exact` is + /// returned. pub fn parse(src: &str) -> Result { if let Some((kind, pat)) = src.split_once(':') { StringPattern::from_str_kind(pat, kind) @@ -63,26 +74,48 @@ impl StringPattern { } } - /// Creates pattern that matches exactly. + /// Constructs a pattern that matches exactly. pub fn exact(src: impl Into) -> Self { StringPattern::Exact(src.into()) } - /// Parses the given string as glob pattern. + /// Constructs a pattern that matches case‐insensitively. + pub fn exact_i(src: impl Into) -> Self { + StringPattern::ExactI(src.into()) + } + + /// Constructs a pattern that matches a substring. + pub fn substring(src: impl Into) -> Self { + StringPattern::Substring(src.into()) + } + + /// Constructs a pattern that case‐insensitively matches a substring. + pub fn substring_i(src: impl Into) -> Self { + StringPattern::SubstringI(src.into()) + } + + /// Parses the given string as a glob pattern. pub fn glob(src: &str) -> Result { // TODO: might be better to do parsing and compilation separately since // not all backends would use the compiled pattern object. // TODO: if no meta character found, it can be mapped to Exact. - let pattern = glob::Pattern::new(src).map_err(StringPatternParseError::GlobPattern)?; - Ok(StringPattern::Glob(pattern)) + Ok(StringPattern::Glob(parse_glob(src)?)) } - /// Parses the given string as pattern of the specified `kind`. + /// Parses the given string as a case‐insensitive glob pattern. + pub fn glob_i(src: &str) -> Result { + Ok(StringPattern::GlobI(parse_glob(src)?)) + } + + /// Parses the given string as a pattern of the specified `kind`. pub fn from_str_kind(src: &str, kind: &str) -> Result { match kind { "exact" => Ok(StringPattern::exact(src)), + "exact-i" => Ok(StringPattern::exact_i(src)), + "substring" => Ok(StringPattern::substring(src)), + "substring-i" => Ok(StringPattern::substring_i(src)), "glob" => StringPattern::glob(src), - "substring" => Ok(StringPattern::Substring(src.to_owned())), + "glob-i" => StringPattern::glob_i(src), _ => Err(StringPatternParseError::InvalidKind(kind.to_owned())), } } @@ -96,9 +129,12 @@ impl StringPattern { /// /// This can be used to optimize map lookup by exact key. pub fn as_exact(&self) -> Option<&str> { + // TODO: Handle trivial case‐insensitive patterns here? It might make people + // expect they can use case‐insensitive patterns in contexts where they + // generally can’t. match self { StringPattern::Exact(literal) => Some(literal), - StringPattern::Glob(_) | StringPattern::Substring(_) => None, + _ => None, } } @@ -106,8 +142,11 @@ impl StringPattern { pub fn as_str(&self) -> &str { match self { StringPattern::Exact(literal) => literal, - StringPattern::Glob(pattern) => pattern.as_str(), + StringPattern::ExactI(literal) => literal, StringPattern::Substring(needle) => needle, + StringPattern::SubstringI(needle) => needle, + StringPattern::Glob(pattern) => pattern.as_str(), + StringPattern::GlobI(pattern) => pattern.as_str(), } } @@ -115,22 +154,61 @@ impl StringPattern { /// can't be represented as a glob. pub fn to_glob(&self) -> Option> { // TODO: If we add Regex pattern, it will return None. + // + // TODO: Handle trivial case‐insensitive patterns here? It might make people + // expect they can use case‐insensitive patterns in contexts where they + // generally can’t. match self { StringPattern::Exact(literal) => Some(glob::Pattern::escape(literal).into()), - StringPattern::Glob(pattern) => Some(pattern.as_str().into()), - StringPattern::Substring(needle) if needle.is_empty() => Some("*".into()), StringPattern::Substring(needle) => { - Some(format!("*{}*", glob::Pattern::escape(needle)).into()) + if needle.is_empty() { + Some("*".into()) + } else { + Some(format!("*{}*", glob::Pattern::escape(needle)).into()) + } } + StringPattern::Glob(pattern) => Some(pattern.as_str().into()), + StringPattern::ExactI(_) => None, + StringPattern::SubstringI(_) => None, + StringPattern::GlobI(_) => None, } } /// Returns true if this pattern matches the `haystack`. + /// + /// When matching against a case‐insensitive pattern, only ASCII case + /// differences are currently folded. This may change in the future. pub fn matches(&self, haystack: &str) -> bool { + // TODO: Unicode case folding is complicated and can be locale‐specific. The + // `glob` crate and Gitoxide only deal with ASCII case folding, so we do + // the same here; a more elaborate case folding system will require + // making sure those behave in a matching manner where relevant. + // + // Care will need to be taken regarding normalization and the choice of an + // appropriate case‐insensitive comparison scheme (`toNFKC_Casefold`?) to ensure + // that it is compatible with the standard case‐insensitivity of haystack + // components (like internationalized domain names in email addresses). The + // availability of normalization and case folding schemes in database backends + // will also need to be considered. A locale‐specific case folding + // scheme would likely not be appropriate for Jujutsu. + // + // For some discussion of this topic, see: + // match self { StringPattern::Exact(literal) => haystack == literal, - StringPattern::Glob(pattern) => pattern.matches(haystack), + StringPattern::ExactI(literal) => haystack.eq_ignore_ascii_case(literal), StringPattern::Substring(needle) => haystack.contains(needle), + StringPattern::SubstringI(needle) => haystack + .to_ascii_lowercase() + .contains(&needle.to_ascii_lowercase()), + StringPattern::Glob(pattern) => pattern.matches(haystack), + StringPattern::GlobI(pattern) => pattern.matches_with( + haystack, + glob::MatchOptions { + case_sensitive: false, + ..glob::MatchOptions::new() + }, + ), } } @@ -192,6 +270,10 @@ mod tests { StringPattern::parse("substring:foo").unwrap(), StringPattern::from_str_kind("foo", "substring").unwrap() ); + assert_eq!( + StringPattern::parse("substring-i:foo").unwrap(), + StringPattern::from_str_kind("foo", "substring-i").unwrap() + ); // Parse a pattern that contains a : itself. assert_eq!( diff --git a/lib/tests/test_revset.rs b/lib/tests/test_revset.rs index c48f82628c..86ca9260d0 100644 --- a/lib/tests/test_revset.rs +++ b/lib/tests/test_revset.rs @@ -2007,7 +2007,11 @@ fn test_evaluate_expression_branches() { vec![commit1.id().clone()] ); assert_eq!( - resolve_commit_ids(mut_repo, r#"branches(glob:"branch?")"#), + resolve_commit_ids(mut_repo, r#"branches(glob:"Branch?")"#), + vec![] + ); + assert_eq!( + resolve_commit_ids(mut_repo, r#"branches(glob-i:"Branch?")"#), vec![commit2.id().clone(), commit1.id().clone()] ); // Can silently resolve to an empty set if there's no matches @@ -2398,6 +2402,15 @@ fn test_evaluate_expression_author() { resolve_commit_ids(mut_repo, "author(\"email3\")"), vec![commit3.id().clone()] ); + // Can match case‐insensitively + assert_eq!( + resolve_commit_ids(mut_repo, "author(substring-i:Name)"), + vec![ + commit3.id().clone(), + commit2.id().clone(), + commit1.id().clone(), + ] + ); // Searches only among candidates if specified assert_eq!( resolve_commit_ids(mut_repo, "visible_heads() & author(\"name2\")"), @@ -2452,7 +2465,8 @@ fn test_evaluate_expression_mine() { .set_parents(vec![commit2.id().clone()]) .set_author(Signature { name: "name3".to_string(), - email: settings.user_email(), + // Test that matches are case‐insensitive + email: settings.user_email().to_ascii_uppercase(), timestamp, }) .write() @@ -2539,6 +2553,15 @@ fn test_evaluate_expression_committer() { resolve_commit_ids(mut_repo, "committer(\"email3\")"), vec![commit3.id().clone()] ); + // Can match case‐insensitively + assert_eq!( + resolve_commit_ids(mut_repo, "committer(substring-i:Name)"), + vec![ + commit3.id().clone(), + commit2.id().clone(), + commit1.id().clone(), + ] + ); // Searches only among candidates if specified assert_eq!( resolve_commit_ids(mut_repo, "visible_heads() & committer(\"name2\")"),