From e2fbded554d24be1a7483d9c35491053cc47a31b Mon Sep 17 00:00:00 2001 From: Daehyeok Mun Date: Fri, 16 Feb 2024 09:54:09 -0800 Subject: [PATCH] Switch to ignore crate for gitignore handling. Co-authored-by: Waleed Khan --- Cargo.lock | 30 +++ Cargo.toml | 1 + cli/examples/custom-working-copy/main.rs | 2 +- cli/src/cli_util.rs | 22 +- cli/src/commands/untrack.rs | 2 +- lib/Cargo.toml | 1 + lib/src/gitignore.rs | 308 +++++++++-------------- lib/src/local_working_copy.rs | 2 +- lib/src/working_copy.rs | 5 +- 9 files changed, 170 insertions(+), 203 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1d2c509271e..655c9d63252 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1426,6 +1426,19 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" +[[package]] +name = "globset" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57da3b9b5b85bd66f31093f8c408b90a74431672542466497dcbdfdc02034be1" +dependencies = [ + "aho-corasick", + "bstr", + "log", + "regex-automata 0.4.4", + "regex-syntax 0.8.2", +] + [[package]] name = "half" version = "1.8.2" @@ -1498,6 +1511,22 @@ dependencies = [ "unicode-normalization", ] +[[package]] +name = "ignore" +version = "0.4.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b46810df39e66e925525d6e38ce1e7f6e1d208f72dc39757880fcb66e2c58af1" +dependencies = [ + "crossbeam-deque", + "globset", + "log", + "memchr", + "regex-automata 0.4.4", + "same-file", + "walkdir", + "winapi-util", +] + [[package]] name = "indexmap" version = "2.2.3" @@ -1663,6 +1692,7 @@ dependencies = [ "gix", "glob", "hex", + "ignore", "insta", "itertools 0.12.1", "maplit", diff --git a/Cargo.toml b/Cargo.toml index a16579674cf..8add2ecc9b5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -53,6 +53,7 @@ gix = { version = "0.58.0", default-features = false, features = [ ] } glob = "0.3.1" hex = "0.4.3" +ignore = "0.4.20" indexmap = "2.2.3" insta = { version = "1.34.0", features = ["filters"] } itertools = "0.12.1" diff --git a/cli/examples/custom-working-copy/main.rs b/cli/examples/custom-working-copy/main.rs index 4f3ef0098cd..af9e10b9620 100644 --- a/cli/examples/custom-working-copy/main.rs +++ b/cli/examples/custom-working-copy/main.rs @@ -222,7 +222,7 @@ impl LockedWorkingCopy for LockedConflictsWorkingCopy { } fn snapshot(&mut self, mut options: SnapshotOptions) -> Result { - options.base_ignores = options.base_ignores.chain("", "/.conflicts".as_bytes()); + options.base_ignores = options.base_ignores.chain("", "/.conflicts".as_bytes())?; self.inner.snapshot(options) } diff --git a/cli/src/cli_util.rs b/cli/src/cli_util.rs index d9e0b3c5c54..74043d58c21 100644 --- a/cli/src/cli_util.rs +++ b/cli/src/cli_util.rs @@ -35,7 +35,7 @@ use jj_lib::backend::{BackendError, ChangeId, CommitId, MergedTreeId}; use jj_lib::commit::Commit; use jj_lib::git::{GitConfigParseError, GitExportError, GitImportError, GitRemoteManagementError}; use jj_lib::git_backend::GitBackend; -use jj_lib::gitignore::GitIgnoreFile; +use jj_lib::gitignore::{GitIgnoreError, GitIgnoreFile}; use jj_lib::hex_util::to_reverse_hex; use jj_lib::id_prefix::IdPrefixContext; use jj_lib::matchers::{EverythingMatcher, Matcher, PrefixMatcher}; @@ -481,6 +481,12 @@ impl From for CommandError { } } +impl From for CommandError { + fn from(err: GitIgnoreError) -> Self { + user_error(format!("Failed to process .gitignore: {err}")) + } +} + #[derive(Clone)] struct ChromeTracingFlushGuard { _inner: Option>, @@ -1047,7 +1053,7 @@ impl WorkspaceCommandHelper { } #[instrument(skip_all)] - pub fn base_ignores(&self) -> Arc { + pub fn base_ignores(&self) -> Result, GitIgnoreError> { fn get_excludes_file_path(config: &gix::config::File) -> Option { // TODO: maybe use path_by_key() and interpolate(), which can process non-utf-8 // path on Unix. @@ -1073,16 +1079,16 @@ impl WorkspaceCommandHelper { if let Some(git_backend) = self.git_backend() { let git_repo = git_backend.git_repo(); if let Some(excludes_file_path) = get_excludes_file_path(&git_repo.config_snapshot()) { - git_ignores = git_ignores.chain_with_file("", excludes_file_path); + git_ignores = git_ignores.chain_with_file("", excludes_file_path)?; } git_ignores = git_ignores - .chain_with_file("", git_backend.git_repo_path().join("info").join("exclude")); + .chain_with_file("", git_backend.git_repo_path().join("info").join("exclude"))?; } else if let Ok(git_config) = gix::config::File::from_globals() { if let Some(excludes_file_path) = get_excludes_file_path(&git_config) { - git_ignores = git_ignores.chain_with_file("", excludes_file_path); + git_ignores = git_ignores.chain_with_file("", excludes_file_path)?; } } - git_ignores + Ok(git_ignores) } pub fn resolve_single_op(&self, op_str: &str) -> Result { @@ -1353,7 +1359,7 @@ Set which revision the branch points to with `jj branch set {branch_name} -r { matcher: &dyn Matcher, instructions: &str, ) -> Result { - let base_ignores = self.helper.base_ignores(); + let base_ignores = self.helper.base_ignores()?; let settings = &self.helper.settings; Ok(crate::merge_tools::edit_diff( ui, diff --git a/cli/src/commands/untrack.rs b/cli/src/commands/untrack.rs index 0f1fc6561d2..a7c809195ff 100644 --- a/cli/src/commands/untrack.rs +++ b/cli/src/commands/untrack.rs @@ -46,7 +46,7 @@ pub(crate) fn cmd_untrack( let matcher = workspace_command.matcher_from_values(&args.paths)?; let mut tx = workspace_command.start_transaction().into_inner(); - let base_ignores = workspace_command.base_ignores(); + let base_ignores = workspace_command.base_ignores()?; let (mut locked_ws, wc_commit) = workspace_command.start_working_copy_mutation()?; // Create a new tree without the unwanted files let mut tree_builder = MergedTreeBuilder::new(wc_commit.tree_id().clone()); diff --git a/lib/Cargo.toml b/lib/Cargo.toml index 7a480cd05e9..8704aae6e3b 100644 --- a/lib/Cargo.toml +++ b/lib/Cargo.toml @@ -36,6 +36,7 @@ git2 = { workspace = true } gix = { workspace = true } glob = { workspace = true } hex = { workspace = true } +ignore = { workspace = true } itertools = { workspace = true } maplit = { workspace = true } once_cell = { workspace = true } diff --git a/lib/src/gitignore.rs b/lib/src/gitignore.rs index a6ecebc75f1..edd2ea66638 100644 --- a/lib/src/gitignore.rs +++ b/lib/src/gitignore.rs @@ -14,189 +14,98 @@ #![allow(missing_docs)] -use std::fs; +use std::{fs, io}; use std::path::PathBuf; use std::sync::Arc; -use itertools::Itertools; -use regex::{escape as regex_escape, Regex}; - -#[derive(Debug)] -struct GitIgnoreLine { - is_negative: bool, - regex: Regex, -} - -impl GitIgnoreLine { - // Remove trailing spaces (unless backslash-escaped). Any character - // can be backslash-escaped as well. - fn remove_trailing_space(input: &str) -> &str { - let input = input.strip_suffix('\r').unwrap_or(input); - let mut it = input.char_indices().rev().peekable(); - while let Some((i, c)) = it.next() { - if c != ' ' { - return &input[..i + c.len_utf8()]; - } - if matches!(it.peek(), Some((_, '\\'))) { - if it.skip(1).take_while(|(_, b)| *b == '\\').count() % 2 == 1 { - return &input[..i]; - } - return &input[..i + 1]; - } - } - "" - } - - fn parse(prefix: &str, input: &str) -> Option { - assert!(prefix.is_empty() || prefix.ends_with('/')); - if input.starts_with('#') { - return None; - } - - let input = GitIgnoreLine::remove_trailing_space(input); - // Remove leading "!" before checking for empty to match git's implementation - // (i.e. just "!" matching nothing, not everything). - let (is_negative, input) = match input.strip_prefix('!') { - None => (false, input), - Some(rest) => (true, rest), - }; - if input.is_empty() { - return None; - } - - let (matches_only_directory, input) = match input.strip_suffix('/') { - None => (false, input), - Some(rest) => (true, rest), - }; - let (mut is_rooted, input) = match input.strip_prefix('/') { - None => (false, input), - Some(rest) => (true, rest), - }; - is_rooted |= input.contains('/'); - - let mut regex = String::new(); - regex.push('^'); - regex.push_str(prefix); - if !is_rooted { - regex.push_str("(.*/)?"); - } - - let components = input.split('/').collect_vec(); - for (i, component) in components.iter().enumerate() { - if *component == "**" { - if i == components.len() - 1 { - regex.push_str(".*"); - } else { - regex.push_str("(.*/)?"); - } - } else { - let mut in_escape = false; - let mut character_class: Option = None; - for c in component.chars() { - if in_escape { - in_escape = false; - if !matches!(c, ' ' | '#' | '!' | '?' | '\\' | '*') { - regex.push_str(®ex_escape("\\")); - } - regex.push_str(®ex_escape(&c.to_string())); - } else if c == '\\' { - in_escape = true; - } else if let Some(characters) = &mut character_class { - if c == ']' { - regex.push('['); - regex.push_str(characters); - regex.push(']'); - character_class = None; - } else { - characters.push(c); - } - } else { - in_escape = false; - if c == '?' { - regex.push_str("[^/]"); - } else if c == '*' { - regex.push_str("[^/]*"); - } else if c == '[' { - character_class = Some(String::new()); - } else { - regex.push_str(®ex_escape(&c.to_string())); - } - } - } - if in_escape { - regex.push_str(®ex_escape("\\")); - } - if i < components.len() - 1 { - regex.push('/'); - } - } - } - if matches_only_directory { - regex.push_str("/.*"); - } else { - regex.push_str("(/.*|$)"); - } - let regex = Regex::new(®ex).unwrap(); - - Some(GitIgnoreLine { is_negative, regex }) - } - - fn matches(&self, path: &str) -> bool { - self.regex.is_match(path) - } +use ignore::gitignore; +use thiserror::Error; + +#[derive(Debug, Error)] +pub enum GitIgnoreError { + #[error("failed to read ignore patterns from file {path}: {source}")] + ReadFile { + path: PathBuf, + source: io::Error, + }, + #[error("invalid UTF-8 for ignore pattern in {path} on line #{line_num_for_display} {line}")] + InvalidUtf8 { + path: PathBuf, + line_num_for_display: usize, + line: String, + source: std::str::Utf8Error, + }, + #[error(transparent)] + Underlying(#[from] ignore::Error), } /// Models the effective contents of multiple .gitignore files. #[derive(Debug)] pub struct GitIgnoreFile { - parent: Option>, - lines: Vec, + path: String, + matchers: Vec, } impl GitIgnoreFile { pub fn empty() -> Arc { Arc::new(GitIgnoreFile { - parent: None, - lines: vec![], + path: Default::default(), + matchers: Default::default(), }) } - pub fn chain(self: &Arc, prefix: &str, input: &[u8]) -> Arc { - let mut lines = vec![]; - for input_line in input.split(|b| *b == b'\n') { - // Skip non-utf8 lines - if let Ok(line_string) = String::from_utf8(input_line.to_vec()) { - if let Some(line) = GitIgnoreLine::parse(prefix, &line_string) { - lines.push(line); - } - } + pub fn chain( + self: &Arc, + prefix: &str, + input: &[u8], + ) -> Result, GitIgnoreError> { + let path = self.path.clone() + prefix; + let mut builder = gitignore::GitignoreBuilder::new(&path); + for (i, input_line) in input.split(|b| *b == b'\n').enumerate() { + let line = + std::str::from_utf8(input_line).map_err(|err| GitIgnoreError::InvalidUtf8 { + path: PathBuf::from(&path), + line_num_for_display: i + 1, + line: String::from_utf8_lossy(input_line).to_string(), + source: err, + })?; + // TODO: do we need to provide the `from` argument? Is it for providing diagnostics or correctness? + builder.add_line(None, line)?; } + let matcher = builder.build()?; + let mut matchers = self.matchers.clone(); + matchers.push(matcher); - Arc::new(GitIgnoreFile { - parent: Some(self.clone()), - lines, - }) + Ok(Arc::new(GitIgnoreFile { path, matchers })) } pub fn chain_with_file( self: &Arc, prefix: &str, file: PathBuf, - ) -> Arc { + ) -> Result, GitIgnoreError> { if file.is_file() { - let buf = fs::read(file).unwrap(); + let buf = fs::read(&file).map_err(|err| GitIgnoreError::ReadFile { path: file.clone(), source: err })?; self.chain(prefix, &buf) } else { - self.clone() + Ok(self.clone()) } } - fn all_lines_reversed<'a>(&'a self) -> Box + 'a> { - if let Some(parent) = &self.parent { - Box::new(self.lines.iter().rev().chain(parent.all_lines_reversed())) - } else { - Box::new(self.lines.iter().rev()) - } + fn matches_helper(&self, path: &str, is_dir: bool) -> bool { + self.matchers + .iter() + .rev() + .find_map(|matcher| + // TODO: the documentation warns that + // `matched_path_or_any_parents` is slower than `matched`; + // ideally, we would switch to that. + match matcher.matched_path_or_any_parents(path, is_dir) { + ignore::Match::None => None, + ignore::Match::Ignore(_) => Some(true), + ignore::Match::Whitelist(_) => Some(false), + }) + .unwrap_or_default() } /// Returns whether specified path (not just file!) should be ignored. This @@ -207,13 +116,9 @@ impl GitIgnoreFile { /// files within a ignored directory should be ignored unconditionally. /// The code in this file does not take that into account. pub fn matches(&self, path: &str) -> bool { - // Later lines take precedence, so check them in reverse - for line in self.all_lines_reversed() { - if line.matches(path) { - return !line.is_negative; - } - } - false + //If path ends with slash, consier it as a directory. + let is_dir = path.ends_with('/'); + self.matches_helper(path.trim_end_matches('/'), is_dir) } } @@ -223,7 +128,7 @@ mod tests { use super::*; fn matches(input: &[u8], path: &str) -> bool { - let file = GitIgnoreFile::empty().chain("", input); + let file = GitIgnoreFile::empty().chain("", input).unwrap(); file.matches(path) } @@ -235,13 +140,13 @@ mod tests { #[test] fn test_gitignore_empty_file_with_prefix() { - let file = GitIgnoreFile::empty().chain("dir/", b""); + let file = GitIgnoreFile::empty().chain("dir/", b"").unwrap(); assert!(!file.matches("dir/foo")); } #[test] fn test_gitignore_literal() { - let file = GitIgnoreFile::empty().chain("", b"foo\n"); + let file = GitIgnoreFile::empty().chain("", b"foo\n").unwrap(); assert!(file.matches("foo")); assert!(file.matches("dir/foo")); assert!(file.matches("dir/subdir/foo")); @@ -251,17 +156,14 @@ mod tests { #[test] fn test_gitignore_literal_with_prefix() { - let file = GitIgnoreFile::empty().chain("dir/", b"foo\n"); - // I consider it undefined whether a file in a parent directory matches, but - // let's test it anyway - assert!(!file.matches("foo")); + let file = GitIgnoreFile::empty().chain("./dir/", b"foo\n").unwrap(); assert!(file.matches("dir/foo")); assert!(file.matches("dir/subdir/foo")); } #[test] fn test_gitignore_pattern_same_as_prefix() { - let file = GitIgnoreFile::empty().chain("dir/", b"dir\n"); + let file = GitIgnoreFile::empty().chain("dir/", b"dir\n").unwrap(); assert!(file.matches("dir/dir")); // We don't want the "dir" pattern to apply to the parent directory assert!(!file.matches("dir/foo")); @@ -269,24 +171,23 @@ mod tests { #[test] fn test_gitignore_rooted_literal() { - let file = GitIgnoreFile::empty().chain("", b"/foo\n"); + let file = GitIgnoreFile::empty().chain("", b"/foo\n").unwrap(); assert!(file.matches("foo")); assert!(!file.matches("dir/foo")); } #[test] fn test_gitignore_rooted_literal_with_prefix() { - let file = GitIgnoreFile::empty().chain("dir/", b"/foo\n"); - // I consider it undefined whether a file in a parent directory matches, but - // let's test it anyway - assert!(!file.matches("foo")); + let file = GitIgnoreFile::empty().chain("dir/", b"/foo\n").unwrap(); assert!(file.matches("dir/foo")); assert!(!file.matches("dir/subdir/foo")); } #[test] fn test_gitignore_deep_dir() { - let file = GitIgnoreFile::empty().chain("", b"/dir1/dir2/dir3\n"); + let file = GitIgnoreFile::empty() + .chain("", b"/dir1/dir2/dir3\n") + .unwrap(); assert!(!file.matches("foo")); assert!(!file.matches("dir1/foo")); assert!(!file.matches("dir1/dir2/foo")); @@ -296,7 +197,7 @@ mod tests { #[test] fn test_gitignore_match_only_dir() { - let file = GitIgnoreFile::empty().chain("", b"/dir/\n"); + let file = GitIgnoreFile::empty().chain("", b"/dir/\n").unwrap(); assert!(!file.matches("dir")); assert!(file.matches("dir/foo")); assert!(file.matches("dir/subdir/foo")); @@ -306,20 +207,20 @@ mod tests { fn test_gitignore_unusual_symbols() { assert!(matches(b"\\*\n", "*")); assert!(!matches(b"\\*\n", "foo")); - assert!(matches(b"\\\n", "\\")); + assert!(!matches(b"\\\n", "\\")); assert!(matches(b"\\!\n", "!")); assert!(matches(b"\\?\n", "?")); assert!(!matches(b"\\?\n", "x")); // Invalid escapes are treated like literal backslashes - assert!(matches(b"\\w\n", "\\w")); - assert!(!matches(b"\\w\n", "w")); + assert!(!matches(b"\\w\n", "\\w")); + assert!(matches(b"\\w\n", "w")); } #[test] fn test_gitignore_whitespace() { assert!(!matches(b" \n", " ")); assert!(matches(b"\\ \n", " ")); - assert!(matches(b"\\\\ \n", "\\")); + assert!(matches(b"\\\\ \n", "\\ ")); assert!(!matches(b"\\\\ \n", " ")); assert!(matches(b"\\\\\\ \n", "\\ ")); assert!(matches(b" a\n", " a")); @@ -327,16 +228,16 @@ mod tests { assert!(matches(b"a b \n", "a b")); assert!(!matches(b"a b \n", "a b ")); assert!(matches(b"a b\\ \\ \n", "a b ")); - // It's unclear how this should be interpreted, but we count spaces before - // escaped spaces - assert!(matches(b"a b \\ \n", "a b ")); - // A single CR at EOL is ignored + // Trail CRs at EOL is ignored assert!(matches(b"a\r\n", "a")); assert!(!matches(b"a\r\n", "a\r")); - assert!(matches(b"a\r\r\n", "a\r")); + assert!(!matches(b"a\r\r\n", "a\r")); + assert!(matches(b"a\r\r\n", "a")); assert!(!matches(b"a\r\r\n", "a\r\r")); + assert!(matches(b"a\r\r\n", "a")); assert!(matches(b"\ra\n", "\ra")); assert!(!matches(b"\ra\n", "a")); + assert!(GitIgnoreFile::empty().chain("", b"a b \\ \n").is_err()); } #[test] @@ -375,10 +276,9 @@ mod tests { #[test] fn test_gitignore_leading_dir_glob_with_prefix() { - let file = GitIgnoreFile::empty().chain("dir1/dir2/", b"**/foo\n"); - // I consider it undefined whether a file in a parent directory matches, but - // let's test it anyway - assert!(!file.matches("foo")); + let file = GitIgnoreFile::empty() + .chain("dir1/dir2/", b"**/foo\n") + .unwrap(); assert!(file.matches("dir1/dir2/foo")); assert!(!file.matches("dir1/dir2/bar")); assert!(file.matches("dir1/dir2/sub1/sub2/foo")); @@ -418,13 +318,14 @@ mod tests { assert!(!matches(b"foo\n!foo/bar\nfoo/bar/baz", "foo/bar")); assert!(matches(b"foo\n!foo/bar\nfoo/bar/baz", "foo/bar/baz")); assert!(!matches(b"foo\n!foo/bar\nfoo/bar/baz", "foo/bar/quux")); + assert!(!matches(b"foo/*\n!foo/bar", "foo/bar")); } #[test] fn test_gitignore_file_ordering() { - let file1 = GitIgnoreFile::empty().chain("", b"foo\n"); - let file2 = file1.chain("foo/", b"!bar"); - let file3 = file2.chain("foo/bar/", b"baz"); + let file1 = GitIgnoreFile::empty().chain("", b"foo\n").unwrap(); + let file2 = file1.chain("foo/", b"!bar").unwrap(); + let file3 = file2.chain("foo/bar/", b"baz").unwrap(); assert!(file1.matches("foo")); assert!(file1.matches("foo/bar")); assert!(!file2.matches("foo/bar")); @@ -432,4 +333,29 @@ mod tests { assert!(file3.matches("foo/bar/baz")); assert!(!file3.matches("foo/bar/qux")); } + + #[test] + fn test_gitignore_negative_parent_directory() { + // The following script shows that Git ignores the file: + // + // ```bash + // $ rm -rf test-repo && \ + // git init test-repo &>/dev/null && \ + // cd test-repo && \ + // printf 'A/B.*\n!/A/\n' >.gitignore && \ + // mkdir A && \ + // touch A/B.ext && \ + // git check-ignore A/B.ext + // A/B.ext + // ``` + let ignore = GitIgnoreFile::empty() + .chain("", b"foo/bar.*\n!/foo/\n") + .unwrap(); + assert!(ignore.matches("foo/bar.ext")); + + let ignore = GitIgnoreFile::empty() + .chain("", b"!/foo/\nfoo/bar.*\n") + .unwrap(); + assert!(ignore.matches("foo/bar.ext")); + } } diff --git a/lib/src/local_working_copy.rs b/lib/src/local_working_copy.rs index 35949ad2211..da20c21f8b1 100644 --- a/lib/src/local_working_copy.rs +++ b/lib/src/local_working_copy.rs @@ -869,7 +869,7 @@ impl TreeState { } let git_ignore = - git_ignore.chain_with_file(&dir.to_internal_dir_string(), disk_dir.join(".gitignore")); + git_ignore.chain_with_file(&dir.to_internal_dir_string(), disk_dir.join(".gitignore"))?; let dir_entries = disk_dir .read_dir() .unwrap() diff --git a/lib/src/working_copy.rs b/lib/src/working_copy.rs index bb56b5016e8..d2c794a3565 100644 --- a/lib/src/working_copy.rs +++ b/lib/src/working_copy.rs @@ -25,7 +25,7 @@ use thiserror::Error; use crate::backend::{BackendError, MergedTreeId}; use crate::commit::Commit; use crate::fsmonitor::FsmonitorKind; -use crate::gitignore::GitIgnoreFile; +use crate::gitignore::{GitIgnoreError, GitIgnoreFile}; use crate::op_store::{OperationId, WorkspaceId}; use crate::repo_path::{RepoPath, RepoPathBuf}; use crate::settings::HumanByteSize; @@ -162,6 +162,9 @@ pub enum SnapshotError { /// The maximum allowed size. max_size: HumanByteSize, }, + /// Checking path with ignore patterns failed. + #[error(transparent)] + GitIgnoreError(#[from] GitIgnoreError), /// Some other error happened while snapshotting the working copy. #[error("{message}")] Other {