From c86300b13a80be962c3a022db4771ba1434970e7 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 28 Aug 2024 21:52:48 +0200 Subject: [PATCH] Exclude big files when performing a worktree diff. This was lost previously when switching it over to a read-only implementation. Implementing it with an ignore list will take time, 400ms in the GitLab repository, but it's not slower than it was before and it is always preferred to not dump objects into the ODB unnecessarily. --- Cargo.lock | 2 + crates/gitbutler-command-context/Cargo.toml | 1 + crates/gitbutler-command-context/src/lib.rs | 3 + .../src/repository_ext.rs | 50 +++++++++++++++++ crates/gitbutler-diff/src/diff.rs | 3 +- crates/gitbutler-oplog/Cargo.toml | 1 + crates/gitbutler-oplog/src/oplog.rs | 56 ++----------------- 7 files changed, 64 insertions(+), 52 deletions(-) create mode 100644 crates/gitbutler-command-context/src/repository_ext.rs diff --git a/Cargo.lock b/Cargo.lock index b94b5110f6..2991463947 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2211,6 +2211,7 @@ name = "gitbutler-command-context" version = "0.0.0" dependencies = [ "anyhow", + "bstr", "git2", "gitbutler-project", "gix", @@ -2374,6 +2375,7 @@ dependencies = [ "anyhow", "git2", "gitbutler-branch", + "gitbutler-command-context", "gitbutler-diff", "gitbutler-fs", "gitbutler-project", diff --git a/crates/gitbutler-command-context/Cargo.toml b/crates/gitbutler-command-context/Cargo.toml index df826abf78..cd5d076203 100644 --- a/crates/gitbutler-command-context/Cargo.toml +++ b/crates/gitbutler-command-context/Cargo.toml @@ -12,3 +12,4 @@ gix.workspace = true tracing.workspace = true gitbutler-project.workspace = true itertools = "0.13" +bstr = "1.10.0" diff --git a/crates/gitbutler-command-context/src/lib.rs b/crates/gitbutler-command-context/src/lib.rs index ca8a45422f..c4dc09dc6e 100644 --- a/crates/gitbutler-command-context/src/lib.rs +++ b/crates/gitbutler-command-context/src/lib.rs @@ -98,3 +98,6 @@ impl CommandContext { )?) } } + +mod repository_ext; +pub use repository_ext::RepositoryExtLite; diff --git a/crates/gitbutler-command-context/src/repository_ext.rs b/crates/gitbutler-command-context/src/repository_ext.rs new file mode 100644 index 0000000000..58841ea898 --- /dev/null +++ b/crates/gitbutler-command-context/src/repository_ext.rs @@ -0,0 +1,50 @@ +use anyhow::{Context, Result}; +use gix::bstr::{BString, ByteVec}; +use tracing::instrument; + +/// An extension trait that should avoid pulling in large amounts of dependency so it can be used +/// in more places without causing cycles. +/// `gitbutler_repo::RepositoryExt` may not be usable everywhere due to that. +pub trait RepositoryExtLite { + /// Exclude files that are larger than `limit_in_bytes` (eg. database.sql which may never be intended to be committed) + /// so they don't show up in the next diff. + fn ignore_large_files_in_diffs(&self, limit_in_bytes: u64) -> Result<()>; +} + +impl RepositoryExtLite for git2::Repository { + #[instrument(level = tracing::Level::DEBUG, skip(self), err(Debug))] + fn ignore_large_files_in_diffs(&self, limit_in_bytes: u64) -> Result<()> { + use gix::bstr::ByteSlice; + let repo = gix::open(self.path())?; + let worktree_dir = repo + .work_dir() + .context("All repos are expected to have a worktree")?; + let files_to_exclude: Vec<_> = repo + .dirwalk_iter( + repo.index_or_empty()?, + None::, + Default::default(), + repo.dirwalk_options()? + .emit_ignored(None) + .emit_pruned(false) + .emit_untracked(gix::dir::walk::EmissionMode::Matching), + )? + .filter_map(Result::ok) + .filter_map(|item| { + let path = worktree_dir.join(gix::path::from_bstr(item.entry.rela_path.as_bstr())); + let file_is_too_large = path + .metadata() + .map_or(false, |md| md.is_file() && md.len() > limit_in_bytes); + file_is_too_large + .then(|| Vec::from(item.entry.rela_path).into_string().ok()) + .flatten() + }) + .collect(); + // TODO(ST): refactor this to be path-safe and ' ' save - the returned list is space separated (!!) + // Just make sure this isn't needed anymore. + let ignore_list = files_to_exclude.join(" "); + // In-memory, libgit2 internal ignore rule + self.add_ignore_rule(&ignore_list)?; + Ok(()) + } +} diff --git a/crates/gitbutler-diff/src/diff.rs b/crates/gitbutler-diff/src/diff.rs index 393f27c500..0315d8cdb0 100644 --- a/crates/gitbutler-diff/src/diff.rs +++ b/crates/gitbutler-diff/src/diff.rs @@ -3,6 +3,7 @@ use std::{borrow::Cow, collections::HashMap, path::PathBuf, str}; use anyhow::{Context, Result}; use bstr::{BStr, BString, ByteSlice, ByteVec}; use gitbutler_cherry_pick::RepositoryExt; +use gitbutler_command_context::RepositoryExtLite; use gitbutler_serde::BStringForFrontend; use serde::{Deserialize, Serialize}; use tracing::instrument; @@ -155,8 +156,8 @@ pub fn workdir(repo: &git2::Repository, commit_oid: &git2::Oid) -> Result Result { - let repo = gix::open(repo.path())?; - let files_to_exclude: Vec<_> = repo - .dirwalk_iter( - repo.index_or_empty()?, - None::, - Default::default(), - repo.dirwalk_options()? - .emit_ignored(None) - .emit_pruned(false) - .emit_untracked(gix::dir::walk::EmissionMode::Matching), - )? - .filter_map(Result::ok) - .filter_map(|item| { - let path = worktree_dir.join(gix::path::from_bstr(item.entry.rela_path.as_bstr())); - let file_is_too_large = path.metadata().map_or(false, |md| { - md.is_file() && md.len() > SNAPSHOT_FILE_LIMIT_BYTES - }); - file_is_too_large - .then(|| Vec::from(item.entry.rela_path).into_string().ok()) - .flatten() - }) - .collect(); - Ok(files_to_exclude.join(" ")) -} - /// Returns the number of lines of code (added + removed) since the last snapshot in `project`. /// Includes untracked files. /// `repo` is an already opened project repository. @@ -752,13 +712,7 @@ fn lines_since_snapshot(project: &Project, repo: &git2::Repository) -> Result