Skip to content

Commit

Permalink
git_backend: on gc(), remove unreachable no-gc refs and compact them
Browse files Browse the repository at this point in the history
With my jj repo, the number of jj/keep refs went down from 87887 to 27733.
The .git directory size is halved, but we'll still need to clean up the extras
and index files to save more disk space. "git gc --prune=now && jj debug
reindex" passed afterwards, so the repo doesn't appear to be corrupted.

jj-vcs#12
  • Loading branch information
yuja committed Jan 18, 2024
1 parent 80f5160 commit 3605edc
Show file tree
Hide file tree
Showing 3 changed files with 287 additions and 4 deletions.
6 changes: 3 additions & 3 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### New features

* New `jj op abandon` command is added to clean up the operation history. If GC
is implemented, Git refs and commit objects can be compacted.
* New `jj op abandon` command is added to clean up the operation history. Git
refs and commit objects can be further compacted by `jj util gc`.

* `jj util gc` now removes unreachable operation and view objects.
* `jj util gc` now removes unreachable operation, view, and Git objects.

* `jj branch rename` will now warn if the renamed branch has a remote branch, since
those will have to be manually renamed outside of `jj`.
Expand Down
96 changes: 95 additions & 1 deletion lib/src/git_backend.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#![allow(missing_docs)]

use std::any::Any;
use std::collections::HashSet;
use std::fmt::{Debug, Error, Formatter};
use std::io::{Cursor, Read};
use std::path::{Path, PathBuf};
Expand Down Expand Up @@ -606,6 +607,88 @@ fn to_no_gc_ref_update(id: &CommitId) -> gix::refs::transaction::RefEdit {
}
}

/// Builds a ref edit that deletes `git_ref`.
///
/// The edit expects the ref to still exist with the target recorded in
/// `git_ref` (`PreviousValue::ExistingMustMatch`), and is requested with
/// `RefLog::AndReference`.
fn to_ref_deletion(git_ref: gix::refs::Reference) -> gix::refs::transaction::RefEdit {
    use gix::refs::transaction::{Change, PreviousValue, RefEdit, RefLog};

    let gix::refs::Reference { name, target, .. } = git_ref;
    RefEdit {
        change: Change::Delete {
            expected: PreviousValue::ExistingMustMatch(target),
            log: RefLog::AndReference,
        },
        name,
        deref: false,
    }
}

/// Recreates `refs/jj/keep` refs for the `new_heads`, and removes the other
/// unreachable and non-head refs.
///
/// An existing no-gc ref is left alone if it is a correctly-named ref of one
/// of the `new_heads`, or if its loose ref file was modified after
/// `keep_newer`. Everything else is deleted. Deletions and the refs for
/// `new_heads` are submitted in a single `edit_references()` call.
///
/// Returns an error if the refs can't be enumerated or edited, or if a
/// symbolic ref is found in the no-gc namespace.
fn recreate_no_gc_refs(
    git_repo: &gix::Repository,
    new_heads: impl IntoIterator<Item = CommitId>,
    keep_newer: SystemTime,
) -> Result<(), BackendError> {
    // Calculate diff between existing no-gc refs and new heads.
    let head_ids: HashSet<CommitId> = new_heads.into_iter().collect();
    let mut kept_count: usize = 0;
    let mut stale_refs: Vec<gix::refs::Reference> = Vec::new();
    let ref_platform = git_repo
        .references()
        .map_err(|err| BackendError::Other(err.into()))?;
    let existing_no_gc_refs = ref_platform
        .prefixed(NO_GC_REF_NAMESPACE)
        .map_err(|err| BackendError::Other(err.into()))?;
    for entry in existing_no_gc_refs {
        let git_ref = entry.map_err(BackendError::Other)?.detach();
        let oid = match git_ref.target.try_id() {
            Some(oid) => oid,
            None => {
                let name = git_ref.name.as_bstr();
                let message = format!("Symbolic no-gc ref found: {name}");
                return Err(BackendError::Other(message.into()));
            }
        };
        let id = CommitId::from_bytes(oid.as_bytes());
        // A well-formed no-gc ref is named after the hex id of its target.
        let name_good = git_ref.name.as_bstr()[NO_GC_REF_NAMESPACE.len()..] == id.hex();
        if name_good && head_ids.contains(&id) {
            kept_count += 1;
            continue;
        }

        // Check timestamp of loose ref, but this is still racy on re-import
        // because:
        // - existing packed ref won't be demoted to loose ref
        // - existing loose ref won't be touched
        //
        // TODO: might be better to switch to a dummy merge, where new no-gc ref
        // will always have a unique name. Doing that with the current
        // ref-per-head strategy would increase the number of the no-gc refs.
        // https://github.com/martinvonz/jj/pull/2659#issuecomment-1837057782
        let loose_ref_path = git_repo.path().join(git_ref.name.to_path());
        // If the metadata can't be read (e.g. no loose ref file exists), the
        // ref gets no protection from `keep_newer`.
        let modified_recently = match loose_ref_path.metadata() {
            Ok(metadata) => metadata.modified().expect("unsupported platform?") > keep_newer,
            Err(_) => false,
        };
        if modified_recently {
            tracing::trace!(?git_ref, "not deleting new");
            kept_count += 1;
            continue;
        }

        // Also deletes no-gc ref of random name created by old jj.
        tracing::trace!(?git_ref, ?name_good, "will delete");
        stale_refs.push(git_ref);
    }
    tracing::info!(
        new_heads_count = head_ids.len(),
        no_gc_refs_to_keep_count = kept_count,
        no_gc_refs_to_delete_count = stale_refs.len(),
        "collected reachable refs"
    );

    // It's slow to delete packed refs one by one, so update refs all at once.
    let deletions = stale_refs.into_iter().map(to_ref_deletion);
    let updates = head_ids.iter().map(to_no_gc_ref_update);
    git_repo
        .edit_references(deletions.chain(updates))
        .map_err(|err| BackendError::Other(err.into()))?;

    Ok(())
}

fn run_git_gc(git_dir: &Path) -> Result<(), GitGcError> {
let mut git = Command::new("git");
git.arg("--git-dir=."); // turn off discovery
Expand Down Expand Up @@ -1082,7 +1165,18 @@ impl Backend for GitBackend {
Ok((id, contents))
}

fn gc(&self, _index: &dyn Index, _keep_newer: SystemTime) -> BackendResult<()> {
/// Recreates the no-gc refs from the heads reported by `index` (excluding
/// the root commit), then calls `run_git_gc()` on the Git repository path.
///
/// `keep_newer` is forwarded to `recreate_no_gc_refs()`.
#[tracing::instrument(skip(self, index))]
fn gc(&self, index: &dyn Index, keep_newer: SystemTime) -> BackendResult<()> {
    // The locked repo is held until the end of the function, including
    // while `run_git_gc()` runs.
    let git_repo = self.lock_git_repo();
    let all_heads = index
        .all_heads_for_gc()
        .map_err(|err| BackendError::Other(err.into()))?;
    let new_heads = all_heads.filter(|id| id != &self.root_commit_id);
    recreate_no_gc_refs(&git_repo, new_heads, keep_newer)?;
    // TODO: remove unreachable entries from extras table if segment file
    // mtime <= keep_newer? (it won't be consistent with no-gc refs
    // preserved by the keep_newer timestamp though)
    // TODO: remove unreachable extras table segments
    // TODO: pass in keep_newer to "git gc" command
    run_git_gc(self.git_repo_path()).map_err(|err| BackendError::Other(err.into()))?;
    Ok(())
}
Expand Down
189 changes: 189 additions & 0 deletions lib/tests/test_git_backend.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
// Copyright 2024 The Jujutsu Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use std::collections::HashSet;
use std::process::Command;
use std::sync::Arc;
use std::time::SystemTime;

use jj_lib::backend::CommitId;
use jj_lib::git_backend::GitBackend;
use jj_lib::repo::{ReadonlyRepo, Repo};
use maplit::hashset;
use testutils::{create_random_commit, CommitGraphBuilder, TestRepo, TestRepoBackend};

/// Returns the `GitBackend` behind the store of `repo`.
///
/// Panics if the backend of the store isn't a `GitBackend`.
fn get_git_backend(repo: &Arc<ReadonlyRepo>) -> &GitBackend {
    let backend = repo.store().backend_impl();
    backend.downcast_ref::<GitBackend>().unwrap()
}

/// Returns the `gix::Repository` obtained from the `GitBackend` of `repo`.
fn get_git_repo(repo: &Arc<ReadonlyRepo>) -> gix::Repository {
    let backend = get_git_backend(repo);
    backend.git_repo()
}

/// Collects the ids pointed to by all refs under `refs/jj/keep/` as a set of
/// `CommitId`s.
///
/// Panics if the refs can't be listed or read.
fn collect_no_gc_refs(git_repo: &gix::Repository) -> HashSet<CommitId> {
    let git_refs = git_repo.references().unwrap();
    let mut commit_ids = HashSet::new();
    for git_ref in git_refs.prefixed("refs/jj/keep/").unwrap() {
        let git_ref = git_ref.unwrap();
        commit_ids.insert(CommitId::from_bytes(git_ref.id().as_bytes()));
    }
    commit_ids
}

/// Exercises `gc()` of the Git backend with progressively smaller indexes and
/// checks which `refs/jj/keep/` refs survive each run.
#[test]
fn test_gc() {
    // TODO: Better way to disable the test if git command couldn't be executed
    if Command::new("git").arg("--version").status().is_err() {
        eprintln!("Skipping because git command might fail to run");
        return;
    }

    let settings = testutils::user_settings();
    let test_repo = TestRepo::init_with_backend(TestRepoBackend::Git);
    let repo = test_repo.repo;
    let git_repo = get_git_repo(&repo);
    // Index taken before any of the test commits are created.
    let base_index = repo.readonly_index();

    // Set up commits:
    //
    //     H (predecessor: D)
    //   G |
    //   |\|
    //   | F
    //   E |
    // D | |
    // C |/
    // |/
    // B
    // A
    let mut tx = repo.start_transaction(&settings);
    let mut graph_builder = CommitGraphBuilder::new(&settings, tx.mut_repo());
    let commit_a = graph_builder.initial_commit();
    let commit_b = graph_builder.commit_with_parents(&[&commit_a]);
    let commit_c = graph_builder.commit_with_parents(&[&commit_b]);
    let commit_d = graph_builder.commit_with_parents(&[&commit_c]);
    let commit_e = graph_builder.commit_with_parents(&[&commit_b]);
    let commit_f = graph_builder.commit_with_parents(&[&commit_b]);
    let commit_g = graph_builder.commit_with_parents(&[&commit_e, &commit_f]);
    let commit_h = create_random_commit(tx.mut_repo(), &settings)
        .set_parents(vec![commit_f.id().clone()])
        .set_predecessors(vec![commit_d.id().clone()])
        .write()
        .unwrap();
    let repo = tx.commit("test");

    // The view heads are D, G and H. The same set is expected again below
    // once gc() has run with the full index.
    let head_ids = hashset! {
        commit_d.id().clone(),
        commit_g.id().clone(),
        commit_h.id().clone(),
    };
    assert_eq!(*repo.view().heads(), head_ids);

    // At first, all commits have no-gc refs
    let all_commit_ids: HashSet<CommitId> = [
        &commit_a, &commit_b, &commit_c, &commit_d, &commit_e, &commit_f, &commit_g, &commit_h,
    ]
    .into_iter()
    .map(|commit| commit.id().clone())
    .collect();
    assert_eq!(collect_no_gc_refs(&git_repo), all_commit_ids);

    // Empty index, but all kept by file modification time
    // (Beware that this invokes "git gc" and refs will be packed.)
    repo.store()
        .gc(base_index.as_index(), SystemTime::UNIX_EPOCH)
        .unwrap();
    assert_eq!(collect_no_gc_refs(&git_repo), all_commit_ids);

    // All reachable: redundant no-gc refs will be removed
    let now = SystemTime::now();
    repo.store().gc(repo.index(), now).unwrap();
    assert_eq!(collect_no_gc_refs(&git_repo), head_ids);

    // G is no longer reachable
    let mut mut_index = base_index.start_modification();
    for commit in [
        &commit_a, &commit_b, &commit_c, &commit_d, &commit_e, &commit_f, &commit_h,
    ] {
        mut_index.add_commit(commit);
    }
    repo.store().gc(mut_index.as_index(), now).unwrap();
    let heads_without_g = hashset! {
        commit_d.id().clone(),
        commit_e.id().clone(),
        commit_h.id().clone(),
    };
    assert_eq!(collect_no_gc_refs(&git_repo), heads_without_g);

    // D|E|H are no longer reachable
    let mut mut_index = base_index.start_modification();
    for commit in [&commit_a, &commit_b, &commit_c, &commit_f] {
        mut_index.add_commit(commit);
    }
    repo.store().gc(mut_index.as_index(), now).unwrap();
    let heads_c_and_f = hashset! {
        commit_c.id().clone(),
        commit_f.id().clone(),
    };
    assert_eq!(collect_no_gc_refs(&git_repo), heads_c_and_f);

    // B|C|F are no longer reachable
    let mut mut_index = base_index.start_modification();
    mut_index.add_commit(&commit_a);
    repo.store().gc(mut_index.as_index(), now).unwrap();
    let only_a = hashset! {
        commit_a.id().clone(),
    };
    assert_eq!(collect_no_gc_refs(&git_repo), only_a);

    // All unreachable
    repo.store().gc(base_index.as_index(), now).unwrap();
    assert_eq!(collect_no_gc_refs(&git_repo), HashSet::new());
}

0 comments on commit 3605edc

Please sign in to comment.