Skip to content

Commit

Permalink
don't try to diff binary files
Browse files Browse the repository at this point in the history
previously, `jj diff` would show the full contents of binary files such as images.
after this change, it instead shows "(binary)". it still shows the filename and metadata so that
users can open the file in the viewer of their choce.

future work could involve showing binary files as Sixel or similar; finding a way to compare large
non-binary files without filling up the screen; or extending the data backends to avoid having to
read the whole file contents into memory.
  • Loading branch information
jyn514 committed Feb 3, 2024
1 parent 0182595 commit efb0f6e
Show file tree
Hide file tree
Showing 5 changed files with 125 additions and 15 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

* `jj workspace root` was aliased to `jj root`, for ease of discoverability

* `jj diff` no longer shows the contents of binary files.

### Fixed bugs

* Fixed snapshots of symlinks in `gitignore`-d directory.
Expand Down
1 change: 1 addition & 0 deletions cli/src/config/colors.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
"divergent change_id"="red"
"conflict" = "red"
"empty" = "green"
"binary" = "green"
"placeholder" = "red"
"description placeholder" = "yellow"
"empty description placeholder" = "green"
Expand Down
97 changes: 82 additions & 15 deletions cli/src/diff_util.rs
Original file line number Diff line number Diff line change
Expand Up @@ -346,19 +346,66 @@ fn show_color_words_diff_line(
Ok(())
}

fn diff_content(path: &RepoPath, value: MaterializedTreeValue) -> Result<Vec<u8>, CommandError> {
struct FileContent {
/// false if this file is likely text; true if it is likely binary.
is_binary: bool,
contents: Vec<u8>,
}

impl FileContent {
fn empty() -> Self {
Self {
is_binary: false,
contents: vec![],
}
}

pub(crate) fn is_empty(&self) -> bool {
self.contents.is_empty()
}
}

fn file_content_for_diff(reader: &mut dyn io::Read) -> io::Result<FileContent> {
// If this is a binary file, don't show the full contents.
// Determine whether it's binary by whether the first 8k bytes contain a null
// character; this is the same heuristic used by git as of writing: https://github.com/git/git/blob/eea0e59ffbed6e33d171ace5be13cde9faa41639/xdiff-interface.c#L192-L198
const PEEK_SIZE: usize = 8000;
// TODO: currently we look at the whole file, even though for binary files we
// only need to know the file size. To change that we'd have to extend all
// the data backends to support getting the length.
let mut contents = vec![];
reader.read_to_end(&mut contents)?;

let start = &contents[..PEEK_SIZE.min(contents.len())];
Ok(FileContent {
is_binary: start.contains(&b'\0'),
contents,
})
}

fn diff_content(
path: &RepoPath,
value: MaterializedTreeValue,
) -> Result<FileContent, CommandError> {
match value {
MaterializedTreeValue::Absent => Ok(vec![]),
MaterializedTreeValue::Absent => Ok(FileContent::empty()),
MaterializedTreeValue::File { mut reader, .. } => {
let mut contents = vec![];
reader.read_to_end(&mut contents)?;
Ok(contents)
}
MaterializedTreeValue::Symlink { id: _, target } => Ok(target.into_bytes()),
MaterializedTreeValue::GitSubmodule(id) => {
Ok(format!("Git submodule checked out at {}", id.hex()).into_bytes())
file_content_for_diff(&mut reader).map_err(Into::into)
}
MaterializedTreeValue::Conflict { id: _, contents } => Ok(contents),
MaterializedTreeValue::Symlink { id: _, target } => Ok(FileContent {
// Unix file paths can't contain null bytes.
is_binary: false,
contents: target.into_bytes(),
}),
MaterializedTreeValue::GitSubmodule(id) => Ok(FileContent {
is_binary: false,
contents: format!("Git submodule checked out at {}", id.hex()).into_bytes(),
}),
// TODO: are we sure this is never binary?
MaterializedTreeValue::Conflict { id: _, contents } => Ok(FileContent {
is_binary: false,
contents,
}),
MaterializedTreeValue::Tree(id) => {
panic!("Unexpected tree with id {id:?} in diff at path {path:?}");
}
Expand Down Expand Up @@ -404,8 +451,10 @@ pub fn show_color_words_diff(
let right_content = diff_content(&path, right_value)?;
if right_content.is_empty() {
writeln!(formatter.labeled("empty"), " (empty)")?;
} else if right_content.is_binary {
writeln!(formatter.labeled("binary"), " (binary)")?;
} else {
show_color_words_diff_hunks(&[], &right_content, formatter)?;
show_color_words_diff_hunks(&[], &right_content.contents, formatter)?;
}
} else if right_value.is_present() {
let description = match (&left_value, &right_value) {
Expand Down Expand Up @@ -458,7 +507,15 @@ pub fn show_color_words_diff(
let left_content = diff_content(&path, left_value)?;
let right_content = diff_content(&path, right_value)?;
writeln!(formatter.labeled("header"), "{description} {ui_path}:")?;
show_color_words_diff_hunks(&left_content, &right_content, formatter)?;
if left_content.is_binary || right_content.is_binary {
writeln!(formatter.labeled("binary"), " (binary)")?;
} else {
show_color_words_diff_hunks(
&left_content.contents,
&right_content.contents,
formatter,
)?;
}
} else {
let description = basic_diff_file_type(&left_value);
writeln!(
Expand All @@ -468,8 +525,10 @@ pub fn show_color_words_diff(
let left_content = diff_content(&path, left_value)?;
if left_content.is_empty() {
writeln!(formatter.labeled("empty"), " (empty)")?;
} else if left_content.is_binary {
writeln!(formatter.labeled("binary"), " (binary)")?;
} else {
show_color_words_diff_hunks(&left_content, &[], formatter)?;
show_color_words_diff_hunks(&left_content.contents, &[], formatter)?;
}
}
}
Expand Down Expand Up @@ -508,6 +567,7 @@ fn git_diff_part(
"100644".to_string()
};
hash = id.hex();
// TODO: use `file_content_for_diff` instead of showing binary
contents = vec![];
reader.read_to_end(&mut contents)?;
}
Expand Down Expand Up @@ -810,8 +870,15 @@ struct DiffStat {
removed: usize,
}

fn get_diff_stat(path: String, left_content: &[u8], right_content: &[u8]) -> DiffStat {
let hunks = unified_diff_hunks(left_content, right_content, 0);
fn get_diff_stat(
path: String,
left_content: &FileContent,
right_content: &FileContent,
) -> DiffStat {
// TODO: this matches git's behavior, which is to count the number of newlines
// in the file. but that behavior seems unhelpful; no one really cares how
// many `0xa0` characters are in an image.
let hunks = unified_diff_hunks(&left_content.contents, &right_content.contents, 0);
let mut added = 0;
let mut removed = 0;
for hunk in hunks {
Expand Down
3 changes: 3 additions & 0 deletions cli/tests/common/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,7 @@ impl TestEnvironment {
}

/// Run a `jj` command, check that it was successful, and return its stdout
#[track_caller]
pub fn jj_cmd_success(&self, current_dir: &Path, args: &[&str]) -> String {
if self.debug_allow_stderr {
let (stdout, stderr) = self.jj_cmd_ok(current_dir, args);
Expand Down Expand Up @@ -340,10 +341,12 @@ impl TestEnvironment {
}
}

#[track_caller]
pub fn get_stdout_string(assert: &assert_cmd::assert::Assert) -> String {
String::from_utf8(assert.get_output().stdout.clone()).unwrap()
}

#[track_caller]
pub fn get_stderr_string(assert: &assert_cmd::assert::Assert) -> String {
String::from_utf8(assert.get_output().stderr.clone()).unwrap()
}
Expand Down
37 changes: 37 additions & 0 deletions cli/tests/test_diff_command.rs
Original file line number Diff line number Diff line change
Expand Up @@ -941,3 +941,40 @@ fn test_diff_stat_long_name_or_stat() {
2 files changed, 20 insertions(+), 0 deletions(-)
"###);
}

#[test]
fn test_diff_binary() {
let test_env = TestEnvironment::default();
test_env.jj_cmd_ok(test_env.env_root(), &["init", "repo", "--git"]);
let repo_path = test_env.env_root().join("repo");

std::fs::write(repo_path.join("file1.png"), b"\x89PNG\r\n\x1a\nabcdefg\0").unwrap();
std::fs::write(repo_path.join("file2.png"), b"\x89PNG\r\n\x1a\n0123456\0").unwrap();
test_env.jj_cmd_ok(&repo_path, &["new"]);
std::fs::remove_file(repo_path.join("file1.png")).unwrap();
std::fs::write(repo_path.join("file2.png"), "foo\nbar\n").unwrap();
std::fs::write(repo_path.join("file3.png"), b"\x89PNG\r\n\x1a\nxyz\0").unwrap();
// try a file that's valid UTF-8 but contains control characters
std::fs::write(repo_path.join("file4.png"), b"\0\0\0").unwrap();

let stdout = test_env.jj_cmd_success(&repo_path, &["diff"]);
insta::assert_snapshot!(stdout, @r###"
Removed regular file file1.png:
(binary)
Modified regular file file2.png:
(binary)
Added regular file file3.png:
(binary)
Added regular file file4.png:
(binary)
"###);

let stdout = test_env.jj_cmd_success(&repo_path, &["diff", "--stat"]);
insta::assert_snapshot!(stdout, @r###"
file1.png | 3 ---
file2.png | 5 ++---
file3.png | 3 +++
file4.png | 1 +
4 files changed, 6 insertions(+), 6 deletions(-)
"###);
}

0 comments on commit efb0f6e

Please sign in to comment.