Skip to content

Commit

Permalink
don't try to diff binary files
Browse files Browse the repository at this point in the history
previously, `jj diff` would show the full contents of binary files such as images.
after this change, it instead shows "(binary)". it still shows the filename and metadata so that
users can open the file in the viewer of their choce.

future work could involve showing binary files as Sixel or similar; finding a way to compare large
non-binary files without filling up the screen; or extending the data backends to avoid having to
read the whole file contents into memory.
  • Loading branch information
jyn514 committed Feb 2, 2024
1 parent 0182595 commit 3f9fbe8
Show file tree
Hide file tree
Showing 3 changed files with 93 additions and 15 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

* `jj workspace root` was aliased to `jj root`, for ease of discoverability

* `jj diff` no longer shows the contents of binary files.

### Fixed bugs

* Fixed snapshots of symlinks in `gitignore`-d directory.
Expand Down
74 changes: 59 additions & 15 deletions cli/src/diff_util.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ use std::io;
use std::ops::Range;

use futures::{try_join, Stream, StreamExt};
use gix::bstr::ByteSlice;
use itertools::Itertools;
use jj_lib::backend::{BackendResult, TreeValue};
use jj_lib::commit::Commit;
Expand Down Expand Up @@ -346,19 +347,52 @@ fn show_color_words_diff_line(
Ok(())
}

fn diff_content(path: &RepoPath, value: MaterializedTreeValue) -> Result<Vec<u8>, CommandError> {
match value {
MaterializedTreeValue::Absent => Ok(vec![]),
MaterializedTreeValue::File { mut reader, .. } => {
let mut contents = vec![];
reader.read_to_end(&mut contents)?;
Ok(contents)
struct FileContent {
/// true if this file is likely text; false if it is definitely binary.
is_binary: bool,
// TODO: maybe this could store an `enum { String, Vec<u8> }` instead? but then we have to UTF-8 validate the whole file.
contents: Vec<u8>,
}

impl FileContent {
pub(crate) fn is_empty(&self) -> bool {
self.contents.is_empty()
}
}

impl From<String> for FileContent {
fn from(contents: String) -> Self {
FileContent {
is_binary: false,
contents: contents.into_bytes()
}
MaterializedTreeValue::Symlink { id: _, target } => Ok(target.into_bytes()),
}
}

fn diff_file_content(reader: &mut dyn io::Read) -> io::Result<FileContent> {
// If this is a binary file, don't show the full contents.
// Determine whether it's binary by whether the first 8k bytes contain a null character;
// this is the same heuristic used by git as of writing: https://github.com/git/git/blob/eea0e59ffbed6e33d171ace5be13cde9faa41639/xdiff-interface.c#L192-L198
const PEEK_SIZE: usize = 8000;
// TODO: currently we look at the whole file, even though for binary files we only need to know the file size.
// To change that we'd have to extend all the data backends to support getting the length.
let mut contents = vec![];
reader.read_to_end(&mut contents)?;

let is_binary = !contents[..PEEK_SIZE.min(contents.len())].is_utf8();
Ok(FileContent { is_binary, contents })
}

fn diff_content(path: &RepoPath, value: MaterializedTreeValue) -> Result<FileContent, CommandError> {
match value {
MaterializedTreeValue::Absent => Ok(String::new().into()),
MaterializedTreeValue::File { mut reader, .. } => diff_file_content(&mut reader).map_err(Into::into),
MaterializedTreeValue::Symlink { id: _, target } => Ok(target.into()),
MaterializedTreeValue::GitSubmodule(id) => {
Ok(format!("Git submodule checked out at {}", id.hex()).into_bytes())
Ok(format!("Git submodule checked out at {}", id.hex()).into())
}
MaterializedTreeValue::Conflict { id: _, contents } => Ok(contents),
// TODO: are we sure this is never binary?
MaterializedTreeValue::Conflict { id: _, contents } => Ok(FileContent { is_binary: false, contents }),
MaterializedTreeValue::Tree(id) => {
panic!("Unexpected tree with id {id:?} in diff at path {path:?}");
}
Expand Down Expand Up @@ -404,8 +438,10 @@ pub fn show_color_words_diff(
let right_content = diff_content(&path, right_value)?;
if right_content.is_empty() {
writeln!(formatter.labeled("empty"), " (empty)")?;
} else if right_content.is_binary {
writeln!(formatter.labeled("empty"), " (binary)")?;
} else {
show_color_words_diff_hunks(&[], &right_content, formatter)?;
show_color_words_diff_hunks(&[], &right_content.contents, formatter)?;
}
} else if right_value.is_present() {
let description = match (&left_value, &right_value) {
Expand Down Expand Up @@ -458,7 +494,11 @@ pub fn show_color_words_diff(
let left_content = diff_content(&path, left_value)?;
let right_content = diff_content(&path, right_value)?;
writeln!(formatter.labeled("header"), "{description} {ui_path}:")?;
show_color_words_diff_hunks(&left_content, &right_content, formatter)?;
if left_content.is_binary || right_content.is_binary {
writeln!(formatter.labeled("empty"), " (binary)")?;
} else {
show_color_words_diff_hunks(&left_content.contents, &right_content.contents, formatter)?;
}
} else {
let description = basic_diff_file_type(&left_value);
writeln!(
Expand All @@ -468,8 +508,10 @@ pub fn show_color_words_diff(
let left_content = diff_content(&path, left_value)?;
if left_content.is_empty() {
writeln!(formatter.labeled("empty"), " (empty)")?;
} else if left_content.is_binary {
writeln!(formatter.labeled("empty"), " (binary)")?;
} else {
show_color_words_diff_hunks(&left_content, &[], formatter)?;
show_color_words_diff_hunks(&left_content.contents, &[], formatter)?;
}
}
}
Expand Down Expand Up @@ -810,8 +852,10 @@ struct DiffStat {
removed: usize,
}

fn get_diff_stat(path: String, left_content: &[u8], right_content: &[u8]) -> DiffStat {
let hunks = unified_diff_hunks(left_content, right_content, 0);
fn get_diff_stat(path: String, left_content: &FileContent, right_content: &FileContent) -> DiffStat {
// TODO: this matches git's behavior, which is to count the number of newlines in the file.
// but that behavior seems unhelpful; no one really cares how many `0xa0` characters are in an image.
let hunks = unified_diff_hunks(&left_content.contents, &right_content.contents, 0);
let mut added = 0;
let mut removed = 0;
for hunk in hunks {
Expand Down
32 changes: 32 additions & 0 deletions cli/tests/test_diff_command.rs
Original file line number Diff line number Diff line change
Expand Up @@ -941,3 +941,35 @@ fn test_diff_stat_long_name_or_stat() {
2 files changed, 20 insertions(+), 0 deletions(-)
"###);
}

#[test]
fn test_diff_binary() {
let test_env = TestEnvironment::default();
test_env.jj_cmd_ok(test_env.env_root(), &["init", "repo", "--git"]);
let repo_path = test_env.env_root().join("repo");

std::fs::write(repo_path.join("file1.png"), b"\x89PNG\r\n\x1a\nabcdefg").unwrap();
std::fs::write(repo_path.join("file2.png"), b"\x89PNG\r\n\x1a\n0123456").unwrap();
test_env.jj_cmd_ok(&repo_path, &["new"]);
std::fs::remove_file(repo_path.join("file1.png")).unwrap();
std::fs::write(repo_path.join("file2.png"), "foo\nbar\n").unwrap();
std::fs::write(repo_path.join("file3.png"), b"\x89PNG\r\n\x1a\nxyz").unwrap();

let stdout = test_env.jj_cmd_success(&repo_path, &["diff"]);
insta::assert_snapshot!(stdout, @r###"
Removed regular file file1.png:
(binary)
Modified regular file file2.png:
(binary)
Added regular file file3.png:
(binary)
"###);

let stdout = test_env.jj_cmd_success(&repo_path, &["diff", "--stat"]);
insta::assert_snapshot!(stdout, @r###"
file1.png | 3 ---
file2.png | 5 ++---
file3.png | 3 +++
3 files changed, 5 insertions(+), 6 deletions(-)
"###);
}

0 comments on commit 3f9fbe8

Please sign in to comment.