From efb0f6e440e0d1c4eaa66123acea857cc4e6893a Mon Sep 17 00:00:00 2001 From: jyn Date: Fri, 2 Feb 2024 01:26:24 -0500 Subject: [PATCH] don't try to diff binary files previously, `jj diff` would show the full contents of binary files such as images. after this change, it instead shows "(binary)". it still shows the filename and metadata so that users can open the file in the viewer of their choce. future work could involve showing binary files as Sixel or similar; finding a way to compare large non-binary files without filling up the screen; or extending the data backends to avoid having to read the whole file contents into memory. --- CHANGELOG.md | 2 + cli/src/config/colors.toml | 1 + cli/src/diff_util.rs | 97 ++++++++++++++++++++++++++++------ cli/tests/common/mod.rs | 3 ++ cli/tests/test_diff_command.rs | 37 +++++++++++++ 5 files changed, 125 insertions(+), 15 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ad2232a3eeb..c4de11c78a2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -44,6 +44,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 * `jj workspace root` was aliased to `jj root`, for ease of discoverability +* `jj diff` no longer shows the contents of binary files. + ### Fixed bugs * Fixed snapshots of symlinks in `gitignore`-d directory. diff --git a/cli/src/config/colors.toml b/cli/src/config/colors.toml index 2be8a32bd7b..0ce0e0a5d63 100644 --- a/cli/src/config/colors.toml +++ b/cli/src/config/colors.toml @@ -31,6 +31,7 @@ "divergent change_id"="red" "conflict" = "red" "empty" = "green" +"binary" = "green" "placeholder" = "red" "description placeholder" = "yellow" "empty description placeholder" = "green" diff --git a/cli/src/diff_util.rs b/cli/src/diff_util.rs index e7cca3650db..52ded4013bc 100644 --- a/cli/src/diff_util.rs +++ b/cli/src/diff_util.rs @@ -346,19 +346,66 @@ fn show_color_words_diff_line( Ok(()) } -fn diff_content(path: &RepoPath, value: MaterializedTreeValue) -> Result, CommandError> { +struct FileContent { + /// false if this file is likely text; true if it is likely binary. + is_binary: bool, + contents: Vec, +} + +impl FileContent { + fn empty() -> Self { + Self { + is_binary: false, + contents: vec![], + } + } + + pub(crate) fn is_empty(&self) -> bool { + self.contents.is_empty() + } +} + +fn file_content_for_diff(reader: &mut dyn io::Read) -> io::Result { + // If this is a binary file, don't show the full contents. + // Determine whether it's binary by whether the first 8k bytes contain a null + // character; this is the same heuristic used by git as of writing: https://github.com/git/git/blob/eea0e59ffbed6e33d171ace5be13cde9faa41639/xdiff-interface.c#L192-L198 + const PEEK_SIZE: usize = 8000; + // TODO: currently we look at the whole file, even though for binary files we + // only need to know the file size. To change that we'd have to extend all + // the data backends to support getting the length. + let mut contents = vec![]; + reader.read_to_end(&mut contents)?; + + let start = &contents[..PEEK_SIZE.min(contents.len())]; + Ok(FileContent { + is_binary: start.contains(&b'\0'), + contents, + }) +} + +fn diff_content( + path: &RepoPath, + value: MaterializedTreeValue, +) -> Result { match value { - MaterializedTreeValue::Absent => Ok(vec![]), + MaterializedTreeValue::Absent => Ok(FileContent::empty()), MaterializedTreeValue::File { mut reader, .. } => { - let mut contents = vec![]; - reader.read_to_end(&mut contents)?; - Ok(contents) - } - MaterializedTreeValue::Symlink { id: _, target } => Ok(target.into_bytes()), - MaterializedTreeValue::GitSubmodule(id) => { - Ok(format!("Git submodule checked out at {}", id.hex()).into_bytes()) + file_content_for_diff(&mut reader).map_err(Into::into) } - MaterializedTreeValue::Conflict { id: _, contents } => Ok(contents), + MaterializedTreeValue::Symlink { id: _, target } => Ok(FileContent { + // Unix file paths can't contain null bytes. + is_binary: false, + contents: target.into_bytes(), + }), + MaterializedTreeValue::GitSubmodule(id) => Ok(FileContent { + is_binary: false, + contents: format!("Git submodule checked out at {}", id.hex()).into_bytes(), + }), + // TODO: are we sure this is never binary? + MaterializedTreeValue::Conflict { id: _, contents } => Ok(FileContent { + is_binary: false, + contents, + }), MaterializedTreeValue::Tree(id) => { panic!("Unexpected tree with id {id:?} in diff at path {path:?}"); } @@ -404,8 +451,10 @@ pub fn show_color_words_diff( let right_content = diff_content(&path, right_value)?; if right_content.is_empty() { writeln!(formatter.labeled("empty"), " (empty)")?; + } else if right_content.is_binary { + writeln!(formatter.labeled("binary"), " (binary)")?; } else { - show_color_words_diff_hunks(&[], &right_content, formatter)?; + show_color_words_diff_hunks(&[], &right_content.contents, formatter)?; } } else if right_value.is_present() { let description = match (&left_value, &right_value) { @@ -458,7 +507,15 @@ pub fn show_color_words_diff( let left_content = diff_content(&path, left_value)?; let right_content = diff_content(&path, right_value)?; writeln!(formatter.labeled("header"), "{description} {ui_path}:")?; - show_color_words_diff_hunks(&left_content, &right_content, formatter)?; + if left_content.is_binary || right_content.is_binary { + writeln!(formatter.labeled("binary"), " (binary)")?; + } else { + show_color_words_diff_hunks( + &left_content.contents, + &right_content.contents, + formatter, + )?; + } } else { let description = basic_diff_file_type(&left_value); writeln!( @@ -468,8 +525,10 @@ pub fn show_color_words_diff( let left_content = diff_content(&path, left_value)?; if left_content.is_empty() { writeln!(formatter.labeled("empty"), " (empty)")?; + } else if left_content.is_binary { + writeln!(formatter.labeled("binary"), " (binary)")?; } else { - show_color_words_diff_hunks(&left_content, &[], formatter)?; + show_color_words_diff_hunks(&left_content.contents, &[], formatter)?; } } } @@ -508,6 +567,7 @@ fn git_diff_part( "100644".to_string() }; hash = id.hex(); + // TODO: use `file_content_for_diff` instead of showing binary contents = vec![]; reader.read_to_end(&mut contents)?; } @@ -810,8 +870,15 @@ struct DiffStat { removed: usize, } -fn get_diff_stat(path: String, left_content: &[u8], right_content: &[u8]) -> DiffStat { - let hunks = unified_diff_hunks(left_content, right_content, 0); +fn get_diff_stat( + path: String, + left_content: &FileContent, + right_content: &FileContent, +) -> DiffStat { + // TODO: this matches git's behavior, which is to count the number of newlines + // in the file. but that behavior seems unhelpful; no one really cares how + // many `0xa0` characters are in an image. + let hunks = unified_diff_hunks(&left_content.contents, &right_content.contents, 0); let mut added = 0; let mut removed = 0; for hunk in hunks { diff --git a/cli/tests/common/mod.rs b/cli/tests/common/mod.rs index c9bad7ea6c0..4b768782e0a 100644 --- a/cli/tests/common/mod.rs +++ b/cli/tests/common/mod.rs @@ -181,6 +181,7 @@ impl TestEnvironment { } /// Run a `jj` command, check that it was successful, and return its stdout + #[track_caller] pub fn jj_cmd_success(&self, current_dir: &Path, args: &[&str]) -> String { if self.debug_allow_stderr { let (stdout, stderr) = self.jj_cmd_ok(current_dir, args); @@ -340,10 +341,12 @@ impl TestEnvironment { } } +#[track_caller] pub fn get_stdout_string(assert: &assert_cmd::assert::Assert) -> String { String::from_utf8(assert.get_output().stdout.clone()).unwrap() } +#[track_caller] pub fn get_stderr_string(assert: &assert_cmd::assert::Assert) -> String { String::from_utf8(assert.get_output().stderr.clone()).unwrap() } diff --git a/cli/tests/test_diff_command.rs b/cli/tests/test_diff_command.rs index 8b567b97cd7..ad3beb89ac3 100644 --- a/cli/tests/test_diff_command.rs +++ b/cli/tests/test_diff_command.rs @@ -941,3 +941,40 @@ fn test_diff_stat_long_name_or_stat() { 2 files changed, 20 insertions(+), 0 deletions(-) "###); } + +#[test] +fn test_diff_binary() { + let test_env = TestEnvironment::default(); + test_env.jj_cmd_ok(test_env.env_root(), &["init", "repo", "--git"]); + let repo_path = test_env.env_root().join("repo"); + + std::fs::write(repo_path.join("file1.png"), b"\x89PNG\r\n\x1a\nabcdefg\0").unwrap(); + std::fs::write(repo_path.join("file2.png"), b"\x89PNG\r\n\x1a\n0123456\0").unwrap(); + test_env.jj_cmd_ok(&repo_path, &["new"]); + std::fs::remove_file(repo_path.join("file1.png")).unwrap(); + std::fs::write(repo_path.join("file2.png"), "foo\nbar\n").unwrap(); + std::fs::write(repo_path.join("file3.png"), b"\x89PNG\r\n\x1a\nxyz\0").unwrap(); + // try a file that's valid UTF-8 but contains control characters + std::fs::write(repo_path.join("file4.png"), b"\0\0\0").unwrap(); + + let stdout = test_env.jj_cmd_success(&repo_path, &["diff"]); + insta::assert_snapshot!(stdout, @r###" + Removed regular file file1.png: + (binary) + Modified regular file file2.png: + (binary) + Added regular file file3.png: + (binary) + Added regular file file4.png: + (binary) + "###); + + let stdout = test_env.jj_cmd_success(&repo_path, &["diff", "--stat"]); + insta::assert_snapshot!(stdout, @r###" + file1.png | 3 --- + file2.png | 5 ++--- + file3.png | 3 +++ + file4.png | 1 + + 4 files changed, 6 insertions(+), 6 deletions(-) + "###); +}