hasher: fix large file support
Do not use gsize to represent file offsets or file sizes, as it is not
big enough for large files on 32-bit platforms. Note that --hash mode
reads a whole file at once.
cebtenzzre committed Jun 20, 2023
1 parent 25227ae commit dc923ed
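
For context, a minimal standalone sketch of the problem the commit message describes (not part of the commit; the file name and build line are illustrative). On 32-bit platforms gsize is typically 32 bits wide, so a file offset or size past 4 GiB silently wraps when stored in it, while guint64 always holds the full value:

```c
/* gsize_demo.c: hypothetical illustration, not part of this commit.
 * Build: cc gsize_demo.c $(pkg-config --cflags --libs glib-2.0) */
#include <glib.h>
#include <stdio.h>

int main(void) {
    guint64 offset = 5ULL * 1024 * 1024 * 1024; /* 5 GiB, like the test files below */
    gsize narrowed = (gsize)offset;             /* wraps to 1 GiB where gsize is 32 bits */

    printf("sizeof(gsize) = %u bytes\n", (unsigned)sizeof(gsize));
    printf("guint64 offset: %llu\n", (unsigned long long)offset);
    printf("gsize offset:   %llu\n", (unsigned long long)narrowed);
    return 0;
}
```

On a 64-bit build both offsets print the same value; on a 32-bit build the gsize copy prints 1073741824, which is exactly the kind of silent truncation the new test below exercises.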
Showing 4 changed files with 68 additions and 21 deletions.
34 changes: 17 additions & 17 deletions lib/hasher.c
@@ -142,7 +142,7 @@ static void rm_hasher_request_readahead(int fd, RmOff seek_offset, RmOff bytes_t

 static gboolean rm_hasher_symlink_read(RmHasher *hasher, GThreadPool *hashpipe,
                                        RmDigest *digest, char *path,
-                                       gsize *bytes_actually_read) {
+                                       guint64 *bytes_actually_read) {
     /* Read contents of symlink (i.e. path of symlink's target). */

     RmBuffer *buffer = rm_buffer_new(hasher->buf_sem, hasher->buf_size);
@@ -168,8 +168,8 @@ static gboolean rm_hasher_symlink_read(RmHasher *hasher, GThreadPool *hashpipe,
  * increments *bytes_read by the actual bytes read */

 static gboolean rm_hasher_buffered_read(RmHasher *hasher, GThreadPool *hashpipe,
-                                        RmDigest *digest, char *path, gsize start_offset,
-                                        gsize bytes_to_read, gsize *bytes_actually_read) {
+                                        RmDigest *digest, char *path, guint64 start_offset,
+                                        guint64 bytes_to_read, guint64 *bytes_actually_read) {
     FILE *fd = NULL;
     fd = fopen(path, "rb");
     if(fd == NULL) {
@@ -219,10 +219,10 @@ static gboolean rm_hasher_buffered_read(RmHasher *hasher, GThreadPool *hashpipe,
             rm_log_error_line("Unexpected EOF in rm_hasher_buffered_read");
             break;
         } else if(bytes_read == 0) {
-            rm_log_warning_line(_("Something went wrong reading %s; expected %li bytes, "
-                                  "got %li; ignoring"),
-                                path, (long int)bytes_to_read,
-                                (long int)*bytes_actually_read);
+            rm_log_warning_line(_("Something went wrong reading %s; expected %lli bytes, "
+                                  "got %lli; ignoring"),
+                                path, (long long)bytes_to_read,
+                                (long long)*bytes_actually_read);
             break;
         }
     }
@@ -236,8 +236,8 @@ static gboolean rm_hasher_buffered_read(RmHasher *hasher, GThreadPool *hashpipe,

 static gboolean rm_hasher_unbuffered_read(RmHasher *hasher, GThreadPool *hashpipe,
                                           RmDigest *digest, char *path,
-                                          gint64 start_offset, gint64 bytes_to_read,
-                                          gsize *bytes_actually_read) {
+                                          guint64 start_offset, guint64 bytes_to_read,
+                                          guint64 *bytes_actually_read) {
     gint32 bytes_read = 0;
     guint64 file_offset = start_offset;

@@ -276,7 +276,7 @@ static gboolean rm_hasher_unbuffered_read(RmHasher *hasher, GThreadPool *hashpip
     memset(readvec, 0, sizeof(readvec));

     gboolean success = FALSE;
-    gsize bytes_remaining = bytes_to_read;
+    guint64 bytes_remaining = bytes_to_read;

     while(TRUE) {
         /* allocate buffers for preadv */
@@ -329,10 +329,10 @@ static gboolean rm_hasher_unbuffered_read(RmHasher *hasher, GThreadPool *hashpip
             success = TRUE;
             break;
         } else if(bytes_read == 0) {
-            rm_log_error_line(_("Something went wrong reading %s; expected %li bytes, "
-                                "got %li; ignoring"),
-                              path, (long int)bytes_to_read,
-                              (long int)*bytes_actually_read);
+            rm_log_error_line(_("Something went wrong reading %s; expected %lli bytes, "
+                                "got %lli; ignoring"),
+                              path, (long long)bytes_to_read,
+                              (long long)*bytes_actually_read);
             break;
         }
     }
@@ -475,9 +475,9 @@ RmHasherTask *rm_hasher_task_new(RmHasher *hasher, RmDigest *digest,
 }

 gboolean rm_hasher_task_hash(RmHasherTask *task, char *path, guint64 start_offset,
-                             gsize bytes_to_read, gboolean is_symlink,
-                             gsize *bytes_read_out) {
-    gsize bytes_read = 0;
+                             guint64 bytes_to_read, gboolean is_symlink,
+                             guint64 *bytes_read_out) {
+    guint64 bytes_read = 0;
     gboolean success = false;

     if(is_symlink) {
4 changes: 2 additions & 2 deletions lib/hasher.h
@@ -141,9 +141,9 @@ RmHasherTask *rm_hasher_task_new(RmHasher *hasher,
 gboolean rm_hasher_task_hash(RmHasherTask *task,
                              char *path,
                              guint64 start_offset,
-                             size_t bytes_to_read,
+                             guint64 bytes_to_read,
                              gboolean is_symlink,
-                             gsize *bytes_read_out);
+                             guint64 *bytes_read_out);

 /**
  * @brief Finalise a hashing task
2 changes: 1 addition & 1 deletion lib/shredder.c
@@ -1668,7 +1668,7 @@ static gint rm_shred_process_file(RmFile *file, RmSession *session) {
                  (!cfg->shred_never_wait && rm_mds_device_is_rotational(file->disk) &&
                   bytes_to_read < SHRED_TOO_MANY_BYTES_TO_WAIT));

-    gsize bytes_read = 0;
+    guint64 bytes_read = 0;
     RmHasherTask *task = rm_hasher_task_new(tag->hasher, file->digest, file);
     if(!rm_hasher_task_hash(task, file_path, file->hash_offset, bytes_to_read,
                             file->is_symlink, &bytes_read)) {
49 changes: 48 additions & 1 deletion tests/test_robustness/test_bigfiles.py
@@ -1,13 +1,20 @@
 #!/usr/bin/env python3
 # encoding: utf-8
-from nose.plugins.attrib import attr

+import os
+import subprocess
+
 from nose import with_setup
+from nose.plugins.attrib import attr
 from tests.utils import *

 FILE_SIZE_KB = 10000
 DIFFERENT_BYTES = 1
 KBYTES_FROM_END = 10

+LARGE_FILE_SIZE = 5 * 1024**3  # 5 GiB
+

 @with_setup(usual_setup_func, usual_teardown_func)
 def test_bigfiles():

@@ -25,3 +32,43 @@ def test_bigfiles():
     *_, footer = run_rmlint('')
     assert footer['duplicates'] == 1

+
+def _setup_large_file_offset():
+    if not has_feature('bigfiles'):
+        raise_skiptest('rmlint built without large file support')
+
+    path_a = create_file('', 'a')
+    path_b = create_file('', 'b')
+    path_c = create_file('', 'c')
+
+    os.truncate(path_a, 4 * 1024)
+    if os.stat(path_a).st_blocks:
+        # only really works on Linux
+        raise_skiptest('cannot make sparse files with truncate()')
+
+    # allocate large sparse files
+    os.truncate(path_a, LARGE_FILE_SIZE)
+    os.truncate(path_b, LARGE_FILE_SIZE)
+    os.truncate(path_c, LARGE_FILE_SIZE)
+
+    # touch last byte of one file
+    with open(path_a, 'r+') as f:
+        f.seek(LARGE_FILE_SIZE - 1)
+        f.write('x')
+
+    return path_a, path_b, path_c
+
+
+@with_setup(usual_setup_func, usual_teardown_func)
+def test_hash_utility():
+    path_a, path_b, path_c = _setup_large_file_offset()
+
+    # only files 'b' and 'c' should match
+    # metro is chosen because it's faster
+    output = subprocess.check_output([
+        *'./rmlint --hash -a metro'.split(),
+        path_a, path_b, path_c,
+    ])
+    hashes = [l.split()[0] for l in output.splitlines()]
+    assert hashes[0] != hashes[1]  # a != b
+    assert hashes[1] == hashes[2]  # b == c
