hasher: fix large file support
Do not use gsize to represent file offsets or file sizes, as it is not
big enough for large files on 32-bit platforms. Note that --hash mode
reads a whole file at once.
cebtenzzre committed Jun 20, 2023
1 parent 25227ae commit dc923ed
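
For context, a minimal standalone sketch of the problem the commit message describes (not part of the commit; the file name and build line are illustrative). On 32-bit platforms gsize is typically 32 bits wide, so a file offset or size past 4 GiB silently wraps when stored in it, while guint64 always holds the full value:

```c
/* gsize_demo.c: hypothetical illustration, not part of this commit.
 * Build: cc gsize_demo.c $(pkg-config --cflags --libs glib-2.0) */
#include <glib.h>
#include <stdio.h>

int main(void) {
    guint64 offset = 5ULL * 1024 * 1024 * 1024; /* 5 GiB, like the test files below */
    gsize narrowed = (gsize)offset;             /* wraps to 1 GiB where gsize is 32 bits */

    printf("sizeof(gsize) = %u bytes\n", (unsigned)sizeof(gsize));
    printf("guint64 offset: %llu\n", (unsigned long long)offset);
    printf("gsize offset:   %llu\n", (unsigned long long)narrowed);
    return 0;
}
```

On a 64-bit build both offsets print the same value; on a 32-bit build the gsize copy prints 1073741824, which is exactly the kind of silent truncation the new test below exercises.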
Showing 4 changed files with 68 additions and 21 deletions.
34 changes: 17 additions & 17 deletions lib/hasher.c
@@ -142,7 +142,7 @@ static void rm_hasher_request_readahead(int fd, RmOff seek_offset, RmOff bytes_t

 static gboolean rm_hasher_symlink_read(RmHasher *hasher, GThreadPool *hashpipe,
                                        RmDigest *digest, char *path,
-                                       gsize *bytes_actually_read) {
+                                       guint64 *bytes_actually_read) {
     /* Read contents of symlink (i.e. path of symlink's target). */

     RmBuffer *buffer = rm_buffer_new(hasher->buf_sem, hasher->buf_size);
@@ -168,8 +168,8 @@ static gboolean rm_hasher_symlink_read(RmHasher *hasher, GThreadPool *hashpipe,
  * increments *bytes_read by the actual bytes read */

 static gboolean rm_hasher_buffered_read(RmHasher *hasher, GThreadPool *hashpipe,
-                                        RmDigest *digest, char *path, gsize start_offset,
-                                        gsize bytes_to_read, gsize *bytes_actually_read) {
+                                        RmDigest *digest, char *path, guint64 start_offset,
+                                        guint64 bytes_to_read, guint64 *bytes_actually_read) {
     FILE *fd = NULL;
     fd = fopen(path, "rb");
     if(fd == NULL) {
@@ -219,10 +219,10 @@ static gboolean rm_hasher_buffered_read(RmHasher *hasher, GThreadPool *hashpipe,
             rm_log_error_line("Unexpected EOF in rm_hasher_buffered_read");
             break;
         } else if(bytes_read == 0) {
-            rm_log_warning_line(_("Something went wrong reading %s; expected %li bytes, "
-                                  "got %li; ignoring"),
-                                path, (long int)bytes_to_read,
-                                (long int)*bytes_actually_read);
+            rm_log_warning_line(_("Something went wrong reading %s; expected %lli bytes, "
+                                  "got %lli; ignoring"),
+                                path, (long long)bytes_to_read,
+                                (long long)*bytes_actually_read);
             break;
         }
     }
@@ -236,8 +236,8 @@ static gboolean rm_hasher_buffered_read(RmHasher *hasher, GThreadPool *hashpipe,

 static gboolean rm_hasher_unbuffered_read(RmHasher *hasher, GThreadPool *hashpipe,
                                           RmDigest *digest, char *path,
-                                          gint64 start_offset, gint64 bytes_to_read,
-                                          gsize *bytes_actually_read) {
+                                          guint64 start_offset, guint64 bytes_to_read,
+                                          guint64 *bytes_actually_read) {
     gint32 bytes_read = 0;
     guint64 file_offset = start_offset;

@@ -276,7 +276,7 @@ static gboolean rm_hasher_unbuffered_read(RmHasher *hasher, GThreadPool *hashpip
     memset(readvec, 0, sizeof(readvec));

     gboolean success = FALSE;
-    gsize bytes_remaining = bytes_to_read;
+    guint64 bytes_remaining = bytes_to_read;

     while(TRUE) {
         /* allocate buffers for preadv */
@@ -329,10 +329,10 @@ static gboolean rm_hasher_unbuffered_read(RmHasher *hasher, GThreadPool *hashpip
             success = TRUE;
             break;
         } else if(bytes_read == 0) {
-            rm_log_error_line(_("Something went wrong reading %s; expected %li bytes, "
-                                "got %li; ignoring"),
-                              path, (long int)bytes_to_read,
-                              (long int)*bytes_actually_read);
+            rm_log_error_line(_("Something went wrong reading %s; expected %lli bytes, "
+                                "got %lli; ignoring"),
+                              path, (long long)bytes_to_read,
+                              (long long)*bytes_actually_read);
             break;
         }
     }
@@ -475,9 +475,9 @@ RmHasherTask *rm_hasher_task_new(RmHasher *hasher, RmDigest *digest,
 }

 gboolean rm_hasher_task_hash(RmHasherTask *task, char *path, guint64 start_offset,
-                             gsize bytes_to_read, gboolean is_symlink,
-                             gsize *bytes_read_out) {
-    gsize bytes_read = 0;
+                             guint64 bytes_to_read, gboolean is_symlink,
+                             guint64 *bytes_read_out) {
+    guint64 bytes_read = 0;
     gboolean success = false;

     if(is_symlink) {
4 changes: 2 additions & 2 deletions lib/hasher.h
@@ -141,9 +141,9 @@ RmHasherTask *rm_hasher_task_new(RmHasher *hasher,
 gboolean rm_hasher_task_hash(RmHasherTask *task,
                              char *path,
                              guint64 start_offset,
-                             size_t bytes_to_read,
+                             guint64 bytes_to_read,
                              gboolean is_symlink,
-                             gsize *bytes_read_out);
+                             guint64 *bytes_read_out);

 /**
  * @brief Finalise a hashing task
2 changes: 1 addition & 1 deletion lib/shredder.c
@@ -1668,7 +1668,7 @@ static gint rm_shred_process_file(RmFile *file, RmSession *session) {
                  (!cfg->shred_never_wait && rm_mds_device_is_rotational(file->disk) &&
                   bytes_to_read < SHRED_TOO_MANY_BYTES_TO_WAIT));

-    gsize bytes_read = 0;
+    guint64 bytes_read = 0;
     RmHasherTask *task = rm_hasher_task_new(tag->hasher, file->digest, file);
     if(!rm_hasher_task_hash(task, file_path, file->hash_offset, bytes_to_read,
                             file->is_symlink, &bytes_read)) {
49 changes: 48 additions & 1 deletion tests/test_robustness/test_bigfiles.py
@@ -1,13 +1,20 @@
 #!/usr/bin/env python3
 # encoding: utf-8
-from nose.plugins.attrib import attr

+import os
+import subprocess
+
 from nose import with_setup
+from nose.plugins.attrib import attr
 from tests.utils import *

 FILE_SIZE_KB = 10000
 DIFFERENT_BYTES = 1
 KBYTES_FROM_END = 10

+LARGE_FILE_SIZE = 5 * 1024**3  # 5 GiB
+

 @with_setup(usual_setup_func, usual_teardown_func)
 def test_bigfiles():

@@ -25,3 +32,43 @@ def test_bigfiles():
     *_, footer = run_rmlint('')
     assert footer['duplicates'] == 1

+
+def _setup_large_file_offset():
+    if not has_feature('bigfiles'):
+        raise_skiptest('rmlint built without large file support')
+
+    path_a = create_file('', 'a')
+    path_b = create_file('', 'b')
+    path_c = create_file('', 'c')
+
+    os.truncate(path_a, 4 * 1024)
+    if os.stat(path_a).st_blocks:
+        # only really works on Linux
+        raise_skiptest('cannot make sparse files with truncate()')
+
+    # allocate large sparse files
+    os.truncate(path_a, LARGE_FILE_SIZE)
+    os.truncate(path_b, LARGE_FILE_SIZE)
+    os.truncate(path_c, LARGE_FILE_SIZE)
+
+    # touch last byte of one file
+    with open(path_a, 'r+') as f:
+        f.seek(LARGE_FILE_SIZE - 1)
+        f.write('x')
+
+    return path_a, path_b, path_c
+
+
+@with_setup(usual_setup_func, usual_teardown_func)
+def test_hash_utility():
+    path_a, path_b, path_c = _setup_large_file_offset()
+
+    # only files 'b' and 'c' should match
+    # metro is chosen because it's faster
+    output = subprocess.check_output([
+        *'./rmlint --hash -a metro'.split(),
+        path_a, path_b, path_c,
+    ])
+    hashes = [l.split()[0] for l in output.splitlines()]
+    assert hashes[0] != hashes[1]  # a != b
+    assert hashes[1] == hashes[2]  # b == c
