Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add file path -> crc32 algorithm #23

Merged
merged 2 commits into from
Aug 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 1 addition & 4 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,7 @@ jobs:
matrix:
include:
- python-version: "3.11"
minio: "2021-04-22T15-44-28Z" # minimum supported version
mc: "2021-04-22T17-40-00Z"
- python-version: "3.11"
minio: "2024-08-03T04-33-23Z" # current version as of this update
minio: "2024-08-17T01-24-54Z" # minimum supported version
mc: "2024-08-13T05-33-17Z"

steps:
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ Enables running jobs on remote compute from the KBase CDM cluster.
AWS key management service.
* The provided credentials must enable listing buckets, as the service performs that operation
to check the host and credentials on startup
* If using Minio, the minimum version is `2021-04-22T15-44-28Z` and the server must be run
* If using Minio, the minimum version is `2024-08-17T01-24-54Z` and the server must be run
in `--compat` mode.

## Development
Expand Down
24 changes: 21 additions & 3 deletions cdmtaskservice/s3/remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@

from hashlib import md5
from pathlib import Path
import zlib

_CHUNK_SIZE_64KB = 2 ** 16


def calculate_etag(infile: Path, partsize: int) -> str:
Expand Down Expand Up @@ -40,16 +43,31 @@ def calculate_etag(infile: Path, partsize: int) -> str:
# and the 2nd none.

# Not really a good way to test the expanduser calls
if not infile or not infile.expanduser().is_file():
raise ValueError("infile must be exist and be a file")
_check_file(infile)
if partsize < 1:
raise ValueError("partsize must be > 0")
md5_digests = []
with open(infile.expanduser(), 'rb') as f:
for chunk in iter(lambda: f.read(partsize), b''):
while chunk := f.read(partsize):
md5_digests.append(md5(chunk).digest())
if len(md5_digests) == 0:
raise ValueError("file is empty")
if len(md5_digests) == 1:
return md5_digests[0].hex()
return md5(b''.join(md5_digests)).hexdigest() + '-' + str(len(md5_digests))


def crc32(infile: Path) -> bytes:
    """
    Compute the CRC-32 checksum of the contents of the given file.

    The file is read in fixed-size chunks and the checksum is updated
    incrementally, so the whole file is never held in memory at once.

    infile - the file to checksum. It must exist and be a file.

    Returns the checksum as 4 bytes in big-endian order, so that calling
    hex() on the result gives the usual 8 character hexadecimal form.
    An empty file yields four zero bytes.

    Raises ValueError if infile is missing or is not a file.
    """
    # adapted from https://stackoverflow.com/a/59974585/643675
    # Not really a good way to test the expanduser calls
    _check_file(infile)
    checksum = 0
    with open(infile.expanduser(), "rb") as f:
        while chunk := f.read(_CHUNK_SIZE_64KB):
            # feed the running value back in so the result covers the whole file
            checksum = zlib.crc32(chunk, checksum)
    # State the byte order explicitly: the default for to_bytes() only exists in
    # Python 3.11+, and the big-endian layout is part of this function's contract.
    return checksum.to_bytes(4, byteorder="big")

def _check_file(infile: Path):
    """Raise a ValueError unless infile is provided and points at an existing file."""
    # expanduser() resolves a leading ~ before the filesystem is consulted
    if infile and infile.expanduser().is_file():
        return
    raise ValueError("infile must be exist and be a file")
24 changes: 23 additions & 1 deletion test/s3/remote_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from pytest import raises

from conftest import assert_exception_correct
from cdmtaskservice.s3.remote import calculate_etag
from cdmtaskservice.s3.remote import calculate_etag, crc32

TESTDATA = Path(os.path.normpath((Path(__file__) / ".." / ".." / "testdata")))

Expand Down Expand Up @@ -35,3 +35,25 @@ def test_calculate_etag_fail():
calculate_etag(infile, size)
assert_exception_correct(got.value, expected)


def test_crc32():
    """Check crc32() output against known checksums for the test data files."""
    # expected values were checked with the linux crc32 program
    expected_by_filename = {
        "empty_file": "00000000",
        "random_bytes_1kB": "ed9a6eb3",
        "random_bytes_10kB": "4ffc5208",
    }
    for filename, expected_hex in expected_by_filename.items():
        checksum = crc32(TESTDATA / filename)
        assert checksum.hex() == expected_hex


def test_crc32_fail():
    """Check crc32() rejects a missing input and a directory with a ValueError."""
    # both bad inputs are expected to produce the same error
    errmsg = "infile must be exist and be a file"
    for bad_input in (None, TESTDATA):
        with raises(Exception) as got:
            crc32(bad_input)
        assert_exception_correct(got.value, ValueError(errmsg))
Loading