From f18c6ec601c5b327a4ed78acf810220330467c8c Mon Sep 17 00:00:00 2001 From: Gavin Date: Mon, 26 Aug 2024 18:09:40 -0700 Subject: [PATCH 1/2] Add file path -> crc32 algorithm Can use the base64 encoded crc to validate uploads like so: ``` In [56]: fields Out[56]: {'key': 'somefile', 'AWSAccessKeyId': 'admin', 'policy': 'eyJleHBpcmF0aW9uIjogIjIwMjQtMDgtMjZUMjM6MzE6MDBaIiwgImNvbmRpdGlvbnMiOiBbeyJidWNrZXQiOiAidGVzdCJ9LCB7ImtleSI6ICJzb21lZmlsZSJ9XX0=', 'signature': 'Ys4tVhnbR1fUrMgpm+1DRxXsWnY=', 'x-amz-checksum-crc32': b'bwUq9A=='} In [57]: with open("Dockerfile") as f: ...: files = {"file": ("somefile", f)} ...: res = requests.post(url["url"], data=fields, files=files) ``` --- .github/workflows/test.yml | 5 +---- README.md | 2 +- cdmtaskservice/s3/remote.py | 22 +++++++++++++++++++--- test/s3/remote_test.py | 24 +++++++++++++++++++++++- 4 files changed, 44 insertions(+), 9 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 784436b..095aa64 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -22,10 +22,7 @@ jobs: matrix: include: - python-version: "3.11" - minio: "2021-04-22T15-44-28Z" # minimum supported version - mc: "2021-04-22T17-40-00Z" - - python-version: "3.11" - minio: "2024-08-03T04-33-23Z" # current version as of this update + minio: "2024-08-17T01-24-54Z" # minimum supported version mc: "2024-08-13T05-33-17Z" steps: diff --git a/README.md b/README.md index d4b0525..08016ce 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ Enables running jobs on remote compute from the KBase CDM cluster. AWS key management service. * The provided credentials must enable listing buckets, as the service performs that operation to check the host and credentials on startup -* If using Minio, the minimum version is `2021-04-22T15-44-28Z` and the server must be run +* If using Minio, the minimum version is `2024-08-17T01-24-54Z` and the server must be run in `--compat` mode. ## Development diff --git a/cdmtaskservice/s3/remote.py b/cdmtaskservice/s3/remote.py index b910cd7..d153f79 100644 --- a/cdmtaskservice/s3/remote.py +++ b/cdmtaskservice/s3/remote.py @@ -8,6 +8,7 @@ from hashlib import md5 from pathlib import Path +import zlib def calculate_etag(infile: Path, partsize: int) -> str: @@ -40,16 +41,31 @@ def calculate_etag(infile: Path, partsize: int) -> str: # and the 2nd none. # Not really a good way to test the expanduser calls - if not infile or not infile.expanduser().is_file(): - raise ValueError("infile must be exist and be a file") + _check_file(infile) if partsize < 1: raise ValueError("partsize must be > 0") md5_digests = [] with open(infile.expanduser(), 'rb') as f: - for chunk in iter(lambda: f.read(partsize), b''): + while chunk := f.read(partsize): md5_digests.append(md5(chunk).digest()) if len(md5_digests) == 0: raise ValueError("file is empty") if len(md5_digests) == 1: return md5_digests[0].hex() return md5(b''.join(md5_digests)).hexdigest() + '-' + str(len(md5_digests)) + + +def crc32(infile: Path) -> bytes: + """Compute the CRC-32 checksum of the contents of the given file""" + # adapted from https://stackoverflow.com/a/59974585/643675 + # Not really a good way to test the expanduser calls + _check_file(infile) + with open(infile.expanduser(), "rb") as f: + checksum = 0 + while chunk := f.read(65536): + checksum = zlib.crc32(chunk, checksum) + return checksum.to_bytes(4) + +def _check_file(infile: Path): + if not infile or not infile.expanduser().is_file(): + raise ValueError("infile must be exist and be a file") diff --git a/test/s3/remote_test.py b/test/s3/remote_test.py index b7b4690..609bfa0 100644 --- a/test/s3/remote_test.py +++ b/test/s3/remote_test.py @@ -4,7 +4,7 @@ from pytest import raises from conftest import assert_exception_correct -from cdmtaskservice.s3.remote import calculate_etag +from cdmtaskservice.s3.remote import calculate_etag, crc32 TESTDATA = Path(os.path.normpath((Path(__file__) / ".." / ".." / "testdata"))) @@ -35,3 +35,25 @@ def test_calculate_etag_fail(): calculate_etag(infile, size) assert_exception_correct(got.value, expected) + +def test_crc32(): + # checked these with the linux crc32 program + testset = [ + (TESTDATA / "empty_file", "00000000"), + (TESTDATA / "random_bytes_1kB", "ed9a6eb3"), + (TESTDATA / "random_bytes_10kB", "4ffc5208"), + ] + for infile, crc in testset: + gotcrc = crc32(infile) + assert gotcrc.hex() == crc + + +def test_crc32_fail(): + testset = [ + (None, ValueError("infile must be exist and be a file")), + (TESTDATA, ValueError("infile must be exist and be a file")), + ] + for infile, expected in testset: + with raises(Exception) as got: + crc32(infile) + assert_exception_correct(got.value, expected) From e9fe607ffda9701eaf79e61d7173357220d53798 Mon Sep 17 00:00:00 2001 From: Gavin Date: Tue, 27 Aug 2024 11:02:56 -0700 Subject: [PATCH 2/2] Make a constant for read chunk size --- cdmtaskservice/s3/remote.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cdmtaskservice/s3/remote.py b/cdmtaskservice/s3/remote.py index d153f79..301abbb 100644 --- a/cdmtaskservice/s3/remote.py +++ b/cdmtaskservice/s3/remote.py @@ -10,6 +10,8 @@ from pathlib import Path import zlib +_CHUNK_SIZE_64KB = 2 ** 16 + def calculate_etag(infile: Path, partsize: int) -> str: """ @@ -62,7 +64,7 @@ def crc32(infile: Path) -> bytes: _check_file(infile) with open(infile.expanduser(), "rb") as f: checksum = 0 - while chunk := f.read(65536): + while chunk := f.read(_CHUNK_SIZE_64KB): checksum = zlib.crc32(chunk, checksum) return checksum.to_bytes(4)