From b4a4f6ad7d0687eee046e66258c209ec59509f8c Mon Sep 17 00:00:00 2001 From: Mihai Maruseac Date: Fri, 2 Aug 2024 09:33:48 -0700 Subject: [PATCH] Add in-toto format with hashes of shards as subjects (#267) This converts model serialization manifests that record every model file shard hash into an in-toto payload that can then be passed to Sigstore's `sign_intoto` for signing to generate a Sigstore `Bundle` (if using Sigstore). This time, we record every hash as part of the subject instead of in the payload. We require verifiers to be aware of this and acknowledge that verifiers that only check subject by subject (that is, they check if the hash of a passed in argument is in the list of subjects and don't check if all the hashes are present), can fail to fully detect if the model integrity is compromised by renaming one file in the model, interchanging two file names, deleting a file, or reordering two shards. The signing library will have additional checks for this, but verifying the signature with other tools might result in invalid results. 
Signed-off-by: Mihai Maruseac --- model_signing/signing/in_toto.py | 105 ++++++++++++++++++ model_signing/signing/in_toto_test.py | 62 +++++++++++ .../deep_model_folder | 45 ++++++++ .../empty_model_file | 7 ++ .../empty_model_folder | 7 ++ .../model_folder_with_empty_file | 7 ++ .../sample_model_file | 18 +++ .../sample_model_folder | 99 +++++++++++++++++ .../symlink_model_folder | 18 +++ 9 files changed, 368 insertions(+) create mode 100644 model_signing/signing/testdata/in_toto/TestShardDigestsIntotoPayload/deep_model_folder create mode 100644 model_signing/signing/testdata/in_toto/TestShardDigestsIntotoPayload/empty_model_file create mode 100644 model_signing/signing/testdata/in_toto/TestShardDigestsIntotoPayload/empty_model_folder create mode 100644 model_signing/signing/testdata/in_toto/TestShardDigestsIntotoPayload/model_folder_with_empty_file create mode 100644 model_signing/signing/testdata/in_toto/TestShardDigestsIntotoPayload/sample_model_file create mode 100644 model_signing/signing/testdata/in_toto/TestShardDigestsIntotoPayload/sample_model_folder create mode 100644 model_signing/signing/testdata/in_toto/TestShardDigestsIntotoPayload/symlink_model_folder diff --git a/model_signing/signing/in_toto.py b/model_signing/signing/in_toto.py index 207c6e1d..66af0fac 100644 --- a/model_signing/signing/in_toto.py +++ b/model_signing/signing/in_toto.py @@ -481,3 +481,108 @@ def from_manifest(cls, manifest: manifest_module.Manifest) -> Self: manifest, predicate_type=cls.predicate_type ) return cls(statement) + + +class ShardDigestsIntotoPayload(IntotoPayload): + """In-toto payload where the subjects are the model shards themselves. + + This payload is supposed to be used for manifests where every file shard in + the model is matched with a digest. 
Because existing tooling only supports + established hashing algorithms, we annotate every subject with the actual + hash algorithm used to compute the file digest, and use "sha256" as the + algorithm name in the digest itself. + + Example: + ```json + { + "_type": "https://in-toto.io/Statement/v1", + "subject": [ + { + "name": "d0/d1/d2/d3/d4/f0:0:16", + "digest": { + "sha256": "6efa14..." + }, + "annotations": { + "actual_hash_algorithm": "file-sha256-1000000" + } + }, + { + "name": "d0/d1/d2/d3/d4/f1:0:16", + "digest": { + "sha256": "a9bc14..." + }, + "annotations": { + "actual_hash_algorithm": "file-sha256-1000000" + } + }, + { + "name": "d0/d1/d2/d3/d4/f2:0:16", + "digest": { + "sha256": "5f597e..." + }, + "annotations": { + "actual_hash_algorithm": "file-sha256-1000000" + } + }, + { + "name": "d0/d1/d2/d3/d4/f3:0:16", + "digest": { + "sha256": "eaf677..." + }, + "annotations": { + "actual_hash_algorithm": "file-sha256-1000000" + } + } + ], + "predicateType": "https://model_signing/ShardDigests/v0.1", + "predicate": { + "unused": "Unused, just passed due to API requirements" + } + } + ``` + + If the annotation for a subject is missing, or it does not contain + actual_hash_algorithm, it should be assumed that the digest is computed via + the algorithm listed in the digest dictionary (i.e., sha256). + + See also https://github.com/sigstore/sigstore-python/issues/1018. + """ + + predicate_type: Final[str] = ( + "https://model_signing/ShardDigests/v0.1" + ) + + def __init__(self, statement: statement.Statement): + """Builds an instance of this in-toto payload. + + Don't call this directly in production. Use `from_manifest()` instead. + + Args: + statement: The DSSE statement representing this in-toto payload. + """ + self.statement = statement + + @classmethod + @override + def from_manifest(cls, manifest: manifest_module.Manifest) -> Self: + """Converts a manifest to the signing payload used for signing. 
+ + The manifest must be one where every model shard is paired with its own + digest. Currently, this is only `ShardLevelManifest`. + + Args: + manifest: the manifest to convert to signing payload. + + Returns: + An instance of `ShardDigestsIntotoPayload`. + + Raises: + TypeError: If the manifest is not `ShardLevelManifest`. + """ + if not isinstance(manifest, manifest_module.ShardLevelManifest): + raise TypeError("Only ShardLevelManifest is supported") + + statement = _convert_descriptors_to_direct_statement( + manifest, predicate_type=cls.predicate_type + ) + return cls(statement) diff --git a/model_signing/signing/in_toto_test.py b/model_signing/signing/in_toto_test.py index dd452073..51ccbe0a 100644 --- a/model_signing/signing/in_toto_test.py +++ b/model_signing/signing/in_toto_test.py @@ -264,3 +264,65 @@ def test_only_runs_on_expected_manifest_types(self): match="Only FileLevelManifest is supported", ): in_toto.DigestsIntotoPayload.from_manifest(manifest) + + +class TestShardDigestsIntotoPayload: + + def _hasher_factory( + self, path: pathlib.Path, start: int, end: int + ) -> file.ShardedFileHasher: + return file.ShardedFileHasher( + path, memory.SHA256(), start=start, end=end + ) + + @pytest.mark.parametrize("model_fixture_name", test_support.all_test_models) + def test_known_models(self, request, model_fixture_name): + # Set up variables (arrange) + testdata_path = request.path.parent / "testdata" + test_path = testdata_path / "in_toto" + test_class_path = test_path / "TestShardDigestsIntotoPayload" + golden_path = test_class_path / model_fixture_name + should_update = request.config.getoption("update_goldens") + model = request.getfixturevalue(model_fixture_name) + + # Compute payload (act) + serializer = serialize_by_file_shard.ManifestSerializer( + self._hasher_factory, allow_symlinks=True + ) + manifest = serializer.serialize(model) + payload = in_toto.ShardDigestsIntotoPayload.from_manifest(manifest) + + # Compare with golden, or write to golden 
(approximately "assert") + if should_update: + with open(golden_path, "w", encoding="utf-8") as f: + f.write(f"{json_format.MessageToJson(payload.statement.pb)}\n") + else: + with open(golden_path, "r", encoding="utf-8") as f: + json_contents = f.read() + proto = json_format.Parse( + json_contents, statement_pb2.Statement() + ) + + assert payload.statement.pb == proto + + def test_produces_valid_statements(self, sample_model_folder): + serializer = serialize_by_file_shard.ManifestSerializer( + self._hasher_factory, allow_symlinks=True + ) + manifest = serializer.serialize(sample_model_folder) + + payload = in_toto.ShardDigestsIntotoPayload.from_manifest( + manifest + ) + + payload.statement.validate() + + def test_only_runs_on_expected_manifest_types(self): + digest = hashing.Digest("test", b"test_digest") + manifest = manifest_module.DigestManifest(digest) + + with pytest.raises( + TypeError, + match="Only ShardLevelManifest is supported", + ): + in_toto.ShardDigestsIntotoPayload.from_manifest(manifest) diff --git a/model_signing/signing/testdata/in_toto/TestShardDigestsIntotoPayload/deep_model_folder b/model_signing/signing/testdata/in_toto/TestShardDigestsIntotoPayload/deep_model_folder new file mode 100644 index 00000000..6c228aeb --- /dev/null +++ b/model_signing/signing/testdata/in_toto/TestShardDigestsIntotoPayload/deep_model_folder @@ -0,0 +1,45 @@ +{ + "_type": "https://in-toto.io/Statement/v1", + "subject": [ + { + "name": "d0/d1/d2/d3/d4/f0:0:16", + "digest": { + "sha256": "6efa14bb03544fcb76045c55f25b9315b6eb5be2d8a85f703193a76b7874c6ff" + }, + "annotations": { + "actual_hash_algorithm": "file-sha256-1000000" + } + }, + { + "name": "d0/d1/d2/d3/d4/f1:0:16", + "digest": { + "sha256": "a9bc149b70b9d325cd68d275d582cfdb98c0347d3ce54590aa6533368daed3d2" + }, + "annotations": { + "actual_hash_algorithm": "file-sha256-1000000" + } + }, + { + "name": "d0/d1/d2/d3/d4/f2:0:16", + "digest": { + "sha256": 
"5f597e6a92d1324d9adbed43d527926d11d0131487baf315e65ae1ef3b1ca3c0" + }, + "annotations": { + "actual_hash_algorithm": "file-sha256-1000000" + } + }, + { + "name": "d0/d1/d2/d3/d4/f3:0:16", + "digest": { + "sha256": "eaf677c35fec6b87889d9e4563d8bb65dcb9869ca0225697c9cc44cf49dca008" + }, + "annotations": { + "actual_hash_algorithm": "file-sha256-1000000" + } + } + ], + "predicateType": "https://model_signing/ShardDigests/v0.1", + "predicate": { + "unused": "Unused, just passed due to API requirements" + } +} diff --git a/model_signing/signing/testdata/in_toto/TestShardDigestsIntotoPayload/empty_model_file b/model_signing/signing/testdata/in_toto/TestShardDigestsIntotoPayload/empty_model_file new file mode 100644 index 00000000..4fa1c7fe --- /dev/null +++ b/model_signing/signing/testdata/in_toto/TestShardDigestsIntotoPayload/empty_model_file @@ -0,0 +1,7 @@ +{ + "_type": "https://in-toto.io/Statement/v1", + "predicateType": "https://model_signing/ShardDigests/v0.1", + "predicate": { + "unused": "Unused, just passed due to API requirements" + } +} diff --git a/model_signing/signing/testdata/in_toto/TestShardDigestsIntotoPayload/empty_model_folder b/model_signing/signing/testdata/in_toto/TestShardDigestsIntotoPayload/empty_model_folder new file mode 100644 index 00000000..4fa1c7fe --- /dev/null +++ b/model_signing/signing/testdata/in_toto/TestShardDigestsIntotoPayload/empty_model_folder @@ -0,0 +1,7 @@ +{ + "_type": "https://in-toto.io/Statement/v1", + "predicateType": "https://model_signing/ShardDigests/v0.1", + "predicate": { + "unused": "Unused, just passed due to API requirements" + } +} diff --git a/model_signing/signing/testdata/in_toto/TestShardDigestsIntotoPayload/model_folder_with_empty_file b/model_signing/signing/testdata/in_toto/TestShardDigestsIntotoPayload/model_folder_with_empty_file new file mode 100644 index 00000000..4fa1c7fe --- /dev/null +++ b/model_signing/signing/testdata/in_toto/TestShardDigestsIntotoPayload/model_folder_with_empty_file @@ -0,0 
+1,7 @@ +{ + "_type": "https://in-toto.io/Statement/v1", + "predicateType": "https://model_signing/ShardDigests/v0.1", + "predicate": { + "unused": "Unused, just passed due to API requirements" + } +} diff --git a/model_signing/signing/testdata/in_toto/TestShardDigestsIntotoPayload/sample_model_file b/model_signing/signing/testdata/in_toto/TestShardDigestsIntotoPayload/sample_model_file new file mode 100644 index 00000000..bd839714 --- /dev/null +++ b/model_signing/signing/testdata/in_toto/TestShardDigestsIntotoPayload/sample_model_file @@ -0,0 +1,18 @@ +{ + "_type": "https://in-toto.io/Statement/v1", + "subject": [ + { + "name": ".:0:22", + "digest": { + "sha256": "3aab065c7181a173b5dd9e9d32a9f79923440b413be1e1ffcdba26a7365f719b" + }, + "annotations": { + "actual_hash_algorithm": "file-sha256-1000000" + } + } + ], + "predicateType": "https://model_signing/ShardDigests/v0.1", + "predicate": { + "unused": "Unused, just passed due to API requirements" + } +} diff --git a/model_signing/signing/testdata/in_toto/TestShardDigestsIntotoPayload/sample_model_folder b/model_signing/signing/testdata/in_toto/TestShardDigestsIntotoPayload/sample_model_folder new file mode 100644 index 00000000..bc455dc8 --- /dev/null +++ b/model_signing/signing/testdata/in_toto/TestShardDigestsIntotoPayload/sample_model_folder @@ -0,0 +1,99 @@ +{ + "_type": "https://in-toto.io/Statement/v1", + "subject": [ + { + "name": "d0/f00:0:23", + "digest": { + "sha256": "fdd8925354242a7fd1515e79534317b800015607a609cd306e0b4dcfe6c92249" + }, + "annotations": { + "actual_hash_algorithm": "file-sha256-1000000" + } + }, + { + "name": "d0/f01:0:23", + "digest": { + "sha256": "e16940b5e44ce981150bda37c4ba95881a749a521b4a297c5cdf97bdcfe965e6" + }, + "annotations": { + "actual_hash_algorithm": "file-sha256-1000000" + } + }, + { + "name": "d0/f02:0:23", + "digest": { + "sha256": "407822246ea8f9e26380842c3f4cd10d7b23e78f1fe7c74c293608682886a426" + }, + "annotations": { + "actual_hash_algorithm": 
"file-sha256-1000000" + } + }, + { + "name": "d1/f10:0:23", + "digest": { + "sha256": "6a3b08b5df77c4d418ceee1ac136a9ad49fc7c41358b5e82c1176daccb21ff3f" + }, + "annotations": { + "actual_hash_algorithm": "file-sha256-1000000" + } + }, + { + "name": "d1/f11:0:23", + "digest": { + "sha256": "a484b3d8ea5e99b75f9f123f9a42c882388693edc7d85d82ccba54834712cadf" + }, + "annotations": { + "actual_hash_algorithm": "file-sha256-1000000" + } + }, + { + "name": "d1/f12:0:23", + "digest": { + "sha256": "8f577930f5f40c2c2133cb299d36f9527fde98c1608569017cae6b5bcd01abb3" + }, + "annotations": { + "actual_hash_algorithm": "file-sha256-1000000" + } + }, + { + "name": "f0:0:24", + "digest": { + "sha256": "997b37cc51f1ca1c7a270466607e26847429cd7264c30148c1b9352e224083fc" + }, + "annotations": { + "actual_hash_algorithm": "file-sha256-1000000" + } + }, + { + "name": "f1:0:24", + "digest": { + "sha256": "c88a04d48353133fb065ba2c8ab369abab21395b9526aa20373ad828915fa7ae" + }, + "annotations": { + "actual_hash_algorithm": "file-sha256-1000000" + } + }, + { + "name": "f2:0:24", + "digest": { + "sha256": "700e3ba5065d8dd47e41fd928ea086670d628f891ba363be0ca3c31d20d7d719" + }, + "annotations": { + "actual_hash_algorithm": "file-sha256-1000000" + } + }, + { + "name": "f3:0:24", + "digest": { + "sha256": "912bcf5ebdf44dc7b4085b07940e0a81d157fba24b276e73fd911121d4544c4a" + }, + "annotations": { + "actual_hash_algorithm": "file-sha256-1000000" + } + } + ], + "predicateType": "https://model_signing/ShardDigests/v0.1", + "predicate": { + "unused": "Unused, just passed due to API requirements" + } +} diff --git a/model_signing/signing/testdata/in_toto/TestShardDigestsIntotoPayload/symlink_model_folder b/model_signing/signing/testdata/in_toto/TestShardDigestsIntotoPayload/symlink_model_folder new file mode 100644 index 00000000..13776db7 --- /dev/null +++ b/model_signing/signing/testdata/in_toto/TestShardDigestsIntotoPayload/symlink_model_folder @@ -0,0 +1,18 @@ +{ + "_type": 
"https://in-toto.io/Statement/v1", + "subject": [ + { + "name": "symlink_file:0:22", + "digest": { + "sha256": "3aab065c7181a173b5dd9e9d32a9f79923440b413be1e1ffcdba26a7365f719b" + }, + "annotations": { + "actual_hash_algorithm": "file-sha256-1000000" + } + } + ], + "predicateType": "https://model_signing/ShardDigests/v0.1", + "predicate": { + "unused": "Unused, just passed due to API requirements" + } +}