Skip to content

Commit

Permalink
Machinery for signing generic manifests.
Browse files Browse the repository at this point in the history
THIS IS A DRAFT (WIP). It will be split into separate PRs once it works, but it is
posted publicly to show what the plans are (sigstore#224, sigstore#248, sigstore#240, sigstore#111).

Signed-off-by: Mihai Maruseac <[email protected]>
  • Loading branch information
mihaimaruseac committed Jul 24, 2024
1 parent 6f01724 commit c6252d4
Show file tree
Hide file tree
Showing 5 changed files with 308 additions and 41 deletions.
92 changes: 87 additions & 5 deletions model_signing/manifest/manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,15 +55,44 @@
from collections.abc import Iterable
import dataclasses
import pathlib
from typing import Self
from typing import Iterator, Self
from typing_extensions import override

from model_signing.hashing import hashing


@dataclasses.dataclass(frozen=True)
class ResourceDescriptor:
    """An immutable description of one piece of content from a `Manifest`.

    The type is modeled on in-toto's `ResourceDescriptor`. It is kept as a
    separate dataclass so that it can be mapped to in-toto when needed, and
    used on its own in the cases where in-toto cannot be used directly.

    Only a subset of the in-toto fields is declared at this moment. In
    contrast to in-toto, where every field is optional, every field declared
    here must be present.

    The in-toto specification is available at
    github.com/in-toto/attestation/blob/main/spec/v1/resource_descriptor.md

    Attributes:
        identifier: A string that uniquely identifies this
          `ResourceDescriptor`. It corresponds to `name`, `uri`, or `content`
          in the in-toto specification.
        digest: The single digest of the item. Unlike in-toto, there is
          exactly one digest per item and it is always required.
    """

    identifier: str
    digest: hashing.Digest


class Manifest(metaclass=abc.ABCMeta):
    """Abstract base class for a generic manifest representing a model.

    Subclasses describe their contents by implementing
    `resource_descriptors`.
    """

    @abc.abstractmethod
    def resource_descriptors(self) -> Iterator[ResourceDescriptor]:
        """Yields each resource from the manifest, one by one.

        This method is abstract: every concrete manifest has to provide its
        own implementation.
        """


@dataclasses.dataclass(frozen=True)
Expand All @@ -72,6 +101,17 @@ class DigestManifest(Manifest):

digest: hashing.Digest

@override
def resource_descriptors(self) -> Iterator[ResourceDescriptor]:
    """Yields each resource from the manifest, one by one.

    In this case, we have only one descriptor to return: the one wrapping
    the manifest's single digest. Since model paths are already encoded in
    the digest, use "." for the identifier. Subclasses might record
    additional fields to have distinguishable human readable identifiers.
    """
    yield ResourceDescriptor(identifier=".", digest=self.digest)


class ItemizedManifest(Manifest):
"""A detailed manifest, recording integrity of every model component."""
Expand Down Expand Up @@ -130,6 +170,37 @@ def __init__(self, items: Iterable[FileManifestItem]):
def __eq__(self, other: Self):
    """Compares two manifests by their item-to-digest mappings.

    Args:
        other: The object to compare this manifest against.

    Returns:
        True when both manifests map the same items to the same digests and
        False when the mappings differ. `NotImplemented` is returned when
        `other` is not an instance of this manifest's class, so that Python
        can fall back to the reflected comparison instead of this method
        failing with `AttributeError` on the missing `_item_to_digest`.
    """
    if not isinstance(other, type(self)):
        return NotImplemented
    return self._item_to_digest == other._item_to_digest

@override
def resource_descriptors(self) -> Iterator[ResourceDescriptor]:
    """Yields one resource descriptor for every item in the manifest.

    The (item, digest) entries of the manifest are sorted before being
    emitted, so descriptors come out ordered by path. The identifier of
    each descriptor is the string form of the item.
    """
    ordered_entries = sorted(self._item_to_digest.items())
    yield from (
        ResourceDescriptor(identifier=str(entry), digest=entry_digest)
        for entry, entry_digest in ordered_entries
    )


@dataclasses.dataclass(frozen=True, order=True)
class Shard:
    """A dataclass to hold information about a file shard.

    Instances are immutable (`frozen=True`) and support ordering
    (`order=True`): two shards are compared field by field, in declaration
    order, i.e. by `path`, then by `start`, then by `end`.

    Attributes:
        path: The path to the file, relative to the model root.
        start: The start offset of the shard (included).
        end: The end offset of the shard (not included).
    """

    path: pathlib.PurePath
    start: int
    end: int

    def __str__(self) -> str:
        """Converts the item to a canonicalized string representation.

        The format is {path}:{start}:{end}. Because `start` and `end` are
        integers, the two rightmost `:` separators are unambiguous even if
        the path itself contains `:`, so the string can be decoded by
        splitting from the right.

        Returns:
            The string form of the shard.
        """
        # An f-string already applies `str()` to each replacement field, so
        # no explicit conversion of the path is needed.
        return f"{self.path}:{self.start}:{self.end}"


@dataclasses.dataclass
class ShardedFileManifestItem(ManifestItem):
Expand All @@ -146,7 +217,7 @@ def __init__(
path: pathlib.PurePath,
start: int,
end: int,
digest: hashing.Digest
digest: hashing.Digest,
):
"""Builds a manifest item pairing a file shard with its digest.
Expand All @@ -163,9 +234,9 @@ def __init__(
self.digest = digest

@property
def input_tuple(self) -> Shard:
    """Returns the `Shard` that uniquely determines the manifest item.

    The shard is built from this item's `path`, `start` and `end`
    attributes, in that order.
    """
    return Shard(self.path, self.start, self.end)


class ShardLevelManifest(FileLevelManifest):
Expand All @@ -178,3 +249,14 @@ def __init__(self, items: Iterable[ShardedFileManifestItem]):
efficient updates and retrieval of digests.
"""
self._item_to_digest = {item.input_tuple: item.digest for item in items}

@override
def resource_descriptors(self) -> Iterator[ResourceDescriptor]:
    """Yields one resource descriptor for every shard in the manifest.

    Descriptors are emitted following the ordering of the `input_tuple`
    values of the `ShardedFileManifestItem` objects used to create this
    instance (file path first, then the shard endpoints). The identifier of
    each descriptor is the string form of the shard.
    """
    entries = list(self._item_to_digest.items())
    entries.sort()
    for shard, shard_digest in entries:
        yield ResourceDescriptor(identifier=str(shard), digest=shard_digest)
100 changes: 100 additions & 0 deletions model_signing/manifest/manifest_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,31 @@
# limitations under the License.

import pathlib
import pytest

from model_signing.hashing import hashing
from model_signing.manifest import manifest


class TestDigestManifest:
    """Tests for the resource descriptors exposed by `DigestManifest`."""

    def test_manifest_has_just_one_resource_descriptor(self):
        """A digest manifest yields exactly one resource descriptor."""
        digest = hashing.Digest("test", b"test_digest")
        manifest_file = manifest.DigestManifest(digest)

        descriptors = list(manifest_file.resource_descriptors())

        assert len(descriptors) == 1

    def test_manifest_has_the_correct_resource_descriptor(self):
        """The descriptor has "." as identifier and carries the digest."""
        digest = hashing.Digest("test", b"test_digest")
        manifest_file = manifest.DigestManifest(digest)

        descriptors = list(manifest_file.resource_descriptors())

        # Assert on a concrete element instead of inside a `for` loop: a loop
        # body never runs when nothing is yielded, which would let this test
        # pass without checking anything.
        assert len(descriptors) == 1
        assert descriptors[0].identifier == "."
        assert descriptors[0].digest == digest


class TestFileLevelManifest:

def test_insert_order_does_not_matter(self):
Expand All @@ -34,6 +54,39 @@ def test_insert_order_does_not_matter(self):

assert manifest1 == manifest2

@pytest.mark.parametrize("num_items", [1, 3, 5])
def test_manifest_has_all_resource_descriptors(self, num_items):
    """A file level manifest yields one descriptor per file item."""
    # Bytes literals do not interpolate, so `b"hash{i}"` would give every
    # item the very same digest. Format a string first, then encode it, to
    # get a distinct digest for each file.
    items: list[manifest.FileManifestItem] = [
        manifest.FileManifestItem(
            path=pathlib.PurePath(f"file{i}"),
            digest=hashing.Digest("test", f"hash{i}".encode()),
        )
        for i in range(num_items)
    ]
    manifest_file = manifest.FileLevelManifest(items)

    descriptors = list(manifest_file.resource_descriptors())

    assert len(descriptors) == num_items

def test_manifest_has_the_correct_resource_descriptors(self):
    """Descriptors are sorted by file and keep their matching digests."""
    path1 = pathlib.PurePath("file1")
    digest1 = hashing.Digest("test", b"hash1")
    item1 = manifest.FileManifestItem(path=path1, digest=digest1)

    path2 = pathlib.PurePath("file2")
    digest2 = hashing.Digest("test", b"hash2")
    item2 = manifest.FileManifestItem(path=path2, digest=digest2)

    # Note order is reversed
    manifest_file = manifest.FileLevelManifest([item2, item1])
    descriptors = list(manifest_file.resource_descriptors())

    # Guard against missing or extra descriptors before indexing.
    assert len(descriptors) == 2
    # But we expect the descriptors to be in order by file
    assert descriptors[0].identifier == "file1"
    assert descriptors[1].identifier == "file2"
    assert descriptors[0].digest.digest_value == b"hash1"
    assert descriptors[1].digest.digest_value == b"hash2"


class TestShardLevelManifest:

Expand Down Expand Up @@ -70,3 +123,50 @@ def test_same_path_different_shards_gives_different_manifest(self):
manifest2 = manifest.ShardLevelManifest([item])

assert manifest1 != manifest2

@pytest.mark.parametrize("num_items", [1, 3, 5])
def test_manifest_has_all_resource_descriptors(self, num_items):
    """A shard level manifest yields one descriptor per shard item."""
    # All shards belong to the same file; they differ by their endpoints.
    path = pathlib.PurePath("file")
    # Bytes literals do not interpolate, so `b"hash{i}"` would give every
    # shard the very same digest. Format a string first, then encode it, to
    # get a distinct digest for each shard.
    items: list[manifest.ShardedFileManifestItem] = [
        manifest.ShardedFileManifestItem(
            path=path,
            digest=hashing.Digest("test", f"hash{i}".encode()),
            start=i,
            end=i + 2,
        )
        for i in range(num_items)
    ]
    manifest_file = manifest.ShardLevelManifest(items)

    descriptors = list(manifest_file.resource_descriptors())

    assert len(descriptors) == num_items

def test_manifest_has_the_correct_resource_descriptors(self):
    """Descriptors are sorted by file shard and keep matching digests."""
    path1 = pathlib.PurePath("file1")
    digest1 = hashing.Digest("test", b"hash1")
    item1 = manifest.ShardedFileManifestItem(
        path=path1, digest=digest1, start=0, end=4
    )

    path2 = pathlib.PurePath("file2")
    digest2 = hashing.Digest("test", b"hash2")
    item2 = manifest.ShardedFileManifestItem(
        path=path2, digest=digest2, start=0, end=4
    )

    # First file, but second shard
    digest3 = hashing.Digest("test", b"hash3")
    item3 = manifest.ShardedFileManifestItem(
        path=path1, digest=digest3, start=4, end=8
    )

    # Note the items are passed in reverse order of their creation
    manifest_file = manifest.ShardLevelManifest([item3, item2, item1])
    descriptors = list(manifest_file.resource_descriptors())

    # Guard against missing or extra descriptors before indexing.
    assert len(descriptors) == 3
    # But we expect the descriptors to be in order by file shard
    assert descriptors[0].identifier == "file1:0:4"
    assert descriptors[1].identifier == "file1:4:8"
    assert descriptors[2].identifier == "file2:0:4"
    assert descriptors[0].digest.digest_value == b"hash1"
    assert descriptors[1].digest.digest_value == b"hash3"
    assert descriptors[2].digest.digest_value == b"hash2"
Loading

0 comments on commit c6252d4

Please sign in to comment.