Skip to content

Commit

Permalink
Add mariner manifests and license files support
Browse files Browse the repository at this point in the history
Reference: aboutcode-org/scancode.io#1156

Signed-off-by: Ayan Sinha Mahapatra <[email protected]>
  • Loading branch information
AyanSinhaMahapatra committed Apr 10, 2024
1 parent cafcbcf commit 283bec9
Show file tree
Hide file tree
Showing 20 changed files with 8,712 additions and 226 deletions.
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,6 @@ TAGS
Procfile
local.cfg
geckodriver.log
var
.metaflow
selenium
/dist/
Expand Down
5 changes: 4 additions & 1 deletion src/packagedcode/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,10 @@

debian.DebianInstalledFilelistHandler,
debian.DebianInstalledMd5sumFilelistHandler,
debian.DebianInstalledStatusDatabaseHandler
debian.DebianInstalledStatusDatabaseHandler,

rpm.RpmLicenseFilesHandler,
rpm.RpmMarinerContainerManifestHandler
]

if on_linux:
Expand Down
7 changes: 7 additions & 0 deletions src/packagedcode/licensing.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,13 @@ def logger_debug(*args):
return logger.debug(' '.join(isinstance(a, str) and a or repr(a) for a in args))


# Mapping of a license attribute name as found in resource-level license
# detection results (the keys) to the attribute name under which the same value
# is stored on a package data object (the values). Handlers iterate over this
# mapping to copy file-level license detection results onto package data.
RESOURCE_TO_PACKAGE_LICENSE_FIELDS = {
'detected_license_expression': 'declared_license_expression',
'detected_license_expression_spdx': 'declared_license_expression_spdx',
'license_detections': 'license_detections',
}


def add_referenced_license_matches_for_package(resource, codebase):
"""
Return an updated ``resource`` saving it in place, after adding new license
Expand Down
60 changes: 51 additions & 9 deletions src/packagedcode/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@
from commoncode.datautils import String
from commoncode.fileutils import as_posixpath
from commoncode.resource import Resource
from license_expression import combine_expressions
from license_expression import Licensing

try:
from typecode import contenttype
except ImportError:
Expand Down Expand Up @@ -118,11 +121,8 @@
- IdentifiablePackageData: a base class for a Package-like class with a Package URL.
"""

SCANCODE_DEBUG_PACKAGE = os.environ.get('SCANCODE_DEBUG_PACKAGE', False)
SCANCODE_DEBUG_PACKAGE_ASSEMBLY = os.environ.get('SCANCODE_DEBUG_PACKAGE_ASSEMBLY', False)

TRACE = SCANCODE_DEBUG_PACKAGE
TRACE_UPDATE = SCANCODE_DEBUG_PACKAGE_ASSEMBLY
TRACE = os.environ.get('SCANCODE_DEBUG_PACKAGE', False)
TRACE_UPDATE = os.environ.get('SCANCODE_DEBUG_PACKAGE_ASSEMBLY', False)


def logger_debug(*args):
Expand Down Expand Up @@ -1618,6 +1618,8 @@ def update(
include_qualifiers=False,
include_subpath=False,
ignore_name_check=False,
default_relation='AND',
licensing=Licensing(),
):
"""
Update this Package with data from the ``package_data`` PackageData.
Expand Down Expand Up @@ -1670,16 +1672,16 @@ def update(
'file_references',
])

license_modified = False
for name, value in existing.items():
new_value = new_package_data.get(name)
if not new_value:
if TRACE_UPDATE: logger_debug(f' No new value: {name!r}: skipping')
continue

if TRACE_UPDATE:
logger_debug(f'update: {name!r}={value!r} with new_value: {new_value!r}')

if not new_value:
if TRACE_UPDATE: logger_debug(' No new value: skipping')
continue

if not value:
if TRACE_UPDATE: logger_debug(' set existing value to new')
setattr(self, name, new_value)
Expand All @@ -1694,6 +1696,18 @@ def update(
if name == 'extra_data':
value.update(new_value)

if 'license_detections' in name:
license_modified = True
license_keys = licensing.license_keys(
expression=new_package_data.get("declared_license_expression"),
unique=True,
)
if name == 'license_detections' and len(license_keys) > 1:
setattr(self, 'other_license_detections', new_value)
else:
merged = value + new_value
setattr(self, name, merged)

if name in list_fields:
if TRACE_UPDATE: logger_debug(' merge lists of values')
merged = merge_sequences(list1=value, list2=new_value)
Expand All @@ -1702,8 +1716,36 @@ def update(
elif TRACE_UPDATE and value != new_value:
if TRACE_UPDATE: logger_debug(' skipping update: no replace')

if license_modified:
self.refresh_license_expressions(default_relation=default_relation)

return True

def refresh_license_expressions(self, default_relation='AND'):
    """
    Recompute the license expression attributes of this package from its
    current license detections.

    For each of the ``license_detections`` and ``other_license_detections``
    attributes that is not empty, combine the "license_expression" of every
    detection using the ``default_relation`` relation, store the result as
    a string in the matching expression attribute, and store the value
    derived from it by ``get_declared_license_expression_spdx`` in the
    matching SPDX expression attribute. Attributes whose detections are
    empty are left untouched.
    """
    # (detections attribute, expression attribute, SPDX expression attribute)
    attribute_groups = (
        (
            'license_detections',
            'declared_license_expression',
            'declared_license_expression_spdx',
        ),
        (
            'other_license_detections',
            'other_license_expression',
            'other_license_expression_spdx',
        ),
    )

    for detections_attr, expression_attr, spdx_attr in attribute_groups:
        detections = getattr(self, detections_attr)
        if not detections:
            continue

        combined = combine_expressions(
            expressions=[item["license_expression"] for item in detections],
            relation=default_relation,
        )
        setattr(self, expression_attr, str(combined))

        # Read the expression back from the instance rather than reusing the
        # local value, so the SPDX form is derived from what is stored.
        spdx_expression = get_declared_license_expression_spdx(
            declared_license_expression=getattr(self, expression_attr),
        )
        setattr(self, spdx_attr, spdx_expression)

def get_packages_files(self, codebase):
"""
Yield all the Resource of this package found in codebase.
Expand Down
167 changes: 166 additions & 1 deletion src/packagedcode/rpm.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,20 +7,25 @@
# See https://aboutcode.org for more information about nexB OSS projects.
#

import io
import os
import fnmatch
import logging
import sys
from collections import namedtuple
from pathlib import Path

from packagedcode import models
from packagedcode import nevra
from packagedcode.licensing import RESOURCE_TO_PACKAGE_LICENSE_FIELDS
from packagedcode.pyrpm import RPM
from packagedcode.rpm_installed import collect_installed_rpmdb_xmlish_from_rpmdb_loc
from packagedcode.rpm_installed import parse_rpm_xmlish
from packagedcode.utils import build_description
from packagedcode.utils import get_ancestor
from scancode.api import get_licenses

TRACE = False
TRACE = os.environ.get('SCANCODE_DEBUG_PACKAGE_API', False)


def logger_debug(*args):
Expand Down Expand Up @@ -374,6 +379,166 @@ def parse(cls, location, package_only=False):
yield models.PackageData.from_data(package_data, package_only)


class RpmMarinerContainerManifestHandler(models.DatafileHandler):
    """
    Handle the tab-separated RPM manifest found at
    ``var/lib/rpmmanifest/container-manifest-2`` and assemble each listed
    package with its license files under ``usr/share/licenses/<name>/``.
    """
    datasource_id = 'rpm_mariner_manifest'
    # container-manifest-1 is more minimal and has the same data
    path_patterns = ('*var/lib/rpmmanifest/container-manifest-2',)
    default_package_type = 'rpm'
    default_package_namespace = 'mariner'
    description = 'RPM mariner distroless package manifest'
    documentation_url = 'https://github.com/microsoft/marinara/'

    # Column names, in order, for one tab-separated manifest line. The "n1" to
    # "n4" and "checksum_algo" columns are discarded by
    # ``clean_mariner_manifest_data``.
    manifest_attributes = [
        "name",
        "version",
        "n1",
        "n2",
        "party",
        "n3",
        "n4",
        "arch",
        "checksum_algo",
        "filename"
    ]

    @classmethod
    def parse(cls, location, package_only=False):
        """
        Yield one PackageData for each non-blank line of the manifest file at
        ``location``. ``package_only`` is passed through to
        ``PackageData.from_data`` as other handlers in this module do.
        """
        with io.open(location, encoding='utf-8') as data:
            lines = data.readlines()

        for line in lines:
            line = line.rstrip("\n")
            # A blank line (such as a trailing one) carries no package: skip
            # it rather than yielding a nameless package.
            if not line.strip():
                continue

            package_data = {
                "type": cls.default_package_type,
                "namespace": cls.default_package_namespace,
                "datasource_id": cls.datasource_id,
            }
            # zip() stops at the shorter sequence: a short line only sets
            # the columns it actually has.
            package_data.update(zip(cls.manifest_attributes, line.split("\t")))

            package_data = cls.clean_mariner_manifest_data(package_data)
            yield models.PackageData.from_data(package_data, package_only)

    @classmethod
    def assemble(cls, package_data, resource, codebase, package_adder):
        """
        Yield a Package built from ``package_data`` followed by every license
        file resource found for that package name under the root of the
        codebase tree that contains the manifest ``resource``.

        The package is updated with the package data of each matching license
        file, and ``package_adder`` is called for each of these resources.
        """
        # The root is as many levels up as there are segments in the
        # manifest path.
        levels_up = len('var/lib/rpmmanifest/container-manifest-2'.split('/'))
        root_resource = get_ancestor(
            levels_up=levels_up,
            resource=resource,
            codebase=codebase,
        )
        package_name = package_data.name

        package = models.Package.from_package_data(
            package_data=package_data,
            datafile_path=resource.path,
        )
        package_uid = package.package_uid

        assemblable_paths = (
            f'*usr/share/licenses/{package_name}/COPYING*',
            f'*usr/share/licenses/{package_name}/LICENSE*',
        )

        resources = []
        for res in root_resource.walk(codebase):
            if TRACE:
                logger_debug(f' rpm: mariner assemble: root_walk: res: {res}')
            if not any(
                fnmatch.fnmatch(name=res.location, pat=pattern)
                for pattern in assemblable_paths
            ):
                continue

            if TRACE:
                logger_debug(f' rpm: mariner assemble: pattern matched for: res: {res}')

            for pkgdt in res.package_data:
                # Use a distinct name to avoid rebinding the ``package_data``
                # argument of this method.
                license_package_data = models.PackageData.from_dict(pkgdt)
                if TRACE:
                    logger_debug(f' rpm: mariner assemble: package_data: {license_package_data.declared_license_expression}')

                package.update(
                    package_data=license_package_data,
                    datafile_path=res.path,
                    check_compatible=False,
                    replace=False,
                    include_version=False,
                    include_qualifiers=False,
                    include_subpath=False,
                )

            package_adder(package_uid, res, codebase)
            resources.append(res)

        yield package
        yield from resources

    @staticmethod
    def clean_mariner_manifest_data(package_data):
        """
        Return the ``package_data`` mapping, modified in place: drop the
        unused manifest columns and move the "arch", "filename" and "party"
        values to the "qualifiers", "extra_data" and "parties" keys.

        Missing columns are tolerated so that a short manifest line does not
        raise a KeyError.
        """
        ignore_attributes = ["n1", "n2", "n3", "n4", "checksum_algo"]
        for attribute in ignore_attributes:
            package_data.pop(attribute, None)

        if arch := package_data.pop("arch", None):
            package_data["qualifiers"] = {"arch": arch}

        if filename := package_data.pop("filename", None):
            package_data["extra_data"] = {"filename": filename}

        if party := package_data.pop("party", None):
            party_obj = models.Party(
                type=models.party_org,
                role="owner",
                name=party,
            )
            package_data["parties"] = [party_obj.to_dict()]

        return package_data


class RpmLicenseFilesHandler(models.NonAssemblableDatafileHandler):
    """
    Handle COPYING and LICENSE files found under a
    ``usr/share/licenses/<package name>/`` directory, reporting each as
    package data named after that directory.
    """
    datasource_id = 'rpm_package_licenses'
    path_patterns = (
        '*usr/share/licenses/*/COPYING*',
        '*usr/share/licenses/*/LICENSE*',
    )
    default_package_type = 'rpm'
    default_package_namespace = 'mariner'
    description = 'RPM mariner distroless package license files'
    documentation_url = 'https://github.com/microsoft/marinara/'

    @classmethod
    def parse(cls, location, package_only=False):
        """
        Yield a single PackageData for the license file at ``location``.

        With ``package_only`` the package data carries only its type,
        namespace, name and datasource id and no license detection is run.
        Otherwise the file is scanned with ``get_licenses`` and the results
        are copied to the package license fields listed in
        ``RESOURCE_TO_PACKAGE_LICENSE_FIELDS``.
        """
        # The license files are in a directory which is the package name,
        # for example: "/usr/share/licenses/openssl/LICENSE"
        name = location.split('/usr/share/licenses/').pop().split('/')[0]
        package_data = models.PackageData(
            type=cls.default_package_type,
            namespace=cls.default_package_namespace,
            name=name,
            datasource_id=cls.datasource_id,
        )

        if package_only:
            yield package_data
            # Stop here: without this return the generator would go on to run
            # the license scan and yield the same object a second time.
            return

        resource_license_attributes = get_licenses(
            location=location,
            include_text=True,
            license_diagnostics=True,
            license_text_diagnostics=True,
        )
        for key, key_pkg in RESOURCE_TO_PACKAGE_LICENSE_FIELDS.items():
            setattr(package_data, key_pkg, resource_license_attributes.get(key))

        yield package_data


ALGO_BY_ID = {
None: 'md5',
0: 'md5',
Expand Down
22 changes: 22 additions & 0 deletions tests/packagedcode/data/chef/package.scan.expected.json
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,28 @@
}
],
"identifier": "mit-a822f434-d61f-f2b1-c792-8b8cb9e7b9bf"
},
{
"license_expression": "mit",
"license_expression_spdx": "MIT",
"matches": [
{
"license_expression": "mit",
"spdx_license_expression": "MIT",
"from_file": "package/metadata.json",
"start_line": 1,
"end_line": 1,
"matcher": "1-spdx-id",
"score": 100.0,
"matched_length": 1,
"match_coverage": 100.0,
"rule_relevance": 100,
"rule_identifier": "spdx-license-identifier-mit-5da48780aba670b0860c46d899ed42a0f243ff06",
"rule_url": null,
"matched_text": "MIT"
}
],
"identifier": "mit-a822f434-d61f-f2b1-c792-8b8cb9e7b9bf"
}
],
"other_license_expression": null,
Expand Down
22 changes: 22 additions & 0 deletions tests/packagedcode/data/plugin/chef-package-expected.json
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,28 @@
}
],
"identifier": "mit-a822f434-d61f-f2b1-c792-8b8cb9e7b9bf"
},
{
"license_expression": "mit",
"license_expression_spdx": "MIT",
"matches": [
{
"license_expression": "mit",
"spdx_license_expression": "MIT",
"from_file": "package/metadata.json",
"start_line": 1,
"end_line": 1,
"matcher": "1-spdx-id",
"score": 100.0,
"matched_length": 1,
"match_coverage": 100.0,
"rule_relevance": 100,
"rule_identifier": "spdx-license-identifier-mit-5da48780aba670b0860c46d899ed42a0f243ff06",
"rule_url": null,
"matched_text": "MIT"
}
],
"identifier": "mit-a822f434-d61f-f2b1-c792-8b8cb9e7b9bf"
}
],
"other_license_expression": null,
Expand Down
Loading

0 comments on commit 283bec9

Please sign in to comment.