Skip to content

Commit

Permalink
Merge pull request aboutcode-org#3620 from nexB/update-license-detect…
Browse files Browse the repository at this point in the history
…ions

Update license detections
  • Loading branch information
AyanSinhaMahapatra authored Jan 16, 2024
2 parents 572f4fb + f016ef9 commit f70bbb7
Show file tree
Hide file tree
Showing 831 changed files with 42,378 additions and 18,247 deletions.
51 changes: 51 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,57 @@ v33.0.0 (next next, roadmap)
v32.1.0 (next, roadmap)
----------------------------

Major API/other changes:

- Output Format Version updated to 3.1.0 (minor version bump)
- Drops python 3.7 and adopts python 3.12
- New license match attributes:
- ``from_file``
- ``matched_text_diagnostics`` is added for ``--license-text-diagnostics``
- In codebase-level ``license_detections`` we have a new attribute
``reference_matches``
- SPDX license expressions everywhere side-by-side with ScanCode
license expressions.
- All rule attribute level data provided in codebase level ``todo`` items.

Changes in Output Data Structure:

- The data structure of the JSON output has changed for
licenses at file level, and license detections at top-level.
But note that all the changes are additions to the JSON output,
so we have a minor version bump from ``3.0.0`` to ``3.1.0``:

- There is a new attribute ``from_file`` in ``matches`` which is in
``license_detections`` in:
* File level ``license_detections``
* Codebase level ``license_detections``
* ``license_detections`` and ``other_license_detections`` in
file-level ``package_data``
* ``license_detections`` and ``other_license_detections`` in
codebase level ``packages``

- On using the CLI option ``--license-text-diagnostics`` there is
now a new license match attribute ``matched_text_diagnostics``
with the matched text and highlighted diagnostics, instead of
having this replace the plain ``matched_text``.

- A new ``reference_matches`` attribute is added to codebase-level
``license_detections`` which is the same as the ``matches`` attribute
in other license detections.

- We now have SPDX license expressions everywhere we have
ScanCode license expressions for ease of use and adopting
SPDX everywhere. A new attribute ``license_expression_spdx``
is added to:
- ``license_detections`` in file and codebase level
- in package ``license_detections`` and ``other_license_detections``
- ``matches`` for ``license_detections`` everywhere

- Adds all rule attribute level info in codebase level ``todo``
data, to assist in review. This includes length, text, notes,
referenced_filenames, and the boolean attributes (like
is_license_notice, is_license_intro etc, as applicable).

- A new field in packages with the license category for the
detected license expression and also an API function to
compute license categories from license expressions.
Expand Down
3 changes: 2 additions & 1 deletion src/licensedcode/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -545,11 +545,12 @@ def validate_spdx_license_keys(license_expression, licensing):
try:
parsed.render(template='{symbol.wrapped.spdx_license_key}')
except AttributeError:
msg = f"Error rendering SPDX license key for: {key}"
messages.append(msg)
pass

if messages:
raise InvalidLicenseKeyError(messages)
raise InvalidLicenseKeyError(f"ERROR in parsing license_expression: {license_expression}: type: {type(license_expression)} :{messages}")


class InvalidLicenseKeyError(Exception):
Expand Down
110 changes: 92 additions & 18 deletions src/licensedcode/detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,8 +165,15 @@ class LicenseDetection:
license_expression = attr.ib(
default=None,
metadata=dict(
help='Full license expression string '
'using the SPDX license expression syntax and ScanCode license keys.')
help='A license expression string using the SPDX license expression'
' syntax and ScanCode license keys, the effective license expression'
' for this license detection.')
)

license_expression_spdx = attr.ib(
default=None,
metadata=dict(
help='SPDX license expression string with SPDX ids.')
)

matches = attr.ib(
Expand Down Expand Up @@ -248,8 +255,17 @@ def from_matches(
detection_log=detection_log,
)
detection.identifier = detection.identifier_with_expression
detection.license_expression_spdx = detection.spdx_license_expression()
return detection

def spdx_license_expression(self):
    """
    Return the SPDX license expression string for this detection, built
    from its ``license_expression`` and the licensing object of the
    license cache.
    """
    # Function-scope imports, kept local exactly as in the original code.
    from licensedcode.cache import build_spdx_license_expression
    from licensedcode.cache import get_cache

    scancode_expression = self.license_expression
    cache_licensing = get_cache().licensing
    spdx_expression = build_spdx_license_expression(
        license_expression=scancode_expression,
        licensing=cache_licensing,
    )
    # The built expression is always returned in its string form.
    return str(spdx_expression)

def __eq__(self, other):
return (
isinstance(other, LicenseDetection)
Expand Down Expand Up @@ -515,6 +531,7 @@ def from_license_detection_mapping(

detection = cls(
license_expression=license_detection_mapping["license_expression"],
license_expression_spdx=license_detection_mapping["license_expression_spdx"],
detection_log=license_detection_mapping.get("detection_log", []) or None,
identifier=license_detection_mapping["identifier"],
matches=matches,
Expand Down Expand Up @@ -590,6 +607,12 @@ class LicenseMatchFromResult(LicenseMatch):
help='Text which was matched')
)

matched_text_diagnostics = attr.ib(
default=None,
metadata=dict(
help='Text which was matched, with extra diagnostics information.')
)

def score(self):
return self.match_score

Expand All @@ -615,15 +638,18 @@ def from_dict(cls, license_match_mapping):
"""
rule = Rule.from_match_data(license_match_mapping)
matched_text = license_match_mapping.get("matched_text") or None
matched_text_diagnostics = license_match_mapping.get("matched_text_diagnostics") or None

return cls(
from_file=license_match_mapping["from_file"],
start_line=license_match_mapping["start_line"],
end_line=license_match_mapping["end_line"],
match_score=license_match_mapping["score"],
matched_length=license_match_mapping["matched_length"],
match_coverage=license_match_mapping["match_coverage"],
matcher=license_match_mapping["matcher"],
text=matched_text,
matched_text_diagnostics=matched_text_diagnostics,
rule=rule,
qspan=None,
ispan=None,
Expand All @@ -642,35 +668,57 @@ def to_dict(
include_text=False,
license_text_diagnostics=False,
whole_lines=True,
rule_details=False,
):
"""
Return a "result" scan data built from a LicenseMatch object.
"""
matched_text = None
if include_text:
matched_text = self.matched_text

result = {}

# Detection Level Information
result['score'] = self.score()
result['license_expression'] = self.rule.license_expression
result['license_expression_spdx'] = self.rule.spdx_license_expression()
result['from_file'] = self.from_file
result['start_line'] = self.start_line
result['end_line'] = self.end_line
if rule_details:
result.update(self.rule.get_flags_mapping())
result['matcher'] = self.matcher
result['score'] = self.score()
result['matched_length'] = self.len()
if rule_details:
result["rule_length"] = self.rule.length
result['match_coverage'] = self.coverage()
result['matcher'] = self.matcher

# LicenseDB Level Information (Rule that was matched)
result['license_expression'] = self.rule.license_expression
result['rule_identifier'] = self.rule.identifier
result['rule_relevance'] = self.rule.relevance
result['rule_identifier'] = self.rule.identifier
result['rule_url'] = self.rule.rule_url
if rule_details:
result["rule_notes"] = self.rule.notes
result["referenced_filenames"] = self.rule.referenced_filenames
if include_text and self.matched_text:
result['matched_text'] = self.matched_text
if license_text_diagnostics and self.matched_text_diagnostics:
result['matched_text_diagnostics'] = self.matched_text_diagnostics
if rule_details:
result["rule_text"] = self.rule.text

if include_text:
result['matched_text'] = matched_text
return result


def populate_matches_with_path(matches, path):
    """
    Given a `matches` list of license match mappings, populate the
    `from_file` key of each mapping in place with `path`, the path of the
    resource these matches originate from.

    Only mappings whose `from_file` is missing or empty are updated.
    Return None.
    """
    for match in matches:
        # A `from_file` that is already populated means the match comes
        # from another file, so it is left untouched. An empty one means the
        # match comes from the original resource, so it gets that resource
        # path. A missing key is treated like an empty one instead of
        # raising a KeyError.
        if not match.get("from_file"):
            match["from_file"] = path


def collect_license_detections(codebase, include_license_clues=True):
"""
Return a list of LicenseDetectionFromResult object rehydrated from
Expand All @@ -680,7 +728,10 @@ def collect_license_detections(codebase, include_license_clues=True):
according to their license detections. This is required because package fields
are populated in package plugin, which runs before the license plugin, and thus
the license plugin step where unknown references to other files are dereferenced
does not show up automatically in package attributes.
does not show up automatically in package attributes.
Also populate from_file attributes with resource paths for matches which have
origin in the same file.
"""
has_packages = hasattr(codebase.root, 'package_data')
has_licenses = hasattr(codebase.root, 'license_detections')
Expand All @@ -692,7 +743,11 @@ def collect_license_detections(codebase, include_license_clues=True):
resource_license_detections = []
if has_licenses:
license_detections = getattr(resource, 'license_detections', []) or []
for detection in license_detections:
populate_matches_with_path(matches=detection["matches"], path=resource.path)
license_clues = getattr(resource, 'license_clues', []) or []
populate_matches_with_path(matches=license_clues, path=resource.path)
codebase.save_resource(resource)

if license_detections:
license_detection_objects = detections_from_license_detection_mappings(
Expand Down Expand Up @@ -729,6 +784,9 @@ def collect_license_detections(codebase, include_license_clues=True):

package_license_detections = package["license_detections"]
if package_license_detections:
for detection in package_license_detections:
populate_matches_with_path(matches=detection["matches"], path=resource.path)
modified = True
package_license_detection_mappings.extend(package_license_detections)
detection_is_same, license_expression = verify_package_license_expression(
license_detection_mappings=package_license_detections,
Expand Down Expand Up @@ -828,6 +886,7 @@ class UniqueDetection:
"""
identifier = attr.ib(default=None)
license_expression = attr.ib(default=None)
license_expression_spdx = attr.ib(default=None)
detection_count = attr.ib(default=None)
matches = attr.ib(default=attr.Factory(list))
detection_log = attr.ib(default=attr.Factory(list))
Expand Down Expand Up @@ -860,12 +919,14 @@ def get_unique_detections(cls, license_detections):
for match in detection.matches
]
))
detection.license_expression_spdx = detection.spdx_license_expression()
detection.identifier = detection.identifier_with_expression

unique_license_detections.append(
cls(
identifier=detection.identifier,
license_expression=detection.license_expression,
license_expression_spdx=detection.license_expression_spdx,
detection_log=detection_log or [],
matches=detection.matches,
detection_count=len(file_regions),
Expand All @@ -875,7 +936,11 @@ def get_unique_detections(cls, license_detections):

return unique_license_detections

def to_dict(self, license_diagnostics):
def to_dict(self,
include_text=False,
license_text_diagnostics=False,
license_diagnostics=False,
):

def dict_fields(attr, value):

Expand All @@ -890,11 +955,20 @@ def dict_fields(attr, value):

return True

return attr.asdict(self, filter=dict_fields)
detection_mapping = attr.asdict(self, filter=dict_fields)
detection_mapping["reference_matches"] = [
match.to_dict(
include_text=include_text,
license_text_diagnostics=license_text_diagnostics,
)
for match in self.matches
]
return detection_mapping

def get_license_detection_object(self):
return LicenseDetection(
license_expression=self.license_expression,
license_expression_spdx=self.license_expression_spdx,
detection_log=self.detection_log,
matches=self.matches,
identifier=self.identifier,
Expand Down
16 changes: 10 additions & 6 deletions src/licensedcode/licenses_reference.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,8 @@ def process_codebase(self, codebase, **kwargs):
Collect the ``license_references`` and ``rule_references``
list of data mappings and add to the ``codebase``.
"""
include_files = 'license' in kwargs
include_packages = 'package' in kwargs
include_files = hasattr(codebase.attributes, 'license_detections')
include_packages = hasattr(codebase.attributes, 'packages')

license_references, rule_references = collect_license_and_rule_references(
codebase=codebase,
Expand All @@ -86,17 +86,25 @@ def collect_license_and_rule_references(codebase, include_packages=True, include
Return a two-tuple of (``license_references``, ``license_rule_references``)
sorted lists of unique mappings collected from a ``codebase``.
"""
if TRACE:
logger_debug(f'include_packages: {include_packages}, include_files: {include_files}')

license_keys = set()
rules_by_identifier = {}

if include_packages:
pks, prules = collect_references_from_packages(codebase)
if TRACE:
logger_debug(f'collect_references_from_packages: license keys: {pks}')
logger_debug(f'collect_references_from_packages: rules by id: {prules}')
license_keys.update(pks)
rules_by_identifier.update(prules)

if include_files:
pks, prules = collect_references_from_files(codebase)
if TRACE:
logger_debug(f'collect_references_from_files: license keys: {pks}')
logger_debug(f'collect_references_from_files: rules by id: {prules}')
license_keys.update(pks)
rules_by_identifier.update(prules)

Expand Down Expand Up @@ -140,10 +148,6 @@ def collect_references_from_packages(codebase):
if expression:
license_keys.update(licensing.license_keys(expression))

detections = getattr(resource, 'license_detections', []) or []
rules_by_id = build_rules_from_detection_data(detections)
rules_by_identifier.update(rules_by_id)

for rule in rules_by_identifier.values():
# TODO: consider using the expression object directly instead
expo = rule.license_expression
Expand Down
Loading

0 comments on commit f70bbb7

Please sign in to comment.