Skip to content

Commit

Permalink
labeller can add label resources to a manifest if given additional co…
Browse files Browse the repository at this point in the history
…ntext
  • Loading branch information
nicholascar committed Jan 3, 2025
1 parent 5a5ddc9 commit 1d27026
Show file tree
Hide file tree
Showing 11 changed files with 538 additions and 44 deletions.
30 changes: 14 additions & 16 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 3 additions & 1 deletion prezmanifest/documentor.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,11 +70,13 @@


def create_table(manifest: Path, t="markdown") -> str:
MANIFEST_ROOT_DIR = manifest.parent
# load and validate manifest
validate(manifest)
manifest_graph = load_graph(manifest)

# add in MRR vocab
manifest_graph += load_graph(Path(__file__).parent / "mrr.ttl")

if t == "asciidoc":
header = "|===\n| Resource | Role | Description\n\n"
else:
Expand Down
14 changes: 14 additions & 0 deletions prezmanifest/get_iris_from_files.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from pathlib import Path
from rdflib import Graph
from rdflib.namespace import RDF, SKOS

# Directory whose *.ttl files are inspected (direct children only).
# NOTE(review): "something" looks like a placeholder to edit before running - confirm.
FILES_DIR = Path("something")

# For every Turtle file, print the IRI typed as a skos:ConceptScheme in the form
# "<iri> ," so the output can be pasted into a comma-separated list of IRIs.
# A file with no ConceptScheme prints "<None> ," because Graph.value() yields None.
for ttl_file in FILES_DIR.glob("*.ttl"):
    scheme_iri = Graph().parse(ttl_file).value(
        predicate=RDF.type, object=SKOS.ConceptScheme
    )
    print(f"<{scheme_iri}> ,")
188 changes: 188 additions & 0 deletions prezmanifest/labeller.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
"""
Assesses a given Manifest, finds any IRIs in its resources that are missing labels and tries to patch them from
a given source of labels, such as KurrawongAI's Semantic Background (https://github.com/kurrawong/semantic-background)
repository.
"""
import argparse
import sys
from pathlib import Path
from typing import Literal as TLiteral
from urllib.parse import ParseResult, urlparse

from kurra.utils import load_graph
from labelify import find_missing_labels, extract_labels
from rdflib import Graph, BNode, Literal
from rdflib.namespace import PROF, RDF

from prezmanifest.utils import get_files_from_artifact

try:
from prezmanifest import MRR, OLIS, validate, load, __version__
except ImportError:
import sys

sys.path.append(str(Path(__file__).parent.parent.resolve()))
from prezmanifest import MRR, OLIS, validate, __version__


def label(
    manifest: Path,
    output: TLiteral["iris", "rdf", "manifest"] = "manifest",
    additional_context: "Path | str | Graph | None" = None,
) -> "set | Graph | None":
    """Find, extract or patch in labels for IRIs in a Manifest's content that lack them.

    Args:
        manifest: Path to the Manifest file to process.
        output: The kind of result wanted:

            * "iris": return whatever labelify's find_missing_labels() reports for
              the Manifest's content plus any label resources it declares.
            * "rdf": return the labels that extract_labels() obtains from
              additional_context for the IRIs still missing labels, or None when
              no additional_context is supplied.
            * "manifest": write those labels to "labels-additional.ttl" beside the
              Manifest, register that file in the Manifest as a new
              mrr:IncompleteCatalogueAndResourceLabels resource and rewrite the
              Manifest file in place. Nothing is returned.
        additional_context: An extra source of labels, passed through to labelify.

    Returns:
        The missing IRIs ("iris"), a graph of labels or None ("rdf"), or None
        ("manifest").

    Raises:
        ValueError: If output is not one of "iris", "rdf" or "manifest".
        Warning: In "manifest" mode, if no new labels could be generated, either
            because none are missing or because no additional context yielded any.
    """
    # Fail fast on a bad output value, before any file is read
    output_types = ["iris", "rdf", "manifest"]
    if output not in output_types:
        raise ValueError(f"Parameter output is {output} but must be one of {', '.join(output_types)}")

    # NOTE(review): the ImportError fallback at the top of this module does not
    # import `load`, so this function fails with NameError on that import path.
    labelling_roles = (
        MRR.IncompleteCatalogueAndResourceLabels,
        MRR.CompleteCatalogueAndResourceLabels,
    )

    # create the target from the Manifest
    manifest_content_graph = load(manifest, return_data_type="Graph")

    # determine if any labelling context is given in Manifest
    context_graph = Graph()
    for _, resource in manifest_content_graph.subject_objects(PROF.hasResource):
        for role in manifest_content_graph.objects(resource, PROF.hasRole):
            if role not in labelling_roles:
                continue
            for artifact in manifest_content_graph.objects(resource, PROF.hasArtifact):
                for f in get_files_from_artifact(manifest, artifact):
                    context_graph += load_graph(f)

    if output == "iris":
        return find_missing_labels(manifest_content_graph + context_graph, additional_context)

    if output == "rdf":
        iris = find_missing_labels(manifest_content_graph, context_graph)
        if additional_context is None:
            # Nothing to extract labels from
            return None
        return extract_labels(iris, additional_context)

    # output == "manifest"
    # Generate the "rdf" output and create a resource for it in the Manifest, using
    # context given in the Manifest and any Additional Context supplied
    manifest_only_graph = load_graph(manifest)
    rdf_addition = label(manifest, "rdf", additional_context)

    # The "rdf" output is None when no additional context was supplied; treat that
    # the same as an empty result rather than calling len() on None
    if rdf_addition is None or len(rdf_addition) == 0:
        raise Warning("No new labels have been generated for content in this Manifest. "
                      "This could be because none were missing or because no new labels can be found in any "
                      "supplied additional context.")

    new_artifact = manifest.parent / "labels-additional.ttl"
    rdf_addition.serialize(destination=new_artifact, format="longturtle")
    new_resource = BNode()

    # Find the Manifest's IRI and the role of any label context already in it
    manifest_iri = None
    context_roles = []
    for s, resource in manifest_only_graph.subject_objects(PROF.hasResource):
        manifest_iri = s
        context_roles.extend(
            role
            for role in manifest_only_graph.objects(resource, PROF.hasRole)
            if role in labelling_roles
        )

    if context_roles == [MRR.CompleteCatalogueAndResourceLabels]:
        # A CompleteCatalogueAndResourceLabels is present in the Manifest and yet more
        # labels were discovered, so it was not complete: change it to
        # IncompleteCatalogueAndResourceLabels. The matches are collected into a list
        # first so the graph is not modified while it is being iterated.
        for s, o in list(manifest_only_graph.subject_objects(PROF.hasRole)):
            if o == MRR.CompleteCatalogueAndResourceLabels:
                manifest_only_graph.remove((s, PROF.hasRole, o))
                manifest_only_graph.add((s, PROF.hasRole, MRR.IncompleteCatalogueAndResourceLabels))

    # Whether or not label context was already present, the new labels are added as an
    # IncompleteCatalogueAndResourceLabels resource
    # TODO: test for completeness of labelling and add in CompleteCatalogueAndResourceLabels if complete
    manifest_only_graph.add((manifest_iri, PROF.hasResource, new_resource))
    manifest_only_graph.add((new_resource, PROF.hasRole, MRR.IncompleteCatalogueAndResourceLabels))
    manifest_only_graph.add((new_resource, PROF.hasArtifact, Literal(new_artifact.name)))

    manifest_only_graph.serialize(destination=manifest, format="longturtle")


def setup_cli_parser(args=None):
def url_file_or_folder(input: str) -> ParseResult | Path:
parsed = urlparse(input)
if all([parsed.scheme, parsed.netloc]):
return parsed
path = Path(input)
if path.is_file():
return path
if path.is_dir():
return path
raise argparse.ArgumentTypeError(
f"{input} is not a valid input. Must be a file, folder or sparql endpoint"
)

parser = argparse.ArgumentParser()
group = parser.add_mutually_exclusive_group(required=True)

parser.add_argument(
"-v",
"--version",
action="version",
version="{version}".format(version=__version__),
)

group.add_argument(
"-o",
"--output",
help="The form of output you want",
choices=["iris", "rdf", "manifest"],
default="manifest",
)

parser.add_argument(
"-a",
"--additional-context",
help="File, Folder or Sparql Endpoint to read additional context RDF from",
type=url_file_or_folder,
)

parser.add_argument(
"manifest",
help="A Manifest file to process",
type=Path,
)

return parser.parse_args(args)


def cli(args=None):
    """Command line entry point: parse arguments, run label() and print any result.

    Args:
        args: Argument strings to parse; sys.argv[1:] is used when None.
    """
    argv = sys.argv[1:] if args is None else args
    options = setup_cli_parser(argv)

    result = label(options.manifest, options.output, options.additional_context)

    if options.output == "iris":
        # one IRI per line
        print("\n".join(map(str, result)))
    elif options.output == "rdf" and result is not None:
        print(result.serialize(format="longturtle"))
    # for "manifest" output there is nothing to print: label() returns None


if __name__ == "__main__":
    # Only call sys.exit() when cli() hands back an explicit exit status
    if (retval := cli(sys.argv[1:])) is not None:
        sys.exit(retval)
33 changes: 14 additions & 19 deletions prezmanifest/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,17 @@
from rdflib import DCAT, DCTERMS, OWL, PROF, RDF, SDO, SKOS
from rdflib import Graph, URIRef, Dataset
from typing import Literal as TLiteral
import logging

try:
from prezmanifest import MRR, OLIS, validate, __version__
from prezmanifest.utils import get_files_from_artifact
except ImportError:
import sys

sys.path.append(str(Path(__file__).parent.parent.resolve()))
from prezmanifest import MRR, OLIS, validate, __version__
from prezmanifest.utils import get_files_from_artifact


def load(
Expand Down Expand Up @@ -95,26 +98,26 @@ def _export(data: Graph | Dataset, iri, sparql_endpoint, destination_file, retur
else:
raise ValueError(return_data_value_error_message)

print(msg)
logging.info(msg)

if sum(x is not None for x in [sparql_endpoint, destination_file, return_data_type]) != 1:
raise ValueError(
"You must specify exactly 1 of sparql_endpoint, destination_file or return_data_type",
)

# load and validate manifest
g = validate(manifest)

MANIFEST_ROOT_DIR = manifest.parent
# load and validate manifest
validate(manifest)
manifest_graph = load_graph(manifest)

vg = Graph()
vg_iri = None

for s, o in g.subject_objects(PROF.hasResource):
for role in g.objects(o, PROF.hasRole):
for s, o in manifest_graph.subject_objects(PROF.hasResource):
for role in manifest_graph.objects(o, PROF.hasRole):
# The catalogue - must be processed first
if role == MRR.CatalogueData:
for artifact in g.objects(o, PROF.hasArtifact):
for artifact in manifest_graph.objects(o, PROF.hasArtifact):
# load the Catalogue, determine the Virtual Graph & Catalogue IRIs
# and fail if we can't see a Catalogue object
c = load_graph(MANIFEST_ROOT_DIR / str(artifact))
Expand All @@ -138,24 +141,16 @@ def _export(data: Graph | Dataset, iri, sparql_endpoint, destination_file, retur
_export(c, catalogue_iri, sparql_endpoint, destination_file, return_data_type)

# non-catalogue resources
for s, o in g.subject_objects(PROF.hasResource):
for role in g.objects(o, PROF.hasRole):
for s, o in manifest_graph.subject_objects(PROF.hasResource):
for role in manifest_graph.objects(o, PROF.hasRole):
# The data files & background - must be processed after Catalogue
if role in [
MRR.CompleteCatalogueAndResourceLabels,
MRR.IncompleteCatalogueAndResourceLabels,
MRR.ResourceData,
]:
for artifact in g.objects(o, PROF.hasArtifact):
if not "*" in str(artifact):
files = [manifest.parent / Path(str(artifact))]
else:
artifact_str = str(artifact)
glob_marker_location = artifact_str.find("*")
glob_parts = [artifact_str[:glob_marker_location], artifact_str[glob_marker_location:]]
files = Path(manifest.parent / Path(glob_parts[0])).glob(glob_parts[1])

for f in files:
for artifact in manifest_graph.objects(o, PROF.hasArtifact):
for f in get_files_from_artifact(manifest, artifact):
if str(f.name).endswith(".ttl"):
fg = Graph().parse(f)
# fg.bind("rdf", RDF)
Expand Down
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "prezmanifest"
version = "0.2.1"
version = "0.2.2"
description = "A Python package that provides a series of functions to work with Prez Manifests."
authors = ["Nicholas Car <[email protected]>"]
license = "BSD-3-Clause"
Expand All @@ -10,7 +10,7 @@ readme = "README.adoc"
python = ">=3.12,<4.0"
pyshacl = "^0.29.0"
kurra = "^0.6.1"
labelify = {path = "/Users/nick/work/kurrawong/labelify"}
labelify = "^0.3.8"

[tool.poetry.dev-dependencies]

Expand Down
7 changes: 7 additions & 0 deletions tests/demo-vocabs/labels-2.ttl
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
PREFIX schema: <https://schema.org/>

<https://prez.dev/ManifestResourceRoles/CatalogueData> schema:name "Catalogue Data" .
<https://prez.dev/ManifestResourceRoles/ResourceData> schema:name "Resource Data" .

<https://schema.org/mathExpression> schema:name "Maths Expression" .
<https://schema.org/description> schema:name "description" .
Loading

0 comments on commit 1d27026

Please sign in to comment.