Update main repo jobs for the category system
For now, the dumps do not include categories, profiles, or tasks. This will be fixed in the next commit.
Ostrzyciel committed May 15, 2024
1 parent bc19a96 commit 29f8815
Showing 7 changed files with 56 additions and 243 deletions.
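
In short: `PackageMainCommand` no longer packages profiles itself. Instead, it discovers category directories in the main repo and pulls each category's metadata from its own GitHub release, mirroring how datasets are handled. A condensed sketch of the resulting flow, using the names from the diffs below (not a drop-in excerpt; `version`, `categoryNames`, `datasetNames`, and `mainModel` are assumed to be in scope as in `PackageMainCommand`):

```scala
// Condensed from the PackageMainCommand diff below.
val datasetsVersion = if version == "dev" then "dev" else "latest"

// Categories and datasets are both resolved from per-repository releases.
val categoryCollection = CategoryCollection.fromReleases(categoryNames, datasetsVersion)
val datasetCollection = DatasetCollection.fromReleases(datasetNames, datasetsVersion)

// The RDF dump currently merges only the main model and the datasets;
// categories, profiles, and tasks are to follow (see the commit description).
val dumpModel = RdfUtil.mergeModels(Seq(mainModel) ++ datasetCollection.datasets.values)
```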
1 change: 1 addition & 0 deletions src/main/scala/commands/CategoryDocGenCommand.scala
@@ -2,6 +2,7 @@ package io.github.riverbench.ci_worker
package commands

import util.*
import util.collection.*
import util.doc.*

import org.apache.jena.rdf.model.{Model, Property, Resource}
86 changes: 1 addition & 85 deletions src/main/scala/commands/MainDocGenCommand.scala
@@ -26,14 +26,8 @@ object MainDocGenCommand extends Command:
val schemaRepoDir = FileSystems.getDefault.getPath(args(4))
val outDir = FileSystems.getDefault.getPath(args(5))

println("Generating profile documentation...")
val profileCollection = new ProfileCollection(mainMetadataOutDir.resolve("profiles"))
val ontologies = RdfIoUtil.loadOntologies(schemaRepoDir)
profileDocGen(profileCollection, ontologies, mainMetadataOutDir, outDir, version)
if version == "dev" then
profileOverviewDocGen(profileCollection, outDir)

println("Generating main documentation...")
val ontologies = RdfIoUtil.loadOntologies(schemaRepoDir)
val mainMetadata = RDFDataMgr.loadModel(mainMetadataOutDir.resolve("metadata.ttl").toString)
val mainDocOpt = DocBuilder.Options(
titleProps = Seq(
@@ -87,84 +81,6 @@
)
}

private def profileDocGen(
profileCollection: ProfileCollection, ontologies: Model, metadataOutDir: Path, outDir: Path, version: String
): Unit =
val profileDocOpt = DocBuilder.Options(
titleProps = Seq(
RdfUtil.dctermsTitle,
RDFS.label,
RDF.`type`,
),
hidePropsInLevel = Seq(
(1, RdfUtil.dctermsDescription), // shown as content below the header
(1, RDF.`type`), // Always the same
),
defaultPropGroup = Some("General information"),
)
val profileDocBuilder = new DocBuilder(ontologies, profileDocOpt)
outDir.resolve("profiles").toFile.mkdirs()

for (name, profile) <- profileCollection.profiles do
val profileRes = profile.listSubjectsWithProperty(RDF.`type`, RdfUtil.Profile).next.asResource
val description = RdfUtil.getString(profileRes, RdfUtil.dctermsDescription) getOrElse ""
val profileDoc = profileDocBuilder.build(
s"$name (${readableVersion(version)})",
description + rdfInfo(PurlMaker.profile(name, version)),
profileRes
)
val tableSection =
"""
|## Download links
|
|Below you will find links to download the profile's datasets in different lengths.
|
|!!! warning
| Some datasets are shorter than others and a given distribution may not be available for all datasets.
| In that case, a link to the longest available distribution of the dataset is provided.
|
|""".stripMargin +
Files.readString(metadataOutDir.resolve(f"profiles/doc/${name}_table.md"))
val profileDocPath = outDir.resolve(s"profiles/$name.md")
Files.writeString(profileDocPath, profileDoc.toMarkdown + tableSection)

private def profileOverviewDocGen(profileCollection: ProfileCollection, outDir: Path): Unit =
def taxoLink(text: String, name: String) =
f"[$text](${Constants.taxonomyDocBaseLink}$name-stream)"

val sb = new StringBuilder()
sb.append("Profile | Stream type | RDF-star | Non-standard extensions\n")
sb.append("--- | --- | :-: | :-:\n")
for pName <- profileCollection.profiles.keys.toSeq.sorted do
val nameSplit = pName.split('-')
sb.append(f"[$pName]($pName/dev) | ")
sb.append(
(nameSplit(0), nameSplit(1)) match
case ("flat", "mixed") => "flat " + taxoLink("triple", "flat-rdf-triple") +
" or " + taxoLink("quad", "flat-rdf-quad")
case ("flat", t) => taxoLink("flat " + t.dropRight(1), "flat-rdf-" + t.dropRight(1))
case ("stream", "mixed") => taxoLink("dataset", "rdf-dataset") + " or " +
taxoLink("graph", "rdf-graph")
case ("stream", "datasets") => taxoLink("dataset", "rdf-dataset")
case ("stream", "named") => taxoLink("named graph", "rdf-named-graph")
case ("stream", "ts") => taxoLink("timestamped named graph", "timestamped-rdf-named-graph")
case ("stream", "subject") => taxoLink("subject graph", "rdf-subject-graph")
case _ => taxoLink("graph", "rdf-graph")
)
for restriction <- Seq("rdfstar", "nonstandard") do
sb.append(" | ")
sb.append(
if pName.contains(restriction) then ":material-check:"
else ":material-close:"
)
sb.append("\n")

Files.writeString(outDir.resolve("profiles/table.md"), sb.toString())

private def readableVersion(v: String) =
if v == "dev" then "development version" else v

private val staticContentWarn =
"""
|<!--
3 changes: 2 additions & 1 deletion src/main/scala/commands/PackageCategoryCommand.scala
@@ -1,8 +1,9 @@
package io.github.riverbench.ci_worker
package commands

import util.doc.MarkdownUtil
import util.*
import util.collection.*
import util.doc.MarkdownUtil

import org.apache.jena.rdf.model.{Model, ModelFactory, Property, Resource}
import org.apache.jena.riot.RDFDataMgr
173 changes: 18 additions & 155 deletions src/main/scala/commands/PackageMainCommand.scala
@@ -1,8 +1,9 @@
package io.github.riverbench.ci_worker
package commands

import util.doc.MarkdownUtil
import util.*
import util.collection.*
import util.doc.MarkdownUtil

import org.apache.jena.rdf.model.{Model, Property, RDFNode, Resource}
import org.apache.jena.riot.RDFDataMgr
@@ -28,14 +29,18 @@ object PackageMainCommand extends Command:
val repoDir = FileSystems.getDefault.getPath(args(2))
val outDir = FileSystems.getDefault.getPath(args(3))

println("Loading profiles...")
val profileCollection = new ProfileCollection(repoDir.resolve("profiles"))
val datasetsVersion = if version == "dev" then "dev" else "latest"

println("Fetching categories...")
val categoryNames = repoDir.resolve("categories").toFile.listFiles()
.filter(_.isDirectory)
.map(_.getName)
val categoryCollection = CategoryCollection.fromReleases(categoryNames, datasetsVersion)

println("Fetching datasets...")
val datasetNames = repoDir.resolve("datasets").toFile.listFiles()
.filter(_.isDirectory)
.map(_.getName)
val datasetsVersion = if version == "dev" then "dev" else "latest"
val datasetCollection = DatasetCollection.fromReleases(datasetNames, datasetsVersion)

// Prepare main model
@@ -44,62 +49,32 @@
val mainVer = if version == "dev" then "" else "v/" + version
val newMainRes = mainModel.createResource(AppConfig.CiWorker.rbRootUrl + mainVer)

println("Processing profiles...")
outDir.resolve("profiles/doc").toFile.mkdirs()
val subSupModel = profileCollection.getSubSuperAssertions

def getProfileUri(name: String) = AppConfig.CiWorker.baseDevProfileUrl + name + "/" + version

for (name, profileModel) <- profileCollection.profiles do
// Add version tags to URIs
val oldRes = profileModel.createResource(AppConfig.CiWorker.baseDevProfileUrl + name)
val newRes = profileModel.createResource(getProfileUri(name))
RdfUtil.renameResource(oldRes, newRes, profileModel)
RdfUtil.renameResource(oldRes, newRes, subSupModel)

for (name, profileModel) <- profileCollection.profiles do
// Add inferred properties
val res = subSupModel.createResource(getProfileUri(name))
profileModel.removeAll(res, RdfUtil.isSupersetOfProfile, null)
profileModel.add(subSupModel.listStatements(res, null, null))
// Version metadata
profileModel.add(res, RdfUtil.hasVersion, version)
profileModel.add(res, RdfUtil.dcatInCatalog, newMainRes)
// Link datasets to profiles
linkProfileAndDatasets(name, profileModel, res, datasetCollection, outDir)
// Prettify
profileModel.removeNsPrefix("")

println("Processing main metadata...")
RdfUtil.renameResource(oldMainRes, newMainRes, mainModel)
newMainRes.addProperty(RdfUtil.foafHomepage, newMainRes)
newMainRes.addProperty(RdfUtil.hasVersion, version)

// Add links to datasets and profiles
for (name, _) <- profileCollection.profiles do
val profileRes = mainModel.createResource(getProfileUri(name))
newMainRes.addProperty(RdfUtil.hasProfile, profileRes)
// Add links to datasets and categories
for ((_, dsModel) <- datasetCollection.datasets) do
val dsRes = dsModel.listSubjectsWithProperty(RDF.`type`, RdfUtil.Dataset).next.asResource
newMainRes.addProperty(RdfUtil.dcatDataset, dsRes)
for ((_, catModel) <- categoryCollection.categories) do
val catRes = catModel.listSubjectsWithProperty(RDF.`type`, RdfUtil.Category).next.asResource
newMainRes.addProperty(RdfUtil.hasCategory, catRes)

val allModels = profileCollection.profiles.values ++
Seq(mainModel) ++
// Generate RDF dump of all metadata
val allModels = Seq(mainModel) ++
datasetCollection.datasets.values
val dumpModel = RdfUtil.mergeModels(allModels.toSeq)
// TODO: dump should include categories, tasks, profiles...
// This stuff must start in the category repos.
val dumpModel = RdfUtil.mergeModels(allModels)

if version == "dev" then
// Generate dataset overview
println("Generating dataset overview...")
generateDatasetOverview(datasetCollection, outDir)

// Write to files
println("Writing profiles...")
for (name, profileModel) <- profileCollection.profiles do
for (ext, format) <- Constants.outputFormats do
val outFile = outDir.resolve(f"profiles/$name.$ext").toFile
RDFDataMgr.write(new FileOutputStream(outFile), profileModel, format)

println("Writing main metadata...")
for (ext, format) <- Constants.outputFormats do
val mainOutFile = outDir.resolve(f"metadata.$ext").toFile
@@ -115,118 +90,6 @@
println("Done.")
}

private def linkProfileAndDatasets(
name: String, profile: Model, profileRes: Resource, datasetCollection: DatasetCollection, outDir: Path
): Unit =
val restrictions = profile.listObjectsOfProperty(profileRes, RdfUtil.hasRestriction).asScala
.map(rNode => {
val rRes = rNode.asResource()
val props = rRes.listProperties().asScala
.map(p => (p.getPredicate, p.getObject))
.toSeq
props
})
.toSeq

val distTypes = restrictions.flatten.filter(_._1 == RdfUtil.hasDistributionType)
.map(_._2.asResource())

if distTypes.isEmpty then
throw new Exception(s"No distribution types specified in profile $name")

val profileTableSb = StringBuilder()
// name, dataset uri, Seq(dist download url, size, byte size)
val datasets: mutable.ArrayBuffer[(String, Resource, Seq[(String, Long, Long)])] = mutable.ArrayBuffer()

for ((name, dsModel) <- datasetCollection.datasets) do
if dsModel.isEmpty then
throw new Exception(f"Dataset $name is empty – does it have a matching release?")
val dsRes = dsModel.listSubjectsWithProperty(RDF.`type`, RdfUtil.Dataset).asScala.toSeq.headOption
dsRes match
case None => throw new Exception(f"Could not find the root resource for dataset $name")
case Some(dsRes) =>
if datasetMatchesRestrictions(dsRes, restrictions) then
profile.add(profileRes, RdfUtil.dcatSeriesMember, dsRes)
val distributions = dsRes.listProperties(RdfUtil.dcatDistribution).asScala
.map(_.getObject.asResource())
.filter(d => distTypes.exists(dt => d.hasProperty(RdfUtil.hasDistributionType, dt)))
.map(distRes => {
val downloadUrl = distRes.getProperty(RdfUtil.dcatDownloadURL).getObject.asResource().getURI
val size = distRes.getProperty(RdfUtil.hasStreamElementCount).getObject.asLiteral().getLong
val byteSize = distRes.getProperty(RdfUtil.dcatByteSize).getObject.asLiteral().getLong
(downloadUrl, size, byteSize)
})
.toSeq
.sortBy(_._2)
datasets.append((name, dsRes, distributions))

val columns = datasets
.flatMap(_._3.map(_._2))
.map(c => {
if Constants.packageSizes.contains(c) then
(c, Constants.packageSizeToHuman(c))
else
(Long.MaxValue, "Full")
})
.distinct
.sortBy(_._1)

if name.contains("flat") then
writeTable("")
else
profileTableSb.append(
"""!!! note
|
| For stream profiles, there are two available types of distributions: plain streaming, and streaming in the Jelly format. See the [documentation](../../documentation/dataset-release-format.md) for details.
|
|### Plain streaming distributions
|
|""".stripMargin)
writeTable("tar.gz")
profileTableSb.append(
"""
|
|### Jelly streaming distributions
|
|""".stripMargin)
writeTable("jelly.gz")

def writeTable(filterBy: String): Unit =
profileTableSb.append("Dataset")
for col <- columns do
profileTableSb.append(f" | ${col._2}")
profileTableSb.append("\n---")
profileTableSb.append(" | ---" * columns.size)

for (dsName, dsUri, dists) <- datasets.sortBy(_._1) do
profileTableSb.append(f"\n[$dsName]($dsUri)")
for col <- columns do
val (distUrl, distSize, distByteSize) = dists
.filter(d => d._2 <= col._1 && d._1.contains(filterBy))
.last
profileTableSb.append(f" | [${Constants.packageSizeToHuman(distSize, true)} " +
f"(${MarkdownUtil.formatSize(distByteSize)})]($distUrl)")

Files.writeString(outDir.resolve(f"profiles/doc/${name}_table.md"), profileTableSb.toString())

private def datasetMatchesRestrictions(dsRes: Resource, rs: Seq[Seq[(Property, RDFNode)]]): Boolean =
val andMatches = for r <- rs yield
val orMatches = for (p, o) <- r yield
if p == RdfUtil.hasDistributionType then
None
else if p == RdfUtil.staxHasStreamType then
Some(
dsRes.listProperties(RdfUtil.staxHasStreamTypeUsage).asScala
.exists(_.getResource.hasProperty(p, o))
)
else if !dsRes.hasProperty(p, o) then
Some(false)
else Some(true)

val orMatchesFlat = orMatches.flatten
orMatchesFlat.isEmpty || orMatchesFlat.contains(true)
!andMatches.contains(false)

private def generateDatasetOverview(datasetCollection: DatasetCollection, outDir: Path): Unit =
val sb = new StringBuilder()
sb.append("Dataset | <abbr title=\"Stream type\">El. type</abbr> | " +
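
For reference, `RdfUtil.mergeModels` (used above to build the dump) is a repo-internal helper. A plausible minimal equivalent in plain Jena could look like the sketch below; this is an assumption about its behavior, not the actual implementation:

```scala
import org.apache.jena.rdf.model.{Model, ModelFactory}

// Hypothetical stand-in for RdfUtil.mergeModels: union all statements and
// namespace prefixes into one fresh model, suitable for a metadata dump.
def mergeModels(models: Seq[Model]): Model =
  val merged = ModelFactory.createDefaultModel()
  for m <- models do
    merged.add(m) // copies every statement of m into merged
    merged.setNsPrefixes(m.getNsPrefixMap) // keep prefixes for readable output
  merged
```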
29 changes: 29 additions & 0 deletions src/main/scala/util/collection/CategoryCollection.scala
@@ -0,0 +1,29 @@
package io.github.riverbench.ci_worker
package util.collection

import org.apache.jena.rdf.model.Model
import org.apache.jena.riot.RDFDataMgr

object CategoryCollection:
def fromReleases(names: Iterable[String], version: String): CategoryCollection =
val categories = names.map { name =>
if version == "latest" then
(name, s"https://github.com/RiverBench/category-$name/releases/latest/download/metadata.ttl")
else
(name, s"https://github.com/RiverBench/category-$name/releases/download/$version/metadata.ttl")
}
CategoryCollection(categories)

class CategoryCollection(namesToUris: Iterable[(String, String)]):
val categories: Map[String, Model] = namesToUris
.map((name, uri) => {
try {
(name, RDFDataMgr.loadModel(uri))
}
catch {
case e: Exception =>
println(f"Failed to load metadata for category $name from $uri")
throw e
}
})
.toMap
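
A usage sketch for the new class (cf. the call site in `PackageMainCommand` above). The category names here are made up for illustration; note that a single failed download aborts the whole run, because the constructor rethrows after logging:

```scala
// Hypothetical category names; the real ones come from the categories/
// directory of the main repository.
val categories = CategoryCollection.fromReleases(Seq("flat", "stream"), "dev")

// Each category's release metadata is exposed as a Jena Model:
for (name, model) <- categories.categories do
  println(s"$name: ${model.size()} statements")
```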
3 changes: 2 additions & 1 deletion src/main/scala/util/DatasetCollection.scala
@@ -1,6 +1,7 @@
package io.github.riverbench.ci_worker
package util

import org.apache.jena.rdf.model.Model
import org.apache.jena.riot.RDFDataMgr

import java.nio.file.Path
Expand All @@ -25,7 +26,7 @@ object DatasetCollection:
DatasetCollection(datasets)

class DatasetCollection(namesToUris: Iterable[(String, String)]):
val datasets = namesToUris
val datasets: Map[String, Model] = namesToUris
.map((name, uri) => {
try {
(name, RDFDataMgr.loadModel(uri))
4 changes: 3 additions & 1 deletion src/main/scala/util/collection/ProfileCollection.scala
@@ -1,5 +1,7 @@
package io.github.riverbench.ci_worker
package util
package util.collection

import util.{AppConfig, RdfUtil}

import org.apache.jena.rdf.model.{Model, ModelFactory}
import org.apache.jena.riot.RDFDataMgr