Update main repo jobs for the category system
For now, the dumps do not include categories, profiles, or tasks. This will be fixed in the next commit.
Ostrzyciel committed May 15, 2024
1 parent bc19a96 commit 29f8815
Showing 7 changed files with 56 additions and 243 deletions.
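
In short: `PackageMainCommand` no longer packages profiles itself. Instead, it discovers category directories in the main repo and pulls each category's metadata from its own GitHub release, mirroring how datasets are handled. A condensed sketch of the resulting flow, using the names from the diffs below (not a drop-in excerpt; `version`, `categoryNames`, `datasetNames`, and `mainModel` are assumed to be in scope as in `PackageMainCommand`):

```scala
// Condensed from the PackageMainCommand diff below.
val datasetsVersion = if version == "dev" then "dev" else "latest"

// Categories and datasets are both resolved from per-repository releases.
val categoryCollection = CategoryCollection.fromReleases(categoryNames, datasetsVersion)
val datasetCollection = DatasetCollection.fromReleases(datasetNames, datasetsVersion)

// The RDF dump currently merges only the main model and the datasets;
// categories, profiles, and tasks are to follow (see the commit description).
val dumpModel = RdfUtil.mergeModels(Seq(mainModel) ++ datasetCollection.datasets.values)
```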
1 change: 1 addition & 0 deletions src/main/scala/commands/CategoryDocGenCommand.scala
@@ -2,6 +2,7 @@ package io.github.riverbench.ci_worker
package commands

import util.*
import util.collection.*
import util.doc.*

import org.apache.jena.rdf.model.{Model, Property, Resource}
86 changes: 1 addition & 85 deletions src/main/scala/commands/MainDocGenCommand.scala
@@ -26,14 +26,8 @@ object MainDocGenCommand extends Command:
val schemaRepoDir = FileSystems.getDefault.getPath(args(4))
val outDir = FileSystems.getDefault.getPath(args(5))

println("Generating profile documentation...")
val profileCollection = new ProfileCollection(mainMetadataOutDir.resolve("profiles"))
val ontologies = RdfIoUtil.loadOntologies(schemaRepoDir)
profileDocGen(profileCollection, ontologies, mainMetadataOutDir, outDir, version)
if version == "dev" then
profileOverviewDocGen(profileCollection, outDir)

println("Generating main documentation...")
val ontologies = RdfIoUtil.loadOntologies(schemaRepoDir)
val mainMetadata = RDFDataMgr.loadModel(mainMetadataOutDir.resolve("metadata.ttl").toString)
val mainDocOpt = DocBuilder.Options(
titleProps = Seq(
@@ -87,84 +81,6 @@
)
}

private def profileDocGen(
profileCollection: ProfileCollection, ontologies: Model, metadataOutDir: Path, outDir: Path, version: String
): Unit =
val profileDocOpt = DocBuilder.Options(
titleProps = Seq(
RdfUtil.dctermsTitle,
RDFS.label,
RDF.`type`,
),
hidePropsInLevel = Seq(
(1, RdfUtil.dctermsDescription), // shown as content below the header
(1, RDF.`type`), // Always the same
),
defaultPropGroup = Some("General information"),
)
val profileDocBuilder = new DocBuilder(ontologies, profileDocOpt)
outDir.resolve("profiles").toFile.mkdirs()

for (name, profile) <- profileCollection.profiles do
val profileRes = profile.listSubjectsWithProperty(RDF.`type`, RdfUtil.Profile).next.asResource
val description = RdfUtil.getString(profileRes, RdfUtil.dctermsDescription) getOrElse ""
val profileDoc = profileDocBuilder.build(
s"$name (${readableVersion(version)})",
description + rdfInfo(PurlMaker.profile(name, version)),
profileRes
)
val tableSection =
"""
|## Download links
|
|Below you will find links to download the profile's datasets in different lengths.
|
|!!! warning
| Some datasets are shorter than others and a given distribution may not be available for all datasets.
| In that case, a link to the longest available distribution of the dataset is provided.
|
|""".stripMargin +
Files.readString(metadataOutDir.resolve(f"profiles/doc/${name}_table.md"))
val profileDocPath = outDir.resolve(s"profiles/$name.md")
Files.writeString(profileDocPath, profileDoc.toMarkdown + tableSection)

private def profileOverviewDocGen(profileCollection: ProfileCollection, outDir: Path): Unit =
def taxoLink(text: String, name: String) =
f"[$text](${Constants.taxonomyDocBaseLink}$name-stream)"

val sb = new StringBuilder()
sb.append("Profile | Stream type | RDF-star | Non-standard extensions\n")
sb.append("--- | --- | :-: | :-:\n")
for pName <- profileCollection.profiles.keys.toSeq.sorted do
val nameSplit = pName.split('-')
sb.append(f"[$pName]($pName/dev) | ")
sb.append(
(nameSplit(0), nameSplit(1)) match
case ("flat", "mixed") => "flat " + taxoLink("triple", "flat-rdf-triple") +
" or " + taxoLink("quad", "flat-rdf-quad")
case ("flat", t) => taxoLink("flat " + t.dropRight(1), "flat-rdf-" + t.dropRight(1))
case ("stream", "mixed") => taxoLink("dataset", "rdf-dataset") + " or " +
taxoLink("graph", "rdf-graph")
case ("stream", "datasets") => taxoLink("dataset", "rdf-dataset")
case ("stream", "named") => taxoLink("named graph", "rdf-named-graph")
case ("stream", "ts") => taxoLink("timestamped named graph", "timestamped-rdf-named-graph")
case ("stream", "subject") => taxoLink("subject graph", "rdf-subject-graph")
case _ => taxoLink("graph", "rdf-graph")
)
for restriction <- Seq("rdfstar", "nonstandard") do
sb.append(" | ")
sb.append(
if pName.contains(restriction) then ":material-check:"
else ":material-close:"
)
sb.append("\n")

Files.writeString(outDir.resolve("profiles/table.md"), sb.toString())

private def readableVersion(v: String) =
if v == "dev" then "development version" else v

private val staticContentWarn =
"""
|<!--
3 changes: 2 additions & 1 deletion src/main/scala/commands/PackageCategoryCommand.scala
@@ -1,8 +1,9 @@
package io.github.riverbench.ci_worker
package commands

import util.doc.MarkdownUtil
import util.*
import util.collection.*
import util.doc.MarkdownUtil

import org.apache.jena.rdf.model.{Model, ModelFactory, Property, Resource}
import org.apache.jena.riot.RDFDataMgr
173 changes: 18 additions & 155 deletions src/main/scala/commands/PackageMainCommand.scala
@@ -1,8 +1,9 @@
package io.github.riverbench.ci_worker
package commands

import util.doc.MarkdownUtil
import util.*
import util.collection.*
import util.doc.MarkdownUtil

import org.apache.jena.rdf.model.{Model, Property, RDFNode, Resource}
import org.apache.jena.riot.RDFDataMgr
@@ -28,14 +29,18 @@ object PackageMainCommand extends Command:
val repoDir = FileSystems.getDefault.getPath(args(2))
val outDir = FileSystems.getDefault.getPath(args(3))

println("Loading profiles...")
val profileCollection = new ProfileCollection(repoDir.resolve("profiles"))
val datasetsVersion = if version == "dev" then "dev" else "latest"

println("Fetching categories...")
val categoryNames = repoDir.resolve("categories").toFile.listFiles()
.filter(_.isDirectory)
.map(_.getName)
val categoryCollection = CategoryCollection.fromReleases(categoryNames, datasetsVersion)

println("Fetching datasets...")
val datasetNames = repoDir.resolve("datasets").toFile.listFiles()
.filter(_.isDirectory)
.map(_.getName)
val datasetsVersion = if version == "dev" then "dev" else "latest"
val datasetCollection = DatasetCollection.fromReleases(datasetNames, datasetsVersion)

// Prepare main model
@@ -44,62 +49,32 @@
val mainVer = if version == "dev" then "" else "v/" + version
val newMainRes = mainModel.createResource(AppConfig.CiWorker.rbRootUrl + mainVer)

println("Processing profiles...")
outDir.resolve("profiles/doc").toFile.mkdirs()
val subSupModel = profileCollection.getSubSuperAssertions

def getProfileUri(name: String) = AppConfig.CiWorker.baseDevProfileUrl + name + "/" + version

for (name, profileModel) <- profileCollection.profiles do
// Add version tags to URIs
val oldRes = profileModel.createResource(AppConfig.CiWorker.baseDevProfileUrl + name)
val newRes = profileModel.createResource(getProfileUri(name))
RdfUtil.renameResource(oldRes, newRes, profileModel)
RdfUtil.renameResource(oldRes, newRes, subSupModel)

for (name, profileModel) <- profileCollection.profiles do
// Add inferred properties
val res = subSupModel.createResource(getProfileUri(name))
profileModel.removeAll(res, RdfUtil.isSupersetOfProfile, null)
profileModel.add(subSupModel.listStatements(res, null, null))
// Version metadata
profileModel.add(res, RdfUtil.hasVersion, version)
profileModel.add(res, RdfUtil.dcatInCatalog, newMainRes)
// Link datasets to profiles
linkProfileAndDatasets(name, profileModel, res, datasetCollection, outDir)
// Prettify
profileModel.removeNsPrefix("")

println("Processing main metadata...")
RdfUtil.renameResource(oldMainRes, newMainRes, mainModel)
newMainRes.addProperty(RdfUtil.foafHomepage, newMainRes)
newMainRes.addProperty(RdfUtil.hasVersion, version)

// Add links to datasets and profiles
for (name, _) <- profileCollection.profiles do
val profileRes = mainModel.createResource(getProfileUri(name))
newMainRes.addProperty(RdfUtil.hasProfile, profileRes)
// Add links to datasets and categories
for ((_, dsModel) <- datasetCollection.datasets) do
val dsRes = dsModel.listSubjectsWithProperty(RDF.`type`, RdfUtil.Dataset).next.asResource
newMainRes.addProperty(RdfUtil.dcatDataset, dsRes)
for ((_, catModel) <- categoryCollection.categories) do
val catRes = catModel.listSubjectsWithProperty(RDF.`type`, RdfUtil.Category).next.asResource
newMainRes.addProperty(RdfUtil.hasCategory, catRes)

val allModels = profileCollection.profiles.values ++
Seq(mainModel) ++
// Generate RDF dump of all metadata
val allModels = Seq(mainModel) ++
datasetCollection.datasets.values
val dumpModel = RdfUtil.mergeModels(allModels.toSeq)
// TODO: dump should include categories, tasks, profiles...
// This stuff must start in the category repos.
val dumpModel = RdfUtil.mergeModels(allModels)

if version == "dev" then
// Generate dataset overview
println("Generating dataset overview...")
generateDatasetOverview(datasetCollection, outDir)

// Write to files
println("Writing profiles...")
for (name, profileModel) <- profileCollection.profiles do
for (ext, format) <- Constants.outputFormats do
val outFile = outDir.resolve(f"profiles/$name.$ext").toFile
RDFDataMgr.write(new FileOutputStream(outFile), profileModel, format)

println("Writing main metadata...")
for (ext, format) <- Constants.outputFormats do
val mainOutFile = outDir.resolve(f"metadata.$ext").toFile
@@ -115,118 +90,6 @@
println("Done.")
}

private def linkProfileAndDatasets(
name: String, profile: Model, profileRes: Resource, datasetCollection: DatasetCollection, outDir: Path
): Unit =
val restrictions = profile.listObjectsOfProperty(profileRes, RdfUtil.hasRestriction).asScala
.map(rNode => {
val rRes = rNode.asResource()
val props = rRes.listProperties().asScala
.map(p => (p.getPredicate, p.getObject))
.toSeq
props
})
.toSeq

val distTypes = restrictions.flatten.filter(_._1 == RdfUtil.hasDistributionType)
.map(_._2.asResource())

if distTypes.isEmpty then
throw new Exception(s"No distribution types specified in profile $name")

val profileTableSb = StringBuilder()
// name, dataset uri, Seq(dist download url, size, byte size)
val datasets: mutable.ArrayBuffer[(String, Resource, Seq[(String, Long, Long)])] = mutable.ArrayBuffer()

for ((name, dsModel) <- datasetCollection.datasets) do
if dsModel.isEmpty then
throw new Exception(f"Dataset $name is empty – does it have a matching release?")
val dsRes = dsModel.listSubjectsWithProperty(RDF.`type`, RdfUtil.Dataset).asScala.toSeq.headOption
dsRes match
case None => throw new Exception(f"Could not find the root resource for dataset $name")
case Some(dsRes) =>
if datasetMatchesRestrictions(dsRes, restrictions) then
profile.add(profileRes, RdfUtil.dcatSeriesMember, dsRes)
val distributions = dsRes.listProperties(RdfUtil.dcatDistribution).asScala
.map(_.getObject.asResource())
.filter(d => distTypes.exists(dt => d.hasProperty(RdfUtil.hasDistributionType, dt)))
.map(distRes => {
val downloadUrl = distRes.getProperty(RdfUtil.dcatDownloadURL).getObject.asResource().getURI
val size = distRes.getProperty(RdfUtil.hasStreamElementCount).getObject.asLiteral().getLong
val byteSize = distRes.getProperty(RdfUtil.dcatByteSize).getObject.asLiteral().getLong
(downloadUrl, size, byteSize)
})
.toSeq
.sortBy(_._2)
datasets.append((name, dsRes, distributions))

val columns = datasets
.flatMap(_._3.map(_._2))
.map(c => {
if Constants.packageSizes.contains(c) then
(c, Constants.packageSizeToHuman(c))
else
(Long.MaxValue, "Full")
})
.distinct
.sortBy(_._1)

if name.contains("flat") then
writeTable("")
else
profileTableSb.append(
"""!!! note
|
| For stream profiles, there are two available types of distributions: plain streaming, and streaming in the Jelly format. See the [documentation](../../documentation/dataset-release-format.md) for details.
|
|### Plain streaming distributions
|
|""".stripMargin)
writeTable("tar.gz")
profileTableSb.append(
"""
|
|### Jelly streaming distributions
|
|""".stripMargin)
writeTable("jelly.gz")

def writeTable(filterBy: String): Unit =
profileTableSb.append("Dataset")
for col <- columns do
profileTableSb.append(f" | ${col._2}")
profileTableSb.append("\n---")
profileTableSb.append(" | ---" * columns.size)

for (dsName, dsUri, dists) <- datasets.sortBy(_._1) do
profileTableSb.append(f"\n[$dsName]($dsUri)")
for col <- columns do
val (distUrl, distSize, distByteSize) = dists
.filter(d => d._2 <= col._1 && d._1.contains(filterBy))
.last
profileTableSb.append(f" | [${Constants.packageSizeToHuman(distSize, true)} " +
f"(${MarkdownUtil.formatSize(distByteSize)})]($distUrl)")

Files.writeString(outDir.resolve(f"profiles/doc/${name}_table.md"), profileTableSb.toString())

private def datasetMatchesRestrictions(dsRes: Resource, rs: Seq[Seq[(Property, RDFNode)]]): Boolean =
val andMatches = for r <- rs yield
val orMatches = for (p, o) <- r yield
if p == RdfUtil.hasDistributionType then
None
else if p == RdfUtil.staxHasStreamType then
Some(
dsRes.listProperties(RdfUtil.staxHasStreamTypeUsage).asScala
.exists(_.getResource.hasProperty(p, o))
)
else if !dsRes.hasProperty(p, o) then
Some(false)
else Some(true)

val orMatchesFlat = orMatches.flatten
orMatchesFlat.isEmpty || orMatchesFlat.contains(true)
!andMatches.contains(false)

private def generateDatasetOverview(datasetCollection: DatasetCollection, outDir: Path): Unit =
val sb = new StringBuilder()
sb.append("Dataset | <abbr title=\"Stream type\">El. type</abbr> | " +
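
For reference, `RdfUtil.mergeModels` (used above to build the dump) is a repo-internal helper. A plausible minimal equivalent in plain Jena could look like the sketch below; this is an assumption about its behavior, not the actual implementation:

```scala
import org.apache.jena.rdf.model.{Model, ModelFactory}

// Hypothetical stand-in for RdfUtil.mergeModels: union all statements and
// namespace prefixes into one fresh model, suitable for a metadata dump.
def mergeModels(models: Seq[Model]): Model =
  val merged = ModelFactory.createDefaultModel()
  for m <- models do
    merged.add(m) // copies every statement of m into merged
    merged.setNsPrefixes(m.getNsPrefixMap) // keep prefixes for readable output
  merged
```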
29 changes: 29 additions & 0 deletions src/main/scala/util/collection/CategoryCollection.scala
@@ -0,0 +1,29 @@
package io.github.riverbench.ci_worker
package util.collection

import org.apache.jena.rdf.model.Model
import org.apache.jena.riot.RDFDataMgr

object CategoryCollection:
def fromReleases(names: Iterable[String], version: String): CategoryCollection =
val categories = names.map { name =>
if version == "latest" then
(name, s"https://github.com/RiverBench/category-$name/releases/latest/download/metadata.ttl")
else
(name, s"https://github.com/RiverBench/category-$name/releases/download/$version/metadata.ttl")
}
CategoryCollection(categories)

class CategoryCollection(namesToUris: Iterable[(String, String)]):
val categories: Map[String, Model] = namesToUris
.map((name, uri) => {
try {
(name, RDFDataMgr.loadModel(uri))
}
catch {
case e: Exception =>
println(f"Failed to load metadata for category $name from $uri")
throw e
}
})
.toMap
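
A usage sketch for the new class (cf. the call site in `PackageMainCommand` above). The category names here are made up for illustration; note that a single failed download aborts the whole run, because the constructor rethrows after logging:

```scala
// Hypothetical category names; the real ones come from the categories/
// directory of the main repository.
val categories = CategoryCollection.fromReleases(Seq("flat", "stream"), "dev")

// Each category's release metadata is exposed as a Jena Model:
for (name, model) <- categories.categories do
  println(s"$name: ${model.size()} statements")
```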
3 changes: 2 additions & 1 deletion src/main/scala/util/DatasetCollection.scala
@@ -1,6 +1,7 @@
package io.github.riverbench.ci_worker
package util

import org.apache.jena.rdf.model.Model
import org.apache.jena.riot.RDFDataMgr

import java.nio.file.Path
Expand All @@ -25,7 +26,7 @@ object DatasetCollection:
DatasetCollection(datasets)

class DatasetCollection(namesToUris: Iterable[(String, String)]):
val datasets = namesToUris
val datasets: Map[String, Model] = namesToUris
.map((name, uri) => {
try {
(name, RDFDataMgr.loadModel(uri))
4 changes: 3 additions & 1 deletion src/main/scala/util/collection/ProfileCollection.scala
@@ -1,5 +1,7 @@
package io.github.riverbench.ci_worker
package util
package util.collection

import util.{AppConfig, RdfUtil}

import org.apache.jena.rdf.model.{Model, ModelFactory}
import org.apache.jena.riot.RDFDataMgr