From fff572f546927dd0f014a5adbc4d4d0e57020789 Mon Sep 17 00:00:00 2001
From: Ostrzyciel
Date: Thu, 18 Jul 2024 23:10:52 +0200
Subject: [PATCH] Maybe it works?

---
 .../eu/ostrzyciel/jelly/core/NodeEncoder.java | 256 ++++++++++++++
 .../jelly/core/NameEncoderSpec.scala          | 197 ----------
 .../jelly/core/NodeEncoderSpec.scala          | 340 ++++++++++++++++++
 3 files changed, 596 insertions(+), 197 deletions(-)
 create mode 100644 core/src/main/java/eu/ostrzyciel/jelly/core/NodeEncoder.java
 delete mode 100644 core/src/test/scala/eu/ostrzyciel/jelly/core/NameEncoderSpec.scala
 create mode 100644 core/src/test/scala/eu/ostrzyciel/jelly/core/NodeEncoderSpec.scala

diff --git a/core/src/main/java/eu/ostrzyciel/jelly/core/NodeEncoder.java b/core/src/main/java/eu/ostrzyciel/jelly/core/NodeEncoder.java
new file mode 100644
index 00000000..d6f0b941
--- /dev/null
+++ b/core/src/main/java/eu/ostrzyciel/jelly/core/NodeEncoder.java
@@ -0,0 +1,256 @@
+package eu.ostrzyciel.jelly.core;
+
+import eu.ostrzyciel.jelly.core.proto.v1.*;
+import scala.collection.mutable.ListBuffer;
+
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.function.Function;
+import java.util.function.Supplier;
+
+/**
+ * Encodes RDF nodes into Jelly protobuf terms, appending lookup entry rows where needed.
+ *
+ * <p>Datatype literals and IRIs refer to entries of the datatype, name and prefix lookups.
+ * Their cache entries store the ID ("pointer") and serial of each lookup entry they refer
+ * to; a cached encoding is reused only while the lookup still holds the same serial.
+ *
+ * @param <TNode> type of the RDF nodes used as cache keys
+ */
+public class NodeEncoder<TNode> {
+    /**
+     * Cache entry for a node whose encoding depends on lookup entries.
+     * A pointer is the ID of a lookup entry; a serial is the value read for that entry when
+     * the node was encoded. {@code encoded} may be null until the caller fills it in.
+     */
+    public static final class DependentNode {
+        public RdfTerm encoded;
+        // 1: datatypes and IRI names
+        public int lookupPointer1;
+        public int lookupSerial1;
+        // 2: IRI prefixes
+        public int lookupPointer2;
+        public int lookupSerial2;
+    }
+
+    /**
+     * Map bounded to {@code maxSize} entries: adding one more removes the eldest entry.
+     * LinkedHashMap's default insertion ordering is kept, so eviction is first-in,
+     * first-out and lookups do not refresh an entry.
+     */
+    private static final class NodeCache<K, V> extends LinkedHashMap<K, V> {
+        private final int maxSize;
+
+        public NodeCache(int maxSize) {
+            this.maxSize = maxSize;
+        }
+
+        @Override
+        protected boolean removeEldestEntry(java.util.Map.Entry<K, V> eldest) {
+            return size() > maxSize;
+        }
+    }
+
+    final int maxPrefixTableSize;
+    // IDs written by the previous IRI; outputIri() compares against them to emit zeros.
+    int lastIriNameId;
+    // -1000: presumably never a valid lookup ID, so the first IRI skips the same-prefix branch.
+    int lastIriPrefixId = -1000;
+
+    NewEncoderLookup datatypeLookup;
+    // Stays null when the prefix table is disabled (maxPrefixTableSize == 0).
+    NewEncoderLookup prefixLookup;
+    NewEncoderLookup nameLookup;
+    // Keyed by TNode (datatype literals) and by String (IRIs).
+    NodeCache<Object, DependentNode> dependentNodeCache;
+    NodeCache<TNode, RdfTerm> nodeCache;
+
+    // Shared instance returned when both the prefix ID and the name ID are written as zero.
+    static final RdfIri zeroIri = new RdfIri(0, 0);
+
+    /**
+     * Creates an encoder whose lookups are sized from the stream options.
+     *
+     * @param opt stream options providing the maximum datatype, prefix and name table sizes
+     * @param nodeCacheSize maximum number of entries in the cache used by encodeOther
+     * @param dependentNodeCacheSize maximum number of cached IRIs and datatype literals
+     */
+    public NodeEncoder(RdfStreamOptions opt, int nodeCacheSize, int dependentNodeCacheSize) {
+        datatypeLookup = new NewEncoderLookup(opt.maxDatatypeTableSize());
+        this.maxPrefixTableSize = opt.maxPrefixTableSize();
+        if (maxPrefixTableSize > 0) {
+            prefixLookup = new NewEncoderLookup(maxPrefixTableSize);
+        }
+        nameLookup = new NewEncoderLookup(opt.maxNameTableSize());
+        dependentNodeCache = new NodeCache<>(dependentNodeCacheSize);
+        nodeCache = new NodeCache<>(nodeCacheSize);
+    }
+
+    /**
+     * Looks up or registers the datatype of a literal and returns the literal's cache entry.
+     *
+     * <p>If the returned entry has {@code encoded == null}, the caller must set it to a new
+     * RdfLiteral built with {@code lookupPointer1} as the datatype ID. Otherwise
+     * {@code encoded} can be written on the wire as is.
+     *
+     * @param key the literal node, used as the cache key
+     * @param rowsBuffer buffer that receives a datatype entry row when a new entry is added
+     * @param dtSupplier supplies the datatype name; called only when the cache cannot be used
+     * @return the cache entry for this literal
+     */
+    public DependentNode encodeDtLiteral(TNode key, ListBuffer<RdfStreamRow> rowsBuffer, Supplier<String> dtSupplier) {
+        var cachedNode = dependentNodeCache.get(key);
+        if (cachedNode != null) {
+            // Check if the value is still valid
+            if (cachedNode.lookupSerial1 == datatypeLookup.table[cachedNode.lookupPointer1 * 3 + 2]) {
+                datatypeLookup.onAccess(cachedNode.lookupPointer1);
+                return cachedNode;
+            }
+            cachedNode.encoded = null;
+        } else {
+            cachedNode = new DependentNode();
+            // We can already put the node in the map, we will update it later using our reference
+            dependentNodeCache.put(key, cachedNode);
+        }
+
+        // The node is not encoded, but we may already have the datatype encoded
+        var datatypeName = dtSupplier.get();
+        var dtEntry = datatypeLookup.addEntry(datatypeName);
+        if (dtEntry.newEntry) {
+            rowsBuffer.append(new RdfStreamRow(
+                new RdfStreamRow$Row$Datatype(
+                    new RdfDatatypeEntry(dtEntry.setId, datatypeName)
+                )
+            ));
+        }
+        cachedNode.lookupPointer1 = dtEntry.getId;
+        cachedNode.lookupSerial1 = dtEntry.serial;
+
+        return cachedNode;
+    }
+
+    /**
+     * Encodes an IRI, appending prefix and name entry rows for newly added lookup entries.
+     *
+     * <p>With the prefix table enabled, the IRI is split after the first '#' found at index 8
+     * or later, or otherwise after the last '/'; without either, the prefix is empty.
+     *
+     * @param iri the IRI to encode
+     * @param rowsBuffer buffer that receives the new lookup entry rows
+     * @return the IRI term; a zero ID means "same prefix" or "previous name ID + 1"
+     */
+    public RdfTerm encodeIri(String iri, ListBuffer<RdfStreamRow> rowsBuffer) {
+        var cachedNode = dependentNodeCache.get(iri);
+        if (cachedNode != null) {
+            // Check if the value is still valid
+            if (cachedNode.lookupSerial1 == nameLookup.table[cachedNode.lookupPointer1 * 3 + 2]) {
+                if (cachedNode.lookupPointer2 == 0) {
+                    nameLookup.onAccess(cachedNode.lookupPointer1);
+                    // TODO: fast path for no prefixes? or it may be just an empty prefix... consider
+                    return outputIri(cachedNode);
+                } else if (cachedNode.lookupSerial2 == prefixLookup.table[cachedNode.lookupPointer2 * 3 + 2]) {
+                    nameLookup.onAccess(cachedNode.lookupPointer1);
+                    prefixLookup.onAccess(cachedNode.lookupPointer2);
+                    return outputIri(cachedNode);
+                }
+            }
+        } else {
+            cachedNode = new DependentNode();
+            dependentNodeCache.put(iri, cachedNode);
+        }
+
+        // Fast path for no prefixes
+        if (this.maxPrefixTableSize == 0) {
+            var nameEntry = nameLookup.addEntry(iri);
+            if (nameEntry.newEntry) {
+                rowsBuffer.append(new RdfStreamRow(
+                    new RdfStreamRow$Row$Name(new RdfNameEntry(nameEntry.setId, iri))
+                ));
+            }
+            cachedNode.lookupPointer1 = nameEntry.getId;
+            cachedNode.lookupSerial1 = nameEntry.serial;
+            cachedNode.encoded = new RdfIri(0, nameEntry.getId);
+            if (lastIriNameId + 1 == nameEntry.getId) {
+                lastIriNameId = nameEntry.getId;
+                return zeroIri;
+            } else {
+                lastIriNameId = nameEntry.getId;
+                return cachedNode.encoded;
+            }
+        }
+
+        // Slow path, with splitting out the prefix
+        int i = iri.indexOf('#', 8);
+        String prefix;
+        String postfix;
+        if (i == -1) {
+            i = iri.lastIndexOf('/');
+            if (i != -1) {
+                prefix = iri.substring(0, i + 1);
+                postfix = iri.substring(i + 1);
+            } else {
+                prefix = "";
+                postfix = iri;
+            }
+        } else {
+            prefix = iri.substring(0, i + 1);
+            postfix = iri.substring(i + 1);
+        }
+
+        var prefixEntry = prefixLookup.addEntry(prefix);
+        var nameEntry = nameLookup.addEntry(postfix);
+        if (prefixEntry.newEntry) {
+            rowsBuffer.append(new RdfStreamRow(
+                new RdfStreamRow$Row$Prefix(new RdfPrefixEntry(prefixEntry.setId, prefix))
+            ));
+        }
+        if (nameEntry.newEntry) {
+            rowsBuffer.append(new RdfStreamRow(
+                new RdfStreamRow$Row$Name(new RdfNameEntry(nameEntry.setId, postfix))
+            ));
+        }
+        cachedNode.lookupPointer1 = nameEntry.getId;
+        cachedNode.lookupSerial1 = nameEntry.serial;
+        cachedNode.lookupPointer2 = prefixEntry.getId;
+        cachedNode.lookupSerial2 = prefixEntry.serial;
+        cachedNode.encoded = new RdfIri(prefixEntry.getId, nameEntry.getId);
+        return outputIri(cachedNode);
+    }
+
+    /**
+     * Builds the term to write for an IRI and updates the last-used prefix and name IDs.
+     * The prefix ID is replaced with zero if it equals the previous one, and the name ID
+     * is replaced with zero if it equals the previous name ID plus one.
+     */
+    private RdfTerm outputIri(DependentNode cachedNode) {
+        if (lastIriPrefixId == cachedNode.lookupPointer2) {
+            if (lastIriNameId + 1 == cachedNode.lookupPointer1) {
+                lastIriNameId = cachedNode.lookupPointer1;
+                return zeroIri;
+            } else {
+                lastIriNameId = cachedNode.lookupPointer1;
+                return new RdfIri(0, cachedNode.lookupPointer1);
+            }
+        } else {
+            lastIriPrefixId = cachedNode.lookupPointer2;
+            if (lastIriNameId + 1 == cachedNode.lookupPointer1) {
+                lastIriNameId = cachedNode.lookupPointer1;
+                return new RdfIri(cachedNode.lookupPointer2, 0);
+            } else {
+                lastIriNameId = cachedNode.lookupPointer1;
+                return cachedNode.encoded;
+            }
+        }
+    }
+
+    /**
+     * Returns the cached encoding of a node, computing and caching it on a miss.
+     *
+     * @param key the node to encode, used as the cache key
+     * @param encoder computes the encoded term for a node that is not cached
+     * @return the encoded term
+     */
+    public RdfTerm encodeOther(TNode key, Function<? super TNode, ? extends RdfTerm> encoder) {
+        return nodeCache.computeIfAbsent(key, encoder);
+    }
+}
diff --git a/core/src/test/scala/eu/ostrzyciel/jelly/core/NameEncoderSpec.scala b/core/src/test/scala/eu/ostrzyciel/jelly/core/NameEncoderSpec.scala
deleted file mode 100644
index 74e93ae2..00000000
--- a/core/src/test/scala/eu/ostrzyciel/jelly/core/NameEncoderSpec.scala
+++ /dev/null
@@ -1,197 +0,0 @@
-package eu.ostrzyciel.jelly.core
-
-import eu.ostrzyciel.jelly.core.proto.v1.*
-import org.scalatest.Inspectors
-import org.scalatest.matchers.should.Matchers
-import org.scalatest.wordspec.AnyWordSpec
-
-import scala.collection.mutable.ListBuffer
-
-class NameEncoderSpec extends AnyWordSpec, Inspectors, Matchers:
-  def smallOptions(prefixTableSize: Int) = RdfStreamOptions(
-    maxNameTableSize = 4,
-    maxPrefixTableSize = prefixTableSize,
-    maxDatatypeTableSize = 8,
-  )
-
-  private def getEncoder(prefixTableSize: Int = 8): (NameEncoder, ListBuffer[RdfStreamRow]) =
-    val buffer = new ListBuffer[RdfStreamRow]()
-    (NameEncoder(smallOptions(prefixTableSize)), buffer)
-
-  "A NameEncoder" when {
-    "encoding datatypes" should {
-      "add a datatype" in {
-        val (encoder, buffer) = getEncoder()
-        val dt = encoder.encodeDatatype("dt1", buffer)
-        dt.value should be (1)
-        buffer.size should be (1)
-        buffer.head.row.isDatatype should be (true)
-        val dtEntry = buffer.head.row.datatype
-        dtEntry.value should be ("dt1")
-        dtEntry.id should be (0)
-      }
-
-      "add multiple datatypes and reuse existing ones" in {
-        val (encoder, buffer) = getEncoder()
-        for i <- 1 to 4 do
-          val dt = encoder.encodeDatatype(s"dt$i", buffer)
-          dt.value should be (i)
-
-        // "dt3" should be reused
-        val dt = encoder.encodeDatatype("dt3", buffer)
-        dt.value should be (3)
-
-        buffer.size should be (4)
-        buffer.map(_.row.datatype) should contain only (
-          RdfDatatypeEntry(0, "dt1"),
-          RdfDatatypeEntry(0, "dt2"),
-          RdfDatatypeEntry(0, "dt3"),
-          RdfDatatypeEntry(0, "dt4"),
-        )
-      }
-
-      "add datatypes evicting old ones" in {
-        val (encoder, buffer) = getEncoder()
-        for i <- 1 to 12 do
-          val dt = encoder.encodeDatatype(s"dt$i", buffer)
-          // first 4 should be evicted
-          dt.value should be ((i - 1) % 8 + 1)
-
-        for i <- 9 to 12 do
-          val dt = encoder.encodeDatatype(s"dt$i", buffer)
-          dt.value should be (i - 8)
-
-        for i <- 5 to 8 do
-          val dt = encoder.encodeDatatype(s"dt$i", buffer)
-          dt.value should be (i)
-
-        // 5–8 were used last, so they should be evicted last
-        for i <- 13 to 16 do
-          val dt = encoder.encodeDatatype(s"dt$i", buffer)
-          dt.value should be (i - 12) // 1–4
-
-        buffer.size should be (16)
-        val expectedIds = Array.from(
-          Iterable.fill(8)(0) ++ Seq(1) ++ Iterable.fill(3)(0) ++ Seq(1) ++ Iterable.fill(3)(0)
-        )
-        for (r, i) <- buffer.zipWithIndex do
-          val dt = r.row.datatype
-          dt.id should be (expectedIds(i))
-          dt.value should be (s"dt${i + 1}")
-      }
-    }
-
-    "encoding IRIs" should {
-      "add a full IRI" in {
-        val (encoder, buffer) = getEncoder()
-        val iri = encoder.encodeIri("https://test.org/Cake", buffer)
-        iri.nameId should be (0)
-        iri.prefixId should be (1)
-
-        buffer.size should be (2)
-        buffer should contain (RdfStreamRow(RdfStreamRow.Row.Prefix(
-          RdfPrefixEntry(id = 0, value = "https://test.org/")
-        )))
-        buffer should contain (RdfStreamRow(RdfStreamRow.Row.Name(
-          RdfNameEntry(id = 0, value = "Cake")
-        )))
-      }
-
-      "add a prefix-only IRI" in {
-        val (encoder, buffer) = getEncoder()
-        val iri = encoder.encodeIri("https://test.org/test/", buffer)
-        iri.nameId should be (0)
-        iri.prefixId should be (1)
-
-        // an empty name entry still has to be allocated
-        buffer.size should be (2)
-        buffer should contain (RdfStreamRow(RdfStreamRow.Row.Prefix(
-          RdfPrefixEntry(id = 0, value = "https://test.org/test/")
-        )))
-        buffer should contain(RdfStreamRow(RdfStreamRow.Row.Name(
-          RdfNameEntry(id = 0, value = "")
-        )))
-      }
-
-      "add a name-only IRI" in {
-        val (encoder, buffer) = getEncoder()
-        val iri = encoder.encodeIri("testTestTest", buffer)
-        iri.nameId should be (0)
-        iri.prefixId should be (1)
-
-        // in the mode with the prefix table enabled, an empty prefix entry still has to be allocated
-        buffer.size should be (2)
-        buffer should contain(RdfStreamRow(RdfStreamRow.Row.Prefix(
-          RdfPrefixEntry(id = 0, value = "")
-        )))
-        buffer should contain (RdfStreamRow(RdfStreamRow.Row.Name(
-          RdfNameEntry(id = 0, value = "testTestTest")
-        )))
-      }
-
-      "add a full IRI in no-prefix table mode" in {
-        val (encoder, buffer) = getEncoder(0)
-        val iri = encoder.encodeIri("https://test.org/Cake", buffer)
-        iri.nameId should be (0)
-        iri.prefixId should be (0)
-
-        // in the no prefix mode, there must be no prefix entries
-        buffer.size should be (1)
-        buffer should contain (RdfStreamRow(RdfStreamRow.Row.Name(
-          RdfNameEntry(id = 0, value = "https://test.org/Cake")
-        )))
-      }
-
-      "add IRIs while evicting old ones" in {
-        val (encoder, buffer) = getEncoder(3)
-        val data = Seq(
-          // IRI, expected prefix ID, expected name ID
-          ("https://test.org/Cake1", 1, 0),
-          ("https://test.org#Cake1", 2, 1),
-          ("https://test.org/test/Cake1", 3, 1),
-          ("https://test.org/Cake2", 1, 0),
-          ("https://test.org#Cake2", 2, 2),
-          ("https://test.org/other/Cake1", 3, 1),
-          ("https://test.org/other/Cake2", 0, 0),
-          ("https://test.org/other/Cake3", 0, 0),
-          ("https://test.org/other/Cake4", 0, 0),
-          ("https://test.org/other/Cake5", 0, 1),
-          ("https://test.org#Cake2", 2, 0),
-          // prefix "" evicts the previous number #1
-          ("Cake2", 1, 2),
-        )
-
-        for (sIri, ePrefix, eName) <- data do
-          val iri = encoder.encodeIri(sIri, buffer)
-          iri.prefixId should be (ePrefix)
-          iri.nameId should be (eName)
-
-        val expectedBuffer = Seq(
-          // Prefix? (name otherwise), ID, value
-          (true, 0, "https://test.org/"),
-          (false, 0, "Cake1"),
-          (true, 0, "https://test.org#"),
-          (true, 0, "https://test.org/test/"),
-          (false, 0, "Cake2"),
-          (true, 3, "https://test.org/other/"),
-          (false, 0, "Cake3"),
-          (false, 0, "Cake4"),
-          (false, 1, "Cake5"),
-          (true, 1, ""),
-        )
-
-        buffer.size should be (expectedBuffer.size)
-        for ((isPrefix, eId, eVal), row) <- expectedBuffer.zip(buffer) do
-          if isPrefix then
-            row.row.isPrefix should be (true)
-            val prefix = row.row.prefix
-            prefix.id should be (eId)
-            prefix.value should be (eVal)
-          else
-            row.row.isName should be (true)
-            val name = row.row.name
-            name.id should be (eId)
-            name.value should be (eVal)
-      }
-    }
-  }
diff --git a/core/src/test/scala/eu/ostrzyciel/jelly/core/NodeEncoderSpec.scala b/core/src/test/scala/eu/ostrzyciel/jelly/core/NodeEncoderSpec.scala
new file mode 100644
index 00000000..c913abf2
--- /dev/null
+++ b/core/src/test/scala/eu/ostrzyciel/jelly/core/NodeEncoderSpec.scala
@@ -0,0 +1,340 @@
+package eu.ostrzyciel.jelly.core
+
+import eu.ostrzyciel.jelly.core.NodeEncoder.DependentNode
+import eu.ostrzyciel.jelly.core.helpers.Mrl
+import eu.ostrzyciel.jelly.core.proto.v1.*
+import org.scalatest.Inspectors
+import org.scalatest.matchers.should.Matchers
+import org.scalatest.wordspec.AnyWordSpec
+
+import scala.collection.mutable.ListBuffer
+import scala.util.Random
+
+class NodeEncoderSpec extends AnyWordSpec, Inspectors, Matchers:
+  def smallOptions(prefixTableSize: Int) = RdfStreamOptions(
+    maxNameTableSize = 4,
+    maxPrefixTableSize = prefixTableSize,
+    maxDatatypeTableSize = 8,
+  )
+
+  private def getEncoder(prefixTableSize: Int = 8): (NodeEncoder[Mrl.Node], ListBuffer[RdfStreamRow]) =
+    val buffer = new ListBuffer[RdfStreamRow]()
+    (NodeEncoder[Mrl.Node](smallOptions(prefixTableSize), 16, 16), buffer)
+
+  "A NodeEncoder" when {
+    "encoding datatype literals" should {
+      "encode a datatype literal" in {
+        val (encoder, buffer) = getEncoder()
+        val dn: DependentNode = encoder.encodeDtLiteral(
+          Mrl.DtLiteral("v1", Mrl.Datatype("dt1")),
+          buffer,
+          () => "dt1"
+        )
+        dn.encoded should be (null)
+        dn.lookupPointer1 should be (1)
+        dn.lookupSerial1 should be (1)
+        buffer.size should be (1)
+        buffer.head.row.isDatatype should be (true)
+        val dtEntry = buffer.head.row.datatype
+        dtEntry.value should be ("dt1")
+        dtEntry.id should be (0)
+      }
+
+      "encode multiple datatype literals and reuse existing datatypes" in {
+        val (encoder, buffer) = getEncoder()
+        for i <- 1 to 4 do
+          val dn = encoder.encodeDtLiteral(
+            Mrl.DtLiteral(s"v$i", Mrl.Datatype(s"dt$i")),
+            buffer,
+            () => s"dt$i"
+          )
+          dn.encoded should be (null)
+          dn.lookupPointer1 should be (i)
+          dn.lookupSerial1 should be (1)
+          dn.encoded = RdfLiteral(s"v$i", RdfLiteral.LiteralKind.Datatype(dn.lookupPointer1))
+
+        // "dt3" datatype should be reused
+        val dn = encoder.encodeDtLiteral(
+          Mrl.DtLiteral(s"v1000", Mrl.Datatype(s"dt3")),
+          buffer,
+          () => "dt3"
+        )
+        dn.encoded should be (null)
+        dn.lookupPointer1 should be (3)
+        dn.lookupSerial1 should be (1)
+
+        // "v2"^^<dt2> should be reused
+        val dn2 = encoder.encodeDtLiteral(
+          Mrl.DtLiteral("v2", Mrl.Datatype("dt2")),
+          buffer,
+          () => "dt2"
+        )
+        dn2.encoded should be (RdfLiteral("v2", RdfLiteral.LiteralKind.Datatype(2)))
+        dn2.lookupPointer1 should be (2)
+        dn2.lookupSerial1 should be (1)
+
+        buffer.size should be (4)
+        buffer.map(_.row.datatype) should contain only (
+          RdfDatatypeEntry(0, "dt1"),
+          RdfDatatypeEntry(0, "dt2"),
+          RdfDatatypeEntry(0, "dt3"),
+          RdfDatatypeEntry(0, "dt4"),
+        )
+      }
+
+      "encode datatype literals while evicting old datatypes" in {
+        val (encoder, buffer) = getEncoder()
+        for i <- 1 to 12 do
+          val dn = encoder.encodeDtLiteral(
+            Mrl.DtLiteral(s"v$i", Mrl.Datatype(s"dt$i")),
+            buffer,
+            () => s"dt$i"
+          )
+          // first 4 datatypes should be evicted
+          dn.lookupPointer1 should be ((i - 1) % 8 + 1)
+          dn.lookupSerial1 should be ((i - 1) / 8 + 1)
+
+        for i <- 9 to 12 do
+          val dn = encoder.encodeDtLiteral(
+            Mrl.DtLiteral(s"v$i", Mrl.Datatype(s"dt$i")),
+            buffer,
+            () => s"dt$i"
+          )
+          dn.lookupPointer1 should be (i - 8)
+          dn.lookupSerial1 should be (2)
+
+        for i <- 5 to 8 do
+          val dn = encoder.encodeDtLiteral(
+            Mrl.DtLiteral(s"v$i", Mrl.Datatype(s"dt$i")),
+            buffer,
+            () => s"dt$i"
+          )
+          dn.lookupPointer1 should be (i)
+          dn.lookupSerial1 should be (1)
+
+        // 5–8 were used last, so they should be evicted last
+        for i <- 13 to 16 do
+          val dn = encoder.encodeDtLiteral(
+            Mrl.DtLiteral(s"v$i", Mrl.Datatype(s"dt$i")),
+            buffer,
+            () => s"dt$i"
+          )
+          dn.lookupPointer1 should be (i - 12) // 1–4
+          dn.lookupSerial1 should be (3)
+
+        buffer.size should be (16)
+        val expectedIds = Array.from(
+          Iterable.fill(8)(0) ++ Seq(1) ++ Iterable.fill(3)(0) ++ Seq(1) ++ Iterable.fill(3)(0)
+        )
+        for (r, i) <- buffer.zipWithIndex do
+          val dt = r.row.datatype
+          dt.id should be (expectedIds(i))
+          dt.value should be (s"dt${i + 1}")
+      }
+
+      "reuse already encoded literals, evicting old ones" in {
+        val (encoder, buffer) = getEncoder()
+        for i <- 1 to 4; j <- 1 to 4 do
+          val dn = encoder.encodeDtLiteral(
+            Mrl.DtLiteral(s"v$i", Mrl.Datatype(s"dt$j")),
+            buffer,
+            () => s"dt$j"
+          )
+          dn.encoded should be (null)
+          dn.lookupPointer1 should be (j)
+          dn.lookupSerial1 should be (1)
+          dn.encoded = RdfLiteral(s"v$i", RdfLiteral.LiteralKind.Datatype(dn.lookupPointer1))
+
+        for _ <- 1 to 10 do
+          for i <- Random.shuffle(1 to 4); j <- Random.shuffle(1 to 4) do
+            val dn = encoder.encodeDtLiteral(
+              Mrl.DtLiteral(s"v$i", Mrl.Datatype(s"dt$j")),
+              buffer,
+              () => s"dt$j"
+            )
+            dn.encoded should be (RdfLiteral(s"v$i", RdfLiteral.LiteralKind.Datatype(j)))
+            dn.lookupPointer1 should be (j)
+            dn.lookupSerial1 should be (1)
+
+        // Add more literals to evict the old ones
+        for j <- 101 to 104 do
+          val dn = encoder.encodeDtLiteral(
+            Mrl.DtLiteral(s"v100", Mrl.Datatype(s"dt${j - 100}")),
+            buffer,
+            () => s"dt${j - 100}"
+          )
+          dn.encoded should be (null)
+          dn.lookupPointer1 should be (j - 100)
+          dn.lookupSerial1 should be (1)
+          dn.encoded = RdfLiteral(s"v100", RdfLiteral.LiteralKind.Datatype(dn.lookupPointer1))
+
+        // These entries should have been evicted, we will get nulls here
+        for j <- 1 to 4 do
+          val dn = encoder.encodeDtLiteral(
+            Mrl.DtLiteral(s"v1", Mrl.Datatype(s"dt$j")),
+            buffer,
+            () => s"dt$j"
+          )
+          dn.encoded should be (null)
+          dn.lookupPointer1 should be (j)
+          dn.lookupSerial1 should be (1)
+      }
+
+      "invalidate cached datatype literals when their datatypes are evicted" in {
+        val (encoder, buffer) = getEncoder()
+        for i <- 1 to 4 do
+          val dn = encoder.encodeDtLiteral(
+            Mrl.DtLiteral(s"v$i", Mrl.Datatype(s"dt$i")),
+            buffer,
+            () => s"dt$i"
+          )
+          dn.encoded should be (null)
+          dn.lookupPointer1 should be (i)
+          dn.lookupSerial1 should be (1)
+          dn.encoded = RdfLiteral(s"v$i", RdfLiteral.LiteralKind.Datatype(dn.lookupPointer1))
+
+        for i <- 5 to 12 do
+          val dn = encoder.encodeDtLiteral(
+            Mrl.DtLiteral(s"v$i", Mrl.Datatype(s"dt$i")),
+            buffer,
+            () => s"dt$i"
+          )
+          dn.encoded should be (null)
+          dn.lookupPointer1 should be ((i - 1) % 8 + 1)
+          dn.lookupSerial1 should be ((i - 1) / 8 + 1)
+          dn.encoded = RdfLiteral(s"v$i", RdfLiteral.LiteralKind.Datatype(dn.lookupPointer1))
+
+        for i <- 1 to 4 do
+          val dn = encoder.encodeDtLiteral(
+            Mrl.DtLiteral(s"v$i", Mrl.Datatype(s"dt$i")),
+            buffer,
+            () => s"dt$i"
+          )
+          dn.encoded should be (null)
+          dn.lookupPointer1 should be (i + 4)
+          dn.lookupSerial1 should be (2)
+      }
+    }
+
+    "encoding IRIs" should {
+      "add a full IRI" in {
+        val (encoder, buffer) = getEncoder()
+        val iri = encoder.encodeIri("https://test.org/Cake", buffer).asInstanceOf[RdfIri]
+        iri.nameId should be (0)
+        iri.prefixId should be (1)
+
+        buffer.size should be (2)
+        buffer should contain (RdfStreamRow(RdfStreamRow.Row.Prefix(
+          RdfPrefixEntry(id = 0, value = "https://test.org/")
+        )))
+        buffer should contain (RdfStreamRow(RdfStreamRow.Row.Name(
+          RdfNameEntry(id = 0, value = "Cake")
+        )))
+      }
+
+      "add a prefix-only IRI" in {
+        val (encoder, buffer) = getEncoder()
+        val iri = encoder.encodeIri("https://test.org/test/", buffer).asInstanceOf[RdfIri]
+        iri.nameId should be (0)
+        iri.prefixId should be (1)
+
+        // an empty name entry still has to be allocated
+        buffer.size should be (2)
+        buffer should contain (RdfStreamRow(RdfStreamRow.Row.Prefix(
+          RdfPrefixEntry(id = 0, value = "https://test.org/test/")
+        )))
+        buffer should contain(RdfStreamRow(RdfStreamRow.Row.Name(
+          RdfNameEntry(id = 0, value = "")
+        )))
+      }
+
+      "add a name-only IRI" in {
+        val (encoder, buffer) = getEncoder()
+        val iri = encoder.encodeIri("testTestTest", buffer).asInstanceOf[RdfIri]
+        iri.nameId should be (0)
+        iri.prefixId should be (1)
+
+        // in the mode with the prefix table enabled, an empty prefix entry still has to be allocated
+        buffer.size should be (2)
+        buffer should contain(RdfStreamRow(RdfStreamRow.Row.Prefix(
+          RdfPrefixEntry(id = 0, value = "")
+        )))
+        buffer should contain (RdfStreamRow(RdfStreamRow.Row.Name(
+          RdfNameEntry(id = 0, value = "testTestTest")
+        )))
+      }
+
+      "add a full IRI in no-prefix table mode" in {
+        val (encoder, buffer) = getEncoder(0)
+        val iri = encoder.encodeIri("https://test.org/Cake", buffer).asInstanceOf[RdfIri]
+        iri.nameId should be (0)
+        iri.prefixId should be (0)
+
+        // in the no prefix mode, there must be no prefix entries
+        buffer.size should be (1)
+        buffer should contain (RdfStreamRow(RdfStreamRow.Row.Name(
+          RdfNameEntry(id = 0, value = "https://test.org/Cake")
+        )))
+      }
+
+      "add IRIs while evicting old ones" in {
+        val (encoder, buffer) = getEncoder(3)
+        val data = Seq(
+          // IRI, expected prefix ID, expected name ID
+          ("https://test.org/Cake1", 1, 0),
+          ("https://test.org/Cake1", 0, 1),
+          ("https://test.org/Cake1", 0, 1),
+          ("https://test.org#Cake1", 2, 1),
+          ("https://test.org/test/Cake1", 3, 1),
+          ("https://test.org/Cake2", 1, 0),
+          ("https://test.org#Cake2", 2, 2),
+          ("https://test.org/other/Cake1", 3, 1),
+          ("https://test.org/other/Cake2", 0, 0),
+          ("https://test.org/other/Cake3", 0, 0),
+          ("https://test.org/other/Cake4", 0, 0),
+          ("https://test.org/other/Cake1", 0, 1),
+          ("https://test.org/other/Cake2", 0, 0),
+          ("https://test.org/other/Cake3", 0, 0),
+          ("https://test.org/other/Cake4", 0, 0),
+          ("https://test.org/other/Cake5", 0, 1),
+          ("https://test.org/other/Cake5", 0, 1),
+          ("https://test.org#Cake2", 2, 0),
+          ("https://test.org#Cake5", 0, 1),
+          // prefix "" evicts the previous number #1
+          ("Cake2", 1, 0),
+        )
+
+        for (sIri, ePrefix, eName) <- data do
+          val iri = encoder.encodeIri(sIri, buffer).asInstanceOf[RdfIri]
+          iri.prefixId should be (ePrefix)
+          iri.nameId should be (eName)
+
+        val expectedBuffer = Seq(
+          // Prefix? (name otherwise), ID, value
+          (true, 0, "https://test.org/"),
+          (false, 0, "Cake1"),
+          (true, 0, "https://test.org#"),
+          (true, 0, "https://test.org/test/"),
+          (false, 0, "Cake2"),
+          (true, 3, "https://test.org/other/"),
+          (false, 0, "Cake3"),
+          (false, 0, "Cake4"),
+          (false, 1, "Cake5"),
+          (true, 1, ""),
+        )
+
+        buffer.size should be (expectedBuffer.size)
+        for ((isPrefix, eId, eVal), row) <- expectedBuffer.zip(buffer) do
+          if isPrefix then
+            row.row.isPrefix should be (true)
+            val prefix = row.row.prefix
+            prefix.id should be (eId)
+            prefix.value should be (eVal)
+          else
+            row.row.isName should be (true)
+            val name = row.row.name
+            name.id should be (eId)
+            name.value should be (eVal)
+      }
+    }
+  }