diff --git a/core/src/main/java/eu/ostrzyciel/jelly/core/EncoderLookup.java b/core/src/main/java/eu/ostrzyciel/jelly/core/EncoderLookup.java index 1cb28ee..7aee3d5 100644 --- a/core/src/main/java/eu/ostrzyciel/jelly/core/EncoderLookup.java +++ b/core/src/main/java/eu/ostrzyciel/jelly/core/EncoderLookup.java @@ -18,13 +18,6 @@ static final class LookupEntry { public int setId; /** Whether this entry is a new entry. */ public boolean newEntry; - /** - * The serial number of the entry, incremented each time the entry is replaced in the table. - * This could theoretically overflow and cause bogus cache hits, but it's enormously - * unlikely to happen in practice. I can buy a beer for anyone who can construct an RDF dataset that - * causes this to happen. - */ - public int serial = 1; public LookupEntry(int getId, int setId) { this.getId = getId; @@ -40,13 +33,23 @@ public LookupEntry(int getId, int setId, boolean newEntry) { /** The lookup hash map */ private final HashMap map = new HashMap<>(); + /** * The doubly-linked list of entries, with 1-based indexing. - * Each entry is represented by three integers: left, right, and serial. + * Each entry is represented by two integers: left and right. * The head pointer is in table[1]. - * The first valid entry is in table[3] – table[5]. + * The first valid entry is in table[2] – table[3]. + */ + private final int[] table; + + /** + * The serial numbers of the entries, incremented each time the entry is replaced in the table. + * This could theoretically overflow and cause bogus cache hits, but it's enormously + * unlikely to happen in practice. I can buy a beer for anyone who can construct an RDF dataset that + * causes this to happen. */ - final int[] table; + final int[] serials; + // Tail pointer for the table. private int tail; // Maximum size of the lookup. @@ -58,16 +61,24 @@ public LookupEntry(int getId, int setId, boolean newEntry) { private int lastSetId; // Names of the entries. Entry 0 is always null. 
private final String[] names; + // Whether to use serials for the entries. + private final boolean useSerials; private final LookupEntry entryForReturns = new LookupEntry(0, 0, true); - public EncoderLookup(int size) { + public EncoderLookup(int size, boolean useSerials) { this.size = size; - table = new int[(size + 1) * 3]; - // Set the head's serial to non-zero value, so that default-initialized DependentNodes are not - // accidentally considered as valid entries. - table[2] = -1; + table = new int[(size + 1) * 2]; names = new String[size + 1]; + this.useSerials = useSerials; + if (useSerials) { + serials = new int[size + 1]; + // Set the head's serial to non-zero value, so that default-initialized DependentNodes are not + // accidentally considered as valid entries. + serials[0] = -1; + } else { + serials = null; + } } /** @@ -76,18 +87,18 @@ public EncoderLookup(int size) { * @param id The ID of the entry that was accessed. */ public void onAccess(int id) { - int base = id * 3; + int base = id * 2; if (base == tail) { return; } int left = table[base]; int right = table[base + 1]; + // Set our left to the tail + table[base] = tail; // Set left's right to our right table[left + 1] = right; // Set right's left to our left table[right] = left; - // Set our left to the tail - table[base] = tail; // Set the tail's right to us table[tail + 1] = base; // Update the tail @@ -99,7 +110,7 @@ public void onAccess(int id) { * @param key The key of the entry. * @return The entry. */ - public LookupEntry addEntry(String key) { + public LookupEntry getOrAddEntry(String key) { var value = map.get(key); if (value != null) { // The entry is already in the table, just update the access order @@ -111,13 +122,11 @@ public LookupEntry addEntry(String key) { if (used < size) { // We still have space in the table, add a new entry to the end of the table. 
id = ++used; - int base = id * 3; + int base = id * 2; // Set the left to the tail table[base] = tail; // Right is already 0 // table[base + 1] = 0; - // Serial is zero, set it to 0+1 = 1 - table[base + 2] = 1; // Set the tail's right to us table[tail + 1] = base; tail = base; @@ -130,22 +139,22 @@ public LookupEntry addEntry(String key) { } else { // The table is full, evict the least recently used entry. int base = table[1]; - id = base / 3; + id = base / 2; // Remove the entry from the map LookupEntry oldEntry = map.remove(names[id]); - oldEntry.getId = id; - oldEntry.setId = id; - int serial = table[base + 2] + 1; - oldEntry.serial = serial; - table[base + 2] = serial; // Insert the new entry names[id] = key; map.put(key, oldEntry); // Update the table onAccess(id); - entryForReturns.serial = serial; entryForReturns.setId = lastSetId + 1 == id ? 0 : id; } + if (this.useSerials) { + // Increment the serial number + // We save some memory accesses by not doing this if the serials are not used. + // The if should be very predictable and have no negative performance impact. 
+ ++serials[id]; + } entryForReturns.getId = id; lastSetId = id; return entryForReturns; diff --git a/core/src/main/java/eu/ostrzyciel/jelly/core/NodeEncoder.java b/core/src/main/java/eu/ostrzyciel/jelly/core/NodeEncoder.java index 12d6a9e..dfd486e 100644 --- a/core/src/main/java/eu/ostrzyciel/jelly/core/NodeEncoder.java +++ b/core/src/main/java/eu/ostrzyciel/jelly/core/NodeEncoder.java @@ -58,7 +58,7 @@ protected boolean removeEldestEntry(java.util.Map.Entry eldest) { private int lastIriPrefixId = -1000; private final EncoderLookup datatypeLookup; - private EncoderLookup prefixLookup; + private final EncoderLookup prefixLookup; private final EncoderLookup nameLookup; // We split the node caches in three – the first two are for nodes that depend on the lookups @@ -80,21 +80,21 @@ protected boolean removeEldestEntry(java.util.Map.Entry eldest) { * @param dtLiteralNodeCacheSize The size of the datatype literal dependent node cache */ public NodeEncoder(RdfStreamOptions opt, int nodeCacheSize, int iriNodeCacheSize, int dtLiteralNodeCacheSize) { - datatypeLookup = new EncoderLookup(opt.maxDatatypeTableSize()); + datatypeLookup = new EncoderLookup(opt.maxDatatypeTableSize(), true); this.maxPrefixTableSize = opt.maxPrefixTableSize(); if (maxPrefixTableSize > 0) { - prefixLookup = new EncoderLookup(maxPrefixTableSize); + prefixLookup = new EncoderLookup(maxPrefixTableSize, true); iriNodeCache = new NodeCache<>(iriNodeCacheSize); - nameOnlyIris = null; } else { + prefixLookup = null; iriNodeCache = null; - nameOnlyIris = new RdfIri[opt.maxNameTableSize() + 1]; - for (int i = 0; i < nameOnlyIris.length; i++) { - nameOnlyIris[i] = new RdfIri(0, i); - } + } + nameOnlyIris = new RdfIri[opt.maxNameTableSize() + 1]; + for (int i = 0; i < nameOnlyIris.length; i++) { + nameOnlyIris[i] = new RdfIri(0, i); } dtLiteralNodeCache = new NodeCache<>(dtLiteralNodeCacheSize); - nameLookup = new EncoderLookup(opt.maxNameTableSize()); + nameLookup = new EncoderLookup(opt.maxNameTableSize(), 
maxPrefixTableSize > 0); nodeCache = new NodeCache<>(nodeCacheSize); } @@ -112,23 +112,24 @@ public UniversalTerm encodeDtLiteral( var cachedNode = dtLiteralNodeCache.computeIfAbsent(key, k -> new DependentNode()); // Check if the value is still valid if (cachedNode.encoded != null && - cachedNode.lookupSerial1 == datatypeLookup.table[cachedNode.lookupPointer1 * 3 + 2] + cachedNode.lookupSerial1 == datatypeLookup.serials[cachedNode.lookupPointer1] ) { datatypeLookup.onAccess(cachedNode.lookupPointer1); return cachedNode.encoded; } // The node is not encoded, but we may already have the datatype encoded - var dtEntry = datatypeLookup.addEntry(datatypeName); + var dtEntry = datatypeLookup.getOrAddEntry(datatypeName); if (dtEntry.newEntry) { rowsBuffer.append(new RdfStreamRow( new RdfDatatypeEntry(dtEntry.setId, datatypeName) )); } - cachedNode.lookupPointer1 = dtEntry.getId; - cachedNode.lookupSerial1 = dtEntry.serial; + int dtId = dtEntry.getId; + cachedNode.lookupPointer1 = dtId; + cachedNode.lookupSerial1 = datatypeLookup.serials[dtId]; cachedNode.encoded = new RdfLiteral( - lex, new RdfLiteral$LiteralKind$Datatype(dtEntry.getId) + lex, new RdfLiteral$LiteralKind$Datatype(dtId) ); return cachedNode.encoded; @@ -143,7 +144,7 @@ public UniversalTerm encodeDtLiteral( public UniversalTerm encodeIri(String iri, ArrayBuffer rowsBuffer) { if (maxPrefixTableSize == 0) { // Fast path for no prefixes - var nameEntry = nameLookup.addEntry(iri); + var nameEntry = nameLookup.getOrAddEntry(iri); if (nameEntry.newEntry) { rowsBuffer.append(new RdfStreamRow( new RdfNameEntry(nameEntry.setId, iri) @@ -162,8 +163,8 @@ public UniversalTerm encodeIri(String iri, ArrayBuffer rowsBuffer) var cachedNode = iriNodeCache.computeIfAbsent(iri, k -> new DependentNode()); // Check if the value is still valid if (cachedNode.encoded != null && - cachedNode.lookupSerial1 == nameLookup.table[cachedNode.lookupPointer1 * 3 + 2] && - cachedNode.lookupSerial2 == 
prefixLookup.table[cachedNode.lookupPointer2 * 3 + 2] + cachedNode.lookupSerial1 == nameLookup.serials[cachedNode.lookupPointer1] && + cachedNode.lookupSerial2 == prefixLookup.serials[cachedNode.lookupPointer2] ) { nameLookup.onAccess(cachedNode.lookupPointer1); prefixLookup.onAccess(cachedNode.lookupPointer2); @@ -187,8 +188,8 @@ public UniversalTerm encodeIri(String iri, ArrayBuffer rowsBuffer) postfix = iri.substring(i + 1); } - var prefixEntry = prefixLookup.addEntry(prefix); - var nameEntry = nameLookup.addEntry(postfix); + var prefixEntry = prefixLookup.getOrAddEntry(prefix); + var nameEntry = nameLookup.getOrAddEntry(postfix); if (prefixEntry.newEntry) { rowsBuffer.append(new RdfStreamRow( new RdfPrefixEntry(prefixEntry.setId, prefix) @@ -199,11 +200,13 @@ public UniversalTerm encodeIri(String iri, ArrayBuffer rowsBuffer) new RdfNameEntry(nameEntry.setId, postfix) )); } - cachedNode.lookupPointer1 = nameEntry.getId; - cachedNode.lookupSerial1 = nameEntry.serial; - cachedNode.lookupPointer2 = prefixEntry.getId; - cachedNode.lookupSerial2 = prefixEntry.serial; - cachedNode.encoded = new RdfIri(prefixEntry.getId, nameEntry.getId); + int nameId = nameEntry.getId; + int prefixId = prefixEntry.getId; + cachedNode.lookupPointer1 = nameId; + cachedNode.lookupSerial1 = nameLookup.serials[nameId]; + cachedNode.lookupPointer2 = prefixId; + cachedNode.lookupSerial2 = prefixLookup.serials[prefixId]; + cachedNode.encoded = new RdfIri(prefixId, nameId); return outputIri(cachedNode); } @@ -213,21 +216,23 @@ public UniversalTerm encodeIri(String iri, ArrayBuffer rowsBuffer) * @return The encoded IRI */ private UniversalTerm outputIri(DependentNode cachedNode) { - if (lastIriPrefixId == cachedNode.lookupPointer2) { - if (lastIriNameId + 1 == cachedNode.lookupPointer1) { - lastIriNameId = cachedNode.lookupPointer1; + int nameId = cachedNode.lookupPointer1; + int prefixId = cachedNode.lookupPointer2; + if (lastIriPrefixId == prefixId) { + if (lastIriNameId + 1 == nameId) { + 
lastIriNameId = nameId; return zeroIri; } else { - lastIriNameId = cachedNode.lookupPointer1; - return new RdfIri(0, cachedNode.lookupPointer1); + lastIriNameId = nameId; + return nameOnlyIris[nameId]; } } else { - lastIriPrefixId = cachedNode.lookupPointer2; - if (lastIriNameId + 1 == cachedNode.lookupPointer1) { - lastIriNameId = cachedNode.lookupPointer1; - return new RdfIri(cachedNode.lookupPointer2, 0); + lastIriPrefixId = prefixId; + if (lastIriNameId + 1 == nameId) { + lastIriNameId = nameId; + return new RdfIri(prefixId, 0); } else { - lastIriNameId = cachedNode.lookupPointer1; + lastIriNameId = nameId; return cachedNode.encoded; } } diff --git a/core/src/test/scala/eu/ostrzyciel/jelly/core/EncoderLookupSpec.scala b/core/src/test/scala/eu/ostrzyciel/jelly/core/EncoderLookupSpec.scala index 98c9ac9..be5df1d 100644 --- a/core/src/test/scala/eu/ostrzyciel/jelly/core/EncoderLookupSpec.scala +++ b/core/src/test/scala/eu/ostrzyciel/jelly/core/EncoderLookupSpec.scala @@ -11,118 +11,126 @@ class EncoderLookupSpec extends AnyWordSpec, Matchers: "encoder lookup" should { "add new entries up to capacity" in { - val lookup = EncoderLookup(4) + val lookup = EncoderLookup(4, true) for i <- 1 to 4 do - val v = lookup.addEntry(s"v$i") + val v = lookup.getOrAddEntry(s"v$i") v.getId should be (i) v.setId should be (0) v.newEntry should be (true) - v.serial should be (1) + lookup.serials(v.getId) should be (1) } "retrieve entries" in { - val lookup = EncoderLookup(4) + val lookup = EncoderLookup(4, true) for i <- 1 to 4 do - lookup.addEntry(s"v$i") + lookup.getOrAddEntry(s"v$i") for i <- 1 to 4 do - val v = lookup.addEntry(s"v$i") + val v = lookup.getOrAddEntry(s"v$i") v.getId should be (i) v.setId should be (i) v.newEntry should be (false) - v.serial should be (1) + lookup.serials(v.getId) should be (1) } "retrieve entries many times, in random order" in { - val lookup = EncoderLookup(50) + val lookup = EncoderLookup(50, true) for i <- 1 to 50 do - lookup.addEntry(s"v$i") + 
lookup.getOrAddEntry(s"v$i") for _ <- 1 to 20 do for i <- Random.shuffle(1 to 50) do - val v = lookup.addEntry(s"v$i") + val v = lookup.getOrAddEntry(s"v$i") v.getId should be (i) v.setId should be (i) v.newEntry should be (false) - v.serial should be (1) + lookup.serials(v.getId) should be (1) } "overwrite existing entries, from oldest to newest" in { - val lookup = EncoderLookup(4) + val lookup = EncoderLookup(4, true) for i <- 1 to 4 do - lookup.addEntry(s"v$i") + lookup.getOrAddEntry(s"v$i") - val v = lookup.addEntry("v5") + val v = lookup.getOrAddEntry("v5") v.getId should be (1) v.setId should be (1) v.newEntry should be (true) - v.serial should be (2) + lookup.serials(v.getId) should be (2) for i <- 6 to 8 do - val v = lookup.addEntry(s"v$i") + val v = lookup.getOrAddEntry(s"v$i") v.getId should be (i - 4) v.setId should be (0) v.newEntry should be (true) - v.serial should be (2) + lookup.serials(v.getId) should be (2) } "overwrite existing entries in order, many times" in { - val lookup = EncoderLookup(17) + val lookup = EncoderLookup(17, true) for i <- 1 to 17 do - lookup.addEntry(s"v$i") + lookup.getOrAddEntry(s"v$i") for k <- 2 to 23 do - val v = lookup.addEntry(s"v1 $k") + val v = lookup.getOrAddEntry(s"v1 $k") v.getId should be (1) v.setId should be (1) v.newEntry should be (true) - v.serial should be (k) + lookup.serials(v.getId) should be (k) for i <- 2 to 17 do - val v = lookup.addEntry(s"v$i $k") + val v = lookup.getOrAddEntry(s"v$i $k") v.getId should be (i) v.setId should be (0) v.newEntry should be (true) - v.serial should be (k) + lookup.serials(v.getId) should be (k) } "pass random stress test (1)" in { - val lookup = EncoderLookup(100) + val lookup = EncoderLookup(100, true) val frequentSet = (1 to 10).map(i => s"v$i") - frequentSet.foreach(lookup.addEntry) + frequentSet.foreach(lookup.getOrAddEntry) for i <- 1 to 50 do for fIndex <- 1 to 10 do - val v = lookup.addEntry(frequentSet(fIndex - 1)) + val v = 
lookup.getOrAddEntry(frequentSet(fIndex - 1)) v.getId should be (fIndex) v.setId should be (fIndex) v.newEntry should be (false) - v.serial should be (1) + lookup.serials(v.getId) should be (1) for _ <- 1 to 80 do - val v = lookup.addEntry(s"r${Random.nextInt(200) + 1}") + val v = lookup.getOrAddEntry(s"r${Random.nextInt(200) + 1}") v.getId should be > 10 if v.setId != 0 then v.setId should be > 10 } "pass random stress test (2)" in { - val lookup = EncoderLookup(113) + val lookup = EncoderLookup(113, true) for i <- 1 to 20 do - lookup.addEntry(s"v$i") + lookup.getOrAddEntry(s"v$i") for _ <- 1 to 1000 do val id = Random.nextInt(20) + 1 - val v = lookup.addEntry(s"v$id") + val v = lookup.getOrAddEntry(s"v$id") v.getId should be (id) if v.setId != 0 then v.setId should be (id) v.newEntry should be (false) else v.newEntry should be (true) - v.serial should be (1) + lookup.serials(v.getId) should be (1) } "pass random stress test (3)" in { - val lookup = EncoderLookup(1023) + val lookup = EncoderLookup(1023, true) for _ <- 1 to 100_000 do - val v = lookup.addEntry(s"v${Random.nextInt(10_000) + 1}") + val v = lookup.getOrAddEntry(s"v${Random.nextInt(10_000) + 1}") v.getId should be > 0 } + + "not use the serials table if not needed" in { + val lookup = EncoderLookup(16, false) + for _ <- 1 to 2000 do + val v = lookup.getOrAddEntry(s"v${Random.nextInt(1000) + 1}") + v.getId should be > 0 + lookup.serials should be (null) + } } diff --git a/core/src/test/scala/eu/ostrzyciel/jelly/core/NodeEncoderSpec.scala b/core/src/test/scala/eu/ostrzyciel/jelly/core/NodeEncoderSpec.scala index 3d9ebad..ab4d033 100644 --- a/core/src/test/scala/eu/ostrzyciel/jelly/core/NodeEncoderSpec.scala +++ b/core/src/test/scala/eu/ostrzyciel/jelly/core/NodeEncoderSpec.scala @@ -72,6 +72,33 @@ class NodeEncoderSpec extends AnyWordSpec, Inspectors, Matchers: ) } + "not evict datatype IRIs used recently" in { + val (encoder, buffer) = getEncoder() + for i <- 1 to 8 do + val node = 
encoder.encodeDtLiteral( + Mrl.DtLiteral(s"v$i", Mrl.Datatype(s"dt$i")), + s"v$i", s"dt$i", buffer, + ) + node.literal.lex should be(s"v$i") + node.literal.literalKind.datatype should be(i) + + // use literal 1 again + val node = encoder.encodeDtLiteral( + Mrl.DtLiteral("v1", Mrl.Datatype("dt1")), + "v1", "dt1", buffer, + ) + node.literal.lex should be("v1") + node.literal.literalKind.datatype should be(1) + + // now add a new DT and see which DT is evicted + val node2 = encoder.encodeDtLiteral( + Mrl.DtLiteral("v9", Mrl.Datatype("dt9")), + "v9", "dt9", buffer, + ) + node2.literal.lex should be("v9") + node2.literal.literalKind.datatype should be(2) + } + "encode datatype literals while evicting old datatypes" in { val (encoder, buffer) = getEncoder() for i <- 1 to 12 do @@ -305,6 +332,71 @@ class NodeEncoderSpec extends AnyWordSpec, Inspectors, Matchers: name.value should be (eVal) } + "add IRIs while evicting old ones (2: detecting invalidated prefix entries)" in { + val (encoder, buffer) = getEncoder(3) + val data = Seq( + // IRI, expected prefix ID, expected name ID + ("https://test.org/1/Cake1", 1, 0), + ("https://test.org/2/Cake1", 2, 1), + ("https://test.org/3/Cake1", 3, 1), + ("https://test.org/3/Cake2", 0, 0), + // Evict the /1/ prefix + ("https://test.org/4/Cake2", 1, 2), + // Try to get the first IRI + ("https://test.org/1/Cake1", 2, 1), + ) + + for (sIri, ePrefix, eName) <- data do + val iri = encoder.encodeIri(sIri, buffer).asInstanceOf[RdfIri] + iri.prefixId should be(ePrefix) + iri.nameId should be(eName) + + val expectedBuffer = Seq( + // Prefix? 
(name otherwise), ID, value + (true, 0, "https://test.org/1/"), + (false, 0, "Cake1"), + (true, 0, "https://test.org/2/"), + (true, 0, "https://test.org/3/"), + (false, 0, "Cake2"), + (true, 1, "https://test.org/4/"), + (true, 0, "https://test.org/1/"), + ) + + buffer.size should be(expectedBuffer.size) + for ((isPrefix, eId, eVal), row) <- expectedBuffer.zip(buffer) do + if isPrefix then + row.row.isPrefix should be (true) + val prefix = row.row.prefix + prefix.id should be(eId) + prefix.value should be(eVal) + else + row.row.isName should be (true) + val name = row.row.name + name.id should be(eId) + name.value should be(eVal) + } + + "not evict IRI prefixes used recently" in { + val (encoder, buffer) = getEncoder(3) + val data = Seq( + // IRI, expected prefix ID, expected name ID + ("https://test.org/1/Cake1", 1, 0), + ("https://test.org/2/Cake2", 2, 0), + ("https://test.org/3/Cake3", 3, 0), + ("https://test.org/3/Cake3", 0, 3), + ("https://test.org/2/Cake2", 2, 2), + ("https://test.org/1/Cake1", 1, 1), + // Evict something -- this must not be /1/ because it was used last + // this tests if .onAccess() is called correctly + ("https://test.org/4/Cake4", 3, 4), + ) + + for (sIri, ePrefix, eName) <- data do + val iri = encoder.encodeIri(sIri, buffer).asInstanceOf[RdfIri] + iri.prefixId should be(ePrefix) + iri.nameId should be(eName) + } + "add IRIs while evicting old ones, without a prefix table" in { val (encoder, buffer) = getEncoder(0) val data = Seq(