From 2d428d81384d3b5e8349e5ff057372340b833983 Mon Sep 17 00:00:00 2001 From: Ostrzyciel Date: Fri, 8 Nov 2024 20:59:47 +0100 Subject: [PATCH] Revert "Experiment: again try a lighter impl for encoder node cache (#212)" Of course, this doesn't work. I have no idea how the default hashmaps can be so good, but I bow to the ancient masters of Java. I give up. This reverts commit c2bad54c0c756db7de52d091dd16c8ebb3e2d430. --- .../jelly/core/EncoderNodeCache.java | 27 ---------- .../jelly/core/EncoderNodeCacheDependent.java | 47 ----------------- .../jelly/core/EncoderNodeCacheSimple.java | 28 ---------- .../eu/ostrzyciel/jelly/core/NodeEncoder.java | 52 +++++++++++++++---- .../ostrzyciel/jelly/core/ProtoEncoder.scala | 9 ++-- 5 files changed, 47 insertions(+), 116 deletions(-) delete mode 100644 core/src/main/java/eu/ostrzyciel/jelly/core/EncoderNodeCache.java delete mode 100644 core/src/main/java/eu/ostrzyciel/jelly/core/EncoderNodeCacheDependent.java delete mode 100644 core/src/main/java/eu/ostrzyciel/jelly/core/EncoderNodeCacheSimple.java diff --git a/core/src/main/java/eu/ostrzyciel/jelly/core/EncoderNodeCache.java b/core/src/main/java/eu/ostrzyciel/jelly/core/EncoderNodeCache.java deleted file mode 100644 index 68522a0..0000000 --- a/core/src/main/java/eu/ostrzyciel/jelly/core/EncoderNodeCache.java +++ /dev/null @@ -1,27 +0,0 @@ -package eu.ostrzyciel.jelly.core; - -/** - * A terrifyingly simple cache. - */ -abstract class EncoderNodeCache { - - protected final Object[] keys; - - protected final int sizeMinusOne; - - protected EncoderNodeCache(int minimumSize) { - var size = Integer.highestOneBit(minimumSize); - if (size < minimumSize) { - size <<= 1; - } - this.sizeMinusOne = size - 1; - keys = new Object[size]; - } - - protected int calcIndex(Object key) { - int h = key.hashCode(); - // Spread bits to avoid collisions for hashes that differ only in the upper bits. 
- // Trick from HashMap.hash() - return (h ^ h >>> 16) & sizeMinusOne; - } -} diff --git a/core/src/main/java/eu/ostrzyciel/jelly/core/EncoderNodeCacheDependent.java b/core/src/main/java/eu/ostrzyciel/jelly/core/EncoderNodeCacheDependent.java deleted file mode 100644 index 55a5f55..0000000 --- a/core/src/main/java/eu/ostrzyciel/jelly/core/EncoderNodeCacheDependent.java +++ /dev/null @@ -1,47 +0,0 @@ -package eu.ostrzyciel.jelly.core; - -import eu.ostrzyciel.jelly.core.proto.v1.UniversalTerm; - -/** - * A cached node that depends on other lookups (RdfIri and RdfLiteral in the datatype variant). - */ -final class DependentNode { - // The actual cached node - UniversalTerm encoded; - // 1: datatypes and IRI names - // The pointer is the index in the lookup table, the serial is the serial number of the entry. - // The serial in the lookup table must be equal to the serial here for the entry to be valid. - int lookupPointer1; - int lookupSerial1; - // 2: IRI prefixes - int lookupPointer2; - int lookupSerial2; -} - -class EncoderNodeCacheDependent extends EncoderNodeCache { - private final DependentNode[] values; - - EncoderNodeCacheDependent(int minimumSize) { - super(minimumSize); - DependentNode[] x = new DependentNode[sizeMinusOne + 1]; - values = x; - for (int i = 0; i < values.length; i++) { - values[i] = new DependentNode(); - } - } - - DependentNode getOrClearIfAbsent(Object key) { - final int idx = calcIndex(key); - final Object storedKey = keys[idx]; - final DependentNode node = values[idx]; - if (storedKey != null && (storedKey == key || storedKey.equals(key))) { - return node; - } else { - node.encoded = null; - node.lookupPointer1 = 0; - node.lookupPointer2 = 0; - keys[idx] = key; - return node; - } - } -} diff --git a/core/src/main/java/eu/ostrzyciel/jelly/core/EncoderNodeCacheSimple.java b/core/src/main/java/eu/ostrzyciel/jelly/core/EncoderNodeCacheSimple.java deleted file mode 100644 index eacafcf..0000000 --- 
a/core/src/main/java/eu/ostrzyciel/jelly/core/EncoderNodeCacheSimple.java +++ /dev/null @@ -1,28 +0,0 @@ -package eu.ostrzyciel.jelly.core; - -import eu.ostrzyciel.jelly.core.proto.v1.UniversalTerm; - -import java.util.function.Function; - -class EncoderNodeCacheSimple extends EncoderNodeCache { - private final UniversalTerm[] values; - - EncoderNodeCacheSimple(int minimumSize) { - super(minimumSize); - UniversalTerm[] x = new UniversalTerm[sizeMinusOne + 1]; - values = x; - } - - UniversalTerm getOrComputeIfAbsent(Object key, Function f) { - final int idx = calcIndex(key); - final Object storedKey = keys[idx]; - if (storedKey != null && (storedKey == key || storedKey.equals(key))) { - return values[idx]; - } else { - keys[idx] = key; - UniversalTerm newTerm = f.apply(key); - values[idx] = newTerm; - return newTerm; - } - } -} diff --git a/core/src/main/java/eu/ostrzyciel/jelly/core/NodeEncoder.java b/core/src/main/java/eu/ostrzyciel/jelly/core/NodeEncoder.java index 57b9d2e..29e0773 100644 --- a/core/src/main/java/eu/ostrzyciel/jelly/core/NodeEncoder.java +++ b/core/src/main/java/eu/ostrzyciel/jelly/core/NodeEncoder.java @@ -19,6 +19,40 @@ * @param The type of RDF nodes used by the RDF library. */ public final class NodeEncoder { + /** + * A cached node that depends on other lookups (RdfIri and RdfLiteral in the datatype variant). + */ + static final class DependentNode { + // The actual cached node + public UniversalTerm encoded; + // 1: datatypes and IRI names + // The pointer is the index in the lookup table, the serial is the serial number of the entry. + // The serial in the lookup table must be equal to the serial here for the entry to be valid. + public int lookupPointer1; + public int lookupSerial1; + // 2: IRI prefixes + public int lookupPointer2; + public int lookupSerial2; + } + + /** + * A simple LRU cache for already encoded nodes. 
+ * @param <K> Key type + * @param <V> Value type + */ + private static final class NodeCache<K, V> extends LinkedHashMap<K, V> { + private final int maxSize; + + public NodeCache(int maxSize) { + this.maxSize = maxSize; + } + + @Override + protected boolean removeEldestEntry(java.util.Map.Entry<K, V> eldest) { + return size() > maxSize; + } + } + private final int maxPrefixTableSize; private int lastIriNameId; private int lastIriPrefixId = -1000; @@ -29,9 +63,9 @@ public final class NodeEncoder { // We split the node caches in three – the first two are for nodes that depend on the lookups // (IRIs and datatype literals). The third one is for nodes that don't depend on the lookups. - private final EncoderNodeCacheDependent iriNodeCache; - private final EncoderNodeCacheDependent dtLiteralNodeCache; - private final EncoderNodeCacheSimple nodeCache; + private final NodeCache iriNodeCache; + private final NodeCache dtLiteralNodeCache; + private final NodeCache nodeCache; // Pre-allocated IRI that has prefixId=0 and nameId=0 static final RdfIri zeroIri = new RdfIri(0, 0); @@ -50,7 +84,7 @@ public NodeEncoder(RdfStreamOptions opt, int nodeCacheSize, int iriNodeCacheSize this.maxPrefixTableSize = opt.maxPrefixTableSize(); if (maxPrefixTableSize > 0) { prefixLookup = new EncoderLookup(maxPrefixTableSize, true); - iriNodeCache = new EncoderNodeCacheDependent(iriNodeCacheSize); + iriNodeCache = new NodeCache<>(iriNodeCacheSize); } else { prefixLookup = null; iriNodeCache = null; @@ -59,9 +93,9 @@ public NodeEncoder(RdfStreamOptions opt, int nodeCacheSize, int iriNodeCacheSize for (int i = 0; i < nameOnlyIris.length; i++) { nameOnlyIris[i] = new RdfIri(0, i); } - dtLiteralNodeCache = new EncoderNodeCacheDependent(dtLiteralNodeCacheSize); + dtLiteralNodeCache = new NodeCache<>(dtLiteralNodeCacheSize); nameLookup = new EncoderLookup(opt.maxNameTableSize(), maxPrefixTableSize > 0); - nodeCache = new EncoderNodeCacheSimple(nodeCacheSize); + nodeCache = new NodeCache<>(nodeCacheSize); } /** @@ -75,7 +109,7 
@@ public NodeEncoder(RdfStreamOptions opt, int nodeCacheSize, int iriNodeCacheSize public UniversalTerm encodeDtLiteral( TNode key, String lex, String datatypeName, ArrayBuffer rowsBuffer ) { - var cachedNode = dtLiteralNodeCache.getOrClearIfAbsent(key); + var cachedNode = dtLiteralNodeCache.computeIfAbsent(key, k -> new DependentNode()); // Check if the value is still valid if (cachedNode.encoded != null && cachedNode.lookupSerial1 == datatypeLookup.serials[cachedNode.lookupPointer1] @@ -127,7 +161,7 @@ public UniversalTerm encodeIri(String iri, ArrayBuffer rowsBuffer) } // Slow path, with splitting out the prefix - var cachedNode = iriNodeCache.getOrClearIfAbsent(iri); + var cachedNode = iriNodeCache.computeIfAbsent(iri, k -> new DependentNode()); // Check if the value is still valid if (cachedNode.encoded != null && cachedNode.lookupSerial1 == nameLookup.serials[cachedNode.lookupPointer1] && @@ -212,6 +246,6 @@ private UniversalTerm outputIri(DependentNode cachedNode) { * @return The encoded node */ public UniversalTerm encodeOther(Object key, Function encoder) { - return nodeCache.getOrComputeIfAbsent(key, encoder); + return nodeCache.computeIfAbsent(key, encoder); } } diff --git a/core/src/main/scala/eu/ostrzyciel/jelly/core/ProtoEncoder.scala b/core/src/main/scala/eu/ostrzyciel/jelly/core/ProtoEncoder.scala index b9394d5..3669426 100644 --- a/core/src/main/scala/eu/ostrzyciel/jelly/core/ProtoEncoder.scala +++ b/core/src/main/scala/eu/ostrzyciel/jelly/core/ProtoEncoder.scala @@ -144,11 +144,10 @@ abstract class ProtoEncoder[TNode, -TTriple, -TQuad, -TQuoted](val options: RdfS private val extraRowsBuffer = new ArrayBuffer[RdfStreamRow](32) private val nodeEncoder = new NodeEncoder[TNode]( options, - // Make the node cache size between 256 and 2048, depending on the user's maxNameTableSize. - Math.max(Math.min(options.maxNameTableSize, 2048), 512), - // IRI cache can be the largest... 
- Math.max(Math.min((options.maxNameTableSize * 1.5).toInt, 8192), 512), - Math.max(Math.min(options.maxNameTableSize, 2048), 512), + // Make the node cache size between 256 and 1024, depending on the user's maxNameTableSize. + Math.max(Math.min(options.maxNameTableSize, 1024), 256), + options.maxNameTableSize, + Math.max(Math.min(options.maxNameTableSize, 1024), 256), ) private var emittedOptions = false