Skip to content

Commit

Permalink
Make a few small improvements in NodeEncoder
Browse files Browse the repository at this point in the history
Backport some tests from #206

Do a few small optimizations in the NodeEncoder and the EncoderLookup.
  • Loading branch information
Ostrzyciel committed Nov 4, 2024
1 parent 3944fce commit cfb686d
Show file tree
Hide file tree
Showing 4 changed files with 211 additions and 97 deletions.
67 changes: 38 additions & 29 deletions core/src/main/java/eu/ostrzyciel/jelly/core/EncoderLookup.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,6 @@ static final class LookupEntry {
public int setId;
/** Whether this entry is a new entry. */
public boolean newEntry;
/**
* The serial number of the entry, incremented each time the entry is replaced in the table.
* This could theoretically overflow and cause bogus cache hits, but it's enormously
* unlikely to happen in practice. I can buy a beer for anyone who can construct an RDF dataset that
* causes this to happen.
*/
public int serial = 1;

public LookupEntry(int getId, int setId) {
this.getId = getId;
Expand All @@ -40,13 +33,23 @@ public LookupEntry(int getId, int setId, boolean newEntry) {

/** The lookup hash map */
private final HashMap<String, LookupEntry> map = new HashMap<>();

/**
* The doubly-linked list of entries, with 1-based indexing.
* Each entry is represented by three integers: left, right, and serial.
* Each entry is represented by two integers: left and right.
* The head pointer is in table[1].
* The first valid entry is in table[3] – table[5].
* The first valid entry (ID 1) is in table[2] – table[3].
*/
private final int[] table;

/**
* The serial numbers of the entries, incremented each time the entry is replaced in the table.
* This could theoretically overflow and cause bogus cache hits, but it's enormously
* unlikely to happen in practice. I can buy a beer for anyone who can construct an RDF dataset that
* causes this to happen.
*/
final int[] table;
final int[] serials;

// Tail pointer for the table.
private int tail;
// Maximum size of the lookup.
Expand All @@ -58,16 +61,24 @@ public LookupEntry(int getId, int setId, boolean newEntry) {
private int lastSetId;
// Names of the entries. Entry 0 is always null.
private final String[] names;
// Whether to use serials for the entries.
private final boolean useSerials;

private final LookupEntry entryForReturns = new LookupEntry(0, 0, true);

public EncoderLookup(int size) {
public EncoderLookup(int size, boolean useSerials) {
this.size = size;
table = new int[(size + 1) * 3];
// Set the head's serial to a non-zero value, so that default-initialized DependentNodes are not
// accidentally considered valid entries.
table[2] = -1;
table = new int[(size + 1) * 2];
names = new String[size + 1];
this.useSerials = useSerials;
if (useSerials) {
serials = new int[size + 1];
// Set the head's serial to a non-zero value, so that default-initialized DependentNodes are not
// accidentally considered valid entries.
serials[0] = -1;
} else {
serials = null;
}
}

/**
Expand All @@ -76,18 +87,18 @@ public EncoderLookup(int size) {
* @param id The ID of the entry that was accessed.
*/
public void onAccess(int id) {
int base = id * 3;
int base = id * 2;
if (base == tail) {
return;
}
int left = table[base];
int right = table[base + 1];
// Set our left to the tail
table[base] = tail;
// Set left's right to our right
table[left + 1] = right;
// Set right's left to our left
table[right] = left;
// Set our left to the tail
table[base] = tail;
// Set the tail's right to us
table[tail + 1] = base;
// Update the tail
Expand All @@ -99,7 +110,7 @@ public void onAccess(int id) {
* @param key The key of the entry.
* @return The entry.
*/
public LookupEntry addEntry(String key) {
public LookupEntry getOrAddEntry(String key) {
var value = map.get(key);
if (value != null) {
// The entry is already in the table, just update the access order
Expand All @@ -111,13 +122,11 @@ public LookupEntry addEntry(String key) {
if (used < size) {
// We still have space in the table, add a new entry to the end of the table.
id = ++used;
int base = id * 3;
int base = id * 2;
// Set the left to the tail
table[base] = tail;
// Right is already 0
// table[base + 1] = 0;
// Serial is zero, set it to 0+1 = 1
table[base + 2] = 1;
// Set the tail's right to us
table[tail + 1] = base;
tail = base;
Expand All @@ -130,22 +139,22 @@ public LookupEntry addEntry(String key) {
} else {
// The table is full, evict the least recently used entry.
int base = table[1];
id = base / 3;
id = base / 2;
// Remove the entry from the map
LookupEntry oldEntry = map.remove(names[id]);
oldEntry.getId = id;
oldEntry.setId = id;
int serial = table[base + 2] + 1;
oldEntry.serial = serial;
table[base + 2] = serial;
// Insert the new entry
names[id] = key;
map.put(key, oldEntry);
// Update the table
onAccess(id);
entryForReturns.serial = serial;
entryForReturns.setId = lastSetId + 1 == id ? 0 : id;
}
if (this.useSerials) {
// Increment the serial number
// We save some memory accesses by skipping this when the serials are not used.
// The branch should be highly predictable, so it should have no negative performance impact.
++serials[id];
}
entryForReturns.getId = id;
lastSetId = id;
return entryForReturns;
Expand Down
73 changes: 39 additions & 34 deletions core/src/main/java/eu/ostrzyciel/jelly/core/NodeEncoder.java
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ protected boolean removeEldestEntry(java.util.Map.Entry<K, V> eldest) {
private int lastIriPrefixId = -1000;

private final EncoderLookup datatypeLookup;
private EncoderLookup prefixLookup;
private final EncoderLookup prefixLookup;
private final EncoderLookup nameLookup;

// We split the node caches into three – the first two are for nodes that depend on the lookups
Expand All @@ -80,21 +80,21 @@ protected boolean removeEldestEntry(java.util.Map.Entry<K, V> eldest) {
* @param dtLiteralNodeCacheSize The size of the datatype literal dependent node cache
*/
public NodeEncoder(RdfStreamOptions opt, int nodeCacheSize, int iriNodeCacheSize, int dtLiteralNodeCacheSize) {
datatypeLookup = new EncoderLookup(opt.maxDatatypeTableSize());
datatypeLookup = new EncoderLookup(opt.maxDatatypeTableSize(), true);
this.maxPrefixTableSize = opt.maxPrefixTableSize();
if (maxPrefixTableSize > 0) {
prefixLookup = new EncoderLookup(maxPrefixTableSize);
prefixLookup = new EncoderLookup(maxPrefixTableSize, true);
iriNodeCache = new NodeCache<>(iriNodeCacheSize);
nameOnlyIris = null;
} else {
prefixLookup = null;
iriNodeCache = null;
nameOnlyIris = new RdfIri[opt.maxNameTableSize() + 1];
for (int i = 0; i < nameOnlyIris.length; i++) {
nameOnlyIris[i] = new RdfIri(0, i);
}
}
nameOnlyIris = new RdfIri[opt.maxNameTableSize() + 1];
for (int i = 0; i < nameOnlyIris.length; i++) {
nameOnlyIris[i] = new RdfIri(0, i);
}
dtLiteralNodeCache = new NodeCache<>(dtLiteralNodeCacheSize);
nameLookup = new EncoderLookup(opt.maxNameTableSize());
nameLookup = new EncoderLookup(opt.maxNameTableSize(), maxPrefixTableSize > 0);
nodeCache = new NodeCache<>(nodeCacheSize);
}

Expand All @@ -112,23 +112,24 @@ public UniversalTerm encodeDtLiteral(
var cachedNode = dtLiteralNodeCache.computeIfAbsent(key, k -> new DependentNode());
// Check if the value is still valid
if (cachedNode.encoded != null &&
cachedNode.lookupSerial1 == datatypeLookup.table[cachedNode.lookupPointer1 * 3 + 2]
cachedNode.lookupSerial1 == datatypeLookup.serials[cachedNode.lookupPointer1]
) {
datatypeLookup.onAccess(cachedNode.lookupPointer1);
return cachedNode.encoded;
}

// The node is not encoded, but we may already have the datatype encoded
var dtEntry = datatypeLookup.addEntry(datatypeName);
var dtEntry = datatypeLookup.getOrAddEntry(datatypeName);
if (dtEntry.newEntry) {
rowsBuffer.append(new RdfStreamRow(
new RdfDatatypeEntry(dtEntry.setId, datatypeName)
));
}
cachedNode.lookupPointer1 = dtEntry.getId;
cachedNode.lookupSerial1 = dtEntry.serial;
int dtId = dtEntry.getId;
cachedNode.lookupPointer1 = dtId;
cachedNode.lookupSerial1 = datatypeLookup.serials[dtId];
cachedNode.encoded = new RdfLiteral(
lex, new RdfLiteral$LiteralKind$Datatype(dtEntry.getId)
lex, new RdfLiteral$LiteralKind$Datatype(dtId)
);

return cachedNode.encoded;
Expand All @@ -143,7 +144,7 @@ public UniversalTerm encodeDtLiteral(
public UniversalTerm encodeIri(String iri, ArrayBuffer<RdfStreamRow> rowsBuffer) {
if (maxPrefixTableSize == 0) {
// Fast path for no prefixes
var nameEntry = nameLookup.addEntry(iri);
var nameEntry = nameLookup.getOrAddEntry(iri);
if (nameEntry.newEntry) {
rowsBuffer.append(new RdfStreamRow(
new RdfNameEntry(nameEntry.setId, iri)
Expand All @@ -162,8 +163,8 @@ public UniversalTerm encodeIri(String iri, ArrayBuffer<RdfStreamRow> rowsBuffer)
var cachedNode = iriNodeCache.computeIfAbsent(iri, k -> new DependentNode());
// Check if the value is still valid
if (cachedNode.encoded != null &&
cachedNode.lookupSerial1 == nameLookup.table[cachedNode.lookupPointer1 * 3 + 2] &&
cachedNode.lookupSerial2 == prefixLookup.table[cachedNode.lookupPointer2 * 3 + 2]
cachedNode.lookupSerial1 == nameLookup.serials[cachedNode.lookupPointer1] &&
cachedNode.lookupSerial2 == prefixLookup.serials[cachedNode.lookupPointer2]
) {
nameLookup.onAccess(cachedNode.lookupPointer1);
prefixLookup.onAccess(cachedNode.lookupPointer2);
Expand All @@ -187,8 +188,8 @@ public UniversalTerm encodeIri(String iri, ArrayBuffer<RdfStreamRow> rowsBuffer)
postfix = iri.substring(i + 1);
}

var prefixEntry = prefixLookup.addEntry(prefix);
var nameEntry = nameLookup.addEntry(postfix);
var prefixEntry = prefixLookup.getOrAddEntry(prefix);
var nameEntry = nameLookup.getOrAddEntry(postfix);
if (prefixEntry.newEntry) {
rowsBuffer.append(new RdfStreamRow(
new RdfPrefixEntry(prefixEntry.setId, prefix)
Expand All @@ -199,11 +200,13 @@ public UniversalTerm encodeIri(String iri, ArrayBuffer<RdfStreamRow> rowsBuffer)
new RdfNameEntry(nameEntry.setId, postfix)
));
}
cachedNode.lookupPointer1 = nameEntry.getId;
cachedNode.lookupSerial1 = nameEntry.serial;
cachedNode.lookupPointer2 = prefixEntry.getId;
cachedNode.lookupSerial2 = prefixEntry.serial;
cachedNode.encoded = new RdfIri(prefixEntry.getId, nameEntry.getId);
int nameId = nameEntry.getId;
int prefixId = prefixEntry.getId;
cachedNode.lookupPointer1 = nameId;
cachedNode.lookupSerial1 = nameLookup.serials[nameId];
cachedNode.lookupPointer2 = prefixId;
cachedNode.lookupSerial2 = prefixLookup.serials[prefixId];
cachedNode.encoded = new RdfIri(prefixId, nameId);
return outputIri(cachedNode);
}

Expand All @@ -213,21 +216,23 @@ public UniversalTerm encodeIri(String iri, ArrayBuffer<RdfStreamRow> rowsBuffer)
* @return The encoded IRI
*/
private UniversalTerm outputIri(DependentNode cachedNode) {
if (lastIriPrefixId == cachedNode.lookupPointer2) {
if (lastIriNameId + 1 == cachedNode.lookupPointer1) {
lastIriNameId = cachedNode.lookupPointer1;
int nameId = cachedNode.lookupPointer1;
int prefixId = cachedNode.lookupPointer2;
if (lastIriPrefixId == prefixId) {
if (lastIriNameId + 1 == nameId) {
lastIriNameId = nameId;
return zeroIri;
} else {
lastIriNameId = cachedNode.lookupPointer1;
return new RdfIri(0, cachedNode.lookupPointer1);
lastIriNameId = nameId;
return nameOnlyIris[nameId];
}
} else {
lastIriPrefixId = cachedNode.lookupPointer2;
if (lastIriNameId + 1 == cachedNode.lookupPointer1) {
lastIriNameId = cachedNode.lookupPointer1;
return new RdfIri(cachedNode.lookupPointer2, 0);
lastIriPrefixId = prefixId;
if (lastIriNameId + 1 == nameId) {
lastIriNameId = nameId;
return new RdfIri(prefixId, 0);
} else {
lastIriNameId = cachedNode.lookupPointer1;
lastIriNameId = nameId;
return cachedNode.encoded;
}
}
Expand Down
Loading

0 comments on commit cfb686d

Please sign in to comment.