Skip to content

Commit

Permalink
Add more nightly tasks. (#313)
Browse files Browse the repository at this point in the history
- Filtered (term|disjunctive|conjunctive) queries.
 - Pre-filtered vector search.
 - Post-filtered vector search.
 - (Term|Disjunctive) queries combined across multiple fields using a
   DisjunctionMaxQuery.

I settled on 5% for the filter density, which felt like a nice trade-off as
queries would still match many documents, yet taking the filter into account
should help speed up evaluation in a number of cases (i.e. doing exhaustive
evaluation and post-filtering hits that don't match the query would likely not
be a good approach).
  • Loading branch information
jpountz authored Nov 12, 2024
1 parent e2fb399 commit cd2e383
Show file tree
Hide file tree
Showing 3 changed files with 150 additions and 6 deletions.
63 changes: 58 additions & 5 deletions src/main/perf/TaskParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@ public void close() throws IOException {
}

private final static Pattern filterPattern = Pattern.compile(" \\+filter=([0-9\\.]+)%");
private final static Pattern preFilterPattern = Pattern.compile(" \\+preFilter=([0-9\\.]+)%");
private final static Pattern countOnlyPattern = Pattern.compile("count\\((.*?)\\)");
private final static Pattern minShouldMatchPattern = Pattern.compile(" \\+minShouldMatch=(\\d+)($| )");
// pattern: taskName term1 term2 term3 term4 +combinedFields=field1^1.0,field2,field3^2.0
Expand Down Expand Up @@ -211,6 +212,7 @@ class TaskBuilder {

List<String> facets;
List<FieldAndWeight> combinedFields;
List<String> dismaxFields;
String text;
boolean doDrillSideways, doHilite, doStoredLoadsTask;
Sort sort;
Expand Down Expand Up @@ -255,7 +257,8 @@ SearchTask buildSearchTask(String input) throws ParseException, IOException {
text = taskAndType[1];
int msm = parseMinShouldMatch();
combinedFields = parseCombinedFields();
Query query = buildQuery(taskType, text, msm, combinedFields);
dismaxFields = parseDismaxFields();
Query query = buildQuery(taskType, text, msm);
Query query2 = applyDrillDowns(query, drillDowns);
Query query3 = applyFilter(query2, filter);
return new SearchTask(category, isCountOnly, query3, sort, group, topN, doHilite, doStoredLoadsTask, facets, null, doDrillSideways);
Expand Down Expand Up @@ -322,6 +325,18 @@ Query parseFilter() {
return null;
}

/**
 * Extracts an optional pre-filter spec (eg: " +preFilter=0.5%") from the task text.
 * Pre-filters are only relevant to vector search.
 *
 * <p>When a spec is present it is removed from {@code text} (which is then trimmed) and a
 * {@code RandomQuery} built from the parsed percentage is returned.
 *
 * @return the pre-filter query, or {@code null} when the task text carries no pre-filter spec
 */
Query parsePreFilter() {
  final Matcher matcher = preFilterPattern.matcher(text);
  if (!matcher.find()) {
    return null;
  }

  // Parse first: if the number is malformed we throw before touching the task text.
  final double filterPct = Double.parseDouble(matcher.group(1));

  // Splice the matched spec out of the task text, keeping what comes before and after it.
  final String beforeSpec = text.substring(0, matcher.start(0));
  final String afterSpec = text.substring(matcher.end(0));
  text = (beforeSpec + afterSpec).trim();

  return new RandomQuery(filterPct);
}

boolean parseIsCountOnly() {
// Check for count: "count(...)"
final Matcher m = countOnlyPattern.matcher(text);
Expand Down Expand Up @@ -374,6 +389,17 @@ List<FieldAndWeight> parseCombinedFields() {
}
}

/**
 * Extracts an optional "+dismaxFields=field1,field2,..." suffix from the task text.
 *
 * <p>Everything following the marker is split on commas into field names, and {@code text} is
 * truncated to the part that precedes the marker.
 *
 * @return the field names listed after the marker, or {@code null} when the marker is absent
 */
List<String> parseDismaxFields() {
  final String marker = "+dismaxFields=";
  final int markerStart = text.indexOf(marker);
  if (markerStart < 0) {
    return null;
  }

  // Capture the comma-separated field list before the task text is truncated.
  final String fieldList = text.substring(markerStart + marker.length());
  text = text.substring(0, markerStart);
  return Arrays.asList(fieldList.split(","));
}

List<String> parseFacets() {
List<String> facets = new ArrayList<>();
while (true) {
Expand Down Expand Up @@ -443,7 +469,7 @@ void parseHilite() {
}
}

Query buildQuery(String type, String text, int minShouldMatch, List<FieldAndWeight> fieldAndWeights) throws ParseException, IOException {
Query buildQuery(String type, String text, int minShouldMatch) throws ParseException, IOException {
Query query;
switch(type) {
case "ordered":
Expand Down Expand Up @@ -485,7 +511,7 @@ Query buildQuery(String type, String text, int minShouldMatch, List<FieldAndWeig
if (combinedFields != null) {
CombinedFieldQuery.Builder cfqBuilder = new CombinedFieldQuery.Builder();

for (FieldAndWeight fieldAndWeight : fieldAndWeights) {
for (FieldAndWeight fieldAndWeight : combinedFields) {
cfqBuilder.addField(fieldAndWeight.field, fieldAndWeight.weight);
}

Expand All @@ -505,6 +531,28 @@ Query buildQuery(String type, String text, int minShouldMatch, List<FieldAndWeig
return cfqBuilder.build();
}

if (dismaxFields != null) {
List<Query> dismaxClauses = new ArrayList<>();
for (String field : dismaxFields) {
if (query instanceof TermQuery tq) {
dismaxClauses.add(new TermQuery(new Term(field, tq.getTerm().bytes())));
} else if (query instanceof BooleanQuery bq) {
BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder();
for (BooleanClause clause : bq.clauses()) {
if (clause.query() instanceof TermQuery tq) {
bqBuilder.add(new TermQuery(new Term(field, tq.getTerm().bytes())), clause.occur());
} else {
throw new IllegalStateException("Cannot change field of query: " + clause.query());
}
}
dismaxClauses.add(bqBuilder.build());
} else {
throw new IllegalStateException("Cannot change field of query: " + query);
}
}
return new DisjunctionMaxQuery(dismaxClauses, 0f);
}

if (minShouldMatch == 0) {
return query;
} else {
Expand Down Expand Up @@ -683,7 +731,6 @@ private String[][][] parseDisjunctionSpec(String[] fieldHolder, int[] slopHolder
throw new RuntimeException("failed to parse query=" + text);
}
fieldHolder[0] = text.substring("(".length(), colon);
MultiPhraseQuery.Builder b = new MultiPhraseQuery.Builder();
int endParen = text.indexOf(')');
if (endParen == -1) {
throw new RuntimeException("failed to parse query=" + text);
Expand Down Expand Up @@ -712,6 +759,8 @@ Query parseDisjunctionMax() {
}

Query parseVectorQuery() throws IOException {
Query preFilter = parsePreFilter();

float[] queryVector;
if (vectorChannel != null) {
if (this.vector == null) {
Expand All @@ -723,7 +772,11 @@ Query parseVectorQuery() throws IOException {
queryVector = vectorDictionary.computeTextVector(text);
}

return new KnnFloatVectorQuery(vectorField, queryVector, topN);
if (preFilter != null) {
return new KnnFloatVectorQuery(vectorField, queryVector, topN, preFilter);
} else {
return new KnnFloatVectorQuery(vectorField, queryVector, topN);
}
}
}
}
18 changes: 17 additions & 1 deletion src/python/nightlyBench.py
Original file line number Diff line number Diff line change
Expand Up @@ -1415,6 +1415,11 @@ def writeIndexHTML(searchChartData, days):
writeOneLine(w, done, 'CombinedHighMed', 'Combined high-freq medium-freq')
writeOneLine(w, done, 'CombinedHighHigh', 'Combined high-freq high-freq')

w('<br><br><b>DisjunctionMaxQuery to combine scores across the title and body fields:</b>')
writeOneLine(w, done, 'DismaxTerm', 'Term queries combined via dismax')
writeOneLine(w, done, 'DismaxOrHighMed', 'Disjunctive queries combined via dismax')
writeOneLine(w, done, 'DismaxOrHighHigh', 'Disjunctive queries combined via dismax')

w('<br><br><b>Proximity queries:</b>')
writeOneLine(w, done, 'Phrase', 'Exact phrase')
writeOneLine(w, done, 'SloppyPhrase', 'Sloppy (~4) phrase')
Expand All @@ -1433,14 +1438,25 @@ def writeIndexHTML(searchChartData, days):
writeOneLine(w, done, 'CountOrHighHigh', 'Count(high-freq high-freq)')
writeOneLine(w, done, 'CountOrHighMed', 'Count(high-freq med-freq)')

w('<br><br><b>Vector Search:</b>')
writeOneLine(w, done, 'VectorSearch', 'VectorSearch (approximate KNN float 768-dimension vector search from word embeddings)')
writeOneLine(w, done, 'PreFilteredVectorSearch', 'Likewise, with a pre-filter')
writeOneLine(w, done, 'PostFilteredVectorSearch', 'Same filter, but applied as a post-filter rather than a pre-filter')

w('<br><br><b>Other queries:</b>')
writeOneLine(w, done, 'Term', 'TermQuery')
writeOneLine(w, done, 'Respell', 'Respell (DirectSpellChecker)')
writeOneLine(w, done, 'PKLookup', 'Primary key lookup')
writeOneLine(w, done, 'Wildcard', 'WildcardQuery')
writeOneLine(w, done, 'Prefix3', 'PrefixQuery (3 leading characters)')
writeOneLine(w, done, 'IntNRQ', 'Numeric range filtering on last-modified-datetime')
writeOneLine(w, done, 'VectorSearch', 'VectorSearch (approximate KNN float 768-dimension vector search from word embeddings)')

w('<br><br><b>Filtered queries:</b>')
writeOneLine(w, done, 'FilteredTerm', '+term #filter')
writeOneLine(w, done, 'FilteredAndHighHigh', '+high-freq +high-freq #filter')
writeOneLine(w, done, 'FilteredAndHighMed', '+high-freq +medium-freq #filter')
writeOneLine(w, done, 'FilteredOrHighHigh', '(high-freq high-freq) #filter')
writeOneLine(w, done, 'FilteredOrHighMed', '(high-freq medium-freq) #filter')

w('<br><br><b>Faceting:</b>')
writeOneLine(w, done, 'TermDateFacets', 'Term query + date hierarchy')
Expand Down
75 changes: 75 additions & 0 deletions tasks/wikinightly.tasks
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,18 @@ VectorSearch: vector//many foundation # freq=99550 freq=10894
VectorSearch: vector//this school # freq=238551 freq=29912
VectorSearch: vector//such 2007 # freq=111526 freq=90200 1.2

PreFilteredVectorSearch: vector//publisher backstory +preFilter=5%
PreFilteredVectorSearch: vector//many geografia +preFilter=5%
PreFilteredVectorSearch: vector//many foundation +preFilter=5%
PreFilteredVectorSearch: vector//this school +preFilter=5%
PreFilteredVectorSearch: vector//such 2007 +preFilter=5%

PostFilteredVectorSearch: vector//publisher backstory +filter=5%
PostFilteredVectorSearch: vector//many geografia +filter=5%
PostFilteredVectorSearch: vector//many foundation +filter=5%
PostFilteredVectorSearch: vector//this school +filter=5%
PostFilteredVectorSearch: vector//such 2007 +filter=5%

OrHighMedDayTaxoFacets: 4 steve +facets:DayOfYear.taxonomy # freq=986452 freq=84364
OrHighMedDayTaxoFacets: some groups +facets:DayOfYear.taxonomy # freq=839919 freq=146670
OrHighMedDayTaxoFacets: a named +facets:DayOfYear.taxonomy # freq=6560531 freq=310150
Expand All @@ -210,6 +222,8 @@ MedTermDayTaxoFacets: site +facets:DayOfYear.taxonomy # freq=277788
MedTermDayTaxoFacets: man +facets:DayOfYear.taxonomy # freq=278112
MedTermDayTaxoFacets: football +facets:DayOfYear.taxonomy # freq=277738

# CombinedFieldQuery: does BM25F across the title and body fields, with a boost
# of 8 on the title field
CombinedTerm: 0 +combinedFields=titleTokenized^8.0,body
CombinedTerm: names +combinedFields=titleTokenized^8.0,body
CombinedTerm: nbsp +combinedFields=titleTokenized^8.0,body
Expand All @@ -228,6 +242,30 @@ CombinedHighHigh: but year +combinedFields=titleTokenized^8.0,body
CombinedHighHigh: name its +combinedFields=titleTokenized^8.0,body
CombinedHighHigh: to but +combinedFields=titleTokenized^8.0,body

# DisjunctionMaxQuery across the title and body fields. Even though this may not
# be the most effective way to combine scores across fields, it's been used and
# still is used a lot for this.
DismaxTerm: 0 +dismaxFields=titleTokenized,body
DismaxTerm: names +dismaxFields=titleTokenized,body
DismaxTerm: nbsp +dismaxFields=titleTokenized,body
DismaxTerm: part +dismaxFields=titleTokenized,body
DismaxTerm: st +dismaxFields=titleTokenized,body

DismaxOrHighHigh: are last +dismaxFields=titleTokenized,body
DismaxOrHighHigh: at united +dismaxFields=titleTokenized,body
DismaxOrHighHigh: but year +dismaxFields=titleTokenized,body
DismaxOrHighHigh: name its +dismaxFields=titleTokenized,body
DismaxOrHighHigh: to but +dismaxFields=titleTokenized,body

DismaxOrHighMed: at mostly +dismaxFields=titleTokenized,body
DismaxOrHighMed: his interview +dismaxFields=titleTokenized,body
DismaxOrHighMed: http 9 +dismaxFields=titleTokenized,body
DismaxOrHighMed: they hard +dismaxFields=titleTokenized,body
DismaxOrHighMed: title bay +dismaxFields=titleTokenized,body

# Counting queries. This helps track performance of exhaustive evaluation, plus
# some counting-specific optimizations like counting ones in a bitset for
# disjunctions.
CountAndHighHigh: count(+be +up) # freq=2115632 freq=824628
CountAndHighHigh: count(+cite +had) # freq=1367577 freq=1223103
CountAndHighHigh: count(+is +he) # freq=4214104 freq=1663980
Expand Down Expand Up @@ -313,3 +351,40 @@ OrMany: solr pronounced solar is an open-source enterprise-search platform writt
OrMany: a database index is a data structure that improves the speed of data retrieval operations on a database table at the cost of additional writes and storage space to maintain the index data structure indexes are used to quickly locate data without having to search every row in a database table every time said table is accessed
OrMany: in computing a search engine is an information retrieval software system designed to help find information stored on one or more computer systems search engines discover crawl transform and store information for retrieval and presentation in response to user queries
OrMany: a java virtual machine jvm is a virtual machine that enables a computer to run java programs as well as programs written in other languages that are also compiled to java bytecode the jvm is detailed by a specification that formally describes what is required in a jvm implementation

# Filtered variants of some of the most interesting queries
FilteredTerm: 0 +filter=5%
FilteredTerm: names +filter=5%
FilteredTerm: nbsp +filter=5%
FilteredTerm: part +filter=5%
FilteredTerm: st +filter=5%

FilteredAndHighHigh: +be +up +filter=5%
FilteredAndHighHigh: +cite +had +filter=5%
FilteredAndHighHigh: +is +he +filter=5%
FilteredAndHighHigh: +no +4 +filter=5%
FilteredAndHighHigh: +title +see +filter=5%

FilteredAndHighMed: +2010 +16 +filter=5%
FilteredAndHighMed: +5 +power +filter=5%
FilteredAndHighMed: +only +particularly +filter=5%
FilteredAndHighMed: +united +1983 +filter=5%
FilteredAndHighMed: +who +ed +filter=5%

FilteredOrHighHigh: are last +filter=5%
FilteredOrHighHigh: at united +filter=5%
FilteredOrHighHigh: but year +filter=5%
FilteredOrHighHigh: name its +filter=5%
FilteredOrHighHigh: to but +filter=5%

FilteredOrHighMed: at mostly +filter=5%
FilteredOrHighMed: his interview +filter=5%
FilteredOrHighMed: http 9 +filter=5%
FilteredOrHighMed: they hard +filter=5%
FilteredOrHighMed: title bay +filter=5%

FilteredPhrase: "it was" +filter=5%
FilteredPhrase: "red please" +filter=5%
FilteredPhrase: "the average" +filter=5%
FilteredPhrase: "the us" +filter=5%
FilteredPhrase: "when a" +filter=5%

0 comments on commit cd2e383

Please sign in to comment.