Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Preparing the ES Embeddings Rewriter #1

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 17 additions & 1 deletion src/main/java/querqy/embeddings/ChorusEmbeddingModel.java
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
package querqy.embeddings;

import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import querqy.solr.utils.JsonUtil;

import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
Expand All @@ -17,6 +21,8 @@ public class ChorusEmbeddingModel implements EmbeddingModel {

private static final String CONTENT_TYPE_JSON = "application/json";

private static final ObjectMapper objectMapper = new ObjectMapper();

private URL url;

private boolean normalize = true;
Expand Down Expand Up @@ -65,14 +71,23 @@ public Embedding getEmbedding(final String text) {
os.write(input, 0, input.length);
}

embedding = Embedding.of((List<Double>) JsonUtil.readJson(con.getInputStream(), Map.class).get("embedding"));
embedding = parseEmbeddingFromResponse(con.getInputStream());
embeddingsCache.putEmbedding(cacheKey, embedding);
return embedding;

} catch (final IOException e) {
throw new RuntimeException(e);
}
}

public Embedding parseEmbeddingFromResponse(InputStream is) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not understanding why this has been changed. What is the benefit of not using JsonUtil.readJson()?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The code before was
(List<Double>) JsonUtil.readJson(con.getInputStream(), Map.class).get("embedding")

What happens here is that the Jackson ObjectMapper simply reads the object as a Map and the embedding field as a List. It does not try to coerce the elements into a common List element type, i.e. you get a List of (Double, Integer, Double) if you parse the JSON array [1.2, 1, 1.3]. You then get a ClassCastException ("class java.lang.Integer cannot be cast to class java.lang.Double") as soon as such an element of the List is accessed as a Double.

try {
// Read the complete response body into a JSON tree.
JsonNode responseTree = objectMapper.readTree(is);
// Converting the "embedding" node with an explicit List<Double> target makes Jackson coerce
// every array element to Double. Reading the response into a plain Map kept integer literals
// as Integer (e.g. [1.2, 1, 1.3]), which failed with a ClassCastException on element access.
// NOTE(review): path() is used instead of get(), so an absent "embedding" field does not fail
// here; confirm what convertValue yields in that case and how Embedding.of handles it.
List<Double> embedding = objectMapper.convertValue(responseTree.path("embedding"), new TypeReference<>() {});
return Embedding.of(embedding);
} catch (IOException e) {
// Surface read/parse failures as unchecked, keeping the original exception as the cause.
throw new RuntimeException(e);
}
}

protected String toJsonString(final String text) {
Expand All @@ -86,4 +101,5 @@ protected String toJsonString(final String text) {

)));
}

}
51 changes: 4 additions & 47 deletions src/main/java/querqy/embeddings/EmbeddingsRewriter.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
import querqy.model.Node;
import querqy.model.QuerqyQuery;
import querqy.model.Query;
import querqy.model.StringRawQuery;
import querqy.model.Term;
import querqy.rewrite.QueryRewriter;
import querqy.rewrite.RewriterOutput;
Expand Down Expand Up @@ -77,65 +76,23 @@ public RewriterOutput rewrite(final ExpandedQuery query,
}

/**
 * Applies a kNN vector query for the given embedding to the query, depending on the configured
 * query mode.
 *
 * <p>In {@code BOOST} mode the vector query is added once as a boost-up query weighted by
 * {@code boost}. In {@code MAIN_QUERY} mode the vector query replaces the user query.</p>
 *
 * @param embedding  the embedding whose vector is searched for in {@code vectorField}
 * @param inputQuery the query to modify; it is changed in place
 * @return the same {@code inputQuery} instance that was passed in
 * @throws IllegalStateException if the query mode is neither {@code BOOST} nor {@code MAIN_QUERY}
 */
protected ExpandedQuery applyEmbedding(final Embedding embedding, final ExpandedQuery inputQuery) {

    // A single Lucene kNN query, wrapped as a raw query, serves both modes. Building it once avoids
    // the duplicated boost query and the redundant, immediately overwritten user query.
    final KnnVectorQuery knnVectorQuery = new KnnVectorQuery(vectorField, embedding.asVector(), topK);
    final LuceneRawQuery luceneRawQuery = new LuceneRawQuery(null, Clause.Occur.MUST, true, knnVectorQuery);

    switch (queryMode) {
        case BOOST:
            inputQuery.addBoostUpQuery(new BoostQuery(luceneRawQuery, boost));
            break;
        case MAIN_QUERY:
            inputQuery.setUserQuery(luceneRawQuery);
            break;
        default:
            throw new IllegalStateException("Unknown query mode: " + queryMode);
    }

    return inputQuery;
}

/**
 * Builds the function-query string that wraps a {@code {!knn}} query on {@code vectorField}.
 *
 * @param embedding the embedding whose comma-separated components are placed into the knn query
 * @return the query string, with the knn query nested inside {@code sum(100,query(...))}
 */
protected String makeEmbeddingQueryString(final Embedding embedding) {
    // Inner part: the knn parser invocation carrying field, topK and the vector literal.
    final String knnClause = "{!knn f=" + vectorField + " topK=" + topK
            + " v='[" + embedding.asCommaSeparatedString() + "]'}";
    // Outer part: the function query that adds the constant 100 to the knn query's value.
    return "{!func}sum(100,query(" + knnClause + "))";
}

/**
 * Renders the components of a vector as a single string, separated by {@code ", "}.
 *
 * @param embedding the vector components to render
 * @return the components in order, joined by a comma and a space; empty for an empty array
 */
protected String embeddingToString(final float[] embedding) {
    final StringBuilder out = new StringBuilder(embedding.length * 16);
    // The separator is empty before the first component and ", " before every later one.
    String separator = "";
    for (final float component : embedding) {
        out.append(separator).append(component);
        separator = ", ";
    }
    return out.toString();
}

/**
 * Applies an embedding query, given as a raw query string, to the query according to the
 * configured query mode.
 *
 * @param embeddingQueryString the raw query string of the vector query
 * @param inputQuery           the query to modify; it is changed in place
 * @return the same {@code inputQuery} instance that was passed in
 * @throws IllegalStateException if the query mode is neither {@code BOOST} nor {@code MAIN_QUERY}
 */
protected ExpandedQuery applyVectorQuery(final String embeddingQueryString, final ExpandedQuery inputQuery) {

    // Built up front for every mode, although only the BOOST branch uses it.
    final StringRawQuery optionalVectorQuery =
            new StringRawQuery(null, embeddingQueryString, Clause.Occur.SHOULD, true);

    switch (queryMode) {
        case BOOST: {
            // Rank by the vector query without restricting the result set.
            final BoostQuery boostUp = new BoostQuery(optionalVectorQuery, boost);
            inputQuery.addBoostUpQuery(boostUp);
            return inputQuery;
        }
        case MAIN_QUERY: {
            // The vector query becomes the user query, this time as a mandatory clause.
            final StringRawQuery mandatoryVectorQuery =
                    new StringRawQuery(null, embeddingQueryString, Clause.Occur.MUST, true);
            inputQuery.setUserQuery(mandatoryVectorQuery);
            return inputQuery;
        }
        default:
            throw new IllegalStateException("Unknown query mode: " + queryMode);
    }
}
/**
* Traverse the query graph, collect all the terms and join them into a string
*/
Expand Down
19 changes: 19 additions & 0 deletions src/test/java/querqy/solr/embeddings/ChorusEmbeddingModelTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
package querqy.solr.embeddings;

import org.junit.Assert;
import org.junit.Test;
import querqy.embeddings.ChorusEmbeddingModel;
import querqy.embeddings.Embedding;

import java.io.ByteArrayInputStream;
import java.nio.charset.StandardCharsets;

/**
 * Tests how {@link ChorusEmbeddingModel} parses the JSON response of the embedding service.
 */
public class ChorusEmbeddingModelTest {

    /**
     * An embedding array that mixes decimal and integer literals must be parsed into a vector
     * without failing on the integer elements.
     */
    @Test
    public void testParseJson() {
        final String embeddingJson = "{ \"embedding\": [0.3, 1, 5] }";
        final Embedding e = new ChorusEmbeddingModel().parseEmbeddingFromResponse(
                new ByteArrayInputStream(embeddingJson.getBytes(StandardCharsets.UTF_8)));
        // JUnit expects (expected, actual, delta); with the arguments swapped, a failure message
        // would report the expected and the actual values the wrong way round.
        Assert.assertArrayEquals(new float[] {0.3f, 1f, 5f}, e.asVector(), 0f);
    }
}