From 89afa74a53e58f15e09632d7ea9a2bea22b8df99 Mon Sep 17 00:00:00 2001 From: Vivek Narang Date: Tue, 30 Apr 2024 07:34:54 -0400 Subject: [PATCH] allow passing args to the example code --- .gitignore | 2 + README.md | 8 +- .../vectorsearch/CuVSIndexSearcher.java | 28 +++--- .../LuceneVectorSearchExample.java | 87 +++++++++++++------ 4 files changed, 84 insertions(+), 41 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b3fdf27 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +build +target \ No newline at end of file diff --git a/README.md b/README.md index be9ec38..724013d 100644 --- a/README.md +++ b/README.md @@ -16,11 +16,17 @@ By way of a working example, OpenAI's Wikipedia corpus (25k documents) can be in Install RAFT (https://docs.rapids.ai/api/raft/stable/build/#installation) +Download the dataset file [using this link](https://cdn.openai.com/API/examples/data/vector_database_wikipedia_articles_embedded.zip) + Set the correct path for Raft in `cuda/CMakeLists.txt` file. 
Then, proceed to run the following (Wikipedia OpenAI benchmark): wget -c https://cdn.openai.com/API/examples/data/vector_database_wikipedia_articles_embedded.zip mvn package - java -jar lucene/target/cuvs-searcher-lucene-0.0.1-SNAPSHOT-jar-with-dependencies.jar + java -jar lucene/target/cuvs-searcher-lucene-0.0.1-SNAPSHOT-jar-with-dependencies.jar <dataset_file> <vector_column_index> <vector_column_name> <number_of_documents_to_index> <vector_dimension> <query_file> + + # Example + java -jar lucene/target/cuvs-searcher-lucene-0.0.1-SNAPSHOT-jar-with-dependencies.jar vector_database_wikipedia_articles_embedded.zip 5 content_vector 25000 768 query.txt + ## Benchmarks diff --git a/lucene/src/main/java/com/searchscale/lucene/vectorsearch/CuVSIndexSearcher.java b/lucene/src/main/java/com/searchscale/lucene/vectorsearch/CuVSIndexSearcher.java index ed1c877..f84ed8a 100644 --- a/lucene/src/main/java/com/searchscale/lucene/vectorsearch/CuVSIndexSearcher.java +++ b/lucene/src/main/java/com/searchscale/lucene/vectorsearch/CuVSIndexSearcher.java @@ -30,12 +30,12 @@ public CuVSIndexSearcher(IndexReader reader) { List docIds = new ArrayList<>(); List dataVectors = new ArrayList(); try { - for (LeafReaderContext leaf: reader.leaves()) { - FloatVectorValues vectors = leaf.reader().getFloatVectorValues("content_vector"); + for (LeafReaderContext leaf : reader.leaves()) { + FloatVectorValues vectors = leaf.reader().getFloatVectorValues(LuceneVectorSearchExample.vectorColName); DocIdSetIterator disi = FloatVectorValues.all(leaf.reader().maxDoc()); for (int doc = disi.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = disi.nextDoc()) { vectors.advance(doc); - docIds.add(leaf.docBase+doc); + docIds.add(leaf.docBase + doc); dataVectors.add(vectors.vectorValue().clone()); } } @@ -45,29 +45,31 @@ public CuVSIndexSearcher(IndexReader reader) { int numVectors = dataVectors.size(); int dim = dataVectors.get(0).length; float[] singleDataVector = new float[numVectors * dim]; - for (int i=0; i