
Commit

allow passing args to the example code

Vivek Narang committed Apr 30, 2024
1 parent e8ffc55 commit 89afa74
Showing 4 changed files with 84 additions and 41 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
+build
+target
8 changes: 7 additions & 1 deletion README.md
@@ -16,11 +16,17 @@ By way of a working example, OpenAI's Wikipedia corpus (25k documents) can be in

 Install RAFT (https://docs.rapids.ai/api/raft/stable/build/#installation)
 
 Download the dataset file [using this link](https://cdn.openai.com/API/examples/data/vector_database_wikipedia_articles_embedded.zip)
 
 Set the correct path for RAFT in the `cuda/CMakeLists.txt` file. Then proceed to run the following (Wikipedia OpenAI benchmark):
 
+    wget -c https://cdn.openai.com/API/examples/data/vector_database_wikipedia_articles_embedded.zip
     mvn package
-    java -jar lucene/target/cuvs-searcher-lucene-0.0.1-SNAPSHOT-jar-with-dependencies.jar
+    java -jar lucene/target/cuvs-searcher-lucene-0.0.1-SNAPSHOT-jar-with-dependencies.jar <datasetfile> <vector_index_column> <name_of_vector_field> <numDocs> <dimensions> <queryFile>
+
+    # Example
+    java -jar lucene/target/cuvs-searcher-lucene-0.0.1-SNAPSHOT-jar-with-dependencies.jar vector_database_wikipedia_articles_embedded.zip 5 content_vector 25000 768 query.txt


## Benchmarks

CuVSIndexSearcher.java

@@ -30,12 +30,12 @@ public CuVSIndexSearcher(IndexReader reader) {
     List<Integer> docIds = new ArrayList<>();
     List<float[]> dataVectors = new ArrayList<float[]>();
     try {
-      for (LeafReaderContext leaf: reader.leaves()) {
-        FloatVectorValues vectors = leaf.reader().getFloatVectorValues("content_vector");
+      for (LeafReaderContext leaf : reader.leaves()) {
+        FloatVectorValues vectors = leaf.reader().getFloatVectorValues(LuceneVectorSearchExample.vectorColName);
         DocIdSetIterator disi = FloatVectorValues.all(leaf.reader().maxDoc());
         for (int doc = disi.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = disi.nextDoc()) {
           vectors.advance(doc);
-          docIds.add(leaf.docBase+doc);
+          docIds.add(leaf.docBase + doc);
           dataVectors.add(vectors.vectorValue().clone());
         }
       }
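The `leaf.docBase + doc` arithmetic in this hunk is what turns segment-local document ids into index-wide ids. A minimal, Lucene-free sketch of that mapping, with invented segment sizes for illustration:

    public class DocBaseSketch {
      public static void main(String[] args) {
        // Two hypothetical segments (leaves) of 3 and 2 documents. Lucene
        // numbers documents 0..maxDoc-1 within each leaf; docBase is the
        // leaf's global offset into the whole index.
        int[] leafSizes = { 3, 2 };
        int docBase = 0;
        for (int leaf = 0; leaf < leafSizes.length; leaf++) {
          for (int doc = 0; doc < leafSizes[leaf]; doc++) {
            System.out.println("leaf " + leaf + ", local doc " + doc + " -> global id " + (docBase + doc));
          }
          docBase += leafSizes[leaf]; // the next leaf starts where this one ends
        }
      }
    }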
@@ -45,29 +45,31 @@ public CuVSIndexSearcher(IndexReader reader) {
     int numVectors = dataVectors.size();
     int dim = dataVectors.get(0).length;
     float[] singleDataVector = new float[numVectors * dim];
-    for (int i=0; i<numVectors; i++) {
-      for (int j=0; j<dim; j++) {
-        singleDataVector[i*dim + j] = dataVectors.get(i)[j];
+    for (int i = 0; i < numVectors; i++) {
+      for (int j = 0; j < dim; j++) {
+        singleDataVector[i * dim + j] = dataVectors.get(i)[j];
       }
     }
     int docIdsArr[] = new int[docIds.size()];
-    for (int i=0; i<docIdsArr.length; i++) docIdsArr[i] = docIds.get(i);
-    System.out.println("Time taken for copying data from IndexReader to arrays for C++: " + (System.currentTimeMillis()-startTime));
+    for (int i = 0; i < docIdsArr.length; i++)
+      docIdsArr[i] = docIds.get(i);
+    System.out.println(
+        "Time taken for copying data from IndexReader to arrays for C++: " + (System.currentTimeMillis() - startTime));
     startTime = System.currentTimeMillis();
     jni.initIndex(docIdsArr, singleDataVector, docIdsArr.length, dataVectors.get(0).length);
-    System.out.println("Time taken for index building: " + (System.currentTimeMillis()-startTime));
+    System.out.println("Time taken for index building: " + (System.currentTimeMillis() - startTime));
   }
 
   @Override
   public TopDocs search(Query query, int n) throws IOException {
     KnnFloatVectorQuery knnQuery = (KnnFloatVectorQuery) query;
     Object results = jni.getTopK(knnQuery.getTargetCopy(), knnQuery.getK());
-    ByteBuffer buf = ((ByteBuffer)results).order(ByteOrder.nativeOrder());
+    ByteBuffer buf = ((ByteBuffer) results).order(ByteOrder.nativeOrder());
     int N = buf.limit() / 8;
     ScoreDoc scoreDocs[] = new ScoreDoc[N];
-    for (int i=0; i<N; i++) {
-      float score = buf.getFloat((i)*4);
-      int id = buf.getInt((N+i)*4);
+    for (int i = 0; i < N; i++) {
+      float score = buf.getFloat((i) * 4);
+      int id = buf.getInt((N + i) * 4);
       scoreDocs[i] = new ScoreDoc(id, score);
     }
     return new TopDocs(new TotalHits(N, TotalHits.Relation.EQUAL_TO), scoreDocs);
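The decoding loop in `search` implies a fixed layout for the buffer returned by `jni.getTopK`: 8 bytes per hit, all N scores first as floats, then all N doc ids as ints, in native byte order. The following self-contained sketch packs a buffer that way with a plain-Java stand-in for the native producer (an assumption for illustration; the real producer is the C++ side) and decodes it with the same arithmetic as `search`:

    import java.nio.ByteBuffer;
    import java.nio.ByteOrder;

    public class TopKBufferSketch {
      public static void main(String[] args) {
        // Pack three hits: N float scores first, then N int doc ids.
        float[] scores = { 0.11f, 0.42f, 0.97f };
        int[] ids = { 42, 7, 13 };
        int n = scores.length;
        ByteBuffer buf = ByteBuffer.allocate(n * 8).order(ByteOrder.nativeOrder());
        for (float s : scores)
          buf.putFloat(s);
        for (int id : ids)
          buf.putInt(id);

        // Decode with the same offsets as CuVSIndexSearcher.search.
        int N = buf.limit() / 8;
        for (int i = 0; i < N; i++) {
          float score = buf.getFloat(i * 4);
          int id = buf.getInt((N + i) * 4);
          System.out.println("doc " + id + ", score " + score);
        }
      }
    }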
LuceneVectorSearchExample.java

@@ -1,5 +1,7 @@
 package com.searchscale.lucene.vectorsearch;
 
+import java.io.File;
+import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStreamReader;
 import java.util.Arrays;
@@ -38,55 +38,84 @@

 public class LuceneVectorSearchExample {
 
-  public static int DIMENSIONS = 1536;
-
-  public static void main(String[] args) {
+  public static String vectorColName = null;
+
+  public static void main(String[] args) throws Exception {
+
+    // [0] Parse Args
+
+    String datasetFile = args[0];
+    int indexOfVector = Integer.valueOf(args[1]);
+    vectorColName = args[2];
+    int numDocs = Integer.valueOf(args[3]);
+    int dims = Integer.valueOf(args[4]);
+    String queryFile = args[5];
+    System.out.println("Dataset file used is: " + datasetFile);
+    System.out.println("Index of the vector field is: " + indexOfVector);
+    System.out.println("Name of the vector field is: " + vectorColName);
+    System.out.println("Number of documents to be indexed is: " + numDocs);
+    System.out.println("Number of dimensions is: " + dims);
+    System.out.println("Query file used is: " + queryFile);
 
     // [1] Setup the index
     Directory index = new ByteBuffersDirectory();
-    Lucene99Codec knnVectorsCodec = getCodec(DIMENSIONS);
+    Lucene99Codec knnVectorsCodec = getCodec(dims);
     IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer()).setCodec(knnVectorsCodec);
 
     // [2] Index
     long startTime = System.currentTimeMillis();
-    try (ZipFile zip = new ZipFile("vector_database_wikipedia_articles_embedded.zip");
-        IndexWriter writer = new IndexWriter(index, config)) {
-      CSVReader reader = new CSVReader(new InputStreamReader(zip.getInputStream(zip.entries().nextElement())));
+    try {
+      InputStreamReader isr = null;
+      IndexWriter writer = new IndexWriter(index, config);
+      if (datasetFile.endsWith(".zip")) {
+        ZipFile zip = new ZipFile(datasetFile);
+        isr = new InputStreamReader(zip.getInputStream(zip.entries().nextElement()));
+      } else {
+        isr = new InputStreamReader(new FileInputStream(datasetFile));
+      }
+
+      CSVReader reader = new CSVReader(isr);
       String[] line;
       int count = 0;
       while ((line = reader.readNext()) != null) {
-        if ((count++) == 0) continue; // skip the first line of the file, it is a header
+        if ((count++) == 0)
+          continue; // skip the first line of the file, it is a header
         Document doc = new Document();
-        doc.add(new StringField("id", ""+(count-2), Field.Store.YES));
+        doc.add(new StringField("id", "" + (count - 2), Field.Store.YES));
         doc.add(new StringField("url", line[1], Field.Store.YES));
         doc.add(new StringField("title", line[2], Field.Store.YES));
         doc.add(new TextField("text", line[3], Field.Store.YES));
-        float[] contentVector = reduceDimensionVector(parseFloatArrayFromStringArray(line[5]), DIMENSIONS);
-        doc.add(new KnnFloatVectorField("content_vector", contentVector, VectorSimilarityFunction.EUCLIDEAN));
+        float[] contentVector = reduceDimensionVector(parseFloatArrayFromStringArray(line[5]), dims);
+        doc.add(new KnnFloatVectorField(vectorColName, contentVector, VectorSimilarityFunction.EUCLIDEAN));
         doc.add(new StringField("vector_id", line[6], Field.Store.YES));
-        if (count % 500 == 0) writer.commit();
-        if (count % 5000 == 0) System.out.println(count + " docs indexed ...");
+
+        if (count % 500 == 0)
+          writer.commit();
+        if (count % 5000 == 0)
+          System.out.println(count + " docs indexed ...");
         writer.addDocument(doc);
+        if (count == numDocs)
+          break;
       }
       writer.commit();
     } catch (Exception e) {
       e.printStackTrace();
     }
-    System.out.println("Time taken for index building (end to end): " + (System.currentTimeMillis()-startTime));
+
+    System.out.println("Time taken for index building (end to end): " + (System.currentTimeMillis() - startTime));
+
     // [3] Query
     try (IndexReader reader = DirectoryReader.open(index)) {
       IndexSearcher searcher = new CuVSIndexSearcher(reader);
-      for (String line: FileUtils.readFileToString(new File("query.txt"), "UTF-8").split("\n")) {
-        float queryVector[] = reduceDimensionVector(parseFloatArrayFromStringArray(line), DIMENSIONS);
-        Query query = new KnnFloatVectorQuery("content_vector", queryVector, 5);
+      for (String line : FileUtils.readFileToString(new File(queryFile), "UTF-8").split("\n")) {
+        float queryVector[] = reduceDimensionVector(parseFloatArrayFromStringArray(line), dims);
+        Query query = new KnnFloatVectorQuery(vectorColName, queryVector, 5);
         startTime = System.currentTimeMillis();
-        TopDocs topDocs = searcher.search(query, ((KnnFloatVectorQuery)query).getK());
-        System.out.println("Time taken for searching (end to end): " + (System.currentTimeMillis()-startTime));
+        TopDocs topDocs = searcher.search(query, ((KnnFloatVectorQuery) query).getK());
+        System.out.println("Time taken for searching (end to end): " + (System.currentTimeMillis() - startTime));
         ScoreDoc[] hits = topDocs.scoreDocs;
         System.out.println("Found " + hits.length + " hits.");
-        for (ScoreDoc hit: hits) {
+        for (ScoreDoc hit : hits) {
           Document d = searcher.storedFields().document(hit.doc);
-          System.out.println("DocID: " + hit.doc + ", Score: " + hit.score + ", Title: " + d.get("title"));
+          System.out.println("DocID: " + hit.doc + ", Score: " + hit.score);
         }
       }
     } catch (Exception e) {
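The dataset handling added in this hunk reduces to one branch: a path ending in `.zip` is read from the archive's first entry, and anything else is treated as an already-extracted CSV file. A sketch of that selection in isolation (resource cleanup is simplified, and the class and method names here are illustrative):

    import java.io.FileInputStream;
    import java.io.InputStreamReader;
    import java.io.Reader;
    import java.util.zip.ZipFile;

    public class DatasetOpenSketch {
      // Mirrors the branch in main(): zip archives are streamed from their
      // first entry, plain files are opened directly.
      static Reader open(String datasetFile) throws Exception {
        if (datasetFile.endsWith(".zip")) {
          ZipFile zip = new ZipFile(datasetFile); // left open for the stream's lifetime
          return new InputStreamReader(zip.getInputStream(zip.entries().nextElement()));
        }
        return new InputStreamReader(new FileInputStream(datasetFile));
      }

      public static void main(String[] args) throws Exception {
        try (Reader r = open(args[0])) {
          System.out.println("first char of dataset: " + (char) r.read());
        }
      }
    }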
@@ -95,7 +126,8 @@ public static void main(String[] args) {
   }
 
   private static Lucene99Codec getCodec(int dimensions) {
-    if (dimensions <= 1024) return new Lucene99Codec(Mode.BEST_SPEED);
+    if (dimensions <= 1024)
+      return new Lucene99Codec(Mode.BEST_SPEED);
     Lucene99Codec knnVectorsCodec = new Lucene99Codec(Mode.BEST_SPEED) {
       @Override
       public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
@@ -109,14 +141,15 @@ public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
   }
 
   private static float[] parseFloatArrayFromStringArray(String str) {
-    float[] titleVector = ArrayUtils.toPrimitive(Arrays.stream(str.replace("[", "").replace("]", "").
-        split(", ")).map(Float::valueOf).toArray(Float[]::new));
+    float[] titleVector = ArrayUtils.toPrimitive(
+        Arrays.stream(str.replace("[", "").replace("]", "").split(", ")).map(Float::valueOf).toArray(Float[]::new));
     return titleVector;
   }
 
   public static float[] reduceDimensionVector(float[] vector, int dim) {
     float out[] = new float[dim];
-    for (int i=0; i<dim && i<vector.length; i++) out[i] = vector[i];
+    for (int i = 0; i < dim && i < vector.length; i++)
+      out[i] = vector[i];
     return out;
   }
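Taken together, `parseFloatArrayFromStringArray` and `reduceDimensionVector` define the vector text format end to end: each embedding, in the CSV's vector column and on each line of the query file alike, is a `[f1, f2, ...]` string, which is then truncated (or zero-padded) to the requested dimension. A dependency-free sketch of the round trip:

    import java.util.Arrays;

    public class VectorFormatSketch {
      // Same parsing as parseFloatArrayFromStringArray, without the
      // commons-lang ArrayUtils dependency.
      static float[] parse(String str) {
        String[] parts = str.replace("[", "").replace("]", "").split(", ");
        float[] out = new float[parts.length];
        for (int i = 0; i < parts.length; i++)
          out[i] = Float.parseFloat(parts[i]);
        return out;
      }

      // Copies the first dim values; shorter inputs stay zero-padded.
      static float[] reduce(float[] vector, int dim) {
        float[] out = new float[dim];
        for (int i = 0; i < dim && i < vector.length; i++)
          out[i] = vector[i];
        return out;
      }

      public static void main(String[] args) {
        float[] v = parse("[0.25, -1.5, 3.0, 9.0]");
        System.out.println(Arrays.toString(reduce(v, 2))); // [0.25, -1.5]
        System.out.println(Arrays.toString(reduce(v, 6))); // trailing zeros
      }
    }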

