Skip to content

Commit

Permalink
Merge branch 'main' into fields-parameter-support
Browse files Browse the repository at this point in the history
Signed-off-by: Ethan Emoto <[email protected]>
  • Loading branch information
e-emoto authored Jan 8, 2025
2 parents 28225e0 + 405e5e2 commit c4094b0
Show file tree
Hide file tree
Showing 33 changed files with 509 additions and 87 deletions.
2 changes: 1 addition & 1 deletion .github/CODEOWNERS
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
# This should match the owning team set up in https://github.com/orgs/opensearch-project/teams
* @heemin32 @navneet1v @VijayanB @vamshin @jmazanec15 @naveentatikonda @junqiu-lei @martin-gaievski @ryanbogan @luyuncheng @shatejas
* @heemin32 @navneet1v @VijayanB @vamshin @jmazanec15 @naveentatikonda @junqiu-lei @martin-gaievski @ryanbogan @luyuncheng @shatejas @0ctopus13prime
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,16 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
- Introduced a writing layer in native engines which relies on the writing interface to process IO. (#2241)[https://github.com/opensearch-project/k-NN/pull/2241]
- Allow method parameter override for training based indices (#2290)[https://github.com/opensearch-project/k-NN/pull/2290]
- Optimizes lucene query execution to prevent unnecessary rewrites (#2305)[https://github.com/opensearch-project/k-NN/pull/2305]
- Add check to directly use ANN Search when filters match all docs. (#2320)[https://github.com/opensearch-project/k-NN/pull/2320]
- Use one formula to calculate cosine similarity (#2357)[https://github.com/opensearch-project/k-NN/pull/2357]
### Bug Fixes
* Fixing the bug when a segment has no vector field present for disk based vector search (#2282)[https://github.com/opensearch-project/k-NN/pull/2282]
* Fixing the bug where search fails with "fields" parameter for an index with a knn_vector field (#2314)[https://github.com/opensearch-project/k-NN/pull/2314]
* Fix for NPE while merging segments after all the vector fields docs are deleted (#2365)[https://github.com/opensearch-project/k-NN/pull/2365]
* Allow validation for non knn index only after 2.17.0 (#2315)[https://github.com/opensearch-project/k-NN/pull/2315]
* Release query vector memory after execution (#2346)[https://github.com/opensearch-project/k-NN/pull/2346]
* Fix shard level rescoring disabled setting flag (#2352)[https://github.com/opensearch-project/k-NN/pull/2352]
* Fix filter rewrite logic which was resulting in getting inconsistent / incorrect results for cases where filter was getting rewritten for shards (#2359)[https://github.com/opensearch-project/k-NN/pull/2359]
### Infrastructure
* Updated C++ version in JNI from c++11 to c++17 [#2259](https://github.com/opensearch-project/k-NN/pull/2259)
* Upgrade bytebuddy and objenesis version to match OpenSearch core and, update github ci runner for macos [#2279](https://github.com/opensearch-project/k-NN/pull/2279)
Expand Down
1 change: 1 addition & 0 deletions MAINTAINERS.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ This document contains a list of maintainers in this repo. See [opensearch-proje

| Maintainer | GitHub ID | Affiliation |
|-------------------------|-------------------------------------------------------|-------------|
| Doo Yong Kim | [0ctopus13prime](https://github.com/0ctopus13prime) | Amazon |
| Heemin Kim | [heemin32](https://github.com/heemin32) | Amazon |
| Jack Mazanec | [jmazanec15](https://github.com/jmazanec15) | Amazon |
| Junqiu Lei | [junqiu-lei](https://github.com/junqiu-lei) | Amazon |
Expand Down
2 changes: 2 additions & 0 deletions jni/src/faiss_wrapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1180,6 +1180,7 @@ jobjectArray knn_jni::faiss_wrapper::RangeSearchWithFilter(knn_jni::JNIUtilInter
jniUtil->ReleaseLongArrayElements(env, filterIdsJ, filteredIdsArray, JNI_ABORT);
throw;
}
jniUtil->ReleaseLongArrayElements(env, filterIdsJ, filteredIdsArray, JNI_ABORT);
} else {
faiss::SearchParameters *searchParameters = nullptr;
faiss::SearchParametersHNSW hnswParams;
Expand All @@ -1202,6 +1203,7 @@ jobjectArray knn_jni::faiss_wrapper::RangeSearchWithFilter(knn_jni::JNIUtilInter
throw;
}
}
jniUtil->ReleaseFloatArrayElements(env, queryVectorJ, rawQueryVector, JNI_ABORT);

// lims is structured to support batched queries, it has a length of nq + 1 (where nq is the number of queries),
// lims[i] - lims[i-1] gives the number of results for the i-th query. With a single query we used in k-NN,
Expand Down
15 changes: 0 additions & 15 deletions src/main/java/org/opensearch/knn/common/KNNVectorUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,7 @@

import lombok.AccessLevel;
import lombok.NoArgsConstructor;
import org.opensearch.knn.index.vectorvalues.KNNVectorValues;

import java.io.IOException;
import java.util.List;
import java.util.Objects;

Expand Down Expand Up @@ -62,17 +60,4 @@ public static int[] intListToArray(final List<Integer> integerList) {
}
return intArray;
}

/**
* Iterates vector values once if it is not at start of the location,
* Intended to be done to make sure dimension and bytesPerVector are available
* @param vectorValues
* @throws IOException
*/
public static void iterateVectorValuesOnce(final KNNVectorValues<?> vectorValues) throws IOException {
if (vectorValues.docId() == -1) {
vectorValues.nextDoc();
vectorValues.getVector();
}
}
}
2 changes: 1 addition & 1 deletion src/main/java/org/opensearch/knn/index/KNNSettings.java
Original file line number Diff line number Diff line change
Expand Up @@ -577,7 +577,7 @@ public static Integer getFilteredExactSearchThreshold(final String indexName) {
.getAsInt(ADVANCED_FILTERED_EXACT_SEARCH_THRESHOLD, ADVANCED_FILTERED_EXACT_SEARCH_THRESHOLD_DEFAULT_VALUE);
}

public static boolean isShardLevelRescoringEnabledForDiskBasedVector(String indexName) {
public static boolean isShardLevelRescoringDisabledForDiskBasedVector(String indexName) {
return KNNSettings.state().clusterService.state()
.getMetadata()
.index(indexName)
Expand Down
14 changes: 13 additions & 1 deletion src/main/java/org/opensearch/knn/index/SpaceType.java
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,21 @@ public float scoreToDistanceTranslation(float score) {
}
},
COSINESIMIL("cosinesimil") {
/**
 * Cosine similarity has a range of [-1, 1], where -1 means the vectors are diametrically opposed and
 * 1 means they are identical in direction (perfectly similar). In Lucene, scores must be in the range
 * [0, Float.MAX_VALUE]. To map [-1, 1] into that range we use the formula adopted by Lucene, as seen in
 * https://github.com/apache/lucene/blob/0494c824e0ac8049b757582f60d085932a890800/lucene/core/src/java/org/apache/lucene/index/VectorSimilarityFunction.java#L73
 * We expect rawScore = 1 - cosine(x, y); if an underlying library returns a different range or a different
 * raw score, it should override this method to either provide a valid range or convert the raw score to the
 * 1 - cosine form and then call this method.
 *
 * @param rawScore score returned from the underlying library
 * @return Lucene-scaled score: max((2 - rawScore) / 2, 0)
 */
@Override
public float scoreTranslation(float rawScore) {
    // Diff artifact removed: the superseded `1 / (1 + rawScore)` formula is gone; only the
    // Lucene-aligned translation remains.
    return Math.max((2.0F - rawScore) / 2.0F, 0.0F);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
import static org.opensearch.knn.common.KNNConstants.MODEL_ID;
import static org.opensearch.knn.common.KNNVectorUtil.intListToArray;
import static org.opensearch.knn.common.KNNVectorUtil.iterateVectorValuesOnce;
import static org.opensearch.knn.index.codec.util.KNNCodecUtil.initializeVectorValues;
import static org.opensearch.knn.index.codec.transfer.OffHeapVectorTransferFactory.getVectorTransfer;

/**
Expand Down Expand Up @@ -52,7 +52,7 @@ public static DefaultIndexBuildStrategy getInstance() {
public void buildAndWriteIndex(final BuildIndexParams indexInfo) throws IOException {
final KNNVectorValues<?> knnVectorValues = indexInfo.getVectorValues();
// Needed to make sure we don't get 0 dimensions while initializing index
iterateVectorValuesOnce(knnVectorValues);
initializeVectorValues(knnVectorValues);
IndexBuildSetup indexBuildSetup = QuantizationIndexUtils.prepareIndexBuild(knnVectorValues, indexInfo);

try (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@

import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
import static org.opensearch.knn.common.KNNVectorUtil.intListToArray;
import static org.opensearch.knn.common.KNNVectorUtil.iterateVectorValuesOnce;
import static org.opensearch.knn.index.codec.util.KNNCodecUtil.initializeVectorValues;
import static org.opensearch.knn.index.codec.transfer.OffHeapVectorTransferFactory.getVectorTransfer;

/**
Expand Down Expand Up @@ -53,7 +53,7 @@ public static MemOptimizedNativeIndexBuildStrategy getInstance() {
public void buildAndWriteIndex(final BuildIndexParams indexInfo) throws IOException {
final KNNVectorValues<?> knnVectorValues = indexInfo.getVectorValues();
// Needed to make sure we don't get 0 dimensions while initializing index
iterateVectorValuesOnce(knnVectorValues);
initializeVectorValues(knnVectorValues);
KNNEngine engine = indexInfo.getKnnEngine();
Map<String, Object> indexParameters = indexInfo.getParameters();
IndexBuildSetup indexBuildSetup = QuantizationIndexUtils.prepareIndexBuild(knnVectorValues, indexInfo);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
import static org.opensearch.knn.common.FieldInfoExtractor.extractVectorDataType;
import static org.opensearch.knn.common.KNNConstants.MODEL_ID;
import static org.opensearch.knn.common.KNNConstants.PARAMETERS;
import static org.opensearch.knn.common.KNNVectorUtil.iterateVectorValuesOnce;
import static org.opensearch.knn.index.codec.util.KNNCodecUtil.initializeVectorValues;
import static org.opensearch.knn.index.codec.util.KNNCodecUtil.buildEngineFileName;
import static org.opensearch.knn.index.engine.faiss.Faiss.FAISS_BINARY_INDEX_DESCRIPTION_PREFIX;

Expand Down Expand Up @@ -100,7 +100,7 @@ public static NativeIndexWriter getWriter(
* @throws IOException
*/
public void flushIndex(final KNNVectorValues<?> knnVectorValues, int totalLiveDocs) throws IOException {
iterateVectorValuesOnce(knnVectorValues);
initializeVectorValues(knnVectorValues);
buildAndWriteIndex(knnVectorValues, totalLiveDocs);
recordRefreshStats();
}
Expand All @@ -111,7 +111,7 @@ public void flushIndex(final KNNVectorValues<?> knnVectorValues, int totalLiveDo
* @throws IOException
*/
public void mergeIndex(final KNNVectorValues<?> knnVectorValues, int totalLiveDocs) throws IOException {
iterateVectorValuesOnce(knnVectorValues);
initializeVectorValues(knnVectorValues);
if (knnVectorValues.docId() == NO_MORE_DOCS) {
// This is in place so we do not add metrics
log.debug("Skipping mergeIndex, vector values are already iterated for {}", fieldInfo.name);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,15 @@
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.search.DocIdSetIterator;
import org.opensearch.knn.common.FieldInfoExtractor;
import org.opensearch.knn.common.KNNConstants;
import org.opensearch.knn.index.VectorDataType;
import org.opensearch.knn.index.codec.KNN80Codec.KNN80BinaryDocValues;
import org.opensearch.knn.index.engine.KNNEngine;
import org.opensearch.knn.index.vectorvalues.KNNVectorValues;

import java.io.IOException;
import java.util.Comparator;
import java.util.List;
import java.util.stream.Collectors;
Expand Down Expand Up @@ -116,6 +119,31 @@ public static String getNativeEngineFileFromFieldInfo(FieldInfo field, SegmentIn
}
}

/**
 * Advances the given vector values to the first vector document if the iterator has not been
 * positioned yet. Doing so makes properties such as the dimension and bytes-per-vector available
 * on the {@link KNNVectorValues} instance.
 * <p>
 * When the values contain no vector documents at all, the iterator ends up positioned at
 * {@link DocIdSetIterator#NO_MORE_DOCS} and no vector is fetched.
 *
 * @param vectorValues {@link KNNVectorValues} to initialize
 * @throws IOException if advancing or reading the vector values fails
 */
public static void initializeVectorValues(final KNNVectorValues<?> vectorValues) throws IOException {
    // docId == -1 signals that nextDoc() has never been called on this iterator.
    final boolean notYetPositioned = vectorValues.docId() == -1;
    if (notYetPositioned) {
        vectorValues.nextDoc();
        // Only pull a vector when a document actually exists; otherwise leave the
        // iterator exhausted at NO_MORE_DOCS.
        if (vectorValues.docId() != DocIdSetIterator.NO_MORE_DOCS) {
            vectorValues.getVector();
        }
    }
}

/**
* Get KNNEngine From FieldInfo
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

package org.opensearch.knn.index.mapper;

import org.opensearch.Version;
import org.opensearch.knn.index.engine.KNNMethodContext;
import org.opensearch.knn.index.engine.qframe.QuantizationConfig;

Expand Down Expand Up @@ -62,4 +63,12 @@ default QuantizationConfig getQuantizationConfig() {
* @return the dimension of the index; for model based indices, it will be null
*/
int getDimension();

/**
 * Returns the OpenSearch {@link Version} with which the index was created.
 * Implementations backed by index metadata should override this; the default
 * falls back to the version of the current node.
 *
 * @return index-created Version; defaults to {@link Version#CURRENT}
 */
default Version getIndexCreatedVersion() {
    return Version.CURRENT;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.KnnByteVectorField;
import org.apache.lucene.document.KnnFloatVectorField;
import org.opensearch.Version;
import org.opensearch.common.Explicit;
import org.opensearch.knn.index.KNNVectorSimilarityFunction;
import org.opensearch.knn.index.VectorDataType;
Expand Down Expand Up @@ -73,6 +74,11 @@ public Mode getMode() {
public CompressionLevel getCompressionLevel() {
return knnMethodConfigContext.getCompressionLevel();
}

@Override
public Version getIndexCreatedVersion() {
    // Sourced from the method config context captured at mapper construction time.
    return knnMethodConfigContext.getVersionCreated();
}
}
);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.VectorEncoding;
import org.opensearch.Version;
import org.opensearch.common.Explicit;
import org.opensearch.common.xcontent.XContentFactory;
import org.opensearch.knn.index.SpaceType;
Expand Down Expand Up @@ -86,6 +87,11 @@ public CompressionLevel getCompressionLevel() {
public QuantizationConfig getQuantizationConfig() {
return quantizationConfig;
}

@Override
public Version getIndexCreatedVersion() {
    // Sourced from the method config context captured at mapper construction time.
    return knnMethodConfigContext.getVersionCreated();
}
}
);
return new MethodFieldMapper(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,11 @@ public QuantizationConfig getQuantizationConfig() {
return quantizationConfig;
}

@Override
public Version getIndexCreatedVersion() {
    // Version recorded when the (model-based) index was created.
    return indexCreatedVersion;
}

// ModelMetadata relies on cluster state which may not be available during field mapper creation. Thus,
// we lazily initialize it.
private void initFromModelMetadata() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,10 @@ public enum FilterIdsSelectorType {
public static FilterIdsSelector getFilterIdSelector(final BitSet filterIdsBitSet, final int cardinality) throws IOException {
long[] filterIds;
FilterIdsSelector.FilterIdsSelectorType filterType;
if (filterIdsBitSet instanceof FixedBitSet) {
if (filterIdsBitSet == null) {
filterIds = null;
filterType = FilterIdsSelector.FilterIdsSelectorType.BITMAP;
} else if (filterIdsBitSet instanceof FixedBitSet) {
/**
* When filterIds is dense filter, using fixed bitset
*/
Expand Down
19 changes: 17 additions & 2 deletions src/main/java/org/opensearch/knn/index/query/KNNQueryBuilder.java
Original file line number Diff line number Diff line change
Expand Up @@ -662,9 +662,24 @@ public String getWriteableName() {

@Override
protected QueryBuilder doRewrite(QueryRewriteContext queryShardContext) throws IOException {
    // Rewrite the filter query, if one is present, to avoid runtime errors in later
    // steps of the query phase. When the rewrite produces a new query instance, return
    // a fresh KNNQueryBuilder carrying the rewritten filter; otherwise defer to the
    // default rewrite behavior.
    if (Objects.nonNull(filter)) {
        final QueryBuilder rewrittenFilter = filter.rewrite(queryShardContext);
        if (rewrittenFilter != filter) {
            return KNNQueryBuilder.builder()
                .fieldName(fieldName)
                .vector(vector)
                .k(k)
                .maxDistance(maxDistance)
                .minScore(minScore)
                .methodParameters(methodParameters)
                .filter(rewrittenFilter)
                .ignoreUnmapped(ignoreUnmapped)
                .rescoreContext(rescoreContext)
                .expandNested(expandNested)
                .build();
        }
    }
    return super.doRewrite(queryShardContext);
}
Expand Down
10 changes: 9 additions & 1 deletion src/main/java/org/opensearch/knn/index/query/KNNWeight.java
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ public Scorer scorer(LeafReaderContext context) throws IOException {
*/
public PerLeafResult searchLeaf(LeafReaderContext context, int k) throws IOException {
final BitSet filterBitSet = getFilteredDocsBitSet(context);
final int maxDoc = context.reader().maxDoc();
int cardinality = filterBitSet.cardinality();
// We don't need to go to JNI layer if no documents are found which satisfy the filters
// We should give this condition a deeper look that where it should be placed. For now I feel this is a good
Expand All @@ -145,7 +146,14 @@ public PerLeafResult searchLeaf(LeafReaderContext context, int k) throws IOExcep
Map<Integer, Float> result = doExactSearch(context, new BitSetIterator(filterBitSet, cardinality), cardinality, k);
return new PerLeafResult(filterWeight == null ? null : filterBitSet, result);
}
Map<Integer, Float> docIdsToScoreMap = doANNSearch(context, filterBitSet, cardinality, k);

/*
* If filters match all docs in this segment, then null should be passed as filterBitSet
* so that it will not do a bitset look up in bottom search layer.
*/
final BitSet annFilter = (filterWeight != null && cardinality == maxDoc) ? null : filterBitSet;
final Map<Integer, Float> docIdsToScoreMap = doANNSearch(context, annFilter, cardinality, k);

// See whether we have to perform exact search based on approx search results
// This is required if there are no native engine files or if approximate search returned
// results less than K, though we have more than k filtered docs
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,11 +63,11 @@ public Weight createWeight(IndexSearcher indexSearcher, ScoreMode scoreMode, flo
if (rescoreContext == null) {
perLeafResults = doSearch(indexSearcher, leafReaderContexts, knnWeight, finalK);
} else {
boolean isShardLevelRescoringEnabled = KNNSettings.isShardLevelRescoringEnabledForDiskBasedVector(knnQuery.getIndexName());
boolean isShardLevelRescoringDisabled = KNNSettings.isShardLevelRescoringDisabledForDiskBasedVector(knnQuery.getIndexName());
int dimension = knnQuery.getQueryVector().length;
int firstPassK = rescoreContext.getFirstPassK(finalK, isShardLevelRescoringEnabled, dimension);
int firstPassK = rescoreContext.getFirstPassK(finalK, isShardLevelRescoringDisabled, dimension);
perLeafResults = doSearch(indexSearcher, leafReaderContexts, knnWeight, firstPassK);
if (isShardLevelRescoringEnabled == true) {
if (isShardLevelRescoringDisabled == false) {
ResultUtil.reduceToTopK(perLeafResults, firstPassK);
}

Expand Down
Loading

0 comments on commit c4094b0

Please sign in to comment.