Skip to content

Commit

Permalink
Merge branch 'main' into fields-parameter-support
Browse files Browse the repository at this point in the history
Signed-off-by: Ethan Emoto <[email protected]>
  • Loading branch information
e-emoto authored Jan 8, 2025
2 parents 28225e0 + 405e5e2 commit c4094b0
Show file tree
Hide file tree
Showing 33 changed files with 509 additions and 87 deletions.
2 changes: 1 addition & 1 deletion .github/CODEOWNERS
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
# This should match the owning team set up in https://github.com/orgs/opensearch-project/teams
* @heemin32 @navneet1v @VijayanB @vamshin @jmazanec15 @naveentatikonda @junqiu-lei @martin-gaievski @ryanbogan @luyuncheng @shatejas
* @heemin32 @navneet1v @VijayanB @vamshin @jmazanec15 @naveentatikonda @junqiu-lei @martin-gaievski @ryanbogan @luyuncheng @shatejas @0ctopus13prime
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,16 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
- Introduced a writing layer in native engines which relies on the writing interface to process IO. (#2241)[https://github.com/opensearch-project/k-NN/pull/2241]
- Allow method parameter override for training based indices (#2290)[https://github.com/opensearch-project/k-NN/pull/2290]
- Optimizes lucene query execution to prevent unnecessary rewrites (#2305)[https://github.com/opensearch-project/k-NN/pull/2305]
- Add check to directly use ANN Search when filters match all docs. (#2320)[https://github.com/opensearch-project/k-NN/pull/2320]
- Use one formula to calculate cosine similarity (#2357)[https://github.com/opensearch-project/k-NN/pull/2357]
### Bug Fixes
* Fixing the bug when a segment has no vector field present for disk based vector search (#2282)[https://github.com/opensearch-project/k-NN/pull/2282]
* Fixing the bug where search fails with "fields" parameter for an index with a knn_vector field (#2314)[https://github.com/opensearch-project/k-NN/pull/2314]
* Fix for NPE while merging segments after all the vector fields docs are deleted (#2365)[https://github.com/opensearch-project/k-NN/pull/2365]
* Allow validation for non knn index only after 2.17.0 (#2315)[https://github.com/opensearch-project/k-NN/pull/2315]
* Release query vector memory after execution (#2346)[https://github.com/opensearch-project/k-NN/pull/2346]
* Fix shard level rescoring disabled setting flag (#2352)[https://github.com/opensearch-project/k-NN/pull/2352]
* Fix filter rewrite logic which was resulting in getting inconsistent / incorrect results for cases where filter was getting rewritten for shards (#2359)[https://github.com/opensearch-project/k-NN/pull/2359]
### Infrastructure
* Updated C++ version in JNI from c++11 to c++17 [#2259](https://github.com/opensearch-project/k-NN/pull/2259)
* Upgrade bytebuddy and objenesis version to match OpenSearch core and, update github ci runner for macos [#2279](https://github.com/opensearch-project/k-NN/pull/2279)
Expand Down
1 change: 1 addition & 0 deletions MAINTAINERS.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ This document contains a list of maintainers in this repo. See [opensearch-proje

| Maintainer | GitHub ID | Affiliation |
|-------------------------|-------------------------------------------------------|-------------|
| Doo Yong Kim | [0ctopus13prime](https://github.com/0ctopus13prime) | Amazon |
| Heemin Kim | [heemin32](https://github.com/heemin32) | Amazon |
| Jack Mazanec | [jmazanec15](https://github.com/jmazanec15) | Amazon |
| Junqiu Lei | [junqiu-lei](https://github.com/junqiu-lei) | Amazon |
Expand Down
2 changes: 2 additions & 0 deletions jni/src/faiss_wrapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1180,6 +1180,7 @@ jobjectArray knn_jni::faiss_wrapper::RangeSearchWithFilter(knn_jni::JNIUtilInter
jniUtil->ReleaseLongArrayElements(env, filterIdsJ, filteredIdsArray, JNI_ABORT);
throw;
}
jniUtil->ReleaseLongArrayElements(env, filterIdsJ, filteredIdsArray, JNI_ABORT);
} else {
faiss::SearchParameters *searchParameters = nullptr;
faiss::SearchParametersHNSW hnswParams;
Expand All @@ -1202,6 +1203,7 @@ jobjectArray knn_jni::faiss_wrapper::RangeSearchWithFilter(knn_jni::JNIUtilInter
throw;
}
}
jniUtil->ReleaseFloatArrayElements(env, queryVectorJ, rawQueryVector, JNI_ABORT);

// lims is structured to support batched queries, it has a length of nq + 1 (where nq is the number of queries),
// lims[i] - lims[i-1] gives the number of results for the i-th query. With a single query we used in k-NN,
Expand Down
15 changes: 0 additions & 15 deletions src/main/java/org/opensearch/knn/common/KNNVectorUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,7 @@

import lombok.AccessLevel;
import lombok.NoArgsConstructor;
import org.opensearch.knn.index.vectorvalues.KNNVectorValues;

import java.io.IOException;
import java.util.List;
import java.util.Objects;

Expand Down Expand Up @@ -62,17 +60,4 @@ public static int[] intListToArray(final List<Integer> integerList) {
}
return intArray;
}

/**
* Iterates vector values once if it is not at start of the location,
* Intended to be done to make sure dimension and bytesPerVector are available
* @param vectorValues
* @throws IOException
*/
public static void iterateVectorValuesOnce(final KNNVectorValues<?> vectorValues) throws IOException {
if (vectorValues.docId() == -1) {
vectorValues.nextDoc();
vectorValues.getVector();
}
}
}
2 changes: 1 addition & 1 deletion src/main/java/org/opensearch/knn/index/KNNSettings.java
Original file line number Diff line number Diff line change
Expand Up @@ -577,7 +577,7 @@ public static Integer getFilteredExactSearchThreshold(final String indexName) {
.getAsInt(ADVANCED_FILTERED_EXACT_SEARCH_THRESHOLD, ADVANCED_FILTERED_EXACT_SEARCH_THRESHOLD_DEFAULT_VALUE);
}

public static boolean isShardLevelRescoringEnabledForDiskBasedVector(String indexName) {
public static boolean isShardLevelRescoringDisabledForDiskBasedVector(String indexName) {
return KNNSettings.state().clusterService.state()
.getMetadata()
.index(indexName)
Expand Down
14 changes: 13 additions & 1 deletion src/main/java/org/opensearch/knn/index/SpaceType.java
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,21 @@ public float scoreToDistanceTranslation(float score) {
}
},
COSINESIMIL("cosinesimil") {
/**
 * Cosine similarity has a range of [-1, 1], where -1 means the vectors are diametrically opposed and
 * 1 means they are identical in direction (perfectly similar). In Lucene, scores must be in the range
 * [0, Float.MAX_VALUE]. To map [-1, 1] into that range we use the formula adopted by Lucene, as seen in
 * https://github.com/apache/lucene/blob/0494c824e0ac8049b757582f60d085932a890800/lucene/core/src/java/org/apache/lucene/index/VectorSimilarityFunction.java#L73
 * We expect rawScore = 1 - cosine(x, y); if an underlying library returns a different range or a different
 * raw score, it should override this method to either provide a valid range or convert the raw score to the
 * 1 - cosine form and then call this method.
 *
 * @param rawScore score returned from the underlying library
 * @return Lucene-scaled score: max((2 - rawScore) / 2, 0)
 */
@Override
public float scoreTranslation(float rawScore) {
    // Diff artifact removed: the superseded `1 / (1 + rawScore)` formula is gone; only the
    // Lucene-aligned translation remains.
    return Math.max((2.0F - rawScore) / 2.0F, 0.0F);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
import static org.opensearch.knn.common.KNNConstants.MODEL_ID;
import static org.opensearch.knn.common.KNNVectorUtil.intListToArray;
import static org.opensearch.knn.common.KNNVectorUtil.iterateVectorValuesOnce;
import static org.opensearch.knn.index.codec.util.KNNCodecUtil.initializeVectorValues;
import static org.opensearch.knn.index.codec.transfer.OffHeapVectorTransferFactory.getVectorTransfer;

/**
Expand Down Expand Up @@ -52,7 +52,7 @@ public static DefaultIndexBuildStrategy getInstance() {
public void buildAndWriteIndex(final BuildIndexParams indexInfo) throws IOException {
final KNNVectorValues<?> knnVectorValues = indexInfo.getVectorValues();
// Needed to make sure we don't get 0 dimensions while initializing index
iterateVectorValuesOnce(knnVectorValues);
initializeVectorValues(knnVectorValues);
IndexBuildSetup indexBuildSetup = QuantizationIndexUtils.prepareIndexBuild(knnVectorValues, indexInfo);

try (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@

import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
import static org.opensearch.knn.common.KNNVectorUtil.intListToArray;
import static org.opensearch.knn.common.KNNVectorUtil.iterateVectorValuesOnce;
import static org.opensearch.knn.index.codec.util.KNNCodecUtil.initializeVectorValues;
import static org.opensearch.knn.index.codec.transfer.OffHeapVectorTransferFactory.getVectorTransfer;

/**
Expand Down Expand Up @@ -53,7 +53,7 @@ public static MemOptimizedNativeIndexBuildStrategy getInstance() {
public void buildAndWriteIndex(final BuildIndexParams indexInfo) throws IOException {
final KNNVectorValues<?> knnVectorValues = indexInfo.getVectorValues();
// Needed to make sure we don't get 0 dimensions while initializing index
iterateVectorValuesOnce(knnVectorValues);
initializeVectorValues(knnVectorValues);
KNNEngine engine = indexInfo.getKnnEngine();
Map<String, Object> indexParameters = indexInfo.getParameters();
IndexBuildSetup indexBuildSetup = QuantizationIndexUtils.prepareIndexBuild(knnVectorValues, indexInfo);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
import static org.opensearch.knn.common.FieldInfoExtractor.extractVectorDataType;
import static org.opensearch.knn.common.KNNConstants.MODEL_ID;
import static org.opensearch.knn.common.KNNConstants.PARAMETERS;
import static org.opensearch.knn.common.KNNVectorUtil.iterateVectorValuesOnce;
import static org.opensearch.knn.index.codec.util.KNNCodecUtil.initializeVectorValues;
import static org.opensearch.knn.index.codec.util.KNNCodecUtil.buildEngineFileName;
import static org.opensearch.knn.index.engine.faiss.Faiss.FAISS_BINARY_INDEX_DESCRIPTION_PREFIX;

Expand Down Expand Up @@ -100,7 +100,7 @@ public static NativeIndexWriter getWriter(
* @throws IOException
*/
public void flushIndex(final KNNVectorValues<?> knnVectorValues, int totalLiveDocs) throws IOException {
iterateVectorValuesOnce(knnVectorValues);
initializeVectorValues(knnVectorValues);
buildAndWriteIndex(knnVectorValues, totalLiveDocs);
recordRefreshStats();
}
Expand All @@ -111,7 +111,7 @@ public void flushIndex(final KNNVectorValues<?> knnVectorValues, int totalLiveDo
* @throws IOException
*/
public void mergeIndex(final KNNVectorValues<?> knnVectorValues, int totalLiveDocs) throws IOException {
iterateVectorValuesOnce(knnVectorValues);
initializeVectorValues(knnVectorValues);
if (knnVectorValues.docId() == NO_MORE_DOCS) {
// This is in place so we do not add metrics
log.debug("Skipping mergeIndex, vector values are already iterated for {}", fieldInfo.name);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,15 @@
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.search.DocIdSetIterator;
import org.opensearch.knn.common.FieldInfoExtractor;
import org.opensearch.knn.common.KNNConstants;
import org.opensearch.knn.index.VectorDataType;
import org.opensearch.knn.index.codec.KNN80Codec.KNN80BinaryDocValues;
import org.opensearch.knn.index.engine.KNNEngine;
import org.opensearch.knn.index.vectorvalues.KNNVectorValues;

import java.io.IOException;
import java.util.Comparator;
import java.util.List;
import java.util.stream.Collectors;
Expand Down Expand Up @@ -116,6 +119,31 @@ public static String getNativeEngineFileFromFieldInfo(FieldInfo field, SegmentIn
}
}

/**
 * Advances the given vector values to the first vector document if the iterator has not been
 * positioned yet. Doing so makes properties such as the dimension and bytes-per-vector available
 * on the {@link KNNVectorValues} instance.
 * <p>
 * When the values contain no vector documents at all, the iterator ends up positioned at
 * {@link DocIdSetIterator#NO_MORE_DOCS} and no vector is fetched.
 *
 * @param vectorValues {@link KNNVectorValues} to initialize
 * @throws IOException if advancing or reading the vector values fails
 */
public static void initializeVectorValues(final KNNVectorValues<?> vectorValues) throws IOException {
    // docId == -1 signals that nextDoc() has never been called on this iterator.
    final boolean notYetPositioned = vectorValues.docId() == -1;
    if (notYetPositioned) {
        vectorValues.nextDoc();
        // Only pull a vector when a document actually exists; otherwise leave the
        // iterator exhausted at NO_MORE_DOCS.
        if (vectorValues.docId() != DocIdSetIterator.NO_MORE_DOCS) {
            vectorValues.getVector();
        }
    }
}

/**
* Get KNNEngine From FieldInfo
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

package org.opensearch.knn.index.mapper;

import org.opensearch.Version;
import org.opensearch.knn.index.engine.KNNMethodContext;
import org.opensearch.knn.index.engine.qframe.QuantizationConfig;

Expand Down Expand Up @@ -62,4 +63,12 @@ default QuantizationConfig getQuantizationConfig() {
* @return the dimension of the index; for model based indices, it will be null
*/
int getDimension();

/**
 * Returns the OpenSearch {@link Version} with which the index was created.
 * Implementations backed by index metadata should override this; the default
 * falls back to the version of the current node.
 *
 * @return index-created Version; defaults to {@link Version#CURRENT}
 */
default Version getIndexCreatedVersion() {
    return Version.CURRENT;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.KnnByteVectorField;
import org.apache.lucene.document.KnnFloatVectorField;
import org.opensearch.Version;
import org.opensearch.common.Explicit;
import org.opensearch.knn.index.KNNVectorSimilarityFunction;
import org.opensearch.knn.index.VectorDataType;
Expand Down Expand Up @@ -73,6 +74,11 @@ public Mode getMode() {
public CompressionLevel getCompressionLevel() {
return knnMethodConfigContext.getCompressionLevel();
}

@Override
public Version getIndexCreatedVersion() {
    // Sourced from the method config context captured at mapper construction time.
    return knnMethodConfigContext.getVersionCreated();
}
}
);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.VectorEncoding;
import org.opensearch.Version;
import org.opensearch.common.Explicit;
import org.opensearch.common.xcontent.XContentFactory;
import org.opensearch.knn.index.SpaceType;
Expand Down Expand Up @@ -86,6 +87,11 @@ public CompressionLevel getCompressionLevel() {
public QuantizationConfig getQuantizationConfig() {
return quantizationConfig;
}

@Override
public Version getIndexCreatedVersion() {
    // Sourced from the method config context captured at mapper construction time.
    return knnMethodConfigContext.getVersionCreated();
}
}
);
return new MethodFieldMapper(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,11 @@ public QuantizationConfig getQuantizationConfig() {
return quantizationConfig;
}

@Override
public Version getIndexCreatedVersion() {
    // Version recorded when the (model-based) index was created.
    return indexCreatedVersion;
}

// ModelMetadata relies on cluster state which may not be available during field mapper creation. Thus,
// we lazily initialize it.
private void initFromModelMetadata() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,10 @@ public enum FilterIdsSelectorType {
public static FilterIdsSelector getFilterIdSelector(final BitSet filterIdsBitSet, final int cardinality) throws IOException {
long[] filterIds;
FilterIdsSelector.FilterIdsSelectorType filterType;
if (filterIdsBitSet instanceof FixedBitSet) {
if (filterIdsBitSet == null) {
filterIds = null;
filterType = FilterIdsSelector.FilterIdsSelectorType.BITMAP;
} else if (filterIdsBitSet instanceof FixedBitSet) {
/**
* When filterIds is dense filter, using fixed bitset
*/
Expand Down
19 changes: 17 additions & 2 deletions src/main/java/org/opensearch/knn/index/query/KNNQueryBuilder.java
Original file line number Diff line number Diff line change
Expand Up @@ -662,9 +662,24 @@ public String getWriteableName() {

@Override
protected QueryBuilder doRewrite(QueryRewriteContext queryShardContext) throws IOException {
    // Rewrite the filter query, if one is present, to avoid runtime errors in later
    // steps of the query phase. When the rewrite produces a new query instance, return
    // a fresh KNNQueryBuilder carrying the rewritten filter; otherwise defer to the
    // default rewrite behavior.
    if (Objects.nonNull(filter)) {
        final QueryBuilder rewrittenFilter = filter.rewrite(queryShardContext);
        if (rewrittenFilter != filter) {
            return KNNQueryBuilder.builder()
                .fieldName(fieldName)
                .vector(vector)
                .k(k)
                .maxDistance(maxDistance)
                .minScore(minScore)
                .methodParameters(methodParameters)
                .filter(rewrittenFilter)
                .ignoreUnmapped(ignoreUnmapped)
                .rescoreContext(rescoreContext)
                .expandNested(expandNested)
                .build();
        }
    }
    return super.doRewrite(queryShardContext);
}
Expand Down
10 changes: 9 additions & 1 deletion src/main/java/org/opensearch/knn/index/query/KNNWeight.java
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ public Scorer scorer(LeafReaderContext context) throws IOException {
*/
public PerLeafResult searchLeaf(LeafReaderContext context, int k) throws IOException {
final BitSet filterBitSet = getFilteredDocsBitSet(context);
final int maxDoc = context.reader().maxDoc();
int cardinality = filterBitSet.cardinality();
// We don't need to go to JNI layer if no documents are found which satisfy the filters
// We should give this condition a deeper look that where it should be placed. For now I feel this is a good
Expand All @@ -145,7 +146,14 @@ public PerLeafResult searchLeaf(LeafReaderContext context, int k) throws IOExcep
Map<Integer, Float> result = doExactSearch(context, new BitSetIterator(filterBitSet, cardinality), cardinality, k);
return new PerLeafResult(filterWeight == null ? null : filterBitSet, result);
}
Map<Integer, Float> docIdsToScoreMap = doANNSearch(context, filterBitSet, cardinality, k);

/*
* If filters match all docs in this segment, then null should be passed as filterBitSet
* so that it will not do a bitset look up in bottom search layer.
*/
final BitSet annFilter = (filterWeight != null && cardinality == maxDoc) ? null : filterBitSet;
final Map<Integer, Float> docIdsToScoreMap = doANNSearch(context, annFilter, cardinality, k);

// See whether we have to perform exact search based on approx search results
// This is required if there are no native engine files or if approximate search returned
// results less than K, though we have more than k filtered docs
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,11 +63,11 @@ public Weight createWeight(IndexSearcher indexSearcher, ScoreMode scoreMode, flo
if (rescoreContext == null) {
perLeafResults = doSearch(indexSearcher, leafReaderContexts, knnWeight, finalK);
} else {
boolean isShardLevelRescoringEnabled = KNNSettings.isShardLevelRescoringEnabledForDiskBasedVector(knnQuery.getIndexName());
boolean isShardLevelRescoringDisabled = KNNSettings.isShardLevelRescoringDisabledForDiskBasedVector(knnQuery.getIndexName());
int dimension = knnQuery.getQueryVector().length;
int firstPassK = rescoreContext.getFirstPassK(finalK, isShardLevelRescoringEnabled, dimension);
int firstPassK = rescoreContext.getFirstPassK(finalK, isShardLevelRescoringDisabled, dimension);
perLeafResults = doSearch(indexSearcher, leafReaderContexts, knnWeight, firstPassK);
if (isShardLevelRescoringEnabled == true) {
if (isShardLevelRescoringDisabled == false) {
ResultUtil.reduceToTopK(perLeafResults, firstPassK);
}

Expand Down
Loading

0 comments on commit c4094b0

Please sign in to comment.