Merge pull request #14 from mhalle/boolean-expand

changed implementation of boolean operation to allow AND and expand:true
weixsong · Apr 14, 2016 · b20a10a · b20a10a
2 parents 6c3edd1 + 143294a
commit b20a10a
Showing 1 changed file with 83 additions and 33 deletions.
diff --git a/lib/index.js b/lib/index.js
@@ -360,6 +360,14 @@ elasticlunr.Index.prototype.search = function (query, userConfig) {
   return results;
 };
 
+/**
+ * search queryTokens in specified field.
+ *
+ * @param {Array} queryTokens The query tokens to query in this field.
+ * @param {String} fieldName Field to query in.
+ * @param {elasticlunr.Configuration} config The user query config, JSON format.
+ * @return {Object}
+ */
 /**
  * search queryTokens in specified field.
  *
@@ -371,19 +379,49 @@ elasticlunr.Index.prototype.search = function (query, userConfig) {
 elasticlunr.Index.prototype.fieldSearch = function (queryTokens, fieldName, config) {
   var booleanType = config[fieldName].bool;
   var expand = config[fieldName].expand;
-  var scores = {};
+  var scores = null;
   var docTokens = {};
 
   queryTokens.forEach(function (token) {
     var tokens = [token];
     if (expand == true) {
       tokens = this.index[fieldName].expandToken(token);
     }
-
+    // Consider every query token in turn. If expanded, each query token
+    // corresponds to a set of tokens, which is all tokens in the 
+    // index matching the pattern queryToken* .
+    // For the set of tokens corresponding to a query token, find and score
+    // all matching documents. Store those scores in queryTokenScores, 
+    // keyed by docRef.
+    // Then, depending on the value of booleanType, combine the scores
+    // for this query token with previous scores.  If booleanType is OR,
+    // then merge the scores by summing into the accumulated total, adding
+    // new document scores as required (effectively a union operator).
+    // If booleanType is AND, accumulate scores only if the document 
+    // has previously been scored by another query token (an intersection
+    // operation).
+    // Furthermore, since when booleanType is AND, additional 
+    // query tokens can't add new documents to the result set, use the
+    // current document set to limit the processing of each new query 
+    // token for efficiency (i.e., incremental intersection).
+
+    var queryTokenScores = {};
     tokens.forEach(function (key) {
       var docs = this.index[fieldName].getDocs(key);
       var idf = this.idf(key, fieldName);
-
+
+      if (scores && booleanType == 'AND') {
+          // special case, we can rule out documents that have
+          // already been filtered out because they weren't scored
+          // by previous query token passes.
+          var filteredDocs = {};
+          for (var docRef in scores) {
+              if (docRef in docs) {
+                  filteredDocs[docRef] = docs[docRef];
+              }
+          }
+          docs = filteredDocs;
+      }
       // only record appeared token for retrieved documents for the
       // original token, not for expaned token.
       // beause for doing coordNorm for a retrieved document, coordNorm only care how many
@@ -411,24 +449,58 @@ elasticlunr.Index.prototype.fieldSearch = function (queryTokens, fieldName, conf
 
         var score = tf * idf * fieldLengthNorm * penality;
 
-        if (docRef in scores) {
-          scores[docRef] += score;
+        if (docRef in queryTokenScores) {
+          queryTokenScores[docRef] += score;
         } else {
-          scores[docRef] = score;
+          queryTokenScores[docRef] = score;
         }
       }
     }, this);
+
+    scores = this.mergeScores(scores, queryTokenScores, booleanType);
   }, this);
 
-  if (booleanType == 'AND') {
-    scores = this.intersect(scores, docTokens, queryTokens.length);
-  }
-
   scores = this.coordNorm(scores, docTokens, queryTokens.length);
-
   return scores;
 };
 
+/**
+ * Merge the scores from one set of tokens into an accumulated score table.
+ * Exact operation depends on the op parameter. If op is 'AND', then only the
+ * intersection of the two score lists is retained. Otherwise, the union of
+ * the two score lists is returned. For internal use only.
+ *
+ * @param {Object} accumScores accumulated scores keyed by docRef. Should be null on first call.
+ * @param {Object} scores new scores, keyed by docRef, to merge into accumScores.
+ * @param {String} op merge operation (should be 'AND' or 'OR').
+ * @return {Object} merged scores: a new object for 'AND', otherwise accumScores updated in place (or scores itself when accumScores is falsy).
+ */
+
+elasticlunr.Index.prototype.mergeScores = function (accumScores, scores, op) {
+    if (!accumScores) { // first call: nothing has been accumulated yet
+        return scores; // the new scores become the accumulated table as-is
+    }
+    if (op == 'AND') {
+        var intersection = {}; // built fresh; accumScores itself is not modified
+        for (var docRef in scores) {
+            if (docRef in accumScores) { // NOTE(review): `in` also sees inherited keys (e.g. "constructor") - confirm docRefs cannot collide
+                intersection[docRef] = accumScores[docRef] + scores[docRef];
+            }
+        }
+        return intersection;
+    } else { // any op other than 'AND' is treated as a union
+        for (var docRef in scores) {
+            if (docRef in accumScores) {
+                accumScores[docRef] += scores[docRef];
+            } else {
+                accumScores[docRef] = scores[docRef];
+            }
+        }
+        return accumScores; // same object that was passed in, updated in place
+    }
+};
+
+
 /**
  * Record the occuring query token of retrieved doc specified by doc field.
  * Only for inner user.
@@ -448,28 +520,6 @@ elasticlunr.Index.prototype.fieldSearchStats = function (docTokens, token, docs)
   }
 };
 
-/**
- * find documents contain all the query tokens.
- * only for inner use.
- *
- * @param {Object} results first results
- * @param {Object} docs field search results of a token
- * @param {Integer} n query token number
- * @return {Object}
- */
-elasticlunr.Index.prototype.intersect = function (scores, docTokens, n) {
-  var res = {};
-
-  for (var doc in scores) {
-    if (!(doc in docTokens)) continue;
-    if (docTokens[doc].length == n) {
-      res[doc] = scores[doc];
-    }
-  }
-
-  return res;
-};
-
 /**
  * coord norm the score of a doc.
  * if a doc contain more query tokens, then the score will larger than the doc