diff --git a/lib/index.js b/lib/index.js index 7892b4cd..83c99312 100644 --- a/lib/index.js +++ b/lib/index.js @@ -360,6 +360,14 @@ elasticlunr.Index.prototype.search = function (query, userConfig) { return results; }; +/** + * search queryTokens in specified field. + * + * @param {Array} queryTokens The query tokens to query in this field. + * @param {String} fieldName Field to query in. + * @param {elasticlunr.Configuration} config The user query config, JSON format. + * @return {Object} + */ /** * search queryTokens in specified field. * @@ -371,7 +379,7 @@ elasticlunr.Index.prototype.search = function (query, userConfig) { elasticlunr.Index.prototype.fieldSearch = function (queryTokens, fieldName, config) { var booleanType = config[fieldName].bool; var expand = config[fieldName].expand; - var scores = {}; + var scores = null; var docTokens = {}; queryTokens.forEach(function (token) { @@ -379,11 +387,41 @@ elasticlunr.Index.prototype.fieldSearch = function (queryTokens, fieldName, conf if (expand == true) { tokens = this.index[fieldName].expandToken(token); } - + // Consider every query token in turn. If expanded, each query token + // corresponds to a set of tokens, which is all tokens in the + // index matching the pattern queryToken* . + // For the set of tokens corresponding to a query token, find and score + // all matching documents. Store those scores in queryTokenScores, + // keyed by docRef. + // Then, depending on the value of booleanType, combine the scores + // for this query token with previous scores. If booleanType is OR, + // then merge the scores by summing into the accumulated total, adding + // new document scores as required (effectively a union operator). + // If booleanType is AND, accumulate scores only if the document + // has previously been scored by another query token (an intersection + // operation). 
+ // Furthermore, when booleanType is AND, additional + // query tokens can't add new documents to the result set, so use the + // current document set to limit the processing of each new query + // token for efficiency (i.e., incremental intersection). + + var queryTokenScores = {}; tokens.forEach(function (key) { var docs = this.index[fieldName].getDocs(key); var idf = this.idf(key, fieldName); - + + if (scores && booleanType == 'AND') { + // special case, we can rule out documents that have + // already been filtered out because they weren't scored + // by previous query token passes. + var filteredDocs = {}; + for (var docRef in scores) { + if (docRef in docs) { + filteredDocs[docRef] = docs[docRef]; + } + } + docs = filteredDocs; + } // only record appeared token for retrieved documents for the // original token, not for expaned token. // beause for doing coordNorm for a retrieved document, coordNorm only care how many @@ -411,24 +449,58 @@ elasticlunr.Index.prototype.fieldSearch = function (queryTokens, fieldName, conf var score = tf * idf * fieldLengthNorm * penality; - if (docRef in scores) { - scores[docRef] += score; + if (docRef in queryTokenScores) { + queryTokenScores[docRef] += score; } else { - scores[docRef] = score; + queryTokenScores[docRef] = score; } } }, this); + + scores = this.mergeScores(scores, queryTokenScores, booleanType); }, this); - if (booleanType == 'AND') { - scores = this.intersect(scores, docTokens, queryTokens.length); - } - scores = this.coordNorm(scores, docTokens, queryTokens.length); - return scores; }; +/** + * Merge the scores from one set of tokens into an accumulated score table. + * Exact operation depends on the op parameter. If op is 'AND', then only the + * intersection of the two score lists is retained. Otherwise, the union of + * the two score lists is returned. For internal use only. + * + * @param {Object} accumScores accumulated scores. Should be null on first call. 
+ * @param {Object} scores new scores to merge into accumScores. + * @param {String} op merge operation (should be 'AND' or 'OR'). + * @return {Object} merged scores keyed by docRef. + */ + +elasticlunr.Index.prototype.mergeScores = function (accumScores, scores, op) { + if (!accumScores) { + return scores; + } + if (op == 'AND') { + var intersection = {}; + for (var docRef in scores) { + if (docRef in accumScores) { + intersection[docRef] = accumScores[docRef] + scores[docRef]; + } + } + return intersection; + } else { + for (var docRef in scores) { + if (docRef in accumScores) { + accumScores[docRef] += scores[docRef]; + } else { + accumScores[docRef] = scores[docRef]; + } + } + return accumScores; + } +}; + + /** * Record the occuring query token of retrieved doc specified by doc field. * Only for inner user. @@ -448,28 +520,6 @@ elasticlunr.Index.prototype.fieldSearchStats = function (docTokens, token, docs) } }; -/** - * find documents contain all the query tokens. - * only for inner use. - * - * @param {Object} results first results - * @param {Object} docs field search results of a token - * @param {Integer} n query token number - * @return {Object} - */ -elasticlunr.Index.prototype.intersect = function (scores, docTokens, n) { - var res = {}; - - for (var doc in scores) { - if (!(doc in docTokens)) continue; - if (docTokens[doc].length == n) { - res[doc] = scores[doc]; - } - } - - return res; -}; - /** * coord norm the score of a doc. * if a doc contain more query tokens, then the score will larger than the doc