Merge pull request #53 from repairearth/master

Add chinese support
MihaiValentin · Jun 17, 2021 · 1b55cc8 · 1b55cc8
2 parents a62fec9 + bc6b23f
commit 1b55cc8
Show file tree

Hide file tree

Showing 5 changed files with 207 additions and 2 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,4 @@
 node_modules
-.idea
+.idea
+.history
+.vscode
diff --git a/lunr.zh.js b/lunr.zh.js
@@ -0,0 +1,143 @@
+/*!
+ * Lunr languages, `Chinese` language
+ * https://github.com/MihaiValentin/lunr-languages
+ *
+ * Copyright 2019, Felix Lian (repairearth)
+ * http://www.mozilla.org/MPL/
+ */
+/*!
+ * based on
+ * Snowball JavaScript Library v0.3
+ * http://code.google.com/p/urim/
+ * http://snowball.tartarus.org/
+ *
+ * Copyright 2010, Oleg Mazko
+ * http://www.mozilla.org/MPL/
+ */
+
+/**
+ * export the module via AMD, CommonJS or as a browser global
+ * Export code from https://github.com/umdjs/umd/blob/master/returnExports.js
+ */
+;
+(function(root, factory) {
+  if (typeof define === 'function' && define.amd) {
+    // AMD. Register as an anonymous module.
+    define(factory)
+  } else if (typeof exports === 'object') {
+    /**
+     * Node. Does not work with strict CommonJS, but
+     * only CommonJS-like environments that support module.exports,
+     * like Node.
+     */
+    module.exports = factory(require('nodejieba'))
+  } else {
+    // Browser globals (root is window)
+    factory()(root.lunr);
+  }
+}(this, function(nodejieba) {
+  /**
+   * Just return a value to define the module export.
+   * This example returns an object, but the module
+   * can return a function as the exported value.
+   */
+  return function(lunr, nodejiebaDictJson) {
+    /* throw error if lunr is not yet included */
+    if ('undefined' === typeof lunr) {
+      throw new Error('Lunr is not present. Please include / require Lunr before this script.');
+    }
+
+    /* throw error if lunr stemmer support is not yet included */
+    if ('undefined' === typeof lunr.stemmerSupport) {
+      throw new Error('Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.');
+    }
+
+    /*
+    Chinese tokenization is trickier, since it does not
+    take into account spaces.
+    Since the tokenization function is represented different
+    internally for each of the Lunr versions, this had to be done
+    in order to try to pick the best way of doing this based
+    on the Lunr version
+     */
+    var isLunr2 = lunr.version[0] == "2";
+
+    /* register specific locale function */
+    lunr.zh = function() {
+      this.pipeline.reset();
+      this.pipeline.add(
+        lunr.zh.trimmer,
+        lunr.zh.stopWordFilter,
+        lunr.zh.stemmer
+      );
+
+      // change the tokenizer for Chinese one
+      if (isLunr2) { // for lunr version 2.0.0
+        this.tokenizer = lunr.zh.tokenizer;
+      } else {
+        if (lunr.tokenizer) { // for lunr version 0.6.0
+          lunr.tokenizer = lunr.zh.tokenizer;
+        }
+        if (this.tokenizerFn) { // for lunr version 0.7.0 -> 1.0.0
+          this.tokenizerFn = lunr.zh.tokenizer;
+        }
+      }
+    };
+
+    lunr.zh.tokenizer = function(obj) {
+      if (!arguments.length || obj == null || obj == undefined) return []
+      if (Array.isArray(obj)) return obj.map(function (t) { return isLunr2 ? new lunr.Token(t.toLowerCase()) : t.toLowerCase() })
+
+      nodejiebaDictJson && nodejieba.load(nodejiebaDictJson)
+
+      var str = obj.toString().trim().toLowerCase();
+      var tokens = [];
+
+      nodejieba.cut(str, true).forEach(function (seg) {
+        tokens = tokens.concat(seg.split(' '))
+      })
+
+      tokens = tokens.filter(function (token) {
+        return !!token;
+      });
+
+      var fromIndex = 0
+
+      return tokens.map(function (token, index) {
+        if (isLunr2) {
+          var start = str.indexOf(token, fromIndex)
+
+          var tokenMetadata = {}
+          tokenMetadata["position"] = [start, token.length]
+          tokenMetadata["index"] = index
+
+          fromIndex = start
+
+          return new lunr.Token(token, tokenMetadata);
+        } else {
+          return token
+        }
+      });
+    }
+
+    /* lunr trimmer function */
+    lunr.zh.wordCharacters = "\\w\u4e00-\u9fa5";
+    lunr.zh.trimmer = lunr.trimmerSupport.generateTrimmer(lunr.zh.wordCharacters);
+    lunr.Pipeline.registerFunction(lunr.zh.trimmer, 'trimmer-zh');
+
+    /* lunr stemmer function */
+    lunr.zh.stemmer = (function() {
+
+      /* TODO Chinese stemmer  */
+      return function(word) {
+        return word;
+      }
+    })();
+    lunr.Pipeline.registerFunction(lunr.zh.stemmer, 'stemmer-zh');
+
+    /* lunr stop word filter. see https://www.ranks.nl/stopwords/chinese-stopwords */
+    lunr.zh.stopWordFilter = lunr.generateStopWordFilter(
+      '的 一 不 在 人 有 是 为 以 于 上 他 而 后 之 来 及 了 因 下 可 到 由 这 与 也 此 但 并 个 其 已 无 小 我 们 起 最 再 今 去 好 只 又 或 很 亦 某 把 那 你 乃 它 吧 被 比 别 趁 当 从 到 得 打 凡 儿 尔 该 各 给 跟 和 何 还 即 几 既 看 据 距 靠 啦 了 另 么 每 们 嘛 拿 哪 那 您 凭 且 却 让 仍 啥 如 若 使 谁 虽 随 同 所 她 哇 嗡 往 哪 些 向 沿 哟 用 于 咱 则 怎 曾 至 致 着 诸 自'.split(' '));
+    lunr.Pipeline.registerFunction(lunr.zh.stopWordFilter, 'stopWordFilter-zh');
+  };
+}))
diff --git a/package.json b/package.json
@@ -18,5 +18,8 @@
     "mocha": "^6.2.0",
     "uglify-js": "^2.4.15",
     "unicode-8.0.0": "^0.1.5"
+  },
+  "dependencies": {
+    "nodejieba": "^2.3.0"
   }
 }
diff --git a/test/VersionsAndLanguagesTest.js b/test/VersionsAndLanguagesTest.js
@@ -39,7 +39,8 @@ var testDocuments = {
     sv: require('./testdata/sv'),
     tr: require('./testdata/tr'),
     th: require('./testdata/th'),
-    vi: require('./testdata/vi')
+    vi: require('./testdata/vi'),
+    zh: require('./testdata/zh')
 };
 
 lunrVersions.forEach(function(lunrVersion) {

diff --git a/test/testdata/zh.js b/test/testdata/zh.js
@@ -0,0 +1,56 @@
+module.exports = {
+    fields: [
+        {
+            name: 'title',
+            config: { boost: 10 }
+        }, {
+            name: 'body'
+        }
+    ],
+    documents: [
+        {
+            "title": "【新闻1+1】Mercedes Benz女车主维权 车主到底难在何处？",
+            "body": "你们说给我3天时间，我给了你3天，我给你们5个3天，15天，15天你不给我个方案，最后给我的方案是换发动机，15天最后给我这个方案，您觉得我接受得了吗？",
+            "id": 1
+        }, {
+            "title": "央视CCTV13新闻1+1正在说奔驰漏油事件",
+            "body": "我这个车没开出去这个门发动机漏油，你给我说讲三包，给我车主说免费换发动机，我跟你说要求你说可以退款可以换车，你又最后说换发动机，还给我说打12315，大哥你觉得合适吗？",
+            "id": 2
+        }, {
+            "title": "奔驰女车主接受专访，经销商咋就这么强势？",
+            "body": "我打电话给奔驰金融，我说我这天15天没开过，我可不可以暂停还款，人家说不可以，这是用你个人征信做的贷款，你贷款必须还，我车没开到，我凭什么还这个贷款。",
+            "id": 3
+        }
+    ],
+    tests: [
+        {
+            what: "find the word %w",
+            search: "车主",
+            found: 3
+        }, {
+            what: "find the word %w",
+            search: "Benz",
+            found: 1
+        }, {
+            what: "find the word %w",
+            search: "Mercedes Benz",
+            found: 1
+        }, {
+            what: "find the word %w",
+            search: "12315",
+            found: 1
+        }, {
+            what: "find the word %w",
+            search: "CCTV13",
+            found: 1
+        }, {
+            what: "never find a word that does not exist, like %w",
+            search: "美女",
+            found: 0
+        }, {
+            what: "never find a character like %w",
+            search: "，",
+            found: 0
+        }
+    ]
+}