diff --git a/.gitignore b/.gitignore
index 34977ee..4299959 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,4 @@
 node_modules
-.idea
\ No newline at end of file
+.idea
+.history
+.vscode
\ No newline at end of file
diff --git a/lunr.zh.js b/lunr.zh.js
new file mode 100644
index 0000000..a48a8a7
--- /dev/null
+++ b/lunr.zh.js
@@ -0,0 +1,166 @@
+/*!
+ * Lunr languages, `Chinese` language
+ * https://github.com/MihaiValentin/lunr-languages
+ *
+ * Copyright 2019, Felix Lian (repairearth)
+ * http://www.mozilla.org/MPL/
+ */
+/*!
+ * based on
+ * Snowball JavaScript Library v0.3
+ * http://code.google.com/p/urim/
+ * http://snowball.tartarus.org/
+ *
+ * Copyright 2010, Oleg Mazko
+ * http://www.mozilla.org/MPL/
+ */
+
+/**
+ * export the module via AMD, CommonJS or as a browser global
+ * Export code from https://github.com/umdjs/umd/blob/master/returnExports.js
+ */
+;
+(function(root, factory) {
+  if (typeof define === 'function' && define.amd) {
+    // AMD. Register as an anonymous module.
+    define(factory)
+  } else if (typeof exports === 'object') {
+    /**
+     * Node. Does not work with strict CommonJS, but
+     * only CommonJS-like environments that support module.exports,
+     * like Node.
+     */
+    module.exports = factory(require('nodejieba'))
+  } else {
+    // Browser globals (root is window).
+    // NOTE: the factory gets no segmenter here, so `nodejieba` is undefined
+    // and lunr.zh.tokenizer will throw when asked to segment non-array input.
+    factory()(root.lunr);
+  }
+}(this, function(nodejieba) {
+  /**
+   * Build the installer that registers Chinese support on a lunr instance.
+   *
+   * @param {Object} lunr the lunr library (stemmer support must be loaded)
+   * @param {Object} [nodejiebaDictJson] optional dictionary configuration
+   *   handed to `nodejieba.load` before the first segmentation
+   */
+  return function(lunr, nodejiebaDictJson) {
+    /* throw error if lunr is not yet included */
+    if ('undefined' === typeof lunr) {
+      throw new Error('Lunr is not present. Please include / require Lunr before this script.');
+    }
+
+    /* throw error if lunr stemmer support is not yet included */
+    if ('undefined' === typeof lunr.stemmerSupport) {
+      throw new Error('Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.');
+    }
+
+    /*
+    Chinese tokenization is trickier, since it does not
+    take into account spaces.
+    Since the tokenization function is represented differently
+    internally for each of the Lunr versions, this had to be done
+    in order to try to pick the best way of doing this based
+    on the Lunr version
+    */
+    var isLunr2 = lunr.version[0] === "2";
+
+    /* the custom dictionary only has to be loaded once per installer call */
+    var isDictLoaded = false;
+
+    /* register specific locale function */
+    lunr.zh = function() {
+      this.pipeline.reset();
+      this.pipeline.add(
+        lunr.zh.trimmer,
+        lunr.zh.stopWordFilter,
+        lunr.zh.stemmer
+      );
+
+      // change the tokenizer for Chinese one
+      if (isLunr2) { // for lunr version 2.0.0
+        this.tokenizer = lunr.zh.tokenizer;
+      } else {
+        if (lunr.tokenizer) { // for lunr version 0.6.0
+          lunr.tokenizer = lunr.zh.tokenizer;
+        }
+        if (this.tokenizerFn) { // for lunr version 0.7.0 -> 1.0.0
+          this.tokenizerFn = lunr.zh.tokenizer;
+        }
+      }
+    };
+
+    /**
+     * Split text into lower-cased tokens using nodejieba word segmentation.
+     *
+     * @param {*} obj text to tokenize, or an array of ready-made string tokens
+     * @returns {Array} lunr.Token objects for lunr 2.x (carrying `position`
+     *   and `index` metadata when segmented here), plain strings otherwise
+     */
+    lunr.zh.tokenizer = function(obj) {
+      if (!arguments.length || obj == null) return []
+      if (Array.isArray(obj)) return obj.map(function (t) { return isLunr2 ? new lunr.Token(t.toLowerCase()) : t.toLowerCase() })
+
+      // load the custom dictionary lazily, and only once instead of on every call
+      if (nodejiebaDictJson && !isDictLoaded) {
+        nodejieba.load(nodejiebaDictJson)
+        isDictLoaded = true
+      }
+
+      var str = obj.toString().trim().toLowerCase();
+      var tokens = [];
+
+      // segments returned by the segmenter are additionally split on spaces
+      nodejieba.cut(str, true).forEach(function (seg) {
+        tokens = tokens.concat(seg.split(' '))
+      })
+
+      tokens = tokens.filter(function (token) {
+        return !!token;
+      });
+
+      var fromIndex = 0
+
+      return tokens.map(function (token, index) {
+        if (isLunr2) {
+          var start = str.indexOf(token, fromIndex)
+
+          var tokenMetadata = {}
+          tokenMetadata["position"] = [start, token.length]
+          tokenMetadata["index"] = index
+
+          // continue searching after this token, otherwise a repeated or
+          // overlapping token would be matched at the previous position again
+          if (start !== -1) {
+            fromIndex = start + token.length
+          }
+
+          return new lunr.Token(token, tokenMetadata);
+        } else {
+          return token
+        }
+      });
+    }
+
+    /* lunr trimmer function */
+    lunr.zh.wordCharacters = "\\w\u4e00-\u9fa5";
+    lunr.zh.trimmer = lunr.trimmerSupport.generateTrimmer(lunr.zh.wordCharacters);
+    lunr.Pipeline.registerFunction(lunr.zh.trimmer, 'trimmer-zh');
+
+    /* lunr stemmer function */
+    lunr.zh.stemmer = (function() {
+
+      /* TODO Chinese stemmer */
+      return function(word) {
+        return word;
+      }
+    })();
+    lunr.Pipeline.registerFunction(lunr.zh.stemmer, 'stemmer-zh');
+
+    /* lunr stop word filter. see https://www.ranks.nl/stopwords/chinese-stopwords */
+    lunr.zh.stopWordFilter = lunr.generateStopWordFilter(
+      '的 一 不 在 人 有 是 为 以 于 上 他 而 后 之 来 及 了 因 下 可 到 由 这 与 也 此 但 并 个 其 已 无 小 我 们 起 最 再 今 去 好 只 又 或 很 亦 某 把 那 你 乃 它 吧 被 比 别 趁 当 从 到 得 打 凡 儿 尔 该 各 给 跟 和 何 还 即 几 既 看 据 距 靠 啦 了 另 么 每 们 嘛 拿 哪 那 您 凭 且 却 让 仍 啥 如 若 使 谁 虽 随 同 所 她 哇 嗡 往 哪 些 向 沿 哟 用 于 咱 则 怎 曾 至 致 着 诸 自'.split(' '));
+    lunr.Pipeline.registerFunction(lunr.zh.stopWordFilter, 'stopWordFilter-zh');
+  };
+}))
\ No newline at end of file
diff --git a/package.json b/package.json
index 8d2bd21..1a12722 100644
--- a/package.json
+++ b/package.json
@@ -18,5 +18,8 @@
     "mocha": "^6.2.0",
     "uglify-js": "^2.4.15",
     "unicode-8.0.0": "^0.1.5"
+  },
+  "dependencies": {
+    "nodejieba": "^2.3.0"
   }
 }
diff --git a/test/VersionsAndLanguagesTest.js b/test/VersionsAndLanguagesTest.js
index b178e77..f531960 100644
--- a/test/VersionsAndLanguagesTest.js
+++ b/test/VersionsAndLanguagesTest.js
@@ -39,7 +39,8 @@ var testDocuments = {
     sv: require('./testdata/sv'),
     tr: require('./testdata/tr'),
     th: require('./testdata/th'),
-    vi: require('./testdata/vi')
+    vi: require('./testdata/vi'),
+    zh: require('./testdata/zh')
 };
 
 lunrVersions.forEach(function(lunrVersion) {
diff --git a/test/testdata/zh.js b/test/testdata/zh.js
new file mode 100644
index 0000000..58fa49f
--- /dev/null
+++ b/test/testdata/zh.js
@@ -0,0 +1,56 @@
+module.exports = {
+  fields: [
+    {
+      name: 'title',
+      config: { boost: 10 }
+    }, {
+      name: 'body'
+    }
+  ],
+  documents: [
+    {
+      "title": "【新闻1+1】Mercedes Benz女车主维权 车主到底难在何处?",
+      "body": "你们说给我3天时间,我给了你3天,我给你们5个3天,15天,15天你不给我个方案,最后给我的方案是换发动机,15天最后给我这个方案,您觉得我接受得了吗?",
+      "id": 1
+    }, {
+      "title": "央视CCTV13新闻1+1正在说奔驰漏油事件",
+      "body": "我这个车没开出去这个门发动机漏油,你给我说讲三包,给我车主说免费换发动机,我跟你说要求你说可以退款可以换车,你又最后说换发动机,还给我说打12315,大哥你觉得合适吗?",
+      "id": 2
+    }, {
+      "title": "奔驰女车主接受专访,经销商咋就这么强势?",
+      "body": "我打电话给奔驰金融,我说我这天15天没开过,我可不可以暂停还款,人家说不可以,这是用你个人征信做的贷款,你贷款必须还,我车没开到,我凭什么还这个贷款。",
+      "id": 3
+    }
+  ],
+  tests: [
+    {
+      what: "find the word %w",
+      search: "车主",
+      found: 3
+    }, {
+      what: "find the word %w",
+      search: "Benz",
+      found: 1
+    }, {
+      what: "find the word %w",
+      search: "Mercedes Benz",
+      found: 1
+    }, {
+      what: "find the word %w",
+      search: "12315",
+      found: 1
+    }, {
+      what: "find the word %w",
+      search: "CCTV13",
+      found: 1
+    }, {
+      what: "never find a word that does not exist, like %w",
+      search: "美女",
+      found: 0
+    }, {
+      what: "never find a character like %w",
+      search: ",",
+      found: 0
+    }
+  ]
+}
\ No newline at end of file