diff --git a/README.md b/README.md
index 5a0cf71..1ccebc6 100644
--- a/README.md
+++ b/README.md
@@ -1,37 +1,37 @@
-Lunr Languages [![npm](https://img.shields.io/npm/v/lunr-languages.svg)](https://www.npmjs.com/package/lunr-languages) [![Bower](https://img.shields.io/bower/v/lunr-languages.svg)]() [![Join the chat at https://gitter.im/lunr-languages/Lobby](https://badges.gitter.im/lunr-languages/Lobby.svg)](https://gitter.im/lunr-languages/Lobby?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) [![](https://img.shields.io/badge/compatible%20with%20Lunr-0.6.0%20--%3E%202.x-green.svg)](http://lunrjs.com/) [![CircleCI branch](https://img.shields.io/circleci/project/github/MihaiValentin/lunr-languages.svg)](https://circleci.com/gh/MihaiValentin/lunr-languages)
-==============
+# Lunr Languages [![npm](https://img.shields.io/npm/v/lunr-languages.svg)](https://www.npmjs.com/package/lunr-languages) [![Bower](https://img.shields.io/bower/v/lunr-languages.svg)]() [![Join the chat at https://gitter.im/lunr-languages/Lobby](https://badges.gitter.im/lunr-languages/Lobby.svg)](https://gitter.im/lunr-languages/Lobby?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) [![](https://img.shields.io/badge/compatible%20with%20Lunr-0.6.0%20--%3E%202.x-green.svg)](http://lunrjs.com/) [![CircleCI branch](https://img.shields.io/circleci/project/github/MihaiValentin/lunr-languages.svg)](https://circleci.com/gh/MihaiValentin/lunr-languages)
Lunr Languages is a [Lunr](http://lunrjs.com/) addon that helps you search in documents written in the following languages:
-* ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/DE.png) German
-* ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/FR.png) French
-* ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/ES.png) Spanish
-* ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/IT.png) Italian
-* ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/NL.png) Dutch
-* ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/DK.png) Danish
-* ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/PT.png) Portuguese
-* ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/FI.png) Finnish
-* ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/RO.png) Romanian
-* ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/HU.png) Hungarian
-* ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/RU.png) Russian
-* ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/NO.png) Norwegian
-* ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/SE.png) Swedish
-* ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/TR.png) Turkish
-* ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/JP.png) Japanese
-* ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/TH.png) Thai
-* ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/IQ.png) Arabic
-* ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/CN.png) Chinese
-* ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/VN.png) Vietnamese
-* ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/IN.png) Sankrit
-* ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/IN.png) Kannada
-* ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/IN.png) Telugu
-* ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/IN.png) Hindi
-* ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/IN.png) Tamil
-* ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/KR.png) Korean
-* ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/AM.png) Armenian
-* ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/IL.png) Hebrew
-* ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/GR.png) Greek
-* [Contribute with a new language](CONTRIBUTING.md)
+- ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/DE.png) German
+- ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/FR.png) French
+- ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/ES.png) Spanish
+- ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/IT.png) Italian
+- ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/NL.png) Dutch
+- ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/DK.png) Danish
+- ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/PT.png) Portuguese
+- ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/FI.png) Finnish
+- ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/RO.png) Romanian
+- ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/HU.png) Hungarian
+- ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/RU.png) Russian
+- ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/NO.png) Norwegian
+- ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/SE.png) Swedish
+- ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/TR.png) Turkish
+- ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/JP.png) Japanese
+- ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/TH.png) Thai
+- ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/IQ.png) Arabic
+- ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/CN.png) Chinese
+- ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/VN.png) Vietnamese
+- ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/IN.png) Sanskrit
+- ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/IN.png) Kannada
+- ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/IN.png) Telugu
+- ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/IN.png) Hindi
+- ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/IN.png) Tamil
+- ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/KR.png) Korean
+- ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/AM.png) Armenian
+- ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/IL.png) Hebrew
+- ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/GR.png) Greek
+- ![](https://raw.githubusercontent.com/madebybowtie/FlagKit/master/Assets/PNG/PL.png) Polish
+- [Contribute with a new language](CONTRIBUTING.md)
Lunr Languages is compatible with Lunr version `0.6`, `0.7`, `1.0` and `2.X`.
@@ -46,9 +46,11 @@ The following example is for the German language (de).
Add the following JS files to the page:
```html
-
+
+
-
+
+
```
then, use the language in when initializing lunr:
@@ -58,8 +60,8 @@ var idx = lunr(function () {
// use the language (de)
this.use(lunr.de);
// then, the normal lunr index initialization
- this.field('title', { boost: 10 });
- this.field('body');
+ this.field("title", { boost: 10 });
+ this.field("body");
// now you can call this.add(...) to add documents written in German
});
```
@@ -77,7 +79,11 @@ Add `require.js` to the page:
then, use the language in when initializing lunr:
```javascript
-require(['lib/lunr.js', '../lunr.stemmer.support.js', '../lunr.de.js'], function(lunr, stemmerSupport, de) {
+require([
+ "lib/lunr.js",
+ "../lunr.stemmer.support.js",
+ "../lunr.de.js",
+], function (lunr, stemmerSupport, de) {
// since the stemmerSupport and de add keys on the lunr object, we'll pass it as reference to them
// in the end, we will only need lunr.
stemmerSupport(lunr); // adds lunr.stemmerSupport
@@ -85,12 +91,12 @@ require(['lib/lunr.js', '../lunr.stemmer.support.js', '../lunr.de.js'], function
// at this point, lunr can be used
var idx = lunr(function () {
- // use the language (de)
- this.use(lunr.de);
- // then, the normal lunr index initialization
- this.field('title', { boost: 10 })
- this.field('body')
- // now you can call this.add(...) to add documents written in German
+ // use the language (de)
+ this.use(lunr.de);
+ // then, the normal lunr index initialization
+ this.field("title", { boost: 10 });
+ this.field("body");
+ // now you can call this.add(...) to add documents written in German
});
});
```
@@ -98,16 +104,16 @@ require(['lib/lunr.js', '../lunr.stemmer.support.js', '../lunr.de.js'], function
# With node.js
```javascript
-var lunr = require('./lib/lunr.js');
-require('./lunr.stemmer.support.js')(lunr);
-require('./lunr.de.js')(lunr); // or any other language you want
+var lunr = require("./lib/lunr.js");
+require("./lunr.stemmer.support.js")(lunr);
+require("./lunr.de.js")(lunr); // or any other language you want
var idx = lunr(function () {
// use the language (de)
this.use(lunr.de);
// then, the normal lunr index initialization
- this.field('title', { boost: 10 })
- this.field('body')
+ this.field("title", { boost: 10 });
+ this.field("body");
// now you can call this.add(...) to add documents written in German
});
```
@@ -117,14 +123,14 @@ var idx = lunr(function () {
If your documents are written in more than one language, you can enable multi-language indexing. This ensures every word is properly trimmed and stemmed, every stopword is removed, and no words are lost (indexing in just one language would remove words from every other one.)
```javascript
-var lunr = require('./lib/lunr.js');
-require('./lunr.stemmer.support.js')(lunr);
-require('./lunr.ru.js')(lunr);
-require('./lunr.multi.js')(lunr);
+var lunr = require("./lib/lunr.js");
+require("./lunr.stemmer.support.js")(lunr);
+require("./lunr.ru.js")(lunr);
+require("./lunr.multi.js")(lunr);
var idx = lunr(function () {
// the reason "en" does not appear above is that "en" is built in into lunr js
- this.use(lunr.multiLanguage('en', 'ru'));
+ this.use(lunr.multiLanguage("en", "ru"));
// then, the normal lunr index initialization
// ...
});
@@ -135,7 +141,7 @@ You can combine any number of supported languages this way. The corresponding lu
If you serialize the index and load it in another script, you'll have to initialize the multi-language support in that script, too, like this:
```javascript
-lunr.multiLanguage('en', 'ru');
+lunr.multiLanguage("en", "ru");
var idx = lunr.Index.load(serializedIndex);
```
@@ -146,27 +152,28 @@ Check the [Contributing](CONTRIBUTING.md) section
# How does Lunr Languages work?
Searching inside documents is not as straight forward as using `indexOf()`, since there are many things to consider in order to get quality search results:
-* **Tokenization**
- * Given a string like *"Hope you like using Lunr Languages!"*, the tokenizer would split it into individual words, becoming an array like `['Hope', 'you', 'like', 'using', 'Lunr', 'Languages!']`
- * Though it seems a trivial task for Latin characters (just splitting by the space), it gets more complicated for languages like Japanese. Lunr Languages has this included for the Japanese language.
-* **Trimming**
- * After tokenization, trimming ensures that the words contain *just* what is needed in them. In our example above, the trimmer would convert `Languages!` into `Languages`
- * So, the trimmer basically removes special characters that do not add value for the search purpose.
-* **Stemming**
- * What happens if our text contains the word `consignment` but we want to search for `consigned`? It should find it, since its meaning is the same, only the form is different.
- * A stemmer extracts the root of words that can have many forms and stores it in the index. Then, any search is also stemmed and searched in the index.
- * Lunr Languages does stemming for all the included languages, so you can capture all the forms of words in your documents.
-* **Stop words**
- * There's no point in adding or searching words like `the`, `it`, `so`, etc. These words are called *Stop words*
- * Stop words are removed so your index will only contain meaningful words.
- * Lunr Languages includes stop words for all the included languages.
+
+- **Tokenization**
+ - Given a string like _"Hope you like using Lunr Languages!"_, the tokenizer would split it into individual words, becoming an array like `['Hope', 'you', 'like', 'using', 'Lunr', 'Languages!']`
+ - Though it seems a trivial task for Latin characters (just splitting by the space), it gets more complicated for languages like Japanese. Lunr Languages has this included for the Japanese language.
+- **Trimming**
+ - After tokenization, trimming ensures that the words contain _just_ what is needed in them. In our example above, the trimmer would convert `Languages!` into `Languages`
+ - So, the trimmer basically removes special characters that do not add value for the search purpose.
+- **Stemming**
+ - What happens if our text contains the word `consignment` but we want to search for `consigned`? It should find it, since its meaning is the same, only the form is different.
+ - A stemmer extracts the root of words that can have many forms and stores it in the index. Then, any search is also stemmed and searched in the index.
+ - Lunr Languages does stemming for all the included languages, so you can capture all the forms of words in your documents.
+- **Stop words**
+ - There's no point in adding or searching words like `the`, `it`, `so`, etc. These words are called _Stop words_
+ - Stop words are removed so your index will only contain meaningful words.
+ - Lunr Languages includes stop words for all the included languages.
# Technical details & Credits
I've created this project by compiling and wrapping stemmers toghether with stop words from various sources ([including users contributions](https://github.com/MihaiValentin/lunr-languages/pulls?q=is%3Apr)) so they can be directly used with all the current versions of Lunr.
-* (the stemmers for all languages, ported from snowball-js)
-* (the stop words list for the other languages)
-* (the tinyseg Tiny Segmente Japanese tokenizer)
+- (the stemmers for all languages, ported from snowball-js)
+- (the stop words list for the other languages)
+- (the tinyseg Tiny Segmenter Japanese tokenizer)
I am providing code in the repository to you under an [open source license](LICENSE). Because this is my personal repository, the license you receive to my code is from me and not my employer (Facebook)
diff --git a/lunr.pl.js b/lunr.pl.js
new file mode 100644
index 0000000..e00f207
--- /dev/null
+++ b/lunr.pl.js
@@ -0,0 +1,119 @@
+/*!
+ * Lunr languages, `Polish` language
+ * https://github.com/turbobit/lunr-languages
+ *
+ * Copyright 2023, Piotr Piechowicz
+ * http://www.mozilla.org/MPL/
+ */
+/*!
+ * based on
+ * Snowball JavaScript Library v0.3
+ * http://code.google.com/p/urim/
+ * http://snowball.tartarus.org/
+ *
+ * Copyright 2010, Oleg Mazko
+ * http://www.mozilla.org/MPL/
+ */
+
+/**
+ * export the module via AMD, CommonJS or as a browser global
+ * Export code from https://github.com/umdjs/umd/blob/master/returnExports.js
+ */
+(function (root, factory) {
+  if (typeof define === "function" && define.amd) {
+    // AMD. Register as an anonymous module.
+    define(factory);
+  } else if (typeof exports === "object") {
+    /**
+     * Node. Does not work with strict CommonJS, but
+     * only CommonJS-like environments that support module.exports,
+     * like Node.
+     */
+    module.exports = factory();
+  } else {
+    // Browser globals (root is window)
+    factory()(root.lunr);
+  }
+})(this, function () {
+  /**
+   * The module export is an installer function: call it with the lunr object
+   * to register the Polish pipeline functions under `lunr.pl`.
+   *
+   * Throws an Error when `lunr` or `lunr.stemmerSupport` is not available.
+   */
+  return function (lunr) {
+    /* throw error if lunr is not yet included */
+    if ("undefined" === typeof lunr) {
+      throw new Error(
+        "Lunr is not present. Please include / require Lunr before this script."
+      );
+    }
+
+    /* throw error if lunr stemmer support is not yet included */
+    if ("undefined" === typeof lunr.stemmerSupport) {
+      throw new Error(
+        "Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script."
+      );
+    }
+
+    /* register specific locale function */
+    lunr.pl = function () {
+      this.pipeline.reset();
+      this.pipeline.add(
+        lunr.pl.trimmer,
+        lunr.pl.stopWordFilter,
+        lunr.pl.stemmer
+      );
+
+      // for lunr version 2: queries run through a separate search pipeline
+      // (by default holding the English stemmer), so it must be replaced too;
+      // otherwise a query such as "nowy" is stemmed to "nowi" and no longer
+      // matches the unstemmed indexed term. lunr <= 1 has no searchPipeline.
+      if (this.searchPipeline) {
+        this.searchPipeline.reset();
+        this.searchPipeline.add(lunr.pl.stemmer);
+      }
+    };
+
+    /* lunr trimmer function */
+    // http://www.unicode.org/charts/
+    lunr.pl.wordCharacters =
+      "[" +
+      "A-Za-z" +
+      "\u00D3\u00F3" + // O with acute: the one Polish letter outside Latin Extended-A
+      "\u0100-\u017F" + // Latin Extended-A
+      "]";
+
+    lunr.pl.trimmer = lunr.trimmerSupport.generateTrimmer(
+      lunr.pl.wordCharacters
+    );
+
+    lunr.Pipeline.registerFunction(lunr.pl.trimmer, "trimmer-pl");
+
+    /* lunr stop word filter */
+    // https://www.ranks.nl/stopwords/polish
+    lunr.pl.stopWordFilter = lunr.generateStopWordFilter(
+      "ach aj albo bardzo bez bo być ci cię ciebie co czy daleko dla dlaczego dlatego do dobrze dokąd dość dużo dwa dwaj dwie dwoje dziś dzisiaj gdyby gdzie go ich ile im inny ja ją jak jakby jaki je jeden jedna jedno jego jej jemu jeśli jest jestem jeżeli już każdy kiedy kierunku kto ku lub ma mają mam mi mną mnie moi mój moja moje może mu my na nam nami nas nasi nasz nasza nasze natychmiast nią nic nich nie niego niej niemu nigdy nim nimi niż obok od około on ona one oni ono owszem po pod ponieważ przed przedtem są sam sama się skąd tak taki tam ten to tobą tobie tu tutaj twoi twój twoja twoje ty wam wami was wasi wasz wasza wasze we więc wszystko wtedy wy żaden zawsze że".split(
+        " "
+      )
+    );
+    lunr.Pipeline.registerFunction(lunr.pl.stopWordFilter, "stopWordFilter-pl");
+
+    /* lunr stemmer function */
+    // placeholder: no stemming is performed, every word is returned unchanged
+    lunr.pl.stemmer = (function () {
+      return function (word) {
+        // for lunr version 2
+        if (typeof word.update === "function") {
+          return word.update(function (word) {
+            return word;
+          });
+        } else {
+          // for lunr version <= 1
+          return word;
+        }
+      };
+    })();
+    lunr.Pipeline.registerFunction(lunr.pl.stemmer, "stemmer-pl");
+  };
+});
diff --git a/package-lock.json b/package-lock.json
index e467eec..4fca8ad 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -1,12 +1,12 @@
{
"name": "lunr-languages",
- "version": "1.13.0",
+ "version": "1.14.0",
"lockfileVersion": 2,
"requires": true,
"packages": {
"": {
"name": "lunr-languages",
- "version": "1.13.0",
+ "version": "1.14.0",
"license": "MPL-1.1",
"devDependencies": {
"@node-rs/jieba": "^1.6.1",
diff --git a/test/VersionsAndLanguagesTest.js b/test/VersionsAndLanguagesTest.js
index 5c8896d..ab0b98d 100644
--- a/test/VersionsAndLanguagesTest.js
+++ b/test/VersionsAndLanguagesTest.js
@@ -1,144 +1,183 @@
-var assert = require('assert');
+var assert = require("assert");
var lunrVersions = [
- {
- version: "0.6.0",
- lunr: "lunr-0.6.0.min"
- }, {
- version: "0.7.0",
- lunr: "lunr-0.7.0.min"
- }, {
- version: "1.0.0",
- lunr: "lunr-1.0.0.min"
- }, {
- version: "2.0.1",
- lunr: "lunr-2.0.1"
- }, {
- version: "2.3.5",
- lunr: "lunr-2.3.5"
- }
-
+ {
+ version: "0.6.0",
+ lunr: "lunr-0.6.0.min",
+ },
+ {
+ version: "0.7.0",
+ lunr: "lunr-0.7.0.min",
+ },
+ {
+ version: "1.0.0",
+ lunr: "lunr-1.0.0.min",
+ },
+ {
+ version: "2.0.1",
+ lunr: "lunr-2.0.1",
+ },
+ {
+ version: "2.3.5",
+ lunr: "lunr-2.3.5",
+ },
];
var testDocuments = {
- he: require('./testdata/he'),
- ar: require('./testdata/ar'),
- de: require('./testdata/de'),
- da: require('./testdata/da'),
- du: require('./testdata/du'),
- es: require('./testdata/es'),
- fi: require('./testdata/fi'),
- fr: require('./testdata/fr'),
- hi: require('./testdata/hi'),
- hu: require('./testdata/hu'),
- hy: require('./testdata/hy'),
- it: require('./testdata/it'),
- ja: require('./testdata/ja'),
- jp: require('./testdata/ja'),
- kn: require('./testdata/kn'),
- ko: require('./testdata/ko'),
- no: require('./testdata/no'),
- pt: require('./testdata/pt'),
- ro: require('./testdata/ro'),
- ru: require('./testdata/ru'),
- sa: require('./testdata/sa'),
- sv: require('./testdata/sv'),
- ta: require('./testdata/ta'),
- te: require('./testdata/te'),
- tr: require('./testdata/tr'),
- th: require('./testdata/th'),
- vi: require('./testdata/vi'),
- zh: require('./testdata/zh'),
- el: require('./testdata/el'),
+ he: require("./testdata/he"),
+ ar: require("./testdata/ar"),
+ de: require("./testdata/de"),
+ da: require("./testdata/da"),
+ du: require("./testdata/du"),
+ es: require("./testdata/es"),
+ fi: require("./testdata/fi"),
+ fr: require("./testdata/fr"),
+ hi: require("./testdata/hi"),
+ hu: require("./testdata/hu"),
+ hy: require("./testdata/hy"),
+ it: require("./testdata/it"),
+ ja: require("./testdata/ja"),
+ jp: require("./testdata/ja"),
+ kn: require("./testdata/kn"),
+ ko: require("./testdata/ko"),
+ no: require("./testdata/no"),
+ pl: require("./testdata/pl"),
+ pt: require("./testdata/pt"),
+ ro: require("./testdata/ro"),
+ ru: require("./testdata/ru"),
+ sa: require("./testdata/sa"),
+ sv: require("./testdata/sv"),
+ ta: require("./testdata/ta"),
+ te: require("./testdata/te"),
+ tr: require("./testdata/tr"),
+ th: require("./testdata/th"),
+ vi: require("./testdata/vi"),
+ zh: require("./testdata/zh"),
+ el: require("./testdata/el"),
};
lunrVersions.forEach(function (lunrVersion) {
- describe("Testing Lunr-Languages & Lunr version " + lunrVersion.version, function () {
- describe("should be able to correctly identify words in multi-documents scenarios (eg: en + ru)", function () {
- delete require.cache[require.resolve('./lunr/' + lunrVersion.lunr)]
- var lunr = require('./lunr/' + lunrVersion.lunr);
- require('../lunr.stemmer.support.js')(lunr);
- require('../lunr.ru.js')(lunr);
- require('../lunr.multi.js')(lunr);
-
- var idxEn = lunr(function () {
- this.field('body');
- this.add({ "body": "Этот текст написан на русском.", "id": 1 });
- this.add({ "body": "This text is written in the English language.", "id": 2 });
- });
+ describe(
+ "Testing Lunr-Languages & Lunr version " + lunrVersion.version,
+ function () {
+ describe("should be able to correctly identify words in multi-documents scenarios (eg: en + ru)", function () {
+ delete require.cache[require.resolve("./lunr/" + lunrVersion.lunr)];
+ var lunr = require("./lunr/" + lunrVersion.lunr);
+ require("../lunr.stemmer.support.js")(lunr);
+ require("../lunr.ru.js")(lunr);
+ require("../lunr.multi.js")(lunr);
+
+ var idxEn = lunr(function () {
+ this.field("body");
+ this.add({ body: "Этот текст написан на русском.", id: 1 });
+ this.add({
+ body: "This text is written in the English language.",
+ id: 2,
+ });
+ });
- var idxRu = lunr(function () {
- this.use(lunr.ru);
- this.field('body');
- this.add({ "body": "Этот текст написан на русском.", "id": 1 });
- this.add({ "body": "This text is written in the English language.", "id": 2 });
- });
+ var idxRu = lunr(function () {
+ this.use(lunr.ru);
+ this.field("body");
+ this.add({ body: "Этот текст написан на русском.", id: 1 });
+ this.add({
+ body: "This text is written in the English language.",
+ id: 2,
+ });
+ });
- var idxMulti = lunr(function () {
- this.use(lunr.multiLanguage('en', 'ru'));
- this.field('body');
- this.add({ "body": "Этот текст написан на русском.", "id": 1 });
- this.add({ "body": "This text is written in the English language.", "id": 2 });
- });
+ var idxMulti = lunr(function () {
+ this.use(lunr.multiLanguage("en", "ru"));
+ this.field("body");
+ this.add({ body: "Этот текст написан на русском.", id: 1 });
+ this.add({
+ body: "This text is written in the English language.",
+ id: 2,
+ });
+ });
- it("should not stem and find 'Русских' in english documents", function () {
- assert.equal(idxEn.search('Русских').length, 0)
- });
+ it("should not stem and find 'Русских' in english documents", function () {
+ assert.equal(idxEn.search("Русских").length, 0);
+ });
- it("should stem and find 'languages' in english documents", function () {
- assert.equal(idxEn.search('languages').length, 1)
- });
+ it("should stem and find 'languages' in english documents", function () {
+ assert.equal(idxEn.search("languages").length, 1);
+ });
- it("should stem and find 'Русских' in russian documents", function () {
- assert.equal(idxRu.search('Русских').length, 1)
- });
+ it("should stem and find 'Русских' in russian documents", function () {
+ assert.equal(idxRu.search("Русских").length, 1);
+ });
- it("should not stem and find 'languages' in russian documents", function () {
- assert.equal(idxRu.search('languages').length, 0)
- });
+ it("should not stem and find 'languages' in russian documents", function () {
+ assert.equal(idxRu.search("languages").length, 0);
+ });
- it("should stem and find 'Русских' in russian+english documents", function () {
- assert.equal(idxMulti.search('Русских').length, 1)
- });
+ it("should stem and find 'Русских' in russian+english documents", function () {
+ assert.equal(idxMulti.search("Русских").length, 1);
+ });
- it("should stem and find 'languages' in russian+english documents", function () {
- assert.equal(idxMulti.search('languages').length, 1)
- });
+ it("should stem and find 'languages' in russian+english documents", function () {
+ assert.equal(idxMulti.search("languages").length, 1);
});
- Object.keys(testDocuments).forEach(function (language) {
- describe("should be able to correctly find terms in " + language.toUpperCase() + " correctly", function () {
- // because these tests are asynchronous, we must ensure every load of lunr is fresh
- // so we do not get the previous used languages on it.
- // if we don't do this, when we'll run the test for jp, we'll also have da, de, fr, it languages used
- delete require.cache[require.resolve('./lunr/' + lunrVersion.lunr)];
-
- var lunr = require('./lunr/' + lunrVersion.lunr);
- require('../lunr.stemmer.support.js')(lunr);
- if (language === 'ja' || language === 'jp') { // for japanese, we must also load the tinyseg tokenizer
- require('../tinyseg')(lunr);
- }
- if (language === 'th' || language === 'hi' || language === 'ta' || language === 'sa' || language === 'kn' || language === 'te') { // for thai, we must also load the wordcut tokenizer
- lunr.wordcut = require('../wordcut');
- }
- require('../lunr.' + language + '.js')(lunr);
-
- var idx = lunr(function () {
- this.use(lunr[language]);
- testDocuments[language].fields.forEach(function (field) {
- this.field(field.name, field.config)
- }.bind(this));
-
- testDocuments[language].documents.forEach(function (doc) {
- this.add(doc)
- }.bind(this));
- });
-
- testDocuments[language].tests.forEach(function (test) {
- it("should " + test.what.replace('%w', '"' + test.search + '"'), function () {
- assert.equal(idx.search(test.search).length, test.found)
- });
- }.bind(this));
- })
- })
- })
+ });
+ Object.keys(testDocuments).forEach(function (language) {
+ describe(
+ "should be able to correctly find terms in " +
+ language.toUpperCase() +
+ " correctly",
+ function () {
+ // because these tests are asynchronous, we must ensure every load of lunr is fresh
+ // so we do not get the previous used languages on it.
+ // if we don't do this, when we'll run the test for jp, we'll also have da, de, fr, it languages used
+ delete require.cache[require.resolve("./lunr/" + lunrVersion.lunr)];
+
+ var lunr = require("./lunr/" + lunrVersion.lunr);
+ require("../lunr.stemmer.support.js")(lunr);
+ if (language === "ja" || language === "jp") {
+ // for japanese, we must also load the tinyseg tokenizer
+ require("../tinyseg")(lunr);
+ }
+ if (
+ language === "th" ||
+ language === "hi" ||
+ language === "ta" ||
+ language === "sa" ||
+ language === "kn" ||
+ language === "te"
+ ) {
+ // for thai, we must also load the wordcut tokenizer
+ lunr.wordcut = require("../wordcut");
+ }
+ require("../lunr." + language + ".js")(lunr);
+
+ var idx = lunr(function () {
+ this.use(lunr[language]);
+ testDocuments[language].fields.forEach(
+ function (field) {
+ this.field(field.name, field.config);
+ }.bind(this)
+ );
+
+ testDocuments[language].documents.forEach(
+ function (doc) {
+ this.add(doc);
+ }.bind(this)
+ );
+ });
+
+ testDocuments[language].tests.forEach(
+ function (test) {
+ it(
+ "should " + test.what.replace("%w", '"' + test.search + '"'),
+ function () {
+ assert.equal(idx.search(test.search).length, test.found);
+ }
+ );
+ }.bind(this)
+ );
+ }
+ );
+ });
+ }
+ );
});
diff --git a/test/testdata/pl.js b/test/testdata/pl.js
new file mode 100644
index 0000000..53b4c1c
--- /dev/null
+++ b/test/testdata/pl.js
@@ -0,0 +1,40 @@
+module.exports = {
+ fields: [
+ {
+ name: "title",
+ config: { boost: 10 },
+ },
+ {
+ name: "body",
+ },
+ ],
+ documents: [
+ {
+ title: "Cietrzew",
+ body: "Czego trzeba strzelcowi kowalowi do zestrzelenia cietrzewia drzemiącego w dżdżysty dzień na drzewie",
+ id: 1,
+ },
+ {
+ title: "Kowalowa",
+ body: "Kowalowa Karolowa kazała kowalowi Karolowi karego konia kupić. Kowal Karol karego konia kuje, kowalowa Karolowa kowalem Karolem kieruje.",
+ id: 2,
+ },
+ ],
+ tests: [
+ {
+ what: "find the word %w",
+ search: "kowalowi",
+ found: 2,
+ },
+ {
+ what: "find the word %w",
+ search: "Karolowa",
+ found: 1,
+ },
+ {
+ what: "never find a word that does not exist, like %w",
+ search: "szczebrzeszyn",
+ found: 0,
+ },
+ ],
+};