diff --git a/integration/analyzer_peliasIndexOneEdgeGram.js b/integration/analyzer_peliasIndexOneEdgeGram.js index 1db32362..7cd418ae 100644 --- a/integration/analyzer_peliasIndexOneEdgeGram.js +++ b/integration/analyzer_peliasIndexOneEdgeGram.js @@ -85,6 +85,11 @@ module.exports.tests.analyze = function(test, common){ assertAnalysis( 'british_american_english', 'town theatre', ['0:town', '1:theatre', '1:theater'] ); assertAnalysis( 'british_american_english', 'town theater', ['0:town', '1:theater', '1:theatre'] ); + + assertAnalysis('thai_address', 'ซอยเพชรบุรี๑foo', [ + '0:ซ', '0:ซอ', '0:ซอย', + '1:เพชรบุรี1', '1:เพชรบุรี', '1:เพชรบุร', '1:เพชรบุ', '1:เพชรบ', '1:เพชร', '1:เพช', '1:เพ', '1:เ', + '2:f', '2:fo', '2:foo'] ); suite.run( t.end ); }); diff --git a/integration/analyzer_peliasQuery.js b/integration/analyzer_peliasQuery.js index f0cf199e..3c940ebc 100644 --- a/integration/analyzer_peliasQuery.js +++ b/integration/analyzer_peliasQuery.js @@ -49,6 +49,16 @@ module.exports.tests.functional = function(test, common){ assertAnalysis( 'place', 'Toys "R" Us!', [ 'toys', 'r', 'us' ]); assertAnalysis( 'address', '101 mapzen place', [ '101', 'mapzen', 'place' ]); + // complicated tokenization for some Asian languages + assertAnalysis('thai_address1', 'ซอยเพชรบุรี๑', ['ซอย', 'เพชรบุรี1'] ); + assertAnalysis('thai_address2', 'ซอยเพชรบุรี๑foo', ['ซอย', 'เพชรบุรี1', 'foo'] ); + assertAnalysis('thai_address3', 'บ้านเลขที่๑๒๓ถนนสุขุมวิทแขวงคลองตันเหนือเขตวัฒนา กรุงเทพมหานคร๑๐๑๑๐', ["บาน", "เลข", "ที123ถนน", "สุขุมวิท", "แขวง", "คลองตัน", "เหนือ", "เขต", "วัฒนา", "กรุงเทพมหานคร10110"]); + assertAnalysis('chinese_address', '北京市朝阳区东三环中路1号国际大厦A座1001室', + ['北京市', '朝阳', '区', '东', '三', '环', '中路', '1', '号', '国际', '大厦', 'a', '座', '1001', '室']); + assertAnalysis('japanese_address', '東京都渋谷区渋谷2丁目21−1渋谷スクランブルスクエア4階', ["東京", "都", "渋谷", "区", "渋谷", "2", "丁目", "21", "1", "渋谷", "スクランフル", "スクエア", "4", "階"]); + assertAnalysis('khmer_address', 'ផ្ទះលេខ១២៣ផ្លូវព្រះសីហនុសង្កាត់ទន្លេបាសាក់ខណ្ឌចំការមនរាជធានីភ្នំពេញ', ["ផទះលេខ123ផលូវ", "ពរះសីហនុ", "សងកាត", "ទនលេបាសាក", "ខណឌចំការមន", "រាជធានី", "ភនំពេញ"]); + assertAnalysis('lao_address', 'ບ້ານເລກທີ່໑໕໕ຖະໜົນທ່ານຊານຂອງເຂດຈັນທະບູລີນະຄອນວຽງຈັນ', ["ບານ", "ເລກ", "ທີ155ຖະຫນົນ", "ທານ", "ຊານ", "ຂອງ", "ເຂດ", "ຈັນທະ", "ບູ", "ລີ", "ນະຄອນ", "ວຽງຈັນ"]); + suite.run( t.end ); }); }; diff --git a/integration/analyzer_peliasStreet.js b/integration/analyzer_peliasStreet.js index 2fa0e494..a8f542a9 100644 --- a/integration/analyzer_peliasStreet.js +++ b/integration/analyzer_peliasStreet.js @@ -1,4 +1,5 @@ // validate analyzer is behaving as expected +const { assert } = require('@hapi/joi'); const Suite = require('../test/elastictest/Suite') module.exports.tests = {}; @@ -22,6 +23,16 @@ module.exports.tests.analyze = function(test, common){ assertAnalysis( 'remove_ordinals', '1st 2nd 3rd 4th 5th', ['1','2','3','4','5'] ); assertAnalysis( 'remove_ordinals', 'Ast th 101st', ['ast','th','101'] ); + // complicated tokenization for some Asian languages + assertAnalysis('thai_address1', 'ซอยเพชรบุรี๑', ['ซอย', 'เพชรบุรี1'] ); + assertAnalysis('thai_address2', 'ซอยเพชรบุรี๑foo', ['ซอย', 'เพชรบุรี1', 'foo'] ); + assertAnalysis('thai_address3', 'บ้านเลขที่๑๒๓ถนนสุขุมวิทแขวงคลองตันเหนือเขตวัฒนา กรุงเทพมหานคร๑๐๑๑๐', ["บาน", "เลข", "ที123ถนน", "สุขุมวิท", "แขวง", "คลองตัน", "เหนือ", "เขต", "วัฒนา", "กรุงเทพมหานคร10110"]); + assertAnalysis('chinese_address', '北京市朝阳区东三环中路1号国际大厦A座1001室', + ['北京市', '朝阳', '区', '东', '三', '环', '中路', '1', '号', '国际', '大厦', 'a', '座', '1001', '室']); + assertAnalysis('japanese_address', '東京都渋谷区渋谷2丁目21−1渋谷スクランブルスクエア4階', ["東京", "都", "渋谷", "区", "渋谷", "2", "丁目", "21", "1", "渋谷", "スクランフル", "スクエア", "4", "階"]); + assertAnalysis('khmer_address', 'ផ្ទះលេខ១២៣ផ្លូវព្រះសីហនុសង្កាត់ទន្លេបាសាក់ខណ្ឌចំការមនរាជធានីភ្នំពេញ', ["ផទះលេខ123ផលូវ", "ពរះសីហនុ", "សងកាត", "ទនលេបាសាក", "ខណឌចំការមន", "រាជធានី", "ភនំពេញ"]); + assertAnalysis('lao_address', 'ບ້ານເລກທີ່໑໕໕ຖະໜົນທ່ານຊານຂອງເຂດຈັນທະບູລີນະຄອນວຽງຈັນ', ["ບານ", "ເລກ", "ທີ155ຖະຫນົນ", "ທານ", "ຊານ", "ຂອງ", "ເຂດ", "ຈັນທະ", "ບູ", "ລີ", "ນະຄອນ", "ວຽງຈັນ"]); + suite.run( t.end ); }); }; diff --git a/settings.js b/settings.js index f2dd633a..c8f3437b 100644 --- a/settings.js +++ b/settings.js @@ -22,16 +22,16 @@ function generate(){ "analysis": { "tokenizer": { "peliasTokenizer": { - "type": "pattern", - "pattern": "[\\s,/\\\\-]+" + "type": "icu_tokenizer" } }, "analyzer": { "peliasAdmin": { "type": "custom", "tokenizer": "peliasTokenizer", - "char_filter" : ["punctuation", "nfkc_normalizer"], + "char_filter" : ["ampersand_mapper", "punctuation", "nfkc_normalizer"], "filter": [ + "ampersand_replacer", "lowercase", "trim", "synonyms/custom_admin/multiword", @@ -46,8 +46,9 @@ function generate(){ "peliasIndexOneEdgeGram" : { "type": "custom", "tokenizer" : "peliasTokenizer", - "char_filter" : ["punctuation", "nfkc_normalizer"], + "char_filter" : ["ampersand_mapper", "punctuation", "nfkc_normalizer"], "filter": [ + "ampersand_replacer", "lowercase", "trim", "synonyms/custom_name/multiword", @@ -66,8 +67,9 @@ function generate(){ "peliasQuery": { "type": "custom", "tokenizer": "peliasTokenizer", - "char_filter": ["punctuation", "nfkc_normalizer"], + "char_filter": ["ampersand_mapper", "punctuation", "nfkc_normalizer"], "filter": [ + "ampersand_replacer", "lowercase", "trim", "icu_folding", @@ -80,8 +82,9 @@ function generate(){ "peliasPhrase": { "type": "custom", "tokenizer":"peliasTokenizer", - "char_filter" : ["punctuation", "nfkc_normalizer"], + "char_filter" : ["ampersand_mapper", "punctuation", "nfkc_normalizer"], "filter": [ + "ampersand_replacer", "lowercase", "trim", "remove_duplicate_spaces", @@ -129,8 +132,9 @@ function generate(){ "peliasStreet": { "type": "custom", "tokenizer":"peliasTokenizer", - "char_filter" : ["punctuation", "nfkc_normalizer"], + "char_filter" : ["ampersand_mapper", "punctuation", "nfkc_normalizer"], "filter": [ + "ampersand_replacer", "lowercase", "trim", "remove_duplicate_spaces", @@ -147,8 +151,9 @@ function generate(){ "peliasIndexCountryAbbreviation": { "type": "custom", "tokenizer": "peliasTokenizer", - "char_filter": ["punctuation", "nfkc_normalizer"], + "char_filter": ["ampersand_mapper", "punctuation", "nfkc_normalizer"], "filter": [ + "ampersand_replacer", "lowercase", "trim", "icu_folding", @@ -161,8 +166,9 @@ function generate(){ "peliasIndexCountryAbbreviationOneEdgeGram": { "type": "custom", "tokenizer": "peliasTokenizer", - "char_filter": ["punctuation", "nfkc_normalizer"], + "char_filter": ["ampersand_mapper", "punctuation", "nfkc_normalizer"], "filter": [ + "ampersand_replacer", "lowercase", "trim", "icu_folding", @@ -175,6 +181,12 @@ function generate(){ }, }, "filter" : { + // replaces ampersand placeholders back to `&` (see `ampersand_mapper` char_filter) + "ampersand_replacer": { + "type": "pattern_replace", + "pattern": "AMPERSANDPLACEHOLDER", + "replacement": "&" + }, "street_synonyms_multiplexer": { "type": "multiplexer", "preserve_original": false, @@ -248,6 +260,13 @@ function generate(){ // more generated below }, "char_filter": { + // icu-tokenizer treats ampersands as a word boundary, so we replace them with a placeholder to avoid it, + // as we want to handle them separately, we replace them back after tokenization (see `ampersand_replacer` filter) + "ampersand_mapper": { + "type": "pattern_replace", + "pattern": "&", + "replacement": " AMPERSANDPLACEHOLDER " + }, "punctuation" : { "type" : "mapping", "mappings" : punctuation.blacklist.map(function(c){ diff --git a/test/fixtures/expected.json b/test/fixtures/expected.json index 8bddef1e..e7439549 100644 --- a/test/fixtures/expected.json +++ b/test/fixtures/expected.json @@ -15,8 +15,7 @@ "analysis": { "tokenizer": { "peliasTokenizer": { - "type": "pattern", - "pattern": "[\\s,/\\\\-]+" + "type": "icu_tokenizer" } }, "analyzer": { @@ -24,10 +23,12 @@ "type": "custom", "tokenizer": "peliasTokenizer", "char_filter": [ + "ampersand_mapper", "punctuation", "nfkc_normalizer" ], "filter": [ + "ampersand_replacer", "lowercase", "trim", "synonyms/custom_admin/multiword", @@ -43,10 +44,12 @@ "type": "custom", "tokenizer": "peliasTokenizer", "char_filter": [ + "ampersand_mapper", "punctuation", "nfkc_normalizer" ], "filter": [ + "ampersand_replacer", "lowercase", "trim", "synonyms/custom_name/multiword", @@ -66,10 +69,12 @@ "type": "custom", "tokenizer": "peliasTokenizer", "char_filter": [ + "ampersand_mapper", "punctuation", "nfkc_normalizer" ], "filter": [ + "ampersand_replacer", "lowercase", "trim", "icu_folding", @@ -83,10 +88,12 @@ "type": "custom", "tokenizer": "peliasTokenizer", "char_filter": [ + "ampersand_mapper", "punctuation", "nfkc_normalizer" ], "filter": [ + "ampersand_replacer", "lowercase", "trim", "remove_duplicate_spaces", @@ -143,10 +150,12 @@ "type": "custom", "tokenizer": "peliasTokenizer", "char_filter": [ + "ampersand_mapper", "punctuation", "nfkc_normalizer" ], "filter": [ + "ampersand_replacer", "lowercase", "trim", "remove_duplicate_spaces", @@ -164,10 +173,12 @@ "type": "custom", "tokenizer": "peliasTokenizer", "char_filter": [ + "ampersand_mapper", "punctuation", "nfkc_normalizer" ], "filter": [ + "ampersand_replacer", "lowercase", "trim", "icu_folding", @@ -181,10 +192,12 @@ "type": "custom", "tokenizer": "peliasTokenizer", "char_filter": [ + "ampersand_mapper", "punctuation", "nfkc_normalizer" ], "filter": [ + "ampersand_replacer", "lowercase", "trim", "icu_folding", @@ -197,6 +210,11 @@ } }, "filter": { + "ampersand_replacer": { + "type": "pattern_replace", + "pattern": "AMPERSANDPLACEHOLDER", + "replacement": "&" + }, "street_synonyms_multiplexer": { "type": "multiplexer", "preserve_original": false, @@ -2271,6 +2289,11 @@ } }, "char_filter": { + "ampersand_mapper": { + "type": "pattern_replace", + "pattern": "&", + "replacement": " AMPERSANDPLACEHOLDER " + }, "punctuation": { "type": "mapping", "mappings": [ diff --git a/test/settings.js b/test/settings.js index 78c6b2ba..70fac7f3 100644 --- a/test/settings.js +++ b/test/settings.js @@ -57,13 +57,14 @@ module.exports.tests.peliasAdminAnalyzer = function(test, common) { var analyzer = s.analysis.analyzer.peliasAdmin; t.equal(analyzer.type, 'custom', 'custom analyzer'); t.equal(typeof analyzer.tokenizer, 'string', 'tokenizer specified'); - t.deepEqual(analyzer.char_filter, ['punctuation', 'nfkc_normalizer'], 'character filters specified'); + t.deepEqual(analyzer.char_filter, ['ampersand_mapper', 'punctuation', 'nfkc_normalizer'], 'character filters specified'); t.true(Array.isArray(analyzer.filter), 'filters specified'); t.end(); }); test('peliasAdmin token filters', function (t) { var analyzer = settings().analysis.analyzer.peliasAdmin; t.deepEqual(analyzer.filter, [ + "ampersand_replacer", "lowercase", "trim", "synonyms/custom_admin/multiword", @@ -85,13 +86,14 @@ module.exports.tests.peliasIndexOneEdgeGramAnalyzer = function(test, common) { var analyzer = s.analysis.analyzer.peliasIndexOneEdgeGram; t.equal(analyzer.type, 'custom', 'custom analyzer'); t.equal(typeof analyzer.tokenizer, 'string', 'tokenizer specified'); - t.deepEqual(analyzer.char_filter, ["punctuation","nfkc_normalizer"], 'character filters specified'); + t.deepEqual(analyzer.char_filter, ["ampersand_mapper", "punctuation","nfkc_normalizer"], 'character filters specified'); t.true(Array.isArray(analyzer.filter), 'filters specified'); t.end(); }); test('peliasIndexOneEdgeGram token filters', function(t) { var analyzer = settings().analysis.analyzer.peliasIndexOneEdgeGram; t.deepEqual( analyzer.filter, [ + "ampersand_replacer", "lowercase", "trim", "synonyms/custom_name/multiword", @@ -117,13 +119,14 @@ module.exports.tests.peliasQueryAnalyzer = function (test, common) { var analyzer = s.analysis.analyzer.peliasQuery; t.equal(analyzer.type, 'custom', 'custom analyzer'); t.equal(typeof analyzer.tokenizer, 'string', 'tokenizer specified'); - t.deepEqual(analyzer.char_filter, ['punctuation', 'nfkc_normalizer'], 'character filters specified'); + t.deepEqual(analyzer.char_filter, ['ampersand_mapper', 'punctuation', 'nfkc_normalizer'], 'character filters specified'); t.true(Array.isArray(analyzer.filter), 'filters specified'); t.end(); }); test('peliasQuery token filters', function (t) { var analyzer = settings().analysis.analyzer.peliasQuery; t.deepEqual(analyzer.filter, [ + 'ampersand_replacer', 'lowercase', 'trim', 'icu_folding', @@ -143,13 +146,14 @@ module.exports.tests.peliasPhraseAnalyzer = function(test, common) { var analyzer = s.analysis.analyzer.peliasPhrase; t.equal(analyzer.type, 'custom', 'custom analyzer'); t.equal(typeof analyzer.tokenizer, 'string', 'tokenizer specified'); - t.deepEqual(analyzer.char_filter, ["punctuation","nfkc_normalizer"], 'character filters specified'); + t.deepEqual(analyzer.char_filter, ["ampersand_mapper", "punctuation", "nfkc_normalizer"], 'character filters specified'); t.true(Array.isArray(analyzer.filter), 'filters specified'); t.end(); }); test('peliasPhrase token filters', function(t) { var analyzer = settings().analysis.analyzer.peliasPhrase; t.deepEqual( analyzer.filter, [ + "ampersand_replacer", "lowercase", "trim", "remove_duplicate_spaces", @@ -236,13 +240,14 @@ module.exports.tests.peliasStreetAnalyzer = function(test, common) { var analyzer = s.analysis.analyzer.peliasStreet; t.equal(analyzer.type, 'custom', 'custom analyzer'); t.equal(typeof analyzer.tokenizer, 'string', 'tokenizer specified'); - t.deepEqual(analyzer.char_filter, ['punctuation', 'nfkc_normalizer'], 'character filters specified'); + t.deepEqual(analyzer.char_filter, ['ampersand_mapper', 'punctuation', 'nfkc_normalizer'], 'character filters specified'); t.true(Array.isArray(analyzer.filter), 'filters specified'); t.end(); }); test('peliasStreet token filters', function(t) { var analyzer = settings().analysis.analyzer.peliasStreet; t.deepEqual( analyzer.filter, [ + "ampersand_replacer", "lowercase", "trim", "remove_duplicate_spaces", @@ -266,13 +271,14 @@ module.exports.tests.peliasIndexCountryAbbreviation = function (test, common) { var analyzer = s.analysis.analyzer.peliasIndexCountryAbbreviation; t.equal(analyzer.type, 'custom', 'custom analyzer'); t.equal(typeof analyzer.tokenizer, 'string', 'tokenizer specified'); - t.deepEqual(analyzer.char_filter, ['punctuation', 'nfkc_normalizer'], 'character filters specified'); + t.deepEqual(analyzer.char_filter, ['ampersand_mapper', 'punctuation', 'nfkc_normalizer'], 'character filters specified'); t.true(Array.isArray(analyzer.filter), 'filters specified'); t.end(); }); test('peliasIndexCountryAbbreviation token filters', function (t) { var analyzer = settings().analysis.analyzer.peliasIndexCountryAbbreviation; t.deepEqual(analyzer.filter, [ + "ampersand_replacer", "lowercase", "trim", "icu_folding", @@ -292,13 +298,14 @@ module.exports.tests.peliasIndexCountryAbbreviationOneEdgeGramAnalyzer = functio var analyzer = s.analysis.analyzer.peliasIndexCountryAbbreviationOneEdgeGram; t.equal(analyzer.type, 'custom', 'custom analyzer'); t.equal(typeof analyzer.tokenizer, 'string', 'tokenizer specified'); - t.deepEqual(analyzer.char_filter, ["punctuation", "nfkc_normalizer"], 'character filters specified'); + t.deepEqual(analyzer.char_filter, ["ampersand_mapper", "punctuation", "nfkc_normalizer"], 'character filters specified'); t.true(Array.isArray(analyzer.filter), 'filters specified'); t.end(); }); test('peliasIndexCountryAbbreviationOneEdgeGram token filters', function (t) { var analyzer = settings().analysis.analyzer.peliasIndexCountryAbbreviationOneEdgeGram; t.deepEqual(analyzer.filter, [ + "ampersand_replacer", "lowercase", "trim", "icu_folding",