Skip to content

Commit

Permalink
Use ICU tokenizer to improve some Asian languages support
Browse files Browse the repository at this point in the history
  • Loading branch information
SiarheiFedartsou committed Nov 26, 2024
1 parent 4b4957e commit 8c2712b
Show file tree
Hide file tree
Showing 6 changed files with 93 additions and 18 deletions.
5 changes: 5 additions & 0 deletions integration/analyzer_peliasIndexOneEdgeGram.js
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,11 @@ module.exports.tests.analyze = function(test, common){

assertAnalysis( 'british_american_english', 'town theatre', ['0:town', '1:theatre', '1:theater'] );
assertAnalysis( 'british_american_english', 'town theater', ['0:town', '1:theater', '1:theatre'] );

assertAnalysis('thai_address', 'ซอยเพชรบุรี๑foo', [
'0:ซ', '0:ซอ', '0:ซอย',
'1:เพชรบุรี1', '1:เพชรบุรี', '1:เพชรบุร', '1:เพชรบุ', '1:เพชรบ', '1:เพชร', '1:เพช', '1:เพ', '1:เ',
'2:f', '2:fo', '2:foo'] );

suite.run( t.end );
});
Expand Down
10 changes: 10 additions & 0 deletions integration/analyzer_peliasQuery.js
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,16 @@ module.exports.tests.functional = function(test, common){
assertAnalysis( 'place', 'Toys "R" Us!', [ 'toys', 'r', 'us' ]);
assertAnalysis( 'address', '101 mapzen place', [ '101', 'mapzen', 'place' ]);

// complicated tokenization for some Asian languages
assertAnalysis('thai_address1', 'ซอยเพชรบุรี๑', ['ซอย', 'เพชรบุรี1'] );
assertAnalysis('thai_address2', 'ซอยเพชรบุรี๑foo', ['ซอย', 'เพชรบุรี1', 'foo'] );
assertAnalysis('thai_address3', 'บ้านเลขที่๑๒๓ถนนสุขุมวิทแขวงคลองตันเหนือเขตวัฒนา กรุงเทพมหานคร๑๐๑๑๐', ["บาน", "เลข", "ที123ถนน", "สุขุมวิท", "แขวง", "คลองตัน", "เหนือ", "เขต", "วัฒนา", "กรุงเทพมหานคร10110"]);
assertAnalysis('chinese_address', '北京市朝阳区东三环中路1号国际大厦A座1001室',
['北京市', '朝阳', '区', '东', '三', '环', '中路', '1', '号', '国际', '大厦', 'a', '座', '1001', '室']);
assertAnalysis('japanese_address', '東京都渋谷区渋谷2丁目21−1渋谷スクランブルスクエア4階', ["東京", "都", "渋谷", "区", "渋谷", "2", "丁目", "21", "1", "渋谷", "スクランフル", "スクエア", "4", "階"]);
assertAnalysis('khmer_address', 'ផ្ទះលេខ១២៣ផ្លូវព្រះសីហនុសង្កាត់ទន្លេបាសាក់ខណ្ឌចំការមនរាជធានីភ្នំពេញ', ["ផទះលេខ123ផលូវ", "ពរះសីហនុ", "សងកាត", "ទនលេបាសាក", "ខណឌចំការមន", "រាជធានី", "ភនំពេញ"]);
assertAnalysis('lao_address', 'ບ້ານເລກທີ່໑໕໕ຖະໜົນທ່ານຊານຂອງເຂດຈັນທະບູລີນະຄອນວຽງຈັນ', ["ບານ", "ເລກ", "ທີ155ຖະຫນົນ", "ທານ", "ຊານ", "ຂອງ", "ເຂດ", "ຈັນທະ", "ບູ", "ລີ", "ນະຄອນ", "ວຽງຈັນ"]);

suite.run( t.end );
});
};
Expand Down
11 changes: 11 additions & 0 deletions integration/analyzer_peliasStreet.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
// validate analyzer is behaving as expected
const { assert } = require('@hapi/joi');
const Suite = require('../test/elastictest/Suite')

module.exports.tests = {};
Expand All @@ -22,6 +23,16 @@ module.exports.tests.analyze = function(test, common){
assertAnalysis( 'remove_ordinals', '1st 2nd 3rd 4th 5th', ['1','2','3','4','5'] );
assertAnalysis( 'remove_ordinals', 'Ast th 101st', ['ast','th','101'] );

// complicated tokenization for some Asian languages
assertAnalysis('thai_address1', 'ซอยเพชรบุรี๑', ['ซอย', 'เพชรบุรี1'] );
assertAnalysis('thai_address2', 'ซอยเพชรบุรี๑foo', ['ซอย', 'เพชรบุรี1', 'foo'] );
assertAnalysis('thai_address3', 'บ้านเลขที่๑๒๓ถนนสุขุมวิทแขวงคลองตันเหนือเขตวัฒนา กรุงเทพมหานคร๑๐๑๑๐', ["บาน", "เลข", "ที123ถนน", "สุขุมวิท", "แขวง", "คลองตัน", "เหนือ", "เขต", "วัฒนา", "กรุงเทพมหานคร10110"]);
assertAnalysis('chinese_address', '北京市朝阳区东三环中路1号国际大厦A座1001室',
['北京市', '朝阳', '区', '东', '三', '环', '中路', '1', '号', '国际', '大厦', 'a', '座', '1001', '室']);
assertAnalysis('japanese_address', '東京都渋谷区渋谷2丁目21−1渋谷スクランブルスクエア4階', ["東京", "都", "渋谷", "区", "渋谷", "2", "丁目", "21", "1", "渋谷", "スクランフル", "スクエア", "4", "階"]);
assertAnalysis('khmer_address', 'ផ្ទះលេខ១២៣ផ្លូវព្រះសីហនុសង្កាត់ទន្លេបាសាក់ខណ្ឌចំការមនរាជធានីភ្នំពេញ', ["ផទះលេខ123ផលូវ", "ពរះសីហនុ", "សងកាត", "ទនលេបាសាក", "ខណឌចំការមន", "រាជធានី", "ភនំពេញ"]);
assertAnalysis('lao_address', 'ບ້ານເລກທີ່໑໕໕ຖະໜົນທ່ານຊານຂອງເຂດຈັນທະບູລີນະຄອນວຽງຈັນ', ["ບານ", "ເລກ", "ທີ155ຖະຫນົນ", "ທານ", "ຊານ", "ຂອງ", "ເຂດ", "ຈັນທະ", "ບູ", "ລີ", "ນະຄອນ", "ວຽງຈັນ"]);

suite.run( t.end );
});
};
Expand Down
37 changes: 28 additions & 9 deletions settings.js
Original file line number Diff line number Diff line change
Expand Up @@ -22,16 +22,16 @@ function generate(){
"analysis": {
"tokenizer": {
"peliasTokenizer": {
"type": "pattern",
"pattern": "[\\s,/\\\\-]+"
"type": "icu_tokenizer"
}
},
"analyzer": {
"peliasAdmin": {
"type": "custom",
"tokenizer": "peliasTokenizer",
"char_filter" : ["punctuation", "nfkc_normalizer"],
"char_filter" : ["ampersand_mapper", "punctuation", "nfkc_normalizer"],
"filter": [
"ampersand_replacer",
"lowercase",
"trim",
"synonyms/custom_admin/multiword",
Expand All @@ -46,8 +46,9 @@ function generate(){
"peliasIndexOneEdgeGram" : {
"type": "custom",
"tokenizer" : "peliasTokenizer",
"char_filter" : ["punctuation", "nfkc_normalizer"],
"char_filter" : ["ampersand_mapper", "punctuation", "nfkc_normalizer"],
"filter": [
"ampersand_replacer",
"lowercase",
"trim",
"synonyms/custom_name/multiword",
Expand All @@ -66,8 +67,9 @@ function generate(){
"peliasQuery": {
"type": "custom",
"tokenizer": "peliasTokenizer",
"char_filter": ["punctuation", "nfkc_normalizer"],
"char_filter": ["ampersand_mapper", "punctuation", "nfkc_normalizer"],
"filter": [
"ampersand_replacer",
"lowercase",
"trim",
"icu_folding",
Expand All @@ -80,8 +82,9 @@ function generate(){
"peliasPhrase": {
"type": "custom",
"tokenizer":"peliasTokenizer",
"char_filter" : ["punctuation", "nfkc_normalizer"],
"char_filter" : ["ampersand_mapper", "punctuation", "nfkc_normalizer"],
"filter": [
"ampersand_replacer",
"lowercase",
"trim",
"remove_duplicate_spaces",
Expand Down Expand Up @@ -129,8 +132,9 @@ function generate(){
"peliasStreet": {
"type": "custom",
"tokenizer":"peliasTokenizer",
"char_filter" : ["punctuation", "nfkc_normalizer"],
"char_filter" : ["ampersand_mapper", "punctuation", "nfkc_normalizer"],
"filter": [
"ampersand_replacer",
"lowercase",
"trim",
"remove_duplicate_spaces",
Expand All @@ -147,8 +151,9 @@ function generate(){
"peliasIndexCountryAbbreviation": {
"type": "custom",
"tokenizer": "peliasTokenizer",
"char_filter": ["punctuation", "nfkc_normalizer"],
"char_filter": ["ampersand_mapper", "punctuation", "nfkc_normalizer"],
"filter": [
"ampersand_replacer",
"lowercase",
"trim",
"icu_folding",
Expand All @@ -161,8 +166,9 @@ function generate(){
"peliasIndexCountryAbbreviationOneEdgeGram": {
"type": "custom",
"tokenizer": "peliasTokenizer",
"char_filter": ["punctuation", "nfkc_normalizer"],
"char_filter": ["ampersand_mapper", "punctuation", "nfkc_normalizer"],
"filter": [
"ampersand_replacer",
"lowercase",
"trim",
"icu_folding",
Expand All @@ -175,6 +181,12 @@ function generate(){
},
},
"filter" : {
// restores ampersand placeholders to `&` (see the `ampersand_mapper` char_filter)
"ampersand_replacer": {
"type": "pattern_replace",
"pattern": "AMPERSANDPLACEHOLDER",
"replacement": "&"
},
"street_synonyms_multiplexer": {
"type": "multiplexer",
"preserve_original": false,
Expand Down Expand Up @@ -248,6 +260,13 @@ function generate(){
// more generated below
},
"char_filter": {
// The ICU tokenizer treats ampersands as word boundaries. Because we want to handle them separately,
// we replace each one with a placeholder before tokenization and restore it afterwards (see the `ampersand_replacer` filter).
"ampersand_mapper": {
"type": "pattern_replace",
"pattern": "&",
"replacement": " AMPERSANDPLACEHOLDER "
},
"punctuation" : {
"type" : "mapping",
"mappings" : punctuation.blacklist.map(function(c){
Expand Down
27 changes: 25 additions & 2 deletions test/fixtures/expected.json
Original file line number Diff line number Diff line change
Expand Up @@ -15,19 +15,20 @@
"analysis": {
"tokenizer": {
"peliasTokenizer": {
"type": "pattern",
"pattern": "[\\s,/\\\\-]+"
"type": "icu_tokenizer"
}
},
"analyzer": {
"peliasAdmin": {
"type": "custom",
"tokenizer": "peliasTokenizer",
"char_filter": [
"ampersand_mapper",
"punctuation",
"nfkc_normalizer"
],
"filter": [
"ampersand_replacer",
"lowercase",
"trim",
"synonyms/custom_admin/multiword",
Expand All @@ -43,10 +44,12 @@
"type": "custom",
"tokenizer": "peliasTokenizer",
"char_filter": [
"ampersand_mapper",
"punctuation",
"nfkc_normalizer"
],
"filter": [
"ampersand_replacer",
"lowercase",
"trim",
"synonyms/custom_name/multiword",
Expand All @@ -66,10 +69,12 @@
"type": "custom",
"tokenizer": "peliasTokenizer",
"char_filter": [
"ampersand_mapper",
"punctuation",
"nfkc_normalizer"
],
"filter": [
"ampersand_replacer",
"lowercase",
"trim",
"icu_folding",
Expand All @@ -83,10 +88,12 @@
"type": "custom",
"tokenizer": "peliasTokenizer",
"char_filter": [
"ampersand_mapper",
"punctuation",
"nfkc_normalizer"
],
"filter": [
"ampersand_replacer",
"lowercase",
"trim",
"remove_duplicate_spaces",
Expand Down Expand Up @@ -143,10 +150,12 @@
"type": "custom",
"tokenizer": "peliasTokenizer",
"char_filter": [
"ampersand_mapper",
"punctuation",
"nfkc_normalizer"
],
"filter": [
"ampersand_replacer",
"lowercase",
"trim",
"remove_duplicate_spaces",
Expand All @@ -164,10 +173,12 @@
"type": "custom",
"tokenizer": "peliasTokenizer",
"char_filter": [
"ampersand_mapper",
"punctuation",
"nfkc_normalizer"
],
"filter": [
"ampersand_replacer",
"lowercase",
"trim",
"icu_folding",
Expand All @@ -181,10 +192,12 @@
"type": "custom",
"tokenizer": "peliasTokenizer",
"char_filter": [
"ampersand_mapper",
"punctuation",
"nfkc_normalizer"
],
"filter": [
"ampersand_replacer",
"lowercase",
"trim",
"icu_folding",
Expand All @@ -197,6 +210,11 @@
}
},
"filter": {
"ampersand_replacer": {
"type": "pattern_replace",
"pattern": "AMPERSANDPLACEHOLDER",
"replacement": "&"
},
"street_synonyms_multiplexer": {
"type": "multiplexer",
"preserve_original": false,
Expand Down Expand Up @@ -2271,6 +2289,11 @@
}
},
"char_filter": {
"ampersand_mapper": {
"type": "pattern_replace",
"pattern": "&",
"replacement": " AMPERSANDPLACEHOLDER "
},
"punctuation": {
"type": "mapping",
"mappings": [
Expand Down
Loading

0 comments on commit 8c2712b

Please sign in to comment.