From 23b7407b795481d00cc6eb17ede81ae8078036b1 Mon Sep 17 00:00:00 2001
From: Anton Rubin
Date: Thu, 10 Oct 2024 12:06:04 +0100
Subject: [PATCH 1/4] add simple pattern tokenizer docs

Signed-off-by: Anton Rubin
---
 _analyzers/tokenizers/index.md          |  2 +-
 _analyzers/tokenizers/simple-pattern.md | 85 +++++++++++++++++++++++++
 2 files changed, 86 insertions(+), 1 deletion(-)
 create mode 100644 _analyzers/tokenizers/simple-pattern.md

diff --git a/_analyzers/tokenizers/index.md b/_analyzers/tokenizers/index.md
index d401851f60..1abc5ee7ff 100644
--- a/_analyzers/tokenizers/index.md
+++ b/_analyzers/tokenizers/index.md
@@ -2,7 +2,7 @@
 layout: default
 title: Tokenizers
 nav_order: 60
-has_children: false
+has_children: true
 has_toc: false
 ---
 
diff --git a/_analyzers/tokenizers/simple-pattern.md b/_analyzers/tokenizers/simple-pattern.md
new file mode 100644
index 0000000000..d518e9d16b
--- /dev/null
+++ b/_analyzers/tokenizers/simple-pattern.md
@@ -0,0 +1,85 @@
+---
+layout: default
+title: Simple pattern
+parent: Tokenizers
+nav_order: 110
+---
+
+# Simple pattern tokenizer
+
+The `simple_pattern` tokenizer identifies matching sequences in the text based on the regular expression and uses those sequences as tokens.
+
+## Example usage
+
+The following example request creates a new index named `my_index` and configures an analyzer with `simple_pattern` tokenizer:
+
+```json
+PUT /my_index
+{
+  "settings": {
+    "analysis": {
+      "tokenizer": {
+        "my_pattern_tokenizer": {
+          "type": "simple_pattern",
+          "pattern": "\\d+"
+        }
+      },
+      "analyzer": {
+        "my_pattern_analyzer": {
+          "type": "custom",
+          "tokenizer": "my_pattern_tokenizer"
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Generated tokens
+
+Use the following request to examine the tokens generated using the created analyzer:
+
+```json
+POST /my_index/_analyze
+{
+  "analyzer": "my_pattern_analyzer",
+  "text": "OpenSearch-2024-10-09"
+}
+```
+{% include copy-curl.html %}
+
+The response contains the generated tokens:
+
+```json
+{
+  "tokens": [
+    {
+      "token": "2024",
+      "start_offset": 11,
+      "end_offset": 15,
+      "type": "word",
+      "position": 0
+    },
+    {
+      "token": "10",
+      "start_offset": 16,
+      "end_offset": 18,
+      "type": "word",
+      "position": 1
+    },
+    {
+      "token": "09",
+      "start_offset": 19,
+      "end_offset": 21,
+      "type": "word",
+      "position": 2
+    }
+  ]
+}
+```
+
+## Configuration
+
+The `simple_pattern` tokenizer can be configured with parameter `pattern` which is used to split text into tokens. Default is empty string (` `). (String, _Optional_)
+

From abfdf33d0b014d7207bb8b772ad7dd319623c4a8 Mon Sep 17 00:00:00 2001
From: Anton Rubin
Date: Wed, 16 Oct 2024 17:13:45 +0100
Subject: [PATCH 2/4] updating parameter table

Signed-off-by: Anton Rubin
---
 _analyzers/tokenizers/simple-pattern.md | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/_analyzers/tokenizers/simple-pattern.md b/_analyzers/tokenizers/simple-pattern.md
index d518e9d16b..e91469589d 100644
--- a/_analyzers/tokenizers/simple-pattern.md
+++ b/_analyzers/tokenizers/simple-pattern.md
@@ -81,5 +81,9 @@ The response contains the generated tokens:
 
 ## Configuration
 
-The `simple_pattern` tokenizer can be configured with parameter `pattern` which is used to split text into tokens. Default is empty string (` `). (String, _Optional_)
+The `simple_pattern` tokenizer can be configured with the following parameter.
+
+Parameter | Required/Optional | Data type | Description
+:--- | :--- | :--- | :---
+`pattern` | Optional | String | Pattern which is used to split text into tokens. Default is empty string (` `).
 

From b20922030cfb4eeb2f44084538e7b661936f8333 Mon Sep 17 00:00:00 2001
From: Fanit Kolchina
Date: Thu, 5 Dec 2024 14:19:13 -0500
Subject: [PATCH 3/4] Doc review

Signed-off-by: Fanit Kolchina
---
 _analyzers/tokenizers/simple-pattern.md | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/_analyzers/tokenizers/simple-pattern.md b/_analyzers/tokenizers/simple-pattern.md
index e91469589d..0b7be546a0 100644
--- a/_analyzers/tokenizers/simple-pattern.md
+++ b/_analyzers/tokenizers/simple-pattern.md
@@ -7,11 +7,11 @@ nav_order: 110
 
 # Simple pattern tokenizer
 
-The `simple_pattern` tokenizer identifies matching sequences in the text based on the regular expression and uses those sequences as tokens.
+The `simple_pattern` tokenizer identifies matching sequences in the text based on the regular expression and uses those sequences as tokens. It extracts terms that match the regular expression. Use this tokenizer when you want to extract specific patterns as terms directly.
 
 ## Example usage
 
-The following example request creates a new index named `my_index` and configures an analyzer with `simple_pattern` tokenizer:
+The following example request creates a new index named `my_index` and configures an analyzer with a `simple_pattern` tokenizer. The tokenizer extracts numeric terms from text:
 
 ```json
 PUT /my_index
@@ -38,7 +38,7 @@ PUT /my_index
 
 ## Generated tokens
 
-Use the following request to examine the tokens generated using the created analyzer:
+Use the following request to examine the tokens generated using the analyzer:
 
 ```json
 POST /my_index/_analyze
@@ -79,11 +79,11 @@ The response contains the generated tokens:
 }
 ```
 
-## Configuration
+## Parameters
 
 The `simple_pattern` tokenizer can be configured with the following parameter.
 
 Parameter | Required/Optional | Data type | Description
 :--- | :--- | :--- | :---
-`pattern` | Optional | String | Pattern which is used to split text into tokens. Default is empty string (` `).
+`pattern` | Optional | String | The pattern used to split text into tokens specified using a [Lucene regular expression](https://lucene.apache.org/core/9_10_0/core/org/apache/lucene/util/automaton/RegExp.html). Default is an empty string, which returns the input text as one token.
 

From ef128e5bff6056826d6dc7ddb8dd4733339d3788 Mon Sep 17 00:00:00 2001
From: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com>
Date: Mon, 9 Dec 2024 12:53:46 -0500
Subject: [PATCH 4/4] Apply suggestions from code review

Co-authored-by: Nathan Bower
Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com>
---
 _analyzers/tokenizers/simple-pattern.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/_analyzers/tokenizers/simple-pattern.md b/_analyzers/tokenizers/simple-pattern.md
index 0b7be546a0..eacddd6992 100644
--- a/_analyzers/tokenizers/simple-pattern.md
+++ b/_analyzers/tokenizers/simple-pattern.md
@@ -7,7 +7,7 @@ nav_order: 110
 
 # Simple pattern tokenizer
 
-The `simple_pattern` tokenizer identifies matching sequences in the text based on the regular expression and uses those sequences as tokens. It extracts terms that match the regular expression. Use this tokenizer when you want to extract specific patterns as terms directly.
+The `simple_pattern` tokenizer identifies matching sequences in text based on a regular expression and uses those sequences as tokens. It extracts terms that match the regular expression. Use this tokenizer when you want to directly extract specific patterns as terms.
 
 ## Example usage
 
@@ -85,5 +85,5 @@ The `simple_pattern` tokenizer can be configured with the following parameter.
 
 Parameter | Required/Optional | Data type | Description
 :--- | :--- | :--- | :---
-`pattern` | Optional | String | The pattern used to split text into tokens specified using a [Lucene regular expression](https://lucene.apache.org/core/9_10_0/core/org/apache/lucene/util/automaton/RegExp.html). Default is an empty string, which returns the input text as one token.
+`pattern` | Optional | String | The pattern used to split text into tokens, specified using a [Lucene regular expression](https://lucene.apache.org/core/9_10_0/core/org/apache/lucene/util/automaton/RegExp.html). Default is an empty string, which returns the input text as one token.
 
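
A quick way to experiment with `pattern` values before adding them to index settings is to pass an inline tokenizer definition to the `_analyze` API, which accepts a tokenizer object in place of a name. The following sketch reuses the sample text from the docs above and swaps in an alphabetic pattern (`[a-zA-Z]+`, chosen here purely for illustration; it is not part of the patches above):

```json
POST /_analyze
{
  "tokenizer": {
    "type": "simple_pattern",
    "pattern": "[a-zA-Z]+"
  },
  "text": "OpenSearch-2024-10-09"
}
```

Because the hyphens and digits fall outside the character class, this should return a single token, `OpenSearch`.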