From 23b7407b795481d00cc6eb17ede81ae8078036b1 Mon Sep 17 00:00:00 2001
From: Anton Rubin
Date: Thu, 10 Oct 2024 12:06:04 +0100
Subject: [PATCH 1/4] add simple pattern tokenizer docs

Signed-off-by: Anton Rubin
---
 _analyzers/tokenizers/index.md          |  2 +-
 _analyzers/tokenizers/simple-pattern.md | 85 +++++++++++++++++++++++++
 2 files changed, 86 insertions(+), 1 deletion(-)
 create mode 100644 _analyzers/tokenizers/simple-pattern.md

diff --git a/_analyzers/tokenizers/index.md b/_analyzers/tokenizers/index.md
index d401851f60..1abc5ee7ff 100644
--- a/_analyzers/tokenizers/index.md
+++ b/_analyzers/tokenizers/index.md
@@ -2,7 +2,7 @@
 layout: default
 title: Tokenizers
 nav_order: 60
-has_children: false
+has_children: true
 has_toc: false
 ---
 
diff --git a/_analyzers/tokenizers/simple-pattern.md b/_analyzers/tokenizers/simple-pattern.md
new file mode 100644
index 0000000000..d518e9d16b
--- /dev/null
+++ b/_analyzers/tokenizers/simple-pattern.md
@@ -0,0 +1,85 @@
+---
+layout: default
+title: Simple pattern
+parent: Tokenizers
+nav_order: 110
+---
+
+# Simple pattern tokenizer
+
+The `simple_pattern` tokenizer identifies matching sequences in the text based on the regular expression and uses those sequences as tokens.
+
+## Example usage
+
+The following example request creates a new index named `my_index` and configures an analyzer with `simple_pattern` tokenizer:
+
+```json
+PUT /my_index
+{
+  "settings": {
+    "analysis": {
+      "tokenizer": {
+        "my_pattern_tokenizer": {
+          "type": "simple_pattern",
+          "pattern": "\\d+"
+        }
+      },
+      "analyzer": {
+        "my_pattern_analyzer": {
+          "type": "custom",
+          "tokenizer": "my_pattern_tokenizer"
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Generated tokens
+
+Use the following request to examine the tokens generated using the created analyzer:
+
+```json
+POST /my_index/_analyze
+{
+  "analyzer": "my_pattern_analyzer",
+  "text": "OpenSearch-2024-10-09"
+}
+```
+{% include copy-curl.html %}
+
+The response contains the generated tokens:
+
+```json
+{
+  "tokens": [
+    {
+      "token": "2024",
+      "start_offset": 11,
+      "end_offset": 15,
+      "type": "word",
+      "position": 0
+    },
+    {
+      "token": "10",
+      "start_offset": 16,
+      "end_offset": 18,
+      "type": "word",
+      "position": 1
+    },
+    {
+      "token": "09",
+      "start_offset": 19,
+      "end_offset": 21,
+      "type": "word",
+      "position": 2
+    }
+  ]
+}
+```
+
+## Configuration
+
+The `simple_pattern` tokenizer can be configured with parameter `pattern` which is used to split text into tokens. Default is empty string (` `). (String, _Optional_)
+

From abfdf33d0b014d7207bb8b772ad7dd319623c4a8 Mon Sep 17 00:00:00 2001
From: Anton Rubin
Date: Wed, 16 Oct 2024 17:13:45 +0100
Subject: [PATCH 2/4] updating parameter table

Signed-off-by: Anton Rubin
---
 _analyzers/tokenizers/simple-pattern.md | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/_analyzers/tokenizers/simple-pattern.md b/_analyzers/tokenizers/simple-pattern.md
index d518e9d16b..e91469589d 100644
--- a/_analyzers/tokenizers/simple-pattern.md
+++ b/_analyzers/tokenizers/simple-pattern.md
@@ -81,5 +81,9 @@ The response contains the generated tokens:
 
 ## Configuration
 
-The `simple_pattern` tokenizer can be configured with parameter `pattern` which is used to split text into tokens. Default is empty string (` `). (String, _Optional_)
+The `simple_pattern` tokenizer can be configured with the following parameter.
+
+Parameter | Required/Optional | Data type | Description
+:--- | :--- | :--- | :---
+`pattern` | Optional | String | Pattern which is used to split text into tokens. Default is empty string (` `).
 

From b20922030cfb4eeb2f44084538e7b661936f8333 Mon Sep 17 00:00:00 2001
From: Fanit Kolchina
Date: Thu, 5 Dec 2024 14:19:13 -0500
Subject: [PATCH 3/4] Doc review

Signed-off-by: Fanit Kolchina
---
 _analyzers/tokenizers/simple-pattern.md | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/_analyzers/tokenizers/simple-pattern.md b/_analyzers/tokenizers/simple-pattern.md
index e91469589d..0b7be546a0 100644
--- a/_analyzers/tokenizers/simple-pattern.md
+++ b/_analyzers/tokenizers/simple-pattern.md
@@ -7,11 +7,11 @@ nav_order: 110
 
 # Simple pattern tokenizer
 
-The `simple_pattern` tokenizer identifies matching sequences in the text based on the regular expression and uses those sequences as tokens.
+The `simple_pattern` tokenizer identifies matching sequences in the text based on the regular expression and uses those sequences as tokens. It extracts terms that match the regular expression. Use this tokenizer when you want to extract specific patterns as terms directly.
 
 ## Example usage
 
-The following example request creates a new index named `my_index` and configures an analyzer with `simple_pattern` tokenizer:
+The following example request creates a new index named `my_index` and configures an analyzer with a `simple_pattern` tokenizer. The tokenizer extracts numeric terms from text:
 
 ```json
 PUT /my_index
@@ -38,7 +38,7 @@ PUT /my_index
 
 ## Generated tokens
 
-Use the following request to examine the tokens generated using the created analyzer:
+Use the following request to examine the tokens generated using the analyzer:
 
 ```json
 POST /my_index/_analyze
@@ -79,11 +79,11 @@ The response contains the generated tokens:
 }
 ```
 
-## Configuration
+## Parameters
 
 The `simple_pattern` tokenizer can be configured with the following parameter.
 
 Parameter | Required/Optional | Data type | Description
 :--- | :--- | :--- | :---
-`pattern` | Optional | String | Pattern which is used to split text into tokens. Default is empty string (` `).
+`pattern` | Optional | String | The pattern used to split text into tokens specified using a [Lucene regular expression](https://lucene.apache.org/core/9_10_0/core/org/apache/lucene/util/automaton/RegExp.html). Default is an empty string, which returns the input text as one token.
 

From ef128e5bff6056826d6dc7ddb8dd4733339d3788 Mon Sep 17 00:00:00 2001
From: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com>
Date: Mon, 9 Dec 2024 12:53:46 -0500
Subject: [PATCH 4/4] Apply suggestions from code review

Co-authored-by: Nathan Bower
Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com>
---
 _analyzers/tokenizers/simple-pattern.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/_analyzers/tokenizers/simple-pattern.md b/_analyzers/tokenizers/simple-pattern.md
index 0b7be546a0..eacddd6992 100644
--- a/_analyzers/tokenizers/simple-pattern.md
+++ b/_analyzers/tokenizers/simple-pattern.md
@@ -7,7 +7,7 @@ nav_order: 110
 
 # Simple pattern tokenizer
 
-The `simple_pattern` tokenizer identifies matching sequences in the text based on the regular expression and uses those sequences as tokens. It extracts terms that match the regular expression. Use this tokenizer when you want to extract specific patterns as terms directly.
+The `simple_pattern` tokenizer identifies matching sequences in text based on a regular expression and uses those sequences as tokens. It extracts terms that match the regular expression. Use this tokenizer when you want to directly extract specific patterns as terms.
 
 ## Example usage
 
@@ -85,5 +85,5 @@ The `simple_pattern` tokenizer can be configured with the following parameter.
 
 Parameter | Required/Optional | Data type | Description
 :--- | :--- | :--- | :---
-`pattern` | Optional | String | The pattern used to split text into tokens specified using a [Lucene regular expression](https://lucene.apache.org/core/9_10_0/core/org/apache/lucene/util/automaton/RegExp.html). Default is an empty string, which returns the input text as one token.
+`pattern` | Optional | String | The pattern used to split text into tokens, specified using a [Lucene regular expression](https://lucene.apache.org/core/9_10_0/core/org/apache/lucene/util/automaton/RegExp.html). Default is an empty string, which returns the input text as one token.
 
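
A quick way to experiment with `pattern` values before adding them to index settings is to pass an inline tokenizer definition to the `_analyze` API, which accepts a tokenizer object in place of a name. The following sketch reuses the sample text from the docs above and swaps in an alphabetic pattern (`[a-zA-Z]+`, chosen here purely for illustration; it is not part of the patches above):

```json
POST /_analyze
{
  "tokenizer": {
    "type": "simple_pattern",
    "pattern": "[a-zA-Z]+"
  },
  "text": "OpenSearch-2024-10-09"
}
```

Because the hyphens and digits fall outside the character class, this should return a single token, `OpenSearch`.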