From 6baec7616a5e908ba4f2d778d6d62ae3ba0bf99a Mon Sep 17 00:00:00 2001 From: Anton Rubin Date: Mon, 16 Sep 2024 13:56:57 +0100 Subject: [PATCH 1/3] adding predicate token filter docs #8272 Signed-off-by: Anton Rubin --- _analyzers/token-filters/index.md | 2 +- .../token-filters/predicate-token-filter.md | 82 +++++++++++++++++++ 2 files changed, 83 insertions(+), 1 deletion(-) create mode 100644 _analyzers/token-filters/predicate-token-filter.md diff --git a/_analyzers/token-filters/index.md b/_analyzers/token-filters/index.md index a9b621d5ab..7c8af8aec5 100644 --- a/_analyzers/token-filters/index.md +++ b/_analyzers/token-filters/index.md @@ -47,7 +47,7 @@ Normalization | `arabic_normalization`: [ArabicNormalizer](https://lucene.apache `pattern_replace` | N/A | Matches a pattern in the provided regular expression and replaces matching substrings. Uses [Java regular expression syntax](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html). `phonetic` | N/A | Uses a phonetic encoder to emit a metaphone token for each token in the token stream. Requires installing the `analysis-phonetic` plugin. `porter_stem` | [PorterStemFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/en/PorterStemFilter.html) | Uses the [Porter stemming algorithm](https://tartarus.org/martin/PorterStemmer/) to perform algorithmic stemming for the English language. -`predicate_token_filter` | N/A | Removes tokens that don’t match the specified predicate script. Supports inline Painless scripts only. +[`predicate_token_filter`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/predicate-token-filter/) | N/A | Removes tokens that don’t match the specified predicate script. Supports inline Painless scripts only. `remove_duplicates` | [RemoveDuplicatesTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/RemoveDuplicatesTokenFilter.html) | Removes duplicate tokens that are in the same position. `reverse` | [ReverseStringFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/reverse/ReverseStringFilter.html) | Reverses the string corresponding to each token in the token stream. For example, the token `dog` becomes `god`. `shingle` | [ShingleFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/shingle/ShingleFilter.html) | Generates shingles of lengths between `min_shingle_size` and `max_shingle_size` for tokens in the token stream. Shingles are similar to n-grams but apply to words instead of letters. For example, two-word shingles added to the list of unigrams [`contribute`, `to`, `opensearch`] are [`contribute to`, `to opensearch`]. diff --git a/_analyzers/token-filters/predicate-token-filter.md b/_analyzers/token-filters/predicate-token-filter.md new file mode 100644 index 0000000000..faee3322bc --- /dev/null +++ b/_analyzers/token-filters/predicate-token-filter.md @@ -0,0 +1,82 @@ +--- +layout: default +title: Predicate token filter +parent: Token filters +nav_order: 340 +--- + +# Predicate token filter + +The `predicate_token_filter` evaluates whether tokens should be kept or discarded, depending on the conditions defined in a custom script. + +## Parameters + +The `predicate_token_filter` has one required pamameter: `script`. This parameter should outline the condition which is used to evaluate if the token should be kept. 
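+
+The script has access to the attributes of each token. For example, the following filter definition is a minimal sketch of a predicate based on token position rather than the term length used in the example below; it assumes that the script context exposes a `token.position` property in addition to `token.term`, and `keep_after_first` is a hypothetical filter name:
+
+```json
+"keep_after_first": {
+  "type": "predicate_token_filter",
+  "script": {
+    "source": "token.position > 0"
+  }
+}
+```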
+
+## Example
+
+The following example request creates a new index named `predicate_index` and configures an analyzer with `predicate_token_filter`:
+
+```json
+PUT /predicate_index
+{
+  "settings": {
+    "analysis": {
+      "filter": {
+        "my_predicate_filter": {
+          "type": "predicate_token_filter",
+          "script": {
+            "source": "token.term.length() > 7"
+          }
+        }
+      },
+      "analyzer": {
+        "predicate_analyzer": {
+          "tokenizer": "standard",
+          "filter": [
+            "lowercase",
+            "my_predicate_filter"
+          ]
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Generated tokens
+
+Use the following request to examine the tokens generated using the analyzer:
+
+```json
+POST /predicate_index/_analyze
+{
+  "text": "The OpenSearch community is growing rapidly",
+  "analyzer": "predicate_analyzer"
+}
+```
+{% include copy-curl.html %}
+
+The response contains the generated tokens:
+
+```json
+{
+  "tokens": [
+    {
+      "token": "opensearch",
+      "start_offset": 4,
+      "end_offset": 14,
+      "type": "<ALPHANUM>",
+      "position": 1
+    },
+    {
+      "token": "community",
+      "start_offset": 15,
+      "end_offset": 24,
+      "type": "<ALPHANUM>",
+      "position": 2
+    }
+  ]
+}
+```

From 0301fe640a90d3f45971a5c423c871b856bba522 Mon Sep 17 00:00:00 2001
From: Fanit Kolchina
Date: Mon, 18 Nov 2024 14:33:28 -0500
Subject: [PATCH 2/3] Doc review

Signed-off-by: Fanit Kolchina
---
 _analyzers/token-filters/predicate-token-filter.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/_analyzers/token-filters/predicate-token-filter.md b/_analyzers/token-filters/predicate-token-filter.md
index faee3322bc..e84d7d3677 100644
--- a/_analyzers/token-filters/predicate-token-filter.md
+++ b/_analyzers/token-filters/predicate-token-filter.md
@@ -7,15 +7,15 @@
 
 # Predicate token filter
 
-The `predicate_token_filter` evaluates whether tokens should be kept or discarded, depending on the conditions defined in a custom script.
+The `predicate_token_filter` evaluates whether tokens should be kept or discarded, depending on the conditions defined in a custom script. The tokens are evaulated in the analysis predicate context. This filter supports only inline Painless scripts.
 
 ## Parameters
 
-The `predicate_token_filter` has one required pamameter: `script`. This parameter should outline the condition which is used to evaluate if the token should be kept. 
+The `predicate_token_filter` has one required parameter: `script`. This parameter provides a condition that is used to evaluate if the token should be kept.
 
 ## Example
 
-The following example request creates a new index named `predicate_index` and configures an analyzer with `predicate_token_filter`:
+The following example request creates a new index named `predicate_index` and configures an analyzer with a `predicate_token_filter`. 
The filter specifies to output tokens only if they are longer than 7 characters: ```json PUT /predicate_index From 88385165a6c556fed1d0fbfbcf5d8321940e56dc Mon Sep 17 00:00:00 2001 From: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Date: Mon, 25 Nov 2024 14:06:12 -0500 Subject: [PATCH 3/3] Apply suggestions from code review Co-authored-by: Nathan Bower Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> --- _analyzers/token-filters/index.md | 2 +- _analyzers/token-filters/predicate-token-filter.md | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/_analyzers/token-filters/index.md b/_analyzers/token-filters/index.md index 7c8af8aec5..e1f0d74e77 100644 --- a/_analyzers/token-filters/index.md +++ b/_analyzers/token-filters/index.md @@ -47,7 +47,7 @@ Normalization | `arabic_normalization`: [ArabicNormalizer](https://lucene.apache `pattern_replace` | N/A | Matches a pattern in the provided regular expression and replaces matching substrings. Uses [Java regular expression syntax](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html). `phonetic` | N/A | Uses a phonetic encoder to emit a metaphone token for each token in the token stream. Requires installing the `analysis-phonetic` plugin. `porter_stem` | [PorterStemFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/en/PorterStemFilter.html) | Uses the [Porter stemming algorithm](https://tartarus.org/martin/PorterStemmer/) to perform algorithmic stemming for the English language. -[`predicate_token_filter`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/predicate-token-filter/) | N/A | Removes tokens that don’t match the specified predicate script. Supports inline Painless scripts only. +[`predicate_token_filter`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/predicate-token-filter/) | N/A | Removes tokens that do not match the specified predicate script. Supports only inline Painless scripts. `remove_duplicates` | [RemoveDuplicatesTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/RemoveDuplicatesTokenFilter.html) | Removes duplicate tokens that are in the same position. `reverse` | [ReverseStringFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/reverse/ReverseStringFilter.html) | Reverses the string corresponding to each token in the token stream. For example, the token `dog` becomes `god`. `shingle` | [ShingleFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/shingle/ShingleFilter.html) | Generates shingles of lengths between `min_shingle_size` and `max_shingle_size` for tokens in the token stream. Shingles are similar to n-grams but apply to words instead of letters. For example, two-word shingles added to the list of unigrams [`contribute`, `to`, `opensearch`] are [`contribute to`, `to opensearch`]. diff --git a/_analyzers/token-filters/predicate-token-filter.md b/_analyzers/token-filters/predicate-token-filter.md index e84d7d3677..24729f0224 100644 --- a/_analyzers/token-filters/predicate-token-filter.md +++ b/_analyzers/token-filters/predicate-token-filter.md @@ -7,15 +7,15 @@ nav_order: 340 # Predicate token filter -The `predicate_token_filter` evaluates whether tokens should be kept or discarded, depending on the conditions defined in a custom script. The tokens are evaulated in the analysis predicate context. This filter supports only inline Painless scripts. 
+The `predicate_token_filter` evaluates whether tokens should be kept or discarded, depending on the conditions defined in a custom script. The tokens are evaluated in the analysis predicate context. This filter supports only inline Painless scripts. ## Parameters -The `predicate_token_filter` has one required parameter: `script`. This parameter provides a condition that is used to evaluate if the token should be kept. +The `predicate_token_filter` has one required parameter: `script`. This parameter provides a condition that is used to evaluate whether the token should be kept. ## Example -The following example request creates a new index named `predicate_index` and configures an analyzer with a `predicate_token_filter`. The filter specifies to output tokens only if they are longer than 7 characters: +The following example request creates a new index named `predicate_index` and configures an analyzer with a `predicate_token_filter`. The filter specifies to only output tokens if they are longer than 7 characters: ```json PUT /predicate_index