diff --git a/.github/workflows/jekyll-spec-insert.yml b/.github/workflows/jekyll-spec-insert.yml new file mode 100644 index 0000000000..cefd477be2 --- /dev/null +++ b/.github/workflows/jekyll-spec-insert.yml @@ -0,0 +1,20 @@ +name: Lint and Test Jekyll Spec Insert +on: + push: + paths: + - 'spec-insert/**' + pull_request: + paths: + - 'spec-insert/**' +jobs: + lint-and-test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: ruby/setup-ruby@v1 + with: { ruby-version: 3.3.0 } + - run: bundle install + - working-directory: spec-insert + run: | + bundle exec rubocop + bundle exec rspec diff --git a/.github/workflows/update-api-components.yml b/.github/workflows/update-api-components.yml new file mode 100644 index 0000000000..42cc1d2827 --- /dev/null +++ b/.github/workflows/update-api-components.yml @@ -0,0 +1,52 @@ +name: Update API Components +on: + workflow_dispatch: + schedule: + - cron: "0 0 * * 0" # Every Sunday at midnight GMT +jobs: + update-api-components: + if: ${{ github.repository == 'opensearch-project/documentation-website' }} + runs-on: ubuntu-latest + permissions: + contents: write + pull-requests: write + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + fetch-depth: 0 + + - run: git config --global pull.rebase true + + - uses: ruby/setup-ruby@v1 + with: { ruby-version: 3.3.0 } + + - run: bundle install + + - name: Download spec and insert into documentation + run: bundle exec jekyll spec-insert + + - name: Get current date + id: date + run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_ENV + + - name: GitHub App token + id: github_app_token + uses: tibdex/github-app-token@v2.1.0 + with: + app_id: ${{ secrets.APP_ID }} + private_key: ${{ secrets.APP_PRIVATE_KEY }} + + - name: Create pull request + uses: peter-evans/create-pull-request@v6 + with: + token: ${{ steps.github_app_token.outputs.token }} + commit-message: "Updated API components to reflect the latest OpenSearch API spec (${{ env.date }})" + title: "[AUTOCUT] Update API components to reflect the latest OpenSearch API spec (${{ env.date }})" + body: | + Update API components to reflect the latest [OpenSearch API spec](https://github.com/opensearch-project/opensearch-api-specification/releases/download/main-latest/opensearch-openapi.yaml). + Date: ${{ env.date }} + branch: update-api-components-${{ env.date }} + base: main + signoff: true + labels: autocut \ No newline at end of file diff --git a/.ruby-version b/.ruby-version deleted file mode 100644 index 4772543317..0000000000 --- a/.ruby-version +++ /dev/null @@ -1 +0,0 @@ -3.3.2 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index a3628b3b6e..f6d9b87ee8 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -84,7 +84,7 @@ Follow these steps to set up your local copy of the repository: ``` curl -sSL https://get.rvm.io | bash -s stable - rvm install 3.2.4 + rvm install 3.3.2 ruby -v ``` @@ -158,6 +158,23 @@ To ensure that our documentation adheres to the [OpenSearch Project Style Guidel Optionally, you can install the [Vale VSCode](https://github.com/chrischinchilla/vale-vscode) extension, which integrates Vale with Visual Studio Code. By default, only _errors_ and _warnings_ are underlined. To change the minimum alert level to include _suggestions_, go to **Vale VSCode** > **Extension Settings** and select **suggestion** in the **Vale > Vale CLI: Min Alert Level** dropdown list. +## Troubleshooting + +This section provides information about potential solutions for known issues. 
+
+### Installing Ruby on an Apple silicon machine
+
+If you're having trouble installing Ruby with `rvm` on an Apple silicon machine, it could be because of an OpenSSL version mismatch. To fix this issue, use the following commands, replacing `<version>` with your [desired version](https://github.com/ruby/openssl/blob/master/README.md):
+
+```
+# Assumes Homebrew is installed
+curl -sSL https://get.rvm.io | bash -s stable
+rvm install 3.3.2 --with-openssl-dir=$(brew --prefix openssl@<version>)
+ruby -v
+```
+
 ## Getting help
 
 For help with the contribution process, reach out to one of the [points of contact](README.md#points-of-contact).
+
+
diff --git a/DEVELOPER_GUIDE.md b/DEVELOPER_GUIDE.md
new file mode 100644
index 0000000000..9b0ec1c79d
--- /dev/null
+++ b/DEVELOPER_GUIDE.md
@@ -0,0 +1,135 @@
+# Developer guide
+- [Introduction](#introduction)
+- [Starting the Jekyll server locally](#starting-the-jekyll-server-locally)
+- [Using the spec-insert Jekyll plugin](#using-the-spec-insert-jekyll-plugin)
+  - [Ignoring files and folders](#ignoring-files-and-folders)
+- [CI/CD](#cicd)
+- [Spec insert components](#spec-insert-components)
+  - [Query parameters](#query-parameters)
+  - [Path parameters](#path-parameters)
+  - [Paths and HTTP methods](#paths-and-http-methods)
+
+## Introduction
+
+The `.md` documents in this repository are rendered into HTML pages using [Jekyll](https://jekyllrb.com/). These HTML pages are hosted on [opensearch.org](https://opensearch.org/docs/latest/).
+
+## Starting the Jekyll server locally
+
+You can run the Jekyll server locally to view the rendered HTML pages using the following steps:
+
+1. Install [Ruby](https://www.ruby-lang.org/en/documentation/installation/) 3.1.0 or later for your operating system.
+2. Install the required gems by running `bundle install`.
+3. Run `bundle exec jekyll serve` to start the Jekyll server locally (this can take several minutes to complete).
+4. Open your browser and navigate to `http://localhost:4000` to view the rendered HTML pages.
+
+## Using the `spec-insert` Jekyll plugin
+
+The `spec-insert` Jekyll plugin is used to insert API components into Markdown files. The plugin downloads the [latest OpenSearch specification](https://github.com/opensearch-project/opensearch-api-specification) and renders the API components from the spec. This aims to reduce the manual effort required to keep the documentation up to date.
+
+To use this plugin, make sure that you have installed Ruby 3.1.0 or later and the required gems by running `bundle install`.
+
+Edit your Markdown file and insert the following snippet where you want to render an API component:
+
+```markdown
+<!-- spec_insert_start
+api: search
+component: query_parameters
+-->
+
+This is where the API component will be inserted.
+Everything between the `spec_insert_start` and `spec_insert_end` tags will be overwritten.
+
+<!-- spec_insert_end -->
+```
+
+Then run the following Jekyll command to render the API components:
+
+```shell
+bundle exec jekyll spec-insert
+```
+
+If you are working on multiple Markdown files and do not want to keep rerunning the `jekyll spec-insert` command, you can add the `--watch` (or `-W`) flag to the command to watch for changes in the Markdown files and automatically render the API components:
+
+```shell
+bundle exec jekyll spec-insert --watch
+```
+
+Depending on the text editor you are using, you may need to manually reload the file from disk to see the changes applied by the plugin if the editor does not reload files automatically.
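+For reference, after the plugin runs, everything between the tags is replaced with the rendered component. The following is a sketch of what the filled-in snippet from the previous example might look like (illustrative only; the actual rows and formatting are generated from the downloaded spec and will differ):
+
+```markdown
+<!-- spec_insert_start
+api: search
+component: query_parameters
+-->
+## Query parameters
+
+Parameter | Type | Description
+:--- | :--- | :---
+`analyzer` | String | The analyzer to use for the query string.
+`size` | Integer | The number of results to return.
+<!-- spec_insert_end -->
+```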
+The plugin will pull the newest OpenSearch API spec from its [repository](https://github.com/opensearch-project/opensearch-api-specification) if the spec file does not exist locally or if it is older than 24 hours. To tell the plugin to always pull the newest spec, add the `--refresh-spec` (or `-R`) flag to the command:
+
+```shell
+bundle exec jekyll spec-insert --refresh-spec
+```
+
+### Ignoring files and folders
+
+The `spec-insert` plugin ignores all files and folders listed in the [./_config.yml#exclude](./_config.yml) list, which is also the list of files and folders that Jekyll ignores.
+
+## CI/CD
+
+The `spec-insert` plugin is run as part of the CI/CD pipeline to ensure that the API components in the documentation are up to date. This is performed through the [update-api-components.yml](.github/workflows/update-api-components.yml) GitHub Actions workflow, which creates a pull request containing the updated API components every Sunday.
+
+## Spec insert components
+
+All spec insert components accept the following arguments:
+- `api` (String; required): The name of the API to render the component from. This is equivalent to the `x-operation-group` field in the OpenSearch OpenAPI spec.
+- `component` (String; required): The name of the component to render, such as `query_parameters`, `path_parameters`, or `paths_and_http_methods`.
+- `omit_header` (Boolean; default is `false`): If set to `true`, the Markdown header of the component is not rendered.
+
+### Paths and HTTP methods
+
+To insert the paths and HTTP methods for the `search` API, use the following snippet:
+
+```markdown
+<!-- spec_insert_start
+api: search
+component: paths_and_http_methods
+-->
+<!-- spec_insert_end -->
+```
+
+### Path parameters
+
+To insert the path parameters table for the `indices.create` API, use the following snippet. Use the `x-operation-group` field from the OpenSearch OpenAPI spec for the `api` value:
+
+```markdown
+<!-- spec_insert_start
+api: indices.create
+component: path_parameters
+-->
+<!-- spec_insert_end -->
+```
+
+This component accepts the same arguments as the query parameters component, except for the `include_global` argument.
+
+### Query parameters
+
+To insert the query parameters table for the `cat.indices` API, use the following snippet:
+
+```markdown
+<!-- spec_insert_start
+api: cat.indices
+component: query_parameters
+-->
+<!-- spec_insert_end -->
+```
+
+This inserts the query parameters of the `cat.indices` API into the `.md` file with three default columns: `Parameter`, `Type`, and `Description`. You can customize the query parameters table by adding the `columns` argument, which accepts a comma-separated list of column names. The available column names are:
+
+- `Parameter`
+- `Type`
+- `Description`
+- `Required`
+- `Default`
+
+_When `Required` or `Default` is not selected, that information is included in the `Description` column._
+
+You can also customize this component with the following settings:
+
+- `include_global` (Boolean; default is `false`): Includes global query parameters in the table.
+- `include_deprecated` (Boolean; default is `true`): Includes deprecated parameters in the table.
+- `pretty` (Boolean; default is `false`): Renders the table in the pretty format instead of the compact format.
+
+The following snippet inserts the specified columns into the query parameters table:
+
+```markdown
+<!-- spec_insert_start
+api: cat.indices
+component: query_parameters
+columns: Parameter, Type, Description, Required, Default
+-->
+<!-- spec_insert_end -->
+```
diff --git a/Gemfile b/Gemfile
index 7825dcd02b..fee04f3c48 100644
--- a/Gemfile
+++ b/Gemfile
@@ -1,4 +1,9 @@
-source "http://rubygems.org"
+# frozen_string_literal: true
+
+source 'https://rubygems.org'
+
+# Manually add csv gem since Ruby 3.4.0 no longer includes it
+gem 'csv', '~> 3.0'
 
 # Hello! This is where you manage which Jekyll version is used to run.
# When you want to use a different version, change it below, save the @@ -8,12 +13,12 @@ source "http://rubygems.org" # # This will help ensure the proper Jekyll version is running. # Happy Jekylling! -gem "jekyll", "~> 4.3.2" +gem 'jekyll', '~> 4.3.2' # This is the default theme for new Jekyll sites. You may change this to anything you like. -gem "just-the-docs", "~> 0.3.3" -gem "jekyll-remote-theme", "~> 0.4" -gem "jekyll-redirect-from", "~> 0.16" +gem 'jekyll-redirect-from', '~> 0.16' +gem 'jekyll-remote-theme', '~> 0.4' +gem 'just-the-docs', '~> 0.3.3' # If you want to use GitHub Pages, remove the "gem "jekyll"" above and # uncomment the line below. To upgrade, run `bundle update github-pages`. @@ -22,21 +27,31 @@ gem "jekyll-redirect-from", "~> 0.16" # If you have any plugins, put them here! group :jekyll_plugins do - gem "jekyll-last-modified-at" - gem "jekyll-sitemap" + gem 'jekyll-last-modified-at' + gem 'jekyll-sitemap' + gem 'jekyll-spec-insert', :path => './spec-insert' end # Windows does not include zoneinfo files, so bundle the tzinfo-data gem -gem "tzinfo-data", platforms: [:mingw, :mswin, :x64_mingw, :jruby] +gem 'tzinfo-data', platforms: %i[mingw mswin x64_mingw jruby] # Performance-booster for watching directories on Windows -gem "wdm", "~> 0.1.0" if Gem.win_platform? +gem 'wdm', '~> 0.1.0' if Gem.win_platform? # Installs webrick dependency for building locally -gem "webrick", "~> 1.7" - +gem 'webrick', '~> 1.7' # Link checker -gem "typhoeus" -gem "ruby-link-checker" -gem "ruby-enum" +gem 'ruby-enum' +gem 'ruby-link-checker' +gem 'typhoeus' + +# Spec Insert +gem 'activesupport', '~> 7' +gem 'mustache', '~> 1' + +group :development, :test do + gem 'rspec' + gem 'rubocop', '~> 1.44', require: false + gem 'rubocop-rake', require: false +end diff --git a/README.md b/README.md index 66beb1948c..52321335c7 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,7 @@ # About the OpenSearch documentation repo The `documentation-website` repository contains the user documentation for OpenSearch. You can find the rendered documentation at [opensearch.org/docs](https://opensearch.org/docs). +The markdown files in this repository are rendered into HTML pages using [Jekyll](https://jekyllrb.com/). Check the [DEVELOPER_GUIDE](DEVELOPER_GUIDE.md) for more information about how to use Jekyll for this repository. ## Contributing diff --git a/_aggregations/bucket/nested.md b/_aggregations/bucket/nested.md index 89c44c6457..affda8e437 100644 --- a/_aggregations/bucket/nested.md +++ b/_aggregations/bucket/nested.md @@ -96,8 +96,8 @@ GET logs/_search "aggregations" : { "pages" : { "doc_count" : 2, - "min_price" : { - "value" : 200.0 + "min_load_time" : { + "value" : 200 } } } diff --git a/_analyzers/character-filters/html-character-filter.md b/_analyzers/character-filters/html-character-filter.md index 9fb98d9744..ef55930bdf 100644 --- a/_analyzers/character-filters/html-character-filter.md +++ b/_analyzers/character-filters/html-character-filter.md @@ -1,11 +1,11 @@ --- layout: default -title: html_strip character filter +title: HTML strip parent: Character filters nav_order: 100 --- -# `html_strip` character filter +# HTML strip character filter The `html_strip` character filter removes HTML tags, such as `
<div>`, `<p>
`, and ``, from the input text and renders plain text. The filter can be configured to preserve certain tags or decode specific HTML entities, such as ` `, into spaces. diff --git a/_analyzers/language-analyzers.md b/_analyzers/language-analyzers.md deleted file mode 100644 index ca4ba320dd..0000000000 --- a/_analyzers/language-analyzers.md +++ /dev/null @@ -1,44 +0,0 @@ ---- -layout: default -title: Language analyzers -nav_order: 100 -parent: Analyzers -redirect_from: - - /query-dsl/analyzers/language-analyzers/ ---- - -# Language analyzers - -OpenSearch supports the following language analyzers: -`arabic`, `armenian`, `basque`, `bengali`, `brazilian`, `bulgarian`, `catalan`, `czech`, `danish`, `dutch`, `english`, `estonian`, `finnish`, `french`, `galician`, `german`, `greek`, `hindi`, `hungarian`, `indonesian`, `irish`, `italian`, `latvian`, `lithuanian`, `norwegian`, `persian`, `portuguese`, `romanian`, `russian`, `sorani`, `spanish`, `swedish`, `turkish`, and `thai`. - -To use the analyzer when you map an index, specify the value within your query. For example, to map your index with the French language analyzer, specify the `french` value for the analyzer field: - -```json - "analyzer": "french" -``` - -#### Example request - -The following query specifies the `french` language analyzer for the index `my-index`: - -```json -PUT my-index -{ - "mappings": { - "properties": { - "text": { - "type": "text", - "fields": { - "french": { - "type": "text", - "analyzer": "french" - } - } - } - } - } -} -``` - - diff --git a/_analyzers/language-analyzers/arabic.md b/_analyzers/language-analyzers/arabic.md new file mode 100644 index 0000000000..e61c684cbb --- /dev/null +++ b/_analyzers/language-analyzers/arabic.md @@ -0,0 +1,182 @@ +--- +layout: default +title: Arabic +parent: Language analyzers +grand_parent: Analyzers +nav_order: 10 +--- + +# Arabic analyzer + +The built-in `arabic` analyzer can be applied to a text field using the following command: + +```json +PUT /arabic-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "arabic" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_arabic +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_arabic_analyzer":{ + "type":"arabic", + "stem_exclusion":["تكنولوجيا","سلطة "] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Arabic analyzer internals + +The `arabic` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - decimal_digit + - stop (Arabic) + - normalization (Arabic) + - keyword + - stemmer (Arabic) + +## Custom Arabic analyzer + +You can create a custom Arabic analyzer using the following command: + +```json +PUT /arabic-index +{ + "settings": { + "analysis": { + "filter": { + "arabic_stop": { + "type": "stop", + "stopwords": "_arabic_" + }, + "arabic_stemmer": { + "type": "stemmer", + "language": "arabic" + }, + "arabic_normalization": { + "type": "arabic_normalization" + }, + "decimal_digit": { + "type": "decimal_digit" + }, + "arabic_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "arabic_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "arabic_normalization", + "decimal_digit", + "arabic_stop", + "arabic_keywords", + "arabic_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { 
+ "content": { + "type": "text", + "analyzer": "arabic_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /arabic-index/_analyze +{ + "field": "content", + "text": "الطلاب يدرسون في الجامعات العربية. أرقامهم ١٢٣٤٥٦." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "طلاب", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + }, + { + "token": "يدرس", + "start_offset": 7, + "end_offset": 13, + "type": "", + "position": 1 + }, + { + "token": "جامع", + "start_offset": 17, + "end_offset": 25, + "type": "", + "position": 3 + }, + { + "token": "عرب", + "start_offset": 26, + "end_offset": 33, + "type": "", + "position": 4 + }, + { + "token": "ارقامهم", + "start_offset": 35, + "end_offset": 42, + "type": "", + "position": 5 + }, + { + "token": "123456", + "start_offset": 43, + "end_offset": 49, + "type": "", + "position": 6 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/armenian.md b/_analyzers/language-analyzers/armenian.md new file mode 100644 index 0000000000..9bd0549c80 --- /dev/null +++ b/_analyzers/language-analyzers/armenian.md @@ -0,0 +1,137 @@ +--- +layout: default +title: Armenian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 20 +--- + +# Armenian analyzer + +The built-in `armenian` analyzer can be applied to a text field using the following command: + +```json +PUT /arabic-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "armenian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_armenian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_armenian_analyzer": { + "type": "armenian", + "stem_exclusion": ["բարև", "խաղաղություն"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Armenian analyzer internals + +The `armenian` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Armenian) + - keyword + - stemmer (Armenian) + +## Custom Armenian analyzer + +You can create a custom Armenian analyzer using the following command: + +```json +PUT /armenian-index +{ + "settings": { + "analysis": { + "filter": { + "armenian_stop": { + "type": "stop", + "stopwords": "_armenian_" + }, + "armenian_stemmer": { + "type": "stemmer", + "language": "armenian" + }, + "armenian_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "armenian_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "armenian_stop", + "armenian_keywords", + "armenian_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "armenian_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET armenian-index/_analyze +{ + "analyzer": "stem_exclusion_armenian_analyzer", + "text": "բարև բոլորին, մենք խաղաղություն ենք ուզում և նոր օր ենք սկսել" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "բարև","start_offset": 0,"end_offset": 4,"type": 
"","position": 0}, + {"token": "բոլոր","start_offset": 5,"end_offset": 12,"type": "","position": 1}, + {"token": "խաղաղություն","start_offset": 19,"end_offset": 31,"type": "","position": 3}, + {"token": "ուզ","start_offset": 36,"end_offset": 42,"type": "","position": 5}, + {"token": "նոր","start_offset": 45,"end_offset": 48,"type": "","position": 7}, + {"token": "օր","start_offset": 49,"end_offset": 51,"type": "","position": 8}, + {"token": "սկսել","start_offset": 56,"end_offset": 61,"type": "","position": 10} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/basque.md b/_analyzers/language-analyzers/basque.md new file mode 100644 index 0000000000..e73510cc66 --- /dev/null +++ b/_analyzers/language-analyzers/basque.md @@ -0,0 +1,137 @@ +--- +layout: default +title: Basque +parent: Language analyzers +grand_parent: Analyzers +nav_order: 30 +--- + +# Basque analyzer + +The built-in `basque` analyzer can be applied to a text field using the following command: + +```json +PUT /basque-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "basque" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_basque_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_basque_analyzer": { + "type": "basque", + "stem_exclusion": ["autoritate", "baldintza"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Basque analyzer internals + +The `basque` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Basque) + - keyword + - stemmer (Basque) + +## Custom Basque analyzer + +You can create a custom Basque analyzer using the following command: + +```json +PUT /basque-index +{ + "settings": { + "analysis": { + "filter": { + "basque_stop": { + "type": "stop", + "stopwords": "_basque_" + }, + "basque_stemmer": { + "type": "stemmer", + "language": "basque" + }, + "basque_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "basque_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "basque_stop", + "basque_keywords", + "basque_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "basque_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /basque-index/_analyze +{ + "field": "content", + "text": "Ikasleek euskal unibertsitateetan ikasten dute. Haien zenbakiak 123456 dira." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "ikasle","start_offset": 0,"end_offset": 8,"type": "","position": 0}, + {"token": "euskal","start_offset": 9,"end_offset": 15,"type": "","position": 1}, + {"token": "unibertsi","start_offset": 16,"end_offset": 33,"type": "","position": 2}, + {"token": "ikas","start_offset": 34,"end_offset": 41,"type": "","position": 3}, + {"token": "haien","start_offset": 48,"end_offset": 53,"type": "","position": 5}, + {"token": "zenba","start_offset": 54,"end_offset": 63,"type": "","position": 6}, + {"token": "123456","start_offset": 64,"end_offset": 70,"type": "","position": 7} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/bengali.md b/_analyzers/language-analyzers/bengali.md new file mode 100644 index 0000000000..af913a01ef --- /dev/null +++ b/_analyzers/language-analyzers/bengali.md @@ -0,0 +1,142 @@ +--- +layout: default +title: Bengali +parent: Language analyzers +grand_parent: Analyzers +nav_order: 40 +--- + +# Bengali analyzer + +The built-in `bengali` analyzer can be applied to a text field using the following command: + +```json +PUT /bengali-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "bengali" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_bengali_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_bengali_analyzer": { + "type": "bengali", + "stem_exclusion": ["কর্তৃপক্ষ", "অনুমোদন"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Bengali analyzer internals + +The `bengali` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - decimal_digit + - indic_normalization + - normalization (Bengali) + - stop (Bengali) + - keyword + - stemmer (Bengali) + +## Custom Bengali analyzer + +You can create a custom Bengali analyzer using the following command: + +```json +PUT /bengali-index +{ + "settings": { + "analysis": { + "filter": { + "bengali_stop": { + "type": "stop", + "stopwords": "_bengali_" + }, + "bengali_stemmer": { + "type": "stemmer", + "language": "bengali" + }, + "bengali_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "bengali_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "decimal_digit", + "indic_normalization", + "bengali_normalization", + "bengali_stop", + "bengali_keywords", + "bengali_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "bengali_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /bengali-index/_analyze +{ + "field": "content", + "text": "ছাত্ররা বিশ্ববিদ্যালয়ে পড়াশোনা করে। তাদের নম্বরগুলি ১২৩৪৫৬।" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "ছাত্র","start_offset": 0,"end_offset": 7,"type": "","position": 0}, + {"token": "বিসসবিদালয়","start_offset": 8,"end_offset": 23,"type": "","position": 1}, + {"token": "পরাসোন","start_offset": 24,"end_offset": 32,"type": "","position": 2}, + {"token": "তা","start_offset": 38,"end_offset": 43,"type": "","position": 4}, + {"token": 
"নমমর","start_offset": 44,"end_offset": 53,"type": "","position": 5}, + {"token": "123456","start_offset": 54,"end_offset": 60,"type": "","position": 6} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/brazilian.md b/_analyzers/language-analyzers/brazilian.md new file mode 100644 index 0000000000..67db2b92bc --- /dev/null +++ b/_analyzers/language-analyzers/brazilian.md @@ -0,0 +1,137 @@ +--- +layout: default +title: Brazilian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 50 +--- + +# Brazilian analyzer + +The built-in `brazilian` analyzer can be applied to a text field using the following command: + +```json +PUT /brazilian-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "brazilian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_brazilian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_brazilian_analyzer": { + "type": "brazilian", + "stem_exclusion": ["autoridade", "aprovação"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Brazilian analyzer internals + +The `brazilian` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Brazilian) + - keyword + - stemmer (Brazilian) + +## Custom Brazilian analyzer + +You can create a custom Brazilian analyzer using the following command: + +```json +PUT /brazilian-index +{ + "settings": { + "analysis": { + "filter": { + "brazilian_stop": { + "type": "stop", + "stopwords": "_brazilian_" + }, + "brazilian_stemmer": { + "type": "stemmer", + "language": "brazilian" + }, + "brazilian_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "brazilian_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "brazilian_stop", + "brazilian_keywords", + "brazilian_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "brazilian_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /brazilian-index/_analyze +{ + "field": "content", + "text": "Estudantes estudam em universidades brasileiras. Seus números são 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "estudant","start_offset": 0,"end_offset": 10,"type": "","position": 0}, + {"token": "estud","start_offset": 11,"end_offset": 18,"type": "","position": 1}, + {"token": "univers","start_offset": 22,"end_offset": 35,"type": "","position": 3}, + {"token": "brasileir","start_offset": 36,"end_offset": 47,"type": "","position": 4}, + {"token": "numer","start_offset": 54,"end_offset": 61,"type": "","position": 6}, + {"token": "sao","start_offset": 62,"end_offset": 65,"type": "","position": 7}, + {"token": "123456","start_offset": 66,"end_offset": 72,"type": "","position": 8} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/bulgarian.md b/_analyzers/language-analyzers/bulgarian.md new file mode 100644 index 0000000000..42d5794e18 --- /dev/null +++ b/_analyzers/language-analyzers/bulgarian.md @@ -0,0 +1,137 @@ +--- +layout: default +title: Bulgarian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 60 +--- + +# Bulgarian analyzer + +The built-in `bulgarian` analyzer can be applied to a text field using the following command: + +```json +PUT /bulgarian-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "bulgarian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_bulgarian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_bulgarian_analyzer": { + "type": "bulgarian", + "stem_exclusion": ["авторитет", "одобрение"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Bulgarian analyzer internals + +The `bulgarian` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Bulgarian) + - keyword + - stemmer (Bulgarian) + +## Custom Bulgarian analyzer + +You can create a custom Bulgarian analyzer using the following command: + +```json +PUT /bulgarian-index +{ + "settings": { + "analysis": { + "filter": { + "bulgarian_stop": { + "type": "stop", + "stopwords": "_bulgarian_" + }, + "bulgarian_stemmer": { + "type": "stemmer", + "language": "bulgarian" + }, + "bulgarian_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "bulgarian_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "bulgarian_stop", + "bulgarian_keywords", + "bulgarian_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "bulgarian_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /bulgarian-index/_analyze +{ + "field": "content", + "text": "Студентите учат в българските университети. Техните номера са 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "студент","start_offset": 0,"end_offset": 10,"type": "","position": 0}, + {"token": "учат","start_offset": 11,"end_offset": 15,"type": "","position": 1}, + {"token": "българск","start_offset": 18,"end_offset": 29,"type": "","position": 3}, + {"token": "университят","start_offset": 30,"end_offset": 42,"type": "","position": 4}, + {"token": "техн","start_offset": 44,"end_offset": 51,"type": "","position": 5}, + {"token": "номер","start_offset": 52,"end_offset": 58,"type": "","position": 6}, + {"token": "123456","start_offset": 62,"end_offset": 68,"type": "","position": 8} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/catalan.md b/_analyzers/language-analyzers/catalan.md new file mode 100644 index 0000000000..89762da094 --- /dev/null +++ b/_analyzers/language-analyzers/catalan.md @@ -0,0 +1,143 @@ +--- +layout: default +title: Catalan +parent: Language analyzers +grand_parent: Analyzers +nav_order: 70 +--- + +# Catalan analyzer + +The built-in `catalan` analyzer can be applied to a text field using the following command: + +```json +PUT /catalan-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "catalan" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_catalan_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_catalan_analyzer": { + "type": "catalan", + "stem_exclusion": ["autoritat", "aprovació"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Catalan analyzer internals + +The `catalan` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - elision (Catalan) + - lowercase + - stop (Catalan) + - keyword + - stemmer (Catalan) + +## Custom Catalan analyzer + +You can create a custom Catalan analyzer using the following command: + +```json +PUT /catalan-index +{ + "settings": { + "analysis": { + "filter": { + "catalan_stop": { + "type": "stop", + "stopwords": "_catalan_" + }, + "catalan_elision": { + "type": "elision", + "articles": [ "d", "l", "m", "n", "s", "t"], + "articles_case": true + }, + "catalan_stemmer": { + "type": "stemmer", + "language": "catalan" + }, + "catalan_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "catalan_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "catalan_elision", + "lowercase", + "catalan_stop", + "catalan_keywords", + "catalan_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "catalan_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /catalan-index/_analyze +{ + "field": "content", + "text": "Els estudiants estudien a les universitats catalanes. Els seus números són 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "estud","start_offset": 4,"end_offset": 14,"type": "","position": 1}, + {"token": "estud","start_offset": 15,"end_offset": 23,"type": "","position": 2}, + {"token": "univer","start_offset": 30,"end_offset": 42,"type": "","position": 5}, + {"token": "catalan","start_offset": 43,"end_offset": 52,"type": "","position": 6}, + {"token": "numer","start_offset": 63,"end_offset": 70,"type": "","position": 9}, + {"token": "123456","start_offset": 75,"end_offset": 81,"type": "","position": 11} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/cjk.md b/_analyzers/language-analyzers/cjk.md new file mode 100644 index 0000000000..aed7e6da22 --- /dev/null +++ b/_analyzers/language-analyzers/cjk.md @@ -0,0 +1,142 @@ +--- +layout: default +title: CJK +parent: Language analyzers +grand_parent: Analyzers +nav_order: 80 +--- + +# CJK analyzer + +The built-in `cjk` analyzer can be applied to a text field using the following command: + +```json +PUT /cjk-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "cjk" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_cjk_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_cjk_analyzer": { + "type": "cjk", + "stem_exclusion": ["example", "words"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## CJK analyzer internals + +The `cjk` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - cjk_width + - lowercase + - cjk_bigram + - stop (similar to English) + +## Custom CJK analyzer + +You can create a custom CJK analyzer using the following command: + +```json +PUT /cjk-index +{ + "settings": { + "analysis": { + "filter": { + "english_stop": { + "type": "stop", + "stopwords": [ + "a", "and", "are", "as", "at", "be", "but", "by", "for", + "if", "in", "into", "is", "it", "no", "not", "of", "on", + "or", "s", "such", "t", "that", "the", "their", "then", + "there", "these", "they", "this", "to", "was", "will", + "with", "www" + ] + } + }, + "analyzer": { + "cjk_custom_analyzer": { + "tokenizer": "standard", + "filter": [ + "cjk_width", + "lowercase", + "cjk_bigram", + "english_stop" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "cjk_custom_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /cjk-index/_analyze +{ + "field": "content", + "text": "学生们在中国、日本和韩国的大学学习。123456" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "学生","start_offset": 0,"end_offset": 2,"type": "","position": 0}, + {"token": "生们","start_offset": 1,"end_offset": 3,"type": "","position": 1}, + {"token": "们在","start_offset": 2,"end_offset": 4,"type": "","position": 2}, + {"token": "在中","start_offset": 3,"end_offset": 5,"type": "","position": 3}, + {"token": "中国","start_offset": 4,"end_offset": 6,"type": "","position": 4}, + {"token": "日本","start_offset": 7,"end_offset": 9,"type": "","position": 5}, + {"token": "本和","start_offset": 8,"end_offset": 10,"type": "","position": 6}, + {"token": "和韩","start_offset": 9,"end_offset": 
11,"type": "","position": 7}, + {"token": "韩国","start_offset": 10,"end_offset": 12,"type": "","position": 8}, + {"token": "国的","start_offset": 11,"end_offset": 13,"type": "","position": 9}, + {"token": "的大","start_offset": 12,"end_offset": 14,"type": "","position": 10}, + {"token": "大学","start_offset": 13,"end_offset": 15,"type": "","position": 11}, + {"token": "学学","start_offset": 14,"end_offset": 16,"type": "","position": 12}, + {"token": "学习","start_offset": 15,"end_offset": 17,"type": "","position": 13}, + {"token": "123456","start_offset": 18,"end_offset": 24,"type": "","position": 14} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/czech.md b/_analyzers/language-analyzers/czech.md new file mode 100644 index 0000000000..c1778cd0f4 --- /dev/null +++ b/_analyzers/language-analyzers/czech.md @@ -0,0 +1,172 @@ +--- +layout: default +title: Czech +parent: Language analyzers +grand_parent: Analyzers +nav_order: 90 +--- + +# Czech analyzer + +The built-in `czech` analyzer can be applied to a text field using the following command: + +```json +PUT /czech-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "czech" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_czech_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_czech_analyzer": { + "type": "czech", + "stem_exclusion": ["autorita", "schválení"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Czech analyzer internals + +The `czech` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Czech) + - keyword + - stemmer (Czech) + +## Custom Czech analyzer + +You can create a custom Czech analyzer using the following command: + +```json +PUT /czech-index +{ + "settings": { + "analysis": { + "filter": { + "czech_stop": { + "type": "stop", + "stopwords": "_czech_" + }, + "czech_stemmer": { + "type": "stemmer", + "language": "czech" + }, + "czech_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "czech_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "czech_stop", + "czech_keywords", + "czech_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "czech_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /czech-index/_analyze +{ + "field": "content", + "text": "Studenti studují na českých univerzitách. Jejich čísla jsou 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "student", + "start_offset": 0, + "end_offset": 8, + "type": "", + "position": 0 + }, + { + "token": "studuj", + "start_offset": 9, + "end_offset": 16, + "type": "", + "position": 1 + }, + { + "token": "česk", + "start_offset": 20, + "end_offset": 27, + "type": "", + "position": 3 + }, + { + "token": "univerzit", + "start_offset": 28, + "end_offset": 40, + "type": "", + "position": 4 + }, + { + "token": "čísl", + "start_offset": 49, + "end_offset": 54, + "type": "", + "position": 6 + }, + { + "token": "123456", + "start_offset": 60, + "end_offset": 66, + "type": "", + "position": 8 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/danish.md b/_analyzers/language-analyzers/danish.md new file mode 100644 index 0000000000..b5ee1b0e97 --- /dev/null +++ b/_analyzers/language-analyzers/danish.md @@ -0,0 +1,172 @@ +--- +layout: default +title: Danish +parent: Language analyzers +grand_parent: Analyzers +nav_order: 100 +--- + +# Danish analyzer + +The built-in `danish` analyzer can be applied to a text field using the following command: + +```json +PUT /danish-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "danish" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_danish_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_danish_analyzer": { + "type": "danish", + "stem_exclusion": ["autoritet", "godkendelse"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Danish analyzer internals + +The `danish` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Danish) + - keyword + - stemmer (Danish) + +## Custom Danish analyzer + +You can create a custom Danish analyzer using the following command: + +```json +PUT /danish-index +{ + "settings": { + "analysis": { + "filter": { + "danish_stop": { + "type": "stop", + "stopwords": "_danish_" + }, + "danish_stemmer": { + "type": "stemmer", + "language": "danish" + }, + "danish_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "danish_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "danish_stop", + "danish_keywords", + "danish_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "danish_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /danish-index/_analyze +{ + "field": "content", + "text": "Studerende studerer på de danske universiteter. Deres numre er 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "stud", + "start_offset": 0, + "end_offset": 10, + "type": "", + "position": 0 + }, + { + "token": "stud", + "start_offset": 11, + "end_offset": 19, + "type": "", + "position": 1 + }, + { + "token": "dansk", + "start_offset": 26, + "end_offset": 32, + "type": "", + "position": 4 + }, + { + "token": "universitet", + "start_offset": 33, + "end_offset": 46, + "type": "", + "position": 5 + }, + { + "token": "numr", + "start_offset": 54, + "end_offset": 59, + "type": "", + "position": 7 + }, + { + "token": "123456", + "start_offset": 63, + "end_offset": 69, + "type": "", + "position": 9 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/dutch.md b/_analyzers/language-analyzers/dutch.md new file mode 100644 index 0000000000..0259707d78 --- /dev/null +++ b/_analyzers/language-analyzers/dutch.md @@ -0,0 +1,148 @@ +--- +layout: default +title: Dutch +parent: Language analyzers +grand_parent: Analyzers +nav_order: 110 +--- + +# Dutch analyzer + +The built-in `dutch` analyzer can be applied to a text field using the following command: + +```json +PUT /dutch-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "dutch" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_dutch_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_dutch_analyzer": { + "type": "dutch", + "stem_exclusion": ["autoriteit", "goedkeuring"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Dutch analyzer internals + +The `dutch` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Dutch) + - keyword + - stemmer_override + - stemmer (Dutch) + +## Custom Dutch analyzer + +You can create a custom Dutch analyzer using the following command: + +```json +PUT /dutch-index +{ + "settings": { + "analysis": { + "filter": { + "dutch_stop": { + "type": "stop", + "stopwords": "_dutch_" + }, + "dutch_stemmer": { + "type": "stemmer", + "language": "dutch" + }, + "dutch_keywords": { + "type": "keyword_marker", + "keywords": [] + }, + "dutch_override": { + "type": "stemmer_override", + "rules": [ + "fiets=>fiets", + "bromfiets=>bromfiets", + "ei=>eier", + "kind=>kinder" + ] + } + }, + "analyzer": { + "dutch_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "dutch_stop", + "dutch_keywords", + "dutch_override", + "dutch_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "dutch_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /dutch-index/_analyze +{ + "field": "content", + "text": "De studenten studeren in Nederland en bezoeken Amsterdam. Hun nummers zijn 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "student","start_offset": 3,"end_offset": 12,"type": "","position": 1}, + {"token": "studer","start_offset": 13,"end_offset": 21,"type": "","position": 2}, + {"token": "nederland","start_offset": 25,"end_offset": 34,"type": "","position": 4}, + {"token": "bezoek","start_offset": 38,"end_offset": 46,"type": "","position": 6}, + {"token": "amsterdam","start_offset": 47,"end_offset": 56,"type": "","position": 7}, + {"token": "nummer","start_offset": 62,"end_offset": 69,"type": "","position": 9}, + {"token": "123456","start_offset": 75,"end_offset": 81,"type": "","position": 11} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/english.md b/_analyzers/language-analyzers/english.md new file mode 100644 index 0000000000..2d0b600312 --- /dev/null +++ b/_analyzers/language-analyzers/english.md @@ -0,0 +1,143 @@ +--- +layout: default +title: English +parent: Language analyzers +grand_parent: Analyzers +nav_order: 120 +--- + +# English analyzer + +The built-in `english` analyzer can be applied to a text field using the following command: + +```json +PUT /english-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "english" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_english_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_english_analyzer": { + "type": "english", + "stem_exclusion": ["authority", "authorization"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## English analyzer internals + +The `english` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - stemmer (possessive_english) + - lowercase + - stop (English) + - keyword + - stemmer (English) + +## Custom English analyzer + +You can create a custom English analyzer using the following command: + +```json +PUT /english-index +{ + "settings": { + "analysis": { + "filter": { + "english_stop": { + "type": "stop", + "stopwords": "_english_" + }, + "english_stemmer": { + "type": "stemmer", + "language": "english" + }, + "english_keywords": { + "type": "keyword_marker", + "keywords": [] + }, + "english_possessive_stemmer": { + "type": "stemmer", + "language": "possessive_english" + } + }, + "analyzer": { + "english_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "english_possessive_stemmer", + "lowercase", + "english_stop", + "english_keywords", + "english_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "english_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /english-index/_analyze +{ + "field": "content", + "text": "The students study in the USA and work at NASA. Their numbers are 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "student","start_offset": 4,"end_offset": 12,"type": "","position": 1}, + {"token": "studi","start_offset": 13,"end_offset": 18,"type": "","position": 2}, + {"token": "usa","start_offset": 26,"end_offset": 29,"type": "","position": 5}, + {"token": "work","start_offset": 34,"end_offset": 38,"type": "","position": 7}, + {"token": "nasa","start_offset": 42,"end_offset": 46,"type": "","position": 9}, + {"token": "number","start_offset": 54,"end_offset": 61,"type": "","position": 11}, + {"token": "123456","start_offset": 66,"end_offset": 72,"type": "","position": 13} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/estonian.md b/_analyzers/language-analyzers/estonian.md new file mode 100644 index 0000000000..a4cb664f18 --- /dev/null +++ b/_analyzers/language-analyzers/estonian.md @@ -0,0 +1,139 @@ +--- +layout: default +title: Estonian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 130 +--- + +# Estonian analyzer + +The built-in `estonian` analyzer can be applied to a text field using the following command: + +```json +PUT /estonian-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "estonian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_estonian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_estonian_analyzer": { + "type": "estonian", + "stem_exclusion": ["autoriteet", "kinnitus"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Estonian analyzer internals + +The `estonian` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Estonian) + - keyword + - stemmer (Estonian) + +## Custom Estonian analyzer + +You can create a custom Estonian analyzer using the following command: + +```json +PUT /estonian-index +{ + "settings": { + "analysis": { + "filter": { + "estonian_stop": { + "type": "stop", + "stopwords": "_estonian_" + }, + "estonian_stemmer": { + "type": "stemmer", + "language": "estonian" + }, + "estonian_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "estonian_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "estonian_stop", + "estonian_keywords", + "estonian_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "estonian_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /estonian-index/_analyze +{ + "field": "content", + "text": "Õpilased õpivad Tallinnas ja Eesti ülikoolides. Nende numbrid on 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "õpilase","start_offset": 0,"end_offset": 8,"type": "","position": 0}, + {"token": "õpi","start_offset": 9,"end_offset": 15,"type": "","position": 1}, + {"token": "tallinna","start_offset": 16,"end_offset": 25,"type": "","position": 2}, + {"token": "eesti","start_offset": 29,"end_offset": 34,"type": "","position": 4}, + {"token": "ülikooli","start_offset": 35,"end_offset": 46,"type": "","position": 5}, + {"token": "nende","start_offset": 48,"end_offset": 53,"type": "","position": 6}, + {"token": "numbri","start_offset": 54,"end_offset": 61,"type": "","position": 7}, + {"token": "on","start_offset": 62,"end_offset": 64,"type": "","position": 8}, + {"token": "123456","start_offset": 65,"end_offset": 71,"type": "","position": 9} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/finnish.md b/_analyzers/language-analyzers/finnish.md new file mode 100644 index 0000000000..6f559650d2 --- /dev/null +++ b/_analyzers/language-analyzers/finnish.md @@ -0,0 +1,137 @@ +--- +layout: default +title: Finnish +parent: Language analyzers +grand_parent: Analyzers +nav_order: 140 +--- + +# Finnish analyzer + +The built-in `finnish` analyzer can be applied to a text field using the following command: + +```json +PUT /finnish-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "finnish" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_finnish_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_finnish_analyzer": { + "type": "finnish", + "stem_exclusion": ["valta", "hyväksyntä"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Finnish analyzer internals + +The `finnish` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Finnish) + - keyword + - stemmer (Finnish) + +## Custom Finnish analyzer + +You can create a custom Finnish analyzer using the following command: + +```json +PUT /finnish-index +{ + "settings": { + "analysis": { + "filter": { + "finnish_stop": { + "type": "stop", + "stopwords": "_finnish_" + }, + "finnish_stemmer": { + "type": "stemmer", + "language": "finnish" + }, + "finnish_keywords": { + "type": "keyword_marker", + "keywords": ["Helsinki", "Suomi"] + } + }, + "analyzer": { + "finnish_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "finnish_stop", + "finnish_keywords", + "finnish_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "finnish_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /finnish-index/_analyze +{ + "field": "content", + "text": "Opiskelijat opiskelevat Helsingissä ja Suomen yliopistoissa. Heidän numeronsa ovat 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "opiskelij","start_offset": 0,"end_offset": 11,"type": "","position": 0}, + {"token": "opiskelev","start_offset": 12,"end_offset": 23,"type": "","position": 1}, + {"token": "helsing","start_offset": 24,"end_offset": 35,"type": "","position": 2}, + {"token": "suome","start_offset": 39,"end_offset": 45,"type": "","position": 4}, + {"token": "yliopisto","start_offset": 46,"end_offset": 59,"type": "","position": 5}, + {"token": "numero","start_offset": 68,"end_offset": 77,"type": "","position": 7}, + {"token": "123456","start_offset": 83,"end_offset": 89,"type": "","position": 9} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/french.md b/_analyzers/language-analyzers/french.md new file mode 100644 index 0000000000..64e7ab5415 --- /dev/null +++ b/_analyzers/language-analyzers/french.md @@ -0,0 +1,148 @@ +--- +layout: default +title: French +parent: Language analyzers +grand_parent: Analyzers +nav_order: 150 +--- + +# French analyzer + +The built-in `french` analyzer can be applied to a text field using the following command: + +```json +PUT /french-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "french" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_french_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_french_analyzer": { + "type": "french", + "stem_exclusion": ["autorité", "acceptation"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## French analyzer internals + +The `french` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - elision (French) + - lowercase + - stop (French) + - keyword + - stemmer (French) + +## Custom French analyzer + +You can create a custom French analyzer using the following command: + +```json +PUT /french-index +{ + "settings": { + "analysis": { + "filter": { + "french_stop": { + "type": "stop", + "stopwords": "_french_" + }, + "french_elision": { + "type": "elision", + "articles_case": true, + "articles": [ + "l", "m", "t", "qu", "n", "s", + "j", "d", "c", "jusqu", "quoiqu", + "lorsqu", "puisqu" + ] + }, + "french_stemmer": { + "type": "stemmer", + "language": "light_french" + }, + "french_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "french_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "french_elision", + "lowercase", + "french_stop", + "french_keywords", + "french_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "french_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /french-index/_analyze +{ + "field": "content", + "text": "Les étudiants étudient à Paris et dans les universités françaises. Leurs numéros sont 123456." 
+}
+```
+{% include copy-curl.html %}
+
+The response contains the generated tokens:
+
+```json
+{
+  "tokens": [
+    {"token": "etudiant","start_offset": 4,"end_offset": 13,"type": "<ALPHANUM>","position": 1},
+    {"token": "etudient","start_offset": 14,"end_offset": 22,"type": "<ALPHANUM>","position": 2},
+    {"token": "pari","start_offset": 25,"end_offset": 30,"type": "<ALPHANUM>","position": 4},
+    {"token": "universit","start_offset": 43,"end_offset": 54,"type": "<ALPHANUM>","position": 8},
+    {"token": "francais","start_offset": 55,"end_offset": 65,"type": "<ALPHANUM>","position": 9},
+    {"token": "numero","start_offset": 73,"end_offset": 80,"type": "<ALPHANUM>","position": 11},
+    {"token": "123456","start_offset": 86,"end_offset": 92,"type": "<NUM>","position": 13}
+  ]
+}
+```
\ No newline at end of file
diff --git a/_analyzers/language-analyzers/galician.md b/_analyzers/language-analyzers/galician.md
new file mode 100644
index 0000000000..00338b23a7
--- /dev/null
+++ b/_analyzers/language-analyzers/galician.md
@@ -0,0 +1,138 @@
+---
+layout: default
+title: Galician
+parent: Language analyzers
+grand_parent: Analyzers
+nav_order: 160
+---
+
+# Galician analyzer
+
+The built-in `galician` analyzer can be applied to a text field using the following command:
+
+```json
+PUT /galician-index
+{
+  "mappings": {
+    "properties": {
+      "content": {
+        "type": "text",
+        "analyzer": "galician"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Stem exclusion
+
+You can use `stem_exclusion` with this language analyzer using the following command:
+
+```json
+PUT index_with_stem_exclusion_galician_analyzer
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "stem_exclusion_galician_analyzer": {
+          "type": "galician",
+          "stem_exclusion": ["autoridade", "aceptación"]
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Galician analyzer internals
+
+The `galician` analyzer is built using the following components:
+
+- Tokenizer: `standard`
+
+- Token filters:
+  - lowercase
+  - stop (Galician)
+  - keyword
+  - stemmer (Galician)
+
+## Custom Galician analyzer
+
+You can create a custom Galician analyzer using the following command:
+
+```json
+PUT /galician-index
+{
+  "settings": {
+    "analysis": {
+      "filter": {
+        "galician_stop": {
+          "type": "stop",
+          "stopwords": "_galician_"
+        },
+        "galician_stemmer": {
+          "type": "stemmer",
+          "language": "galician"
+        },
+        "galician_keywords": {
+          "type": "keyword_marker",
+          "keywords": []
+        }
+      },
+      "analyzer": {
+        "galician_analyzer": {
+          "type": "custom",
+          "tokenizer": "standard",
+          "filter": [
+            "lowercase",
+            "galician_stop",
+            "galician_keywords",
+            "galician_stemmer"
+          ]
+        }
+      }
+    }
+  },
+  "mappings": {
+    "properties": {
+      "content": {
+        "type": "text",
+        "analyzer": "galician_analyzer"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Generated tokens
+
+Use the following request to examine the tokens generated using the analyzer:
+
+```json
+POST /galician-index/_analyze
+{
+  "field": "content",
+  "text": "Os estudantes estudan en Santiago e nas universidades galegas. Os seus números son 123456."
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "estud","start_offset": 3,"end_offset": 13,"type": "","position": 1}, + {"token": "estud","start_offset": 14,"end_offset": 21,"type": "","position": 2}, + {"token": "santiag","start_offset": 25,"end_offset": 33,"type": "","position": 4}, + {"token": "univers","start_offset": 40,"end_offset": 53,"type": "","position": 7}, + {"token": "galeg","start_offset": 54,"end_offset": 61,"type": "","position": 8}, + {"token": "numer","start_offset": 71,"end_offset": 78,"type": "","position": 11}, + {"token": "son","start_offset": 79,"end_offset": 82,"type": "","position": 12}, + {"token": "123456","start_offset": 83,"end_offset": 89,"type": "","position": 13} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/german.md b/_analyzers/language-analyzers/german.md new file mode 100644 index 0000000000..4071ef5378 --- /dev/null +++ b/_analyzers/language-analyzers/german.md @@ -0,0 +1,174 @@ +--- +layout: default +title: German +parent: Language analyzers +grand_parent: Analyzers +nav_order: 170 +--- + +# German analyzer + +The built-in `german` analyzer can be applied to a text field using the following command: + +```json +PUT /german-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "german" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_german_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_german_analyzer": { + "type": "german", + "stem_exclusion": ["Autorität", "Genehmigung"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## German analyzer internals + +The `german` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (German) + - keyword + - normalization (German) + - stemmer (German) + +## Custom German analyzer + +You can create a custom German analyzer using the following command: + +```json +PUT /german-index +{ + "settings": { + "analysis": { + "filter": { + "german_stop": { + "type": "stop", + "stopwords": "_german_" + }, + "german_stemmer": { + "type": "stemmer", + "language": "light_german" + }, + "german_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "german_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "german_stop", + "german_keywords", + "german_normalization", + "german_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "german_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /german-index/_analyze +{ + "field": "content", + "text": "Die Studenten studieren an den deutschen Universitäten. Ihre Nummern sind 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "student", + "start_offset": 4, + "end_offset": 13, + "type": "", + "position": 1 + }, + { + "token": "studi", + "start_offset": 14, + "end_offset": 23, + "type": "", + "position": 2 + }, + { + "token": "deutsch", + "start_offset": 31, + "end_offset": 40, + "type": "", + "position": 5 + }, + { + "token": "universitat", + "start_offset": 41, + "end_offset": 54, + "type": "", + "position": 6 + }, + { + "token": "numm", + "start_offset": 61, + "end_offset": 68, + "type": "", + "position": 8 + }, + { + "token": "123456", + "start_offset": 74, + "end_offset": 80, + "type": "", + "position": 10 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/greek.md b/_analyzers/language-analyzers/greek.md new file mode 100644 index 0000000000..2446b1e2d6 --- /dev/null +++ b/_analyzers/language-analyzers/greek.md @@ -0,0 +1,139 @@ +--- +layout: default +title: Greek +parent: Language analyzers +grand_parent: Analyzers +nav_order: 180 +--- + +# Greek analyzer + +The built-in `greek` analyzer can be applied to a text field using the following command: + +```json +PUT /greek-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "greek" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_greek_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_greek_analyzer": { + "type": "greek", + "stem_exclusion": ["αρχή", "έγκριση"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Greek analyzer internals + +The `greek` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Greek) + - keyword + - stemmer (Greek) + +## Custom Greek analyzer + +You can create a custom Greek analyzer using the following command: + +```json +PUT /greek-index +{ + "settings": { + "analysis": { + "filter": { + "greek_stop": { + "type": "stop", + "stopwords": "_greek_" + }, + "greek_stemmer": { + "type": "stemmer", + "language": "greek" + }, + "greek_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "greek_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "greek_stop", + "greek_keywords", + "greek_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "greek_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /greek-index/_analyze +{ + "field": "content", + "text": "Οι φοιτητές σπουδάζουν στα ελληνικά πανεπιστήμια. Οι αριθμοί τους είναι 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "φοιτητές","start_offset": 3,"end_offset": 11,"type": "","position": 1}, + {"token": "σπουδάζ","start_offset": 12,"end_offset": 22,"type": "","position": 2}, + {"token": "στα","start_offset": 23,"end_offset": 26,"type": "","position": 3}, + {"token": "ελληνικά","start_offset": 27,"end_offset": 35,"type": "","position": 4}, + {"token": "πανεπιστήμ","start_offset": 36,"end_offset": 48,"type": "","position": 5}, + {"token": "αριθμοί","start_offset": 53,"end_offset": 60,"type": "","position": 7}, + {"token": "τους","start_offset": 61,"end_offset": 65,"type": "","position": 8}, + {"token": "είνα","start_offset": 66,"end_offset": 71,"type": "","position": 9}, + {"token": "123456","start_offset": 72,"end_offset": 78,"type": "","position": 10} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/hindi.md b/_analyzers/language-analyzers/hindi.md new file mode 100644 index 0000000000..93f2eea319 --- /dev/null +++ b/_analyzers/language-analyzers/hindi.md @@ -0,0 +1,178 @@ +--- +layout: default +title: Hindi +parent: Language analyzers +grand_parent: Analyzers +nav_order: 190 +--- + +# Hindi analyzer + +The built-in `hindi` analyzer can be applied to a text field using the following command: + +```json +PUT /hindi-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "hindi" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_hindi_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_hindi_analyzer": { + "type": "hindi", + "stem_exclusion": ["अधिकार", "अनुमोदन"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Hindi analyzer internals + +The `hindi` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - decimal_digit + - keyword + - normalization (indic) + - normalization (Hindi) + - stop (Hindi) + - stemmer (Hindi) + +## Custom Hindi analyzer + +You can create a custom Hindi analyzer using the following command: + +```json +PUT /hindi-index +{ + "settings": { + "analysis": { + "filter": { + "hindi_stop": { + "type": "stop", + "stopwords": "_hindi_" + }, + "hindi_stemmer": { + "type": "stemmer", + "language": "hindi" + }, + "hindi_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "hindi_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "decimal_digit", + "hindi_keywords", + "indic_normalization", + "hindi_normalization", + "hindi_stop", + "hindi_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "hindi_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /hindi-index/_analyze +{ + "field": "content", + "text": "छात्र भारतीय विश्वविद्यालयों में पढ़ते हैं। उनके नंबर १२३४५६ हैं।" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "छातर", + "start_offset": 0, + "end_offset": 5, + "type": "", + "position": 0 + }, + { + "token": "भारतिय", + "start_offset": 6, + "end_offset": 12, + "type": "", + "position": 1 + }, + { + "token": "विशवविदयालय", + 
"start_offset": 13, + "end_offset": 28, + "type": "", + "position": 2 + }, + { + "token": "पढ", + "start_offset": 33, + "end_offset": 38, + "type": "", + "position": 4 + }, + { + "token": "नंबर", + "start_offset": 49, + "end_offset": 53, + "type": "", + "position": 7 + }, + { + "token": "123456", + "start_offset": 54, + "end_offset": 60, + "type": "", + "position": 8 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/hungarian.md b/_analyzers/language-analyzers/hungarian.md new file mode 100644 index 0000000000..d115c5d29c --- /dev/null +++ b/_analyzers/language-analyzers/hungarian.md @@ -0,0 +1,172 @@ +--- +layout: default +title: Hungarian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 200 +--- + +# Hungarian analyzer + +The built-in `hungarian` analyzer can be applied to a text field using the following command: + +```json +PUT /hungarian-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "hungarian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_hungarian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_hungarian_analyzer": { + "type": "hungarian", + "stem_exclusion": ["hatalom", "jóváhagyás"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Hungarian analyzer internals + +The `hungarian` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Hungarian) + - keyword + - stemmer (Hungarian) + +## Custom Hungarian analyzer + +You can create a custom Hungarian analyzer using the following command: + +```json +PUT /hungarian-index +{ + "settings": { + "analysis": { + "filter": { + "hungarian_stop": { + "type": "stop", + "stopwords": "_hungarian_" + }, + "hungarian_stemmer": { + "type": "stemmer", + "language": "hungarian" + }, + "hungarian_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "hungarian_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "hungarian_stop", + "hungarian_keywords", + "hungarian_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "hungarian_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /hungarian-index/_analyze +{ + "field": "content", + "text": "A diákok a magyar egyetemeken tanulnak. A számaik 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "diák", + "start_offset": 2, + "end_offset": 8, + "type": "", + "position": 1 + }, + { + "token": "magyar", + "start_offset": 11, + "end_offset": 17, + "type": "", + "position": 3 + }, + { + "token": "egyetem", + "start_offset": 18, + "end_offset": 29, + "type": "", + "position": 4 + }, + { + "token": "tanul", + "start_offset": 30, + "end_offset": 38, + "type": "", + "position": 5 + }, + { + "token": "szám", + "start_offset": 42, + "end_offset": 49, + "type": "", + "position": 7 + }, + { + "token": "123456", + "start_offset": 50, + "end_offset": 56, + "type": "", + "position": 8 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/index.md b/_analyzers/language-analyzers/index.md new file mode 100644 index 0000000000..89a4a42254 --- /dev/null +++ b/_analyzers/language-analyzers/index.md @@ -0,0 +1,135 @@ +--- +layout: default +title: Language analyzers +nav_order: 100 +parent: Analyzers +has_children: true +has_toc: true +redirect_from: + - /query-dsl/analyzers/language-analyzers/ + - /analyzers/language-analyzers/ +--- + +# Language analyzers + +OpenSearch supports the following language analyzers: +`arabic`, `armenian`, `basque`, `bengali`, `brazilian`, `bulgarian`, `catalan`, `czech`, `danish`, `dutch`, `english`, `estonian`, `finnish`, `french`, `galician`, `german`, `greek`, `hindi`, `hungarian`, `indonesian`, `irish`, `italian`, `latvian`, `lithuanian`, `norwegian`, `persian`, `portuguese`, `romanian`, `russian`, `sorani`, `spanish`, `swedish`, `thai`, and `turkish`. + +To use an analyzer when you map an index, specify the value in your query. For example, to map your index with the French language analyzer, specify the `french` value in the analyzer field: + +```json + "analyzer": "french" +``` + +#### Example request + +The following query specifies an index `my-index` with the `content` field configured as multi-field, and a sub-field named `french` is configured with the `french` language analyzer: + +```json +PUT my-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "fields": { + "french": { + "type": "text", + "analyzer": "french" + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +The default `french` analyzer can also be configured for the entire index using the following query: + +```json +PUT my-index +{ + "settings": { + "analysis": { + "analyzer": { + "default": { + "type": "french" + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text" + }, + "title": { + "type": "text" + }, + "description": { + "type": "text" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can apply stem exclusion to any language analyzer by providing a list of lowercase words that should be excluded from stemming. Internally, OpenSearch uses the `keyword_marker` token filter to mark these words as keywords, ensuring that they are not stemmed. + +## Stem exclusion example + +Use the following request to configure `stem_exclusion`: + +```json +PUT index_with_stem_exclusion_english_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_english_analyzer":{ + "type":"english", + "stem_exclusion": ["manager", "management"] + } + } + } + } +} +``` +{% include copy-curl.html %} + + +## Stem exclusion with custom analyzers + +All language analyzers consist of tokenizers and token filters specific to a particular language. 
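+
+For example, you can inspect each stage of a language analyzer by setting the `explain` parameter in an `_analyze` request. The following request is an illustrative sketch that uses the built-in `english` analyzer with sample text:
+
+```json
+GET /_analyze
+{
+  "analyzer": "english",
+  "text": "The managers are managing",
+  "explain": true
+}
+```
+{% include copy-curl.html %}
+
+The response contains a `detail` object listing the tokens emitted by the tokenizer and by each token filter in turn.
+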
+If you want to implement a custom version of the language analyzer with stem exclusion, you need to configure the `keyword_marker` token filter and list the words excluded from stemming in the `keywords` parameter. The stemmer referenced by the analyzer must also be defined, as shown in the following example:
+
+```json
+PUT index_with_keyword_marker_analyzer
+{
+  "settings": {
+    "analysis": {
+      "filter": {
+        "protected_keywords_filter": {
+          "type": "keyword_marker",
+          "keywords": ["Apple", "OpenSearch"]
+        },
+        "english_stemmer": {
+          "type": "stemmer",
+          "language": "english"
+        }
+      },
+      "analyzer": {
+        "custom_english_analyzer": {
+          "type": "custom",
+          "tokenizer": "standard",
+          "filter": [
+            "lowercase",
+            "protected_keywords_filter",
+            "english_stemmer"
+          ]
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
diff --git a/_analyzers/language-analyzers/indonesian.md b/_analyzers/language-analyzers/indonesian.md
new file mode 100644
index 0000000000..5c3d430b3a
--- /dev/null
+++ b/_analyzers/language-analyzers/indonesian.md
@@ -0,0 +1,172 @@
+---
+layout: default
+title: Indonesian
+parent: Language analyzers
+grand_parent: Analyzers
+nav_order: 210
+---
+
+# Indonesian analyzer
+
+The built-in `indonesian` analyzer can be applied to a text field using the following command:
+
+```json
+PUT /indonesian-index
+{
+  "mappings": {
+    "properties": {
+      "content": {
+        "type": "text",
+        "analyzer": "indonesian"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Stem exclusion
+
+You can use `stem_exclusion` with this language analyzer using the following command:
+
+```json
+PUT index_with_stem_exclusion_indonesian_analyzer
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "stem_exclusion_indonesian_analyzer": {
+          "type": "indonesian",
+          "stem_exclusion": ["otoritas", "persetujuan"]
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Indonesian analyzer internals
+
+The `indonesian` analyzer is built using the following components:
+
+- Tokenizer: `standard`
+
+- Token filters:
+  - lowercase
+  - stop (Indonesian)
+  - keyword
+  - stemmer (Indonesian)
+
+## Custom Indonesian analyzer
+
+You can create a custom Indonesian analyzer using the following command:
+
+```json
+PUT /indonesian-index
+{
+  "settings": {
+    "analysis": {
+      "filter": {
+        "indonesian_stop": {
+          "type": "stop",
+          "stopwords": "_indonesian_"
+        },
+        "indonesian_stemmer": {
+          "type": "stemmer",
+          "language": "indonesian"
+        },
+        "indonesian_keywords": {
+          "type": "keyword_marker",
+          "keywords": []
+        }
+      },
+      "analyzer": {
+        "indonesian_analyzer": {
+          "type": "custom",
+          "tokenizer": "standard",
+          "filter": [
+            "lowercase",
+            "indonesian_stop",
+            "indonesian_keywords",
+            "indonesian_stemmer"
+          ]
+        }
+      }
+    }
+  },
+  "mappings": {
+    "properties": {
+      "content": {
+        "type": "text",
+        "analyzer": "indonesian_analyzer"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Generated tokens
+
+Use the following request to examine the tokens generated using the analyzer:
+
+```json
+POST /indonesian-index/_analyze
+{
+  "field": "content",
+  "text": "Mahasiswa belajar di universitas Indonesia. Nomor mereka adalah 123456."
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "mahasiswa", + "start_offset": 0, + "end_offset": 9, + "type": "", + "position": 0 + }, + { + "token": "ajar", + "start_offset": 10, + "end_offset": 17, + "type": "", + "position": 1 + }, + { + "token": "universitas", + "start_offset": 21, + "end_offset": 32, + "type": "", + "position": 3 + }, + { + "token": "indonesia", + "start_offset": 33, + "end_offset": 42, + "type": "", + "position": 4 + }, + { + "token": "nomor", + "start_offset": 44, + "end_offset": 49, + "type": "", + "position": 5 + }, + { + "token": "123456", + "start_offset": 64, + "end_offset": 70, + "type": "", + "position": 8 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/irish.md b/_analyzers/language-analyzers/irish.md new file mode 100644 index 0000000000..3e1535d134 --- /dev/null +++ b/_analyzers/language-analyzers/irish.md @@ -0,0 +1,157 @@ +--- +layout: default +title: Irish +parent: Language analyzers +grand_parent: Analyzers +nav_order: 210 +--- + +# Irish analyzer + +The built-in `irish` analyzer can be applied to a text field using the following command: + +```json +PUT /irish-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "irish" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_irish_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_irish_analyzer": { + "type": "irish", + "stem_exclusion": ["údarás", "faomhadh"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Irish analyzer internals + +The `irish` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - hyphenation (Irish) + - elision (Irish) + - lowercase (Irish) + - stop (Irish) + - keyword + - stemmer (Irish) + +## Custom Irish analyzer + +You can create a custom Irish analyzer using the following command: + +```json +PUT /irish-index +{ + "settings": { + "analysis": { + "filter": { + "irish_stop": { + "type": "stop", + "stopwords": "_irish_" + }, + "irish_elision": { + "type": "elision", + "articles": [ "d", "m", "b" ], + "articles_case": true + }, + "irish_hyphenation": { + "type": "stop", + "stopwords": [ "h", "n", "t" ], + "ignore_case": true + }, + "irish_lowercase": { + "type": "lowercase", + "language": "irish" + }, + "irish_stemmer": { + "type": "stemmer", + "language": "irish" + }, + "irish_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "irish_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "irish_hyphenation", + "irish_elision", + "irish_lowercase", + "irish_stop", + "irish_keywords", + "irish_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "irish_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /irish-index/_analyze +{ + "field": "content", + "text": "Tá mic léinn ag staidéar in ollscoileanna na hÉireann. Is iad a gcuid uimhreacha ná 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "tá","start_offset": 0,"end_offset": 2,"type": "","position": 0}, + {"token": "mic","start_offset": 3,"end_offset": 6,"type": "","position": 1}, + {"token": "léinn","start_offset": 7,"end_offset": 12,"type": "","position": 2}, + {"token": "staidéar","start_offset": 16,"end_offset": 24,"type": "","position": 4}, + {"token": "ollscoileanna","start_offset": 28,"end_offset": 41,"type": "","position": 6}, + {"token": "héireann","start_offset": 45,"end_offset": 53,"type": "","position": 8}, + {"token": "cuid","start_offset": 64,"end_offset": 69,"type": "","position": 12}, + {"token": "uimhreacha","start_offset": 70,"end_offset": 80,"type": "","position": 13}, + {"token": "123456","start_offset": 84,"end_offset": 90,"type": "","position": 15} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/italian.md b/_analyzers/language-analyzers/italian.md new file mode 100644 index 0000000000..190056d63c --- /dev/null +++ b/_analyzers/language-analyzers/italian.md @@ -0,0 +1,148 @@ +--- +layout: default +title: Italian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 220 +--- + +# Italian analyzer + +The built-in `italian` analyzer can be applied to a text field using the following command: + +```json +PUT /italian-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "italian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_italian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_italian_analyzer": { + "type": "italian", + "stem_exclusion": ["autorità", "approvazione"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Italian analyzer internals + +The `italian` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - elision (Italian) + - lowercase + - stop (Italian) + - keyword + - stemmer (Italian) + +## Custom Italian analyzer + +You can create a custom Italian analyzer using the following command: + +```json +PUT /italian-index +{ + "settings": { + "analysis": { + "filter": { + "italian_stop": { + "type": "stop", + "stopwords": "_italian_" + }, + "italian_elision": { + "type": "elision", + "articles": [ + "c", "l", "all", "dall", "dell", + "nell", "sull", "coll", "pell", + "gl", "agl", "dagl", "degl", "negl", + "sugl", "un", "m", "t", "s", "v", "d" + ], + "articles_case": true + }, + "italian_stemmer": { + "type": "stemmer", + "language": "light_italian" + }, + "italian_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "italian_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "italian_elision", + "lowercase", + "italian_stop", + "italian_keywords", + "italian_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "italian_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /italian-index/_analyze +{ + "field": "content", + "text": "Gli studenti studiano nelle università italiane. I loro numeri sono 123456." 
+}
+```
+{% include copy-curl.html %}
+
+The response contains the generated tokens:
+
+```json
+{
+  "tokens": [
+    {"token": "student","start_offset": 4,"end_offset": 12,"type": "<ALPHANUM>","position": 1},
+    {"token": "studian","start_offset": 13,"end_offset": 21,"type": "<ALPHANUM>","position": 2},
+    {"token": "universit","start_offset": 28,"end_offset": 38,"type": "<ALPHANUM>","position": 4},
+    {"token": "italian","start_offset": 39,"end_offset": 47,"type": "<ALPHANUM>","position": 5},
+    {"token": "numer","start_offset": 56,"end_offset": 62,"type": "<ALPHANUM>","position": 8},
+    {"token": "123456","start_offset": 68,"end_offset": 74,"type": "<NUM>","position": 10}
+  ]
+}
+```
\ No newline at end of file
diff --git a/_analyzers/language-analyzers/latvian.md b/_analyzers/language-analyzers/latvian.md
new file mode 100644
index 0000000000..2301759763
--- /dev/null
+++ b/_analyzers/language-analyzers/latvian.md
@@ -0,0 +1,148 @@
+---
+layout: default
+title: Latvian
+parent: Language analyzers
+grand_parent: Analyzers
+nav_order: 230
+---
+
+# Latvian analyzer
+
+The built-in `latvian` analyzer can be applied to a text field using the following command:
+
+```json
+PUT /latvian-index
+{
+  "mappings": {
+    "properties": {
+      "content": {
+        "type": "text",
+        "analyzer": "latvian"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Stem exclusion
+
+You can use `stem_exclusion` with this language analyzer using the following command:
+
+```json
+PUT index_with_stem_exclusion_latvian_analyzer
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "stem_exclusion_latvian_analyzer": {
+          "type": "latvian",
+          "stem_exclusion": ["autoritāte", "apstiprinājums"]
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Latvian analyzer internals
+
+The `latvian` analyzer is built using the following components:
+
+- Tokenizer: `standard`
+
+- Token filters:
+  - lowercase
+  - stop (Latvian)
+  - keyword
+  - stemmer (Latvian)
+
+## Custom Latvian analyzer
+
+You can create a custom Latvian analyzer using the following command:
+
+```json
+PUT /latvian-index
+{
+  "settings": {
+    "analysis": {
+      "filter": {
+        "latvian_stop": {
+          "type": "stop",
+          "stopwords": "_latvian_"
+        },
+        "latvian_stemmer": {
+          "type": "stemmer",
+          "language": "latvian"
+        },
+        "latvian_keywords": {
+          "type": "keyword_marker",
+          "keywords": []
+        }
+      },
+      "analyzer": {
+        "latvian_analyzer": {
+          "type": "custom",
+          "tokenizer": "standard",
+          "filter": [
+            "lowercase",
+            "latvian_stop",
+            "latvian_keywords",
+            "latvian_stemmer"
+          ]
+        }
+      }
+    }
+  },
+  "mappings": {
+    "properties": {
+      "content": {
+        "type": "text",
+        "analyzer": "latvian_analyzer"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Generated tokens
+
+Use the following request to examine the tokens generated using the analyzer:
+
+```json
+POST /latvian-index/_analyze
+{
+  "field": "content",
+  "text": "Studenti mācās Latvijas universitātēs. Viņu numuri ir 123456."
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "student","start_offset": 0,"end_offset": 8,"type": "","position": 0}, + {"token": "māc","start_offset": 9,"end_offset": 14,"type": "","position": 1}, + {"token": "latvij","start_offset": 15,"end_offset": 23,"type": "","position": 2}, + {"token": "universitāt","start_offset": 24,"end_offset": 37,"type": "","position": 3}, + {"token": "vin","start_offset": 39,"end_offset": 43,"type": "","position": 4}, + {"token": "numur","start_offset": 44,"end_offset": 50,"type": "","position": 5}, + {"token": "123456","start_offset": 54,"end_offset": 60,"type": "","position": 7} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/lithuanian.md b/_analyzers/language-analyzers/lithuanian.md new file mode 100644 index 0000000000..ca5966c54e --- /dev/null +++ b/_analyzers/language-analyzers/lithuanian.md @@ -0,0 +1,136 @@ +--- +layout: default +title: Lithuanian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 230 +--- + +# Lithuanian analyzer + +The built-in `lithuanian` analyzer can be applied to a text field using the following command: + +```json +PUT /lithuanian-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "lithuanian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_lithuanian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_lithuanian_analyzer": { + "type": "lithuanian", + "stem_exclusion": ["autoritetas", "patvirtinimas"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Lithuanian analyzer internals + +The `lithuanian` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Lithuanian) + - keyword + - stemmer (Lithuanian) + +## Custom Lithuanian analyzer + +You can create a custom Lithuanian analyzer using the following command: + +```json +PUT /lithuanian-index +{ + "settings": { + "analysis": { + "filter": { + "lithuanian_stop": { + "type": "stop", + "stopwords": "_lithuanian_" + }, + "lithuanian_stemmer": { + "type": "stemmer", + "language": "lithuanian" + }, + "lithuanian_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "lithuanian_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "lithuanian_stop", + "lithuanian_keywords", + "lithuanian_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "lithuanian_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /lithuanian-index/_analyze +{ + "field": "content", + "text": "Studentai mokosi Lietuvos universitetuose. Jų numeriai yra 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "student","start_offset": 0,"end_offset": 9,"type": "","position": 0}, + {"token": "mok","start_offset": 10,"end_offset": 16,"type": "","position": 1}, + {"token": "lietuv","start_offset": 17,"end_offset": 25,"type": "","position": 2}, + {"token": "universitet","start_offset": 26,"end_offset": 41,"type": "","position": 3}, + {"token": "num","start_offset": 46,"end_offset": 54,"type": "","position": 5}, + {"token": "123456","start_offset": 59,"end_offset": 65,"type": "","position": 7} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/norwegian.md b/_analyzers/language-analyzers/norwegian.md new file mode 100644 index 0000000000..cfb04eebf3 --- /dev/null +++ b/_analyzers/language-analyzers/norwegian.md @@ -0,0 +1,137 @@ +--- +layout: default +title: Norwegian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 240 +--- + +# Norwegian analyzer + +The built-in `norwegian` analyzer can be applied to a text field using the following command: + +```json +PUT /norwegian-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "norwegian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_norwegian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_norwegian_analyzer": { + "type": "norwegian", + "stem_exclusion": ["autoritet", "godkjenning"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Norwegian analyzer internals + +The `norwegian` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Norwegian) + - keyword + - stemmer (Norwegian) + +## Custom Norwegian analyzer + +You can create a custom Norwegian analyzer using the following command: + +```json +PUT /norwegian-index +{ + "settings": { + "analysis": { + "filter": { + "norwegian_stop": { + "type": "stop", + "stopwords": "_norwegian_" + }, + "norwegian_stemmer": { + "type": "stemmer", + "language": "norwegian" + }, + "norwegian_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "norwegian_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "norwegian_stop", + "norwegian_keywords", + "norwegian_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "norwegian_analyzer" + } + } + } +} + +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /norwegian-index/_analyze +{ + "field": "content", + "text": "Studentene studerer ved norske universiteter. Deres nummer er 123456." 
+}
+```
+{% include copy-curl.html %}
+
+The response contains the generated tokens:
+
+```json
+{
+  "tokens": [
+    {"token": "student","start_offset": 0,"end_offset": 10,"type": "<ALPHANUM>","position": 0},
+    {"token": "studer","start_offset": 11,"end_offset": 19,"type": "<ALPHANUM>","position": 1},
+    {"token": "norsk","start_offset": 24,"end_offset": 30,"type": "<ALPHANUM>","position": 3},
+    {"token": "universitet","start_offset": 31,"end_offset": 44,"type": "<ALPHANUM>","position": 4},
+    {"token": "numm","start_offset": 52,"end_offset": 58,"type": "<ALPHANUM>","position": 6},
+    {"token": "123456","start_offset": 62,"end_offset": 68,"type": "<NUM>","position": 8}
+  ]
+}
+```
\ No newline at end of file
diff --git a/_analyzers/language-analyzers/persian.md b/_analyzers/language-analyzers/persian.md
new file mode 100644
index 0000000000..40b38656fd
--- /dev/null
+++ b/_analyzers/language-analyzers/persian.md
@@ -0,0 +1,144 @@
+---
+layout: default
+title: Persian
+parent: Language analyzers
+grand_parent: Analyzers
+nav_order: 250
+---
+
+# Persian analyzer
+
+The built-in `persian` analyzer can be applied to a text field using the following command:
+
+```json
+PUT /persian-index
+{
+  "mappings": {
+    "properties": {
+      "content": {
+        "type": "text",
+        "analyzer": "persian"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Stem exclusion
+
+You can use `stem_exclusion` with this language analyzer using the following command:
+
+```json
+PUT index_with_stem_exclusion_persian_analyzer
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "stem_exclusion_persian_analyzer": {
+          "type": "persian",
+          "stem_exclusion": ["حکومت", "تأیید"]
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Persian analyzer internals
+
+The `persian` analyzer is built using the following components:
+
+- Tokenizer: `standard`
+
+- Char filter: `mapping`
+
+- Token filters:
+  - lowercase
+  - decimal_digit
+  - normalization (Arabic)
+  - normalization (Persian)
+  - keyword
+  - stop (Persian)
+
+## Custom Persian analyzer
+
+You can create a custom Persian analyzer using the following command:
+
+```json
+PUT /persian-index
+{
+  "settings": {
+    "analysis": {
+      "filter": {
+        "persian_stop": {
+          "type": "stop",
+          "stopwords": "_persian_"
+        },
+        "persian_keywords": {
+          "type": "keyword_marker",
+          "keywords": []
+        }
+      },
+      "char_filter": {
+        "null_width_replace_with_space": {
+          "type": "mapping",
+          "mappings": [ "\\u200C=>\\u0020"]
+        }
+      },
+      "analyzer": {
+        "persian_analyzer": {
+          "type": "custom",
+          "tokenizer": "standard",
+          "char_filter": [ "null_width_replace_with_space" ],
+          "filter": [
+            "lowercase",
+            "decimal_digit",
+            "arabic_normalization",
+            "persian_normalization",
+            "persian_stop"
+          ]
+        }
+      }
+    }
+  },
+  "mappings": {
+    "properties": {
+      "content": {
+        "type": "text",
+        "analyzer": "persian_analyzer"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Generated tokens
+
+Use the following request to examine the tokens generated using the analyzer:

+```json
+POST /persian-index/_analyze
+{
+  "field": "content",
+  "text": "دانشجویان در دانشگاه‌های ایرانی تحصیل می‌کنند. شماره‌های آن‌ها ۱۲۳۴۵۶ است."
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "دانشجويان","start_offset": 0,"end_offset": 9,"type": "","position": 0}, + {"token": "دانشگاه","start_offset": 13,"end_offset": 20,"type": "","position": 2}, + {"token": "ايراني","start_offset": 25,"end_offset": 31,"type": "","position": 4}, + {"token": "تحصيل","start_offset": 32,"end_offset": 37,"type": "","position": 5}, + {"token": "شماره","start_offset": 47,"end_offset": 52,"type": "","position": 8}, + {"token": "123456","start_offset": 63,"end_offset": 69,"type": "","position": 12} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/portuguese.md b/_analyzers/language-analyzers/portuguese.md new file mode 100644 index 0000000000..166ffa0010 --- /dev/null +++ b/_analyzers/language-analyzers/portuguese.md @@ -0,0 +1,172 @@ +--- +layout: default +title: Portuguese +parent: Language analyzers +grand_parent: Analyzers +nav_order: 260 +--- + +# Portuguese analyzer + +The built-in `portuguese` analyzer can be applied to a text field using the following command: + +```json +PUT /portuguese-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "portuguese" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_portuguese_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_portuguese_analyzer": { + "type": "portuguese", + "stem_exclusion": ["autoridade", "aprovação"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Portuguese analyzer internals + +The `portuguese` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Portuguese) + - keyword + - stemmer (Portuguese) + +## Custom Portuguese analyzer + +You can create a custom Portuguese analyzer using the following command: + +```json +PUT /portuguese-index +{ + "settings": { + "analysis": { + "filter": { + "portuguese_stop": { + "type": "stop", + "stopwords": "_portuguese_" + }, + "portuguese_stemmer": { + "type": "stemmer", + "language": "light_portuguese" + }, + "portuguese_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "portuguese_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "portuguese_stop", + "portuguese_keywords", + "portuguese_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "portuguese_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /portuguese-index/_analyze +{ + "field": "content", + "text": "Os estudantes estudam nas universidades brasileiras. Seus números são 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "estudant", + "start_offset": 3, + "end_offset": 13, + "type": "", + "position": 1 + }, + { + "token": "estudam", + "start_offset": 14, + "end_offset": 21, + "type": "", + "position": 2 + }, + { + "token": "universidad", + "start_offset": 26, + "end_offset": 39, + "type": "", + "position": 4 + }, + { + "token": "brasileir", + "start_offset": 40, + "end_offset": 51, + "type": "", + "position": 5 + }, + { + "token": "numer", + "start_offset": 58, + "end_offset": 65, + "type": "", + "position": 7 + }, + { + "token": "123456", + "start_offset": 70, + "end_offset": 76, + "type": "", + "position": 9 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/romanian.md b/_analyzers/language-analyzers/romanian.md new file mode 100644 index 0000000000..cad0953385 --- /dev/null +++ b/_analyzers/language-analyzers/romanian.md @@ -0,0 +1,172 @@ +--- +layout: default +title: Romanian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 270 +--- + +# Romanian analyzer + +The built-in `romanian` analyzer can be applied to a text field using the following command: + +```json +PUT /romanian-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "romanian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_romanian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_romanian_analyzer": { + "type": "romanian", + "stem_exclusion": ["autoritate", "aprobat"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Romanian analyzer internals + +The `romanian` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Romanian) + - keyword + - stemmer (Romanian) + +## Custom Romanian analyzer + +You can create a custom Romanian analyzer using the following command: + +```json +PUT /romanian-index +{ + "settings": { + "analysis": { + "filter": { + "romanian_stop": { + "type": "stop", + "stopwords": "_romanian_" + }, + "romanian_stemmer": { + "type": "stemmer", + "language": "romanian" + }, + "romanian_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "romanian_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "romanian_stop", + "romanian_keywords", + "romanian_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "romanian_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /romanian-index/_analyze +{ + "field": "content", + "text": "Studenții învață la universitățile din România. Numerele lor sunt 123456." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "studenț", + "start_offset": 0, + "end_offset": 9, + "type": "", + "position": 0 + }, + { + "token": "învaț", + "start_offset": 10, + "end_offset": 16, + "type": "", + "position": 1 + }, + { + "token": "universităț", + "start_offset": 20, + "end_offset": 34, + "type": "", + "position": 3 + }, + { + "token": "român", + "start_offset": 39, + "end_offset": 46, + "type": "", + "position": 5 + }, + { + "token": "numer", + "start_offset": 48, + "end_offset": 56, + "type": "", + "position": 6 + }, + { + "token": "123456", + "start_offset": 66, + "end_offset": 72, + "type": "", + "position": 9 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/russian.md b/_analyzers/language-analyzers/russian.md new file mode 100644 index 0000000000..bd57ba0b27 --- /dev/null +++ b/_analyzers/language-analyzers/russian.md @@ -0,0 +1,172 @@ +--- +layout: default +title: Russian +parent: Language analyzers +grand_parent: Analyzers +nav_order: 280 +--- + +# Russian analyzer + +The built-in `russian` analyzer can be applied to a text field using the following command: + +```json +PUT /russian-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "russian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_russian_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_russian_analyzer": { + "type": "russian", + "stem_exclusion": ["авторитет", "одобрение"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Russian analyzer internals + +The `russian` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - lowercase + - stop (Russian) + - keyword + - stemmer (Russian) + +## Custom Russian analyzer + +You can create a custom Russian analyzer using the following command: + +```json +PUT /russian-index +{ + "settings": { + "analysis": { + "filter": { + "russian_stop": { + "type": "stop", + "stopwords": "_russian_" + }, + "russian_stemmer": { + "type": "stemmer", + "language": "russian" + }, + "russian_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "russian_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "russian_stop", + "russian_keywords", + "russian_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "russian_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /russian-index/_analyze +{ + "field": "content", + "text": "Студенты учатся в университетах России. Их номера 123456." 
+}
+```
+{% include copy-curl.html %}
+
+The response contains the generated tokens:
+
+```json
+{
+  "tokens": [
+    {
+      "token": "студент",
+      "start_offset": 0,
+      "end_offset": 8,
+      "type": "<ALPHANUM>",
+      "position": 0
+    },
+    {
+      "token": "учат",
+      "start_offset": 9,
+      "end_offset": 15,
+      "type": "<ALPHANUM>",
+      "position": 1
+    },
+    {
+      "token": "университет",
+      "start_offset": 18,
+      "end_offset": 31,
+      "type": "<ALPHANUM>",
+      "position": 3
+    },
+    {
+      "token": "росс",
+      "start_offset": 32,
+      "end_offset": 38,
+      "type": "<ALPHANUM>",
+      "position": 4
+    },
+    {
+      "token": "номер",
+      "start_offset": 43,
+      "end_offset": 49,
+      "type": "<ALPHANUM>",
+      "position": 6
+    },
+    {
+      "token": "123456",
+      "start_offset": 50,
+      "end_offset": 56,
+      "type": "<NUM>",
+      "position": 7
+    }
+  ]
+}
+```
\ No newline at end of file
diff --git a/_analyzers/language-analyzers/sorani.md b/_analyzers/language-analyzers/sorani.md
new file mode 100644
index 0000000000..f71d43c481
--- /dev/null
+++ b/_analyzers/language-analyzers/sorani.md
@@ -0,0 +1,168 @@
+---
+layout: default
+title: Sorani
+parent: Language analyzers
+grand_parent: Analyzers
+nav_order: 290
+---
+
+# Sorani analyzer
+
+The built-in `sorani` analyzer can be applied to a text field using the following command:
+
+```json
+PUT /sorani-index
+{
+  "mappings": {
+    "properties": {
+      "content": {
+        "type": "text",
+        "analyzer": "sorani"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Stem exclusion
+
+You can use `stem_exclusion` with this language analyzer using the following command:
+
+```json
+PUT index_with_stem_exclusion_sorani_analyzer
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "stem_exclusion_sorani_analyzer": {
+          "type": "sorani",
+          "stem_exclusion": ["مؤسسه", "اجازه"]
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Sorani analyzer internals
+
+The `sorani` analyzer is built using the following components:
+
+- Tokenizer: `standard`
+
+- Token filters:
+  - normalization (Sorani)
+  - lowercase
+  - decimal_digit
+  - stop (Sorani)
+  - keyword
+  - stemmer (Sorani)
+
+## Custom Sorani analyzer
+
+You can create a custom Sorani analyzer using the following command:
+
+```json
+PUT /sorani-index
+{
+  "settings": {
+    "analysis": {
+      "filter": {
+        "sorani_stop": {
+          "type": "stop",
+          "stopwords": "_sorani_"
+        },
+        "sorani_stemmer": {
+          "type": "stemmer",
+          "language": "sorani"
+        },
+        "sorani_keywords": {
+          "type": "keyword_marker",
+          "keywords": []
+        }
+      },
+      "analyzer": {
+        "sorani_analyzer": {
+          "type": "custom",
+          "tokenizer": "standard",
+          "filter": [
+            "lowercase",
+            "decimal_digit",
+            "sorani_stop",
+            "sorani_keywords",
+            "sorani_stemmer"
+          ]
+        }
+      }
+    }
+  },
+  "mappings": {
+    "properties": {
+      "content": {
+        "type": "text",
+        "analyzer": "sorani_analyzer"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Generated tokens
+
+Use the following request to examine the tokens generated using the analyzer:
+
+```json
+POST /sorani-index/_analyze
+{
+  "field": "content",
+  "text": "خوێندنی فەرمی لە هەولێرەوە. ژمارەکان ١٢٣٤٥٦."
+}
+```
+{% include copy-curl.html %}
+
+The response contains the generated tokens:
+
+```json
+{
+  "tokens": [
+    {
+      "token": "خوێندن",
+      "start_offset": 0,
+      "end_offset": 7,
+      "type": "<ALPHANUM>",
+      "position": 0
+    },
+    {
+      "token": "فەرم",
+      "start_offset": 8,
+      "end_offset": 13,
+      "type": "<ALPHANUM>",
+      "position": 1
+    },
+    {
+      "token": "هەولێر",
+      "start_offset": 17,
+      "end_offset": 26,
+      "type": "<ALPHANUM>",
+      "position": 3
+    },
+    {
+      "token": "ژمار",
+      "start_offset": 28,
+      "end_offset": 36,
+      "type": "<ALPHANUM>",
+      "position": 4
+    },
+    {
+      "token": "123456",
+      "start_offset": 37,
+      "end_offset": 43,
+      "type": "<NUM>",
+      "position": 5
+    }
+  ]
+}
+```
\ No newline at end of file
diff --git a/_analyzers/language-analyzers/spanish.md b/_analyzers/language-analyzers/spanish.md
new file mode 100644
index 0000000000..8a0d8fad3c
--- /dev/null
+++ b/_analyzers/language-analyzers/spanish.md
@@ -0,0 +1,172 @@
+---
+layout: default
+title: Spanish
+parent: Language analyzers
+grand_parent: Analyzers
+nav_order: 300
+---
+
+# Spanish analyzer
+
+The built-in `spanish` analyzer can be applied to a text field using the following command:
+
+```json
+PUT /spanish-index
+{
+  "mappings": {
+    "properties": {
+      "content": {
+        "type": "text",
+        "analyzer": "spanish"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Stem exclusion
+
+You can use `stem_exclusion` with this language analyzer using the following command:
+
+```json
+PUT index_with_stem_exclusion_spanish_analyzer
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "stem_exclusion_spanish_analyzer": {
+          "type": "spanish",
+          "stem_exclusion": ["autoridad", "aprobación"]
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Spanish analyzer internals
+
+The `spanish` analyzer is built using the following components:
+
+- Tokenizer: `standard`
+
+- Token filters:
+  - lowercase
+  - stop (Spanish)
+  - keyword
+  - stemmer (Spanish)
+
+## Custom Spanish analyzer
+
+You can create a custom Spanish analyzer using the following command:
+
+```json
+PUT /spanish-index
+{
+  "settings": {
+    "analysis": {
+      "filter": {
+        "spanish_stop": {
+          "type": "stop",
+          "stopwords": "_spanish_"
+        },
+        "spanish_stemmer": {
+          "type": "stemmer",
+          "language": "light_spanish"
+        },
+        "spanish_keywords": {
+          "type": "keyword_marker",
+          "keywords": []
+        }
+      },
+      "analyzer": {
+        "spanish_analyzer": {
+          "type": "custom",
+          "tokenizer": "standard",
+          "filter": [
+            "lowercase",
+            "spanish_stop",
+            "spanish_keywords",
+            "spanish_stemmer"
+          ]
+        }
+      }
+    }
+  },
+  "mappings": {
+    "properties": {
+      "content": {
+        "type": "text",
+        "analyzer": "spanish_analyzer"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Generated tokens
+
+Use the following request to examine the tokens generated using the analyzer:
+
+```json
+POST /spanish-index/_analyze
+{
+  "field": "content",
+  "text": "Los estudiantes estudian en universidades españolas. Sus números son 123456."
+}
+```
+{% include copy-curl.html %}
+
+The response contains the generated tokens:
+
+```json
+{
+  "tokens": [
+    {
+      "token": "estudiant",
+      "start_offset": 4,
+      "end_offset": 15,
+      "type": "<ALPHANUM>",
+      "position": 1
+    },
+    {
+      "token": "estudian",
+      "start_offset": 16,
+      "end_offset": 24,
+      "type": "<ALPHANUM>",
+      "position": 2
+    },
+    {
+      "token": "universidad",
+      "start_offset": 28,
+      "end_offset": 41,
+      "type": "<ALPHANUM>",
+      "position": 4
+    },
+    {
+      "token": "español",
+      "start_offset": 42,
+      "end_offset": 51,
+      "type": "<ALPHANUM>",
+      "position": 5
+    },
+    {
+      "token": "numer",
+      "start_offset": 57,
+      "end_offset": 64,
+      "type": "<ALPHANUM>",
+      "position": 7
+    },
+    {
+      "token": "123456",
+      "start_offset": 69,
+      "end_offset": 75,
+      "type": "<NUM>",
+      "position": 9
+    }
+  ]
+}
+```
\ No newline at end of file
diff --git a/_analyzers/language-analyzers/swedish.md b/_analyzers/language-analyzers/swedish.md
new file mode 100644
index 0000000000..9da595f12e
--- /dev/null
+++ b/_analyzers/language-analyzers/swedish.md
@@ -0,0 +1,172 @@
+---
+layout: default
+title: Swedish
+parent: Language analyzers
+grand_parent: Analyzers
+nav_order: 310
+---
+
+# Swedish analyzer
+
+The built-in `swedish` analyzer can be applied to a text field using the following command:
+
+```json
+PUT /swedish-index
+{
+  "mappings": {
+    "properties": {
+      "content": {
+        "type": "text",
+        "analyzer": "swedish"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Stem exclusion
+
+You can use `stem_exclusion` with this language analyzer using the following command:
+
+```json
+PUT index_with_stem_exclusion_swedish_analyzer
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "stem_exclusion_swedish_analyzer": {
+          "type": "swedish",
+          "stem_exclusion": ["myndighet", "godkännande"]
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Swedish analyzer internals
+
+The `swedish` analyzer is built using the following components:
+
+- Tokenizer: `standard`
+
+- Token filters:
+  - lowercase
+  - stop (Swedish)
+  - keyword
+  - stemmer (Swedish)
+
+## Custom Swedish analyzer
+
+You can create a custom Swedish analyzer using the following command:
+
+```json
+PUT /swedish-index
+{
+  "settings": {
+    "analysis": {
+      "filter": {
+        "swedish_stop": {
+          "type": "stop",
+          "stopwords": "_swedish_"
+        },
+        "swedish_stemmer": {
+          "type": "stemmer",
+          "language": "swedish"
+        },
+        "swedish_keywords": {
+          "type": "keyword_marker",
+          "keywords": []
+        }
+      },
+      "analyzer": {
+        "swedish_analyzer": {
+          "type": "custom",
+          "tokenizer": "standard",
+          "filter": [
+            "lowercase",
+            "swedish_stop",
+            "swedish_keywords",
+            "swedish_stemmer"
+          ]
+        }
+      }
+    }
+  },
+  "mappings": {
+    "properties": {
+      "content": {
+        "type": "text",
+        "analyzer": "swedish_analyzer"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Generated tokens
+
+Use the following request to examine the tokens generated using the analyzer:
+
+```json
+POST /swedish-index/_analyze
+{
+  "field": "content",
+  "text": "Studenter studerar vid svenska universitet. Deras nummer är 123456."
+}
+```
+{% include copy-curl.html %}
+
+The response contains the generated tokens:
+
+```json
+{
+  "tokens": [
+    {
+      "token": "student",
+      "start_offset": 0,
+      "end_offset": 9,
+      "type": "<ALPHANUM>",
+      "position": 0
+    },
+    {
+      "token": "studer",
+      "start_offset": 10,
+      "end_offset": 18,
+      "type": "<ALPHANUM>",
+      "position": 1
+    },
+    {
+      "token": "svensk",
+      "start_offset": 23,
+      "end_offset": 30,
+      "type": "<ALPHANUM>",
+      "position": 3
+    },
+    {
+      "token": "universitet",
+      "start_offset": 31,
+      "end_offset": 42,
+      "type": "<ALPHANUM>",
+      "position": 4
+    },
+    {
+      "token": "numm",
+      "start_offset": 50,
+      "end_offset": 56,
+      "type": "<ALPHANUM>",
+      "position": 6
+    },
+    {
+      "token": "123456",
+      "start_offset": 60,
+      "end_offset": 66,
+      "type": "<NUM>",
+      "position": 8
+    }
+  ]
+}
+```
\ No newline at end of file
diff --git a/_analyzers/language-analyzers/thai.md b/_analyzers/language-analyzers/thai.md
new file mode 100644
index 0000000000..e4daa1f0be
--- /dev/null
+++ b/_analyzers/language-analyzers/thai.md
@@ -0,0 +1,132 @@
+---
+layout: default
+title: Thai
+parent: Language analyzers
+grand_parent: Analyzers
+nav_order: 320
+---
+
+# Thai analyzer
+
+The built-in `thai` analyzer can be applied to a text field using the following command:
+
+```json
+PUT /thai-index
+{
+  "mappings": {
+    "properties": {
+      "content": {
+        "type": "text",
+        "analyzer": "thai"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Stem exclusion
+
+You can use `stem_exclusion` with this language analyzer using the following command:
+
+```json
+PUT index_with_stem_exclusion_thai_analyzer
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "stem_exclusion_thai_analyzer": {
+          "type": "thai",
+          "stem_exclusion": ["อำนาจ", "การอนุมัติ"]
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Thai analyzer internals
+
+The `thai` analyzer is built using the following components:
+
+- Tokenizer: `thai`
+
+- Token filters:
+  - lowercase
+  - decimal_digit
+  - stop (Thai)
+  - keyword
+
+## Custom Thai analyzer
+
+You can create a custom Thai analyzer using the following command:
+
+```json
+PUT /thai-index
+{
+  "settings": {
+    "analysis": {
+      "filter": {
+        "thai_stop": {
+          "type": "stop",
+          "stopwords": "_thai_"
+        },
+        "thai_keywords": {
+          "type": "keyword_marker",
+          "keywords": []
+        }
+      },
+      "analyzer": {
+        "thai_analyzer": {
+          "tokenizer": "thai",
+          "filter": [
+            "lowercase",
+            "decimal_digit",
+            "thai_stop",
+            "thai_keywords"
+          ]
+        }
+      }
+    }
+  },
+  "mappings": {
+    "properties": {
+      "content": {
+        "type": "text",
+        "analyzer": "thai_analyzer"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Generated tokens
+
+Use the following request to examine the tokens generated using the analyzer:
+
+```json
+POST /thai-index/_analyze
+{
+  "field": "content",
+  "text": "นักเรียนกำลังศึกษาอยู่ที่มหาวิทยาลัยไทย หมายเลข 123456."
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "นักเรียน","start_offset": 0,"end_offset": 8,"type": "word","position": 0}, + {"token": "กำลัง","start_offset": 8,"end_offset": 13,"type": "word","position": 1}, + {"token": "ศึกษา","start_offset": 13,"end_offset": 18,"type": "word","position": 2}, + {"token": "มหาวิทยาลัย","start_offset": 25,"end_offset": 36,"type": "word","position": 5}, + {"token": "ไทย","start_offset": 36,"end_offset": 39,"type": "word","position": 6}, + {"token": "หมายเลข","start_offset": 40,"end_offset": 47,"type": "word","position": 7}, + {"token": "123456","start_offset": 48,"end_offset": 54,"type": "word","position": 8} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/turkish.md b/_analyzers/language-analyzers/turkish.md new file mode 100644 index 0000000000..fb36c5413c --- /dev/null +++ b/_analyzers/language-analyzers/turkish.md @@ -0,0 +1,143 @@ +--- +layout: default +title: Turkish +parent: Language analyzers +grand_parent: Analyzers +nav_order: 330 +--- + +# Turkish analyzer + +The built-in `turkish` analyzer can be applied to a text field using the following command: + +```json +PUT /turkish-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "turkish" + } + } + } +} +``` +{% include copy-curl.html %} + +## Stem exclusion + +You can use `stem_exclusion` with this language analyzer using the following command: + +```json +PUT index_with_stem_exclusion_turkish_analyzer +{ + "settings": { + "analysis": { + "analyzer": { + "stem_exclusion_turkish_analyzer": { + "type": "turkish", + "stem_exclusion": ["otorite", "onay"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Turkish analyzer internals + +The `turkish` analyzer is built using the following components: + +- Tokenizer: `standard` + +- Token filters: + - apostrophe + - lowercase (Turkish) + - stop (Turkish) + - keyword + - stemmer (Turkish) + +## Custom Turkish analyzer + +You can create a custom Turkish analyzer using the following command: + +```json +PUT /turkish-index +{ + "settings": { + "analysis": { + "filter": { + "turkish_stop": { + "type": "stop", + "stopwords": "_turkish_" + }, + "turkish_stemmer": { + "type": "stemmer", + "language": "turkish" + }, + "turkish_lowercase": { + "type": "lowercase", + "language": "turkish" + }, + "turkish_keywords": { + "type": "keyword_marker", + "keywords": [] + } + }, + "analyzer": { + "turkish_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "apostrophe", + "turkish_lowercase", + "turkish_stop", + "turkish_keywords", + "turkish_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "turkish_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /turkish-index/_analyze +{ + "field": "content", + "text": "Öğrenciler Türk üniversitelerinde öğrenim görüyor. Numara 123456." 
+}
+```
+{% include copy-curl.html %}
+
+The response contains the generated tokens:
+
+```json
+{
+  "tokens": [
+    {"token": "öğrenci","start_offset": 0,"end_offset": 10,"type": "<ALPHANUM>","position": 0},
+    {"token": "türk","start_offset": 11,"end_offset": 15,"type": "<ALPHANUM>","position": 1},
+    {"token": "üniversite","start_offset": 16,"end_offset": 33,"type": "<ALPHANUM>","position": 2},
+    {"token": "öğre","start_offset": 34,"end_offset": 41,"type": "<ALPHANUM>","position": 3},
+    {"token": "görüyor","start_offset": 42,"end_offset": 49,"type": "<ALPHANUM>","position": 4},
+    {"token": "numar","start_offset": 51,"end_offset": 57,"type": "<ALPHANUM>","position": 5},
+    {"token": "123456","start_offset": 58,"end_offset": 64,"type": "<NUM>","position": 6}
+  ]
+}
+```
\ No newline at end of file
diff --git a/_analyzers/supported-analyzers/index.md b/_analyzers/supported-analyzers/index.md
index 5616936179..43e41b8d6a 100644
--- a/_analyzers/supported-analyzers/index.md
+++ b/_analyzers/supported-analyzers/index.md
@@ -24,12 +24,12 @@ Analyzer | Analysis performed | Analyzer output
 **Stop** | - Parses strings into tokens on any non-letter character
- Removes non-letter characters
- Removes stop words
- Converts tokens to lowercase | [`s`, `fun`, `contribute`, `brand`, `new`, `pr`, `opensearch`] **Keyword** (no-op) | - Outputs the entire string unchanged | [`It’s fun to contribute a brand-new PR or 2 to OpenSearch!`] **Pattern** | - Parses strings into tokens using regular expressions
- Supports converting strings to lowercase
- Supports removing stop words | [`it`, `s`, `fun`, `to`, `contribute`, `a`,`brand`, `new`, `pr`, `or`, `2`, `to`, `opensearch`] -[**Language**]({{site.url}}{{site.baseurl}}/analyzers/language-analyzers/) | Performs analysis specific to a certain language (for example, `english`). | [`fun`, `contribut`, `brand`, `new`, `pr`, `2`, `opensearch`] +[**Language**]({{site.url}}{{site.baseurl}}/analyzers/language-analyzers/index/) | Performs analysis specific to a certain language (for example, `english`). | [`fun`, `contribut`, `brand`, `new`, `pr`, `2`, `opensearch`] **Fingerprint** | - Parses strings on any non-letter character
- Normalizes characters by converting them to ASCII
- Converts tokens to lowercase
- Sorts, deduplicates, and concatenates tokens into a single token
- Supports removing stop words | [`2 a brand contribute fun it's new opensearch or pr to`]
Note that the apostrophe was converted to its ASCII counterpart. ## Language analyzers -OpenSearch supports multiple language analyzers. For more information, see [Language analyzers]({{site.url}}{{site.baseurl}}/analyzers/language-analyzers/). +OpenSearch supports multiple language analyzers. For more information, see [Language analyzers]({{site.url}}{{site.baseurl}}/analyzers/language-analyzers/index). ## Additional analyzers diff --git a/_analyzers/token-filters/condition.md b/_analyzers/token-filters/condition.md index eb3c348728..5e87c2cbbf 100644 --- a/_analyzers/token-filters/condition.md +++ b/_analyzers/token-filters/condition.md @@ -1,6 +1,6 @@ --- layout: default -title: condition +title: Condition parent: Token filters nav_order: 70 --- diff --git a/_analyzers/token-filters/dictionary-decompounder.md b/_analyzers/token-filters/dictionary-decompounder.md new file mode 100644 index 0000000000..ced6fd6fbc --- /dev/null +++ b/_analyzers/token-filters/dictionary-decompounder.md @@ -0,0 +1,101 @@ +--- +layout: default +title: Dictionary decompounder +parent: Token filters +nav_order: 110 +--- + +# Dictionary decompounder token filter + +The `dictionary_decompounder` token filter is used to split compound words into their constituent parts based on a predefined dictionary. This filter is particularly useful for languages like German, Dutch, or Finnish, in which compound words are common, so breaking them down can improve search relevance. The `dictionary_decompounder` token filter determines whether each token (word) can be split into smaller tokens based on a list of known words. If the token can be split into known words, the filter generates the subtokens for the token. + +## Parameters + +The `dictionary_decompounder` token filter has the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`word_list` | Required unless `word_list_path` is configured | Array of strings | The dictionary of words that the filter uses to split compound words. +`word_list_path` | Required unless `word_list` is configured | String | A file path to a text file containing the dictionary words. Accepts either an absolute path or a path relative to the `config` directory. The dictionary file must be UTF-8 encoded, and each word must be listed on a separate line. +`min_word_size` | Optional | Integer | The minimum length of the entire compound word that will be considered for splitting. If a compound word is shorter than this value, it is not split. Default is `5`. +`min_subword_size` | Optional | Integer | The minimum length for any subword. If a subword is shorter than this value, it is not included in the output. Default is `2`. +`max_subword_size` | Optional | Integer | The maximum length for any subword. If a subword is longer than this value, it is not included in the output. Default is `15`. +`only_longest_match` | Optional | Boolean | If set to `true`, only the longest matching subword will be returned. Default is `false`. 
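+
+If you maintain a large dictionary, you can store it in a file and reference it with `word_list_path` instead of listing every word inline. The following is a minimal sketch of that configuration; the file path `analysis/decompound_dictionary.txt` and the index and filter names are illustrative only, and the file is assumed to be UTF-8 encoded with one word per line in the `config` directory:
+
+```json
+PUT /decompound_file_example
+{
+  "settings": {
+    "analysis": {
+      "filter": {
+        "file_dictionary_decompounder": {
+          "type": "dictionary_decompounder",
+          "word_list_path": "analysis/decompound_dictionary.txt"
+        }
+      },
+      "analyzer": {
+        "my_analyzer": {
+          "type": "custom",
+          "tokenizer": "standard",
+          "filter": ["lowercase", "file_dictionary_decompounder"]
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}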
+
+## Example
+
+The following example request creates a new index named `decompound_example` and configures an analyzer with the `dictionary_decompounder` filter:
+
+```json
+PUT /decompound_example
+{
+  "settings": {
+    "analysis": {
+      "filter": {
+        "my_dictionary_decompounder": {
+          "type": "dictionary_decompounder",
+          "word_list": ["slow", "green", "turtle"]
+        }
+      },
+      "analyzer": {
+        "my_analyzer": {
+          "type": "custom",
+          "tokenizer": "standard",
+          "filter": ["lowercase", "my_dictionary_decompounder"]
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Generated tokens
+
+Use the following request to examine the tokens generated using the analyzer:
+
+```json
+POST /decompound_example/_analyze
+{
+  "analyzer": "my_analyzer",
+  "text": "slowgreenturtleswim"
+}
+```
+{% include copy-curl.html %}
+
+The response contains the generated tokens:
+
+```json
+{
+  "tokens": [
+    {
+      "token": "slowgreenturtleswim",
+      "start_offset": 0,
+      "end_offset": 19,
+      "type": "<ALPHANUM>",
+      "position": 0
+    },
+    {
+      "token": "slow",
+      "start_offset": 0,
+      "end_offset": 19,
+      "type": "<ALPHANUM>",
+      "position": 0
+    },
+    {
+      "token": "green",
+      "start_offset": 0,
+      "end_offset": 19,
+      "type": "<ALPHANUM>",
+      "position": 0
+    },
+    {
+      "token": "turtle",
+      "start_offset": 0,
+      "end_offset": 19,
+      "type": "<ALPHANUM>",
+      "position": 0
+    }
+  ]
+}
+```
diff --git a/_analyzers/token-filters/edge-ngram.md b/_analyzers/token-filters/edge-ngram.md
new file mode 100644
index 0000000000..be3eaf6fab
--- /dev/null
+++ b/_analyzers/token-filters/edge-ngram.md
@@ -0,0 +1,111 @@
+---
+layout: default
+title: Edge n-gram
+parent: Token filters
+nav_order: 120
+---
+
+# Edge n-gram token filter
+
+The `edge_ngram` token filter is very similar to the `ngram` token filter, where a particular string is split into substrings of different lengths. The `edge_ngram` token filter, however, generates n-grams (substrings) only from the beginning (edge) of a token. It's particularly useful in scenarios like autocomplete or prefix matching, where you want to match the beginning of words or phrases as the user types them.
+
+## Parameters
+
+The `edge_ngram` token filter can be configured with the following parameters.
+
+Parameter | Required/Optional | Data type | Description
+:--- | :--- | :--- | :---
+`min_gram` | Optional | Integer | The minimum length of the n-grams that will be generated. Default is `1`.
+`max_gram` | Optional | Integer | The maximum length of the n-grams that will be generated. Default is `1` for the `edge_ngram` filter and `2` for custom token filters. Avoid setting this parameter to a low value. If the value is set too low, only very short n-grams will be generated and the search term will not be found. For example, if `max_gram` is set to `3` and you index the word "banana", the longest generated token will be "ban". If the user searches for "banana", no matches will be returned. You can use the `truncate` token filter as a search analyzer to mitigate this risk, as shown in the sketch following this table.
+`preserve_original` | Optional | Boolean | Includes the original token in the output. Default is `false`.
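+
+As noted in the `max_gram` description, truncating search terms to the indexed n-gram length helps avoid missed matches. The following minimal sketch pairs an `edge_ngram` index analyzer with a search analyzer whose `truncate` filter caps terms at the same `max_gram` length; the index, analyzer, filter, and field names are illustrative assumptions, not part of the example above:
+
+```json
+PUT /autocomplete_example
+{
+  "settings": {
+    "analysis": {
+      "filter": {
+        "my_edge_ngram": {
+          "type": "edge_ngram",
+          "min_gram": 1,
+          "max_gram": 3
+        },
+        "truncate_to_max_gram": {
+          "type": "truncate",
+          "length": 3
+        }
+      },
+      "analyzer": {
+        "index_autocomplete": {
+          "type": "custom",
+          "tokenizer": "standard",
+          "filter": ["lowercase", "my_edge_ngram"]
+        },
+        "search_autocomplete": {
+          "type": "custom",
+          "tokenizer": "standard",
+          "filter": ["lowercase", "truncate_to_max_gram"]
+        }
+      }
+    }
+  },
+  "mappings": {
+    "properties": {
+      "title": {
+        "type": "text",
+        "analyzer": "index_autocomplete",
+        "search_analyzer": "search_autocomplete"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}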
+
+## Example
+
+The following example request creates a new index named `edge_ngram_example` and configures an analyzer with the `edge_ngram` filter:
+
+```json
+PUT /edge_ngram_example
+{
+  "settings": {
+    "analysis": {
+      "filter": {
+        "my_edge_ngram": {
+          "type": "edge_ngram",
+          "min_gram": 3,
+          "max_gram": 4
+        }
+      },
+      "analyzer": {
+        "my_analyzer": {
+          "type": "custom",
+          "tokenizer": "standard",
+          "filter": ["lowercase", "my_edge_ngram"]
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Generated tokens
+
+Use the following request to examine the tokens generated using the analyzer:
+
+```json
+POST /edge_ngram_example/_analyze
+{
+  "analyzer": "my_analyzer",
+  "text": "slow green turtle"
+}
+```
+{% include copy-curl.html %}
+
+The response contains the generated tokens:
+
+```json
+{
+  "tokens": [
+    {
+      "token": "slo",
+      "start_offset": 0,
+      "end_offset": 4,
+      "type": "<ALPHANUM>",
+      "position": 0
+    },
+    {
+      "token": "slow",
+      "start_offset": 0,
+      "end_offset": 4,
+      "type": "<ALPHANUM>",
+      "position": 0
+    },
+    {
+      "token": "gre",
+      "start_offset": 5,
+      "end_offset": 10,
+      "type": "<ALPHANUM>",
+      "position": 1
+    },
+    {
+      "token": "gree",
+      "start_offset": 5,
+      "end_offset": 10,
+      "type": "<ALPHANUM>",
+      "position": 1
+    },
+    {
+      "token": "tur",
+      "start_offset": 11,
+      "end_offset": 17,
+      "type": "<ALPHANUM>",
+      "position": 2
+    },
+    {
+      "token": "turt",
+      "start_offset": 11,
+      "end_offset": 17,
+      "type": "<ALPHANUM>",
+      "position": 2
+    }
+  ]
+}
+```
diff --git a/_analyzers/token-filters/elision.md b/_analyzers/token-filters/elision.md
new file mode 100644
index 0000000000..abc6dba658
--- /dev/null
+++ b/_analyzers/token-filters/elision.md
@@ -0,0 +1,124 @@
+---
+layout: default
+title: Elision
+parent: Token filters
+nav_order: 130
+---
+
+# Elision token filter
+
+The `elision` token filter is used to remove elided characters from words in certain languages. Elision typically occurs in languages such as French, in which words are often contracted and combined with the following word, typically by omitting a vowel and replacing it with an apostrophe.
+
+The `elision` token filter is already preconfigured in the following [language analyzers]({{site.url}}{{site.baseurl}}/analyzers/language-analyzers/): `catalan`, `french`, `irish`, and `italian`.
+{: .note}
+
+## Parameters
+
+The custom `elision` token filter can be configured with the following parameters.
+
+Parameter | Required/Optional | Data type | Description
+:--- | :--- | :--- | :---
+`articles` | Required if `articles_path` is not configured | Array of strings | Defines which articles or short words should be removed when they appear as part of an elision.
+`articles_path` | Required if `articles` is not configured | String | Specifies the path to a custom list of articles that should be removed during the analysis process.
+`articles_case` | Optional | Boolean | Specifies whether the filter is case sensitive when matching elisions. Default is `false`.
+
+## Example
+
+The default set of French elisions is `l'`, `m'`, `t'`, `qu'`, `n'`, `s'`, `j'`, `d'`, `c'`, `jusqu'`, `quoiqu'`, `lorsqu'`, and `puisqu'`. You can update this by configuring the `french_elision` token filter.
+The following example request creates a new index named `french_texts` and configures an analyzer with a `french_elision` filter:
+
+```json
+PUT /french_texts
+{
+  "settings": {
+    "analysis": {
+      "filter": {
+        "french_elision": {
+          "type": "elision",
+          "articles": [ "l", "t", "m", "d", "n", "s", "j" ]
+        }
+      },
+      "analyzer": {
+        "french_analyzer": {
+          "type": "custom",
+          "tokenizer": "standard",
+          "filter": ["lowercase", "french_elision"]
+        }
+      }
+    }
+  },
+  "mappings": {
+    "properties": {
+      "text": {
+        "type": "text",
+        "analyzer": "french_analyzer"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Generated tokens
+
+Use the following request to examine the tokens generated using the analyzer:
+
+```json
+POST /french_texts/_analyze
+{
+  "analyzer": "french_analyzer",
+  "text": "L'étudiant aime l'école et le travail."
+}
+```
+{% include copy-curl.html %}
+
+The response contains the generated tokens:
+
+```json
+{
+  "tokens": [
+    {
+      "token": "étudiant",
+      "start_offset": 0,
+      "end_offset": 10,
+      "type": "<ALPHANUM>",
+      "position": 0
+    },
+    {
+      "token": "aime",
+      "start_offset": 11,
+      "end_offset": 15,
+      "type": "<ALPHANUM>",
+      "position": 1
+    },
+    {
+      "token": "école",
+      "start_offset": 16,
+      "end_offset": 23,
+      "type": "<ALPHANUM>",
+      "position": 2
+    },
+    {
+      "token": "et",
+      "start_offset": 24,
+      "end_offset": 26,
+      "type": "<ALPHANUM>",
+      "position": 3
+    },
+    {
+      "token": "le",
+      "start_offset": 27,
+      "end_offset": 29,
+      "type": "<ALPHANUM>",
+      "position": 4
+    },
+    {
+      "token": "travail",
+      "start_offset": 30,
+      "end_offset": 37,
+      "type": "<ALPHANUM>",
+      "position": 5
+    }
+  ]
+}
+```
diff --git a/_analyzers/token-filters/fingerprint.md b/_analyzers/token-filters/fingerprint.md
new file mode 100644
index 0000000000..75c6615459
--- /dev/null
+++ b/_analyzers/token-filters/fingerprint.md
@@ -0,0 +1,86 @@
+---
+layout: default
+title: Fingerprint
+parent: Token filters
+nav_order: 140
+---
+
+# Fingerprint token filter
+
+The `fingerprint` token filter is used to standardize and deduplicate text. This is particularly useful when consistency in text processing is crucial. The `fingerprint` token filter achieves this by processing text using the following steps:
+
+1. **Lowercasing**: Converts all text to lowercase.
+2. **Splitting**: Breaks the text into tokens.
+3. **Sorting**: Arranges the tokens in alphabetical order.
+4. **Removing duplicates**: Eliminates repeated tokens.
+5. **Joining tokens**: Combines the tokens into a single string, typically joined by a space or another specified separator.
+
+## Parameters
+
+The `fingerprint` token filter can be configured with the following two parameters.
+
+Parameter | Required/Optional | Data type | Description
+:--- | :--- | :--- | :---
+`max_output_size` | Optional | Integer | Limits the length of the generated fingerprint string. If the concatenated string exceeds the `max_output_size`, the filter will not produce any output, resulting in an empty token. Default is `255`.
+`separator` | Optional | String | Defines the character(s) used to join the tokens into a single string after they have been sorted and deduplicated. Default is space (`" "`).
+ +## Example + +The following example request creates a new index named `my_index` and configures an analyzer with a `fingerprint` token filter: + +```json +PUT /my_index +{ + "settings": { + "analysis": { + "filter": { + "my_fingerprint": { + "type": "fingerprint", + "max_output_size": 200, + "separator": "-" + } + }, + "analyzer": { + "my_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "my_fingerprint" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my_index/_analyze +{ + "analyzer": "my_analyzer", + "text": "OpenSearch is a powerful search engine that scales easily" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "a-easily-engine-is-opensearch-powerful-scales-search-that", + "start_offset": 0, + "end_offset": 57, + "type": "fingerprint", + "position": 0 + } + ] +} +``` diff --git a/_analyzers/token-filters/flatten-graph.md b/_analyzers/token-filters/flatten-graph.md new file mode 100644 index 0000000000..8d51c57400 --- /dev/null +++ b/_analyzers/token-filters/flatten-graph.md @@ -0,0 +1,109 @@ +--- +layout: default +title: Flatten graph +parent: Token filters +nav_order: 150 +--- + +# Flatten graph token filter + +The `flatten_graph` token filter is used to handle complex token relationships that occur when multiple tokens are generated at the same position in a graph structure. Some token filters, like `synonym_graph` and `word_delimiter_graph`, generate multi-position tokens---tokens that overlap or span multiple positions. These token graphs are useful for search queries but are not directly supported during indexing. The `flatten_graph` token filter resolves multi-position tokens into a linear sequence of tokens. Flattening the graph ensures compatibility with the indexing process. + +Token graph flattening is a lossy process. Whenever possible, avoid using the `flatten_graph` filter. Instead, apply graph token filters exclusively in search analyzers, removing the need for the `flatten_graph` filter. 
+{: .important}
+
+## Example
+
+The following example request creates a new index named `test_index` and configures an analyzer with a `flatten_graph` filter:
+
+```json
+PUT /test_index
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "my_index_analyzer": {
+          "type": "custom",
+          "tokenizer": "standard",
+          "filter": [
+            "my_custom_filter",
+            "flatten_graph"
+          ]
+        }
+      },
+      "filter": {
+        "my_custom_filter": {
+          "type": "word_delimiter_graph",
+          "catenate_all": true
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Generated tokens
+
+Use the following request to examine the tokens generated using the analyzer:
+
+```json
+POST /test_index/_analyze
+{
+  "analyzer": "my_index_analyzer",
+  "text": "OpenSearch helped many employers"
+}
+```
+{% include copy-curl.html %}
+
+The response contains the generated tokens:
+
+```json
+{
+  "tokens": [
+    {
+      "token": "OpenSearch",
+      "start_offset": 0,
+      "end_offset": 10,
+      "type": "<ALPHANUM>",
+      "position": 0,
+      "positionLength": 2
+    },
+    {
+      "token": "Open",
+      "start_offset": 0,
+      "end_offset": 4,
+      "type": "<ALPHANUM>",
+      "position": 0
+    },
+    {
+      "token": "Search",
+      "start_offset": 4,
+      "end_offset": 10,
+      "type": "<ALPHANUM>",
+      "position": 1
+    },
+    {
+      "token": "helped",
+      "start_offset": 11,
+      "end_offset": 17,
+      "type": "<ALPHANUM>",
+      "position": 2
+    },
+    {
+      "token": "many",
+      "start_offset": 18,
+      "end_offset": 22,
+      "type": "<ALPHANUM>",
+      "position": 3
+    },
+    {
+      "token": "employers",
+      "start_offset": 23,
+      "end_offset": 32,
+      "type": "<ALPHANUM>",
+      "position": 4
+    }
+  ]
+}
+```
diff --git a/_analyzers/token-filters/hunspell.md b/_analyzers/token-filters/hunspell.md
new file mode 100644
index 0000000000..6720ba74de
--- /dev/null
+++ b/_analyzers/token-filters/hunspell.md
@@ -0,0 +1,108 @@
+---
+layout: default
+title: Hunspell
+parent: Token filters
+nav_order: 160
+---
+
+# Hunspell token filter
+
+The `hunspell` token filter is used for stemming and morphological analysis of words in a specific language. This filter applies Hunspell dictionaries, which are widely used in spell checkers. It works by breaking down words into their root forms (stemming).
+
+The Hunspell dictionary files are automatically loaded at startup from the `<OS_PATH_CONF>/hunspell/<locale>` directory. For example, the `en_GB` locale must have at least one `.aff` file and one or more `.dic` files in the `<OS_PATH_CONF>/hunspell/en_GB/` directory.
+
+You can download these files from [LibreOffice dictionaries](https://github.com/LibreOffice/dictionaries).
+
+## Parameters
+
+The `hunspell` token filter can be configured with the following parameters.
+
+Parameter | Required/Optional | Data type | Description
+:--- | :--- | :--- | :---
+`language/lang/locale` | At least one of the three is required | String | Specifies the language for the Hunspell dictionary.
+`dedup` | Optional | Boolean | Determines whether to remove multiple duplicate stemming terms for the same token. Default is `true`.
+`dictionary` | Optional | Array of strings | Configures the dictionary files to be used for the Hunspell dictionary. Default is all files in the `<OS_PATH_CONF>/hunspell/<locale>` directory.
+`longest_only` | Optional | Boolean | Specifies whether only the longest stemmed version of the token should be returned. Default is `false`.
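+
+If a locale directory contains several dictionary files, the `dictionary` parameter lets you restrict the filter to specific ones. The following is a minimal sketch of that configuration; the file names `en_GB.dic` and `custom.dic` are hypothetical and assumed to exist in the `<OS_PATH_CONF>/hunspell/en_GB/` directory:
+
+```json
+PUT /hunspell_dictionary_example
+{
+  "settings": {
+    "analysis": {
+      "filter": {
+        "my_gb_stemmer": {
+          "type": "hunspell",
+          "locale": "en_GB",
+          "dictionary": ["en_GB.dic", "custom.dic"]
+        }
+      },
+      "analyzer": {
+        "my_analyzer": {
+          "type": "custom",
+          "tokenizer": "standard",
+          "filter": ["lowercase", "my_gb_stemmer"]
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}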
+
+## Example
+
+The following example request creates a new index named `my_index` and configures an analyzer with a `hunspell` filter:
+
+```json
+PUT /my_index
+{
+  "settings": {
+    "analysis": {
+      "filter": {
+        "my_hunspell_filter": {
+          "type": "hunspell",
+          "lang": "en_GB",
+          "dedup": true,
+          "longest_only": true
+        }
+      },
+      "analyzer": {
+        "my_analyzer": {
+          "type": "custom",
+          "tokenizer": "standard",
+          "filter": [
+            "lowercase",
+            "my_hunspell_filter"
+          ]
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Generated tokens
+
+Use the following request to examine the tokens generated using the analyzer:
+
+```json
+POST /my_index/_analyze
+{
+  "analyzer": "my_analyzer",
+  "text": "the turtle moves slowly"
+}
+```
+{% include copy-curl.html %}
+
+The response contains the generated tokens:
+
+```json
+{
+  "tokens": [
+    {
+      "token": "the",
+      "start_offset": 0,
+      "end_offset": 3,
+      "type": "<ALPHANUM>",
+      "position": 0
+    },
+    {
+      "token": "turtle",
+      "start_offset": 4,
+      "end_offset": 10,
+      "type": "<ALPHANUM>",
+      "position": 1
+    },
+    {
+      "token": "move",
+      "start_offset": 11,
+      "end_offset": 16,
+      "type": "<ALPHANUM>",
+      "position": 2
+    },
+    {
+      "token": "slow",
+      "start_offset": 17,
+      "end_offset": 23,
+      "type": "<ALPHANUM>",
+      "position": 3
+    }
+  ]
+}
+```
diff --git a/_analyzers/token-filters/hyphenation-decompounder.md b/_analyzers/token-filters/hyphenation-decompounder.md
new file mode 100644
index 0000000000..6e53d4dfd5
--- /dev/null
+++ b/_analyzers/token-filters/hyphenation-decompounder.md
@@ -0,0 +1,102 @@
+---
+layout: default
+title: Hyphenation decompounder
+parent: Token filters
+nav_order: 170
+---
+
+# Hyphenation decompounder token filter
+
+The `hyphenation_decompounder` token filter is used to break down compound words into their constituent parts. This filter is particularly useful for languages like German, Dutch, and Swedish, in which compound words are common. The filter uses hyphenation patterns (typically defined in .xml files) to identify the possible locations within a compound word where it can be split into components. These components are then checked against a provided dictionary. If there is a match, those components are treated as valid tokens. For more information about hyphenation pattern files, see [FOP XML Hyphenation Patterns](https://offo.sourceforge.net/#FOP+XML+Hyphenation+Patterns).
+
+## Parameters
+
+The `hyphenation_decompounder` token filter can be configured with the following parameters.
+
+Parameter | Required/Optional | Data type | Description
+:--- | :--- | :--- | :---
+`hyphenation_patterns_path` | Required | String | The path (relative to the `config` directory or absolute) to the hyphenation patterns file, which contains the language-specific rules for word splitting. The file is typically in XML format. Sample files can be downloaded from the [OFFO SourceForge project](https://sourceforge.net/projects/offo/).
+`word_list` | Required if `word_list_path` is not set | Array of strings | A list of words used to validate the components generated by the hyphenation patterns.
+`word_list_path` | Required if `word_list` is not set | String | The path (relative to the `config` directory or absolute) to a list of subwords.
+`max_subword_size` | Optional | Integer | The maximum subword length. If the generated subword exceeds this length, it will not be added to the generated tokens. Default is `15`.
+`min_subword_size` | Optional | Integer | The minimum subword length. If the generated subword is shorter than the specified length, it will not be added to the generated tokens. Default is `2`.
+`min_word_size` | Optional | Integer | The minimum word character length. Word tokens shorter than this length are excluded from decomposition into subwords. Default is `5`.
+`only_longest_match` | Optional | Boolean | Only includes the longest subword in the generated tokens. Default is `false`.
+
+## Example
+
+The following example request creates a new index named `test_index` and configures an analyzer with a `hyphenation_decompounder` filter:
+
+```json
+PUT /test_index
+{
+  "settings": {
+    "analysis": {
+      "filter": {
+        "my_hyphenation_decompounder": {
+          "type": "hyphenation_decompounder",
+          "hyphenation_patterns_path": "analysis/hyphenation_patterns.xml",
+          "word_list": ["notebook", "note", "book"],
+          "min_subword_size": 3,
+          "min_word_size": 5,
+          "only_longest_match": false
+        }
+      },
+      "analyzer": {
+        "my_analyzer": {
+          "type": "custom",
+          "tokenizer": "standard",
+          "filter": [
+            "lowercase",
+            "my_hyphenation_decompounder"
+          ]
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Generated tokens
+
+Use the following request to examine the tokens generated using the analyzer:
+
+```json
+POST /test_index/_analyze
+{
+  "analyzer": "my_analyzer",
+  "text": "notebook"
+}
+```
+{% include copy-curl.html %}
+
+The response contains the generated tokens:
+
+```json
+{
+  "tokens": [
+    {
+      "token": "notebook",
+      "start_offset": 0,
+      "end_offset": 8,
+      "type": "<ALPHANUM>",
+      "position": 0
+    },
+    {
+      "token": "note",
+      "start_offset": 0,
+      "end_offset": 8,
+      "type": "<ALPHANUM>",
+      "position": 0
+    },
+    {
+      "token": "book",
+      "start_offset": 0,
+      "end_offset": 8,
+      "type": "<ALPHANUM>",
+      "position": 0
+    }
+  ]
+}
+```
diff --git a/_analyzers/token-filters/index.md b/_analyzers/token-filters/index.md
index 9976feed60..b06489c805 100644
--- a/_analyzers/token-filters/index.md
+++ b/_analyzers/token-filters/index.md
@@ -17,7 +17,7 @@ The following table lists all token filters that OpenSearch supports.
 Token filter | Underlying Lucene token filter| Description
 [`apostrophe`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/apostrophe/) | [ApostropheFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/tr/ApostropheFilter.html) | In each token containing an apostrophe, the `apostrophe` token filter removes the apostrophe itself and all characters following it.
 [`asciifolding`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/asciifolding/) | [ASCIIFoldingFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilter.html) | Converts alphabetic, numeric, and symbolic characters.
-`cjk_bigram` | [CJKBigramFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/cjk/CJKBigramFilter.html) | Forms bigrams of Chinese, Japanese, and Korean (CJK) tokens.
+[`cjk_bigram`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/cjk-bigram/) | [CJKBigramFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/cjk/CJKBigramFilter.html) | Forms bigrams of Chinese, Japanese, and Korean (CJK) tokens.
 [`cjk_width`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/cjk-width/) | [CJKWidthFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/cjk/CJKWidthFilter.html) | Normalizes Chinese, Japanese, and Korean (CJK) tokens according to the following rules:
- Folds full-width ASCII character variants into their equivalent basic Latin characters.
- Folds half-width katakana character variants into their equivalent kana characters. [`classic`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/classic) | [ClassicFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/classic/ClassicFilter.html) | Performs optional post-processing on the tokens generated by the classic tokenizer. Removes possessives (`'s`) and removes `.` from acronyms. [`common_grams`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/common_gram/) | [CommonGramsFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/commongrams/CommonGramsFilter.html) | Generates bigrams for a list of frequently occurring terms. The output contains both single terms and bigrams. @@ -25,43 +25,43 @@ Token filter | Underlying Lucene token filter| Description [`decimal_digit`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/decimal-digit/) | [DecimalDigitFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/core/DecimalDigitFilter.html) | Converts all digits in the Unicode decimal number general category to basic Latin digits (0--9). [`delimited_payload`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/delimited-payload/) | [DelimitedPayloadTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/payloads/DelimitedPayloadTokenFilter.html) | Separates a token stream into tokens with corresponding payloads, based on a provided delimiter. A token consists of all characters preceding the delimiter, and a payload consists of all characters following the delimiter. For example, if the delimiter is `|`, then for the string `foo|bar`, `foo` is the token and `bar` is the payload. [`delimited_term_freq`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/delimited-term-frequency/) | [DelimitedTermFrequencyTokenFilter](https://lucene.apache.org/core/9_7_0/analysis/common/org/apache/lucene/analysis/miscellaneous/DelimitedTermFrequencyTokenFilter.html) | Separates a token stream into tokens with corresponding term frequencies, based on a provided delimiter. A token consists of all characters before the delimiter, and a term frequency is the integer after the delimiter. For example, if the delimiter is `|`, then for the string `foo|5`, `foo` is the token and `5` is the term frequency. -`dictionary_decompounder` | [DictionaryCompoundWordTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.html) | Decomposes compound words found in many Germanic languages. -`edge_ngram` | [EdgeNGramTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.html) | Tokenizes the given token into edge n-grams (n-grams that start at the beginning of the token) of lengths between `min_gram` and `max_gram`. Optionally, keeps the original token. -`elision` | [ElisionFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/util/ElisionFilter.html) | Removes the specified [elisions](https://en.wikipedia.org/wiki/Elision) from the beginning of tokens. For example, changes `l'avion` (the plane) to `avion` (plane). -`fingerprint` | [FingerprintFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.html) | Sorts and deduplicates the token list and concatenates tokens into a single token. 
-`flatten_graph` | [FlattenGraphFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/core/FlattenGraphFilter.html) | Flattens a token graph produced by a graph token filter, such as `synonym_graph` or `word_delimiter_graph`, making the graph suitable for indexing. -`hunspell` | [HunspellStemFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/hunspell/HunspellStemFilter.html) | Uses [Hunspell](https://en.wikipedia.org/wiki/Hunspell) rules to stem tokens. Because Hunspell supports a word having multiple stems, this filter can emit multiple tokens for each consumed token. Requires you to configure one or more language-specific Hunspell dictionaries. -`hyphenation_decompounder` | [HyphenationCompoundWordTokenFilter](https://lucene.apache.org/core/9_8_0/analysis/common/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.html) | Uses XML-based hyphenation patterns to find potential subwords in compound words and checks the subwords against the specified word list. The token output contains only the subwords found in the word list. -`keep_types` | [TypeTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/core/TypeTokenFilter.html) | Keeps or removes tokens of a specific type. -`keep_word` | [KeepWordFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/KeepWordFilter.html) | Checks the tokens against the specified word list and keeps only those that are in the list. -`keyword_marker` | [KeywordMarkerFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/KeywordMarkerFilter.html) | Marks specified tokens as keywords, preventing them from being stemmed. -`keyword_repeat` | [KeywordRepeatFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/KeywordRepeatFilter.html) | Emits each incoming token twice: once as a keyword and once as a non-keyword. -`kstem` | [KStemFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/en/KStemFilter.html) | Provides kstem-based stemming for the English language. Combines algorithmic stemming with a built-in dictionary. -`kuromoji_completion` | [JapaneseCompletionFilter](https://lucene.apache.org/core/9_10_0/analysis/kuromoji/org/apache/lucene/analysis/ja/JapaneseCompletionFilter.html) | Adds Japanese romanized terms to the token stream (in addition to the original tokens). Usually used to support autocomplete on Japanese search terms. Note that the filter has a `mode` parameter, which should be set to `index` when used in an index analyzer and `query` when used in a search analyzer. Requires the `analysis-kuromoji` plugin. For information about installing the plugin, see [Additional plugins]({{site.url}}{{site.baseurl}}/install-and-configure/plugins/#additional-plugins). -`length` | [LengthFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/LengthFilter.html) | Removes tokens whose lengths are shorter or longer than the length range specified by `min` and `max`. -`limit` | [LimitTokenCountFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/LimitTokenCountFilter.html) | Limits the number of output tokens. A common use case is to limit the size of document field values based on token count. 
-`lowercase` | [LowerCaseFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/core/LowerCaseFilter.html) | Converts tokens to lowercase. The default [LowerCaseFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/core/LowerCaseFilter.html) is for the English language. You can set the `language` parameter to `greek` (uses [GreekLowerCaseFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/el/GreekLowerCaseFilter.html)), `irish` (uses [IrishLowerCaseFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/ga/IrishLowerCaseFilter.html)), or `turkish` (uses [TurkishLowerCaseFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/tr/TurkishLowerCaseFilter.html)). -`min_hash` | [MinHashFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/minhash/MinHashFilter.html) | Uses the [MinHash technique](https://en.wikipedia.org/wiki/MinHash) to estimate document similarity. Performs the following operations on a token stream sequentially:
1. Hashes each token in the stream.
2. Assigns the hashes to buckets, keeping only the smallest hashes of each bucket.
3. Outputs the smallest hash from each bucket as a token stream. -`multiplexer` | N/A | Emits multiple tokens at the same position. Runs each token through each of the specified filter lists separately and outputs the results as separate tokens. -`ngram` | [NGramTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/ngram/NGramTokenFilter.html) | Tokenizes the given token into n-grams of lengths between `min_gram` and `max_gram`. -Normalization | `arabic_normalization`: [ArabicNormalizer](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/ar/ArabicNormalizer.html)
`german_normalization`: [GermanNormalizationFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/de/GermanNormalizationFilter.html)
`hindi_normalization`: [HindiNormalizer](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/hi/HindiNormalizer.html)
`indic_normalization`: [IndicNormalizer](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/in/IndicNormalizer.html)
`sorani_normalization`: [SoraniNormalizer](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/ckb/SoraniNormalizer.html)
`persian_normalization`: [PersianNormalizer](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/fa/PersianNormalizer.html)
`scandinavian_normalization` : [ScandinavianNormalizationFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/ScandinavianNormalizationFilter.html)
`scandinavian_folding`: [ScandinavianFoldingFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/ScandinavianFoldingFilter.html)
`serbian_normalization`: [SerbianNormalizationFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/sr/SerbianNormalizationFilter.html) | Normalizes the characters of one of the listed languages. -`pattern_capture` | N/A | Generates a token for every capture group in the provided regular expression. Uses [Java regular expression syntax](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html). -`pattern_replace` | N/A | Matches a pattern in the provided regular expression and replaces matching substrings. Uses [Java regular expression syntax](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html). -`phonetic` | N/A | Uses a phonetic encoder to emit a metaphone token for each token in the token stream. Requires installing the `analysis-phonetic` plugin. -`porter_stem` | [PorterStemFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/en/PorterStemFilter.html) | Uses the [Porter stemming algorithm](https://tartarus.org/martin/PorterStemmer/) to perform algorithmic stemming for the English language. -`predicate_token_filter` | N/A | Removes tokens that don’t match the specified predicate script. Supports inline Painless scripts only. -`remove_duplicates` | [RemoveDuplicatesTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/RemoveDuplicatesTokenFilter.html) | Removes duplicate tokens that are in the same position. -`reverse` | [ReverseStringFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/reverse/ReverseStringFilter.html) | Reverses the string corresponding to each token in the token stream. For example, the token `dog` becomes `god`. -`shingle` | [ShingleFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/shingle/ShingleFilter.html) | Generates shingles of lengths between `min_shingle_size` and `max_shingle_size` for tokens in the token stream. Shingles are similar to n-grams but apply to words instead of letters. For example, two-word shingles added to the list of unigrams [`contribute`, `to`, `opensearch`] are [`contribute to`, `to opensearch`]. -`snowball` | N/A | Stems words using a [Snowball-generated stemmer](https://snowballstem.org/). You can use the `snowball` token filter with the following languages in the `language` field: `Arabic`, `Armenian`, `Basque`, `Catalan`, `Danish`, `Dutch`, `English`, `Estonian`, `Finnish`, `French`, `German`, `German2`, `Hungarian`, `Irish`, `Italian`, `Kp`, `Lithuanian`, `Lovins`, `Norwegian`, `Porter`, `Portuguese`, `Romanian`, `Russian`, `Spanish`, `Swedish`, `Turkish`. 
-`stemmer` | N/A | Provides algorithmic stemming for the following languages in the `language` field: `arabic`, `armenian`, `basque`, `bengali`, `brazilian`, `bulgarian`, `catalan`, `czech`, `danish`, `dutch`, `dutch_kp`, `english`, `light_english`, `lovins`, `minimal_english`, `porter2`, `possessive_english`, `estonian`, `finnish`, `light_finnish`, `french`, `light_french`, `minimal_french`, `galician`, `minimal_galician`, `german`, `german2`, `light_german`, `minimal_german`, `greek`, `hindi`, `hungarian`, `light_hungarian`, `indonesian`, `irish`, `italian`, `light_italian`, `latvian`, `Lithuanian`, `norwegian`, `light_norwegian`, `minimal_norwegian`, `light_nynorsk`, `minimal_nynorsk`, `portuguese`, `light_portuguese`, `minimal_portuguese`, `portuguese_rslp`, `romanian`, `russian`, `light_russian`, `sorani`, `spanish`, `light_spanish`, `swedish`, `light_swedish`, `turkish`. -`stemmer_override` | N/A | Overrides stemming algorithms by applying a custom mapping so that the provided terms are not stemmed. -`stop` | [StopFilter](https://lucene.apache.org/core/8_7_0/core/org/apache/lucene/analysis/StopFilter.html) | Removes stop words from a token stream. -`synonym` | N/A | Supplies a synonym list for the analysis process. The synonym list is provided using a configuration file. -`synonym_graph` | N/A | Supplies a synonym list, including multiword synonyms, for the analysis process. -`trim` | [TrimFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/TrimFilter.html) | Trims leading and trailing white space from each token in a stream. -`truncate` | [TruncateTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/TruncateTokenFilter.html) | Truncates tokens whose length exceeds the specified character limit. -`unique` | N/A | Ensures each token is unique by removing duplicate tokens from a stream. -`uppercase` | [UpperCaseFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/core/LowerCaseFilter.html) | Converts tokens to uppercase. -`word_delimiter` | [WordDelimiterFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.html) | Splits tokens at non-alphanumeric characters and performs normalization based on the specified rules. -`word_delimiter_graph` | [WordDelimiterGraphFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.html) | Splits tokens at non-alphanumeric characters and performs normalization based on the specified rules. Assigns multi-position tokens a `positionLength` attribute. +[`dictionary_decompounder`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/dictionary-decompounder/) | [DictionaryCompoundWordTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.html) | Decomposes compound words found in many Germanic languages. +[`edge_ngram`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/edge-ngram/) | [EdgeNGramTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.html) | Tokenizes the given token into edge n-grams (n-grams that start at the beginning of the token) of lengths between `min_gram` and `max_gram`. Optionally, keeps the original token. 
+[`elision`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/elision/) | [ElisionFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/util/ElisionFilter.html) | Removes the specified [elisions](https://en.wikipedia.org/wiki/Elision) from the beginning of tokens. For example, changes `l'avion` (the plane) to `avion` (plane). +[`fingerprint`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/fingerprint/) | [FingerprintFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.html) | Sorts and deduplicates the token list and concatenates tokens into a single token. +[`flatten_graph`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/flatten-graph/) | [FlattenGraphFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/core/FlattenGraphFilter.html) | Flattens a token graph produced by a graph token filter, such as `synonym_graph` or `word_delimiter_graph`, making the graph suitable for indexing. +[`hunspell`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/hunspell/) | [HunspellStemFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/hunspell/HunspellStemFilter.html) | Uses [Hunspell](https://en.wikipedia.org/wiki/Hunspell) rules to stem tokens. Because Hunspell allows a word to have multiple stems, this filter can emit multiple tokens for each consumed token. Requires the configuration of one or more language-specific Hunspell dictionaries. +[`hyphenation_decompounder`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/hyphenation-decompounder/) | [HyphenationCompoundWordTokenFilter](https://lucene.apache.org/core/9_8_0/analysis/common/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.html) | Uses XML-based hyphenation patterns to find potential subwords in compound words and checks the subwords against the specified word list. The token output contains only the subwords found in the word list. +[`keep_types`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/keep-types/) | [TypeTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/core/TypeTokenFilter.html) | Keeps or removes tokens of a specific type. +[`keep_words`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/keep-words/) | [KeepWordFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/KeepWordFilter.html) | Checks the tokens against the specified word list and keeps only those that are in the list. +[`keyword_marker`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/keyword-marker/) | [KeywordMarkerFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/KeywordMarkerFilter.html) | Marks specified tokens as keywords, preventing them from being stemmed. +[`keyword_repeat`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/keyword-repeat/) | [KeywordRepeatFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/KeywordRepeatFilter.html) | Emits each incoming token twice: once as a keyword and once as a non-keyword. +[`kstem`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/kstem/) | [KStemFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/en/KStemFilter.html) | Provides KStem-based stemming for the English language. Combines algorithmic stemming with a built-in dictionary. 
+[`kuromoji_completion`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/kuromoji-completion/) | [JapaneseCompletionFilter](https://lucene.apache.org/core/9_10_0/analysis/kuromoji/org/apache/lucene/analysis/ja/JapaneseCompletionFilter.html) | Adds Japanese romanized terms to a token stream (in addition to the original tokens). Usually used to support autocomplete of Japanese search terms. Note that the filter has a `mode` parameter that should be set to `index` when used in an index analyzer and `query` when used in a search analyzer. Requires the `analysis-kuromoji` plugin. For information about installing the plugin, see [Additional plugins]({{site.url}}{{site.baseurl}}/install-and-configure/plugins/#additional-plugins). +[`length`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/length/) | [LengthFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/LengthFilter.html) | Removes tokens that are shorter or longer than the length range specified by `min` and `max`. +[`limit`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/limit/) | [LimitTokenCountFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/LimitTokenCountFilter.html) | Limits the number of output tokens. For example, document field value sizes can be limited based on the token count. +[`lowercase`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/lowercase/) | [LowerCaseFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/core/LowerCaseFilter.html) | Converts tokens to lowercase. The default [LowerCaseFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/core/LowerCaseFilter.html) processes the English language. To process other languages, set the `language` parameter to `greek` (uses [GreekLowerCaseFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/el/GreekLowerCaseFilter.html)), `irish` (uses [IrishLowerCaseFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/ga/IrishLowerCaseFilter.html)), or `turkish` (uses [TurkishLowerCaseFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/tr/TurkishLowerCaseFilter.html)). +[`min_hash`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/min-hash/) | [MinHashFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/minhash/MinHashFilter.html) | Uses the [MinHash technique](https://en.wikipedia.org/wiki/MinHash) to estimate document similarity. Performs the following operations on a token stream sequentially:
1. Hashes each token in the stream.
2. Assigns the hashes to buckets, keeping only the smallest hashes of each bucket.
3. Outputs the smallest hash from each bucket as a token stream. +[`multiplexer`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/multiplexer/) | N/A | Emits multiple tokens at the same position. Runs each token through each of the specified filter lists separately and outputs the results as separate tokens. +[`ngram`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/ngram/) | [NGramTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/ngram/NGramTokenFilter.html) | Tokenizes the given token into n-grams of lengths between `min_gram` and `max_gram`. +[Normalization]({{site.url}}{{site.baseurl}}/analyzers/token-filters/normalization/) | `arabic_normalization`: [ArabicNormalizer](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/ar/ArabicNormalizer.html)
`german_normalization`: [GermanNormalizationFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/de/GermanNormalizationFilter.html)
`hindi_normalization`: [HindiNormalizer](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/hi/HindiNormalizer.html)
`indic_normalization`: [IndicNormalizer](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/in/IndicNormalizer.html)
`sorani_normalization`: [SoraniNormalizer](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/ckb/SoraniNormalizer.html)
`persian_normalization`: [PersianNormalizer](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/fa/PersianNormalizer.html)
`scandinavian_normalization`: [ScandinavianNormalizationFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/ScandinavianNormalizationFilter.html)
`scandinavian_folding`: [ScandinavianFoldingFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/ScandinavianFoldingFilter.html)
`serbian_normalization`: [SerbianNormalizationFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/sr/SerbianNormalizationFilter.html) | Normalizes the characters of one of the listed languages. +[`pattern_capture`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/pattern-capture/) | N/A | Generates a token for every capture group in the provided regular expression. Uses [Java regular expression syntax](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html). +[`pattern_replace`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/pattern-replace/) | N/A | Matches a pattern in the provided regular expression and replaces matching substrings. Uses [Java regular expression syntax](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html). +[`phonetic`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/phonetic/) | N/A | Uses a phonetic encoder to emit a metaphone token for each token in the token stream. Requires installing the `analysis-phonetic` plugin. +[`porter_stem`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/porter-stem/) | [PorterStemFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/en/PorterStemFilter.html) | Uses the [Porter stemming algorithm](https://tartarus.org/martin/PorterStemmer/) to perform algorithmic stemming for the English language. +[`predicate_token_filter`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/predicate-token-filter/) | N/A | Removes tokens that do not match the specified predicate script. Supports only inline Painless scripts. +[`remove_duplicates`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/remove-duplicates/) | [RemoveDuplicatesTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/RemoveDuplicatesTokenFilter.html) | Removes duplicate tokens that are in the same position. +[`reverse`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/reverse/) | [ReverseStringFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/reverse/ReverseStringFilter.html) | Reverses the string corresponding to each token in the token stream. For example, the token `dog` becomes `god`. +[`shingle`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/shingle/) | [ShingleFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/shingle/ShingleFilter.html) | Generates shingles of lengths between `min_shingle_size` and `max_shingle_size` for tokens in the token stream. Shingles are similar to n-grams but are generated using words instead of letters. For example, two-word shingles added to the list of unigrams [`contribute`, `to`, `opensearch`] are [`contribute to`, `to opensearch`]. +[`snowball`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/snowball/) | N/A | Stems words using a [Snowball-generated stemmer](https://snowballstem.org/). The `snowball` token filter supports using the following languages in the `language` field: `Arabic`, `Armenian`, `Basque`, `Catalan`, `Danish`, `Dutch`, `English`, `Estonian`, `Finnish`, `French`, `German`, `German2`, `Hungarian`, `Irish`, `Italian`, `Kp`, `Lithuanian`, `Lovins`, `Norwegian`, `Porter`, `Portuguese`, `Romanian`, `Russian`, `Spanish`, `Swedish`, `Turkish`. 
+[`stemmer`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/stemmer/) | N/A | Provides algorithmic stemming for the following languages, specified in the `language` field: `arabic`, `armenian`, `basque`, `bengali`, `brazilian`, `bulgarian`, `catalan`, `czech`, `danish`, `dutch`, `dutch_kp`, `english`, `light_english`, `lovins`, `minimal_english`, `porter2`, `possessive_english`, `estonian`, `finnish`, `light_finnish`, `french`, `light_french`, `minimal_french`, `galician`, `minimal_galician`, `german`, `german2`, `light_german`, `minimal_german`, `greek`, `hindi`, `hungarian`, `light_hungarian`, `indonesian`, `irish`, `italian`, `light_italian`, `latvian`, `lithuanian`, `norwegian`, `light_norwegian`, `minimal_norwegian`, `light_nynorsk`, `minimal_nynorsk`, `portuguese`, `light_portuguese`, `minimal_portuguese`, `portuguese_rslp`, `romanian`, `russian`, `light_russian`, `sorani`, `spanish`, `light_spanish`, `swedish`, `light_swedish`, `turkish`.
+[`stemmer_override`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/stemmer-override/) | N/A | Overrides stemming algorithms by applying a custom mapping so that the provided terms are not stemmed.
+[`stop`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/stop/) | [StopFilter](https://lucene.apache.org/core/8_7_0/core/org/apache/lucene/analysis/StopFilter.html) | Removes stop words from a token stream.
+[`synonym`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/synonym/) | N/A | Supplies a synonym list for the analysis process. The synonym list is provided using a configuration file.
+[`synonym_graph`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/synonym-graph/) | N/A | Supplies a synonym list, including multiword synonyms, for the analysis process.
+[`trim`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/trim/) | [TrimFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/TrimFilter.html) | Trims leading and trailing whitespace characters from each token in a stream.
+[`truncate`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/truncate/) | [TruncateTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/TruncateTokenFilter.html) | Truncates tokens whose length exceeds the specified character limit.
+[`unique`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/unique/) | N/A | Ensures that each token is unique by removing duplicate tokens from a stream.
+[`uppercase`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/uppercase/) | [UpperCaseFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/core/UpperCaseFilter.html) | Converts tokens to uppercase.
+[`word_delimiter`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/word-delimiter/) | [WordDelimiterFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.html) | Splits tokens at non-alphanumeric characters and performs normalization based on the specified rules.
+[`word_delimiter_graph`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/word-delimiter-graph/) | [WordDelimiterGraphFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.html) | Splits tokens at non-alphanumeric characters and performs normalization based on the specified rules. Assigns a `positionLength` attribute to multi-position tokens.
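+
+As a general pattern, any filter in the preceding table can be wired into a custom analyzer in the index settings and tested using the `_analyze` API. The following minimal sketch (the index and analyzer names are illustrative) chains the `lowercase` and `unique` filters from the table:
+
+```json
+PUT /token_filter_demo
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "demo_analyzer": {
+          "type": "custom",
+          "tokenizer": "standard",
+          "filter": ["lowercase", "unique"]
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}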
diff --git a/_analyzers/token-filters/keep-types.md b/_analyzers/token-filters/keep-types.md new file mode 100644 index 0000000000..59e617f567 --- /dev/null +++ b/_analyzers/token-filters/keep-types.md @@ -0,0 +1,115 @@
+---
+layout: default
+title: Keep types
+parent: Token filters
+nav_order: 180
+---
+
+# Keep types token filter
+
+The `keep_types` token filter controls which token types are kept or discarded during text analysis. Different tokenizers produce different token types, for example, `<ALPHANUM>`, `<NUM>`, or `<EMAIL>`.
+
+The `keyword`, `simple_pattern`, and `simple_pattern_split` tokenizers do not support the `keep_types` token filter because these tokenizers do not support token type attributes.
+{: .note}
+
+## Parameters
+
+The `keep_types` token filter can be configured with the following parameters.
+
+Parameter | Required/Optional | Data type | Description
+:--- | :--- | :--- | :---
+`types` | Required | List of strings | The list of token types to be kept or discarded (determined by the `mode`).
+`mode` | Optional | String | Whether to `include` or `exclude` the token types specified in `types`. Default is `include`.
+
+
+## Example
+
+The following example request creates a new index named `test_index` and configures an analyzer with a `keep_types` filter that keeps only `<ALPHANUM>` tokens:
+
+```json
+PUT /test_index
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "custom_analyzer": {
+          "type": "custom",
+          "tokenizer": "standard",
+          "filter": ["lowercase", "keep_types_filter"]
+        }
+      },
+      "filter": {
+        "keep_types_filter": {
+          "type": "keep_types",
+          "types": ["<ALPHANUM>"]
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Generated tokens
+
+Use the following request to examine the tokens generated using the analyzer:
+
+```json
+GET /test_index/_analyze
+{
+  "analyzer": "custom_analyzer",
+  "text": "Hello 2 world! This is an example."
+}
+```
+{% include copy-curl.html %}
+
+The response contains the generated tokens. Note that the token `2`, whose type is `<NUM>`, has been removed:
+
+```json
+{
+  "tokens": [
+    {
+      "token": "hello",
+      "start_offset": 0,
+      "end_offset": 5,
+      "type": "<ALPHANUM>",
+      "position": 0
+    },
+    {
+      "token": "world",
+      "start_offset": 8,
+      "end_offset": 13,
+      "type": "<ALPHANUM>",
+      "position": 2
+    },
+    {
+      "token": "this",
+      "start_offset": 15,
+      "end_offset": 19,
+      "type": "<ALPHANUM>",
+      "position": 3
+    },
+    {
+      "token": "is",
+      "start_offset": 20,
+      "end_offset": 22,
+      "type": "<ALPHANUM>",
+      "position": 4
+    },
+    {
+      "token": "an",
+      "start_offset": 23,
+      "end_offset": 25,
+      "type": "<ALPHANUM>",
+      "position": 5
+    },
+    {
+      "token": "example",
+      "start_offset": 26,
+      "end_offset": 33,
+      "type": "<ALPHANUM>",
+      "position": 6
+    }
+  ]
+}
+``` diff --git a/_analyzers/token-filters/keep-words.md b/_analyzers/token-filters/keep-words.md new file mode 100644 index 0000000000..4a6b199e5c --- /dev/null +++ b/_analyzers/token-filters/keep-words.md @@ -0,0 +1,92 @@
+---
+layout: default
+title: Keep words
+parent: Token filters
+nav_order: 190
+---
+
+# Keep words token filter
+
+The `keep_words` token filter keeps only the specified words during the analysis process. This filter is useful if you have a large body of text but are interested in only certain keywords or terms.
+
+## Parameters
+
+The `keep_words` token filter can be configured with the following parameters.
+
+Parameter | Required/Optional | Data type | Description
+:--- | :--- | :--- | :---
+`keep_words` | Required if `keep_words_path` is not configured | List of strings | The list of words to keep.
+`keep_words_path` | Required if `keep_words` is not configured | String | The path to the file containing the list of words to keep. +`keep_words_case` | Optional | Boolean | Whether to lowercase all words during comparison. Default is `false`. + + +## Example + +The following example request creates a new index named `my_index` and configures an analyzer with a `keep_words` filter: + +```json +PUT my_index +{ + "settings": { + "analysis": { + "analyzer": { + "custom_keep_word": { + "tokenizer": "standard", + "filter": [ "keep_words_filter" ] + } + }, + "filter": { + "keep_words_filter": { + "type": "keep", + "keep_words": ["example", "world", "opensearch"], + "keep_words_case": true + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my_index/_analyze +{ + "analyzer": "custom_keep_word", + "text": "Hello, world! This is an OpenSearch example." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "world", + "start_offset": 7, + "end_offset": 12, + "type": "", + "position": 1 + }, + { + "token": "OpenSearch", + "start_offset": 25, + "end_offset": 35, + "type": "", + "position": 5 + }, + { + "token": "example", + "start_offset": 36, + "end_offset": 43, + "type": "", + "position": 6 + } + ] +} +``` diff --git a/_analyzers/token-filters/keyword-marker.md b/_analyzers/token-filters/keyword-marker.md new file mode 100644 index 0000000000..0ec2cb96f5 --- /dev/null +++ b/_analyzers/token-filters/keyword-marker.md @@ -0,0 +1,127 @@ +--- +layout: default +title: Keyword marker +parent: Token filters +nav_order: 200 +--- + +# Keyword marker token filter + +The `keyword_marker` token filter is used to prevent certain tokens from being altered by stemmers or other filters. The `keyword_marker` token filter does this by marking the specified tokens as `keywords`, which prevents any stemming or other processing. This ensures that specific words remain in their original form. + +## Parameters + +The `keyword_marker` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`ignore_case` | Optional | Boolean | Whether to ignore the letter case when matching keywords. Default is `false`. +`keywords` | Required if either `keywords_path` or `keywords_pattern` is not set | List of strings | The list of tokens to mark as keywords. +`keywords_path` | Required if either `keywords` or `keywords_pattern` is not set | String | The path (relative to the `config` directory or absolute) to the list of keywords. +`keywords_pattern` | Required if either `keywords` or `keywords_path` is not set | String | A [regular expression](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html) used for matching tokens to be marked as keywords. + + +## Example + +The following example request creates a new index named `my_index` and configures an analyzer with a `keyword_marker` filter. 
The filter marks the word `example` as a keyword: + +```json +PUT /my_index +{ + "settings": { + "analysis": { + "analyzer": { + "custom_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": ["lowercase", "keyword_marker_filter", "stemmer"] + } + }, + "filter": { + "keyword_marker_filter": { + "type": "keyword_marker", + "keywords": ["example"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my_index/_analyze +{ + "analyzer": "custom_analyzer", + "text": "Favorite example" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens. Note that while the word `favorite` was stemmed, the word `example` was not stemmed because it was marked as a keyword: + +```json +{ + "tokens": [ + { + "token": "favorit", + "start_offset": 0, + "end_offset": 8, + "type": "", + "position": 0 + }, + { + "token": "example", + "start_offset": 9, + "end_offset": 16, + "type": "", + "position": 1 + } + ] +} +``` + +You can further examine the impact of the `keyword_marker` token filter by adding the following parameters to the `_analyze` query: + +```json +GET /my_index/_analyze +{ + "analyzer": "custom_analyzer", + "text": "This is an OpenSearch example demonstrating keyword marker.", + "explain": true, + "attributes": "keyword" +} +``` +{% include copy-curl.html %} + +This will produce additional details in the response similar to the following: + +```json +{ + "name": "porter_stem", + "tokens": [ + ... + { + "token": "example", + "start_offset": 22, + "end_offset": 29, + "type": "", + "position": 4, + "keyword": true + }, + { + "token": "demonstr", + "start_offset": 30, + "end_offset": 43, + "type": "", + "position": 5, + "keyword": false + }, + ... + ] +} +``` diff --git a/_analyzers/token-filters/keyword-repeat.md b/_analyzers/token-filters/keyword-repeat.md new file mode 100644 index 0000000000..5ba15a037c --- /dev/null +++ b/_analyzers/token-filters/keyword-repeat.md @@ -0,0 +1,160 @@ +--- +layout: default +title: Keyword repeat +parent: Token filters +nav_order: 210 +--- + +# Keyword repeat token filter + +The `keyword_repeat` token filter emits the keyword version of a token into a token stream. This filter is typically used when you want to retain both the original token and its modified version after further token transformations, such as stemming or synonym expansion. The duplicated tokens allow the original, unchanged version of the token to remain in the final analysis alongside the modified versions. + +The `keyword_repeat` token filter should be placed before stemming filters. Stemming is not applied to every token, thus you may have duplicate tokens in the same position after stemming. To remove duplicate tokens, use the `remove_duplicates` token filter after the stemmer. 
+{: .note}
+
+
+## Example
+
+The following example request creates a new index named `my_index` and configures an analyzer with a `keyword_repeat` filter:
+
+```json
+PUT /my_index
+{
+  "settings": {
+    "analysis": {
+      "filter": {
+        "my_kstem": {
+          "type": "kstem"
+        },
+        "my_lowercase": {
+          "type": "lowercase"
+        }
+      },
+      "analyzer": {
+        "my_custom_analyzer": {
+          "type": "custom",
+          "tokenizer": "standard",
+          "filter": [
+            "my_lowercase",
+            "keyword_repeat",
+            "my_kstem"
+          ]
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Generated tokens
+
+Use the following request to examine the tokens generated using the analyzer:
+
+```json
+POST /my_index/_analyze
+{
+  "analyzer": "my_custom_analyzer",
+  "text": "Stopped quickly"
+}
+```
+{% include copy-curl.html %}
+
+The response contains the generated tokens:
+
+```json
+{
+  "tokens": [
+    {
+      "token": "stopped",
+      "start_offset": 0,
+      "end_offset": 7,
+      "type": "<ALPHANUM>",
+      "position": 0
+    },
+    {
+      "token": "stop",
+      "start_offset": 0,
+      "end_offset": 7,
+      "type": "<ALPHANUM>",
+      "position": 0
+    },
+    {
+      "token": "quickly",
+      "start_offset": 8,
+      "end_offset": 15,
+      "type": "<ALPHANUM>",
+      "position": 1
+    },
+    {
+      "token": "quick",
+      "start_offset": 8,
+      "end_offset": 15,
+      "type": "<ALPHANUM>",
+      "position": 1
+    }
+  ]
+}
+```
+
+You can further examine the impact of the `keyword_repeat` token filter by adding the following parameters to the `_analyze` query:
+
+```json
+POST /my_index/_analyze
+{
+  "analyzer": "my_custom_analyzer",
+  "text": "Stopped quickly",
+  "explain": true,
+  "attributes": "keyword"
+}
+```
+{% include copy-curl.html %}
+
+The response includes detailed information about the tokenizer and each token filter in the chain. Note how `keyword_repeat` duplicates each token, marking one copy as a keyword, and how `my_kstem` then stems only the non-keyword copies:
+
+```json
+{
+  "detail": {
+    "custom_analyzer": true,
+    "charfilters": [],
+    "tokenizer": {
+      "name": "standard",
+      "tokens": [
+        {"token": "Stopped","start_offset": 0,"end_offset": 7,"type": "<ALPHANUM>","position": 0},
+        {"token": "quickly","start_offset": 8,"end_offset": 15,"type": "<ALPHANUM>","position": 1}
+      ]
+    },
+    "tokenfilters": [
+      {
+        "name": "my_lowercase",
+        "tokens": [
+          {"token": "stopped","start_offset": 0,"end_offset": 7,"type": "<ALPHANUM>","position": 0},
+          {"token": "quickly","start_offset": 8,"end_offset": 15,"type": "<ALPHANUM>","position": 1}
+        ]
+      },
+      {
+        "name": "keyword_repeat",
+        "tokens": [
+          {"token": "stopped","start_offset": 0,"end_offset": 7,"type": "<ALPHANUM>","position": 0,"keyword": true},
+          {"token": "stopped","start_offset": 0,"end_offset": 7,"type": "<ALPHANUM>","position": 0,"keyword": false},
+          {"token": "quickly","start_offset": 8,"end_offset": 15,"type": "<ALPHANUM>","position": 1,"keyword": true},
+          {"token": "quickly","start_offset": 8,"end_offset": 15,"type": "<ALPHANUM>","position": 1,"keyword": false}
+        ]
+      },
+      {
+        "name": "my_kstem",
+        "tokens": [
+          {"token": "stopped","start_offset": 0,"end_offset": 7,"type": "<ALPHANUM>","position": 0,"keyword": true},
+          {"token": "stop","start_offset": 0,"end_offset": 7,"type": "<ALPHANUM>","position": 0,"keyword": false},
+          {"token": "quickly","start_offset": 8,"end_offset": 15,"type": "<ALPHANUM>","position": 1,"keyword": true},
+          {"token": "quick","start_offset": 8,"end_offset": 15,"type": "<ALPHANUM>","position": 1,"keyword": false}
+        ]
+      }
+    ]
+  }
+}
+```
\ No newline at
end of file diff --git a/_analyzers/token-filters/kstem.md b/_analyzers/token-filters/kstem.md new file mode 100644 index 0000000000..d13fd2c675 --- /dev/null +++ b/_analyzers/token-filters/kstem.md @@ -0,0 +1,92 @@
+---
+layout: default
+title: KStem
+parent: Token filters
+nav_order: 220
+---
+
+# KStem token filter
+
+The `kstem` token filter is a stemming filter used to reduce words to their root forms. The filter is a lightweight algorithmic stemmer designed for the English language that performs the following stemming operations:
+
+- Reduces plurals to their singular form.
+- Converts different verb tenses to their base form.
+- Removes common derivational endings, such as "-ing" or "-ed".
+
+The `kstem` token filter is equivalent to the `stemmer` filter configured with the `light_english` language. It provides more conservative stemming than other stemming filters, such as `porter_stem`.
+
+The `kstem` token filter is based on the Lucene KStemFilter. For more information, see the [Lucene documentation](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/en/KStemFilter.html).
+
+## Example
+
+The following example request creates a new index named `my_kstem_index` and configures an analyzer with a `kstem` filter:
+
+```json
+PUT /my_kstem_index
+{
+  "settings": {
+    "analysis": {
+      "filter": {
+        "kstem_filter": {
+          "type": "kstem"
+        }
+      },
+      "analyzer": {
+        "my_kstem_analyzer": {
+          "type": "custom",
+          "tokenizer": "standard",
+          "filter": [
+            "lowercase",
+            "kstem_filter"
+          ]
+        }
+      }
+    }
+  },
+  "mappings": {
+    "properties": {
+      "content": {
+        "type": "text",
+        "analyzer": "my_kstem_analyzer"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Generated tokens
+
+Use the following request to examine the tokens generated using the analyzer:
+
+```json
+POST /my_kstem_index/_analyze
+{
+  "analyzer": "my_kstem_analyzer",
+  "text": "stops stopped"
+}
+```
+{% include copy-curl.html %}
+
+The response contains the generated tokens:
+
+```json
+{
+  "tokens": [
+    {
+      "token": "stop",
+      "start_offset": 0,
+      "end_offset": 5,
+      "type": "<ALPHANUM>",
+      "position": 0
+    },
+    {
+      "token": "stop",
+      "start_offset": 6,
+      "end_offset": 13,
+      "type": "<ALPHANUM>",
+      "position": 1
+    }
+  ]
+}
+```
\ No newline at end of file
diff --git a/_analyzers/token-filters/kuromoji-completion.md b/_analyzers/token-filters/kuromoji-completion.md new file mode 100644 index 0000000000..24833e92e1 --- /dev/null +++ b/_analyzers/token-filters/kuromoji-completion.md @@ -0,0 +1,127 @@
+---
+layout: default
+title: Kuromoji completion
+parent: Token filters
+nav_order: 230
+---
+
+# Kuromoji completion token filter
+
+The `kuromoji_completion` token filter is used to stem Katakana words in Japanese, which are often used to represent foreign words or loanwords. This filter is especially useful for autocompletion or suggest queries, in which partial matches on Katakana words can be expanded to include their full forms.
+
+To use this token filter, you must first install the `analysis-kuromoji` plugin on all nodes by running `bin/opensearch-plugin install analysis-kuromoji` and then restart the cluster. For more information about installing additional plugins, see [Additional plugins]({{site.url}}{{site.baseurl}}/install-and-configure/additional-plugins/index/).
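+
+For convenience, the installation command quoted in the preceding paragraph is shown below. Run it from your OpenSearch installation directory on every node:
+
+```bash
+bin/opensearch-plugin install analysis-kuromoji
+```
+{% include copy.html %}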
+ +## Example + +The following example request creates a new index named `kuromoji_sample` and configures an analyzer with a `kuromoji_completion` filter: + +```json +PUT kuromoji_sample +{ + "settings": { + "index": { + "analysis": { + "analyzer": { + "my_analyzer": { + "tokenizer": "kuromoji_tokenizer", + "filter": [ + "my_katakana_stemmer" + ] + } + }, + "filter": { + "my_katakana_stemmer": { + "type": "kuromoji_completion" + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer with text that translates to "use a computer": + +```json +POST /kuromoji_sample/_analyze +{ + "analyzer": "my_analyzer", + "text": "コンピューターを使う" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "コンピューター", // The original Katakana word "computer". + "start_offset": 0, + "end_offset": 7, + "type": "word", + "position": 0 + }, + { + "token": "konpyuーtaー", // Romanized version (Romaji) of "コンピューター". + "start_offset": 0, + "end_offset": 7, + "type": "word", + "position": 0 + }, + { + "token": "konnpyuーtaー", // Another possible romanized version of "コンピューター" (with a slight variation in the spelling). + "start_offset": 0, + "end_offset": 7, + "type": "word", + "position": 0 + }, + { + "token": "を", // A Japanese particle, "wo" or "o" + "start_offset": 7, + "end_offset": 8, + "type": "word", + "position": 1 + }, + { + "token": "wo", // Romanized form of the particle "を" (often pronounced as "o"). + "start_offset": 7, + "end_offset": 8, + "type": "word", + "position": 1 + }, + { + "token": "o", // Another version of the romanization. + "start_offset": 7, + "end_offset": 8, + "type": "word", + "position": 1 + }, + { + "token": "使う", // The verb "use" in Kanji. + "start_offset": 8, + "end_offset": 10, + "type": "word", + "position": 2 + }, + { + "token": "tukau", // Romanized version of "使う" + "start_offset": 8, + "end_offset": 10, + "type": "word", + "position": 2 + }, + { + "token": "tsukau", // Another romanized version of "使う", where "tsu" is more phonetically correct + "start_offset": 8, + "end_offset": 10, + "type": "word", + "position": 2 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/token-filters/length.md b/_analyzers/token-filters/length.md new file mode 100644 index 0000000000..f6c5dcc706 --- /dev/null +++ b/_analyzers/token-filters/length.md @@ -0,0 +1,91 @@ +--- +layout: default +title: Length +parent: Token filters +nav_order: 240 +--- + +# Length token filter + +The `length` token filter is used to remove tokens that don't meet specified length criteria (minimum and maximum values) from the token stream. + +## Parameters + +The `length` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`min` | Optional | Integer | The minimum token length. Default is `0`. +`max` | Optional | Integer | The maximum token length. Default is `Integer.MAX_VALUE` (`2147483647`). 
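+
+To experiment with the filter without creating an index, you can also define it inline in an `_analyze` request. The following minimal sketch (the sample text and length values are illustrative) keeps only tokens that are at least 4 characters long:
+
+```json
+GET /_analyze
+{
+  "tokenizer": "whitespace",
+  "filter": [
+    {
+      "type": "length",
+      "min": 4
+    }
+  ],
+  "text": "a quick brown fox"
+}
+```
+{% include copy-curl.html %}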
+ + +## Example + +The following example request creates a new index named `my_index` and configures an analyzer with a `length` filter: + +```json +PUT my_index +{ + "settings": { + "analysis": { + "analyzer": { + "only_keep_4_to_10_characters": { + "tokenizer": "whitespace", + "filter": [ "length_4_to_10" ] + } + }, + "filter": { + "length_4_to_10": { + "type": "length", + "min": 4, + "max": 10 + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my_index/_analyze +{ + "analyzer": "only_keep_4_to_10_characters", + "text": "OpenSearch is a great tool!" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "OpenSearch", + "start_offset": 0, + "end_offset": 10, + "type": "word", + "position": 0 + }, + { + "token": "great", + "start_offset": 16, + "end_offset": 21, + "type": "word", + "position": 3 + }, + { + "token": "tool!", + "start_offset": 22, + "end_offset": 27, + "type": "word", + "position": 4 + } + ] +} +``` diff --git a/_analyzers/token-filters/limit.md b/_analyzers/token-filters/limit.md new file mode 100644 index 0000000000..a849f5f06b --- /dev/null +++ b/_analyzers/token-filters/limit.md @@ -0,0 +1,89 @@ +--- +layout: default +title: Limit +parent: Token filters +nav_order: 250 +--- + +# Limit token filter + +The `limit` token filter is used to limit the number of tokens passed through the analysis chain. + +## Parameters + +The `limit` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`max_token_count` | Optional | Integer | The maximum number of tokens to be generated. Default is `1`. +`consume_all_tokens` | Optional | Boolean | (Expert-level setting) Uses all tokens from the tokenizer, even if the result exceeds `max_token_count`. When this parameter is set, the output still only contains the number of tokens specified by `max_token_count`. However, all tokens generated by the tokenizer are processed. Default is `false`. + +## Example + +The following example request creates a new index named `my_index` and configures an analyzer with a `limit` filter: + +```json +PUT my_index +{ + "settings": { + "analysis": { + "analyzer": { + "three_token_limit": { + "tokenizer": "standard", + "filter": [ "custom_token_limit" ] + } + }, + "filter": { + "custom_token_limit": { + "type": "limit", + "max_token_count": 3 + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my_index/_analyze +{ + "analyzer": "three_token_limit", + "text": "OpenSearch is a powerful and flexible search engine." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "OpenSearch", + "start_offset": 0, + "end_offset": 10, + "type": "", + "position": 0 + }, + { + "token": "is", + "start_offset": 11, + "end_offset": 13, + "type": "", + "position": 1 + }, + { + "token": "a", + "start_offset": 14, + "end_offset": 15, + "type": "", + "position": 2 + } + ] +} +``` diff --git a/_analyzers/token-filters/lowercase.md b/_analyzers/token-filters/lowercase.md new file mode 100644 index 0000000000..89f0f219fa --- /dev/null +++ b/_analyzers/token-filters/lowercase.md @@ -0,0 +1,82 @@ +--- +layout: default +title: Lowercase +parent: Token filters +nav_order: 260 +--- + +# Lowercase token filter + +The `lowercase` token filter is used to convert all characters in the token stream to lowercase, making searches case insensitive. + +## Parameters + +The `lowercase` token filter can be configured with the following parameter. + +Parameter | Required/Optional | Description +:--- | :--- | :--- + `language` | Optional | Specifies a language-specific token filter. Valid values are:
- [`greek`](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/el/GreekLowerCaseFilter.html)
- [`irish`](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/ga/IrishLowerCaseFilter.html)
- [`turkish`](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/tr/TurkishLowerCaseFilter.html).
Default is the [Lucene LowerCaseFilter](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/core/LowerCaseFilter.html). + +## Example + +The following example request creates a new index named `custom_lowercase_example`. It configures an analyzer with a `lowercase` filter and specifies `greek` as the `language`: + +```json +PUT /custom_lowercase_example +{ + "settings": { + "analysis": { + "analyzer": { + "greek_lowercase_example": { + "type": "custom", + "tokenizer": "standard", + "filter": ["greek_lowercase"] + } + }, + "filter": { + "greek_lowercase": { + "type": "lowercase", + "language": "greek" + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /custom_lowercase_example/_analyze +{ + "analyzer": "greek_lowercase_example", + "text": "Αθήνα ΕΛΛΑΔΑ" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "αθηνα", + "start_offset": 0, + "end_offset": 5, + "type": "", + "position": 0 + }, + { + "token": "ελλαδα", + "start_offset": 6, + "end_offset": 12, + "type": "", + "position": 1 + } + ] +} +``` diff --git a/_analyzers/token-filters/min-hash.md b/_analyzers/token-filters/min-hash.md new file mode 100644 index 0000000000..e4f1a8da91 --- /dev/null +++ b/_analyzers/token-filters/min-hash.md @@ -0,0 +1,138 @@ +--- +layout: default +title: Min hash +parent: Token filters +nav_order: 270 +--- + +# Min hash token filter + +The `min_hash` token filter is used to generate hashes for tokens based on a [MinHash](https://en.wikipedia.org/wiki/MinHash) approximation algorithm, which is useful for detecting similarity between documents. The `min_hash` token filter generates hashes for a set of tokens (typically from an analyzed field). + +## Parameters + +The `min_hash` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`hash_count` | Optional | Integer | The number of hash values to generate for each token. Increasing this value generally improves the accuracy of similarity estimation but increases the computational cost. Default is `1`. +`bucket_count` | Optional | Integer | The number of hash buckets to use. This affects the granularity of the hashing. A larger number of buckets provides finer granularity and reduces hash collisions but requires more memory. Default is `512`. +`hash_set_size` | Optional | Integer | The number of hashes to retain in each bucket. This can influence the hashing quality. Larger set sizes may lead to better similarity detection but consume more memory. Default is `1`. +`with_rotation` | Optional | Boolean | When set to `true`, the filter populates empty buckets with the value from the first non-empty bucket found to its circular right, provided that the `hash_set_size` is `1`. If the `bucket_count` argument exceeds `1`, this setting automatically defaults to `true`; otherwise, it defaults to `false`. 
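+
+In practice, the filter is typically attached to a field mapping so that MinHash signatures are computed at index time. The following minimal sketch (the index, filter, analyzer, and field names are illustrative, and the filter uses its default parameter values) maps a `fingerprint` field to a MinHash analyzer:
+
+```json
+PUT /similarity_index
+{
+  "settings": {
+    "analysis": {
+      "filter": {
+        "my_minhash": { "type": "min_hash" }
+      },
+      "analyzer": {
+        "minhash_analyzer": {
+          "tokenizer": "standard",
+          "filter": ["my_minhash"]
+        }
+      }
+    }
+  },
+  "mappings": {
+    "properties": {
+      "fingerprint": {
+        "type": "text",
+        "analyzer": "minhash_analyzer"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}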
+ +## Example + +The following example request creates a new index named `minhash_index` and configures an analyzer with a `min_hash` filter: + +```json +PUT /minhash_index +{ + "settings": { + "analysis": { + "filter": { + "minhash_filter": { + "type": "min_hash", + "hash_count": 3, + "bucket_count": 512, + "hash_set_size": 1, + "with_rotation": false + } + }, + "analyzer": { + "minhash_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "minhash_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /minhash_index/_analyze +{ + "analyzer": "minhash_analyzer", + "text": "OpenSearch is very powerful." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens (the tokens are not human readable because they represent hashes): + +```json +{ + "tokens" : [ + { + "token" : "\u0000\u0000㳠锯ੲ걌䐩䉵", + "start_offset" : 0, + "end_offset" : 27, + "type" : "MIN_HASH", + "position" : 0 + }, + { + "token" : "\u0000\u0000㳠锯ੲ걌䐩䉵", + "start_offset" : 0, + "end_offset" : 27, + "type" : "MIN_HASH", + "position" : 0 + }, + ... +``` + +In order to demonstrate the usefulness of the `min_hash` token filter, you can use the following Python script to compare the two strings using the previously created analyzer: + +```python +from opensearchpy import OpenSearch +from requests.auth import HTTPBasicAuth + +# Initialize the OpenSearch client with authentication +host = 'https://localhost:9200' # Update if using a different host/port +auth = ('admin', 'admin') # Username and password + +# Create the OpenSearch client with SSL verification turned off +client = OpenSearch( + hosts=[host], + http_auth=auth, + use_ssl=True, + verify_certs=False, # Disable SSL certificate validation + ssl_show_warn=False # Suppress SSL warnings in the output +) + +# Analyzes text and returns the minhash tokens +def analyze_text(index, text): + response = client.indices.analyze( + index=index, + body={ + "analyzer": "minhash_analyzer", + "text": text + } + ) + return [token['token'] for token in response['tokens']] + +# Analyze two similar texts +tokens_1 = analyze_text('minhash_index', 'OpenSearch is a powerful search engine.') +tokens_2 = analyze_text('minhash_index', 'OpenSearch is a very powerful search engine.') + +# Calculate Jaccard similarity +set_1 = set(tokens_1) +set_2 = set(tokens_2) +shared_tokens = set_1.intersection(set_2) +jaccard_similarity = len(shared_tokens) / len(set_1.union(set_2)) + +print(f"Jaccard Similarity: {jaccard_similarity}") +``` + +The response should contain the Jaccard similarity score: + +```yaml +Jaccard Similarity: 0.8571428571428571 +``` \ No newline at end of file diff --git a/_analyzers/token-filters/multiplexer.md b/_analyzers/token-filters/multiplexer.md new file mode 100644 index 0000000000..21597b7fc1 --- /dev/null +++ b/_analyzers/token-filters/multiplexer.md @@ -0,0 +1,165 @@ +--- +layout: default +title: Multiplexer +parent: Token filters +nav_order: 280 +--- + +# Multiplexer token filter + +The `multiplexer` token filter allows you to create multiple versions of the same token by applying different filters. This is useful when you want to analyze the same token in multiple ways. For example, you may want to analyze a token using different stemming, synonyms, or n-gram filters and use all of the generated tokens together. 
This token filter works by duplicating the token stream and applying different filters to each copy. + +The `multiplexer` token filter removes duplicate tokens from the token stream. +{: .important} + +The `multiplexer` token filter does not support multiword `synonym` or `synonym_graph` token filters or `shingle` token filters because they need to analyze not only the current token but also upcoming tokens in order to determine how to transform the input correctly. +{: .important} + +## Parameters + +The `multiplexer` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`filters` | Optional | List of strings | A comma-separated list of token filters to apply to each copy of the token stream. Default is an empty list. +`preserve_original` | Optional | Boolean | Whether to keep the original token as one of the outputs. Default is `true`. + +## Example + +The following example request creates a new index named `multiplexer_index` and configures an analyzer with a `multiplexer` filter: + +```json +PUT /multiplexer_index +{ + "settings": { + "analysis": { + "filter": { + "english_stemmer": { + "type": "stemmer", + "name": "english" + }, + "synonym_filter": { + "type": "synonym", + "synonyms": [ + "quick,fast" + ] + }, + "multiplexer_filter": { + "type": "multiplexer", + "filters": ["english_stemmer", "synonym_filter"], + "preserve_original": true + } + }, + "analyzer": { + "multiplexer_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "multiplexer_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /multiplexer_index/_analyze +{ + "analyzer": "multiplexer_analyzer", + "text": "The slow turtle hides from the quick dog" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "The", + "start_offset": 0, + "end_offset": 3, + "type": "", + "position": 0 + }, + { + "token": "slow", + "start_offset": 4, + "end_offset": 8, + "type": "", + "position": 1 + }, + { + "token": "turtle", + "start_offset": 9, + "end_offset": 15, + "type": "", + "position": 2 + }, + { + "token": "turtl", + "start_offset": 9, + "end_offset": 15, + "type": "", + "position": 2 + }, + { + "token": "hides", + "start_offset": 16, + "end_offset": 21, + "type": "", + "position": 3 + }, + { + "token": "hide", + "start_offset": 16, + "end_offset": 21, + "type": "", + "position": 3 + }, + { + "token": "from", + "start_offset": 22, + "end_offset": 26, + "type": "", + "position": 4 + }, + { + "token": "the", + "start_offset": 27, + "end_offset": 30, + "type": "", + "position": 5 + }, + { + "token": "quick", + "start_offset": 31, + "end_offset": 36, + "type": "", + "position": 6 + }, + { + "token": "fast", + "start_offset": 31, + "end_offset": 36, + "type": "SYNONYM", + "position": 6 + }, + { + "token": "dog", + "start_offset": 37, + "end_offset": 40, + "type": "", + "position": 7 + } + ] +} +``` diff --git a/_analyzers/token-filters/ngram.md b/_analyzers/token-filters/ngram.md new file mode 100644 index 0000000000..c029eac26e --- /dev/null +++ b/_analyzers/token-filters/ngram.md @@ -0,0 +1,137 @@ +--- +layout: default +title: N-gram +parent: Token filters +nav_order: 290 +--- + +# N-gram token filter + +The `ngram` token filter is a powerful tool used to break down text into smaller components, known as _n-grams_, 
which can improve partial matching and fuzzy search capabilities. It works by splitting a token into smaller substrings of defined lengths. These filters are commonly used in search applications to support autocomplete, partial matches, and typo-tolerant search. For more information, see [Autocomplete functionality]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/autocomplete/) and [Did-you-mean]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/did-you-mean/). + +## Parameters + +The `ngram` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`min_gram` | Optional | Integer | The minimum length of the n-grams. Default is `1`. +`max_gram` | Optional | Integer | The maximum length of the n-grams. Default is `2`. +`preserve_original` | Optional | Boolean | Whether to keep the original token as one of the outputs. Default is `false`. + +## Example + +The following example request creates a new index named `ngram_example_index` and configures an analyzer with an `ngram` filter: + +```json +PUT /ngram_example_index +{ + "settings": { + "analysis": { + "filter": { + "ngram_filter": { + "type": "ngram", + "min_gram": 2, + "max_gram": 3 + } + }, + "analyzer": { + "ngram_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "ngram_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /ngram_example_index/_analyze +{ + "analyzer": "ngram_analyzer", + "text": "Search" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "se", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + }, + { + "token": "sea", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + }, + { + "token": "ea", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + }, + { + "token": "ear", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + }, + { + "token": "ar", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + }, + { + "token": "arc", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + }, + { + "token": "rc", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + }, + { + "token": "rch", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + }, + { + "token": "ch", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + } + ] +} +``` diff --git a/_analyzers/token-filters/normalization.md b/_analyzers/token-filters/normalization.md new file mode 100644 index 0000000000..1be08e65c2 --- /dev/null +++ b/_analyzers/token-filters/normalization.md @@ -0,0 +1,88 @@ +--- +layout: default +title: Normalization +parent: Token filters +nav_order: 300 +--- + +# Normalization token filter + +The `normalization` token filter is designed to adjust and simplify text in a way that reduces variations, particularly variations in special characters. It is primarily used to handle variations in writing by standardizing characters in specific languages. 
+ +The following `normalization` token filters are available: + +- [arabic_normalization](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/ar/ArabicNormalizer.html) +- [german_normalization](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/de/GermanNormalizationFilter.html) +- [hindi_normalization](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/hi/HindiNormalizer.html) +- [indic_normalization](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/in/IndicNormalizer.html) +- [sorani_normalization](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/ckb/SoraniNormalizer.html) +- [persian_normalization](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/fa/PersianNormalizer.html) +- [scandinavian_normalization](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/miscellaneous/ScandinavianNormalizationFilter.html) +- [scandinavian_folding](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/miscellaneous/ScandinavianFoldingFilter.html) +- [serbian_normalization](https://lucene.apache.org/core/8_7_0/analyzers-common/org/apache/lucene/analysis/sr/SerbianNormalizationFilter.html) + + +## Example + +The following example request creates a new index named `german_normalizer_example` and configures an analyzer with a `german_normalization` filter: + +```json +PUT /german_normalizer_example +{ + "settings": { + "analysis": { + "filter": { + "german_normalizer": { + "type": "german_normalization" + } + }, + "analyzer": { + "german_normalizer_analyzer": { + "tokenizer": "standard", + "filter": [ + "lowercase", + "german_normalizer" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /german_normalizer_example/_analyze +{ + "text": "Straße München", + "analyzer": "german_normalizer_analyzer" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "strasse", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + }, + { + "token": "munchen", + "start_offset": 7, + "end_offset": 14, + "type": "", + "position": 1 + } + ] +} +``` diff --git a/_analyzers/token-filters/pattern-capture.md b/_analyzers/token-filters/pattern-capture.md new file mode 100644 index 0000000000..cff36b583d --- /dev/null +++ b/_analyzers/token-filters/pattern-capture.md @@ -0,0 +1,97 @@ +--- +layout: default +title: Pattern capture +parent: Token filters +nav_order: 310 +--- + +# Pattern capture token filter + +The `pattern_capture` token filter is a powerful filter that uses regular expressions to capture and extract parts of text according to specific patterns. This filter can be useful when you want to extract particular parts of tokens, such as email domains, hashtags, or numbers, and reuse them for further analysis or indexing. + +## Parameters + +The `pattern_capture` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`patterns` | Required | Array of strings | An array of regular expressions used to capture parts of text. +`preserve_original` | Required | Boolean| Whether to keep the original token in the output. Default is `true`. 
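+
+For instance, to extract the hashtags mentioned above, you could pair the filter with a tokenizer that keeps the `#` character attached to the word. The following is a minimal sketch (the index name, filter name, and pattern are illustrative):
+
+```json
+PUT /hashtag_index
+{
+  "settings": {
+    "analysis": {
+      "filter": {
+        "hashtag_capture": {
+          "type": "pattern_capture",
+          "preserve_original": true,
+          "patterns": ["#(\\w+)"]
+        }
+      },
+      "analyzer": {
+        "hashtag_analyzer": {
+          "tokenizer": "whitespace",
+          "filter": ["hashtag_capture", "lowercase"]
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}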
+ + +## Example + +The following example request creates a new index named `email_index` and configures an analyzer with a `pattern_capture` filter to extract the local part and domain name from an email address: + +```json +PUT /email_index +{ + "settings": { + "analysis": { + "filter": { + "email_pattern_capture": { + "type": "pattern_capture", + "preserve_original": true, + "patterns": [ + "^([^@]+)", + "@(.+)$" + ] + } + }, + "analyzer": { + "email_analyzer": { + "tokenizer": "uax_url_email", + "filter": [ + "email_pattern_capture", + "lowercase" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /email_index/_analyze +{ + "text": "john.doe@example.com", + "analyzer": "email_analyzer" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "john.doe@example.com", + "start_offset": 0, + "end_offset": 20, + "type": "", + "position": 0 + }, + { + "token": "john.doe", + "start_offset": 0, + "end_offset": 20, + "type": "", + "position": 0 + }, + { + "token": "example.com", + "start_offset": 0, + "end_offset": 20, + "type": "", + "position": 0 + } + ] +} +``` diff --git a/_analyzers/token-filters/pattern-replace.md b/_analyzers/token-filters/pattern-replace.md new file mode 100644 index 0000000000..73ef7fa7d8 --- /dev/null +++ b/_analyzers/token-filters/pattern-replace.md @@ -0,0 +1,116 @@ +--- +layout: default +title: Pattern replace +parent: Token filters +nav_order: 320 +--- + +# Pattern replace token filter + +The `pattern_replace` token filter allows you to modify tokens using regular expressions. This filter replaces patterns in tokens with the specified values, giving you flexibility in transforming or normalizing tokens before indexing them. It's particularly useful when you need to clean or standardize text during analysis. + +## Parameters + +The `pattern_replace` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`pattern` | Required | String | A regular expression pattern that matches the text that needs to be replaced. +`all` | Optional | Boolean | Whether to replace all pattern matches. If `false`, only the first match is replaced. Default is `true`. +`replacement` | Optional | String | A string with which to replace the matched pattern. Default is an empty string. 
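+
+The `all` parameter is useful when only the first occurrence of a pattern should be rewritten. The following minimal sketch (the index, filter, and analyzer names are illustrative) replaces only the first run of digits within each token:
+
+```json
+PUT /first_match_index
+{
+  "settings": {
+    "analysis": {
+      "filter": {
+        "first_digit_replace": {
+          "type": "pattern_replace",
+          "pattern": "\\d+",
+          "replacement": "#",
+          "all": false
+        }
+      },
+      "analyzer": {
+        "first_match_analyzer": {
+          "tokenizer": "standard",
+          "filter": ["first_digit_replace"]
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}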
+ + +## Example + +The following example request creates a new index named `text_index` and configures an analyzer with a `pattern_replace` filter to replace tokens containing digits with the string `[NUM]`: + +```json +PUT /text_index +{ + "settings": { + "analysis": { + "filter": { + "number_replace_filter": { + "type": "pattern_replace", + "pattern": "\\d+", + "replacement": "[NUM]" + } + }, + "analyzer": { + "number_analyzer": { + "tokenizer": "standard", + "filter": [ + "lowercase", + "number_replace_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /text_index/_analyze +{ + "text": "Visit us at 98765 Example St.", + "analyzer": "number_analyzer" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "visit", + "start_offset": 0, + "end_offset": 5, + "type": "", + "position": 0 + }, + { + "token": "us", + "start_offset": 6, + "end_offset": 8, + "type": "", + "position": 1 + }, + { + "token": "at", + "start_offset": 9, + "end_offset": 11, + "type": "", + "position": 2 + }, + { + "token": "[NUM]", + "start_offset": 12, + "end_offset": 17, + "type": "", + "position": 3 + }, + { + "token": "example", + "start_offset": 18, + "end_offset": 25, + "type": "", + "position": 4 + }, + { + "token": "st", + "start_offset": 26, + "end_offset": 28, + "type": "", + "position": 5 + } + ] +} +``` diff --git a/_analyzers/token-filters/phonetic.md b/_analyzers/token-filters/phonetic.md new file mode 100644 index 0000000000..7fe380851f --- /dev/null +++ b/_analyzers/token-filters/phonetic.md @@ -0,0 +1,98 @@ +--- +layout: default +title: Phonetic +parent: Token filters +nav_order: 330 +--- + +# Phonetic token filter + +The `phonetic` token filter transforms tokens into their phonetic representations, enabling more flexible matching of words that sound similar but are spelled differently. This is particularly useful for searching names, brands, or other entities that users might spell differently but pronounce similarly. + +The `phonetic` token filter is not included in OpenSearch distributions by default. To use this token filter, you must first install the `analysis-phonetic` plugin as follows and then restart OpenSearch: + +```bash +./bin/opensearch-plugin install analysis-phonetic +``` +{% include copy.html %} + +For more information about installing plugins, see [Installing plugins]({{site.url}}{{site.baseurl}}/install-and-configure/plugins/). +{: .note} + +## Parameters + +The `phonetic` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`encoder` | Optional | String | Specifies the phonetic algorithm to use.

Valid values are:
- `metaphone` (default)
- `double_metaphone`
- `soundex`
- `refined_soundex`
- `caverphone1`
- `caverphone2`
- `cologne`
- `nysiis`
- `koelnerphonetik`
- `haasephonetik`
- `beider_morse`
- `daitch_mokotoff ` +`replace` | Optional | Boolean | Whether to replace the original token. If `false`, the original token is included in the output along with the phonetic encoding. Default is `true`. + + +## Example + +The following example request creates a new index named `names_index` and configures an analyzer with a `phonetic` filter: + +```json +PUT /names_index +{ + "settings": { + "analysis": { + "filter": { + "my_phonetic_filter": { + "type": "phonetic", + "encoder": "double_metaphone", + "replace": true + } + }, + "analyzer": { + "phonetic_analyzer": { + "tokenizer": "standard", + "filter": [ + "my_phonetic_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated for the names `Stephen` and `Steven` using the analyzer: + +```json +POST /names_index/_analyze +{ + "text": "Stephen", + "analyzer": "phonetic_analyzer" +} +``` +{% include copy-curl.html %} + +```json +POST /names_index/_analyze +{ + "text": "Steven", + "analyzer": "phonetic_analyzer" +} +``` +{% include copy-curl.html %} + +In both cases, the response contains the same generated token: + +```json +{ + "tokens": [ + { + "token": "STFN", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + } + ] +} +``` diff --git a/_analyzers/token-filters/porter-stem.md b/_analyzers/token-filters/porter-stem.md new file mode 100644 index 0000000000..fa2f4208a7 --- /dev/null +++ b/_analyzers/token-filters/porter-stem.md @@ -0,0 +1,83 @@ +--- +layout: default +title: Porter stem +parent: Token filters +nav_order: 340 +--- + +# Porter stem token filter + +The `porter_stem` token filter reduces words to their base (or _stem_) form and removes common suffixes from words, which helps in matching similar words by their root. For example, the word `running` is stemmed to `run`. This token filter is primarily used for the English language and provides stemming based on the [Porter stemming algorithm](https://snowballstem.org/algorithms/porter/stemmer.html). 
+ + +## Example + +The following example request creates a new index named `my_stem_index` and configures an analyzer with a `porter_stem` filter: + +```json +PUT /my_stem_index +{ + "settings": { + "analysis": { + "filter": { + "my_porter_stem": { + "type": "porter_stem" + } + }, + "analyzer": { + "porter_analyzer": { + "tokenizer": "standard", + "filter": [ + "lowercase", + "my_porter_stem" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /my_stem_index/_analyze +{ + "text": "running runners ran", + "analyzer": "porter_analyzer" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "run", + "start_offset": 0, + "end_offset": 7, + "type": "", + "position": 0 + }, + { + "token": "runner", + "start_offset": 8, + "end_offset": 15, + "type": "", + "position": 1 + }, + { + "token": "ran", + "start_offset": 16, + "end_offset": 19, + "type": "", + "position": 2 + } + ] +} +``` diff --git a/_analyzers/token-filters/predicate-token-filter.md b/_analyzers/token-filters/predicate-token-filter.md new file mode 100644 index 0000000000..24729f0224 --- /dev/null +++ b/_analyzers/token-filters/predicate-token-filter.md @@ -0,0 +1,82 @@ +--- +layout: default +title: Predicate token filter +parent: Token filters +nav_order: 340 +--- + +# Predicate token filter + +The `predicate_token_filter` evaluates whether tokens should be kept or discarded, depending on the conditions defined in a custom script. The tokens are evaluated in the analysis predicate context. This filter supports only inline Painless scripts. + +## Parameters + +The `predicate_token_filter` has one required parameter: `script`. This parameter provides a condition that is used to evaluate whether the token should be kept. + +## Example + +The following example request creates a new index named `predicate_index` and configures an analyzer with a `predicate_token_filter`. 
The filter specifies to only output tokens if they are longer than 7 characters: + +```json +PUT /predicate_index +{ + "settings": { + "analysis": { + "filter": { + "my_predicate_filter": { + "type": "predicate_token_filter", + "script": { + "source": "token.term.length() > 7" + } + } + }, + "analyzer": { + "predicate_analyzer": { + "tokenizer": "standard", + "filter": [ + "lowercase", + "my_predicate_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST /predicate_index/_analyze +{ + "text": "The OpenSearch community is growing rapidly", + "analyzer": "predicate_analyzer" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "opensearch", + "start_offset": 4, + "end_offset": 14, + "type": "", + "position": 1 + }, + { + "token": "community", + "start_offset": 15, + "end_offset": 24, + "type": "", + "position": 2 + } + ] +} +``` diff --git a/_analyzers/token-filters/remove-duplicates.md b/_analyzers/token-filters/remove-duplicates.md new file mode 100644 index 0000000000..b0a589884a --- /dev/null +++ b/_analyzers/token-filters/remove-duplicates.md @@ -0,0 +1,152 @@ +--- +layout: default +title: Remove duplicates +parent: Token filters +nav_order: 350 +--- + +# Remove duplicates token filter + +The `remove_duplicates` token filter is used to remove duplicate tokens that are generated in the same position during analysis. + +## Example + +The following example request creates an index with a `keyword_repeat` token filter. The filter adds a `keyword` version of each token in the same position as the token itself and then uses a `kstem` to create a stemmed version of the token: + +```json +PUT /example-index +{ + "settings": { + "analysis": { + "analyzer": { + "custom_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "keyword_repeat", + "kstem" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +Use the following request to analyze the string `Slower turtle`: + +```json +GET /example-index/_analyze +{ + "analyzer": "custom_analyzer", + "text": "Slower turtle" +} +``` +{% include copy-curl.html %} + +The response contains the token `turtle` twice in the same position: + +```json +{ + "tokens": [ + { + "token": "slower", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + }, + { + "token": "slow", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + }, + { + "token": "turtle", + "start_offset": 7, + "end_offset": 13, + "type": "", + "position": 1 + }, + { + "token": "turtle", + "start_offset": 7, + "end_offset": 13, + "type": "", + "position": 1 + } + ] +} +``` + +The duplicate token can be removed by adding a `remove_duplicates` token filter to the index settings: + +```json +PUT /index-remove-duplicate +{ + "settings": { + "analysis": { + "analyzer": { + "custom_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "keyword_repeat", + "kstem", + "remove_duplicates" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /index-remove-duplicate/_analyze +{ + "analyzer": "custom_analyzer", + "text": "Slower turtle" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "slower", 
+ "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + }, + { + "token": "slow", + "start_offset": 0, + "end_offset": 6, + "type": "", + "position": 0 + }, + { + "token": "turtle", + "start_offset": 7, + "end_offset": 13, + "type": "", + "position": 1 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/token-filters/reverse.md b/_analyzers/token-filters/reverse.md new file mode 100644 index 0000000000..dc48f07e77 --- /dev/null +++ b/_analyzers/token-filters/reverse.md @@ -0,0 +1,86 @@ +--- +layout: default +title: Reverse +parent: Token filters +nav_order: 360 +--- + +# Reverse token filter + +The `reverse` token filter reverses the order of the characters in each token, making suffix information accessible at the beginning of the reversed tokens during analysis. + +This is useful for suffix-based searches: + +The `reverse` token filter is useful when you need to perform suffix-based searches, such as in the following scenarios: + +- **Suffix matching**: Searching for words based on their suffixes, such as identifying words with a specific ending (for example, `-tion` or `-ing`). +- **File extension searches**: Searching for files by their extensions, such as `.txt` or `.jpg`. +- **Custom sorting or ranking**: By reversing tokens, you can implement unique sorting or ranking logic based on suffixes. +- **Autocomplete for suffixes**: Implementing autocomplete suggestions that use suffixes rather than prefixes. + + +## Example + +The following example request creates a new index named `my-reverse-index` and configures an analyzer with a `reverse` filter: + +```json +PUT /my-reverse-index +{ + "settings": { + "analysis": { + "filter": { + "reverse_filter": { + "type": "reverse" + } + }, + "analyzer": { + "my_reverse_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "reverse_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my-reverse-index/_analyze +{ + "analyzer": "my_reverse_analyzer", + "text": "hello world" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "olleh", + "start_offset": 0, + "end_offset": 5, + "type": "", + "position": 0 + }, + { + "token": "dlrow", + "start_offset": 6, + "end_offset": 11, + "type": "", + "position": 1 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/token-filters/shingle.md b/_analyzers/token-filters/shingle.md new file mode 100644 index 0000000000..ea961bf3e0 --- /dev/null +++ b/_analyzers/token-filters/shingle.md @@ -0,0 +1,120 @@ +--- +layout: default +title: Shingle +parent: Token filters +nav_order: 370 +--- + +# Shingle token filter + +The `shingle` token filter is used to generate word n-grams, or _shingles_, from input text. For example, for the string `slow green turtle`, the `shingle` filter creates the following one- and two-word shingles: `slow`, `slow green`, `green`, `green turtle`, and `turtle`. + +This token filter is often used in conjunction with other filters to enhance search accuracy by indexing phrases rather than individual tokens. For more information, see [Phrase suggester]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/did-you-mean/#phrase-suggester). + +## Parameters + +The `shingle` token filter can be configured with the following parameters. 
+ +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`min_shingle_size` | Optional | Integer | The minimum number of tokens to concatenate. Default is `2`. +`max_shingle_size` | Optional | Integer | The maximum number of tokens to concatenate. Default is `2`. +`output_unigrams` | Optional | Boolean | Whether to include unigrams (individual tokens) as output. Default is `true`. +`output_unigrams_if_no_shingles` | Optional | Boolean | Whether to output unigrams if no shingles are generated. Default is `false`. +`token_separator` | Optional | String | A separator used to concatenate tokens into a shingle. Default is a space (`" "`). +`filler_token` | Optional | String | A token inserted into empty positions or gaps between tokens. Default is an underscore (`_`). + +If `output_unigrams` and `output_unigrams_if_no_shingles` are both set to `true`, `output_unigrams_if_no_shingles` is ignored. +{: .note} + +## Example + +The following example request creates a new index named `my-shingle-index` and configures an analyzer with a `shingle` filter: + +```json +PUT /my-shingle-index +{ + "settings": { + "analysis": { + "filter": { + "my_shingle_filter": { + "type": "shingle", + "min_shingle_size": 2, + "max_shingle_size": 2, + "output_unigrams": true + } + }, + "analyzer": { + "my_shingle_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "my_shingle_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my-shingle-index/_analyze +{ + "analyzer": "my_shingle_analyzer", + "text": "slow green turtle" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "slow", + "start_offset": 0, + "end_offset": 4, + "type": "", + "position": 0 + }, + { + "token": "slow green", + "start_offset": 0, + "end_offset": 10, + "type": "shingle", + "position": 0, + "positionLength": 2 + }, + { + "token": "green", + "start_offset": 5, + "end_offset": 10, + "type": "", + "position": 1 + }, + { + "token": "green turtle", + "start_offset": 5, + "end_offset": 17, + "type": "shingle", + "position": 1, + "positionLength": 2 + }, + { + "token": "turtle", + "start_offset": 11, + "end_offset": 17, + "type": "", + "position": 2 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/token-filters/snowball.md b/_analyzers/token-filters/snowball.md new file mode 100644 index 0000000000..149486e727 --- /dev/null +++ b/_analyzers/token-filters/snowball.md @@ -0,0 +1,108 @@ +--- +layout: default +title: Snowball +parent: Token filters +nav_order: 380 +--- + +# Snowball token filter + +The `snowball` token filter is a stemming filter based on the [Snowball](https://snowballstem.org/) algorithm. It supports many languages and is more efficient and accurate than the Porter stemming algorithm. 
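+
+Note that, as shown in the list that follows, the `language` values for this filter are capitalized (for example, `English`), unlike the lowercase values used by the `stemmer` token filter. As a quick check, you can define the filter inline in the `_analyze` API; the following is a minimal sketch using only built-in components:
+
+```json
+POST /_analyze
+{
+  "tokenizer": "standard",
+  "filter": [
+    { "type": "snowball", "language": "English" }
+  ],
+  "text": "running runners"
+}
+```
+{% include copy-curl.html %}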
+ +## Parameters + +The `snowball` token filter can be configured with a `language` parameter that accepts the following values: + +- `Arabic` +- `Armenian` +- `Basque` +- `Catalan` +- `Danish` +- `Dutch` +- `English` (default) +- `Estonian` +- `Finnish` +- `French` +- `German` +- `German2` +- `Hungarian` +- `Italian` +- `Irish` +- `Kp` +- `Lithuanian` +- `Lovins` +- `Norwegian` +- `Porter` +- `Portuguese` +- `Romanian` +- `Russian` +- `Spanish` +- `Swedish` +- `Turkish` + +## Example + +The following example request creates a new index named `my-snowball-index` and configures an analyzer with a `snowball` filter: + +```json +PUT /my-snowball-index +{ + "settings": { + "analysis": { + "filter": { + "my_snowball_filter": { + "type": "snowball", + "language": "English" + } + }, + "analyzer": { + "my_snowball_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "my_snowball_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my-snowball-index/_analyze +{ + "analyzer": "my_snowball_analyzer", + "text": "running runners" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "run", + "start_offset": 0, + "end_offset": 7, + "type": "", + "position": 0 + }, + { + "token": "runner", + "start_offset": 8, + "end_offset": 15, + "type": "", + "position": 1 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/token-filters/stemmer-override.md b/_analyzers/token-filters/stemmer-override.md new file mode 100644 index 0000000000..c06f673714 --- /dev/null +++ b/_analyzers/token-filters/stemmer-override.md @@ -0,0 +1,139 @@ +--- +layout: default +title: Stemmer override +parent: Token filters +nav_order: 400 +--- + +# Stemmer override token filter + +The `stemmer_override` token filter allows you to define custom stemming rules that override the behavior of default stemmers like Porter or Snowball. This can be useful when you want to apply specific stemming behavior to certain words that might not be modified correctly by the standard stemming algorithms. + +## Parameters + +The `stemmer_override` token filter must be configured with exactly one of the following parameters. + +Parameter | Data type | Description +:--- | :--- | :--- +`rules` | String | Defines the override rules directly in the settings. +`rules_path` | String | Specifies the path to the file containing custom rules (mappings). The path can be either an absolute path or a path relative to the config directory. 
+ +## Example + +The following example request creates a new index named `my-index` and configures an analyzer with a `stemmer_override` filter: + +```json +PUT /my-index +{ + "settings": { + "analysis": { + "filter": { + "my_stemmer_override_filter": { + "type": "stemmer_override", + "rules": [ + "running, runner => run", + "bought => buy", + "best => good" + ] + } + }, + "analyzer": { + "my_custom_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "my_stemmer_override_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my-index/_analyze +{ + "analyzer": "my_custom_analyzer", + "text": "I am a runner and bought the best shoes" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "i", + "start_offset": 0, + "end_offset": 1, + "type": "", + "position": 0 + }, + { + "token": "am", + "start_offset": 2, + "end_offset": 4, + "type": "", + "position": 1 + }, + { + "token": "a", + "start_offset": 5, + "end_offset": 6, + "type": "", + "position": 2 + }, + { + "token": "run", + "start_offset": 7, + "end_offset": 13, + "type": "", + "position": 3 + }, + { + "token": "and", + "start_offset": 14, + "end_offset": 17, + "type": "", + "position": 4 + }, + { + "token": "buy", + "start_offset": 18, + "end_offset": 24, + "type": "", + "position": 5 + }, + { + "token": "the", + "start_offset": 25, + "end_offset": 28, + "type": "", + "position": 6 + }, + { + "token": "good", + "start_offset": 29, + "end_offset": 33, + "type": "", + "position": 7 + }, + { + "token": "shoes", + "start_offset": 34, + "end_offset": 39, + "type": "", + "position": 8 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/token-filters/stemmer.md b/_analyzers/token-filters/stemmer.md new file mode 100644 index 0000000000..dd1344fcbc --- /dev/null +++ b/_analyzers/token-filters/stemmer.md @@ -0,0 +1,118 @@ +--- +layout: default +title: Stemmer +parent: Token filters +nav_order: 390 +--- + +# Stemmer token filter + +The `stemmer` token filter reduces words to their root or base form (also known as their _stem_). 
+ +## Parameters + +The `stemmer` token filter can be configured with a `language` parameter that accepts the following values: + +- Arabic: `arabic` +- Armenian: `armenian` +- Basque: `basque` +- Bengali: `bengali` +- Brazilian Portuguese: `brazilian` +- Bulgarian: `bulgarian` +- Catalan: `catalan` +- Czech: `czech` +- Danish: `danish` +- Dutch: `dutch, dutch_kp` +- English: `english` (default), `light_english`, `lovins`, `minimal_english`, `porter2`, `possessive_english` +- Estonian: `estonian` +- Finnish: `finnish`, `light_finnish` +- French: `light_french`, `french`, `minimal_french` +- Galician: `galician`, `minimal_galician` (plural step only) +- German: `light_german`, `german`, `german2`, `minimal_german` +- Greek: `greek` +- Hindi: `hindi` +- Hungarian: `hungarian, light_hungarian` +- Indonesian: `indonesian` +- Irish: `irish` +- Italian: `light_italian, italian` +- Kurdish (Sorani): `sorani` +- Latvian: `latvian` +- Lithuanian: `lithuanian` +- Norwegian (Bokmål): `norwegian`, `light_norwegian`, `minimal_norwegian` +- Norwegian (Nynorsk): `light_nynorsk`, `minimal_nynorsk` +- Portuguese: `light_portuguese`, `minimal_portuguese`, `portuguese`, `portuguese_rslp` +- Romanian: `romanian` +- Russian: `russian`, `light_russian` +- Spanish: `light_spanish`, `spanish` +- Swedish: `swedish`, `light_swedish` +- Turkish: `turkish` + +You can also use the `name` parameter as an alias for the `language` parameter. If both are set, the `name` parameter is ignored. +{: .note} + +## Example + +The following example request creates a new index named `my-stemmer-index` and configures an analyzer with a `stemmer` filter: + +```json +PUT /my-stemmer-index +{ + "settings": { + "analysis": { + "filter": { + "my_english_stemmer": { + "type": "stemmer", + "language": "english" + } + }, + "analyzer": { + "my_stemmer_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "my_english_stemmer" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my-stemmer-index/_analyze +{ + "analyzer": "my_stemmer_analyzer", + "text": "running runs" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "run", + "start_offset": 0, + "end_offset": 7, + "type": "", + "position": 0 + }, + { + "token": "run", + "start_offset": 8, + "end_offset": 12, + "type": "", + "position": 1 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/token-filters/stop.md b/_analyzers/token-filters/stop.md new file mode 100644 index 0000000000..8f3e01b72d --- /dev/null +++ b/_analyzers/token-filters/stop.md @@ -0,0 +1,111 @@ +--- +layout: default +title: Stop +parent: Token filters +nav_order: 410 +--- + +# Stop token filter + +The `stop` token filter is used to remove common words (also known as _stopwords_) from a token stream during analysis. Stopwords are typically articles and prepositions, such as `a` or `for`. These words are not significantly meaningful in search queries and are often excluded to improve search efficiency and relevance. + +The default list of English stopwords includes the following words: `a`, `an`, `and`, `are`, `as`, `at`, `be`, `but`, `by`, `for`, `if`, `in`, `into`, `is`, `it`, `no`, `not`, `of`, `on`, `or`, `such`, `that`, `the`, `their`, `then`, `there`, `these`, `they`, `this`, `to`, `was`, `will`, and `with`. 
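+
+Instead of a predefined language list, you can also supply your own array of stopwords. For a quick test without creating an index, the filter can be defined inline in the `_analyze` API, as in the following minimal sketch (the stopword choices here are arbitrary); it should return only the tokens `quick` and `dog`:
+
+```json
+POST /_analyze
+{
+  "tokenizer": "standard",
+  "filter": [
+    "lowercase",
+    { "type": "stop", "stopwords": [ "the", "jumps" ] }
+  ],
+  "text": "The quick dog jumps"
+}
+```
+{% include copy-curl.html %}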
+ +## Parameters + +The `stop` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`stopwords` | Optional | String | Specifies either a custom array of stopwords or a language for which to fetch the predefined Lucene stopword list:

- [`_arabic_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt)
- [`_armenian_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/hy/stopwords.txt)
- [`_basque_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/eu/stopwords.txt)
- [`_bengali_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt)
- [`_brazilian_` (Brazilian Portuguese)](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/br/stopwords.txt)
- [`_bulgarian_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt)
- [`_catalan_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/ca/stopwords.txt)
- [`_cjk_` (Chinese, Japanese, and Korean)](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/cjk/stopwords.txt)
- [`_czech_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/cz/stopwords.txt)
- [`_danish_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/snowball/danish_stop.txt)
- [`_dutch_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/snowball/dutch_stop.txt)
- [`_english_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/EnglishAnalyzer.java#L48) (default)
- [`_estonian_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/et/stopwords.txt)
- [`_finnish_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/snowball/finnish_stop.txt)
- [`_french_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/snowball/french_stop.txt)
- [`_galician_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/gl/stopwords.txt)
- [`_german_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/snowball/german_stop.txt)
- [`_greek_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/el/stopwords.txt)
- [`_hindi_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt)
- [`_hungarian_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/snowball/hungarian_stop.txt)
- [`_indonesian_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/id/stopwords.txt)
- [`_irish_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/ga/stopwords.txt)
- [`_italian_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/snowball/italian_stop.txt)
- [`_latvian_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/lv/stopwords.txt)
- [`_lithuanian_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/lt/stopwords.txt)
- [`_norwegian_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/snowball/norwegian_stop.txt)
- [`_persian_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt)
- [`_portuguese_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/snowball/portuguese_stop.txt)
- [`_romanian_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt)
- [`_russian_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/snowball/russian_stop.txt)
- [`_sorani_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/ckb/stopwords.txt)
- [`_spanish_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/snowball/spanish_stop.txt)
- [`_swedish_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/snowball/swedish_stop.txt)
- [`_thai_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/th/stopwords.txt)
- [`_turkish_`](https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/resources/org/apache/lucene/analysis/tr/stopwords.txt) +`stopwords_path` | Optional | String | Specifies the file path (absolute or relative to the config directory) of the file containing custom stopwords. +`ignore_case` | Optional | Boolean | If `true`, stopwords will be matched regardless of their case. Default is `false`. +`remove_trailing` | Optional | Boolean | If `true`, trailing stopwords will be removed during analysis. Default is `true`. + +## Example + +The following example request creates a new index named `my-stopword-index` and configures an analyzer with a `stop` filter that uses the predefined stopword list for the English language: + +```json +PUT /my-stopword-index +{ + "settings": { + "analysis": { + "filter": { + "my_stop_filter": { + "type": "stop", + "stopwords": "_english_" + } + }, + "analyzer": { + "my_stop_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "my_stop_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my-stopword-index/_analyze +{ + "analyzer": "my_stop_analyzer", + "text": "A quick dog jumps over the turtle" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "quick", + "start_offset": 2, + "end_offset": 7, + "type": "", + "position": 1 + }, + { + "token": "dog", + "start_offset": 8, + "end_offset": 11, + "type": "", + "position": 2 + }, + { + "token": "jumps", + "start_offset": 12, + "end_offset": 17, + "type": "", + "position": 3 + }, + { + "token": "over", + "start_offset": 18, + "end_offset": 22, + "type": "", + "position": 4 + }, + { + "token": "turtle", + "start_offset": 27, + "end_offset": 33, + "type": "", + "position": 6 + } + ] +} +``` \ No newline at end of file diff --git a/_analyzers/token-filters/synonym-graph.md b/_analyzers/token-filters/synonym-graph.md new file mode 100644 index 0000000000..75c7c79151 --- /dev/null +++ b/_analyzers/token-filters/synonym-graph.md @@ -0,0 +1,180 @@ +--- +layout: default +title: Synonym graph +parent: Token filters +nav_order: 420 +--- + +# Synonym graph token filter + +The `synonym_graph` token filter is a more advanced version of the `synonym` token filter. It supports multiword synonyms and processes synonyms across multiple tokens, making it ideal for phrases or scenarios in which relationships between tokens are important. + +## Parameters + +The `synonym_graph` token filter can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`synonyms` | Either `synonyms` or `synonyms_path` must be specified | String | A list of synonym rules defined directly in the configuration. +`synonyms_path` | Either `synonyms` or `synonyms_path` must be specified | String | The file path to a file containing synonym rules (either an absolute path or a path relative to the config directory). +`lenient` | Optional | Boolean | Whether to ignore exceptions when loading the rule configurations. Default is `false`. +`format` | Optional | String | Specifies the format used to determine how OpenSearch defines and interprets synonyms. Valid values are:
- `solr`
- [`wordnet`](https://wordnet.princeton.edu/).
Default is `solr`. +`expand` | Optional | Boolean | Whether to expand equivalent synonym rules. Default is `false`.

For example, if `synonyms` is defined as `"quick, fast"` and `expand` is set to `true`, then the synonym rules are configured as follows:
- `quick => quick`
- `quick => fast`
- `fast => quick`
- `fast => fast`

If `expand` is set to `false`, the synonym rules are configured as follows:
- `quick => quick`
- `fast => quick`
+
+## Example: Solr format
+
+The following example request creates a new index named `my-index` and configures an analyzer with a `synonym_graph` filter. The filter is configured with the default `solr` rule format:
+
+```json
+PUT /my-index
+{
+  "settings": {
+    "analysis": {
+      "filter": {
+        "my_synonym_graph_filter": {
+          "type": "synonym_graph",
+          "synonyms": [
+            "sports car, race car",
+            "fast car, speedy vehicle",
+            "luxury car, premium vehicle",
+            "electric car, EV"
+          ]
+        }
+      },
+      "analyzer": {
+        "my_synonym_graph_analyzer": {
+          "type": "custom",
+          "tokenizer": "standard",
+          "filter": [
+            "lowercase",
+            "my_synonym_graph_filter"
+          ]
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Generated tokens
+
+Use the following request to examine the tokens generated using the analyzer:
+
+```json
+GET /my-index/_analyze
+{
+  "analyzer": "my_synonym_graph_analyzer",
+  "text": "I just bought a sports car and it is a fast car."
+}
+```
+{% include copy-curl.html %}
+
+The response contains the generated tokens:
+
+```json
+{
+  "tokens": [
+    {"token": "i","start_offset": 0,"end_offset": 1,"type": "<ALPHANUM>","position": 0},
+    {"token": "just","start_offset": 2,"end_offset": 6,"type": "<ALPHANUM>","position": 1},
+    {"token": "bought","start_offset": 7,"end_offset": 13,"type": "<ALPHANUM>","position": 2},
+    {"token": "a","start_offset": 14,"end_offset": 15,"type": "<ALPHANUM>","position": 3},
+    {"token": "race","start_offset": 16,"end_offset": 26,"type": "SYNONYM","position": 4},
+    {"token": "sports","start_offset": 16,"end_offset": 22,"type": "<ALPHANUM>","position": 4,"positionLength": 2},
+    {"token": "car","start_offset": 16,"end_offset": 26,"type": "SYNONYM","position": 5,"positionLength": 2},
+    {"token": "car","start_offset": 23,"end_offset": 26,"type": "<ALPHANUM>","position": 6},
+    {"token": "and","start_offset": 27,"end_offset": 30,"type": "<ALPHANUM>","position": 7},
+    {"token": "it","start_offset": 31,"end_offset": 33,"type": "<ALPHANUM>","position": 8},
+    {"token": "is","start_offset": 34,"end_offset": 36,"type": "<ALPHANUM>","position": 9},
+    {"token": "a","start_offset": 37,"end_offset": 38,"type": "<ALPHANUM>","position": 10},
+    {"token": "speedy","start_offset": 39,"end_offset": 47,"type": "SYNONYM","position": 11},
+    {"token": "fast","start_offset": 39,"end_offset": 43,"type": "<ALPHANUM>","position": 11,"positionLength": 2},
+    {"token": "vehicle","start_offset": 39,"end_offset": 47,"type": "SYNONYM","position": 12,"positionLength": 2},
+    {"token": "car","start_offset": 44,"end_offset": 47,"type": "<ALPHANUM>","position": 13}
+  ]
+}
+```
+
+## Example: WordNet format
+
+The following example request creates a new index named `my-wordnet-index` and configures an analyzer with a `synonym_graph` filter. The filter is configured with the [`wordnet`](https://wordnet.princeton.edu/) rule format:
+
+```json
+PUT /my-wordnet-index
+{
+  "settings": {
+    "analysis": {
+      "filter": {
+        "my_synonym_graph_filter": {
+          "type": "synonym_graph",
+          "format": "wordnet",
+          "synonyms": [
+            "s(100000001, 1, 'sports car', n, 1, 0).",
+            "s(100000001, 2, 'race car', n, 1, 0).",
+            "s(100000001, 3, 'fast car', n, 1, 0).",
+            "s(100000001, 4, 'speedy vehicle', n, 1, 0)."
+ ] + } + }, + "analyzer": { + "my_synonym_graph_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "my_synonym_graph_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my-wordnet-index/_analyze +{ + "analyzer": "my_synonym_graph_analyzer", + "text": "I just bought a sports car and it is a fast car." +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "i","start_offset": 0,"end_offset": 1,"type": "","position": 0}, + {"token": "just","start_offset": 2,"end_offset": 6,"type": "","position": 1}, + {"token": "bought","start_offset": 7,"end_offset": 13,"type": "","position": 2}, + {"token": "a","start_offset": 14,"end_offset": 15,"type": "","position": 3}, + {"token": "race","start_offset": 16,"end_offset": 26,"type": "SYNONYM","position": 4}, + {"token": "fast","start_offset": 16,"end_offset": 26,"type": "SYNONYM","position": 4,"positionLength": 2}, + {"token": "speedy","start_offset": 16,"end_offset": 26,"type": "SYNONYM","position": 4,"positionLength": 3}, + {"token": "sports","start_offset": 16,"end_offset": 22,"type": "","position": 4,"positionLength": 4}, + {"token": "car","start_offset": 16,"end_offset": 26,"type": "SYNONYM","position": 5,"positionLength": 4}, + {"token": "car","start_offset": 16,"end_offset": 26,"type": "SYNONYM","position": 6,"positionLength": 3}, + {"token": "vehicle","start_offset": 16,"end_offset": 26,"type": "SYNONYM","position": 7,"positionLength": 2}, + {"token": "car","start_offset": 23,"end_offset": 26,"type": "","position": 8}, + {"token": "and","start_offset": 27,"end_offset": 30,"type": "","position": 9}, + {"token": "it","start_offset": 31,"end_offset": 33,"type": "","position": 10}, + {"token": "is","start_offset": 34,"end_offset": 36,"type": "","position": 11}, + {"token": "a","start_offset": 37,"end_offset": 38,"type": "","position": 12}, + {"token": "sports","start_offset": 39,"end_offset": 47,"type": "SYNONYM","position": 13}, + {"token": "race","start_offset": 39,"end_offset": 47,"type": "SYNONYM","position": 13,"positionLength": 2}, + {"token": "speedy","start_offset": 39,"end_offset": 47,"type": "SYNONYM","position": 13,"positionLength": 3}, + {"token": "fast","start_offset": 39,"end_offset": 43,"type": "","position": 13,"positionLength": 4}, + {"token": "car","start_offset": 39,"end_offset": 47,"type": "SYNONYM","position": 14,"positionLength": 4}, + {"token": "car","start_offset": 39,"end_offset": 47,"type": "SYNONYM","position": 15,"positionLength": 3}, + {"token": "vehicle","start_offset": 39,"end_offset": 47,"type": "SYNONYM","position": 16,"positionLength": 2}, + {"token": "car","start_offset": 44,"end_offset": 47,"type": "","position": 17} + ] +} +``` diff --git a/_analyzers/token-filters/synonym.md b/_analyzers/token-filters/synonym.md new file mode 100644 index 0000000000..296d5cd5db --- /dev/null +++ b/_analyzers/token-filters/synonym.md @@ -0,0 +1,277 @@ +--- +layout: default +title: Synonym +parent: Token filters +nav_order: 415 +--- + +# Synonym token filter + +The `synonym` token filter allows you to map multiple terms to a single term or create equivalence groups between words, improving search flexibility. + +## Parameters + +The `synonym` token filter can be configured with the following parameters. 
+ +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`synonyms` | Either `synonyms` or `synonyms_path` must be specified | String | A list of synonym rules defined directly in the configuration. +`synonyms_path` | Either `synonyms` or `synonyms_path` must be specified | String | The file path to a file containing synonym rules (either an absolute path or a path relative to the config directory). +`lenient` | Optional | Boolean | Whether to ignore exceptions when loading the rule configurations. Default is `false`. +`format` | Optional | String | Specifies the format used to determine how OpenSearch defines and interprets synonyms. Valid values are:
- `solr`
- [`wordnet`](https://wordnet.princeton.edu/).
Default is `solr`. +`expand` | Optional | Boolean | Whether to expand equivalent synonym rules. Default is `false`.

For example, if `synonyms` is defined as `"quick, fast"` and `expand` is set to `true`, then the synonym rules are configured as follows:
- `quick => quick`
- `quick => fast`
- `fast => quick`
- `fast => fast`

If `expand` is set to `false`, the synonym rules are configured as follows:
- `quick => quick`
- `fast => quick` + +## Example: Solr format + +The following example request creates a new index named `my-synonym-index` and configures an analyzer with a `synonym` filter. The filter is configured with the default `solr` rule format: + +```json +PUT /my-synonym-index +{ + "settings": { + "analysis": { + "filter": { + "my_synonym_filter": { + "type": "synonym", + "synonyms": [ + "car, automobile", + "quick, fast, speedy", + "laptop => computer" + ] + } + }, + "analyzer": { + "my_synonym_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "my_synonym_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my-synonym-index/_analyze +{ + "analyzer": "my_synonym_analyzer", + "text": "The quick dog jumps into the car with a laptop" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "the", + "start_offset": 0, + "end_offset": 3, + "type": "", + "position": 0 + }, + { + "token": "quick", + "start_offset": 4, + "end_offset": 9, + "type": "", + "position": 1 + }, + { + "token": "fast", + "start_offset": 4, + "end_offset": 9, + "type": "SYNONYM", + "position": 1 + }, + { + "token": "speedy", + "start_offset": 4, + "end_offset": 9, + "type": "SYNONYM", + "position": 1 + }, + { + "token": "dog", + "start_offset": 10, + "end_offset": 13, + "type": "", + "position": 2 + }, + { + "token": "jumps", + "start_offset": 14, + "end_offset": 19, + "type": "", + "position": 3 + }, + { + "token": "into", + "start_offset": 20, + "end_offset": 24, + "type": "", + "position": 4 + }, + { + "token": "the", + "start_offset": 25, + "end_offset": 28, + "type": "", + "position": 5 + }, + { + "token": "car", + "start_offset": 29, + "end_offset": 32, + "type": "", + "position": 6 + }, + { + "token": "automobile", + "start_offset": 29, + "end_offset": 32, + "type": "SYNONYM", + "position": 6 + }, + { + "token": "with", + "start_offset": 33, + "end_offset": 37, + "type": "", + "position": 7 + }, + { + "token": "a", + "start_offset": 38, + "end_offset": 39, + "type": "", + "position": 8 + }, + { + "token": "computer", + "start_offset": 40, + "end_offset": 46, + "type": "SYNONYM", + "position": 9 + } + ] +} +``` + +## Example: WordNet format + +The following example request creates a new index named `my-wordnet-index` and configures an analyzer with a `synonym` filter. The filter is configured with the [`wordnet`](https://wordnet.princeton.edu/) rule format: + +```json +PUT /my-wordnet-index +{ + "settings": { + "analysis": { + "filter": { + "my_wordnet_synonym_filter": { + "type": "synonym", + "format": "wordnet", + "synonyms": [ + "s(100000001,1,'fast',v,1,0).", + "s(100000001,2,'quick',v,1,0).", + "s(100000001,3,'swift',v,1,0)." 
+ ] + } + }, + "analyzer": { + "my_wordnet_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "my_wordnet_synonym_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my-wordnet-index/_analyze +{ + "analyzer": "my_wordnet_analyzer", + "text": "I have a fast car" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "i", + "start_offset": 0, + "end_offset": 1, + "type": "", + "position": 0 + }, + { + "token": "have", + "start_offset": 2, + "end_offset": 6, + "type": "", + "position": 1 + }, + { + "token": "a", + "start_offset": 7, + "end_offset": 8, + "type": "", + "position": 2 + }, + { + "token": "fast", + "start_offset": 9, + "end_offset": 13, + "type": "", + "position": 3 + }, + { + "token": "quick", + "start_offset": 9, + "end_offset": 13, + "type": "SYNONYM", + "position": 3 + }, + { + "token": "swift", + "start_offset": 9, + "end_offset": 13, + "type": "SYNONYM", + "position": 3 + }, + { + "token": "car", + "start_offset": 14, + "end_offset": 17, + "type": "", + "position": 4 + } + ] +} +``` diff --git a/_analyzers/token-filters/trim.md b/_analyzers/token-filters/trim.md new file mode 100644 index 0000000000..cdfebed52f --- /dev/null +++ b/_analyzers/token-filters/trim.md @@ -0,0 +1,93 @@ +--- +layout: default +title: Trim +parent: Token filters +nav_order: 430 +--- + +# Trim token filter + +The `trim` token filter removes leading and trailing white space characters from tokens. + +Many popular tokenizers, such as `standard`, `keyword`, and `whitespace` tokenizers, automatically strip leading and trailing white space characters during tokenization. When using these tokenizers, there is no need to configure an additional `trim` token filter. 
+{: .note} + + +## Example + +The following example request creates a new index named `my_pattern_trim_index` and configures an analyzer with a `trim` filter and a `pattern` tokenizer, which does not remove leading and trailing white space characters: + +```json +PUT /my_pattern_trim_index +{ + "settings": { + "analysis": { + "filter": { + "my_trim_filter": { + "type": "trim" + } + }, + "tokenizer": { + "my_pattern_tokenizer": { + "type": "pattern", + "pattern": "," + } + }, + "analyzer": { + "my_pattern_trim_analyzer": { + "type": "custom", + "tokenizer": "my_pattern_tokenizer", + "filter": [ + "lowercase", + "my_trim_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my_pattern_trim_index/_analyze +{ + "analyzer": "my_pattern_trim_analyzer", + "text": " OpenSearch , is , powerful " +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "opensearch", + "start_offset": 0, + "end_offset": 12, + "type": "word", + "position": 0 + }, + { + "token": "is", + "start_offset": 13, + "end_offset": 18, + "type": "word", + "position": 1 + }, + { + "token": "powerful", + "start_offset": 19, + "end_offset": 32, + "type": "word", + "position": 2 + } + ] +} +``` diff --git a/_analyzers/token-filters/truncate.md b/_analyzers/token-filters/truncate.md new file mode 100644 index 0000000000..16d1452901 --- /dev/null +++ b/_analyzers/token-filters/truncate.md @@ -0,0 +1,107 @@ +--- +layout: default +title: Truncate +parent: Token filters +nav_order: 440 +--- + +# Truncate token filter + +The `truncate` token filter is used to shorten tokens exceeding a specified length. It trims tokens to a maximum number of characters, ensuring that tokens exceeding this limit are truncated. + +## Parameters + +The `truncate` token filter can be configured with the following parameter. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`length` | Optional | Integer | Specifies the maximum length of the generated token. Default is `10`. 
+ +## Example + +The following example request creates a new index named `truncate_example` and configures an analyzer with a `truncate` filter: + +```json +PUT /truncate_example +{ + "settings": { + "analysis": { + "filter": { + "truncate_filter": { + "type": "truncate", + "length": 5 + } + }, + "analyzer": { + "truncate_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "truncate_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /truncate_example/_analyze +{ + "analyzer": "truncate_analyzer", + "text": "OpenSearch is powerful and scalable" +} + +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "opens", + "start_offset": 0, + "end_offset": 10, + "type": "", + "position": 0 + }, + { + "token": "is", + "start_offset": 11, + "end_offset": 13, + "type": "", + "position": 1 + }, + { + "token": "power", + "start_offset": 14, + "end_offset": 22, + "type": "", + "position": 2 + }, + { + "token": "and", + "start_offset": 23, + "end_offset": 26, + "type": "", + "position": 3 + }, + { + "token": "scala", + "start_offset": 27, + "end_offset": 35, + "type": "", + "position": 4 + } + ] +} +``` diff --git a/_analyzers/token-filters/unique.md b/_analyzers/token-filters/unique.md new file mode 100644 index 0000000000..c4dfcbab16 --- /dev/null +++ b/_analyzers/token-filters/unique.md @@ -0,0 +1,106 @@ +--- +layout: default +title: Unique +parent: Token filters +nav_order: 450 +--- + +# Unique token filter + +The `unique` token filter ensures that only unique tokens are kept during the analysis process, removing duplicate tokens that appear within a single field or text block. + +## Parameters + +The `unique` token filter can be configured with the following parameter. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`only_on_same_position` | Optional | Boolean | If `true`, the token filter acts as a `remove_duplicates` token filter and only removes tokens that are in the same position. Default is `false`. 
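+
+To see the effect of `only_on_same_position`, you can combine the filter with `keyword_repeat` and `kstem` inline in the `_analyze` API. The following minimal sketch reuses the `Slower turtle` input from the `remove_duplicates` filter documentation; with `only_on_same_position` set to `true`, only the duplicate `turtle` token (which appears twice in the same position) is removed, while `slower` and `slow` are both kept:
+
+```json
+POST /_analyze
+{
+  "tokenizer": "standard",
+  "filter": [
+    "lowercase",
+    "keyword_repeat",
+    "kstem",
+    { "type": "unique", "only_on_same_position": true }
+  ],
+  "text": "Slower turtle"
+}
+```
+{% include copy-curl.html %}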
+ +## Example + +The following example request creates a new index named `unique_example` and configures an analyzer with a `unique` filter: + +```json +PUT /unique_example +{ + "settings": { + "analysis": { + "filter": { + "unique_filter": { + "type": "unique", + "only_on_same_position": false + } + }, + "analyzer": { + "unique_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "unique_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /unique_example/_analyze +{ + "analyzer": "unique_analyzer", + "text": "OpenSearch OpenSearch is powerful powerful and scalable" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "opensearch", + "start_offset": 0, + "end_offset": 10, + "type": "", + "position": 0 + }, + { + "token": "is", + "start_offset": 22, + "end_offset": 24, + "type": "", + "position": 1 + }, + { + "token": "powerful", + "start_offset": 25, + "end_offset": 33, + "type": "", + "position": 2 + }, + { + "token": "and", + "start_offset": 43, + "end_offset": 46, + "type": "", + "position": 3 + }, + { + "token": "scalable", + "start_offset": 47, + "end_offset": 55, + "type": "", + "position": 4 + } + ] +} +``` diff --git a/_analyzers/token-filters/uppercase.md b/_analyzers/token-filters/uppercase.md new file mode 100644 index 0000000000..5026892400 --- /dev/null +++ b/_analyzers/token-filters/uppercase.md @@ -0,0 +1,83 @@ +--- +layout: default +title: Uppercase +parent: Token filters +nav_order: 460 +--- + +# Uppercase token filter + +The `uppercase` token filter is used to convert all tokens (words) to uppercase during analysis. + +## Example + +The following example request creates a new index named `uppercase_example` and configures an analyzer with an `uppercase` filter: + +```json +PUT /uppercase_example +{ + "settings": { + "analysis": { + "filter": { + "uppercase_filter": { + "type": "uppercase" + } + }, + "analyzer": { + "uppercase_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "uppercase_filter" + ] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /uppercase_example/_analyze +{ + "analyzer": "uppercase_analyzer", + "text": "OpenSearch is powerful" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "OPENSEARCH", + "start_offset": 0, + "end_offset": 10, + "type": "", + "position": 0 + }, + { + "token": "IS", + "start_offset": 11, + "end_offset": 13, + "type": "", + "position": 1 + }, + { + "token": "POWERFUL", + "start_offset": 14, + "end_offset": 22, + "type": "", + "position": 2 + } + ] +} +``` diff --git a/_analyzers/token-filters/word-delimiter-graph.md b/_analyzers/token-filters/word-delimiter-graph.md new file mode 100644 index 0000000000..ac734bebeb --- /dev/null +++ b/_analyzers/token-filters/word-delimiter-graph.md @@ -0,0 +1,164 @@ +--- +layout: default +title: Word delimiter graph +parent: Token filters +nav_order: 480 +--- + +# Word delimiter graph token filter + +The `word_delimiter_graph` token filter is used to split tokens at predefined characters and also offers optional token normalization based on customizable rules. 
+ +The `word_delimiter_graph` filter is used to remove punctuation from complex identifiers like part numbers or product IDs. In such cases, it is best used with the `keyword` tokenizer. For hyphenated words, use the `synonym_graph` token filter instead of the `word_delimiter_graph` filter because users frequently search for these terms both with and without hyphens. +{: .note} + +By default, the filter applies the following rules. + +| Description | Input | Output | +|:---|:---|:---| +| Treats non-alphanumeric characters as delimiters. | `ultra-fast` | `ultra`, `fast` | +| Removes delimiters at the beginning or end of tokens. | `Z99++'Decoder'`| `Z99`, `Decoder` | +| Splits tokens when there is a transition between uppercase and lowercase letters. | `OpenSearch` | `Open`, `Search` | +| Splits tokens when there is a transition between letters and numbers. | `T1000` | `T`, `1000` | +| Removes the possessive ('s) from the end of tokens. | `John's` | `John` | + +It's important **not** to use tokenizers that strip punctuation, like the `standard` tokenizer, with this filter. Doing so may prevent proper token splitting and interfere with options like `catenate_all` or `preserve_original`. We recommend using this filter with a `keyword` or `whitespace` tokenizer. +{: .important} + +## Parameters + +You can configure the `word_delimiter_graph` token filter using the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`adjust_offsets` | Optional | Boolean | Determines whether the token offsets should be recalculated for split or concatenated tokens. When `true`, the filter adjusts the token offsets to accurately represent the token's position within the token stream. This adjustment ensures that the token's location in the text aligns with its modified form after processing, which is particularly useful for applications like highlighting or phrase queries. When `false`, the offsets remain unchanged, which may result in misalignment when the processed tokens are mapped back to their positions in the original text. If your analyzer uses filters like `trim` that change the token lengths without changing their offsets, we recommend setting this parameter to `false`. Default is `true`. +`catenate_all` | Optional | Boolean | Produces concatenated tokens from a sequence of alphanumeric parts. For example, `"quick-fast-200"` becomes `[ quickfast200, quick, fast, 200 ]`. Default is `false`. +`catenate_numbers` | Optional | Boolean | Concatenates numerical sequences. For example, `"10-20-30"` becomes `[ 102030, 10, 20, 30 ]`. Default is `false`. +`catenate_words` | Optional | Boolean | Concatenates alphabetic words. For example, `"high-speed-level"` becomes `[ highspeedlevel, high, speed, level ]`. Default is `false`. +`generate_number_parts` | Optional | Boolean | If `true`, numeric tokens (tokens consisting of numbers only) are included in the output. Default is `true`. +`generate_word_parts` | Optional | Boolean | If `true`, alphabetical tokens (tokens consisting of alphabetic characters only) are included in the output. Default is `true`. +`ignore_keywords` | Optional | Boolean | Whether to process tokens marked as keywords. Default is `false`. +`preserve_original` | Optional | Boolean | Keeps the original token (which may include non-alphanumeric delimiters) alongside the generated tokens in the output. For example, `"auto-drive-300"` becomes `[ auto-drive-300, auto, drive, 300 ]`. 
If `true`, the filter generates multi-position tokens, which are not supported for indexing; therefore, either avoid using this filter in an index analyzer or add the `flatten_graph` filter after it. Default is `false`.
+`protected_words` | Optional | Array of strings | Specifies tokens that should not be split.
+`protected_words_path` | Optional | String | Specifies a path (absolute or relative to the config directory) to a file containing tokens that should not be split, with one token per line.
+`split_on_case_change` | Optional | Boolean | Splits tokens where consecutive letters have different cases (one is lowercase and the other is uppercase). For example, `"OpenSearch"` becomes `[ Open, Search ]`. Default is `true`.
+`split_on_numerics` | Optional | Boolean | Splits tokens where letters and numbers are adjacent. For example, `"v8engine"` becomes `[ v, 8, engine ]`. Default is `true`.
+`stem_english_possessive` | Optional | Boolean | Removes English possessive endings, such as `'s`. Default is `true`.
+`type_table` | Optional | Array of strings | A custom map that specifies how to treat characters and whether to treat them as delimiters, which avoids unwanted splitting. For example, to treat a hyphen (`-`) as an alphanumeric character, specify `["- => ALPHA"]` so that words are not split at hyphens. Valid types are:
- `ALPHA`: alphabetical
- `ALPHANUM`: alphanumeric
- `DIGIT`: numeric
- `LOWER`: lowercase alphabetical
- `SUBWORD_DELIM`: non-alphanumeric delimiter
- `UPPER`: uppercase alphabetical
+`type_table_path` | Optional | String | Specifies a path (absolute or relative to the config directory) to a file containing a custom character map. The map specifies how to treat characters and whether to treat them as delimiters, which avoids unwanted splitting. For valid types, see `type_table`.
+
+## Example
+
+The following example request creates a new index named `my-custom-index` and configures an analyzer with a `word_delimiter_graph` filter:
+
+```json
+PUT /my-custom-index
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "custom_analyzer": {
+          "tokenizer": "keyword",
+          "filter": [ "custom_word_delimiter_filter" ]
+        }
+      },
+      "filter": {
+        "custom_word_delimiter_filter": {
+          "type": "word_delimiter_graph",
+          "split_on_case_change": true,
+          "split_on_numerics": true,
+          "stem_english_possessive": true
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Generated tokens
+
+Use the following request to examine the tokens generated using the analyzer:
+
+```json
+GET /my-custom-index/_analyze
+{
+  "analyzer": "custom_analyzer",
+  "text": "FastCar's Model2023"
+}
+```
+{% include copy-curl.html %}
+
+The response contains the generated tokens:
+
+```json
+{
+  "tokens": [
+    {
+      "token": "Fast",
+      "start_offset": 0,
+      "end_offset": 4,
+      "type": "word",
+      "position": 0
+    },
+    {
+      "token": "Car",
+      "start_offset": 4,
+      "end_offset": 7,
+      "type": "word",
+      "position": 1
+    },
+    {
+      "token": "Model",
+      "start_offset": 10,
+      "end_offset": 15,
+      "type": "word",
+      "position": 2
+    },
+    {
+      "token": "2023",
+      "start_offset": 15,
+      "end_offset": 19,
+      "type": "word",
+      "position": 3
+    }
+  ]
+}
+```
+
+## Differences between the `word_delimiter_graph` and `word_delimiter` filters
+
+Both the `word_delimiter_graph` and `word_delimiter` token filters generate tokens spanning multiple positions when any of the following parameters are set to `true`:
+
+- `catenate_all`
+- `catenate_numbers`
+- `catenate_words`
+- `preserve_original`
+
+To illustrate the differences between these filters, consider the input text `Pro-XT500`.
+
+### `word_delimiter_graph`
+
+The `word_delimiter_graph` filter assigns a `positionLength` attribute to multi-position tokens, indicating how many positions a token spans. This ensures that the filter always generates valid token graphs, making it suitable for use in advanced token graph scenarios. Although token graphs with multi-position tokens are not supported for indexing, they can still be useful in search scenarios. For example, queries like `match_phrase` can use these graphs to generate multiple subqueries from a single input string. For the example input text, the `word_delimiter_graph` filter generates the following tokens:
+
+- `Pro` (position 1)
+- `XT500` (position 2)
+- `ProXT500` (position 1, `positionLength`: 2)
+
+The `positionLength` attribute enables the production of a valid graph that can be used in advanced queries.
+
+### `word_delimiter`
+
+In contrast, the `word_delimiter` filter does not assign a `positionLength` attribute to multi-position tokens, leading to invalid graphs when these tokens are present. For the example input text, the `word_delimiter` filter generates the following tokens:
+
+- `Pro` (position 1)
+- `XT500` (position 2)
+- `ProXT500` (position 1, no `positionLength`)
+
+The lack of a `positionLength` attribute results in a token graph that is invalid for token streams containing multi-position tokens.
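+
+To observe these attributes yourself, you can run the filter inline in the `_analyze` API with `"explain": true`, which includes detailed token attributes such as `positionLength` in the response. The following is a minimal sketch using the `Pro-XT500` input from the preceding discussion:
+
+```json
+POST /_analyze
+{
+  "tokenizer": "keyword",
+  "filter": [
+    { "type": "word_delimiter_graph", "catenate_all": true }
+  ],
+  "text": "Pro-XT500",
+  "explain": true
+}
+```
+{% include copy-curl.html %}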
\ No newline at end of file diff --git a/_analyzers/token-filters/word-delimiter.md b/_analyzers/token-filters/word-delimiter.md new file mode 100644 index 0000000000..d820fae2a0 --- /dev/null +++ b/_analyzers/token-filters/word-delimiter.md @@ -0,0 +1,128 @@ +--- +layout: default +title: Word delimiter +parent: Token filters +nav_order: 470 +--- + +# Word delimiter token filter + +The `word_delimiter` token filter is used to split tokens at predefined characters and also offers optional token normalization based on customizable rules. + +We recommend using the `word_delimiter_graph` filter instead of the `word_delimiter` filter whenever possible because the `word_delimiter` filter sometimes produces invalid token graphs. For more information about the differences between the two filters, see [Differences between the `word_delimiter_graph` and `word_delimiter` filters]({{site.url}}{{site.baseurl}}/analyzers/token-filters/word-delimiter-graph/#differences-between-the-word_delimiter_graph-and-word_delimiter-filters). +{: .important} + +The `word_delimiter` filter is used to remove punctuation from complex identifiers like part numbers or product IDs. In such cases, it is best used with the `keyword` tokenizer. For hyphenated words, use the `synonym_graph` token filter instead of the `word_delimiter` filter because users frequently search for these terms both with and without hyphens. +{: .note} + +By default, the filter applies the following rules. + +| Description | Input | Output | +|:---|:---|:---| +| Treats non-alphanumeric characters as delimiters. | `ultra-fast` | `ultra`, `fast` | +| Removes delimiters at the beginning or end of tokens. | `Z99++'Decoder'`| `Z99`, `Decoder` | +| Splits tokens when there is a transition between uppercase and lowercase letters. | `OpenSearch` | `Open`, `Search` | +| Splits tokens when there is a transition between letters and numbers. | `T1000` | `T`, `1000` | +| Removes the possessive ('s) from the end of tokens. | `John's` | `John` | + +It's important **not** to use tokenizers that strip punctuation, like the `standard` tokenizer, with this filter. Doing so may prevent proper token splitting and interfere with options like `catenate_all` or `preserve_original`. We recommend using this filter with a `keyword` or `whitespace` tokenizer. +{: .important} + +## Parameters + +You can configure the `word_delimiter` token filter using the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`catenate_all` | Optional | Boolean | Produces concatenated tokens from a sequence of alphanumeric parts. For example, `"quick-fast-200"` becomes `[ quickfast200, quick, fast, 200 ]`. Default is `false`. +`catenate_numbers` | Optional | Boolean | Concatenates numerical sequences. For example, `"10-20-30"` becomes `[ 102030, 10, 20, 30 ]`. Default is `false`. +`catenate_words` | Optional | Boolean | Concatenates alphabetic words. For example, `"high-speed-level"` becomes `[ highspeedlevel, high, speed, level ]`. Default is `false`. +`generate_number_parts` | Optional | Boolean | If `true`, numeric tokens (tokens consisting of numbers only) are included in the output. Default is `true`. +`generate_word_parts` | Optional | Boolean | If `true`, alphabetical tokens (tokens consisting of alphabetic characters only) are included in the output. Default is `true`. +`preserve_original` | Optional | Boolean | Keeps the original token (which may include non-alphanumeric delimiters) alongside the generated tokens in the output. 
For example, `"auto-drive-300"` becomes `[ auto-drive-300, auto, drive, 300 ]`. If `true`, the filter generates multi-position tokens not supported by indexing, so do not use this filter in an index analyzer or use the `flatten_graph` filter after this filter. Default is `false`. +`protected_words` | Optional | Array of strings | Specifies tokens that should not be split. +`protected_words_path` | Optional | String | Specifies a path (absolute or relative to the config directory) to a file containing tokens that should not be separated by new lines. +`split_on_case_change` | Optional | Boolean | Splits tokens where consecutive letters have different cases (one is lowercase and the other is uppercase). For example, `"OpenSearch"` becomes `[ Open, Search ]`. Default is `true`. +`split_on_numerics` | Optional | Boolean | Splits tokens where there are consecutive letters and numbers. For example `"v8engine"` will become `[ v, 8, engine ]`. Default is `true`. +`stem_english_possessive` | Optional | Boolean | Removes English possessive endings, such as `'s`. Default is `true`. +`type_table` | Optional | Array of strings | A custom map that specifies how to treat characters and whether to treat them as delimiters, which avoids unwanted splitting. For example, to treat a hyphen (`-`) as an alphanumeric character, specify `["- => ALPHA"]` so that words are not split at hyphens. Valid types are:
- `ALPHA`: alphabetical
- `ALPHANUM`: alphanumeric
- `DIGIT`: numeric
- `LOWER`: lowercase alphabetical
- `SUBWORD_DELIM`: non-alphanumeric delimiter
- `UPPER`: uppercase alphabetical +`type_table_path` | Optional | String | Specifies a path (absolute or relative to the config directory) to a file containing a custom character map. The map specifies how to treat characters and whether to treat them as delimiters, which avoids unwanted splitting. For valid types, see `type_table`. + +## Example + +The following example request creates a new index named `my-custom-index` and configures an analyzer with a `word_delimiter` filter: + +```json +PUT /my-custom-index +{ + "settings": { + "analysis": { + "analyzer": { + "custom_analyzer": { + "tokenizer": "keyword", + "filter": [ "custom_word_delimiter_filter" ] + } + }, + "filter": { + "custom_word_delimiter_filter": { + "type": "word_delimiter", + "split_on_case_change": true, + "split_on_numerics": true, + "stem_english_possessive": true + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +GET /my-custom-index/_analyze +{ + "analyzer": "custom_analyzer", + "text": "FastCar's Model2023" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "Fast", + "start_offset": 0, + "end_offset": 4, + "type": "word", + "position": 0 + }, + { + "token": "Car", + "start_offset": 4, + "end_offset": 7, + "type": "word", + "position": 1 + }, + { + "token": "Model", + "start_offset": 10, + "end_offset": 15, + "type": "word", + "position": 2 + }, + { + "token": "2023", + "start_offset": 15, + "end_offset": 19, + "type": "word", + "position": 3 + } + ] +} +``` diff --git a/_api-reference/common-parameters.md b/_api-reference/common-parameters.md index 5b536ad992..ac3efbf4bf 100644 --- a/_api-reference/common-parameters.md +++ b/_api-reference/common-parameters.md @@ -123,4 +123,17 @@ Kilometers | `km` or `kilometers` Meters | `m` or `meters` Centimeters | `cm` or `centimeters` Millimeters | `mm` or `millimeters` -Nautical miles | `NM`, `nmi`, or `nauticalmiles` \ No newline at end of file +Nautical miles | `NM`, `nmi`, or `nauticalmiles` + +## `X-Opaque-Id` header + +You can specify an opaque identifier for any request using the `X-Opaque-Id` header. This identifier is used to track tasks and deduplicate deprecation warnings in server-side logs, as well as to differentiate between callers sending requests to your OpenSearch cluster. Do not specify a unique value per request. + +#### Example request + +The following request adds an opaque ID to the request: + +```bash +curl -H "X-Opaque-Id: my-curl-client-1" -XGET localhost:9200/_tasks +``` +{% include copy.html %} diff --git a/_api-reference/index-apis/update-settings.md b/_api-reference/index-apis/update-settings.md index 18caf2185d..3afaaa10d3 100644 --- a/_api-reference/index-apis/update-settings.md +++ b/_api-reference/index-apis/update-settings.md @@ -11,7 +11,7 @@ redirect_from: **Introduced 1.0** {: .label .label-purple } -You can use the update settings API operation to update index-level settings. You can change dynamic index settings at any time, but static settings cannot be changed after index creation. For more information about static and dynamic index settings, see [Create index]({{site.url}}{{site.baseurl}}/api-reference/index-apis/create-index/). +You can use the update settings API operation to update index-level settings. You can change dynamic index settings at any time, but static settings cannot be changed after index creation.
For more information about static and dynamic index settings, see [Configuring OpenSearch]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index/). Aside from the static and dynamic index settings, you can also update individual plugins' settings. To get the full list of updatable settings, run `GET /_settings?include_defaults=true`. diff --git a/_automating-configurations/api/create-workflow.md b/_automating-configurations/api/create-workflow.md index 610bfe8fab..ad9552c3ef 100644 --- a/_automating-configurations/api/create-workflow.md +++ b/_automating-configurations/api/create-workflow.md @@ -16,7 +16,7 @@ Creating a workflow adds the content of a workflow template to the flow framewor To obtain the validation template for workflow steps, call the [Get Workflow Steps API]({{site.url}}{{site.baseurl}}/automating-configurations/api/get-workflow-steps/). -You can include placeholder expressions in the value of workflow step fields. For example, you can specify a credential field in a template as `openAI_key: '${{ openai_key }}'`. The expression will be substituted with the user-provided value during provisioning, using the format {% raw %}`${{ }}`{% endraw %}. You can pass the actual key as a parameter by using the [Provision Workflow API]({{site.url}}{{site.baseurl}}/automating-configurations/api/provision-workflow/) or by using this API with the `provision` parameter set to `true`. +You can include placeholder expressions in the value of workflow step fields. For example, you can specify a credential field in a template as {% raw %}`openAI_key: '${{ openai_key }}'`{% endraw %}. The expression will be substituted with the user-provided value during provisioning, using the format {% raw %}`${{ }}`{% endraw %}. You can pass the actual key as a parameter by using the [Provision Workflow API]({{site.url}}{{site.baseurl}}/automating-configurations/api/provision-workflow/) or by using this API with the `provision` parameter set to `true`. Once a workflow is created, provide its `workflow_id` to other APIs. diff --git a/_automating-configurations/api/provision-workflow.md b/_automating-configurations/api/provision-workflow.md index 62c4954ee9..cb1fe42789 100644 --- a/_automating-configurations/api/provision-workflow.md +++ b/_automating-configurations/api/provision-workflow.md @@ -30,7 +30,7 @@ The following table lists the available path parameters. ## Query parameters -If you have included a substitution expression in the template, you may pass it as a query parameter or as a string value of a request body field. For example, if you specified a credential field in a template as `openAI_key: '${{ openai_key }}'`, then you can include the `openai_key` parameter as a query parameter or body field so it can be substituted during provisioning. For example, the following request provides a query parameter: +If you have included a substitution expression in the template, you may pass it as a query parameter or as a string value of a request body field. For example, if you specified a credential field in a template as {% raw %}`openAI_key: '${{ openai_key }}'`{% endraw %}, then you can include the `openai_key` parameter as a query parameter or body field so it can be substituted during provisioning. 
For example, the following request provides a query parameter: ```json POST /_plugins/_flow_framework/workflow/<workflow_id>/_provision?<parameter>=<value> @@ -47,14 +47,14 @@ POST /_plugins/_flow_framework/workflow/8xL8bowB8y25Tqfenm50/_provision ``` {% include copy-curl.html %} -The following request substitutes the expression `${{ openai_key }}` with the value "12345" using a query parameter: +The following request substitutes the expression {% raw %}`${{ openai_key }}`{% endraw %} with the value "12345" using a query parameter: ```json POST /_plugins/_flow_framework/workflow/8xL8bowB8y25Tqfenm50/_provision?openai_key=12345 ``` {% include copy-curl.html %} -The following request substitutes the expression `${{ openai_key }}` with the value "12345" using the request body: +The following request substitutes the expression {% raw %}`${{ openai_key }}`{% endraw %} with the value "12345" using the request body: ```json POST /_plugins/_flow_framework/workflow/8xL8bowB8y25Tqfenm50/_provision diff --git a/_benchmark/reference/commands/aggregate.md b/_benchmark/reference/commands/aggregate.md index 17612f1164..a891bf3edf 100644 --- a/_benchmark/reference/commands/aggregate.md +++ b/_benchmark/reference/commands/aggregate.md @@ -69,9 +69,30 @@ Aggregate test execution ID: aggregate_results_geonames_9aafcfb8-d3b7-4583-864e ------------------------------- ``` -The results will be aggregated into one test execution and stored under the ID shown in the output: +The results will be aggregated into one test execution and stored under the ID shown in the output. +### Additional options - `--test-execution-id`: Define a unique ID for the aggregated test execution. - `--results-file`: Write the aggregated results to the provided file. - `--workload-repository`: Define the repository from which OpenSearch Benchmark will load workloads (default is `default`). +## Aggregated results + +Aggregated results include the following information: + +- **Relative Standard Deviation (RSD)**: For each metric, an additional `mean_rsd` value shows the spread of results across test executions. +- **Overall min/max values**: Instead of averaging minimum and maximum values, the aggregated results include `overall_min` and `overall_max` values, which reflect the true minimum/maximum across all test runs. +- **Storage**: Aggregated test results are stored in a separate `aggregated_results` folder alongside the `test_executions` folder.
+ +The following example shows aggregated results: + +```json + "throughput": { + "overall_min": 29056.890292903263, + "mean": 50115.8603858536, + "median": 50099.54349684457, + "overall_max": 72255.15946248993, + "unit": "docs/s", + "mean_rsd": 59.426059705973664 + }, +``` diff --git a/_config.yml b/_config.yml index f2749b66f8..3c6f737cc8 100644 --- a/_config.yml +++ b/_config.yml @@ -31,9 +31,6 @@ collections: install-and-configure: permalink: /:collection/:path/ output: true - upgrade-to: - permalink: /:collection/:path/ - output: true im-plugin: permalink: /:collection/:path/ output: true @@ -94,6 +91,9 @@ collections: data-prepper: permalink: /:collection/:path/ output: true + migration-assistant: + permalink: /:collection/:path/ + output: true tools: permalink: /:collection/:path/ output: true @@ -121,6 +121,9 @@ collections: getting-started: permalink: /:collection/:path/ output: true + workspace: + permalink: /:collection/:path/ + output: true opensearch_collection: # Define the collections used in the theme @@ -134,11 +137,6 @@ opensearch_collection: install-and-configure: name: Install and upgrade nav_fold: true - upgrade-to: - name: Migrate to OpenSearch - # nav_exclude: true - nav_fold: true - # search_exclude: true im-plugin: name: Managing Indexes nav_fold: true @@ -210,6 +208,12 @@ clients_collection: name: Clients nav_fold: true +migration_assistant_collection: + collections: + migration-assistant: + name: Migration Assistant + nav_fold: true + benchmark_collection: collections: benchmark: @@ -249,6 +253,12 @@ defaults: values: section: "benchmark" section-name: "Benchmark" + - + scope: + path: "_migration-assistant" + values: + section: "migration-assistant" + section-name: "Migration Assistant" # Enable or disable the site search # By default, just-the-docs enables its JSON file-based search. We also have an OpenSearch-driven search functionality. @@ -308,6 +318,7 @@ plugins: - jekyll-remote-theme - jekyll-redirect-from - jekyll-sitemap + - jekyll-spec-insert # This format has to conform to RFC822 last-modified-at: @@ -317,6 +328,8 @@ last-modified-at: # The following items will not be processed, by default. Create a custom list # to override the default setting. exclude: + - README.md + - DEVELOPER_GUIDE.md - Gemfile - Gemfile.lock - node_modules @@ -324,6 +337,12 @@ exclude: - vendor/cache/ - vendor/gems/ - vendor/ruby/ - - README.md - - .idea - - templates + - templates/ + - .sass-cache/ + - .jekyll-cache/ + - .idea/ + - .github/ + - .bundle/ + - _site/ + - spec-insert + - release-notes \ No newline at end of file diff --git a/_dashboards/dashboards-assistant/alert-insight.md b/_dashboards/dashboards-assistant/alert-insight.md index 9e41b977ca..603e5aba44 100644 --- a/_dashboards/dashboards-assistant/alert-insight.md +++ b/_dashboards/dashboards-assistant/alert-insight.md @@ -197,6 +197,9 @@ POST /.plugins-ml-config/_doc/os_insight The created `os_insight` agent provides alert insights related to OpenSearch cluster metrics. For insights about alerts unrelated to OpenSearch cluster metrics, you need to register an agent with [this template](https://github.com/opensearch-project/flow-framework/blob/2.x/sample-templates/create-knowledge-base-alert-agent.json) and change the agent name to `KB_For_Alert_Insight`. {: .note} +This example demonstrates a system index. In security-enabled domains, only superadmins have permissions to execute this code. 
For information about making superadmin calls, see [System indexes]({{site.url}}{{site.baseurl}}/security/configuration/system-indices/). For access permissions, contact your system administrator. +{: .warning} + ### Step 4: Test the agents You can verify that the agents were created successfully by calling the agents with an example payload. diff --git a/_dashboards/dashboards-assistant/data-summary.md b/_dashboards/dashboards-assistant/data-summary.md index 5ad465be62..e90e184e07 100644 --- a/_dashboards/dashboards-assistant/data-summary.md +++ b/_dashboards/dashboards-assistant/data-summary.md @@ -36,7 +36,7 @@ queryEnhancements.queryAssist.summary.enabled: true ### Step 2: Create a data summary agent -To orchestrate data summarization, create a data summary [agent]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/index/#agents). To create an agent, send a `POST /_plugins/_ml/agents/_register` request and provide the agent template as a payload: +To orchestrate data summarization, create a data summary [agent]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/index/#agents). To create an agent, send a `POST /_plugins/_flow_framework/workflow?provision=true` request and provide the agent template as a payload:

@@ -45,7 +45,7 @@ To orchestrate data summarization, create a data summary [agent]({{site.url}}{{s {: .text-delta} ```json -POST /_plugins/_ml/agents/_register +POST /_plugins/_flow_framework/workflow?provision=true { "name": "Query Assist Agent", "description": "Create a Query Assist Agent using Claude on BedRock", @@ -237,6 +237,9 @@ POST /.plugins-ml-config/_doc/os_data2summary ``` {% include copy-curl.html %} +This example demonstrates a system index. In security-enabled domains, only superadmins have permissions to execute this code. For information about making superadmin calls, see [System indexes]({{site.url}}{{site.baseurl}}/security/configuration/system-indices/). For access permissions, contact your system administrator. +{: .warning} + ### Step 4: Test the agent You can verify that the data summary agent was created successfully by calling the agent with an example payload: @@ -288,4 +291,4 @@ To view alert insights in OpenSearch Dashboards, use the following steps: 1. From the query language dropdown list, select **PPL**. You will see the generated data summary after the query text, as shown in the following image. - data summary \ No newline at end of file + data summary diff --git a/_dashboards/dashboards-assistant/suggest-anomaly-detector.md b/_dashboards/dashboards-assistant/suggest-anomaly-detector.md index 2c9f795e8d..8f4aac80fd 100644 --- a/_dashboards/dashboards-assistant/suggest-anomaly-detector.md +++ b/_dashboards/dashboards-assistant/suggest-anomaly-detector.md @@ -36,7 +36,7 @@ assistant.smartAnomalyDetector.enabled: true ### Step 2: Create an anomaly detector suggestion agent -To orchestrate anomaly detector suggestions, create an anomaly detector suggestion [agent]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/index/#agents). To create an agent, send a `POST /_plugins/_ml/agents/_register` request and provide the agent template as a payload. For more information, see [Configuring OpenSearch Assistant]({{site.url}}{{site.baseurl}}/dashboards/dashboards-assistant/index/#configuring-opensearch-assistant). +To orchestrate anomaly detector suggestions, create an anomaly detector suggestion [agent]({{site.url}}{{site.baseurl}}/ml-commons-plugin/agents-tools/index/#agents). To create an agent, send a `POST /_plugins/_flow_framework/workflow?provision=true` request and provide the agent template as a payload. For more information, see [Configuring OpenSearch Assistant]({{site.url}}{{site.baseurl}}/dashboards/dashboards-assistant/index/#configuring-opensearch-assistant). For sample agent templates, see [Flow Framework sample templates](https://github.com/opensearch-project/flow-framework/tree/2.x/sample-templates). Note the agent ID; you'll use it in the following step. @@ -55,6 +55,9 @@ POST /.plugins-ml-config/_doc/os_suggest_ad ``` {% include copy-curl.html %} +This example demonstrates a system index. In security-enabled domains, only superadmins have permissions to execute this code. For information about making superadmin calls, see [System indexes]({{site.url}}{{site.baseurl}}/security/configuration/system-indices/). For access permissions, contact your system administrator. 
+{: .warning} + +### Step 4: Test the agent  You can verify that the agent was created successfully by calling the agent with an example payload: diff --git a/_dashboards/dashboards-assistant/text-to-visualization.md b/_dashboards/dashboards-assistant/text-to-visualization.md index 5c0aab51f0..a30ca6d1f8 100644 --- a/_dashboards/dashboards-assistant/text-to-visualization.md +++ b/_dashboards/dashboards-assistant/text-to-visualization.md @@ -177,7 +177,7 @@ POST /_plugins/_flow_framework/workflow/<workflow_id>/_provision To view the status of the workflow and all created resources, send the following request: ```json -/_plugins/_flow_framework/workflow/<workflow_id>/_status +GET /_plugins/_flow_framework/workflow/<workflow_id>/_status ``` {% include copy-curl.html %} @@ -209,6 +209,9 @@ POST /.plugins-ml-config/_doc/os_text2vega_with_instructions ``` {% include copy-curl.html %} +This example demonstrates a system index. In security-enabled domains, only superadmins have permissions to execute this code. For information about making superadmin calls, see [System indexes]({{site.url}}{{site.baseurl}}/security/configuration/system-indices/). For access permissions, contact your system administrator. +{: .warning} + ### Step 4: Test the agent You can verify that the agent was created successfully by calling the agent with an example payload: diff --git a/_dashboards/management/accelerate-external-data.md b/_dashboards/management/accelerate-external-data.md index 6d1fa030e4..00eb8671ec 100644 --- a/_dashboards/management/accelerate-external-data.md +++ b/_dashboards/management/accelerate-external-data.md @@ -1,10 +1,8 @@ --- layout: default title: Optimize query performance using OpenSearch indexing -parent: Connecting Amazon S3 to OpenSearch -grand_parent: Data sources -nav_order: 15 -has_children: false +parent: Data sources +nav_order: 17 --- # Optimize query performance using OpenSearch indexing @@ -14,35 +12,171 @@ Introduced 2.11 Query performance can be slow when using external data sources for reasons such as network latency, data transformation, and data volume. You can optimize your query performance by using OpenSearch indexes, such as a skipping index or a covering index. -A _skipping index_ uses skip acceleration methods, such as partition, minimum and maximum values, and value sets, to ingest and create compact aggregate data structures. This makes them an economical option for direct querying scenarios. +- A _skipping index_ uses skip acceleration methods, such as partition, minimum and maximum values, and value sets, to ingest and create compact aggregate data structures. This makes them an economical option for direct querying scenarios. For more information, see [Skipping indexes](https://opensearch.org/docs/latest/dashboards/management/accelerate-external-data/#skipping-indexes). +- A _covering index_ ingests all or some of the data from the source into OpenSearch and makes it possible to use all OpenSearch Dashboards and plugin functionality. For more information, see [Covering indexes](https://opensearch.org/docs/latest/dashboards/management/accelerate-external-data/#covering-indexes). +- A _materialized view_ enhances query performance by storing precomputed and aggregated data from the source data. For more information, see [Materialized views](https://opensearch.org/docs/latest/dashboards/management/accelerate-external-data/#materialized-views). -A _covering index_ ingests all or some of the data from the source into OpenSearch and makes it possible to use all OpenSearch Dashboards and plugin functionality.
See the [Flint Index Reference Manual](https://github.com/opensearch-project/opensearch-spark/blob/main/docs/index.md) for comprehensive guidance on this feature's indexing process. +For comprehensive guidance on each indexing process, see the [Flint Index Reference Manual](https://github.com/opensearch-project/opensearch-spark/blob/main/docs/index.md). ## Data sources use case: Accelerate performance -To get started with the **Accelerate performance** use case available in **Data sources**, follow these steps: +To get started with accelerating query performance, perform the following steps: -1. Go to **OpenSearch Dashboards** > **Query Workbench** and select your Amazon S3 data source from the **Data sources** dropdown menu in the upper-left corner. -2. From the left-side navigation menu, select a database. -3. View the results in the table and confirm that you have the desired data. +1. Go to **OpenSearch Dashboards** > **Query Workbench** and select your data source from the **Data sources** dropdown menu. +2. From the navigation menu, select a database. +3. View the results in the table and confirm that you have the correct data. 4. Create an OpenSearch index by following these steps: - 1. Select the **Accelerate data** button. A pop-up window appears. - 2. Enter your details in **Select data fields**. In the **Database** field, select the desired acceleration index: **Skipping index** or **Covering index**. A _skipping index_ uses skip acceleration methods, such as partition, min/max, and value sets, to ingest data using compact aggregate data structures. This makes them an economical option for direct querying scenarios. A _covering index_ ingests all or some of the data from the source into OpenSearch and makes it possible to use all OpenSearch Dashboards and plugin functionality. -5. Under **Index settings**, enter the information for your acceleration index. For information about naming, select **Help**. Note that an Amazon S3 table can only have one skipping index at a time. + 1. Select **Accelerate data**. A pop-up window appears. + 2. Enter your database and table details under **Select data fields**. +5. For **Acceleration type**, select the type of acceleration according to your use case. Then, enter the information for your acceleration type. For more information, see the following sections: + - [Skipping indexes](https://opensearch.org/docs/latest/dashboards/management/accelerate-external-data/#skipping-indexes) + - [Covering indexes](https://opensearch.org/docs/latest/dashboards/management/accelerate-external-data/#covering-indexes) + - [Materialized views](https://opensearch.org/docs/latest/dashboards/management/accelerate-external-data/#materialized-views) + +## Skipping indexes + +A _skipping index_ uses skip acceleration methods, such as partition, min/max, and value sets, to ingest data using compact aggregate data structures. This makes them an economical option for direct querying scenarios. + +With a skipping index, you can index only the metadata of the data stored in Amazon S3. When you query a table with a skipping index, the query planner references the index and rewrites the query to efficiently locate the data, instead of scanning all partitions and files. This allows the skipping index to quickly narrow down the specific location of the stored data. ### Define skipping index settings -1. Under **Skipping index definition**, select the **Add fields** button to define the skipping index acceleration method and choose the fields you want to add. -2. 
Select the **Copy Query to Editor** button to apply your skipping index settings. -3. View the skipping index query details in the table pane and then select the **Run** button. Your index is added to the left-side navigation menu containing the list of your databases. +1. Under **Skipping index definition**, select **Generate** to automatically generate a skipping index. Alternatively, to manually choose the fields you want to add, select **Add fields**. Choose from the following types: + - `Partition`: Uses data partition details to locate data. This type is best for partitioning-based columns such as year, month, day, or hour. + - `MinMax`: Uses the lower and upper bounds of the indexed column to locate data. This type is best for numeric columns. + - `ValueSet`: Uses a unique value set to locate data. This type is best for columns with low to moderate cardinality that require exact matching. + - `BloomFilter`: Uses the Bloom filter algorithm to locate data. This type is best for columns with high cardinality that do not require exact matching. +2. Select **Create acceleration** to apply your skipping index settings. +3. View the skipping index query details and then select **Run**. OpenSearch adds your index to the left navigation pane. + +Alternatively, you can manually create a skipping index using Query Workbench. Select your data source from the dropdown and run a query like the following: + +```sql +CREATE SKIPPING INDEX +ON datasourcename.gluedatabasename.vpclogstable( + `srcaddr` BLOOM_FILTER, + `dstaddr` BLOOM_FILTER, + `day` PARTITION, + `account_id` BLOOM_FILTER + ) WITH ( +index_settings = '{"number_of_shards":5,"number_of_replicas":1}', +auto_refresh = true, +checkpoint_location = 's3://accountnum-vpcflow/AWSLogs/checkpoint' +) +``` + +## Covering indexes + +A _covering index_ ingests all or some of the data from the source into OpenSearch and makes it possible to use all OpenSearch Dashboards and plugin functionality. + +With a covering index, you can ingest data from a specified column in a table. This is the most performant of the three indexing types. Because OpenSearch ingests all data from your desired column, you get better performance and can perform advanced analytics. + +OpenSearch creates a new index from the covering index data. You can use this new index to create visualizations or for anomaly detection and geospatial capabilities. You can manage the covering index with Index State Management. For more information, see [Index State Management](https://opensearch.org/docs/latest/im-plugin/ism/index/). ### Define covering index settings -1. Under **Index settings**, enter a valid index name. Note that each Amazon S3 table can have multiple covering indexes. -2. Once you have added the index name, define the covering index fields by selecting `(add fields here)` under **Covering index definition**. -3. Select the **Copy Query to Editor** button to apply your covering index settings. -4. View the covering index query details in the table pane and then select the **Run** button. Your index is added to the left-side navigation menu containing the list of your databases. +1. For **Index name**, enter a valid index name. Note that each table can have multiple covering indexes. +2. Choose a **Refresh type**. By default, OpenSearch automatically refreshes the index. Otherwise, you must manually trigger a refresh using a `REFRESH` statement. +3. Enter a **Checkpoint location**, which is a path for refresh job checkpoints.
The location must be a path in an HDFS-compatible file system. +4. Define the covering index fields by selecting **(add fields here)** under **Covering index definition**. +5. Select **Create acceleration** to apply your covering index settings. +6. View the covering index query details and then select **Run**. OpenSearch adds your index to the left navigation pane. + +Alternatively, you can manually create a covering index on your table using Query Workbench. Select your data source from the dropdown and run a query like the following: + +```sql +CREATE INDEX vpc_covering_index +ON datasourcename.gluedatabasename.vpclogstable (version, account_id, interface_id, +srcaddr, dstaddr, srcport, dstport, protocol, packets, +bytes, start, action, log_status STRING, +`aws-account-id`, `aws-service`, `aws-region`, year, +month, day, hour ) +WITH ( + auto_refresh = true, + refresh_interval = '15 minute', + checkpoint_location = 's3://accountnum-vpcflow/AWSLogs/checkpoint' +) +``` + +## Materialized views + +With _materialized views_, you can use complex queries, such as aggregations, to power Dashboards visualizations. Materialized views ingest a small amount of your data, depending on the query, into OpenSearch. OpenSearch then forms an index from the ingested data that you can use for visualizations. You can manage the materialized view index with Index State Management. For more information, see [Index State Management](https://opensearch.org/docs/latest/im-plugin/ism/index/). + +### Define materialized view settings + +1. For **Index name**, enter a valid index name. Note that each table can have multiple materialized views. +2. Choose a **Refresh type**. By default, OpenSearch automatically refreshes the index. Otherwise, you must manually trigger a refresh using a `REFRESH` statement. +3. Enter a **Checkpoint location**, which is a path for refresh job checkpoints. The location must be a path in an HDFS-compatible file system. +4. Enter a **Watermark delay**, which defines how late data can arrive and still be processed, such as 1 minute or 10 seconds. +5. Define the materialized view fields under **Materialized view definition**. +6. Select **Create acceleration** to apply your materialized view index settings. +7. View the materialized view query details and then select **Run**. OpenSearch adds your index to the left navigation pane. + +Alternatively, you can manually create a materialized view index on your table using Query Workbench.
Select your data source from the dropdown and run a query like the following: + +```sql +CREATE MATERIALIZED VIEW {table_name}__week_live_mview AS + SELECT + cloud.account_uid AS `aws.vpc.cloud_account_uid`, + cloud.region AS `aws.vpc.cloud_region`, + cloud.zone AS `aws.vpc.cloud_zone`, + cloud.provider AS `aws.vpc.cloud_provider`, + + CAST(IFNULL(src_endpoint.port, 0) AS LONG) AS `aws.vpc.srcport`, + CAST(IFNULL(src_endpoint.svc_name, 'Unknown') AS STRING) AS `aws.vpc.pkt-src-aws-service`, + CAST(IFNULL(src_endpoint.ip, '0.0.0.0') AS STRING) AS `aws.vpc.srcaddr`, + CAST(IFNULL(src_endpoint.interface_uid, 'Unknown') AS STRING) AS `aws.vpc.src-interface_uid`, + CAST(IFNULL(src_endpoint.vpc_uid, 'Unknown') AS STRING) AS `aws.vpc.src-vpc_uid`, + CAST(IFNULL(src_endpoint.instance_uid, 'Unknown') AS STRING) AS `aws.vpc.src-instance_uid`, + CAST(IFNULL(src_endpoint.subnet_uid, 'Unknown') AS STRING) AS `aws.vpc.src-subnet_uid`, + + CAST(IFNULL(dst_endpoint.port, 0) AS LONG) AS `aws.vpc.dstport`, + CAST(IFNULL(dst_endpoint.svc_name, 'Unknown') AS STRING) AS `aws.vpc.pkt-dst-aws-service`, + CAST(IFNULL(dst_endpoint.ip, '0.0.0.0') AS STRING) AS `aws.vpc.dstaddr`, + CAST(IFNULL(dst_endpoint.interface_uid, 'Unknown') AS STRING) AS `aws.vpc.dst-interface_uid`, + CAST(IFNULL(dst_endpoint.vpc_uid, 'Unknown') AS STRING) AS `aws.vpc.dst-vpc_uid`, + CAST(IFNULL(dst_endpoint.instance_uid, 'Unknown') AS STRING) AS `aws.vpc.dst-instance_uid`, + CAST(IFNULL(dst_endpoint.subnet_uid, 'Unknown') AS STRING) AS `aws.vpc.dst-subnet_uid`, + CASE + WHEN regexp(dst_endpoint.ip, '(10\\..*)|(192\\.168\\..*)|(172\\.1[6-9]\\..*)|(172\\.2[0-9]\\..*)|(172\\.3[0-1]\\..*)') + THEN 'ingress' + ELSE 'egress' + END AS `aws.vpc.flow-direction`, + + CAST(IFNULL(connection_info['protocol_num'], 0) AS INT) AS `aws.vpc.connection.protocol_num`, + CAST(IFNULL(connection_info['tcp_flags'], '0') AS STRING) AS `aws.vpc.connection.tcp_flags`, + CAST(IFNULL(connection_info['protocol_ver'], '0') AS STRING) AS `aws.vpc.connection.protocol_ver`, + CAST(IFNULL(connection_info['boundary'], 'Unknown') AS STRING) AS `aws.vpc.connection.boundary`, + CAST(IFNULL(connection_info['direction'], 'Unknown') AS STRING) AS `aws.vpc.connection.direction`, + + CAST(IFNULL(traffic.packets, 0) AS LONG) AS `aws.vpc.packets`, + CAST(IFNULL(traffic.bytes, 0) AS LONG) AS `aws.vpc.bytes`, + + CAST(FROM_UNIXTIME(time / 1000) AS TIMESTAMP) AS `@timestamp`, + CAST(FROM_UNIXTIME(start_time / 1000) AS TIMESTAMP) AS `start_time`, + CAST(FROM_UNIXTIME(start_time / 1000) AS TIMESTAMP) AS `interval_start_time`, + CAST(FROM_UNIXTIME(end_time / 1000) AS TIMESTAMP) AS `end_time`, + status_code AS `aws.vpc.status_code`, + + severity AS `aws.vpc.severity`, + class_name AS `aws.vpc.class_name`, + category_name AS `aws.vpc.category_name`, + activity_name AS `aws.vpc.activity_name`, + disposition AS `aws.vpc.disposition`, + type_name AS `aws.vpc.type_name`, + + region AS `aws.vpc.region`, + accountid AS `aws.vpc.account-id` + FROM + datasourcename.gluedatabasename.vpclogstable +WITH ( + auto_refresh = true, + refresh_interval = '15 Minute', + checkpoint_location = 's3://accountnum-vpcflow/AWSLogs/checkpoint', + watermark_delay = '1 Minute' +) +``` ## Limitations -This feature is still under development, so there are some limitations.
For real-time updates, see the [developer documentation on GitHub](https://github.com/opensearch-project/opensearch-spark/blob/main/docs/index.md#limitations). diff --git a/_dashboards/management/acl.md b/_dashboards/management/acl.md new file mode 100644 index 0000000000..bd57b72419 --- /dev/null +++ b/_dashboards/management/acl.md @@ -0,0 +1,78 @@ +--- +layout: default +title: Access control lists for saved objects +parent: Dashboards Management +nav_order: 50 +--- + +# Access control lists for saved objects +Introduced 2.18 +{: .label .label-purple } + +You can use access control lists (ACLs) to manage permissions for your saved objects, providing authorization (AuthZ) capabilities without requiring backend plugin integration. + +## Understanding ACL types + +ACLs are applied at two levels: + +1. **Workspace ACL:** Workspace objects inherit permissions from their parent workspace. See [Workspace ACL]({{site.url}}{{site.baseurl}}/dashboards/workspace/workspace-acl) for more information. +2. **Objects ACL:** Each individual object can have its own ACL policy. All operations on these objects must pass ACL policy validation. + +## Enabling the ACL feature + +The ACL feature must be enabled before you can define any access controls. To enable it: + +1. Open your `opensearch_dashboards.yml` file. +2. Set `savedObjects.permission.enabled: true`. + +## Defining ACL permissions + +ACL permissions are defined using the following schema: + +```json +{ + "permissions": { + "<permission_type>": { + "users": ["<user_1>", "<user_2>"], + "groups": ["<group_1>", "<group_2>"] + } + } +} +``` +{% include copy.html %} + +### Granting permissions to authenticated users + +The wildcard character (`*`) grants permissions to all authenticated users, while named users and groups receive only the permissions explicitly assigned to them. In the following example, the ACL grants workspace management permissions to the `finance_manager` group and dashboard creation permissions to the `finance_analyst` group: + +```json +{ + "permissions": { + "write": { + "groups": ["finance_manager"] + }, + "library_write": { + "groups": ["finance_analyst"] + } + } +} +``` +{% include copy.html %} + +### Configuring mixed-level permissions + +To allow one user, `user-1` for example, to modify an object while giving read-only access to others, you can configure the ACL policy as follows: + +```json +{ + "permissions": { + "read": { + "users": ["*"] + }, + "write": { + "users": ["user-1"] + } + } +} +``` +{% include copy.html %} diff --git a/_dashboards/management/management-index.md b/_dashboards/management/management-index.md index 7edc4d06c2..01796180e5 100644 --- a/_dashboards/management/management-index.md +++ b/_dashboards/management/management-index.md @@ -9,16 +9,14 @@ has_children: true Introduced 2.10 {: .label .label-purple } -**Dashboards Management** serves as the command center for customizing OpenSearch Dashboards to your needs. A view of the interface is shown in the following image. +**Dashboards Management** is the central hub for managing and customizing OpenSearch data directly within OpenSearch Dashboards. -Dashboards Management interface - -{::nomarkdown}alert icon{:/} **Note**
OpenSearch and OpenSearch Dashboards privileges govern access to individual features. If you do not have the appropriate access, consult your administrator. -{: .note} +OpenSearch and OpenSearch Dashboards permissions govern access to individual features. If you do not have the appropriate access permissions, consult your administrator. +{: .warning} ## Applications -The following applications are available in **Dashboards Management**: +You can access the following applications in **Dashboards Management**: - **[Index Patterns]({{site.url}}{{site.baseurl}}/dashboards/management/index-patterns/):** To access OpenSearch data, you need to create an index pattern so that you can select the data you want to use and define the properties of the fields. The Index Pattern tool gives you the ability to create an index pattern from within the UI. Index patterns point to one or more indexes, data streams, or index aliases. - **[Data Sources]({{site.url}}{{site.baseurl}}/dashboards/management/multi-data-sources/):** The Data Sources tool is used to configure and manage the data sources that OpenSearch uses to collect and analyze data. You can use the tool to specify the source configuration in your copy of the [OpenSearch Dashboards configuration file](https://github.com/opensearch-project/OpenSearch-Dashboards/blob/main/config/opensearch_dashboards.yml). diff --git a/_dashboards/workspace/apis.md b/_dashboards/workspace/apis.md new file mode 100644 index 0000000000..683488e423 --- /dev/null +++ b/_dashboards/workspace/apis.md @@ -0,0 +1,386 @@ +--- +layout: default +title: Workspaces APIs +parent: Workspace for OpenSearch Dashboards +nav_order: 10 +--- + +# Workspaces APIs +Introduced 2.18 +{: .label .label-purple } + +The Workspaces API provides a set of endpoints for managing workspaces in OpenSearch Dashboards. + +## List Workspaces API + +You can use the following endpoint to retrieve a list of workspaces: + +```json +POST <host>:<port>/api/workspaces/_list +``` +{% include copy-curl.html %} + +The following table lists the available parameters. + +| Parameter | Data type | Required | Description | +| :--- | :--- | :--- | :--- | +| `search` | String | Optional | A query string used to filter workspaces with simple query syntax, for example, `simple_query_string`. | +| `searchFields` | Array | Optional | Specifies which fields to perform the search query against. | +| `sortField` | String | Optional | The field name to use for sorting results. | +| `sortOrder` | String | Optional | Specifies ascending or descending sort order. | +| `perPage` | Number | Optional | The number of workspace results per page. | +| `page` | Number | Optional | The page number of the results to retrieve. | +| `permissionModes` | Array | Optional | A list of permissions to filter by. | + +#### Example request + +```json +POST /api/workspaces/_list +``` +{% include copy-curl.html %} + +The following example response shows a successful API call: + +```json +{ + "success": true, + "result": { + "page": 1, + "per_page": 20, + "total": 3, + "workspaces": [ + { + "name": "test1", + "features": [ + "use-case-all" + ], + "id": "hWNZls" + }, + { + "name": "test2", + "features": [ + "use-case-observability" + ], + "id": "SnkOPt" + } + ] + } +} +``` + +## Get Workspaces API + +You can use the following endpoint to retrieve a single workspace: + +```json +GET <host>:<port>/api/workspaces/<id> +``` +{% include copy-curl.html %} + +The following table lists the available path parameters.
All path parameters are required. + +| Parameter | Data type | Required | Description | +| :--- | :--- | :--- | :--- | +| `<id>` | String | Required | Identifies the unique workspace to be retrieved. | + +#### Example request + +```json +GET /api/workspaces/SnkOPt +``` +{% include copy-curl.html %} + +The following example response shows a successful API call: + +```json +{ + "success": true, + "result": { + "name": "test2", + "features": ["use-case-all"], + "id": "SnkOPt" + } +} +``` + +## Create Workspaces API + +You can use the following endpoint to create a workspace: + +```json +POST <host>:<port>/api/workspaces +``` +{% include copy-curl.html %} + +The following table lists the available parameters. + +| Parameter | Data type | Required | Description | +| :--- | :--- | :--- | :--- | +| `attributes` | Object | Required | Defines the workspace attributes. | +| `permissions` | Object | Optional | Specifies the permissions for the workspace. | + +#### Example request + +```json +POST api/workspaces +{ + "attributes": { + "name": "test4", + "description": "test4" + } +} +``` +{% include copy-curl.html %} + +The following example response shows a successful API call: + +```json +{ + "success": true, + "result": { + "id": "eHVoCJ" + } +} +``` + +## Update Workspaces API + +You can use the following endpoint to update the attributes and permissions for a workspace: + +```json +PUT <host>:<port>/api/workspaces/<id> +``` +{% include copy-curl.html %} + +The following table lists the available parameters. + +| Parameter | Data type | Required | Description | +| :--- | :--- | :--- | :--- | +| `<id>` | String | Required | Identifies the unique workspace to be updated. | +| `attributes` | Object | Required | Defines the workspace attributes. | +| `permissions` | Object | Optional | Specifies the permissions for the workspace. | + +#### Example request + +```json +PUT api/workspaces/eHVoCJ +{ + "attributes": { + "name": "test4", + "description": "test update" + } +} +``` +{% include copy-curl.html %} + +The following example response shows a successful API call: + +```json +{ + "success": true, + "result": true +} +``` + +## Delete Workspaces API + +You can use the following endpoint to delete a workspace: + +```json +DELETE <host>:<port>/api/workspaces/<id> +``` +{% include copy-curl.html %} + +The following table lists the available path parameters. All path parameters are required. + +| Parameter | Data type | Required | Description | +| :--- | :--- | :--- | :--- | +| `<id>` | String | Required | Identifies the unique workspace to be deleted. | + +#### Example request + +```json +DELETE api/workspaces/eHVoCJ +``` +{% include copy-curl.html %} + +The following example response shows a successful API call: + +```json +{ + "success": true, + "result": true +} +``` + +## Duplicate Saved Objects Workspaces API + +You can use the following endpoint to copy saved objects between workspaces: + +```json +POST <host>:<port>/api/workspaces/_duplicate_saved_objects +``` +{% include copy-curl.html %} + +The following table lists the available parameters. + +| Parameter | Data type | Required | Description | +| :--- | :--- | :--- | :--- | +| `objects` | Array | Required | Specifies the saved objects to be duplicated. | +| `targetWorkspace` | String | Required | Identifies the destination workspace for copying. | +| `includeReferencesDeep` | Boolean | Optional | Determines whether to copy all referenced objects to the target workspace.
Default is `true`. | + +The following table lists the attributes of the object in the `objects` parameter. + +| Parameter | Data type | Required | Description | +| :--- | :--- | :--- | :--- | +| `type` | String | Required | Defines the saved object classification, such as `index-pattern`, `config`, or `dashboard`. | +| `id` | String | Required | The ID of the saved object. | + +#### Example request + +```json +POST api/workspaces/_duplicate_saved_objects +{ + "objects": [ + { + "type": "index-pattern", + "id": "619cc200-ecd0-11ee-95b1-e7363f9e289d" + } + ], + "targetWorkspace": "9gt4lB" +} +``` +{% include copy-curl.html %} + +The following example response shows a successful API call: + +```json +{ + "successCount": 1, + "success": true, + "successResults": [ + { + "type": "index-pattern", + "id": "619cc200-ecd0-11ee-95b1-e7363f9e289d", + "meta": { + "title": "test*", + "icon": "indexPatternApp" + }, + "destinationId": "f4b724fd-9647-4bbf-bf59-610b43a62c75" + } + ] +} +``` + +## Associate Saved Objects Workspaces API + +You can use the following endpoint to associate saved objects with a workspace: + +```json +POST <host>:<port>/api/workspaces/_associate +``` +{% include copy-curl.html %} + +The following table lists the available parameters. + +| Parameter | Data type | Required | Description | +| :--- | :--- | :--- | :--- | +| `workspaceId` | String | Required | Identifies the target workspace for object association. | +| `savedObjects` | Array | Required | Specifies the list of saved objects to be copied. | + +The following table lists the attributes of the object in the `objects` parameter. + +| Parameter | Data type | Required | Description | +| :--- | :--- | :--- | :--- | +| `type` | String | Required | Defines the saved object classification, such as `index-pattern`, `config`, or `dashboard`. | +| `id` | String | Required | The ID of the saved object. | + +#### Example request + +```json +POST api/workspaces/_associate +{ + "objects": [ + { + "type": "index-pattern", + "id": "619cc200-ecd0-11ee-95b1-e7363f9e289d" + } + ], + "targetWorkspace": "9gt4lB" +} +``` +{% include copy-curl.html %} + +The following example response shows a successful API call: + +```json +{ + "success": true, + "result": [ + { + "id": "619cc200-ecd0-11ee-95b1-e7363f9e289d" + } + ] +} +``` + +## Dissociate Saved Objects Workspaces API + +You can use the following endpoint to dissociate saved objects from a workspace: + +```json +POST <host>:<port>/api/workspaces/_dissociate +``` +{% include copy-curl.html %} + +The following table lists the available parameters. + +| Parameter | Data type | Required | Description | +| :--- | :--- | :--- | :--- | +| `workspaceId` | String | Required | The target workspace with which to associate the objects. | +| `savedObjects` | Array | Required | A list of saved objects to copy. | + +The following table lists the attributes of the `savedObjects` parameter. + +| Parameter | Data type | Required | Description | +| :--- | :--- | :--- | :--- | +| `type` | String | Required | The type of the saved object, such as `index-pattern`, `config`, or `dashboard`. | +| `id` | String | Required | The ID of the saved object.
| + +#### Example request + +```json +POST api/workspaces/_dissociate +{ + "objects": [ + { + "type": "index-pattern", + "id": "619cc200-ecd0-11ee-95b1-e7363f9e289d" + } + ], + "targetWorkspace": "9gt4lB" +} +``` +{% include copy-curl.html %} + +The following example response shows a successful API call: + +```json +{ + "success": true, + "result": [ + { + "id": "619cc200-ecd0-11ee-95b1-e7363f9e289d" + } + ] +} +``` diff --git a/_dashboards/workspace/create-workspace.md b/_dashboards/workspace/create-workspace.md new file mode 100644 index 0000000000..34ba65bb54 --- /dev/null +++ b/_dashboards/workspace/create-workspace.md @@ -0,0 +1,52 @@ +--- +layout: default +title: Create a workspace +parent: Workspace for OpenSearch Dashboards +nav_order: 1 +--- + +# Create a workspace +Introduced 2.18 +{: .label .label-purple } + +Before getting started with this tutorial, you must enable the workspace feature flag. See [Enabling the workspace feature]({{site.url}}{{site.baseurl}}/dashboards/workspace/workspace/#enabling-the-workspace-feature) for more information. + +When the saved objects permission is enabled, only users with admin status can create workspaces. See [Configuring the dashboard admin]({{site.url}}{{site.baseurl}}/dashboards/workspace/workspace-acl/#configuring-dashboard-administrators) for more information. + +To create a workspace, follow these steps: + +1. Open OpenSearch Dashboards. +2. From the main page, choose the appropriate card for your use case, for example, **Observability**, **Security Analytics**, **Search**, **Essentials**, or **Analytics**. Alternatively, you can select the **Create workspace** button and choose the appropriate use case from the dropdown menu. +3. Enter the required information in the **Workspace details** window. + - **Workspace name** is required. Valid characters are `a-z`, `A-Z`, `0-9`, parentheses (`()`), brackets (`[]`), underscore (`_`), hyphen (`-`), and spaces. Choose a unique workspace name within the character limit (40 characters). The **Create workspace** button is disabled when the workspace name already exists or exceeds the character limit, and an error message appears. + - **Use case and features** is required. Choose the use case that best fits your needs. If you are using Amazon OpenSearch Serverless and have enabled the [multiple data sources]({{site.url}}{{site.baseurl}}/dashboards/management/data-sources/) feature, **Essentials** is automatically assigned. +4. (Optional) Select the color picker to customize the color of your workspace icon. +5. (Optional) Add a workspace description of up to 200 characters. This option is disabled when the description exceeds the character limit. +6. Save your workspace. + - The **Create workspace** button becomes active once you enter the information for all required fields. You become the workspace owner automatically. The system redirects you to either the collaborators page if the saved objects permission is enabled or the overview page if the saved objects permission is disabled. See [Configuring dashboard admin]({{site.url}}{{site.baseurl}}/dashboards/workspace/workspace-acl/#configuring-dashboard-administrators) for more information about permissions. + +To set up permissions, see [Workspace access control lists]({{site.url}}{{site.baseurl}}/dashboards/workspace/workspace-acl/) for more information. + +## Associating data sources with a workspace + +The **Associate data source** option is only visible when the multiple data sources feature is enabled.
Before creating your workspace, you must connect it with at least one data source. If you have not set up your data sources, see [Data sources]({{site.url}}{{site.baseurl}}/dashboards/management/data-sources/). Once your sources are connected, you can link them to your new workspace. +{: .warning} + +### Associating OpenSearch data sources + +To associate OpenSearch data sources, follow these steps: + +1. Select the **Associate OpenSearch Data Sources** button to open the selection modal. +2. View the available data sources in the modal: + - Standard OpenSearch sources appear as single entries. + - Sources with direct query connections show a `+N` indicator. +3. Select the appropriate data source name(s). +4. Select the **Associate data sources** button to complete the association. + +### Associating direct query sources + +To associate direct query sources, follow these steps: + +1. Select the **Associate direct query data sources** button to open the selection modal. The modal displays only sources with direct query connections. +2. Select a data source to automatically expand its direct query connections. +3. Select the **Associate data sources** button to complete the association. diff --git a/_dashboards/workspace/index.md b/_dashboards/workspace/index.md new file mode 100644 index 0000000000..f0f572a4a5 --- /dev/null +++ b/_dashboards/workspace/index.md @@ -0,0 +1,27 @@ +--- +layout: default +title: Getting started with workspaces +parent: Workspace for OpenSearch Dashboards +nav_order: 0 +--- + +# Getting started with workspaces +Introduced 2.18 +{: .label .label-purple } + +OpenSearch Dashboards 2.18 introduces an enhanced home page that provides a comprehensive view of all your workspaces. + +The new home page includes the following features: + +1. A **Create workspace** button for [OpenSearch Dashboard admins]({{site.url}}{{site.baseurl}}/dashboards/workspace/workspace-acl/#configuring-dashboard-administrators) to navigate to the [create workspace]({{site.url}}{{site.baseurl}}/dashboards/workspace/create-workspace) page. +2. Workspace access time information and a link to the workspace overview page. +3. A use case information icon that displays information about the workspace's purpose. +4. A **View all workspaces** button that navigates to the [workspace management]({{site.url}}{{site.baseurl}}/dashboards/workspace/manage-workspace/#navigating-the-workspaces-list) page. +5. Links to the latest OpenSearch documentation through the **Learn more from documentation** button and to [OpenSearch Playground](https://playground.opensearch.org/app/home#/) through the **Explore live demo environment at playground.opensearch.org** button. + +The navigation logic ensures a seamless user experience by directing you to the appropriate page based on your workspace access level: + +- If you have a default workspace configured, you are directed to the workspace overview page. +- If you have only one workspace, you are directed to the overview page of that workspace. +- If you have multiple workspaces, you are directed to the new home page. +- If you have no workspaces, you are directed to the new home page.
diff --git a/_dashboards/workspace/manage-workspace.md b/_dashboards/workspace/manage-workspace.md
new file mode 100644
index 0000000000..45733d75be
--- /dev/null
+++ b/_dashboards/workspace/manage-workspace.md
@@ -0,0 +1,118 @@
+---
+layout: default
+title: Manage workspaces
+parent: Workspace for OpenSearch Dashboards
+nav_order: 2
+---
+
+# Manage workspaces
+Introduced 2.18
+{: .label .label-purple }
+
+You can access and modify workspace details, including the name, description, use case, and icon color, on the **Workspace details** page.
+
+To access and modify your workspace details, follow these steps:
+
+1. Open OpenSearch Dashboards and navigate to **My Workspaces**.
+2. Choose the desired workspace and then select the **Edit** button to make changes.
+3. Select the **Save** button to confirm changes or the **Discard changes** button to cancel modifications.
+
+## Workspace update permissions
+
+The following permissions apply when changing workspaces:
+
+1. **Without the Security plugin:** All users can edit and update the workspace.
+2. **With the Security plugin installed and `savedObjects.permission.enabled: false` in the `config/opensearch_dashboards.yml` file:** All users can edit and update workspaces.
+3. **With the Security plugin installed and `savedObjects.permission.enabled: true` in the `config/opensearch_dashboards.yml` file:** Only the [workspace owner]({{site.url}}{{site.baseurl}}/dashboards/workspace/workspace-acl/#defining-workspace-collaborators) and the [workspace admins]({{site.url}}{{site.baseurl}}/dashboards/workspace/workspace-acl/#configuring-dashboard-administrators) can edit and update workspaces.
+
+## Workspace update restrictions
+
+When updating workspace use cases, the following rules apply.
+
+Original use case | Target use case |
+:---: | :---:
+Analytics | Cannot be changed to any other use case
+Search | Analytics
+Security analytics | Analytics
+Observability | Analytics
+Essentials | Analytics <br> Search <br> Security analytics <br> Observability
+
+## Workspace control panel
+
+The **Workspace details** page features the following buttons in the upper-right corner:
+
+1. **Delete** (trash can icon)
+    - **Without the Security plugin installed:** All users can delete the workspace.
+    - **With the Security plugin installed and `savedObjects.permission.enabled: false` in the `config/opensearch_dashboards.yml` file:** All users can delete the workspace.
+    - **With the Security plugin installed and `savedObjects.permission.enabled: true` in the `config/opensearch_dashboards.yml` file:** Only the admin can delete the workspace.
+2. **Set as default workspace:** Sets the current workspace as the default login destination.
+3. **Workspace overview:** Opens the **Overview** page in a new tab.
+
+## Adding assets to the workspace
+
+Select **Sample data** in the left navigation menu, and then select the appropriate dataset to install it in your cluster and OpenSearch Dashboards.
+
+## Copying assets between workspaces
+
+Copying data sources and configurations is not supported.
+{: .warning}
+
+The assets page provides the following methods for copying assets across workspaces:
+
+1. **Copy all assets to...:** Copies all assets in the table.
+2. **Copy to...:** Copies the assets selected in the table.
+3. **Copy to...:** Copies a single asset from the table.
+
+After selecting a copy option, choose the target workspace from the dropdown menu. The **Copy related assets** checkbox allows you to transfer associated assets.
+
+Upon selecting the **Copy** button, a side panel appears showing successful and failed asset transfers. Asset copy destinations depend on the following security configurations:
+
+1. **Without the Security plugin:** All workspaces are accessible.
+2. **With the Security plugin and `savedObjects.permission.enabled: false` in the `config/opensearch_dashboards.yml` file:** All workspaces are accessible.
+3. **With the Security plugin and `savedObjects.permission.enabled: true` in the `config/opensearch_dashboards.yml` file:** Only workspaces for which the user has read and write or admin permissions are accessible.
+
+## Associating data sources
+
+On the data source management page, you can view a comprehensive list of associated OpenSearch connections, monitor direct query connections relevant to your current workspace, and establish new data source associations as needed.
+
+### Managing OpenSearch connections
+
+The OpenSearch connections tab displays all associated connections for the current workspace. Follow these steps to manage your connections:
+
+1. Access a comprehensive list of associated OpenSearch connections on the connections tab.
+2. Use the **Remove association** button to unlink connections as needed.
+3. Add new data sources by selecting the **OpenSearch data sources** button and using the modal that appears.
+4. Select from unassociated OpenSearch connections to expand your workspace's capabilities.
+
+### Adding direct query connections
+
+The **Direct query connections** tab displays a list of all direct query connections associated with your current workspace. To add more direct query connections to your workspace, select the **Direct query data sources** button. A modal window opens.
+
+The association modal displays a list of OpenSearch connections that contain direct query connections and have not yet been associated with your current workspace.
+When you associate an OpenSearch connection with your current workspace, all direct query connections within that OpenSearch connection are automatically associated as well.
+
+## Deleting your workspace
+
+Workspace deletion is restricted to admins. If you do not see a trash can icon, check your permissions. See [Configuring dashboard administrators]({{site.url}}{{site.baseurl}}/dashboards/workspace/workspace-acl/#configuring-dashboard-administrators) for more information.
+{: .warning}
+
+Deleting a workspace permanently erases all of its assets (except data sources) and the workspace itself. This action cannot be reversed.
+
+To delete a workspace, follow these steps:
+
+1. From the **Workspace details** page, select the trash can icon in the upper-right corner to delete the current workspace.
+2. Alternatively, from the workspace list page, select the ellipsis icon and select **Delete**. Optionally, select multiple workspaces for bulk deletion.
+
+## Navigating the workspaces list
+
+The workspaces list page serves as your central hub for workspace management, displaying all workspaces for which you have access permissions. Key features include the following:
+
+- **Search**: Quickly find a workspace by name.
+- **Filter**: Sort workspaces by use case.
+- **At a glance**: View each workspace's name, use case, description, last update time, and associated data sources.
+
+Each workspace entry includes an **Actions** column with the following buttons, which streamline workspace management and let you organize and customize your OpenSearch Dashboards environment efficiently:
+
+1. **Copy ID**: One-click copying of the workspace ID.
+2. **Edit**: Direct access to the workspace's detailed configuration page.
+3. **Set as default**: Set any workspace as your default workspace.
+4. **Delete**: Remove workspaces as needed (may require admin privileges).
diff --git a/_dashboards/workspace/workspace-acl.md b/_dashboards/workspace/workspace-acl.md
new file mode 100644
index 0000000000..16b2cc8628
--- /dev/null
+++ b/_dashboards/workspace/workspace-acl.md
@@ -0,0 +1,153 @@
+---
+layout: default
+title: Workspace access control lists
+parent: Workspace for OpenSearch Dashboards
+nav_order: 3
+---
+
+# Workspace access control lists
+Introduced 2.18
+{: .label .label-purple }
+
+Workspace access control lists (ACLs) manage authorization (AuthZ) for saved objects, while [Security in OpenSearch]({{site.url}}{{site.baseurl}}/security/) provides authentication (AuthN).
+
+## Personas
+
+**Workspace** use cases involve the following key personas:
+
+* **Dashboard admin:** Has full access to all OpenSearch Dashboards functions and data.
+* **Workspace administrator (also called _owner_):** Has full control over a specific workspace, including its configuration and saved objects. When a workspace is created, its creator is automatically assigned the role of workspace owner.
+* **Workspace content producer:** Can view, create, and update saved objects within the workspace.
+* **Workspace viewer:** Has read-only access to saved objects in the workspace.
+
+  Roles are workspace specific, allowing users to assume different roles across workspaces.
+  {: .note}
+
+## Enabling permission control
+
+See [Enabling the ACL feature]({{site.url}}{{site.baseurl}}/dashboards/management/acl#enabling-the-acl-feature) for instructions.
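+
+For quick reference, permission control is toggled by a single setting in `opensearch_dashboards.yml`; the full admin configuration in the next section shows it in context:
+
+```yaml
+savedObjects.permission.enabled: true
+```
+{% include copy.html %}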
+
+## Configuring dashboard administrators
+
+To grant full access to all workspaces and objects in OpenSearch Dashboards, configure the admin permissions. Edit the `opensearch_dashboards.yml` file to define the admin by user ID and backend role, as shown in the following configuration:
+
+```yaml
+opensearchDashboards.dashboardAdmin.users: ["UserID"]
+opensearchDashboards.dashboardAdmin.groups: ["BackendRole"]
+savedObjects.permission.enabled: true
+```
+{% include copy.html %}
+
+By default, the configuration is set to `[]`, meaning that no users are designated as admins. If the Security plugin is not installed and `savedObjects.permission.enabled: false` is set, all users are granted admin permissions.
+
+### Configuring global admin access
+
+Set all users as admins with the following wildcard setting:
+
+```yaml
+opensearchDashboards.dashboardAdmin.users: ["*"]
+```
+{% include copy.html %}
+
+### Configuring admin access for a single user
+
+Designate a single user as an admin by specifying that user's ID, as in the following example with `admin-user-id`:
+
+```yaml
+opensearchDashboards.dashboardAdmin.users: ["admin-user-id"]
+```
+{% include copy.html %}
+
+### Configuring admin access by backend role
+
+Grant admin access to all users with a specific backend role, as in the following example with `admin-role`:
+
+```yaml
+opensearchDashboards.dashboardAdmin.groups: ["admin-role"]
+```
+{% include copy.html %}
+
+### Admin-restricted operations
+
+Admin-restricted operations include the following:
+
+- Creating workspaces
+- Deleting workspaces
+- Connecting data sources to workspaces
+- Disconnecting data sources from workspaces
+
+## Defining workspace collaborators
+
+Access to collaborator management is limited to admins. The **Collaborators** feature is only available when permission control is enabled. For instructions on activating permission control, see [Enabling permission control](#enabling-permission-control). The access levels include the following:
+
+- **Read only:** Grants permission to view the workspace and its assets.
+- **Read and write:** Allows viewing and editing of assets within the workspace.
+- **Admin:** Provides full access, including viewing and editing of assets within the workspace and updating workspace metadata, such as name, description, data sources, and collaborators.
+
+From the **Collaborators** page, you can search by collaborator ID and filter results by collaborator type and access level.
+
+### Adding collaborators
+
+Workspace creators are granted the **Admin** access level as a collaborator. To add more collaborators, select the **Add collaborators** button, which displays a dropdown menu. Choose **Add Users** or **Add Groups** to access the corresponding modal for adding new collaborators.
+
+#### Adding users
+
+To add users, follow these steps:
+
+1. Select the **Add Users** button to open the modal. The modal displays one empty `User ID` field by default.
+2. Choose an access level: **Read only**, **Read and write**, or **Admin**.
+3. Choose **Add another User** to add multiple users. To avoid errors, do not enter duplicate or already existing user IDs.
+4. Resolve any errors before finalizing. Successfully added users appear in the collaborators table.
+
+#### Adding groups
+
+To add groups, follow these steps:
+
+1. Select the **Add Groups** button to open the modal. The modal displays one empty `Group ID` field by default.
+2. Choose an access level: **Read only**, **Read and write**, or **Admin**.
+3. Use **Add another group** to add multiple groups. To avoid errors, do not enter duplicate or already existing group IDs.
+4. Resolve any errors before finalizing. Successfully added groups appear in the collaborators table.
+
+### Modifying access levels
+
+You can modify collaborators' access levels after adding them to the collaborators table if you have the required permissions. Collaborators can be assigned any access level. However, if all **Admin** collaborators are changed to lower access levels, then only dashboard admins can manage workspace collaboration.
+
+#### Modifying individual access levels
+
+To modify a single collaborator's access level, follow these steps:
+
+1. Select the action icon on the right of the table row.
+2. Select **Change access level** from the dropdown menu.
+3. Choose the desired access level from the list.
+4. Confirm the change in the modal that appears and select **Confirm**. The collaborator's access level is updated in the table upon confirmation.
+
+#### Modifying access levels in batch
+
+To change access levels for several collaborators simultaneously, follow these steps:
+
+1. Select the desired collaborator rows in the table.
+2. Select the **Actions** button that appears.
+3. Select **Change access level** from the dropdown menu.
+4. Select the new access level from the list provided.
+5. Review and confirm the changes in the modal that appears. The access levels of all selected collaborators are updated in the table upon confirmation.
+
+### Deleting collaborators
+
+After adding collaborators to the table, you have the option to delete them. Be cautious when removing admin collaborators because deleting all of them restricts workspace collaborator management to dashboard admins only. A confirmation modal is displayed before this action is finalized.
+
+#### Deleting individual collaborators
+
+To delete an individual collaborator, follow these steps:
+
+1. Select the ellipsis icon on the right of the table row to display a dropdown menu.
+2. Select **Delete collaborator** from the dropdown menu. A confirmation modal appears to verify your action.
+3. Select **Confirm** in the modal to remove the collaborator from the table.
+
+#### Deleting collaborators in batch
+
+To remove several collaborators simultaneously, follow these steps:
+
+1. Select the rows containing the collaborators you want to remove from the table. A **Delete x collaborators** button appears.
+2. Select the **Delete x collaborators** button.
+3. Review the confirmation modal that appears.
+4. Select **Confirm** to remove all selected collaborators from the table.
diff --git a/_dashboards/workspace/workspace.md b/_dashboards/workspace/workspace.md
new file mode 100644
index 0000000000..0938c48891
--- /dev/null
+++ b/_dashboards/workspace/workspace.md
@@ -0,0 +1,118 @@
+---
+layout: default
+title: Workspace for OpenSearch Dashboards
+nav_order: 110
+has_children: true
+---
+
+# Workspace for OpenSearch Dashboards
+Introduced 2.18
+{: .label .label-purple }
+
+The Workspace feature in OpenSearch Dashboards enables you to tailor your environment with use-case-specific configurations. For example, you can create dedicated workspaces for observability scenarios, allowing you to focus on relevant functionalities. Additionally, the Workspace feature enables you to organize visual assets, such as dashboards and visualizations, within a workspace with isolated storage.
+
+## Workspace data model
+
+The Workspace data model is defined by the following structure:
+
+```typescript
+interface Workspace {
+  id: string;
+  name: string;
+  description?: string;
+  features?: string[];
+  color: string;
+  uiSettings: Record<string, unknown>;
+}
+```
+{% include copy.html %}
+
+The Workspace data model is composed of the following key attributes:
+
+- `id`: String type; unique ID for each workspace.
+- `name`: String type; designates the name of the workspace.
+- `description`: Optional string type; provides contextual information for the workspace.
+- `features`: Optional array of strings; contains use case IDs linked to the workspace.
+- `color`: String type; specifies the color of the workspace icon.
+- `uiSettings`: Key-value map; holds the UI settings scoped to the workspace.
+
+---
+
+#### Example Workspace object
+
+The following object shows a typical Workspace configuration:
+
+```typescript
+{
+  id: "M5NqCu",
+  name: "Analytics team",
+  description: "Analytics team workspace",
+  features: ["use-case-analytics"],
+}
+```
+{% include copy.html %}
+
+This configuration creates the `Analytics team` workspace using the `use-case-analytics` feature set. Use cases map to specific feature groups, limiting functionality to the defined set within each workspace.
+
+The following are predefined use case options:
+
+- `use-case-observability`
+- `use-case-security-analytics`
+- `use-case-search`
+- `use-case-essentials`
+- `use-case-all`
+
+---
+
+## Associating saved objects with workspaces
+
+Saved objects in OpenSearch Dashboards, such as dashboards, visualizations, and index patterns, can be associated with specific workspaces, improving organization and accessibility as the volume of objects grows.
+
+To link a saved object with one or more workspaces, a `workspaces` attribute (an array of strings) is added to the object. As a result, saved objects such as dashboards and visualizations are only accessible within their designated workspaces.
+
+The following example shows a dashboard object associated with the workspace `M5NqCu`:
+
+```typescript
+{
+  type: "dashboard",
+  id: "da123f20-6680-11ee-93fa-df944ec23359",
+  workspaces: ["M5NqCu"]
+}
+```
+{% include copy.html %}
+
+Saved objects support association with multiple workspaces, facilitating cross-team collaboration and resource sharing. This feature is useful when an object is relevant to multiple teams, projects, or use cases.
+
+The following example shows a data source object linked to multiple workspaces:
+
+```typescript
+{
+  type: "data-source",
+  id: "da123f20-6680-11ee-93fa-df944ec23359",
+  workspaces: ["M5NqCu", "<workspace-id-1>", "<workspace-id-2>"]
+}
+```
+{% include copy.html %}
+
+## Non-workspace saved objects
+
+Not all saved objects in OpenSearch Dashboards are associated with a workspace. Some objects operate independently of the workspace framework. These objects lack the `workspaces` attribute and serve system-wide functions. For example, the global user interface settings object manages configurations affecting the entire OpenSearch Dashboards interface, maintaining consistent functionality across all workspaces.
+
+This dual approach allows OpenSearch Dashboards to balance granular, context-specific customization with overall system consistency.
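+
+The following minimal TypeScript sketch illustrates this visibility rule. It is not part of the OpenSearch Dashboards API; the `SavedObject` shape and the `isVisibleInWorkspace` helper are illustrative assumptions distilled from the data model described on this page:
+
+```typescript
+interface SavedObject {
+  type: string;
+  id: string;
+  workspaces?: string[]; // absent for non-workspace (system-wide) objects
+}
+
+// A saved object is visible in a workspace if it lists that workspace's ID;
+// objects without a `workspaces` attribute are treated as global.
+function isVisibleInWorkspace(obj: SavedObject, workspaceId: string): boolean {
+  return obj.workspaces === undefined || obj.workspaces.includes(workspaceId);
+}
+
+const dashboard: SavedObject = {
+  type: "dashboard",
+  id: "da123f20-6680-11ee-93fa-df944ec23359",
+  workspaces: ["M5NqCu"],
+};
+
+console.log(isVisibleInWorkspace(dashboard, "M5NqCu")); // true
+console.log(isVisibleInWorkspace(dashboard, "9gt4lB")); // false
+```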
+
+## Enabling the Workspace feature
+
+In your `opensearch_dashboards.yml` file, set the following options:
+
+```yaml
+workspace.enabled: true
+uiSettings:
+  overrides:
+    "home:useNewHomePage": true
+```
+{% include copy.html %}
+
+If your cluster has the Security plugin installed, then multi-tenancy must be disabled to avoid conflicts with the workspace feature:
+
+```yaml
+opensearch_security.multitenancy.enabled: false
+```
+{% include copy.html %}
diff --git a/_data-prepper/pipelines/configuration/processors/anomaly-detector.md b/_data-prepper/pipelines/configuration/processors/anomaly-detector.md
index 9628bb6caf..ba574bdf7d 100644
--- a/_data-prepper/pipelines/configuration/processors/anomaly-detector.md
+++ b/_data-prepper/pipelines/configuration/processors/anomaly-detector.md
@@ -53,6 +53,7 @@ You can configure `random_cut_forest` mode with the following options.
 | `sample_size` | `256` | 100--2500 | The sample size used in the ML algorithm. |
 | `time_decay` | `0.1` | 0--1.0 | The time decay value used in the ML algorithm. Used as the mathematical expression `timeDecay` divided by `SampleSize` in the ML algorithm. |
 | `type` | `metrics` | N/A | The type of data sent to the algorithm. |
+| `output_after` | `32` | N/A | The number of events to process before outputting any detected anomalies. |
 | `version` | `1.0` | N/A | The algorithm version number. |
 
 ## Usage
diff --git a/_data-prepper/pipelines/configuration/processors/aws-lambda.md b/_data-prepper/pipelines/configuration/processors/aws-lambda.md
new file mode 100644
index 0000000000..bd167996a1
--- /dev/null
+++ b/_data-prepper/pipelines/configuration/processors/aws-lambda.md
@@ -0,0 +1,94 @@
+---
+layout: default
+title: aws_lambda
+parent: Processors
+grand_parent: Pipelines
+nav_order: 10
+---
+
+# aws_lambda integration for Data Prepper
+
+The [AWS Lambda](https://aws.amazon.com/lambda/) integration allows developers to use serverless computing capabilities within their Data Prepper pipelines for flexible event processing and data routing.
+
+## AWS Lambda processor configuration
+
+The `aws_lambda` processor enables you to invoke an AWS Lambda function within your Data Prepper pipeline to process events. It supports both synchronous and asynchronous invocations based on your use case.
+
+## Configuration fields
+
+You can configure the processor using the following configuration options.
+
+Field | Type | Required | Description
+-------------------- | ------- | -------- | ----------------------------------------------------------------------------
+`function_name` | String | Required | The name of the AWS Lambda function to invoke.
+`invocation_type` | String | Optional | Specifies the invocation type, either `request-response` or `event`. Default is `request-response`.
+`aws.region` | String | Required | The AWS Region in which the Lambda function is located.
+`aws.sts_role_arn` | String | Optional | The Amazon Resource Name (ARN) of the role to assume before invoking the Lambda function.
+`max_retries` | Integer | Optional | The maximum number of retries for failed invocations. Default is `3`.
+`batch` | Object | Optional | The batch settings for the Lambda invocations. Default is `key_name = "events"`. Default threshold is `event_count=100`, `maximum_size="5mb"`, and `event_collect_timeout = 10s`.
+`lambda_when` | String | Optional | A conditional expression that determines when to invoke the Lambda processor.
+`response_codec` | Object | Optional | A codec configuration for parsing Lambda responses. Default is `json`.
+`tags_on_match_failure` | List | Optional | A list of tags to add to events when Lambda matching fails or encounters an unexpected error.
+`sdk_timeout` | Duration | Optional | Configures the SDK's client connection timeout period. Default is `60s`.
+`response_events_match` | Boolean | Optional | Specifies how Data Prepper interprets and processes Lambda function responses. Default is `false`.
+
+#### Example configuration
+
+```yaml
+processors:
+  - aws_lambda:
+      function_name: "my-lambda-function"
+      invocation_type: "request-response"
+      response_events_match: false
+      aws:
+        region: "us-east-1"
+        sts_role_arn: "arn:aws:iam::123456789012:role/my-lambda-role"
+      max_retries: 3
+      batch:
+        key_name: "events"
+        threshold:
+          event_count: 100
+          maximum_size: "5mb"
+          event_collect_timeout: PT10S
+      lambda_when: "event['status'] == 'process'"
+```
+{% include copy.html %}
+
+## Usage
+
+The processor supports the following invocation types and related options:
+
+- `request-response`: The processor waits for Lambda function completion before proceeding.
+- `event`: The function is triggered asynchronously without waiting for a response.
+- `batch`: When enabled, events are aggregated and sent in bulk to optimize Lambda invocations. Batch thresholds control the event count, size limit, and timeout.
+- `codec`: JSON is used for both request and response codecs. Lambda must return JSON array outputs.
+- `tags_on_match_failure`: Custom tags can be applied to events when Lambda processing fails or encounters unexpected issues.
+
+## Behavior
+
+When configured for batching, the AWS Lambda processor groups multiple events into a single request. This grouping is governed by batch thresholds, which can be based on the event count, size limit, or timeout. The processor then sends the entire batch to the Lambda function as a single payload.
+
+## Lambda response handling
+
+The `response_events_match` setting defines how Data Prepper handles the relationship between batch events sent to Lambda and the response received:
+
+- `true`: Lambda returns a JSON array containing one result for each batched event. Data Prepper maps each element of the array back to its corresponding original event, so the Lambda function is expected to return the same number of response events as original requests, in the original order.
+- `false`: Lambda returns one or more events for the entire batch. Response events are not correlated with the original events, and original event metadata is not preserved in the response events.
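+
+As an illustration, the following hedged sketch shows what a batched request and a matching response might look like when `response_events_match` is set to `true`. The payload shape follows the `key_name: "events"` batch setting from the example configuration; the field values are hypothetical:
+
+```json
+{
+  "events": [
+    { "status": "process", "message": "first event" },
+    { "status": "process", "message": "second event" }
+  ]
+}
+```
+
+A conforming Lambda function would return a JSON array with one element per input event, in the same order:
+
+```json
+[
+  { "status": "process", "message": "first event", "enriched": true },
+  { "status": "process", "message": "second event", "enriched": true }
+]
+```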
+
+## Limitations
+
+Note the following limitations:
+
+- Payload limitation: 6 MB payload limit
+- Response codec: JSON-only codec support
+
+## Integration testing
+
+Integration tests for this plugin are executed separately from the main Data Prepper build process. Use the following Gradle command to run these tests:
+
+```
+./gradlew :data-prepper-plugins:aws-lambda:integrationTest -Dtests.processor.lambda.region="us-east-1" -Dtests.processor.lambda.functionName="lambda_test_function" -Dtests.processor.lambda.sts_role_arn="arn:aws:iam::123456789012:role/dataprepper-role"
+```
+{% include copy.html %}
diff --git a/_data-prepper/pipelines/configuration/sinks/aws-lambda.md b/_data-prepper/pipelines/configuration/sinks/aws-lambda.md
new file mode 100644
index 0000000000..d8c00bdb16
--- /dev/null
+++ b/_data-prepper/pipelines/configuration/sinks/aws-lambda.md
@@ -0,0 +1,73 @@
+---
+layout: default
+title: aws_lambda
+parent: Sinks
+grand_parent: Pipelines
+nav_order: 10
+---
+
+# `aws_lambda` sink for Data Prepper
+
+This page explains how to configure and use [AWS Lambda](https://aws.amazon.com/lambda/) with Data Prepper, enabling Lambda functions to serve as both processors and sinks.
+
+## `aws_lambda` sink
+
+Configure the Lambda sink using the following parameters.
+
+Field | Type | Required | Description
+--------------------| ------- | -------- | ----------------------------------------------------------------------------
+`function_name` | String | Yes | The name of the AWS Lambda function to invoke.
+`invocation_type` | String | No | Specifies the invocation type. Default is `event`.
+`aws.region` | String | Yes | The AWS Region in which the Lambda function is located.
+`aws.sts_role_arn` | String | No | The Amazon Resource Name (ARN) of the role to assume before invoking the Lambda function.
+`max_retries` | Integer | No | The maximum number of retries if the invocation fails. Default is `3`.
+`batch` | Object | No | Optional batch settings for Lambda invocations. Default is `key_name = events`. Default threshold is `event_count=100`, `maximum_size="5mb"`, and `event_collect_timeout = 10s`.
+`lambda_when` | String | No | A conditional expression that determines when to invoke the Lambda sink.
+`dlq` | Object | No | The dead-letter queue (DLQ) configuration for failed invocations.
+
+#### Example configuration
+
+```yaml
+sink:
+  - aws_lambda:
+      function_name: "my-lambda-sink"
+      invocation_type: "event"
+      aws:
+        region: "us-west-2"
+        sts_role_arn: "arn:aws:iam::123456789012:role/my-lambda-sink-role"
+      max_retries: 5
+      batch:
+        key_name: "events"
+        threshold:
+          event_count: 50
+          maximum_size: "3mb"
+          event_collect_timeout: PT5S
+      lambda_when: "event['type'] == 'log'"
+      dlq:
+        region: "us-east-1"
+        sts_role_arn: "arn:aws:iam::123456789012:role/my-sqs-role"
+        bucket: "<bucket-name>"
+```
+{% include copy.html %}
+
+## Usage
+
+The invocation types and related options are as follows:
+
+- `event` (default): Executes functions asynchronously without waiting for responses.
+- `request-response` (sink only): Executes functions synchronously, though responses are not processed.
+- `batch`: Automatically groups events based on configured thresholds.
+- `dlq`: Supports the DLQ configuration for failed invocations after retry attempts.
+
+Data Prepper assumes the AWS Identity and Access Management (IAM) role specified in `aws.sts_role_arn` to invoke the Lambda function securely and respects Lambda's concurrency limits during event processing. For more information, see the [AWS Lambda documentation](https://docs.aws.amazon.com/lambda).
+{: .note}
+
+## Developer guide
+
+Integration tests must be executed separately from the main Data Prepper build.
+Execute them with the following command:
+
+```
+./gradlew :data-prepper-plugins:aws-lambda:integrationTest -Dtests.sink.lambda.region="us-east-1" -Dtests.sink.lambda.functionName="lambda_test_function" -Dtests.sink.lambda.sts_role_arn="arn:aws:iam::123456789012:role/dataprepper-role"
+```
+{% include copy.html %}
diff --git a/_data-prepper/pipelines/configuration/sources/s3.md b/_data-prepper/pipelines/configuration/sources/s3.md
index db92718a36..7ca27ee500 100644
--- a/_data-prepper/pipelines/configuration/sources/s3.md
+++ b/_data-prepper/pipelines/configuration/sources/s3.md
@@ -104,7 +104,7 @@ Option | Required | Type | Description
 `s3_select` | No | [s3_select](#s3_select) | The Amazon S3 Select configuration.
 `scan` | No | [scan](#scan) | The S3 scan configuration.
 `delete_s3_objects_on_read` | No | Boolean | When `true`, the S3 scan attempts to delete S3 objects after all events from the S3 object are successfully acknowledged by all sinks. `acknowledgments` should be enabled when deleting S3 objects. Default is `false`.
-`workers` | No | Integer | Configures the number of worker threads that the source uses to read data from S3. Leave this value as the default unless your S3 objects are less than 1 MB in size. Performance may decrease for larger S3 objects. This setting affects SQS-based sources and S3-Scan sources. Default is `1`.
+`workers` | No | Integer | Configures the number of worker threads (1--10) that the source uses to read data from S3. Leave this value as the default unless your S3 objects are less than 1 MB in size. Performance may decrease for larger S3 objects. This setting affects SQS-based sources and S3-Scan sources. Default is `1`.
diff --git a/_data-prepper/pipelines/expression-syntax.md b/_data-prepper/pipelines/expression-syntax.md
index 383b54c19b..07f68ee58e 100644
--- a/_data-prepper/pipelines/expression-syntax.md
+++ b/_data-prepper/pipelines/expression-syntax.md
@@ -30,6 +30,9 @@ The following table lists the supported operators. Operators are listed in order
 |----------------------|-------------------------------------------------------|---------------|
 | `()` | Priority expression | Left to right |
 | `not`<br>`+`<br>`-` | Unary logical NOT<br>Unary positive<br>Unary negative | Right to left |
+| `*`, `/` | Multiplication and division operators | Left to right |
+| `+`, `-` | Addition and subtraction operators | Left to right |
+| `+` | String concatenation operator | Left to right |
 | `<`, `<=`, `>`, `>=` | Relational operators | Left to right |
 | `==`, `!=` | Equality operators | Left to right |
 | `and`, `or` | Conditional expression | Left to right |
@@ -78,7 +81,6 @@ Conditional expressions allow you to combine multiple expressions or values usin
 <expression> and <expression>
 <expression> or <expression>
 not <expression>
 ```
-{% include copy-curl.html %}
 
 The following are some example conditional expressions:
@@ -91,9 +93,64 @@ not /status_code in {200, 202}
 ```
 {% include copy-curl.html %}
 
+### Arithmetic expressions
+
+Arithmetic expressions enable basic mathematical operations, such as addition, subtraction, multiplication, and division. These expressions can be combined with conditional expressions to create more complex conditional statements. The available arithmetic operators are `+`, `-`, `*`, and `/`. The syntax for using the arithmetic operators is as follows:
+
+```
+<operand> + <operand>
+<operand> - <operand>
+<operand> * <operand>
+<operand> / <operand>
+```
+
+The following are example arithmetic expressions:
+
+```
+/value + length(/message)
+/bytes / 1024
+/value1 - /value2
+/TimeInSeconds * 1000
+```
+{% include copy-curl.html %}
+
+The following are some example arithmetic expressions used in conditional expressions:
+
+```
+/value + length(/message) > 200
+/bytes / 1024 < 10
+/value1 - /value2 != /value3 + /value4
+```
+{% include copy-curl.html %}
+
+### String concatenation expressions
+
+String concatenation expressions enable you to combine strings to create new strings. These concatenated strings can also be used within conditional expressions. The syntax for using string concatenation is as follows:
+
+```
+<string> + <string>
+```
+
+The following are example string concatenation expressions:
+
+```
+/name + "suffix"
+"prefix" + /name
+"time of " + /timeInMs + " ms"
+```
+{% include copy-curl.html %}
+
+The following are example string concatenation expressions that can be used in conditional expressions:
+
+```
+/service + ".com" == /url
+"www." + /service != /url
+```
+{% include copy-curl.html %}
+
 ### Reserved symbols
 
-Reserved symbols are symbols that are not currently used in the expression syntax but are reserved for possible future functionality or extensions. Reserved symbols include `^`, `*`, `/`, `%`, `+`, `-`, `xor`, `=`, `+=`, `-=`, `*=`, `/=`, `%=`, `++`, `--`, and `${}`.
+Certain symbols are reserved for possible future functionality or extensions: `^`, `%`, `xor`, `=`, `+=`, `-=`, `*=`, `/=`, `%=`, `++`, `--`, and `${}`.
 
 ## Syntax components
 
@@ -170,6 +227,9 @@ White space is optional around relational operators, regex equality operators, e
 | `()` | Priority expression | Yes | `/a==(/b==200)`<br>`/a in ({200})` | `/status in({200})` |
 | `in`, `not in` | Set operators | Yes | `/a in {200}`<br>`/a not in {400}` | `/a in{200, 202}`<br>`/a not in{400}` |
 | `<`, `<=`, `>`, `>=` | Relational operators | No | `/status < 300`<br>`/status>=300` | |
+| `+` | String concatenation operator | No | `/status_code + /message + "suffix"` | |
+| `+`, `-` | Arithmetic addition and subtraction operators | No | `/status_code + length(/message) - 2` | |
+| `*`, `/` | Multiplication and division operators | No | `/status_code * length(/message) / 3` | |
 | `=~`, `!~` | Regex equality operators | No | `/msg =~ "^\w*$"`<br>`/msg=~"^\w*$"` | |
 | `==`, `!=` | Equality operators | No | `/status == 200`<br>`/status_code==200` | |
 | `and`, `or`, `not` | Conditional operators | Yes | `/a<300 and /b>200` | `/b<300and/b>200` |
diff --git a/_data/top_nav.yml b/_data/top_nav.yml
index 51d8138680..6552d90359 100644
--- a/_data/top_nav.yml
+++ b/_data/top_nav.yml
@@ -63,6 +63,8 @@ items:
       url: /docs/latest/clients/
     - label: Benchmark
      url: /docs/latest/benchmark/
+    - label: Migration Assistant
+      url: /docs/latest/migration-assistant/
   - label: Platform
     url: /platform/index.html
     children:
diff --git a/_field-types/supported-field-types/flat-object.md b/_field-types/supported-field-types/flat-object.md
index c9e59710e1..65d7c6dc8e 100644
--- a/_field-types/supported-field-types/flat-object.md
+++ b/_field-types/supported-field-types/flat-object.md
@@ -56,7 +56,8 @@ The flat object field type supports the following queries:
 - [Multi-match]({{site.url}}{{site.baseurl}}/query-dsl/full-text/multi-match/)
 - [Query string]({{site.url}}{{site.baseurl}}/query-dsl/full-text/query-string/)
 - [Simple query string]({{site.url}}{{site.baseurl}}/query-dsl/full-text/simple-query-string/)
-- [Exists]({{site.url}}{{site.baseurl}}/query-dsl/term/exists/)
+- [Exists]({{site.url}}{{site.baseurl}}/query-dsl/term/exists/)
+- [Wildcard]({{site.url}}{{site.baseurl}}/query-dsl/term/wildcard/)
 
 ## Limitations
 
@@ -243,4 +244,4 @@ PUT /test-index/
 ```
 {% include copy-curl.html %}
 
-Because `issue.number` is not part of the flat object, you can use it to aggregate and sort documents.
\ No newline at end of file
+Because `issue.number` is not part of the flat object, you can use it to aggregate and sort documents.
diff --git a/_getting-started/communicate.md b/_getting-started/communicate.md
index 3472270c30..773558fb21 100644
--- a/_getting-started/communicate.md
+++ b/_getting-started/communicate.md
@@ -28,7 +28,7 @@ curl -X GET "http://localhost:9200/_cluster/health"
 If you're using the Security plugin, provide the username and password in the request:
 
 ```bash
-curl -X GET "http://localhost:9200/_cluster/health" -ku admin:<custom-admin-password>
+curl -X GET "https://localhost:9200/_cluster/health" -ku admin:<custom-admin-password>
 ```
 {% include copy.html %}
 
@@ -317,4 +317,4 @@ Once a field is created, you cannot change its type. Changing a field type requi
 
 ## Next steps
 
-- See [Ingest data into OpenSearch]({{site.url}}{{site.baseurl}}/getting-started/ingest-data/) to learn about ingestion options.
\ No newline at end of file
+- See [Ingest data into OpenSearch]({{site.url}}{{site.baseurl}}/getting-started/ingest-data/) to learn about ingestion options.
diff --git a/_includes/cards.html b/_includes/cards.html
index 6d958e61a5..5ab37b8c27 100644
--- a/_includes/cards.html
+++ b/_includes/cards.html
@@ -30,8 +30,14 @@
         Measure performance metrics for your OpenSearch cluster
+
+        Migration Assistant
+
+        Migrate to OpenSearch from other platforms
+
diff --git a/_includes/header.html b/_includes/header.html
index 20d82c451e..32d5b14774 100644
--- a/_includes/header.html
+++ b/_includes/header.html
@@ -82,7 +82,7 @@ {% endif %}