From ecd95ba92a97b91e38536df147f8d5f0e7be21d2 Mon Sep 17 00:00:00 2001 From: Chris Helma Date: Fri, 27 Dec 2024 07:30:36 -0600 Subject: [PATCH] TP: Enable user guidance on GenAI recs, add ES 6.8 and 7.10 knowledge Signed-off-by: Chris Helma --- TransformationPlayground/README.md | 61 + .../playground/playground/urls.py | 4 +- .../playground/schema.json | 19 +- .../playground/transform_api/serializers.py | 4 +- .../transform_api/tests/test_serializers.py | 4 +- .../playground/transform_api/views.py | 5 +- .../playground/transform_expert/expert.py | 3 + .../playground/transform_expert/parameters.py | 1 + .../transform_expert/prompting/generation.py | 5 +- .../prompting/knowledge/__init__.py | 7 + .../prompting/knowledge/es_6_8.py | 1428 +++++++++++++ .../prompting/knowledge/es_7_10.py | 1897 +++++++++++++++++ .../transform_expert/prompting/templates.py | 23 +- .../playground_frontend/src/app/page.tsx | 56 +- .../src/generated-api-client/api.ts | 51 +- .../src/generated-api-client/base.ts | 2 +- .../src/generated-api-client/common.ts | 2 +- .../src/generated-api-client/configuration.ts | 2 +- .../src/generated-api-client/index.ts | 2 +- 19 files changed, 3525 insertions(+), 51 deletions(-) create mode 100644 TransformationPlayground/playground/transform_expert/prompting/knowledge/es_7_10.py diff --git a/TransformationPlayground/README.md b/TransformationPlayground/README.md index 33cdff41d..ec68d11b9 100644 --- a/TransformationPlayground/README.md +++ b/TransformationPlayground/README.md @@ -78,6 +78,67 @@ To run the Frontend, first start the Backend and ensure it's running. Then, exe You should then be able to hit the Playground website in your web browser at `http://localhost:3000`. The GUI should be pretty self-explanatory. 
+Here's an example input JSON for Elasticsearch 6.8: +```json +{ + "index_name": "test-index", + "index_json": { + "settings": { + "index": { + "number_of_shards": 1, + "number_of_replicas": 0 + } + }, + "mappings": { + "type1": { + "properties": { + "title": { + "type": "text" + } + } + }, + "type2": { + "properties": { + "contents": { + "type": "text" + } + } + } + } + } +} +``` + + +Here's an example input JSON for Elasticsearch 7.10: +```json +{ + "index_name": "test-index", + "index_json": { + "settings": { + "index": { + "soft_deletes": { + "enabled": false + }, + "number_of_shards": 1, + "number_of_replicas": 1 + } + }, + "mappings": { + "properties": { + "title": { + "type": "text" + }, + "content": { + "type": "text" + } + } + } + } +} +``` + + ### Dependencies `pipenv` is used to managed dependencies within the project. The `Pipefile` and `Pipefile.lock` handle the local environment. You can add dependencies like so: diff --git a/TransformationPlayground/playground/playground/urls.py b/TransformationPlayground/playground/playground/urls.py index 9e5fc5478..b41e6950e 100644 --- a/TransformationPlayground/playground/playground/urls.py +++ b/TransformationPlayground/playground/playground/urls.py @@ -1,12 +1,12 @@ from django.urls import path from drf_spectacular.views import SpectacularAPIView, SpectacularSwaggerView -from transform_api.views import TransformsIndexView, TransformsIndexTestView +from transform_api.views import TransformsIndexCreateView, TransformsIndexTestView urlpatterns = [ path('api/schema/', SpectacularAPIView.as_view(), name='schema'), path('api/docs/', SpectacularSwaggerView.as_view(url_name='schema'), name='swagger-ui'), - path('transforms/index/', TransformsIndexView.as_view(), name='transforms_index'), + path('transforms/index/create/', TransformsIndexCreateView.as_view(), name='transforms_index_create'), path('transforms/index/test/', TransformsIndexTestView.as_view(), name='transforms_index_test'), ] \ No newline at end of file 
diff --git a/TransformationPlayground/playground/schema.json b/TransformationPlayground/playground/schema.json index bc60658c2..908d3b51d 100644 --- a/TransformationPlayground/playground/schema.json +++ b/TransformationPlayground/playground/schema.json @@ -1,7 +1,7 @@ openapi: 3.0.3 info: title: Transformation API - version: 0.0.1 + version: 0.1.0 description: API for JSON transformation logic. paths: /api/schema/: @@ -150,9 +150,9 @@ paths: type: object additionalProperties: {} description: '' - /transforms/index/: + /transforms/index/create/: post: - operationId: transforms_index_create + operationId: transforms_index_create_create tags: - transforms requestBody: @@ -211,8 +211,11 @@ components: SourceVersionEnum: enum: - Elasticsearch 6.8 + - Elasticsearch 7.10 type: string - description: '* `Elasticsearch 6.8` - ES_6_8' + description: |- + * `Elasticsearch 6.8` - ES_6_8 + * `Elasticsearch 7.10` - ES_7_10 TargetVersionEnum: enum: - OpenSearch 2.17 @@ -226,12 +229,12 @@ components: TransformsIndexCreateRequest: type: object properties: - transform_language: - $ref: '#/components/schemas/TransformLanguageEnum' source_version: $ref: '#/components/schemas/SourceVersionEnum' target_version: $ref: '#/components/schemas/TargetVersionEnum' + transform_language: + $ref: '#/components/schemas/TransformLanguageEnum' input_shape: type: object properties: @@ -242,6 +245,10 @@ components: required: - index_name - index_json + transform_logic: + type: string + user_guidance: + type: string test_target_url: type: string format: uri diff --git a/TransformationPlayground/playground/transform_api/serializers.py b/TransformationPlayground/playground/transform_api/serializers.py index ff9d0fe71..e8e40be6e 100644 --- a/TransformationPlayground/playground/transform_api/serializers.py +++ b/TransformationPlayground/playground/transform_api/serializers.py @@ -65,10 +65,12 @@ def to_representation(self, value: Dict[str, Any]) -> Dict[str, Any]: class 
TransformsIndexCreateRequestSerializer(serializers.Serializer): - transform_language = EnumChoiceField(enum=TransformLanguage) source_version = EnumChoiceField(enum=SourceVersion) target_version = EnumChoiceField(enum=TargetVersion) + transform_language = EnumChoiceField(enum=TransformLanguage) input_shape = IndexShapeField() + transform_logic = serializers.CharField(required=False, default=None) + user_guidance = serializers.CharField(required=False, default=None) test_target_url = serializers.URLField(required=False, default=None) class TransformsIndexCreateResponseSerializer(serializers.Serializer): diff --git a/TransformationPlayground/playground/transform_api/tests/test_serializers.py b/TransformationPlayground/playground/transform_api/tests/test_serializers.py index c9f62a2e8..6bae4d497 100644 --- a/TransformationPlayground/playground/transform_api/tests/test_serializers.py +++ b/TransformationPlayground/playground/transform_api/tests/test_serializers.py @@ -107,6 +107,8 @@ def test_valid_input(self): } } }, + "transform_logic": "some transform logic", + "user_guidance": "some user guidance", "test_target_url": "http://localhost:29200" }) self.assertTrue(test_serializer.is_valid()) @@ -165,7 +167,7 @@ def test_valid_input(self): } } ], - "transform_logic": "the transform logic", + "transform_logic": "the new transform logic", "validation_report": [ "Entry 1", "Entry 2" diff --git a/TransformationPlayground/playground/transform_api/views.py b/TransformationPlayground/playground/transform_api/views.py index bef786750..c5eab71a9 100644 --- a/TransformationPlayground/playground/transform_api/views.py +++ b/TransformationPlayground/playground/transform_api/views.py @@ -21,7 +21,7 @@ logger = logging.getLogger("transform_api") -class TransformsIndexView(APIView): +class TransformsIndexCreateView(APIView): @csrf_exempt @extend_schema( request=TransformsIndexCreateRequestSerializer, @@ -86,7 +86,8 @@ def _perform_transformation(self, transform_id: str, request: 
TransformsIndexCre ) system_message = expert.system_prompt_factory( - request.validated_data["input_shape"] + user_guidance=request.validated_data["user_guidance"], + input_shape=request.validated_data["input_shape"] ) turns = [ system_message, diff --git a/TransformationPlayground/playground/transform_expert/expert.py b/TransformationPlayground/playground/transform_expert/expert.py index 44f9e7034..5bd74bc8b 100644 --- a/TransformationPlayground/playground/transform_expert/expert.py +++ b/TransformationPlayground/playground/transform_expert/expert.py @@ -60,6 +60,9 @@ def get_expert(source_version: SourceVersion, target_version: TargetVersion, tra tools=tool_bundle ) +class NoTransformCreatedError(Exception): + pass + def invoke_expert(expert: Expert, task: TransformTask) -> TransformTask: logger.info(f"Invoking the Transform Expert for transform_id: {task.transform_id}") logger.debug(f"Initial Transform Task: {str(task.to_json())}") diff --git a/TransformationPlayground/playground/transform_expert/parameters.py b/TransformationPlayground/playground/transform_expert/parameters.py index fa0845b70..c023a0f00 100644 --- a/TransformationPlayground/playground/transform_expert/parameters.py +++ b/TransformationPlayground/playground/transform_expert/parameters.py @@ -6,6 +6,7 @@ class SourceVersion(Enum): ES_6_8 = "Elasticsearch 6.8" + ES_7_10 = "Elasticsearch 7.10" class TargetVersion(Enum): OS_2_17 = "OpenSearch 2.17" diff --git a/TransformationPlayground/playground/transform_expert/prompting/generation.py b/TransformationPlayground/playground/transform_expert/prompting/generation.py index 59aadd678..2e96efacb 100644 --- a/TransformationPlayground/playground/transform_expert/prompting/generation.py +++ b/TransformationPlayground/playground/transform_expert/prompting/generation.py @@ -17,7 +17,7 @@ def _get_base_template(source_version: SourceVersion, target_version: TargetVers def get_system_prompt_factory(source_version: SourceVersion, target_version: TargetVersion, 
input_shape_type: TransformType, transform_language: TransformLanguage) -> Callable[[Dict[str, Any]], SystemMessage]: base_template = _get_base_template(source_version, target_version, input_shape_type, transform_language) - def factory(input_shape: Dict[str, Any]) -> SystemMessage: + def factory(user_guidance: str, input_shape: Dict[str, Any]) -> SystemMessage: return SystemMessage( content=base_template.format( source_version=source_version, @@ -26,7 +26,8 @@ def factory(input_shape: Dict[str, Any]) -> SystemMessage: target_version=target_version, target_guidance=get_target_guidance(source_version, target_version, input_shape_type, transform_language), target_knowledge=get_target_knowledge(source_version, target_version, input_shape_type, transform_language), - source_json=input_shape + source_json=input_shape, + user_guidance=user_guidance ) ) diff --git a/TransformationPlayground/playground/transform_expert/prompting/knowledge/__init__.py b/TransformationPlayground/playground/transform_expert/prompting/knowledge/__init__.py index 80d6e1f67..3fd3e1db9 100644 --- a/TransformationPlayground/playground/transform_expert/prompting/knowledge/__init__.py +++ b/TransformationPlayground/playground/transform_expert/prompting/knowledge/__init__.py @@ -1,6 +1,7 @@ from transform_expert.parameters import SourceVersion, TargetVersion, TransformType, TransformLanguage from transform_expert.prompting.knowledge.es_6_8 import INDEX_GUIDANCE as es_6_8_index_guidance, INDEX_KNOWLEDGE as es_6_8_index_knowledge +from transform_expert.prompting.knowledge.es_7_10 import INDEX_GUIDANCE as es_7_10_index_guidance, INDEX_KNOWLEDGE as es_7_10_index_knowledge from transform_expert.prompting.knowledge.os_2_17 import INDEX_GUIDANCE as os_2_17_index_guidance, INDEX_KNOWLEDGE as os_2_17_index_knowledge @@ -8,6 +9,9 @@ def get_source_guidance(source_version: SourceVersion, target_version: TargetVer if source_version == SourceVersion.ES_6_8: if input_shape_type == TransformType.INDEX: return 
es_6_8_index_guidance + elif source_version == SourceVersion.ES_7_10: + if input_shape_type == TransformType.INDEX: + return es_7_10_index_guidance return "" @@ -15,6 +19,9 @@ def get_source_knowledge(source_version: SourceVersion, target_version: TargetVe if source_version == SourceVersion.ES_6_8: if input_shape_type == TransformType.INDEX: return es_6_8_index_knowledge + elif source_version == SourceVersion.ES_7_10: + if input_shape_type == TransformType.INDEX: + return es_7_10_index_knowledge return "" diff --git a/TransformationPlayground/playground/transform_expert/prompting/knowledge/es_6_8.py b/TransformationPlayground/playground/transform_expert/prompting/knowledge/es_6_8.py index 7174c20c9..00fb1dc52 100644 --- a/TransformationPlayground/playground/transform_expert/prompting/knowledge/es_6_8.py +++ b/TransformationPlayground/playground/transform_expert/prompting/knowledge/es_6_8.py @@ -7,5 +7,1433 @@ """ INDEX_KNOWLEDGE = """ +# Elasticsearch Index Creation Guide +## Introduction to Index Creation + +Elasticsearch organizes data into indices, which are fundamental structures for storing and retrieving documents. This guide explains how to create an index using the Create Index API, along with important considerations and advanced configuration options. + +## Basic Index Creation + +To create a new index with default settings, use the following command: + +``` +PUT <index_name> +``` + +For example: + +``` +PUT twitter +``` + +This creates an index named "twitter" with all default settings. + +## Index Naming Rules + +When choosing a name for your index, adhere to these restrictions: + +- Use only lowercase letters +- Avoid these characters: `\ / * ? " < > | , #` (and space) +- Don't start with `-`, `_`, or `+` +- Avoid `.` or `..` as full names +- Keep names under 255 bytes (multi-byte characters count more) + +Note: Prior to Elasticsearch 7.0, colons (`:`) were allowed but are now deprecated. 
+ +## Customizing Index Settings + +You can specify custom settings when creating an index: + +``` +PUT <index_name> +{ + "settings": { + "number_of_shards": 3, + "number_of_replicas": 2 + } +} +``` + +Key settings include: +- `number_of_shards`: Determines how the index is split (default: 5) +- `number_of_replicas`: Sets the number of replica shards (default: 1) + +For more detailed settings, refer to the index modules documentation. + +## Defining Mappings + +You can define the structure of your documents using mappings: + +``` +PUT test +{ + "settings": { + "number_of_shards": 1 + }, + "mappings": { + "_doc": { + "properties": { + "field1": { "type": "text" } + } + } + } +} +``` + +## Creating Aliases + +Aliases can be created alongside the index: + +```json +PUT test +{ + "aliases": { + "alias_1": {}, + "alias_2": { + "filter": { + "term": {"user": "kimchy"} + }, + "routing": "kimchy" + } + } +} +``` + +## Shard Activation and Response + +By default, the API responds once primary shards are active. The response includes: + +```json +{ + "acknowledged": true, + "shards_acknowledged": true, + "index": "test" +} +``` + +- `acknowledged`: Indicates successful index creation +- `shards_acknowledged`: Shows if required shard copies started before timeout + +You can adjust the number of active shards to wait for: + +```json +PUT test +{ + "settings": { + "index.write.wait_for_active_shards": "2" + } +} +``` + +Or use a request parameter: + +``` +PUT test?wait_for_active_shards=2 +``` + +Note: Even if `acknowledged` or `shards_acknowledged` is `false`, the index creation may still be successful. These values indicate whether the operation completed before the timeout. 
+ +## Mapping Without Types (opt-in before Elasticsearch 7.0) + +Types are being removed from Elasticsearch: from 7.0, mappings no longer take a type name by default. In 6.8 you can already opt in to typeless mappings by setting `include_type_name=false`: + +``` +PUT test?include_type_name=false +{ + "mappings": { + "properties": { + "foo": { + "type": "keyword" + } + } + } +} +``` + +This approach is recommended for future compatibility as types are being removed from Elasticsearch. + +# Elasticsearch Index Aliases + +Index aliases in Elasticsearch provide a flexible way to reference one or more indices using a single name. This feature offers several benefits for managing and querying your data. + +## Key Concepts + +- An alias can point to one or multiple indices +- Aliases automatically resolve to the actual index names in API calls +- An alias cannot share a name with an existing index +- Aliases support filters and routing for advanced use cases + +## Basic Alias Operations + +### Adding an Alias + +To associate the alias `alias1` with index `test1`: + +```json +POST /_aliases +{ + "actions" : [ + { "add" : { "index" : "test1", "alias" : "alias1" } } + ] +} +``` + +### Removing an Alias + +To remove the alias `alias1` from index `test1`: + +```json +POST /_aliases +{ + "actions" : [ + { "remove" : { "index" : "test1", "alias" : "alias1" } } + ] +} +``` + +### Renaming an Alias + +Renaming is achieved through an atomic remove and add operation: + +```json +POST /_aliases +{ + "actions" : [ + { "remove" : { "index" : "test1", "alias" : "alias1" } }, + { "add" : { "index" : "test2", "alias" : "alias1" } } + ] +} +``` + +### Multiple Index Associations + +Assign an alias to multiple indices: + +```json +POST /_aliases +{ + "actions" : [ + { "add" : { "indices" : ["test1", "test2"], "alias" : "alias1" } } + ] +} +``` + +You can also use glob patterns: + +```json +POST /_aliases +{ + "actions" : [ + { "add" : { "index" : "test*", "alias" : "all_test_indices" } } + ] +} +``` + +Note: Glob pattern aliases are point-in-time and don't automatically update as matching indices are added or 
removed. + +It is an error to index to an alias which points to more than one index. + +### Swapping an Index with an Alias + +It's possible to swap an index with an alias in one operation: + +```json +PUT test +PUT test_2 +POST /_aliases +{ + "actions" : [ + { "add": { "index": "test_2", "alias": "test" } }, + { "remove_index": { "index": "test" } } + ] +} +``` + +The `remove_index` action is equivalent to deleting an index. + +## Advanced Alias Features + +### Filtered Aliases + +Filtered aliases allow you to create different "views" of the same index using Query DSL: + +```json +POST /_aliases +{ + "actions" : [ + { + "add" : { + "index" : "test1", + "alias" : "alias2", + "filter" : { "term" : { "user" : "kimchy" } } + } + } + ] +} +``` + +Ensure that the fields used in filters exist in the index mapping. For example: + +```json +PUT /test1 +{ + "mappings": { + "properties": { + "user" : { + "type": "keyword" + } + } + } +} +``` + +### Routing + +Assign routing values to aliases to optimize shard operations: + +```json +POST /_aliases +{ + "actions" : [ + { + "add" : { + "index" : "test", + "alias" : "alias1", + "routing" : "1" + } + } + ] +} +``` + +You can specify different routing for search and index operations: + +```json +POST /_aliases +{ + "actions" : [ + { + "add" : { + "index" : "test", + "alias" : "alias2", + "search_routing" : "1,2", + "index_routing" : "2" + } + } + ] +} +``` + +Note: Search routing may contain several values separated by comma. Index routing can contain only a single value. + +If a search operation that uses routing alias also has a routing parameter, an intersection of both search alias routing and routing specified in the parameter is used. For example: + +``` +GET /alias2/_search?q=user:kimchy&routing=2,3 +``` + +This command will use "2" as a routing value. 
+ +### Write Index + +Designate a specific index as the write index for an alias: + +```json +POST /_aliases +{ + "actions" : [ + { + "add" : { + "index" : "test", + "alias" : "alias1", + "is_write_index" : true + } + }, + { + "add" : { + "index" : "test2", + "alias" : "alias1" + } + } + ] +} +``` + +This configuration directs write operations to the designated write index when an alias points to multiple indices. + +To swap which index is the write index for an alias: + +```json +POST /_aliases +{ + "actions" : [ + { + "add" : { + "index" : "test", + "alias" : "alias1", + "is_write_index" : false + } + }, { + "add" : { + "index" : "test2", + "alias" : "alias1", + "is_write_index" : true + } + } + ] +} +``` + +Aliases that do not explicitly set `is_write_index: true` for an index, and only reference one index, will have that referenced index behave as if it is the write index until an additional index is referenced. At that point, there will be no write index and writes will be rejected. + +## API Endpoints + +### Add a Single Alias + +``` +PUT /{index}/_alias/{name} +``` + +Parameters: +- `index`: Target index (supports patterns and multiple indices) +- `name`: Alias name (required) +- `routing`: Optional routing value +- `filter`: Optional filter query + +You can also use the plural `_aliases`. 
+ +### Retrieve Existing Aliases + +``` +GET /{index}/_alias/{alias} +``` + +Options: +- `index`: Index name (supports wildcards and multiple indices) +- `alias`: Alias name (supports wildcards and multiple names) +- `ignore_unavailable`: If true, ignore non-existent indices + +Examples: + +All aliases for the index logs_20162801: +``` +GET /logs_20162801/_alias/* +``` + +All aliases with the name 2016 in any index: +``` +GET /_alias/2016 +``` + +All aliases that start with 20 in any index: +``` +GET /_alias/20* +``` + +### Check Alias Existence + +``` +HEAD /{index}/_alias/{alias} +``` + +Examples: +``` +HEAD /_alias/2016 +HEAD /_alias/20* +HEAD /logs_20162801/_alias/* +``` + +### Delete an Alias + +``` +DELETE /{index}/_alias/{name} +``` + +## Aliases During Index Creation + +Specify aliases when creating a new index: + +```json +PUT /logs_20162801 +{ + "mappings" : { + "properties" : { + "year" : {"type" : "integer"} + } + }, + "aliases" : { + "current_day" : {}, + "2016" : { + "filter" : { + "term" : {"year" : 2016 } + } + } + } +} +``` + +# Elasticsearch Index Management + +## Index Modules and Settings + +Index Modules in Elasticsearch are responsible for controlling various aspects of an index. Each index has its own set of modules and associated settings. These settings can be categorized into two types: static and dynamic. + +### Static Index Settings + +Static settings can only be set during index creation or on a closed index. Changing these settings on a closed index may lead to incorrect configurations that can only be fixed by deleting and recreating the index. + +Key static settings include: + +1. **Number of Shards** (`index.number_of_shards`) + - Default: 5 (maximum: 1024 shards per index) + - The maximum can be modified using: `export ES_JAVA_OPTS="-Des.index.max_number_of_shards=128"` + +2. 
**Shard Check on Startup** (`index.shard.check_on_startup`) + - Default: `false` + - Options: `checksum`, `true`, `fix` (deprecated, same as `false`) + - Note: This is an expert-level setting and may significantly impact startup time for large indices. + +3. **Index Codec** (`index.codec`) + - Options: `default`, `best_compression` + +4. **Routing Partition Size** (`index.routing_partition_size`) + +5. **Load Fixed Bitset Filters Eagerly** (`index.load_fixed_bitset_filters_eagerly`) + +### Dynamic Index Settings + +Dynamic settings can be changed on a live index using the update index settings API. Some key dynamic settings are: + +1. **Number of Replicas** (`index.number_of_replicas`) + +2. **Auto-expand Replicas** (`index.auto_expand_replicas`) + - Options: a dash-delimited range such as `0-5` or `0-all` (`all` as the upper bound), or `false` to disable (default) + +3. **Refresh Interval** (`index.refresh_interval`) + - Examples: `1s`, `-1` + +4. **Max Result Window** (`index.max_result_window`) + - Default: 10000 + - Affects `from + size` in search requests + +5. **Max Inner Result Window** (`index.max_inner_result_window`) + - Default: 100 + +6. **Max Rescore Window** (`index.max_rescore_window`) + - Maximum `window_size` for `rescore` requests; defaults to `index.max_result_window` (10000) + +7. **Read-only Settings** + - `index.blocks.read_only`: Prevents write operations + - `index.blocks.read_only_allow_delete`: Similar to read-only, but allows index deletion + - `index.blocks.read`: Disables read operations + - `index.blocks.write`: Disables write operations + - `index.blocks.metadata`: Disables metadata operations + +8. **Shard Allocation and Rebalancing** + - `index.routing.allocation.enable`: Controls shard allocation + - Options: `all` (default), `primaries`, `new_primaries`, `none` + - `index.routing.rebalance.enable`: Enables shard rebalancing + - Options: `all` (default), `primaries`, `replicas`, `none` + +9. 
**Other Notable Settings** + - `index.max_docvalue_fields_search`: Limits `docvalue_fields` in search requests + - `index.max_script_fields`: Limits `script_fields` in search requests (default: 32) + - `index.max_ngram_diff`: Sets maximum ngram difference (default: 1) + - `index.max_shingle_diff`: Sets maximum shingle difference (default: 3) + - `index.max_refresh_listeners`: Limits concurrent refresh listeners + - `index.highlight.max_analyzed_offset`: Sets maximum number of characters analyzed for highlighting + - `index.max_terms_count`: Maximum number of terms in a terms query (default: 65536) + - `index.gc_deletes`: Sets the duration for retaining deleted documents (default: 60s) + - `index.max_regex_length`: Sets maximum length of regex in a regexp query (default: 1000) + - `index.default_pipeline`: Sets the default ingest pipeline for the index + +## Additional Index Modules + +Elasticsearch provides various other index modules with their own specific settings. These modules cater to different aspects of index management and functionality. For more detailed information on these modules, refer to the Elasticsearch documentation. + +## Best Practices + +1. Exercise caution when modifying static settings, especially on production indices. +2. Regularly review and optimize dynamic settings based on your use case and performance requirements. +3. Be aware of the implications of enabling read-only modes and their impact on write operations and index management. +4. When using advanced features like custom routing or sharding, ensure you understand their effects on index performance and scalability. + +# Understanding Text Analysis in Elasticsearch + +## Introduction to Analysis + +Text analysis is a crucial process in Elasticsearch that transforms raw text into searchable tokens. This process occurs during both indexing and searching, ensuring efficient and accurate retrieval of information. 
+ +## The Analysis Process + +Analysis involves breaking down text into individual terms, which are then added to an inverted index for quick searching. This process is handled by analyzers, which can be either built-in or custom-defined for each index. + +### Example of Analysis + +Consider the following sentence: + +``` +"The QUICK brown foxes jumped over the lazy dog!" +``` + +When processed by the built-in English analyzer, it undergoes several transformations: + +1. Tokenization: Breaks the text into individual words +2. Lowercasing: Converts all tokens to lowercase +3. Stopword removal: Eliminates common words like "the" +4. Stemming: Reduces words to their root form (e.g., "foxes" to "fox") + +The resulting tokens added to the inverted index would be: + +``` +[quick, brown, fox, jump, over, lazi, dog] +``` + +## Configuring Analysis + +### Index-Time Analysis + +You can specify an analyzer for each text field in your mapping: + +```json +PUT my_index +{ + "mappings": { + "_doc": { + "properties": { + "title": { + "type": "text", + "analyzer": "standard" + } + } + } + } +} +``` + +If no analyzer is specified, Elasticsearch looks for a `default` analyzer in the index settings. If none is found, it uses the `standard` analyzer. + +### Search-Time Analysis + +The same analysis process applies to search queries, ensuring that the query terms match the indexed terms. For example, searching for "a quick fox" would be analyzed into: + +``` +[quick, fox] +``` + +This allows for matching even when the exact words differ (e.g., "quick" vs. "QUICK", "fox" vs. "foxes"). + +### Determining the Search Analyzer + +Elasticsearch determines which analyzer to use for searching in the following order: + +1. Analyzer specified in the query itself +2. `search_analyzer` mapping parameter +3. `analyzer` mapping parameter +4. `default_search` analyzer in index settings +5. `default` analyzer in index settings +6. 
`standard` analyzer + +## Best Practices + +- Use the same analyzer at index and search time for consistency +- Choose analyzers based on the language and specific requirements of your data +- Consider custom analyzers for specialized text processing needs + +# Customizing Shard Allocation in Elasticsearch + +Elasticsearch provides powerful mechanisms to control where shards of an index are allocated within your cluster. This feature, known as shard allocation filtering, allows you to fine-tune the distribution of your data across nodes based on various criteria. + +## Node Attributes: The Building Blocks + +Before diving into allocation rules, it's crucial to understand node attributes. These are custom metadata tags you can assign to your Elasticsearch nodes, providing a flexible way to categorize them. For example: + +```bash +bin/elasticsearch -Enode.attr.rack=rack1 -Enode.attr.size=big +``` + +You can set these attributes either via command line arguments or in the `elasticsearch.yml` configuration file. + +## Crafting Allocation Rules + +With node attributes in place, you can create allocation rules using the `index.routing.allocation.*` settings. These settings come in three flavors: + +1. `include`: Allows shards on nodes with matching attributes +2. `exclude`: Prevents shards from being allocated to nodes with matching attributes +3. `require`: Mandates that shards must be on nodes with the specified attributes + +### Examples in Action + +Let's explore some practical scenarios: + +1. Allocate to specific node types: + +```json +PUT test/_settings +{ + "index.routing.allocation.include.size": "big,medium" +} +``` + +This allocates shards of `test` index to nodes tagged as either `big` or `medium`. + +2. Avoid certain nodes: + +```json +PUT test/_settings +{ + "index.routing.allocation.exclude.size": "small" +} +``` + +This prevents `test` index shards from being allocated to nodes tagged as `small`. + +3. 
Strict allocation requirements: + +```json +PUT test/_settings +{ + "index.routing.allocation.include.size": "big", + "index.routing.allocation.include.rack": "rack1" +} +``` + +This ensures `test` index shards are only on `big` nodes in `rack1`. + +## Beyond Custom Attributes + +Elasticsearch also provides built-in attributes for even more granular control: + +| Attribute | Description | +|-----------|-------------| +| `_name` | Target specific nodes by name | +| `_host_ip` | Match nodes by their host IP address (IP associated with hostname) | +| `_publish_ip` | Filter based on the node's publish IP address | +| `_ip` | Matches either `_host_ip` or `_publish_ip` | +| `_host` | Select nodes by hostname | + +These can be used just like custom attributes in your allocation rules. + +### Wildcard Magic + +For added flexibility, attribute values support wildcards. For instance: + +```json +PUT test/_settings +{ + "index.routing.allocation.include._ip": "192.168.2.*" +} +``` + +This targets all nodes with IP addresses in the 192.168.2.0/24 subnet. + +## Key Considerations + +- Multiple rules combine with AND logic – all conditions must be satisfied. +- If no suitable nodes match your criteria, shard movement will not occur. +- These settings are dynamic, allowing you to adjust allocation rules on live indices. +- The per-index shard allocation filters work in conjunction with the cluster-wide allocation filters explained in Cluster Level Shard Allocation. + +# Optimizing Node Departure Handling in Elasticsearch + +When a node unexpectedly leaves an Elasticsearch cluster, the system's default response can sometimes lead to unnecessary strain. This document outlines the process, potential issues, and how to optimize cluster behavior in such scenarios. + +## Default Cluster Reaction + +Upon detecting a node's departure, Elasticsearch typically: + +1. Elevates replica shards to primary status to replace lost primaries +2. 
Allocates new replicas to maintain redundancy (if sufficient nodes are available) +3. Redistributes shards for optimal balance across remaining nodes + +While this approach prioritizes data integrity by ensuring every shard is fully replicated as soon as possible, it can inadvertently cause significant cluster load, especially if the node's absence is temporary. + +## The "Shard Shuffle" Problem + +Consider this scenario: + +1. Node 5 experiences a brief network disconnection +2. Cluster promotes replicas and allocates new ones +3. Substantial data is copied across the network +4. Cluster undergoes rebalancing +5. Node 5 reconnects shortly after +6. Another rebalancing occurs to incorporate Node 5 + +This sequence can lead to unnecessary data transfer and processing, particularly if Node 5's absence was brief. Had the cluster simply waited a short while, the missing shards could have been reassigned to Node 5 with minimal network traffic; that recovery would be even quicker for idle shards (those not receiving indexing requests) which have been automatically sync-flushed. + +## Delayed Allocation: A Smarter Approach + +Elasticsearch offers a solution: delayed allocation of unassigned replica shards. This feature is controlled by the `index.unassigned.node_left.delayed_timeout` setting, which defaults to `1m` (one minute). + +### Configuring Delayed Allocation + +To modify this setting cluster-wide or for a specific index: + +```json +PUT _all/_settings +{ + "settings": { + "index.unassigned.node_left.delayed_timeout": "5m" + } +} +``` + +This example sets a 5-minute delay before reallocating shards from a departed node. + +### Revised Scenario with Delayed Allocation + +With this feature enabled, the process changes: + +1. Node 5 loses connectivity +2. Primaries are still immediately replaced +3. Cluster logs a delay message for unassigned shard allocation +4. Cluster status remains yellow due to unassigned replicas +5. 
If Node 5 returns before timeout expiration, replicas are quickly reinstated + +Note: This delay doesn't affect primary shard promotion or allocation of previously unassigned replicas. It also resets after a full cluster restart or master node failover. + +## Shard Relocation Cancellation + +If the delay timeout is exceeded and reallocation begins, but the original node rejoins with matching sync IDs, Elasticsearch will cancel the ongoing relocation in favor of the returning node's data. This is why the default timeout is set to just one minute: even if shard relocation begins, cancelling recovery in favor of the synced shard is relatively inexpensive. + +## Monitoring Delayed Allocations + +To check the number of shards affected by delayed allocation: + +``` +GET _cluster/health +``` + +Look for the `delayed_unassigned_shards` value in the response. + +## Handling Permanent Node Loss + +If a node won't be returning, you can trigger immediate shard allocation: + +```json +PUT _all/_settings +{ + "settings": { + "index.unassigned.node_left.delayed_timeout": "0" + } +} +``` + +This setting can be reverted once recovery begins. + +# Controlling Shard Distribution in Elasticsearch + +Elasticsearch's cluster-level shard allocator aims to distribute shards from a single index across multiple nodes. However, achieving perfect balance isn't always feasible, especially when dealing with varying numbers of shards, indices, and their respective sizes. + +## Index-Specific Shard Limit + +To manage shard distribution more precisely, Elasticsearch offers a dynamic setting that enforces a maximum number of shards from a single index on any given node: + +``` +index.routing.allocation.total_shards_per_node +``` + +This setting allows you to fine-tune shard allocation on a per-index basis. 
+ +## Global Shard Limit + +For a broader approach to shard management, you can set a cluster-wide limit on the number of shards per node, regardless of which index they belong to: + +``` +cluster.routing.allocation.total_shards_per_node +``` + +This global setting helps maintain overall cluster balance. + +## Important Considerations + +Both of these configurations impose strict limits on shard allocation. As a result, some shards may remain unallocated if the specified limits are reached. It's crucial to use these settings with caution, as they can significantly impact your cluster's performance and data distribution. + +## Additional Resources + +For those new to Elasticsearch and the ELK stack, consider exploring these popular introductory videos: + +1. Video: Get Started with Elasticsearch +2. Video: Intro to Kibana +3. Video: ELK for Logs & Metrics + +These resources provide valuable insights into the fundamentals of the Elasticsearch ecosystem and its applications in log and metric analysis. + +# Understanding Elasticsearch Mappings + +## Introduction to Mappings + +Mappings in Elasticsearch define how documents and their fields are stored and indexed. This crucial process determines the searchability and analysis of your data. Key aspects of mapping include: + +- Designating full-text fields +- Specifying field types (e.g., numbers, dates, geolocations) +- Configuring the catch-all `_all` field indexing +- Setting date value formats +- Establishing rules for dynamically added fields + +## Mapping Types and Their Evolution + +Historically, each index could have multiple mapping types. However, this feature has been deprecated since version 6.0.0. Currently: + +- Every index has a single mapping type +- This type defines the document's indexing structure +- Standard mapping type components include: + - `_index` + - `_type` + - `_id` + - `_source` + - `properties` + +For more information on this change, refer to the "Removal of mapping types" documentation. 
+ +## Field Data Types + +Elasticsearch supports a variety of field data types to accommodate different kinds of information: + +1. Simple types: + - `text`, `keyword`, `date`, `long`, `double`, `boolean`, `ip` +2. Hierarchical types (for JSON-like structures): + - `object`, `nested` +3. Specialized types: + - `geo_point`, `geo_shape`, `completion` + +### Multi-fields + +Fields can be indexed in multiple ways to serve different purposes. For example: +- A string field could be indexed as both `text` (for full-text search) and `keyword` (for sorting and aggregations) +- Text can be analyzed using different analyzers (e.g., `standard`, `english`, `french`) + +This flexibility is achieved through the `fields` parameter, supported by most data types. + +## Preventing Mapping Explosion + +To avoid potential out-of-memory errors caused by an excessive number of field mappings (mapping explosion), Elasticsearch provides several limiting settings: + +1. `index.mapping.total_fields.limit`: Caps the total number of fields (default: 1000) +2. `index.mapping.depth.limit`: Restricts the depth of nested objects (default: 20) +3. `index.mapping.nested_fields.limit`: Limits the number of `nested` fields (default: 50) + +These safeguards are particularly important when using dynamic mappings, where new fields are automatically added as documents are indexed. + +## Dynamic vs. Explicit Mappings + +### Dynamic Mapping + +Elasticsearch can automatically detect and map new fields as they're indexed. 
This feature: +- Allows for flexibility in document structure +- Works for top-level fields and nested objects +- Can be customized to fit specific needs + +### Explicit Mapping + +While dynamic mapping is convenient, explicit mapping offers more control: +- Allows you to define exact field types and properties +- Can be set during index creation or updated using the PUT mapping API +- Provides better optimization for your specific use case + +Note: Existing field mappings generally cannot be updated. To change a field's mapping, you typically need to create a new index and reindex your data. If you only wish to rename a field without changing its mapping, consider using an `alias` field. + +## Example: Creating an Index with Explicit Mapping + +```json +PUT my_index +{ + "mappings": { + "_doc": { + "properties": { + "title": { "type": "text" }, + "name": { "type": "text" }, + "age": { "type": "integer" }, + "created": { + "type": "date", + "format": "strict_date_optional_time||epoch_millis" + } + } + } + } +} +``` + +This example demonstrates: +1. Creating an index named `my_index` +2. Defining a mapping type `_doc` +3. Specifying properties for fields: + - `title` and `name` as text fields + - `age` as an integer + - `created` as a date field with two possible formats + +# Understanding Elasticsearch Shards and Merging + +## Shard Structure +Elasticsearch shards are composed of Lucene indexes, which are further divided into segments. These segments serve as the fundamental storage units within the index, containing the actual index data. One key characteristic of segments is their immutability. + +## Segment Merging +To maintain optimal index size and remove deleted entries, Elasticsearch periodically combines smaller segments into larger ones. This process, known as merging, employs auto-throttling to balance resource utilization between merging operations and other critical tasks like search functionality. 
+ +## Merge Scheduler + +The Concurrent Merge Scheduler oversees merge operations, executing them as needed. Key points about the merge scheduler include: + +1. Merge operations run in dedicated threads. +2. When the maximum thread count is reached, subsequent merges are queued. +3. A dynamic setting controls the maximum thread count: + + ``` + index.merge.scheduler.max_thread_count + ``` + +4. The default maximum thread count is calculated using the following formula: + + ```java + Math.max(1, Math.min(4, Runtime.getRuntime().availableProcessors() / 2)) + ``` + + This ensures at least 1 thread and at most 4 threads, depending on the available processors. + +# Elasticsearch Similarity Module + +The Similarity module in Elasticsearch defines how matching documents are scored and ranked. This feature operates on a per-field basis, allowing different similarity algorithms to be applied to different fields through mapping configurations. + +## Configuring Custom Similarities + +While the built-in similarities are generally sufficient for most use cases, advanced users can configure custom similarities. These configurations are set via index settings when creating or updating an index. + +Example configuration: + +```json +PUT /index +{ + "settings": { + "index": { + "similarity": { + "my_similarity": { + "type": "DFR", + "basic_model": "g", + "after_effect": "l", + "normalization": "h2", + "normalization.h2.c": "3.0" + } + } + } + } +} +``` + +To apply the custom similarity to a specific field: + +```json +PUT /index/_mapping/_doc +{ + "properties": { + "title": { "type": "text", "similarity": "my_similarity" } + } +} +``` + +## Available Similarity Algorithms + +### 1. 
BM25 Similarity (Default) + +- Type name: `BM25` +- Based on TF/IDF +- Suitable for short fields like names +- Options: + - `k1`: Controls non-linear term frequency normalization (default: 1.2) + - `b`: Controls document length normalization of tf values (default: 0.75) + - `discount_overlaps`: Ignores overlap tokens when computing norms (default: true) + +### 2. Classic Similarity + +- Type name: `classic` +- Deprecated since version 6.3.0 +- Based on TF/IDF model +- Option: `discount_overlaps` + +### 3. DFR Similarity + +- Type name: `DFR` +- Implements the divergence from randomness framework +- Options: + - `basic_model`: be, d, g, if, in, ine, p + - `after_effect`: no, b, l + - `normalization`: no, h1, h2, h3, z + +### 4. DFI Similarity + +- Type name: `DFI` +- Implements the divergence from independence model +- Option: `independence_measure` (values: standardized, saturated, chisquared) + +### 5. IB Similarity + +- Type name: `IB` +- Information-based model +- Options: + - `distribution`: ll, spl + - `lambda`: df, ttf + - `normalization`: Same as DFR similarity + +### 6. LM Dirichlet Similarity + +- Type name: `LMDirichlet` +- Option: `mu` (default: 2000) + +### 7. LM Jelinek Mercer Similarity + +- Type name: `LMJelinekMercer` +- Option: `lambda` (default: 0.1, optimal: 0.1 for title queries, 0.7 for long queries) + +### 8. 
Scripted Similarity + +- Type name: `scripted` +- Allows custom score computation using scripts +- Components: + - `script`: Computes the score + - `weight_script`: Optional, computes document-independent part of the score + +Example of a scripted TF-IDF implementation: + +```json +PUT /index +{ + "settings": { + "number_of_shards": 1, + "similarity": { + "scripted_tfidf": { + "type": "scripted", + "script": { + "source": "double tf = Math.sqrt(doc.freq); double idf = Math.log((field.docCount+1.0)/(term.docFreq+1.0)) + 1.0; double norm = 1/Math.sqrt(doc.length); return query.boost * tf * idf * norm;" + } + } + } + }, + "mappings": { + "_doc": { + "properties": { + "field": { + "type": "text", + "similarity": "scripted_tfidf" + } + } + } + } +} +``` + +Note: Scripted similarities must follow specific rules to ensure correct functionality: +- Returned scores must be positive. +- All other variables remaining equal, scores must not decrease when `doc.freq` increases. +- All other variables remaining equal, scores must not increase when `doc.length` increases. + +For improved efficiency, you can use a `weight_script` to compute the document-independent part of the score: + +```json +PUT /index +{ + "settings": { + "number_of_shards": 1, + "similarity": { + "scripted_tfidf": { + "type": "scripted", + "weight_script": { + "source": "double idf = Math.log((field.docCount+1.0)/(term.docFreq+1.0)) + 1.0; return query.boost * idf;" + }, + "script": { + "source": "double tf = Math.sqrt(doc.freq); double norm = 1/Math.sqrt(doc.length); return weight * tf * norm;" + } + } + } + }, + "mappings": { + "_doc": { + "properties": { + "field": { + "type": "text", + "similarity": "scripted_tfidf" + } + } + } + } +} +``` + +## Changing the Default Similarity + +To change the default similarity for all fields in an index: + +1. When creating the index: + +```json +PUT /index +{ + "settings": { + "index": { + "similarity": { + "default": { + "type": "boolean" + } + } + } + } +} +``` + +2. 
For an existing index: + +```json +POST /index/_close + +PUT /index/_settings +{ + "index": { + "similarity": { + "default": { + "type": "boolean" + } + } + } +} + +POST /index/_open +``` + +# Elasticsearch Slow Logs + +Elasticsearch provides two types of slow logs to help monitor and optimize performance: Search Slow Log and Index Slow Log. These logs allow you to track queries, fetch operations, and indexing operations that exceed specified time thresholds. + +## Search Slow Log + +The Search Slow Log records slow search operations at the shard level, including both query and fetch phases. + +### Configuration + +You can set thresholds for different logging levels (warn, info, debug, trace) for both query and fetch phases. Here's an example configuration: + +```yaml +index.search.slowlog.threshold.query.warn: 10s +index.search.slowlog.threshold.query.info: 5s +index.search.slowlog.threshold.query.debug: 2s +index.search.slowlog.threshold.query.trace: 500ms + +index.search.slowlog.threshold.fetch.warn: 1s +index.search.slowlog.threshold.fetch.info: 800ms +index.search.slowlog.threshold.fetch.debug: 500ms +index.search.slowlog.threshold.fetch.trace: 200ms + +index.search.slowlog.level: info +``` + +These settings are dynamic and can be updated for each index using the Update Indices Settings API: + +```json +PUT /twitter/_settings +{ + "index.search.slowlog.threshold.query.warn": "10s", + "index.search.slowlog.threshold.query.info": "5s", + "index.search.slowlog.threshold.query.debug": "2s", + "index.search.slowlog.threshold.query.trace": "500ms", + "index.search.slowlog.threshold.fetch.warn": "1s", + "index.search.slowlog.threshold.fetch.info": "800ms", + "index.search.slowlog.threshold.fetch.debug": "500ms", + "index.search.slowlog.threshold.fetch.trace": "200ms", + "index.search.slowlog.level": "info" +} +``` + +### Important Notes + +- By default, all thresholds are disabled (set to `-1`). 
+- Not all levels need to be configured; you can set only the ones you need. +- Logging occurs at the shard level, providing insights into execution on specific machines. + +### Log File Configuration + +The Search Slow Log file is configured in `log4j2.properties`. Here's the default configuration: + +```properties +appender.index_search_slowlog_rolling.type = RollingFile +appender.index_search_slowlog_rolling.name = index_search_slowlog_rolling +appender.index_search_slowlog_rolling.fileName = ${sys:es.logs}_index_search_slowlog.log +appender.index_search_slowlog_rolling.layout.type = PatternLayout +appender.index_search_slowlog_rolling.layout.pattern = [%d{ISO8601}][%-5p][%-25c] [%node_name]%marker %.-10000m%n +appender.index_search_slowlog_rolling.filePattern = ${sys:es.logs}_index_search_slowlog-%d{yyyy-MM-dd}.log +appender.index_search_slowlog_rolling.policies.type = Policies +appender.index_search_slowlog_rolling.policies.time.type = TimeBasedTriggeringPolicy +appender.index_search_slowlog_rolling.policies.time.interval = 1 +appender.index_search_slowlog_rolling.policies.time.modulate = true + +logger.index_search_slowlog_rolling.name = index.search.slowlog +logger.index_search_slowlog_rolling.level = trace +logger.index_search_slowlog_rolling.appenderRef.index_search_slowlog_rolling.ref = index_search_slowlog_rolling +logger.index_search_slowlog_rolling.additivity = false +``` + +## Index Slow Log + +The Index Slow Log is similar to the Search Slow Log but focuses on indexing operations. + +### Configuration + +You can set thresholds for different logging levels for indexing operations. 
Here's an example configuration: + +```yaml +index.indexing.slowlog.threshold.index.warn: 10s +index.indexing.slowlog.threshold.index.info: 5s +index.indexing.slowlog.threshold.index.debug: 2s +index.indexing.slowlog.threshold.index.trace: 500ms +index.indexing.slowlog.level: info +index.indexing.slowlog.source: 1000 +``` + +These settings can be updated dynamically using the Update Indices Settings API: + +```json +PUT /twitter/_settings +{ + "index.indexing.slowlog.threshold.index.warn": "10s", + "index.indexing.slowlog.threshold.index.info": "5s", + "index.indexing.slowlog.threshold.index.debug": "2s", + "index.indexing.slowlog.threshold.index.trace": "500ms", + "index.indexing.slowlog.level": "info", + "index.indexing.slowlog.source": "1000" +} +``` + +### Additional Settings + +- `index.indexing.slowlog.source`: Controls how much of the `_source` field is logged (default: 1000 characters). + - Set to `false` or `0` to skip logging the source entirely. + - Set to `true` to log the entire source regardless of size. +- `index.indexing.slowlog.reformat`: Controls whether the source is reformatted (default: true). + - Set to `false` to log the source "as is", potentially spanning multiple log lines. + +### Log File Configuration + +The Index Slow Log file is configured in `log4j2.properties`. 
Here's the default configuration: + +```properties +appender.index_indexing_slowlog_rolling.type = RollingFile +appender.index_indexing_slowlog_rolling.name = index_indexing_slowlog_rolling +appender.index_indexing_slowlog_rolling.fileName = ${sys:es.logs}_index_indexing_slowlog.log +appender.index_indexing_slowlog_rolling.layout.type = PatternLayout +appender.index_indexing_slowlog_rolling.layout.pattern = [%d{ISO8601}][%-5p][%-25c] [%node_name]%marker %.-10000m%n +appender.index_indexing_slowlog_rolling.filePattern = ${sys:es.logs}_index_indexing_slowlog-%d{yyyy-MM-dd}.log +appender.index_indexing_slowlog_rolling.policies.type = Policies +appender.index_indexing_slowlog_rolling.policies.time.type = TimeBasedTriggeringPolicy +appender.index_indexing_slowlog_rolling.policies.time.interval = 1 +appender.index_indexing_slowlog_rolling.policies.time.modulate = true + +logger.index_indexing_slowlog.name = index.indexing.slowlog.index +logger.index_indexing_slowlog.level = trace +logger.index_indexing_slowlog.appenderRef.index_indexing_slowlog_rolling.ref = index_indexing_slowlog_rolling +logger.index_indexing_slowlog.additivity = false +``` + +# Elasticsearch Storage Configuration + +## Overview + +Elasticsearch provides flexible options for controlling how index data is stored and accessed on disk. This document outlines the various storage types available and how to configure them. + +## File System Storage Types + +Elasticsearch offers multiple file system implementations, known as storage types. By default, it selects the most suitable implementation based on the operating environment. 
+ +### Configuring Storage Types + +#### Global Configuration + +To set a storage type for all indices, add the following to your `config/elasticsearch.yml` file: + +```yaml +index.store.type: niofs +``` + +#### Per-Index Configuration + +You can also specify the storage type for individual indices at creation time: + +```json +PUT /my_index +{ + "settings": { + "index.store.type": "niofs" + } +} +``` + +**Note**: Configuring storage types is considered an expert-level setting and may be subject to change in future releases. + +### Available Storage Types + +Elasticsearch supports the following storage types: + +1. `fs` +2. `mmapfs` (Lucene `MMapDirectory`) +3. `simplefs` (Lucene `SimpleFsDirectory`) +4. `niofs` (Lucene `NIOFSDirectory`) +5. `hybridfs` + +`fs` is the default and selects the most suitable implementation for the operating environment. +`hybridfs` is a hybrid of `niofs` and `mmapfs`. + +### Memory-Mapped Storage Restrictions + +The use of `mmapfs` and the related `hybridfs` storage types can be controlled using the `node.store.allow_mmap` setting. This boolean setting determines whether memory-mapping is allowed, with the default being `true`. + +To disable memory-mapping: + +```yaml +node.store.allow_mmap: false +``` + +This setting is particularly useful in environments where you cannot control the creation of memory maps and need to disable memory-mapping capabilities. + +# Understanding Translog in Elasticsearch + +## Introduction to Translog + +In Elasticsearch, a translog is a crucial component that ensures data durability and aids in recovery processes. It addresses the gap between Lucene commits, which are resource-intensive operations that persist changes to disk. + +## Purpose and Functionality + +1. **Data Persistence**: The translog records all index and delete operations that occur between Lucene commits. +2. **Recovery Mechanism**: In case of unexpected shutdowns or hardware failures, the translog allows Elasticsearch to recover recent, acknowledged transactions that haven't been included in the last Lucene commit. + +## Flush Operation + +A flush in Elasticsearch involves two main actions: +1. 
Performing a Lucene commit +2. Starting a new translog + +Flushes occur automatically to prevent the translog from growing excessively large, which could lead to prolonged recovery times. While manual flushing is possible via API, it's rarely necessary. + +## Translog Settings + +Translog behavior can be fine-tuned using several dynamically updatable per-index settings: + +- `index.translog.sync_interval`: Defaults to 5s (minimum: 100ms) +- `index.translog.durability`: + - `async`: Fsync and commit every 5 seconds + - `request` (default): Fsync and commit after every index, delete, update, or bulk request +- `index.translog.flush_threshold_size`: Defaults to 512mb +- `index.translog.retention.size` +- `index.translog.retention.age`: Defaults to 12h + +## Handling Translog Corruption + +In rare cases of translog corruption, Elasticsearch provides a tool for recovery: + +```bash +bin/elasticsearch-translog truncate -d /path/to/translog/directory +``` + +Important notes: +- This tool is deprecated and will be removed in Elasticsearch 7.0 +- Use `elasticsearch-shard` tool instead +- Stop Elasticsearch before running this tool +- Running this tool will result in data loss from the translog + +The tool will display warnings and ask for confirmation before proceeding. It will then remove existing translog files and create a new empty checkpoint and translog. """ \ No newline at end of file diff --git a/TransformationPlayground/playground/transform_expert/prompting/knowledge/es_7_10.py b/TransformationPlayground/playground/transform_expert/prompting/knowledge/es_7_10.py new file mode 100644 index 000000000..045cf33c6 --- /dev/null +++ b/TransformationPlayground/playground/transform_expert/prompting/knowledge/es_7_10.py @@ -0,0 +1,1897 @@ +INDEX_GUIDANCE = """ + +""" + +INDEX_KNOWLEDGE = """ + +# Create Index API + +## Overview + +The Create Index API allows you to add a new index to your Elasticsearch cluster. 
This powerful feature enables you to customize various aspects of your index, including its settings, mappings, and aliases. + +## API Endpoint + +``` +PUT /<index> +``` + +Replace `<index>` with your desired index name. + +## Prerequisites + +To use this API, you must have either the `create_index` or `manage` index privilege if Elasticsearch security features are enabled. + +## Index Naming Rules + +When choosing a name for your index, adhere to these guidelines: + +- Use lowercase letters only +- Avoid these characters: `\`, `/`, `*`, `?`, `"`, `<`, `>`, `|`, ` ` (space), `,`, `#` +- Don't start with `-`, `_`, or `+` +- Avoid `.` or `..` as standalone names +- Keep names under 255 bytes (note: multi-byte characters count faster towards this limit) +- Avoid names starting with `.` (except for hidden and plugin-managed internal indices) + +Note: Index names containing `:` (colon) are deprecated since version 7.0. + +## Request Body + +The request body can include the following optional components: + +1. `settings`: Configure index-specific settings +2. `mappings`: Define field types and properties +3. `aliases`: Set up index aliases + +### Index Settings + +Customize your index configuration using the `settings` object: + +```json +{ + "settings": { + "number_of_shards": 3, + "number_of_replicas": 2 + } +} +``` + +- `number_of_shards`: Defaults to 1 if not specified +- `number_of_replicas`: Defaults to 1 (one replica per primary shard) if not specified + +For more detailed settings, refer to the index modules documentation. + +### Mappings + +Define the structure of your documents using the `mappings` object: + +```json +{ + "mappings": { + "properties": { + "field1": { "type": "text" } + } + } +} +``` + +Note: Prior to Elasticsearch 7.0, mappings included a type name. While deprecated, you can still include a type by setting the `include_type_name` parameter. 
+ +### Aliases + +Create index aliases within the `aliases` object: + +```json +{ + "aliases": { + "alias_1": {}, + "alias_2": { + "filter": { + "term": { "user.id": "kimchy" } + }, + "routing": "shard-1" + } + } +} +``` + +## Query Parameters + +- `include_type_name` (Optional): Set to `true` to include a type name in mappings (deprecated) +- `wait_for_active_shards` (Optional, string): Specify the number of active shard copies required before proceeding. Options: + - A positive integer up to the total number of shards (`number_of_replicas+1`) + - `"all"` + Default: 1 (the primary shard) +- `master_timeout` (Optional): Specify how long to wait for a connection to the master node. Default: 30s +- `timeout` (Optional): Set the operation timeout + +## Response + +A successful index creation returns a JSON response: + +```json +{ + "acknowledged": true, + "shards_acknowledged": true, + "index": "test" +} +``` + +- `acknowledged`: Indicates if the index was successfully created in the cluster +- `shards_acknowledged`: Shows if the required number of shard copies were started before timeout +- `index`: The name of the created index + +Note: Even if `acknowledged` or `shards_acknowledged` is `false`, the index creation might still succeed but not complete before the timeout. These values indicate whether the operation completed before the timeout. + +## Waiting for Active Shards + +Control the number of active shards to wait for using either: + +1. Index setting `index.write.wait_for_active_shards`: + +```json +{ + "settings": { + "index.write.wait_for_active_shards": "2" + } +} +``` + +2. Request parameter `wait_for_active_shards`: + +``` +PUT /test?wait_for_active_shards=2 +``` + +Note that the `index.write.wait_for_active_shards` index setting (option 1) also affects the `wait_for_active_shards` value on all subsequent write operations. + +# Update Index Alias API + +This API allows you to add, remove, or modify index aliases in Elasticsearch, providing a flexible way to reference one or more indices. 
+ +## Overview + +An index alias is a secondary name for one or more indices. Most Elasticsearch APIs accept an alias in place of an index name, offering enhanced flexibility in data management. + +## API Endpoint + +``` +POST /_aliases +``` + +## Security Considerations + +When Elasticsearch security features are enabled, the following index privileges are required: + +- For `add` or `remove` actions: `manage` privilege on both the index and alias +- For `remove_index` action: `manage` privilege on the index + +## Request Body Structure + +The request body consists of an `actions` array, containing objects that define the operations to perform: + +```json +{ + "actions": [ + { + "action_type": { + "parameter1": "value1", + "parameter2": "value2" + } + } + ] +} +``` + +### Action Types + +1. `add`: Add an alias +2. `remove`: Remove an alias +3. `remove_index`: Remove an index (use with caution) + +### Common Parameters + +- `index` or `indices`: Specify the target index or indices (one of the two is required) +- `alias` or `aliases`: Specify alias name(s) (required for add/remove actions) +- `filter`: Apply a query filter to the alias +- `routing`: Set custom routing for the alias +- `is_write_index`: Designate an index as the write index for an alias +- `is_hidden`: If true, the alias is hidden +- `expand_wildcards`: Expand wildcard expressions to concrete indices +- `must_exist`: If true, the alias must exist to perform the action + +## Usage Examples + +### Adding an Alias + +```json +POST /_aliases +{ + "actions": [ + { "add": { "index": "my-index-000001", "alias": "alias1" } } + ] +} +``` + +### Removing an Alias + +```json +POST /_aliases +{ + "actions": [ + { "remove": { "index": "test1", "alias": "alias1" } } + ] +} +``` + +### Renaming an Alias + +```json +POST /_aliases +{ + "actions": [ + { "remove": { "index": "test1", "alias": "alias1" } }, + { "add": { "index": "test1", "alias": "alias2" } } + ] +} +``` + +### Multiple Index Association + +```json +POST 
/_aliases +{ + "actions": [ + { "add": { "indices": ["test1", "test2"], "alias": "alias1" } } + ] +} +``` + +You can also use wildcard patterns to associate an alias with multiple indices: + +```json +POST /_aliases +{ + "actions": [ + { "add": { "index": "test*", "alias": "all_test_indices" } } + ] +} +``` + +Note: This creates a point-in-time alias for currently matching indices and doesn't automatically update for future matching indices. + +## Advanced Features + +### Filtered Aliases + +Create aliases with query filters for customized index views: + +```json +POST /_aliases +{ + "actions": [ + { + "add": { + "index": "my-index-000001", + "alias": "alias2", + "filter": { "term": { "user.id": "kimchy" } } + } + } + ] +} +``` + +### Custom Routing + +Optimize shard operations by associating routing values with aliases: + +```json +POST /_aliases +{ + "actions": [ + { + "add": { + "index": "test", + "alias": "alias2", + "search_routing": "1,2", + "index_routing": "2" + } + } + ] +} +``` + +### Write Index + +Designate a specific index as the write target for an alias: + +```json +POST /_aliases +{ + "actions": [ + { + "add": { + "index": "test", + "alias": "alias1", + "is_write_index": true + } + }, + { + "add": { + "index": "test2", + "alias": "alias1" + } + } + ] +} +``` + +Only one index per alias can be assigned as the write index. If no write index is specified and multiple indices are referenced by an alias, writes will be rejected. + +## Best Practices + +1. Use descriptive alias names for clarity +2. Regularly review and update alias configurations +3. Be cautious when using `remove_index` action +4. Leverage filtered aliases for efficient data access patterns +5. Utilize the write index feature for controlled write operations +6. Avoid indexing to an alias that points to multiple indices +7. 
Be aware that swapping an index with an alias may cause temporary failures for in-flight requests + +# Elasticsearch Index Management + +## Index Modules + +Index Modules are specialized components that manage various aspects of an individual index in Elasticsearch. They play a crucial role in controlling index-specific behaviors and settings. + +## Index Settings + +Elasticsearch allows for granular control over index behavior through index-level settings. These settings come in two flavors: + +1. Static settings: Configured at index creation and cannot be changed afterwards. +2. Dynamic settings: Can be modified on live indices. + +> **Warning**: Modifying static or dynamic settings on a closed index may lead to irreconcilable configuration issues, potentially requiring index deletion and recreation. + +## Static Index Settings + +Here are the key static index settings not associated with specific index modules: + +### Primary Shards (`index.number_of_shards`) + +- **Default**: 1 +- **Description**: Defines the number of primary shards for an index. +- **Constraints**: + - Can only be set during index creation. + - Maximum of 1024 shards per index (adjustable via system property). + +### Routing Shards (`index.number_of_routing_shards`) + +- **Description**: Determines the number of routing shards used for index splitting. +- **Example**: An index with 5 shards and 30 routing shards allows splitting by factors of 2 or 3. +- **Default**: Calculated based on the number of primary shards to allow splitting up to 1024 shards. + +### Shard Corruption Check (`index.shard.check_on_startup`) + +- **Options**: + - `false`: No check + - `checksum`: Basic check + - `true`: Thorough check (CPU and memory intensive) +- **Note**: Checking large indices may be time-consuming. 
+ +### Compression Codec (`index.codec`) + +- **Options**: + - `default` + - `best_compression` + +### Other Static Settings + +- `index.routing_partition_size` +- `index.soft_deletes.enabled` +- `index.soft_deletes.retention_lease.period` (Default: 12h) +- `index.load_fixed_bitset_filters_eagerly` +- `index.hidden` + +## Dynamic Index Settings + +These settings can be adjusted on active indices: + +1. **Replica Management** + - `index.number_of_replicas`: Number of replica shards + - `index.auto_expand_replicas`: Automatic replica adjustment based on the number of data nodes, given as a dash-delimited lower and upper bound (e.g., "0-5", or "0-all" to use all nodes as the upper bound); Default: "false" (disabled) + +2. **Search and Indexing Behavior** + - `index.search.idle.after`: Idle shard detection time (Default: 30s) + - `index.refresh_interval`: Index refresh frequency (Default: 1s, -1 to disable) + - `index.max_result_window`: Maximum search result window (Default: 10000) + - `index.max_inner_result_window`: Inner hits limit (Default: 100) + - `index.max_rescore_window`: Rescore window size limit + - `index.max_docvalue_fields_search`: DocValue fields limit + - `index.max_script_fields`: Script fields limit (Default: 32) + - `index.max_ngram_diff`: NGram difference limit (Default: 1) + - `index.max_shingle_diff`: Shingle difference limit (Default: 3) + - `index.max_refresh_listeners`: Refresh listeners limit + - `index.analyze.max_token_count`: Token count limit for analysis + - `index.highlight.max_analyzed_offset`: Analyzed content limit for highlighting (Default: 1000000) + - `index.max_terms_count`: Terms query limit (Default: 65536) + - `index.max_regex_length`: Regex length limit (Default: 1000) + +3. **Routing and Allocation** + - `index.routing.allocation.enable`: Shard allocation control (Options: all, primaries, new_primaries, none) + - `index.routing.rebalance.enable`: Shard rebalancing control (Options: all, primaries, replicas, none) + +4. 
**Miscellaneous** + - `index.gc_deletes`: How long a deleted document's version number remains available for further versioned operations (Default: 60s) + - `index.default_pipeline`: Default ingest pipeline + - `index.final_pipeline`: Final ingest pipeline (Cannot modify `_index` field) + +# Understanding and Implementing Text Analysis in Elasticsearch + +## What is Text Analysis? + +Text analysis is a crucial process in Elasticsearch that transforms unstructured text into a structured format optimized for search operations. This process is applied to various types of content, such as email bodies or product descriptions, to enhance searchability and data retrieval. + +## When to Consider Text Analysis Configuration + +Elasticsearch automatically performs text analysis during the indexing of text fields and when searching through them. However, there are specific scenarios where configuring text analysis becomes particularly important: + +1. If your index contains `text` fields +2. When text searches aren't yielding expected results +3. For building a custom search engine +4. When mining unstructured data +5. To optimize searches for specific languages +6. For conducting lexicographic or linguistic research + +If your index doesn't include text fields, you may not need to configure text analysis further, and you can skip the pages in this section. + +## Key Components of Text Analysis + +To effectively implement and utilize text analysis in Elasticsearch, it's essential to understand and configure the following components: + +1. Analyzers +2. Tokenizers +3. Token filters +4. Character filters +5. Normalizers + +Each of these components plays a specific role in the text analysis process and can be customized to suit your particular use case. 
+ +## Exploring Text Analysis Further + +To deepen your understanding and effectively implement text analysis in Elasticsearch, consider exploring these topics: + +- Overview of text analysis concepts +- Step-by-step guide to configuring text analysis +- Reference guide for built-in analyzers +- Comprehensive tokenizer reference +- Detailed token filter reference +- In-depth character filters reference +- Understanding and using normalizers + +# Optimizing Shard Allocation in Elasticsearch + +## Introduction to Index-Level Shard Allocation Filtering + +Elasticsearch provides powerful tools for controlling the distribution of shards across your cluster. By leveraging index-level shard allocation filters, you can fine-tune where Elasticsearch places shards for specific indices. These filters work in tandem with cluster-wide allocation settings and allocation awareness features to give you granular control over your data distribution. + +## Filter Types and Attributes + +Shard allocation filters can be based on: + +1. Custom node attributes +2. Built-in attributes: + - `_name`: Node name + - `_host_ip`: Host IP address + - `_publish_ip`: Publish IP address + - `_ip`: Either host IP or publish IP + - `_host`: Hostname + - `_id`: Node ID + - `_tier`: Node's data tier role + - `_tier_preference`: Tier preference + +Index lifecycle management utilizes these filters, particularly those based on custom attributes, to orchestrate shard reallocation during phase transitions. + +## Dynamic Configuration + +One of the key advantages of shard allocation filters is their dynamic nature. The `cluster.routing.allocation` settings can be adjusted on-the-fly, allowing for live migration of indices between different sets of nodes. However, it's important to note that relocations only occur if they don't violate other routing constraints, such as the rule preventing primary and replica shards from residing on the same node. 
+ +## Use Case Example + +For instance, you could use a custom node attribute to indicate a node's performance characteristics and use shard allocation filtering to route shards for a particular index to the most appropriate class of hardware. + +## Implementing Index-Level Shard Allocation Filtering + +To implement filtering based on custom node attributes, follow these steps: + +1. Define custom node attributes in the `elasticsearch.yml` file or at node startup: + + ```yaml + node.attr.size: medium + ``` + + Or when starting a node: + + ``` + ./bin/elasticsearch -Enode.attr.size=medium + ``` + +2. Apply routing allocation filters to the index using one of three types: `include`, `exclude`, or `require`. + + Example: Allocate shards from the `test` index to `big` or `medium` nodes: + + ```json + PUT test/_settings + { + "index.routing.allocation.include.size": "big,medium" + } + ``` + + When using multiple filters, all conditions must be met simultaneously: + - All `require` conditions must be satisfied + - No `exclude` conditions can be satisfied + - At least one `include` condition must be satisfied + + Example: Move the `test` index to `big` nodes in `rack1`: + + ```json + PUT test/_settings + { + "index.routing.allocation.require.size": "big", + "index.routing.allocation.require.rack": "rack1" + } + ``` + +## Available Index Allocation Filter Settings + +- `index.routing.allocation.include.{attribute}` +- `index.routing.allocation.require.{attribute}` +- `index.routing.allocation.exclude.{attribute}` + +Replace `{attribute}` with the desired node attribute. + +## Advanced Usage: Wildcards and Tier Filtering + +Wildcards can be used when specifying attribute values: + +```json +PUT test/_settings +{ + "index.routing.allocation.include._ip": "192.168.2.*" +} +``` + +For `_tier` filtering, note that it's based on node roles. Only a subset of roles are considered data tier roles, and the generic data role will match any tier filtering. 
+ +# Optimizing Node Departure Handling in Elasticsearch + +When a node unexpectedly leaves an Elasticsearch cluster, the system's default response can sometimes lead to unnecessary strain. This document outlines the process, potential issues, and solutions for managing node departures more efficiently. + +## Default Behavior on Node Departure + +When a node exits the cluster, Elasticsearch typically: + +1. Elevates replica shards to primary status to replace lost primaries +2. Allocates new replica shards (if sufficient nodes are available) +3. Redistributes shards for optimal balance across remaining nodes + +While this approach safeguards against data loss, it can trigger a resource-intensive "shard shuffle" that may be unnecessary if the node's absence is temporary. + +## The Challenge of Premature Reallocation + +Consider this scenario: + +1. Node 5 experiences a network disconnection +2. Replica shards are promoted to primaries +3. New replicas are created on other nodes +4. Extensive data copying occurs across the network +5. Cluster undergoes rebalancing +6. Node 5 reconnects shortly after +7. Another rebalancing occurs to include Node 5 + +This process results in significant, potentially avoidable network traffic and processing load. + +## Delayed Allocation: A Smarter Approach + +Elasticsearch offers a solution through delayed allocation of unassigned replica shards. This feature is controlled by the `index.unassigned.node_left.delayed_timeout` setting. + +### Configuring Delayed Allocation + +To modify the delay timeout (default is 1 minute): + +```json +PUT _all/_settings +{ + "settings": { + "index.unassigned.node_left.delayed_timeout": "5m" + } +} +``` + +### Revised Scenario with Delayed Allocation + +With this setting in place, the process changes: + +1. Node 5 loses connectivity +2. Replica shards are promoted to primaries +3. Master node logs a delay message for unassigned shard allocation +4. 
Cluster status remains yellow due to unassigned replicas +5. If Node 5 returns before timeout expiration, replicas are quickly reallocated (sync-flushed shards recover almost immediately) + +Note: This setting doesn't affect primary shard promotion or initial replica assignments. It also doesn't come into effect after a full cluster restart. Additionally, in case of a master failover, the elapsed delay time is reset to the full initial delay. + +## Shard Relocation Cancellation + +If the delay timeout is exceeded: +- The master assigns missing shards to another node +- If the original node rejoins with matching sync-ids, any ongoing relocations are cancelled +- The existing synced shard is used for efficient recovery + +The default 1-minute timeout balances quick recovery with the potential for efficient cancellation. + +## Monitoring Delayed Allocations + +To check the number of delayed unassigned shards: + +``` +GET _cluster/health +``` + +Look for the `delayed_unassigned_shards` value in the response. + +## Handling Permanent Node Removal + +If a node won't return, immediately allocate missing shards by setting the timeout to zero: + +```json +PUT _all/_settings +{ + "settings": { + "index.unassigned.node_left.delayed_timeout": "0" + } +} +``` + +You can revert this setting once shard recovery begins. + +# Shard Allocation in Elasticsearch: Managing Total Shards per Node + +Elasticsearch's cluster-level shard allocator aims to distribute shards from a single index across multiple nodes. However, achieving an even distribution can be challenging, especially when dealing with numerous indices, varying shard sizes, or limited node resources. 
+ +## Configuring Shard Allocation Limits + +To address potential imbalances, Elasticsearch provides dynamic settings that allow you to set hard limits on shard allocation: + +### Index-Specific Limit + +``` +index.routing.allocation.total_shards_per_node +``` + +This setting restricts the total number of shards from a specific index that can be allocated to a single node. + +### Cluster-Wide Limit + +``` +cluster.routing.allocation.total_shards_per_node +``` + +This setting imposes a limit on the total number of shards (regardless of index) that can be allocated to each node. + +Both settings are dynamic and default to `-1`, which means unlimited shards per node. Specifically, this is defined as: + +``` +-1 +``` + +The value represents the maximum number of primary and replica shards allocated to each node. + +## Shard Allocation in Action + +Let's examine how these settings affect shard allocation with an example: + +Assume a cluster has `cluster.routing.allocation.total_shards_per_node` set to 100, with three nodes: + +- Node A: 100 shards +- Node B: 98 shards +- Node C: 1 shard + +If Node C fails, Elasticsearch will reallocate its shard to Node B, as allocating to Node A would exceed its 100-shard limit. + +## Important Considerations + +1. These settings impose hard limits that may result in some shards remaining unallocated. +2. Use these settings cautiously, as they can impact cluster performance and data availability. +3. Proper configuration requires understanding your cluster's resources and workload patterns. +4. Elasticsearch checks these settings during shard allocation to ensure the limits are not exceeded. + +# Data Tier Allocation in Elasticsearch + +## Understanding Index-Level Data Tier Allocation + +Elasticsearch provides a powerful feature for controlling the allocation of indices to specific data tiers. This functionality is implemented through the data tier allocator, which acts as a shard allocation filter. 
The allocator utilizes two built-in node attributes: + +1. `_tier` +2. `_tier_preference` + +These attributes are determined by the data node roles assigned to each node in your Elasticsearch cluster. + +## Data Node Roles and Tiers + +The following data node roles correspond to different data tiers: + +- `data_content` +- `data_hot` +- `data_warm` +- `data_cold` + +It's important to note that the general `data` role is not considered a valid data tier and cannot be used for data tier filtering. + +## Configuring Data Tier Allocation + +To control the allocation of indices to specific data tiers, you can use various index-level settings. These settings allow you to include, require, or exclude certain tiers for your indices. + +### Available Settings + +1. `index.routing.allocation.include._tier`: Specifies which tiers the index may be allocated to. +2. `index.routing.allocation.require._tier`: Defines the tier(s) that the index must be allocated to. +3. `index.routing.allocation.exclude._tier`: Indicates which tiers the index must not be allocated to. +4. `index.routing.allocation.include._tier_preference`: Sets the preferred order of tiers for allocation. + +### Examples + +- To allow allocation to either warm or hot tiers, with a preference for warm: + ``` + index.routing.allocation.include._tier_preference: data_warm,data_hot + ``` + +- To restrict allocation to only the warm tier: + ``` + index.routing.allocation.require._tier: data_warm + ``` + +- To ensure an index is only allocated to the hot tier: + ``` + index.routing.allocation.require._tier: data_hot + ``` + +## Node Configuration + +To designate a node's role and, consequently, its tier, use the `node.roles` setting in the node's configuration. + +# Understanding Elasticsearch Mapping + +Elasticsearch mapping is a crucial process that defines how documents and their fields are stored and indexed. 
This guide will explore the key aspects of mapping, including its definition, important settings, dynamic and explicit mapping, and how to manage mappings in your Elasticsearch cluster. + +## What is Mapping? + +Mapping in Elasticsearch allows you to: + +- Specify which string fields should be treated as full text +- Define fields containing numbers, dates, or geolocations +- Set the format for date values +- Establish custom rules for dynamically added fields + +A typical mapping definition includes: + +- `_index` +- `_id` +- `_source` +- `properties` + +Note: Prior to version 7.0.0, mapping definitions included a type name. This has since been removed. + +## Preventing Mapping Explosion + +Mapping explosion occurs when an index has too many fields, potentially causing out-of-memory errors and recovery difficulties. This is particularly problematic with dynamic mapping, where each new document might introduce new fields. + +To mitigate this risk, Elasticsearch provides several settings: + +1. `index.mapping.total_fields.limit`: Caps the total number of fields in an index (default: 1000). +2. `index.mapping.depth.limit`: Restricts the maximum depth for a field (default: 20). +3. `index.mapping.nested_fields.limit`: Limits the number of nested fields (default: 50). +4. `index.mapping.nested_objects.limit`: Sets the maximum number of nested JSON objects (default: 10000). +5. `index.mapping.field_name_length.limit`: Constrains the maximum length of a field name (default: Long.MAX_VALUE). + +When increasing these limits, consider also raising the `indices.query.bool.max_clause_count` setting to accommodate larger queries. + +For fields with many arbitrary keys, the flattened data type may be a suitable alternative. + +## Dynamic Mapping + +Elasticsearch's dynamic mapping feature allows you to index documents without predefined field mappings. New fields are automatically added to the index mapping when a document is indexed. 
This applies to both top-level mapping types and inner `object` and `nested` fields. + +Dynamic mapping is convenient, and its rules can be configured to customize the mappings that are applied to new fields. + +## Explicit Mappings + +As you become more familiar with your data structure, you may want to define explicit mappings for better control and optimization. Explicit mappings can be created when: + +1. Creating a new index +2. Adding fields to an existing index + +### Creating an Index with Explicit Mapping + +Use the create index API to define mappings when creating a new index: + +```json +PUT /my-index-000001 +{ + "mappings": { + "properties": { + "age": { "type": "integer" }, + "email": { "type": "keyword" }, + "name": { "type": "text" } + } + } +} +``` + +### Adding Fields to Existing Mappings + +The put mapping API allows you to add new fields to an existing index: + +```json +PUT /my-index-000001/_mapping +{ + "properties": { + "employee-id": { + "type": "keyword", + "index": false + } + } +} +``` + +This example adds an `employee-id` field that is stored but not indexed or searchable. + +## Updating Field Mappings + +It's important to note that except for supported mapping parameters, you can't change the mapping or field type of an existing field. Attempts to do so could invalidate already indexed data. If you need to modify a field's mapping: + +1. For data streams, refer to the "Change mappings and settings for a data stream" documentation. +2. For other indices, create a new index with the desired mapping and reindex your data. + +To create an alternate name for a field without invalidating existing data, consider using an alias field. 
+ +## Viewing Mappings + +### View Complete Index Mapping + +Use the get mapping API to see the full mapping of an index: + +``` +GET /my-index-000001/_mapping +``` + +### View Specific Field Mappings + +For large indices or when you only need information about particular fields, use the get field mapping API: + +``` +GET /my-index-000001/_mapping/field/employee-id +``` + +This approach is especially useful for indices with numerous fields. + +# Understanding Elasticsearch Shards and Merging + +## Shard Structure +Each Elasticsearch shard is a Lucene index, which is further divided into segments. These segments serve as the fundamental storage units within the index, housing the actual data. It's important to note that segments are immutable, meaning their content cannot be altered once created. + +## Segment Management +To maintain optimal index size and performance, Elasticsearch employs a process of merging smaller segments into larger ones. This process serves two primary purposes: +1. Controlling index size +2. Removing deleted documents (expunging deletes) + +The merging operation utilizes an auto-throttling mechanism, which carefully balances resource allocation between merging tasks and other critical functions like search operations. + +## Merge Scheduling + +Elasticsearch employs a merge scheduler, specifically the ConcurrentMergeScheduler, to oversee merge operations. This scheduler manages the execution of merges as needed, running them in separate threads. When the maximum thread count is reached, subsequent merge operations are queued until a thread becomes available. 
+ +### Configurable Settings + +The merge scheduler supports a dynamic setting that can be adjusted: + +``` +index.merge.scheduler.max_thread_count +``` + +The default value for this setting is calculated using the following formula: + +``` +Math.max(1, Math.min(4, node.processors / 2)) +``` + +This formula ensures that the thread count is at least 1 and at most 4, or half the number of processors available to the node, whichever is smaller. + +# Elasticsearch Similarity Module + +The Similarity module in Elasticsearch defines how matching documents are scored and ranked. This module operates on a per-field basis, allowing different similarity algorithms to be applied to different fields through mapping configurations. + +## Configuring Custom Similarities + +While the built-in similarities are generally sufficient, advanced users can configure custom similarities through index settings. This can be done when creating an index or updating existing settings. + +Example configuration: + +```json +PUT /index +{ + "settings": { + "index": { + "similarity": { + "my_similarity": { + "type": "DFR", + "basic_model": "g", + "after_effect": "l", + "normalization": "h2", + "normalization.h2.c": "3.0" + } + } + } + } +} +``` + +To apply the custom similarity to a specific field: + +```json +PUT /index/_mapping +{ + "properties": { + "title": { "type": "text", "similarity": "my_similarity" } + } +} +``` + +## Available Similarity Algorithms + +### 1. BM25 (Default) + +BM25 is the default similarity algorithm in Elasticsearch. It's based on TF/IDF and is particularly effective for short fields like names. + +Configuration options: +- `k1`: Controls non-linear term frequency normalization (default: 1.2) +- `b`: Controls document length normalization of tf values (default: 0.75) +- `discount_overlaps`: Determines if overlap tokens are ignored in norm computation (default: true) + +Type name: `BM25` + +### 2. 
DFR (Divergence from Randomness) + +The DFR similarity implements the divergence from randomness framework. + +Configuration options: +- `basic_model`: Choices are g, if, in, and ine +- `after_effect`: Choices are b and l +- `normalization`: Choices are no, h1, h2, h3, and z + +Type name: `DFR` + +### 3. DFI (Divergence from Independence) + +This similarity implements the divergence from independence model. + +Configuration option: +- `independence_measure`: Choices are standardized, saturated, chisquared + +Note: It's recommended to retain stop words when using this similarity for good relevance. Terms with frequency less than the expected frequency will get a score of 0. + +Type name: `DFI` + +### 4. IB (Information Based) + +The IB model is based on the concept that information content in symbolic distribution sequences is primarily determined by repetitive usage of basic elements. + +Configuration options: +- `distribution`: Choices are ll and spl +- `lambda`: Choices are df and ttf +- `normalization`: Same options as DFR similarity + +Type name: `IB` + +### 5. LM Dirichlet + +Configuration option: +- `mu`: Default is 2000 + +Note: Terms with fewer occurrences than predicted by the language model receive a score of 0. + +Type name: `LMDirichlet` + +### 6. LM Jelinek Mercer + +This algorithm attempts to capture important text patterns while filtering out noise. + +Configuration option: +- `lambda`: Optimal value varies (default: 0.1) + - ~0.1 for title queries + - ~0.7 for long queries + - As lambda approaches 0, documents matching more query terms rank higher + +Type name: `LMJelinekMercer` + +### 7. Scripted Similarity + +Allows custom scoring logic using scripts. 
Example implementation of TF-IDF: + +```json +PUT /index +{ + "settings": { + "number_of_shards": 1, + "similarity": { + "scripted_tfidf": { + "type": "scripted", + "script": { + "source": "double tf = Math.sqrt(doc.freq); double idf = Math.log((field.docCount+1.0)/(term.docFreq+1.0)) + 1.0; double norm = 1/Math.sqrt(doc.length); return query.boost * tf * idf * norm;" + } + } + } + }, + "mappings": { + "properties": { + "field": { + "type": "text", + "similarity": "scripted_tfidf" + } + } + } +} +``` + +Important rules for scripted similarities: +1. Scores must be positive +2. Scores should not decrease when `doc.freq` increases (all else being equal) +3. Scores should not increase when `doc.length` increases (all else being equal) + +For improved efficiency, use `weight_script` for document-independent calculations: + +```json +PUT /index +{ + "settings": { + "number_of_shards": 1, + "similarity": { + "scripted_tfidf": { + "type": "scripted", + "weight_script": { + "source": "double idf = Math.log((field.docCount+1.0)/(term.docFreq+1.0)) + 1.0; return query.boost * idf;" + }, + "script": { + "source": "double tf = Math.sqrt(doc.freq); double norm = 1/Math.sqrt(doc.length); return weight * tf * norm;" + } + } + } + }, + "mappings": { + "properties": { + "field": { + "type": "text", + "similarity": "scripted_tfidf" + } + } + } +} +``` + +Type name: `scripted` + +## Changing the Default Similarity + +To change the default similarity for all fields in an index: + +1. When creating the index: + +```json +PUT /index +{ + "settings": { + "index": { + "similarity": { + "default": { + "type": "boolean" + } + } + } + } +} +``` + +2. For an existing index: + +```json +POST /index/_close + +PUT /index/_settings +{ + "index": { + "similarity": { + "default": { + "type": "boolean" + } + } + } +} + +POST /index/_open +``` + +Note: Changing the default similarity for an existing index requires closing and reopening the index. 
+ +# Elasticsearch Search and Indexing Slow Logs + +Elasticsearch provides powerful logging capabilities for slow search queries and indexing operations. These logs help identify performance bottlenecks and optimize your cluster's efficiency. + +## Search Slow Log + +The search slow log captures lengthy search operations at the shard level, offering insights into query and fetch phases. + +### Configuration + +You can set thresholds for both query and fetch phases using dynamic index settings: + +```yaml +index.search.slowlog.threshold.query.warn: 10s +index.search.slowlog.threshold.query.info: 5s +index.search.slowlog.threshold.query.debug: 2s +index.search.slowlog.threshold.query.trace: 500ms + +index.search.slowlog.threshold.fetch.warn: 1s +index.search.slowlog.threshold.fetch.info: 800ms +index.search.slowlog.threshold.fetch.debug: 500ms +index.search.slowlog.threshold.fetch.trace: 200ms + +index.search.slowlog.level: info +``` + +Apply these settings using the Update Index Settings API: + +```json +PUT /my-index-000001/_settings +{ + "index.search.slowlog.threshold.query.warn": "10s", + "index.search.slowlog.threshold.query.info": "5s", + "index.search.slowlog.threshold.query.debug": "2s", + "index.search.slowlog.threshold.query.trace": "500ms", + "index.search.slowlog.threshold.fetch.warn": "1s", + "index.search.slowlog.threshold.fetch.info": "800ms", + "index.search.slowlog.threshold.fetch.debug": "500ms", + "index.search.slowlog.threshold.fetch.trace": "200ms", + "index.search.slowlog.level": "info" +} +``` + +### Key Features + +- Thresholds are disabled by default (set to `-1`) +- Logging levels: warn, info, debug, trace +- Shard-level logging for precise performance tracking +- The benefit of several levels is the ability to quickly "grep" for specific thresholds breached + +### Log File Configuration + +The search slow log file is configured in `log4j2.properties`: + +```properties +appender.index_search_slowlog_rolling.type = RollingFile 
+appender.index_search_slowlog_rolling.name = index_search_slowlog_rolling +appender.index_search_slowlog_rolling.fileName = ${sys:es.logs.base_path}${sys:file.separator}${sys:es.logs.cluster_name}_index_search_slowlog.log +appender.index_search_slowlog_rolling.layout.type = PatternLayout +appender.index_search_slowlog_rolling.layout.pattern = [%d{ISO8601}][%-5p][%-25c] [%node_name]%marker %.-10000m%n +appender.index_search_slowlog_rolling.filePattern = ${sys:es.logs.base_path}${sys:file.separator}${sys:es.logs.cluster_name}_index_search_slowlog-%i.log.gz +appender.index_search_slowlog_rolling.policies.type = Policies +appender.index_search_slowlog_rolling.policies.size.type = SizeBasedTriggeringPolicy +appender.index_search_slowlog_rolling.policies.size.size = 1GB +appender.index_search_slowlog_rolling.strategy.type = DefaultRolloverStrategy +appender.index_search_slowlog_rolling.strategy.max = 4 + +logger.index_search_slowlog_rolling.name = index.search.slowlog +logger.index_search_slowlog_rolling.level = trace +logger.index_search_slowlog_rolling.appenderRef.index_search_slowlog_rolling.ref = index_search_slowlog_rolling +logger.index_search_slowlog_rolling.additivity = false +``` + +## Identifying Search Slow Log Origin + +To trace slow queries back to their origin, you can use the `X-Opaque-ID` header. This user ID is included in the search slow logs and JSON logs. 
+ +Example search slow log entry: +``` +[2030-08-30T11:59:37,786][WARN ][i.s.s.query ] [node-0] [index6][0] took[78.4micros], took_millis[0], total_hits[0 hits], stats[], search_type[QUERY_THEN_FETCH], total_shards[1], source[{"query":{"match_all":{"boost":1.0}}}], id[MY_USER_ID], +``` + +Example JSON log entry: +```json +{ + "type": "index_search_slowlog", + "timestamp": "2030-08-30T11:59:37,786+02:00", + "level": "WARN", + "component": "i.s.s.query", + "cluster.name": "distribution_run", + "node.name": "node-0", + "message": "[index6][0]", + "took": "78.4micros", + "took_millis": "0", + "total_hits": "0 hits", + "stats": "[]", + "search_type": "QUERY_THEN_FETCH", + "total_shards": "1", + "source": "{\"query\":{\"match_all\":{\"boost\":1.0}}}", + "id": "MY_USER_ID", + "cluster.uuid": "Aq-c-PAeQiK3tfBYtig9Bw", + "node.id": "D7fUYfnfTLa2D7y-xw6tZg" +} +``` + +## Index Slow Log + +The index slow log captures slow indexing operations, similar to the search slow log. + +### Configuration + +Configure index slow log thresholds using dynamic index settings: + +```yaml +index.indexing.slowlog.threshold.index.warn: 10s +index.indexing.slowlog.threshold.index.info: 5s +index.indexing.slowlog.threshold.index.debug: 2s +index.indexing.slowlog.threshold.index.trace: 500ms +index.indexing.slowlog.level: info +index.indexing.slowlog.source: 1000 +``` + +Apply these settings using the Update Index Settings API: + +```json +PUT /my-index-000001/_settings +{ + "index.indexing.slowlog.threshold.index.warn": "10s", + "index.indexing.slowlog.threshold.index.info": "5s", + "index.indexing.slowlog.threshold.index.debug": "2s", + "index.indexing.slowlog.threshold.index.trace": "500ms", + "index.indexing.slowlog.level": "info", + "index.indexing.slowlog.source": "1000" +} +``` + +### Key Features + +- Log file name: `<cluster_name>_index_indexing_slowlog.log` +- Control logging of `_source` field: + - Default: First 1000 characters + - Set to `false` or `0` to skip logging the source + - Set to `true` 
to log the entire source regardless of size +- `index.indexing.slowlog.reformat`: Set to `false` to preserve original document format (default is `true`) + +### Log File Configuration + +The index slow log file is configured in `log4j2.properties`: + +```properties +appender.index_indexing_slowlog_rolling.type = RollingFile +appender.index_indexing_slowlog_rolling.name = index_indexing_slowlog_rolling +appender.index_indexing_slowlog_rolling.fileName = ${sys:es.logs.base_path}${sys:file.separator}${sys:es.logs.cluster_name}_index_indexing_slowlog.log +appender.index_indexing_slowlog_rolling.layout.type = PatternLayout +appender.index_indexing_slowlog_rolling.layout.pattern = [%d{ISO8601}][%-5p][%-25c] [%node_name]%marker %.-10000m%n +appender.index_indexing_slowlog_rolling.filePattern = ${sys:es.logs.base_path}${sys:file.separator}${sys:es.logs.cluster_name}_index_indexing_slowlog-%i.log.gz +appender.index_indexing_slowlog_rolling.policies.type = Policies +appender.index_indexing_slowlog_rolling.policies.size.type = SizeBasedTriggeringPolicy +appender.index_indexing_slowlog_rolling.policies.size.size = 1GB +appender.index_indexing_slowlog_rolling.strategy.type = DefaultRolloverStrategy +appender.index_indexing_slowlog_rolling.strategy.max = 4 + +logger.index_indexing_slowlog.name = index.indexing.slowlog.index +logger.index_indexing_slowlog.level = trace +logger.index_indexing_slowlog.appenderRef.index_indexing_slowlog_rolling.ref = index_indexing_slowlog_rolling +logger.index_indexing_slowlog.additivity = false +``` + +# Elasticsearch Storage Configuration + +## Introduction to the Store Module + +The store module in Elasticsearch provides control over the storage and access of index data on disk. While this is a low-level setting, it's crucial to understand its implications on performance and resource utilization. 
Some store implementations have poor concurrency or disable optimizations for heap memory usage, which is why it's generally recommended to stick with the default settings. + +## Storage Types + +Elasticsearch offers various file system implementations, known as storage types. By default, it selects the most suitable implementation based on the operating environment. However, you can explicitly configure the storage type for all indices or on a per-index basis. + +### Global Configuration + +To set the storage type globally, add the following to your `config/elasticsearch.yml` file: + +```yaml +index.store.type: hybridfs +``` + +### Per-Index Configuration + +For individual index configuration, use the following JSON structure during index creation: + +```json +PUT /my-index-000001 +{ + "settings": { + "index.store.type": "hybridfs" + } +} +``` + +## Available Storage Types + +Elasticsearch supports the following storage types: + +1. `fs` +2. `hybridfs` +3. `simplefs` +4. `niofs` +5. `mmapfs` + +The `simplefs`, `niofs`, and `mmapfs` types map to Lucene's `SimpleFsDirectory`, `NIOFSDirectory`, and `MMapDirectory` +implementations, respectively; those class names are not themselves valid `index.store.type` values. + +## Memory-Mapping Configuration + +The use of `mmapfs` and `hybridfs` can be restricted using the `node.store.allow_mmap` setting. This boolean setting controls whether memory-mapping is allowed, with the default being true. + +This configuration is particularly useful in environments where you cannot control the creation of memory maps and need to disable memory-mapping capabilities. For example: + +```yaml +node.store.allow_mmap: false +``` + +## Expert-Level Configuration + +It's important to note that storage type configuration is considered an expert-level setting. Exercise caution when modifying these settings, as they may impact performance and stability. Furthermore, this setting might be removed in future Elasticsearch releases. + +## Recommendation + +Unless you have specific requirements or expertise in this area, it's advisable to stick with the default storage settings. 
Elasticsearch's default choices are optimized for most use cases and provide a balance between performance and resource utilization. + +# Lucene Commits and Translog in Elasticsearch + +## Persistence and Recovery Mechanisms + +Elasticsearch employs two primary mechanisms to ensure data durability and facilitate recovery: Lucene commits and the translog. + +### Lucene Commits + +Lucene commits are operations that persist changes to disk. While crucial for data durability, they are resource-intensive and cannot be performed after every index or delete operation. Consequently, changes occurring between commits may be lost in case of unexpected shutdowns or hardware failures. + +### Translog + +To mitigate potential data loss, each shard maintains a transaction log called the translog. This log records all index and delete operations after they've been processed by the internal Lucene index but before acknowledgment. In the event of a crash, recent acknowledged operations that weren't included in the last Lucene commit can be recovered from the translog during shard recovery. + +## Elasticsearch Flush + +An Elasticsearch flush involves performing a Lucene commit and initiating a new translog generation. Flushes occur automatically in the background to prevent the translog from growing excessively large, which could significantly slow down recovery times. While manual flushing is possible via an API, it's rarely necessary. + +## Translog Configuration + +### Durability Settings + +The `index.translog.durability` setting controls how translog data is persisted: + +- `request` (default): Ensures the translog is `fsync`ed and committed on the primary and all allocated replicas before reporting success to the client. +- `async`: Performs `fsync` and commits the translog at intervals specified by `index.translog.sync_interval`. This may result in loss of operations performed just before a crash when the node recovers. 
+ +### Key Settings + +The following dynamically updatable per-index settings control the behavior of the translog: + +- `index.translog.sync_interval`: Default is `5s` +- `index.translog.flush_threshold_size`: Default is `512mb` + +These settings influence translog behavior and can be adjusted as needed. + +## Translog Retention (Deprecated) + +Translog retention settings have been deprecated since version 7.4.0 in favor of soft deletes. These settings are now effectively ignored and will be removed in future versions. + +Previously, these settings controlled how much translog data was retained for peer recoveries: + +- `index.translog.retention.size`: Default was `512mb` +- `index.translog.retention.age`: Default was `12h` + +## Note on Soft Deletes + +Soft deletes have replaced translog retention as the preferred method for retaining historical operations. This change improves the efficiency of replica recovery processes. + +In earlier versions, when an index was not using soft deletes, Elasticsearch recovered each replica shard by replaying operations from the primary's translog. This required both primary and replica shards to preserve extra operations in their translogs to support potential rebuilding of replicas or promotion of replicas to primary status. + +# History Retention in Elasticsearch + +## Introduction + +Elasticsearch employs a mechanism called history retention to efficiently manage and replay operations on shards. This feature is crucial for scenarios such as: + +1. Bringing briefly offline replicas up to date +2. Facilitating cross-cluster replication + +## Core Concepts + +### Soft Deletes + +Elasticsearch utilizes soft deletes to preserve recent deletion operations in the Lucene index. This allows for: + +- Efficient replay of operations +- Preservation of deletion information not inherently stored in Lucene + +Soft deletes are necessary because while indexed documents contain all information needed for replay, deletions do not. 
+ +### Shard History Retention Leases + +To manage operation replay, Elasticsearch implements shard history retention leases. Key points include: + +- Each potential replay target (e.g., replicas, follower shards) creates a lease +- Leases track the first unprocessed operation's sequence number +- As operations are processed, the lease's sequence number is updated +- Soft-deleted operations are discarded when no longer covered by any lease + +## Retention Mechanism + +1. Failed shard copies stop updating their leases +2. Elasticsearch preserves new operations for potential replay +3. Leases expire after a set time (default: 12 hours) +4. Expired leases allow Elasticsearch to discard history +5. Late recoveries result in full index copying + +Elasticsearch balances between retaining necessary operations and preventing indefinite index growth. + +## Configuration + +Soft deletes are enabled by default on recent indices but can be explicitly configured: + +```yaml +index.soft_deletes.enabled: true +index.soft_deletes.retention_lease.period: 12h +``` + +Note: Disabling soft deletes impacts peer recoveries and prevents cross-cluster replication. If soft deletes are disabled, peer recoveries may still occur by copying missing operations from the translog, as long as those operations are retained there. + +## Implications + +1. Balances efficient recovery with space management +2. Allows for quick recovery of briefly offline shards +3. Prevents indefinite history retention for permanently failed shards +4. Enables cross-cluster replication functionality + +# Understanding and Managing Indexing Pressure in Elasticsearch + +Elasticsearch, a powerful search and analytics engine, relies on efficient indexing processes to maintain optimal performance. However, the act of indexing documents can significantly impact system resources, potentially affecting overall cluster health and responsiveness. 
This document explores the concept of indexing pressure, its implications, and how Elasticsearch manages it to ensure system stability. + +## What is Indexing Pressure? + +Indexing pressure refers to the load placed on an Elasticsearch cluster due to document indexing operations. These operations consume memory and CPU resources across multiple nodes in a cluster, involving three key stages: + +1. Coordinating +2. Primary +3. Replica + +Indexing pressure can accumulate from various sources, including: +- External indexing requests +- Internal processes (e.g., recoveries and cross-cluster replication) + +When indexing pressure becomes excessive, it can lead to: +- Cluster saturation +- Degraded search performance +- Impaired cluster coordination +- Disrupted background processing + +To mitigate these risks, Elasticsearch employs internal monitoring mechanisms and rejects new indexing work when predefined limits are exceeded. + +## Memory Management for Indexing Operations + +Elasticsearch uses a configurable memory limit to control indexing pressure. This limit is set using the `indexing_pressure.memory.limit` node setting. + +Key points about the memory limit: +- Default value: 10% of the heap +- Purpose: Restricts the number of bytes available for outstanding indexing requests +- Scope: Applies to coordinating, primary, and replica stages + +### How Memory Accounting Works + +1. At the start of each indexing stage, Elasticsearch accounts for the bytes consumed by the request. +2. This accounting is only released at the end of the stage. +3. Upstream stages continue to account for the request until all downstream stages complete. + +Example: +- Coordinating request remains accounted for until primary and replica stages finish. +- Primary request stays accounted for until all in-sync replicas respond (enabling replica retries if necessary). + +### Rejection Thresholds + +1. 
Coordinating and Primary Stages: + - Rejection occurs when outstanding indexing bytes exceed the configured limit. + +2. Replica Stage: + - Rejection begins when outstanding replica indexing bytes surpass 1.5 times the configured limit. + - This design prioritizes completing outstanding replica work over accepting new coordinating and primary tasks as pressure builds. + +### Caution When Adjusting the Limit + +The default 10% limit is generously sized. Consider the following before modifying: +- Only indexing requests contribute to this limit. +- Additional indexing overhead (e.g., buffers, listeners) also requires heap space. +- Other Elasticsearch components need memory too. +- Setting the limit too high may deprive other operations and components of necessary memory. + +## Monitoring Indexing Pressure + +To gain insights into indexing pressure metrics, utilize the node stats API provided by Elasticsearch. + +## Configuration + +The primary setting for managing indexing pressure is: + +``` +indexing_pressure.memory.limit +``` + +Adjust this setting cautiously, considering your specific use case and system resources. + +By understanding and properly managing indexing pressure, you can maintain a healthy and responsive Elasticsearch cluster, ensuring optimal performance for both indexing and search operations. + +## Indexing Stages + +External indexing operations go through three stages: coordinating, primary, and replica. For more detailed information on this process, refer to the Elasticsearch documentation on the Basic write model. + +# Elasticsearch: The Deprecation and Removal of Mapping Types + +## Introduction + +Elasticsearch has undergone significant changes regarding mapping types. This document outlines the reasons behind these changes, the timeline for their implementation, and guidance for migrating to the new typeless structure. + +## Understanding Mapping Types + +Historically, Elasticsearch allowed multiple mapping types within a single index. 
Each type could represent different entities (e.g., 'user' and 'tweet' in a 'twitter' index) with their own fields. However, this approach led to complications and inefficiencies. + +## Rationale for Removal + +1. **Field Conflicts**: Fields with the same name across different types in an index share the same Lucene field, causing potential conflicts. +2. **Data Sparsity**: Storing diverse entities in one index can lead to sparse data, reducing Lucene's compression efficiency. +3. **Conceptual Mismatch**: The analogy of types to SQL tables was misleading, as it didn't accurately represent Elasticsearch's internal structure. + +## Timeline for Removal + +- **Elasticsearch 5.6**: + - Introduction of `index.mapping.single_type: true` setting + - `join` field introduced as a replacement for parent-child relationships +- **Elasticsearch 6.x**: + - Single-type-per-index enforced for new indices + - `_type` field no longer combined with `_id` to form `_uid` + - Deprecation of `_default_` mapping type +- **Elasticsearch 6.8**: + - Introduction of `include_type_name` parameter (defaults to `true`) +- **Elasticsearch 7.0**: + - `include_type_name` parameter defaults to `false` + - Specifying types in requests deprecated + - `_default_` mapping type removed +- **Elasticsearch 8.0**: + - Complete removal of mapping types + +## Migration Strategies + +### 1. 
Separate Indices + +Split multi-type indices into separate indices for each type: + +```json +PUT users +{ + "settings": { + "index.mapping.single_type": true + }, + "mappings": { + "_doc": { + "properties": { + "name": { "type": "text" }, + "user_name": { "type": "keyword" }, + "email": { "type": "keyword" } + } + } + } +} + +PUT tweets +{ + "settings": { + "index.mapping.single_type": true + }, + "mappings": { + "_doc": { + "properties": { + "content": { "type": "text" }, + "user_name": { "type": "keyword" }, + "tweeted_at": { "type": "date" } + } + } + } +} +``` + +Use the Reindex API to migrate data: + +```json +POST _reindex +{ + "source": { + "index": "twitter", + "type": "user" + }, + "dest": { + "index": "users", + "type": "_doc" + } +} + +POST _reindex +{ + "source": { + "index": "twitter", + "type": "tweet" + }, + "dest": { + "index": "tweets", + "type": "_doc" + } +} +``` + +### 2. Custom Type Field + +Add a custom type field to distinguish between document types within a single index: + +```json +PUT new_twitter +{ + "mappings": { + "_doc": { + "properties": { + "type": { "type": "keyword" }, + "name": { "type": "text" }, + "user_name": { "type": "keyword" }, + "email": { "type": "keyword" }, + "content": { "type": "text" }, + "tweeted_at": { "type": "date" } + } + } + } +} + +POST _reindex +{ + "source": { + "index": "twitter" + }, + "dest": { + "index": "new_twitter" + }, + "script": { + "source": \"\"\" + ctx._source.type = ctx._type; + ctx._id = ctx._type + '-' + ctx._id; + ctx._type = '_doc'; + \"\"\" + } +} +``` + +## Parent/Child Relationships + +The `join` field replaces the previous parent-child relationship implementation. It allows you to create one-to-many relationships within documents of the same index. 
Here's a basic example: + +```json +PUT my_index +{ + "mappings": { + "properties": { + "my_join_field": { + "type": "join", + "relations": { + "question": "answer" + } + } + } + } +} +``` + +## API Changes in Elasticsearch 7.0 + +### Index APIs + +- Use `include_type_name=false` parameter +- Mappings directly under the `mappings` key without type names + +Example: + +```json +PUT /my-index-000001?include_type_name=false +{ + "mappings": { + "properties": { + "foo": { "type": "keyword" } + } + } +} +``` + +### Document APIs + +- Use `{index}/_doc/{id}` path for explicit IDs +- Use `{index}/_doc` for auto-generated IDs + +Example: + +```json +PUT /my-index-000001/_doc/1 +{ + "foo": "baz" +} +``` + +### Search APIs + +- Omit types from URLs +- Avoid using `_type` field in queries, aggregations, or scripts + +Example of a typeless search: + +```json +GET /my-index-000001/_search +{ + "query": { + "match": { + "user_name": "kimchy" + } + } +} +``` + +### Responses + +- `_type` field in responses is deprecated (returns `_doc` for typeless APIs) + +Example response: + +```json +{ + "_index" : "my-index-000001", + "_type" : "_doc", + "_id" : "1", + "_version" : 1, + "_seq_no" : 0, + "_primary_term" : 1, + "found": true, + "_source" : { + "foo" : "baz" + } +} +``` + +### Index Templates + +- Use `include_type_name=false` for typeless templates +- Templates use `_doc` type internally for backwards compatibility + +Example of a typeless index template: + +```json +PUT _template/template_1 +{ + "index_patterns": ["*"], + "mappings": { + "properties": { + "field1": { "type": "keyword" } + } + } +} +``` + +## Handling Mixed-Version Clusters + +- Specify `include_type_name` parameter in index APIs +- Typeless document APIs (e.g., `bulk`, `update`) only work with 7.0+ nodes + +In 6.8, `include_type_name` defaults to `true`, while in 7.0 it defaults to `false`. Always specify this parameter explicitly in mixed-version clusters to avoid inconsistencies. 
+ +# Elasticsearch 7.10 Migration Guide + +This guide outlines critical changes and considerations when upgrading your Elasticsearch application to version 7.10. For a comprehensive overview, refer to the "What's new in 7.10" and "Release notes" documentation. + +## Breaking Changes + +The following modifications in Elasticsearch 7.10 may significantly impact your applications. Review and address these changes before upgrading to ensure smooth operation. + +### Authentication Updates + +- **API Key Creation**: The `name` property is now mandatory when creating or granting API keys. + + Example: + ```json + { + "api_key": { + "name": "key-1" + } + } + ``` + +### Java Modifications + +- **MappedFieldType#fielddataBuilder**: This method now accepts an additional `Supplier<SearchLookup>` argument to support future feature development. Plugin developers should update their implementations accordingly. + +### Networking Adjustments + +- **TCP Keep-Alive Settings**: The maximum value for `{network,transport,http}.tcp.keep_idle` and `{network,transport,http}.tcp.keep_interval` is now 300 seconds (5 minutes). Ensure your configuration doesn't exceed this limit to avoid startup errors. + +### Search Enhancements + +- **Doc Value Fields Limit**: The `index.max_docvalue_fields_search` setting now applies to doc value fields returned by `inner_hits` sections and `top_hits` aggregations, in addition to the top-level `docvalue_fields` parameter. Adjust this setting as needed for your use case. + +## Deprecations + +The following features are deprecated in 7.10 and will be removed in 8.0. While they won't immediately affect your applications, it's recommended to update your code accordingly after upgrading. + +To identify deprecated functionality in your current setup, enable deprecation logging. + +### Security Configuration + +1. 
**Transport SSL Settings**: + - Explicitly set `xpack.security.transport.ssl.enabled` to `true` or `false` when configuring other `xpack.security.transport.ssl` settings. + - If enabled, provide a certificate and key using either `xpack.security.transport.ssl.keystore.path` or both `xpack.security.transport.ssl.certificate` and `xpack.security.transport.ssl.key`. + + Example: + ```yaml + xpack.security.transport.ssl.enabled: true + xpack.security.transport.ssl.keystore.path: elastic-certificates.p12 + xpack.security.transport.ssl.truststore.path: elastic-certificates.p12 + ``` + +2. **HTTP SSL Settings**: + - Explicitly set `xpack.security.http.ssl.enabled` to `true` or `false` when configuring other `xpack.security.http.ssl` settings. + - If enabled, provide a certificate and key using either `xpack.security.http.ssl.keystore.path` or both `xpack.security.http.ssl.certificate` and `xpack.security.http.ssl.key`. + + Example: + ```yaml + xpack.security.http.ssl.enabled: true + xpack.security.http.ssl.certificate: elasticsearch.crt + xpack.security.http.ssl.key: elasticsearch.key + xpack.security.http.ssl.certificate_authorities: [ "corporate-ca.crt" ] + ``` + +### Cluster Settings + +- The `cluster.join.timeout` node setting is no longer necessary and will be removed in 8.0. + +### Indices Access + +- Direct REST API access to system indices will be restricted by default in future major versions. Certain API endpoints are exempt from this restriction, including: + ``` + GET _cluster/health + GET {index}/_recovery + GET _cluster/allocation/explain + GET _cluster/state + POST _cluster/reroute + GET {index}/_stats + GET {index}/_segments + GET {index}/_shard_stores + GET _cat/[indices,aliases,health,recovery,shards,segments] + ``` + +### Machine Learning Parameters + +- Replace `allow_no_jobs` and `allow_no_datafeeds` with `allow_no_match` in machine learning APIs. + +### Mapping Features + +- Field-specific index-time boosts in mappings are deprecated. 
Use query-time boosts instead. + +### Snapshot and Restore + +- The repository stats API is deprecated. Use the repositories metering APIs instead. + +""" \ No newline at end of file diff --git a/TransformationPlayground/playground/transform_expert/prompting/templates.py b/TransformationPlayground/playground/transform_expert/prompting/templates.py index 504734249..b2febce33 100644 --- a/TransformationPlayground/playground/transform_expert/prompting/templates.py +++ b/TransformationPlayground/playground/transform_expert/prompting/templates.py @@ -12,6 +12,7 @@ - Do not attempt to be friendly in your responses. Be as direct and succint as possible. - Think through the problem, extract all data from the task and the previous conversations before creating a plan. - Never assume any parameter values while invoking a tool or function. +- Always heed the user's guidance, which will be wrapped in user_guidance XML tags. - You may NOT ask clarifying questions to the user if you need more information. @@ -20,13 +21,24 @@ - Your code MUST NEVER INCLUDE any network calls or I/O operations. - All code must be Python 3.10+ compatible. - Ensure any code you provide can be executed with all required imports and variables defined. -- Structure your code to start with the required imports, then a description of the transformation logic, +- Structure your code to start with the required imports, then a detailed description of the transformation logic, and finally the transformation code. +- If the user provided any user_guidance, ensure that the code follows it and that your description of the + transformation logic explains how the guidance was followed. Specifically call out what the user's guidance was. - While you may generate multiple functions to assist in the transformation and make the code more readable, the final transformation should be a single function. 
It MUST have the following signature: `def transform(source_json: Dict[str, Any]) -> List[Dict[str, Any]]:` +The input JSON will be the settings for an Index from the source cluster, and will ALWAYS be in the following format: + +* A dictionary with two keys: "index_name" and "index_json". +* The "index_name" key will contain a string with the original name of the index. +* The "index_json" key will contain a dictionary with the raw JSON defining the index's configuration. + + +The output of the transformation function you create will ALWAYS be a list containing one or more entries that conform to the source_json_format. + The source cluster's version is {source_version}. If there is any grounded knowledge on this source_version, it will be provided here: {source_knowledge} @@ -39,14 +51,7 @@ If there is any special guidance for this target_version, it will be provided here: {target_guidance} -The input JSON will the settings for an Index from the source cluster, and will ALWAYS be in the following format: - -* A dictionary with two keys: "index_name" and "index_json". -* The "index_name" key will contain a string with the original name of the index. -* The "index_json" key will contain a dictionary with the raw JSON defining the index's configuration. - - -The output of the transformation function you create will ALWAYS be a list containing one or more entries that confirm to the source_json_format. 
+The user has provided the following guidance for this transformation: {user_guidance} The index-level settings JSON from the source cluster is: {source_json} diff --git a/TransformationPlayground/playground_frontend/src/app/page.tsx b/TransformationPlayground/playground_frontend/src/app/page.tsx index 491eb089f..719cc7089 100644 --- a/TransformationPlayground/playground_frontend/src/app/page.tsx +++ b/TransformationPlayground/playground_frontend/src/app/page.tsx @@ -2,7 +2,8 @@ import React, { useState } from "react"; import "@cloudscape-design/global-styles/index.css"; -import { Grid, Container, FormField, Textarea, Button, Input, Select, SelectProps, Spinner } from "@cloudscape-design/components"; +import { Box, Button, Container, FormField, Grid, Input, Modal, Select, + SelectProps, SpaceBetween, Spinner, Textarea } from "@cloudscape-design/components"; import { Configuration, TransformsApi, TransformsIndexCreateRequest, TransformsIndexTestRequest, SourceVersionEnum, TargetVersionEnum, TransformLanguageEnum @@ -10,7 +11,7 @@ import { Configuration, TransformsApi, TransformsIndexCreateRequest, TransformsI const TransformationPage = () => { - // States for user inputs + // States for page elements const [inputShape, setInputShape] = useState(""); const [transformLogic, setTransformLogic] = useState(""); const [outputShape, setOutputShape] = useState(""); @@ -18,7 +19,9 @@ const TransformationPage = () => { const [testTargetUrl, setTestTargetUrl] = useState(""); const [isRecommending, setIsRecommending] = useState(false); const [isTesting, setIsTesting] = useState(false); - + const [userGuidanceVisible, setUserGuidanceVisible] = useState(false); + const [userGuidance, setUserGuidance] = useState(""); + const [userGuidanceTemp, setUserGuidanceTemp] = useState(""); // Select options for dropdowns using enumerated types const sourceVersionOptions: SelectProps.Options = Object.values(SourceVersionEnum).map((value) => ({ @@ -61,10 +64,13 @@ const TransformationPage = 
() => { source_version: sourceVersion.value as SourceVersionEnum, target_version: targetVersion.value as TargetVersionEnum, transform_language: transformLanguage.value as TransformLanguageEnum, - ...(testTargetUrl && { test_target_url: testTargetUrl }), // Add optional field + // Add optional fields + ...(transformLogic && { transform_logic: transformLogic }), + ...(testTargetUrl && { test_target_url: testTargetUrl }), + ...(userGuidance && { user_guidance: userGuidance }), }; - const response = await apiClient.transformsIndexCreate(payload); + const response = await apiClient.transformsIndexCreateCreate(payload); // Update state with response data setTransformLogic(response.data.transform_logic || ""); @@ -174,6 +180,46 @@ const TransformationPage = () => { + + setUserGuidanceVisible(false)} + visible={userGuidanceVisible} + footer={ + + + + + + + } + header="Modal title" + > + +