Skip to content

Commit

Permalink
Adding semantic search workload that includes vector and bm25 search
Browse files Browse the repository at this point in the history
Signed-off-by: Martin Gaievski <[email protected]>
  • Loading branch information
martin-gaievski committed Jul 14, 2024
1 parent 411e304 commit 875dd1d
Show file tree
Hide file tree
Showing 8 changed files with 844 additions and 0 deletions.
233 changes: 233 additions & 0 deletions trec_covid_semantic_search/README.md

Large diffs are not rendered by default.

46 changes: 46 additions & 0 deletions trec_covid_semantic_search/index.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
{
  "settings": {
    "index.number_of_shards": {{number_of_shards | default(1)}},
    "index.number_of_replicas": {{number_of_replicas | default(0)}},
    "index.queries.cache.enabled": {{query_cache_enabled | default(false) | tojson}},
    "index.requests.cache.enable": {{requests_cache_enabled | default(false) | tojson}},
    "index.merge.policy.max_merged_segment": "100GB",
    "index.knn": true,
    "default_pipeline": "nlp-ingest-pipeline"
  },
  "mappings": {
    "dynamic": "true",
    "_source": {
      "enabled": {{ source_enabled | default(true) | tojson }}
    },
    "properties": {
      "title": {
        "type": "text"
      },
      "metadata": {
        "type": "nested",
        "properties": {
          "url": {
            "type": "text"
          },
          "pubmed_id": {
            "type": "integer"
          }
        }
      },
      "passage_embedding": {
        "type": "knn_vector",
        "dimension": 768,
        "method": {
          "name": "hnsw",
          "space_type": "innerproduct",
          "engine": "faiss",
          "parameters": {
            "ef_construction": 256,
            "m": 256
          }
        }
      }
    }
  }
}
172 changes: 172 additions & 0 deletions trec_covid_semantic_search/operations/default.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
{
  "name": "index",
  "operation-type": "bulk",
  "bulk-size": {{bulk_size | default(100)}},
  "ingest-percentage": {{ingest_percentage | default(100)}}
},
{
  "name": "delete-ingest-pipeline",
  "operation-type": "delete-pipeline",
  "id": "nlp-ingest-pipeline"
},
{
  "name": "create-ingest-pipeline",
  "operation-type": "put-pipeline",
  "param-source": "create-ingest-pipeline",
  "id": "nlp-ingest-pipeline",
  "body": {
    "description": "An NLP ingest pipeline",
    "processors": [
      {
        "text_embedding": {
          "model_id": "",
          "field_map": {
            "title": "passage_embedding"
          }
        }
      }
    ]
  }
},
{
  "name": "index-append",
  "operation-type": "bulk",
  "bulk-size": {{bulk_size | default(100)}},
  "ingest-percentage": {{ingest_percentage | default(100)}}
},
{
  "name": "default",
  "operation-type": "search",
  "body": {
    "query": {
      "match_all": {}
    }
  }
},
{
  "name": "semantic-search-neural",
  "operation-type": "search",
  "variable-queries": {{variable_queries | default(0)}},
  "param-source": "semantic-search-neural-source",
  "body": {
    "_source": {
      "excludes": [
        "passage_embedding"
      ]
    },
    "query": {
      "neural": {
        "passage_embedding": {
          "query_text": "what types of rapid testing for Covid-19 have been developed?",
          "model_id": "",
          "k": {{k | default(10)}}
        }
      }
    }
  }
},
{
  "name": "create-normalization-processor-no-weights-search-pipeline",
  "operation-type": "create-search-pipeline",
  "id": "nlp-min-max-arithmetic-search-pipeline",
  "body": {
    "description": "Post processor for hybrid search with min_max normalization and arithmetic_mean combination",
    "phase_results_processors": [
      {
        "normalization-processor": {
          "normalization": {
            "technique": "min_max"
          },
          "combination": {
            "technique": "arithmetic_mean"
          }
        }
      }
    ]
  }
},
{
  "name": "semantic-search-hybrid-bm25-and-neural-search",
  "operation-type": "search",
  "request-params": {
    "search_pipeline": "nlp-min-max-arithmetic-search-pipeline"
  },
  "variable-queries": {{variable_queries | default(0)}},
  "param-source": "hybrid-query-bm25-neural-search-source",
  "body": {
    "_source": {
      "excludes": [
        "passage_embedding"
      ]
    },
    "query": {
      "hybrid": {
        "queries": [
          {
            "match": {
              "title": ""
            }
          },
          {
            "neural": {
              "passage_embedding": {
                "query_text": "what types of rapid testing for Covid-19 have been developed?",
                "model_id": "",
                "k": {{k | default(10)}}
              }
            }
          }
        ]
      }
    }
  }
},
{
  "name": "semantic-search-hybrid-bm25-range-and-neural-search",
  "operation-type": "search",
  "request-params": {
    "search_pipeline": "nlp-min-max-arithmetic-search-pipeline"
  },
  "variable-queries": {{variable_queries | default(0)}},
  "param-source": "hybrid-query-bm25-neural-search-source",
  "body": {
    "_source": {
      "excludes": [
        "passage_embedding"
      ]
    },
    "query": {
      "hybrid": {
        "queries": [
          {
            "match": {
              "title": ""
            }
          },
          {
            "neural": {
              "passage_embedding": {
                "query_text": "what types of rapid testing for Covid-19 have been developed?",
                "model_id": "",
                "k": {{k | default(10)}}
              }
            }
          },
          {
            "nested": {
              "path": "metadata",
              "query": {
                "range": {
                  "metadata.pubmed_id": {
                    "gte": {{range_gte | default(100)}},
                    "lte": {{range_lte | default(10000000)}}
                  }
                }
              }
            }
          }
        ]
      }
    }
  }
}
12 changes: 12 additions & 0 deletions trec_covid_semantic_search/params/params.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{
  "bulk_indexing_clients": 2,
  "bulk_size": 100,
  "number_of_replicas": 1,
  "number_of_shards": 8,
  "ingest_percentage": 100,
  "search_clients": 8,
  "warmup_iterations": 20,
  "iterations": 100,
  "variable_queries": 50,
  "k": 100
}
162 changes: 162 additions & 0 deletions trec_covid_semantic_search/test_procedures/procedures.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
{
  "name": "create-index-ingest-data-search",
  "description": "Registers and deploys the text-embedding ML model, creates the NLP ingest pipeline and the index, ingests the document corpus, force-merges, and then runs a match_all search.",
  "default": true,
  "schedule": [
    {
      "name": "cluster-settings",
      "operation": {
        "operation-type": "put-settings",
        "body": {
          "persistent": {
            "plugins": {
              "ml_commons": {
                "only_run_on_ml_node": "false",
                "native_memory_threshold": "100",
                "allow_registering_model_via_local_file": "true",
                "allow_registering_model_via_url": "true"
              }
            }
          }
        }
      }
    },
    {
      "operation": "delete-index"
    },
    {
      "operation": "delete-ingest-pipeline"
    },
    {
      "operation": {
        "operation-type": "delete-ml-model",
        "model-name": "{{ model_name | default('huggingface/sentence-transformers/all-mpnet-base-v2')}}"
      }
    },
    {
      "operation": {
        "operation-type": "register-ml-model",
        "model-name": "{{ model_name | default('huggingface/sentence-transformers/all-mpnet-base-v2')}}",
        "model-version": "{{ model_version | default('1.0.1') }}",
        "model-format": "{{ model_format | default('TORCH_SCRIPT') }}",
        "model-config-file": "{{ model_config_file | default('') }}"
      }
    },
    {
      "operation": "deploy-ml-model"
    },
    {
      "operation": "create-ingest-pipeline"
    },
    {
      "operation": {
        "operation-type": "create-index",
        "settings": {%- if index_settings is defined %} {{index_settings | tojson}} {%- else %} {
          "index.number_of_shards": {{number_of_shards | default(3)}},
          "index.number_of_replicas": {{number_of_replicas | default(0)}},
          "index.store.type": "{{store_type | default('fs')}}"
        }{%- endif %}
      }
    },
    {
      "name": "check-cluster-health-before-index-creation",
      "operation": {
        "operation-type": "cluster-health",
        "index": "trec-covid",
        "request-params": {
          "wait_for_status": "{{cluster_health | default('green')}}",
          "wait_for_no_relocating_shards": "true"
        },
        "retry-until-success": true
      }
    },
    {
      "operation": "index-append",
      "warmup-time-period": 60,
      "clients": {{bulk_indexing_clients | default(1)}},
      "ignore-response-error-level": "{{error_level | default('non-fatal')}}"
    },
    {
      "name": "refresh-after-index-created",
      "operation": "refresh"
    },
    {
      "operation": {
        "operation-type": "force-merge",
        "request-timeout": 7200{%- if force_merge_max_num_segments is defined %},
        "max-num-segments": {{ force_merge_max_num_segments | tojson }}
        {%- endif %}
      }
    },
    {
      "name": "refresh-after-force-merge",
      "operation": "refresh"
    },
    {
      "name": "wait-until-merges-finish",
      "operation": {
        "operation-type": "index-stats",
        "index": "_all",
        "condition": {
          "path": "_all.total.merges.current",
          "expected-value": 0
        },
        "retry-until-success": true,
        "include-in-reporting": false
      }
    },
    {
      "operation": "default",
      "warmup-iterations": {{warmup_iterations | default(500) | tojson}},
      "iterations": {{iterations | default(500) | tojson }},
      "target-throughput": {{ target_throughput | default(100) | tojson}},
      "clients": {{ search_clients | default(1) }}
    }
  ]
},
{
  "name": "search",
  "description": "Run semantic search work.",
  "default": false,
  "schedule": [
    {
      "operation": {
        "operation-type": "delete-ml-model",
        "model-name": "{{ model_name | default('huggingface/sentence-transformers/all-mpnet-base-v2')}}"
      }
    },
    {
      "operation": {
        "operation-type": "register-ml-model",
        "model-name": "{{ model_name | default('huggingface/sentence-transformers/all-mpnet-base-v2')}}",
        "model-version": "{{ model_version | default('1.0.1') }}",
        "model-format": "{{ model_format | default('TORCH_SCRIPT') }}",
        "model-config-file": "{{ model_config_file | default('') }}"
      }
    },
    {
      "operation": "deploy-ml-model"
    },
    {
      "operation": "create-normalization-processor-no-weights-search-pipeline"
    },
    {
      "operation": "semantic-search-neural",
      "warmup-iterations": {{warmup_iterations | default(50) | tojson}},
      "iterations": {{iterations | default(100) | tojson }},
      "clients": {{ search_clients | default(1)}}
    },
    {
      "operation": "semantic-search-hybrid-bm25-and-neural-search",
      "warmup-iterations": {{warmup_iterations | default(50) | tojson}},
      "iterations": {{iterations | default(100) | tojson }},
      "clients": {{ search_clients | default(1)}}
    },
    {
      "operation": "semantic-search-hybrid-bm25-range-and-neural-search",
      "warmup-iterations": {{warmup_iterations | default(50) | tojson}},
      "iterations": {{iterations | default(100) | tojson }},
      "clients": {{ search_clients | default(1)}}
    }
  ]
}
Loading

0 comments on commit 875dd1d

Please sign in to comment.