From 875dd1ddc4391d32e986d63c69fef6a9e3975049 Mon Sep 17 00:00:00 2001
From: Martin Gaievski
Date: Sun, 14 Jul 2024 22:03:52 +0000
Subject: [PATCH] Adding semantic search workload that includes vector and bm25 search

Signed-off-by: Martin Gaievski
---
 trec_covid_semantic_search/README.md | 233 ++++++++++++++++++
 trec_covid_semantic_search/index.json | 46 ++++
 .../operations/default.json | 172 +++++++++++++
 trec_covid_semantic_search/params/params.json | 12 +
 .../test_procedures/procedures.json | 162 ++++++++++++
 trec_covid_semantic_search/workload.json | 30 +++
 trec_covid_semantic_search/workload.py | 183 ++++++++++++++
 .../workload_queries_knn.json | 6 +
 8 files changed, 844 insertions(+)
 create mode 100644 trec_covid_semantic_search/README.md
 create mode 100644 trec_covid_semantic_search/index.json
 create mode 100644 trec_covid_semantic_search/operations/default.json
 create mode 100644 trec_covid_semantic_search/params/params.json
 create mode 100644 trec_covid_semantic_search/test_procedures/procedures.json
 create mode 100644 trec_covid_semantic_search/workload.json
 create mode 100644 trec_covid_semantic_search/workload.py
 create mode 100644 trec_covid_semantic_search/workload_queries_knn.json

diff --git a/trec_covid_semantic_search/README.md b/trec_covid_semantic_search/README.md
new file mode 100644
index 00000000..93fa67e0
--- /dev/null
+++ b/trec_covid_semantic_search/README.md
@@ -0,0 +1,233 @@
+# Semantic Search Workload
+
+This workload benchmarks the performance of semantic search queries in OpenSearch. Ingested documents will have embeddings that are generated during the ingestion process by one of the pre-trained models.
+
+## Datasets
+
+We use a processed version of the trec-covid dataset. Trec-Covid is a dataset collection of documents about COVID-19 information.
+
+- Trec-Covid website: https://ir.nist.gov/covidSubmit/index.html
+- Dataset: https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/trec-covid.zip
+
+We processed the dataset by creating 6 copies of the same document and shuffling the copies so they are ingested in random order. We create a custom artifact for queries by extracting the queries portion from the original `trec-covid` dataset and generating vector embeddings for the query text using a 768-dimension vector, the same dimensions as used for document ingestion.
+
+### Example Document
+
+The following is an example of a document that is being ingested during indexing:
+
+```json
+{
+    "title": "Simultaneous Video-EEG-ECG Monitoring to Identify Neurocardiac Dysfunction in Mouse Models of Epilepsy.",
+    "metadata": {
+        "url": "https://doi.org/10.3791/57300; https://www.ncbi.nlm.nih.gov/pubmed/29443088/",
+        "pubmed_id": "29443088"
+    }
+}
+```
+
+The following is an example of a query:
+
+```json
+{
+    "_id": "1",
+    "query": "what is the origin of COVID-19",
+    "vector_embedding": [
+        -0.06979332,
+        0.05764826,
+        ...
+    ]
+}
+
+```
+
+## Parameters
+
+This workload allows the following parameters to be specified using `--workload-params`:
+
+* `bulk_size` (default: 100)
+* `bulk_indexing_clients` (default: 1): Number of clients that issue bulk indexing requests.
+* `ingest_percentage` (default: 100): A number between 0 and 100 that defines how much of the document corpus should be ingested.
+* `number_of_replicas` (default: 0)
+* `number_of_shards` (default: 1)
+* `query_cache_enabled` (default: false)
+* `requests_cache_enabled` (default: false)
+* `source_enabled` (default: true): A boolean defining whether the `_source` field is stored in the index.
+* `force_merge_max_num_segments` (default: unset): An integer specifying the max amount of segments the force-merge operation should use.
+* `index_settings`: A list of index settings. Index settings defined elsewhere (e.g. `number_of_replicas`) need to be overridden explicitly.
+* `cluster_health` (default: "green"): The minimum required cluster health.
+* `error_level` (default: "non-fatal"): Available for bulk operations only to specify ignore-response-error-level.
+* `target_throughput` (default: default values for each operation): Number of requests per second, `""` for no limit.
+* `search_clients`: Number of clients that issue search requests.
+* `model_name` (default: huggingface/sentence-transformers/all-mpnet-base-v2): OpenSearch-provided pretrained model name.
+* `model_version` (default: 1.0.1): Model version.
+* `model_format` (default: TORCH_SCRIPT): Model format.
+* `dimensions` (default: 768): Vector dimensions, needed to match the model.
+* `engine` (default: `lucene`): The approximate k-NN library to use for indexing and search.
+* `method` (default: `hnsw`): K-NN search algorithm.
+* `space_type` (default: `l2`): The vector space used to calculate the distance between vectors.
+* `k` (default: 10): Number of nearest neighbors to return.
+* `warmup_iterations`: Number of warmup iterations each search client executes.
+* `iterations`: Number of test iterations each search client executes.
+* `num_variable_queries` (default: 0): Number of variable queries to be used for the semantic search task; 0 means a fixed query, and the max value is 50.
+* `range_gte` (default: 100): Number that defines the lower bound (inclusive) for the range query when it's used as an element in a semantic search query.
+* `range_lte` (default: 10000000): Number that defines the upper bound (inclusive) for the range query when it's used as an element in a semantic search query.
+
+### Running a benchmark
+
+Before running a benchmark, ensure that the load generation host is able to access your cluster endpoint and that the
+appropriate dataset is available on the host.
+
+Currently, we support 2 test procedures for the semantic search workload. The default procedure is `create-index-ingest-data-search`, which creates an index, ingests data and runs a base set of search queries.
+
+To run the default workload, invoke the following command.
+
+```
+# OpenSearch Cluster endpoint URL with hostname and port
+export ENDPOINT=
+# Absolute file path of Workload file
+export WORKLOAD_FILE=
+
+opensearch-benchmark execute-test \
+    --workload-path="/opensearch-benchmark-workloads/trec_covid_semantic_search/" \
+    --workload-params="/trec_covid_semantic_search/params/params.json" \
+    --pipeline=benchmark-only \
+    --target-host=$ENDPOINT \
+    --kill-running-processes \
+    --test-procedure="search"
+```
+
+## Current Procedures
+
+### Create index with data
+
+This procedure creates an index, deploys the model locally, creates a pipeline with ingest and search processors, and ingests documents. At the end we run the match_all query that returns all documents in the index.
+Procedure name `create-index-ingest-data-search`.
+This is the default procedure for this workload.
+
+### Run semantic search queries
+
+This search procedure runs semantic search queries: neural, hybrid. It deletes and deploys an ml model, creates a processor, and uses this model to generate search-specific embeddings.
+Procedure name `search`.
+
+#### Sample Output
+
+The output of a sample test run is provided below. Metrics are captured in the result's data store as usual, and this can be configured to be
+either in-memory, or an external OpenSearch cluster.
+
+```
+
+ ____ _____ __ ____ __ __
+ / __ \____ ___ ____ / ___/___ ____ ___________/ /_ / __ )___ ____ _____/ /_ ____ ___ ____ ______/ /__
+ / / / / __ \/ _ \/ __ \\__ \/ _ \/ __ `/ ___/ ___/ __ \ / __ / _ \/ __ \/ ___/ __ \/ __ `__ \/ __ `/ ___/ //_/
+/ /_/ / /_/ / __/ / / /__/ / __/ /_/ / / / /__/ / / / / /_/ / __/ / / / /__/ / / / / / / / / /_/ / / / ,<
+\____/ .___/\___/_/ /_/____/\___/\__,_/_/ \___/_/ /_/ /_____/\___/_/ /_/\___/_/ /_/_/ /_/ /_/\__,_/_/ /_/|_|
+ /_/
+
+[INFO] Executing test with workload [workload], test_procedure [hybrid-query-aggs-light] and provision_config_instance ['external'] with version [2.14.0].
+ +[WARNING] indexing_total_time is 11 ms indicating that the cluster is not in a defined clean state. Recorded index time metrics may be misleading. +[WARNING] refresh_total_time is 27 ms indicating that the cluster is not in a defined clean state. Recorded index time metrics may be misleading. +[WARNING] flush_total_time is 11 ms indicating that the cluster is not in a defined clean state. Recorded index time metrics may be misleading. + +Running delete-ml-model [100% done] +Running register-ml-model [100% done] +Running deploy-ml-model [100% done] +Running create-normalization-processor-no-weights-search-pipeline [100% done] +Running semantic-search-neural [100% done] +Running semantic-search-hybrid-bm25-and-neural-search [100% done] +Running semantic-search-hybrid-bm25-range-and-neural-search [100% done] +------------------------------------------------------ + _______ __ _____ + / ____(_)___ ____ _/ / / ___/_________ ________ + / /_ / / __ \/ __ `/ / \__ \/ ___/ __ \/ ___/ _ \ + / __/ / / / / / /_/ / / ___/ / /__/ /_/ / / / __/ +/_/ /_/_/ /_/\__,_/_/ /____/\___/\____/_/ \___/ +| Cumulative merge time of primary shards | | 1.05333 | min | +| Cumulative merge count of primary shards | | 58 | | +| Min cumulative merge time across primary shards | | 0 | min | +| Median cumulative merge time across primary shards | | 0.02405 | min | +| Max cumulative merge time across primary shards | | 0.740383 | min | +| Cumulative merge throttle time of primary shards | | 0.718733 | min | +| Min cumulative merge throttle time across primary shards | | 0 | min | +| Median cumulative merge throttle time across primary shards | | 0 | min | +| Max cumulative merge throttle time across primary shards | | 0.718733 | min | +| Cumulative refresh time of primary shards | | 15.7122 | min | +| Cumulative refresh count of primary shards | | 877 | | +| Min cumulative refresh time across primary shards | | 0 | min | +| Median cumulative refresh time across primary shards | | 1.64122 | min | 
+| Max cumulative refresh time across primary shards | | 3.16232 | min | +| Cumulative flush time of primary shards | | 27.8492 | min | +| Cumulative flush count of primary shards | | 35 | | +| Min cumulative flush time across primary shards | | 0 | min | +| Median cumulative flush time across primary shards | | 2.21 | min | +| Max cumulative flush time across primary shards | | 5.80563 | min | +| Total Young Gen GC time | | 0.193 | s | +| Total Young Gen GC count | | 10 | | +| Total Old Gen GC time | | 0 | s | +| Total Old Gen GC count | | 0 | | +| Store size | | 30.2634 | GB | +| Translog size | | 0.0721769 | GB | +| Heap used for segments | | 0 | MB | +| Heap used for doc values | | 0 | MB | +| Heap used for terms | | 0 | MB | +| Heap used for norms | | 0 | MB | +| Heap used for points | | 0 | MB | +| Heap used for stored fields | | 0 | MB | +| Segment count | | 225 | | +| Min Throughput | semantic-search-neural | 27.82 | ops/s | +| Mean Throughput | semantic-search-neural | 33.37 | ops/s | +| Median Throughput | semantic-search-neural | 34.08 | ops/s | +| Max Throughput | semantic-search-neural | 35.37 | ops/s | +| 50th percentile latency | semantic-search-neural | 211.371 | ms | +| 90th percentile latency | semantic-search-neural | 230.603 | ms | +| 99th percentile latency | semantic-search-neural | 248.195 | ms | +| 100th percentile latency | semantic-search-neural | 260.313 | ms | +| 50th percentile service time | semantic-search-neural | 211.371 | ms | +| 90th percentile service time | semantic-search-neural | 230.603 | ms | +| 99th percentile service time | semantic-search-neural | 248.195 | ms | +| 100th percentile service time | semantic-search-neural | 260.313 | ms | +| error rate | semantic-search-neural | 0 | % | +| Min Throughput | semantic-search-hybrid-bm25-and-neural-search | 35.62 | ops/s | +| Mean Throughput | semantic-search-hybrid-bm25-and-neural-search | 36.61 | ops/s | +| Median Throughput | semantic-search-hybrid-bm25-and-neural-search | 
36.74 | ops/s | +| Max Throughput | semantic-search-hybrid-bm25-and-neural-search | 36.96 | ops/s | +| 50th percentile latency | semantic-search-hybrid-bm25-and-neural-search | 212.232 | ms | +| 90th percentile latency | semantic-search-hybrid-bm25-and-neural-search | 228.367 | ms | +| 99th percentile latency | semantic-search-hybrid-bm25-and-neural-search | 252.348 | ms | +| 100th percentile latency | semantic-search-hybrid-bm25-and-neural-search | 270.056 | ms | +| 50th percentile service time | semantic-search-hybrid-bm25-and-neural-search | 212.232 | ms | +| 90th percentile service time | semantic-search-hybrid-bm25-and-neural-search | 228.367 | ms | +| 99th percentile service time | semantic-search-hybrid-bm25-and-neural-search | 252.348 | ms | +| 100th percentile service time | semantic-search-hybrid-bm25-and-neural-search | 270.056 | ms | +| error rate | semantic-search-hybrid-bm25-and-neural-search | 0 | % | +| Min Throughput | semantic-search-hybrid-bm25-range-and-neural-search | 34.9 | ops/s | +| Mean Throughput | semantic-search-hybrid-bm25-range-and-neural-search | 36.06 | ops/s | +| Median Throughput | semantic-search-hybrid-bm25-range-and-neural-search | 36.23 | ops/s | +| Max Throughput | semantic-search-hybrid-bm25-range-and-neural-search | 36.62 | ops/s | +| 50th percentile latency | semantic-search-hybrid-bm25-range-and-neural-search | 213.919 | ms | +| 90th percentile latency | semantic-search-hybrid-bm25-range-and-neural-search | 231.788 | ms | +| 99th percentile latency | semantic-search-hybrid-bm25-range-and-neural-search | 248.793 | ms | +| 100th percentile latency | semantic-search-hybrid-bm25-range-and-neural-search | 265.484 | ms | +| 50th percentile service time | semantic-search-hybrid-bm25-range-and-neural-search | 213.919 | ms | +| 90th percentile service time | semantic-search-hybrid-bm25-range-and-neural-search | 231.788 | ms | +| 99th percentile service time | semantic-search-hybrid-bm25-range-and-neural-search | 248.793 | ms | +| 
100th percentile service time | semantic-search-hybrid-bm25-range-and-neural-search | 265.484 | ms | +| error rate | semantic-search-hybrid-bm25-range-and-neural-search | 0 | % | + + +--------------------------------- +[INFO] SUCCESS (took 164 seconds) +--------------------------------- +``` + +## License + +Following license used by original dataset and we're using it too. +``` + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ +``` +Covid-trec [1] is part of the COVID-19 Open Research dataset [2], which is licensed under Apache 2.0. +[1] https://arxiv.org/pdf/2005.04474v1.pdf +[2] https://github.com/allenai/cord19/ diff --git a/trec_covid_semantic_search/index.json b/trec_covid_semantic_search/index.json new file mode 100644 index 00000000..b3a10a2b --- /dev/null +++ b/trec_covid_semantic_search/index.json @@ -0,0 +1,46 @@ +{ + "settings": { + "index.number_of_shards": {{number_of_shards | default(1)}}, + "index.number_of_replicas": {{number_of_replicas | default(0)}}, + "index.queries.cache.enabled": {{query_cache_enabled | default(false) | tojson}}, + "index.requests.cache.enable": {{requests_cache_enabled | default(false) | tojson}}, + "index.merge.policy.max_merged_segment": "100GB", + "index.knn": true, + "default_pipeline": "nlp-ingest-pipeline" + }, + "mappings": { + "dynamic": "true", + "_source": { + "enabled": {{ source_enabled | default(true) | tojson }} + }, + "properties": { + "title": { + "type": "text" + }, + "metadata": { + "type": "nested", + "properties": { + "url": { + "type": "text" + }, + "pubmed_id": { + "type": "integer" + } + } + }, + "passage_embedding": { + "type": "knn_vector", + "dimension": 768, + "method": { + "name": "hnsw", + "space_type": "innerproduct", + "engine": "faiss", + "parameters": { + "ef_construction": 256, + "m": 256 + } + } + } + } + } +} diff --git a/trec_covid_semantic_search/operations/default.json b/trec_covid_semantic_search/operations/default.json new file mode 100644 index 
00000000..9821be46 --- /dev/null +++ b/trec_covid_semantic_search/operations/default.json @@ -0,0 +1,172 @@ +{ + "name": "index", + "operation-type": "bulk", + "bulk-size": {{bulk_size | default(100)}}, + "ingest-percentage": {{ingest_percentage | default(100)}} +}, +{ + "name": "delete-ingest-pipeline", + "operation-type": "delete-pipeline", + "id": "nlp-ingest-pipeline" + }, + { + "name": "create-ingest-pipeline", + "operation-type": "put-pipeline", + "param-source": "create-ingest-pipeline", + "id": "nlp-ingest-pipeline", + "body": { + "description": "An NLP ingest pipeline", + "processors": [ + { + "text_embedding": { + "model_id": "", + "field_map": { + "title": "passage_embedding" + } + } + } + ] + } + }, + { + "name": "index-append", + "operation-type": "bulk", + "bulk-size": {{bulk_size | default(100)}}, + "ingest-percentage": {{ingest_percentage | default(100)}} + }, + { + "name": "default", + "operation-type": "search", + "body": { + "query": { + "match_all": {} + } + } + }, + { + "name": "semantic-search-neural", + "operation-type": "search", + "variable-queries": {{variable_queries | default(0)}}, + "param-source": "semantic-search-neural-source", + "body": { + "_source": { + "excludes": [ + "passage_embedding" + ] + }, + "query": { + "neural": { + "passage_embedding": { + "query_text": "what types of rapid testing for Covid-19 have been developed?", + "model_id": "", + "k": {{k | default(10)}} + } + } + } + } + }, + { + "name": "create-normalization-processor-no-weights-search-pipeline", + "operation-type": "create-search-pipeline", + "id": "nlp-min-max-arithmetic-search-pipeline", + "body": { + "description": "Post processor for hybrid search with min_max normalization and arithmetic_mean combination", + "phase_results_processors": [ + { + "normalization-processor": { + "normalization": { + "technique": "min_max" + }, + "combination": { + "technique": "arithmetic_mean" + } + } + } + ] + } + }, + { + "name": 
"semantic-search-hybrid-bm25-and-neural-search", + "operation-type": "search", + "request-params": { + "search_pipeline": "nlp-min-max-arithmetic-search-pipeline" + }, + "variable-queries": {{variable_queries | default(0)}}, + "param-source": "hybrid-query-bm25-neural-search-source", + "body": { + "_source": { + "excludes": [ + "passage_embedding" + ] + }, + "query": { + "hybrid": { + "queries": [ + { + "match": { + "title": "" + } + }, + { + "neural": { + "passage_embedding": { + "query_text": "what types of rapid testing for Covid-19 have been developed?", + "model_id": "", + "k": {{k | default(10)}} + } + } + } + ] + } + } + } + }, + { + "name": "semantic-search-hybrid-bm25-range-and-neural-search", + "operation-type": "search", + "request-params": { + "search_pipeline": "nlp-min-max-arithmetic-search-pipeline" + }, + "variable-queries": {{variable_queries | default(0)}}, + "param-source": "hybrid-query-bm25-neural-search-source", + "body": { + "_source": { + "excludes": [ + "passage_embedding" + ] + }, + "query": { + "hybrid": { + "queries": [ + { + "match": { + "title": "" + } + }, + { + "neural": { + "passage_embedding": { + "query_text": "what types of rapid testing for Covid-19 have been developed?", + "model_id": "", + "k": {{k | default(10)}} + } + } + }, + { + "nested": { + "path": "metadata", + "query": { + "range": { + "metadata.pubmed_id": { + "gte": {{range_gte | default(100)}}, + "lte": {{range_lte | default(10000000)}} + } + } + } + } + } + ] + } + } + } + } diff --git a/trec_covid_semantic_search/params/params.json b/trec_covid_semantic_search/params/params.json new file mode 100644 index 00000000..310565b0 --- /dev/null +++ b/trec_covid_semantic_search/params/params.json @@ -0,0 +1,12 @@ +{ + "bulk_indexing_clients": 2, + "bulk_size": 100, + "number_of_replicas": 1, + "number_of_shards" :8, + "ingest_percentage":100, + "search_clients": 8, + "warmup_iterations": 20, + "iterations": 100, + "variable_queries": 50, + "k": 100 +} diff --git 
a/trec_covid_semantic_search/test_procedures/procedures.json b/trec_covid_semantic_search/test_procedures/procedures.json new file mode 100644 index 00000000..8c5b5f6d --- /dev/null +++ b/trec_covid_semantic_search/test_procedures/procedures.json @@ -0,0 +1,162 @@ +{ + "name": "create-index-ingest-data-search", + "description": "Indexes the whole document corpus using OpenSearch default settings. After that several query groups are run.", + "default": true, + "schedule": [ + { + "name": "cluster-settings", + "operation": { + "operation-type": "put-settings", + "body": { + "persistent": { + "plugins": { + "ml_commons": { + "only_run_on_ml_node": "false", + "native_memory_threshold": "100", + "allow_registering_model_via_local_file": "true", + "allow_registering_model_via_url": "true" + } + } + } + } + } + }, + { + "operation": "delete-index" + }, + { + "operation": "delete-ingest-pipeline" + }, + { + "operation": { + "operation-type": "delete-ml-model", + "model-name": "{{ model_name | default('huggingface/sentence-transformers/all-mpnet-base-v2')}}" + } + }, + { + "operation": { + "operation-type": "register-ml-model", + "model-name": "{{ model_name | default('huggingface/sentence-transformers/all-mpnet-base-v2')}}", + "model-version": "{{ model_version | default('1.0.1') }}", + "model-format": "{{ model_format | default('TORCH_SCRIPT') }}", + "model-config-file": "{{ model_config_file | default('') }}" + } + }, + { + "operation": "deploy-ml-model" + }, + { + "operation": "create-ingest-pipeline" + }, + { + "operation": { + "operation-type": "create-index", + "settings": {%- if index_settings is defined %} {{index_settings | tojson}} {%- else %} { + "index.number_of_shards": {{number_of_shards | default(3)}}, + "index.number_of_replicas": {{number_of_replicas | default(0)}}, + "index.store.type": "{{store_type | default('fs')}}" + }{%- endif %} + } + }, + { + "name": "check-cluster-health-before-index-creation", + "operation": { + "operation-type": 
"cluster-health", + "index": "trec-covid", + "request-params": { + "wait_for_status": "{{cluster_health | default('green')}}", + "wait_for_no_relocating_shards": "true" + }, + "retry-until-success": true + } + }, + { + "operation": "index-append", + "warmup-time-period": 60, + "clients": {{bulk_indexing_clients | default(1)}}, + "ignore-response-error-level": "{{error_level | default('non-fatal')}}" + }, + { + "name": "refresh-after-index-created", + "operation": "refresh" + }, + { + "operation": { + "operation-type": "force-merge", + "request-timeout": 7200{%- if force_merge_max_num_segments is defined %}, + "max-num-segments": {{ force_merge_max_num_segments | tojson }} + {%- endif %} + } + }, + { + "name": "refresh-after-force-merge", + "operation": "refresh" + }, + { + "name": "wait-until-merges-finish", + "operation": { + "operation-type": "index-stats", + "index": "_all", + "condition": { + "path": "_all.total.merges.current", + "expected-value": 0 + }, + "retry-until-success": true, + "include-in-reporting": false + } + }, + { + "operation": "default", + "warmup-iterations": {{warmup_iterations | default(500) | tojson}}, + "iterations": {{iterations | default(500) | tojson }}, + "target-throughput": {{ target_throughput | default(100) | tojson}}, + "clients": {{ search_clients | default(1) }} + } + ] +}, +{ + "name": "search", + "description": "Run semantic search work.", + "default": false, + "schedule": [ + { + "operation": { + "operation-type": "delete-ml-model", + "model-name": "{{ model_name | default('huggingface/sentence-transformers/all-mpnet-base-v2')}}" + } + }, + { + "operation": { + "operation-type": "register-ml-model", + "model-name": "{{ model_name | default('huggingface/sentence-transformers/all-mpnet-base-v2')}}", + "model-version": "{{ model_version | default('1.0.1') }}", + "model-format": "{{ model_format | default('TORCH_SCRIPT') }}", + "model-config-file": "{{ model_config_file | default('') }}" + } + }, + { + "operation": 
"deploy-ml-model"
+      },
+      {
+        "operation": "create-normalization-processor-no-weights-search-pipeline"
+      },
+      {
+        "operation": "semantic-search-neural",
+        "warmup-iterations": {{warmup_iterations | default(50) | tojson}},
+        "iterations": {{iterations | default(100) | tojson }},
+        "clients": {{ search_clients | default(1)}}
+      },
+      {
+        "operation": "semantic-search-hybrid-bm25-and-neural-search",
+        "warmup-iterations": {{warmup_iterations | default(50) | tojson}},
+        "iterations": {{iterations | default(100) | tojson }},
+        "clients": {{ search_clients | default(1)}}
+      },
+      {
+        "operation": "semantic-search-hybrid-bm25-range-and-neural-search",
+        "warmup-iterations": {{warmup_iterations | default(50) | tojson}},
+        "iterations": {{iterations | default(100) | tojson }},
+        "clients": {{ search_clients | default(1)}}
+      }
+  ]
+}
\ No newline at end of file
diff --git a/trec_covid_semantic_search/workload.json b/trec_covid_semantic_search/workload.json
new file mode 100644
index 00000000..b8eedc27
--- /dev/null
+++ b/trec_covid_semantic_search/workload.json
@@ -0,0 +1,30 @@
+{% import "benchmark.helpers" as benchmark with context %}
+
+{
+  "version": 2,
+  "description": "Benchmark performance of semantic search queries based on the trec-covid dataset of COVID-19 research documents",
+  "indices": [
+    {
+      "name": "trec-covid",
+      "body": "index.json"
+    }
+  ],
+  "corpora": [
+    {
+      "name": "trec-covid",
+      "base-url": "https://github.com/martin-gaievski/neural-search/releases/download/trec_covid_dataset_1M_v1",
+      "documents": [
+        {
+          "source-file": "documents.json.zip",
+          "document-count": 1027950
+        }
+      ]
+    }
+  ],
+  "operations": [
+    {{ benchmark.collect(parts="operations/*.json") }}
+  ],
+  "test_procedures": [
+    {{ benchmark.collect(parts="test_procedures/*.json") }}
+  ]
+}
diff --git a/trec_covid_semantic_search/workload.py b/trec_covid_semantic_search/workload.py
new file mode 100644
index 00000000..64932c5d
--- /dev/null
+++ b/trec_covid_semantic_search/workload.py
@@ -0,0
+1,183 @@ +import random +import os +import json +from pathlib import Path + +from osbenchmark.workload.loader import Downloader +from osbenchmark.workload.loader import Decompressor +from osbenchmark.workload.loader import Decompressor + +script_dir = os.path.dirname(os.path.realpath(__file__)) + +def ingest_pipeline_param_source(workload, params, **kwargs): + model_id = params['body']['processors'][0]['text_embedding']['model_id'] + if not model_id: + with open('model_id.json') as f: + d = json.loads(f.read()) + model_id = d['model_id'] + params['body']['processors'][0]['text_embedding']['model_id'] = model_id + return params + +class QueryParamSourceNeural: + def __init__(self, workload, params, **kwargs): + if len(workload.indices) == 1: + index = workload.indices[0].name + if len(workload.indices[0].types) == 1: + type = workload.indices[0].types[0].name + else: + type = None + else: + index = "_all" + type = None + + self._params = params + self._params['index'] = index + self._params['type'] = type + self._params['variable-queries'] = params.get("variable-queries", 0) + self.infinite = True + + if self._params['variable-queries'] > 0: + with open(script_dir + os.sep + 'workload_queries_knn.json', 'r') as f: + d = json.loads(f.read()) + source_file = d['source-file'] + base_url = d['base-url'] + compressed_bytes = d['compressed-bytes'] + uncompressed_bytes = d['uncompressed-bytes'] + compressed_path = script_dir + os.sep + source_file + uncompressed_path = script_dir + os.sep + Path(source_file).stem + if not os.path.exists(compressed_path): + downloader = Downloader(False, False) + downloader.download(base_url, None, compressed_path, compressed_bytes) + if not os.path.exists(uncompressed_path): + decompressor = Decompressor() + decompressor.decompress(compressed_path, uncompressed_path, uncompressed_bytes) + + def partition(self, partition_index, total_partitions): + return self + + def params(self): + params = self._params + with open('model_id.json', 'r') 
as f: + d = json.loads(f.read()) + params['body']['query']['neural']['passage_embedding']['model_id'] = d['model_id'] + count = self._params.get("variable-queries", 0) + if count > 0: + script_dir = os.path.dirname(os.path.realpath(__file__)) + with open(script_dir + '/queries.json', 'r') as f: + lines = f.read().splitlines() + line =random.choice(lines) + query_text = json.loads(line)['query'] + params['body']['query']['neural']['passage_embedding']['query_text'] = query_text + + return params + +class QueryParamSourceHybridBm25: + def __init__(self, workload, params, **kwargs): + if len(workload.indices) == 1: + index = workload.indices[0].name + if len(workload.indices[0].types) == 1: + type = workload.indices[0].types[0].name + else: + type = None + else: + index = "_all" + type = None + + self._params = params + self._params['index'] = index + self._params['type'] = type + self._params['variable-queries'] = params.get("variable-queries", 0) + self.infinite = True + + if self._params['variable-queries'] > 0: + with open(script_dir + os.sep + 'workload_queries_knn.json', 'r') as f: + d = json.loads(f.read()) + source_file = d['source-file'] + base_url = d['base-url'] + compressed_bytes = d['compressed-bytes'] + uncompressed_bytes = d['uncompressed-bytes'] + compressed_path = script_dir + os.sep + source_file + uncompressed_path = script_dir + os.sep + Path(source_file).stem + if not os.path.exists(compressed_path): + downloader = Downloader(False, False) + downloader.download(base_url, None, compressed_path, compressed_bytes) + if not os.path.exists(uncompressed_path): + decompressor = Decompressor() + decompressor.decompress(compressed_path, uncompressed_path, uncompressed_bytes) + + def partition(self, partition_index, total_partitions): + return self + + def params(self): + params = self._params + count = self._params.get("variable-queries", 0) + if count > 0: + script_dir = os.path.dirname(os.path.realpath(__file__)) + with open(script_dir + '/queries.json', 
'r') as f: + lines = f.read().splitlines() + line =random.choice(lines) + query_text = json.loads(line)['query'] + match_query = random.choice(query_text.split()).lower() + params['body']['query']['hybrid']['queries'][0]['match']['title'] = match_query + return params + +class QueryParamSourceHybridBm25Neural: + def __init__(self, workload, params, **kwargs): + if len(workload.indices) == 1: + index = workload.indices[0].name + if len(workload.indices[0].types) == 1: + type = workload.indices[0].types[0].name + else: + type = None + else: + index = "_all" + type = None + + self._params = params + self._params['index'] = index + self._params['type'] = type + self._params['variable-queries'] = params.get("variable-queries", 0) + self.infinite = True + + if self._params['variable-queries'] > 0: + with open(script_dir + os.sep + 'workload_queries_knn.json', 'r') as f: + d = json.loads(f.read()) + source_file = d['source-file'] + base_url = d['base-url'] + compressed_bytes = d['compressed-bytes'] + uncompressed_bytes = d['uncompressed-bytes'] + compressed_path = script_dir + os.sep + source_file + uncompressed_path = script_dir + os.sep + Path(source_file).stem + if not os.path.exists(compressed_path): + downloader = Downloader(False, False) + downloader.download(base_url, None, compressed_path, compressed_bytes) + if not os.path.exists(uncompressed_path): + decompressor = Decompressor() + decompressor.decompress(compressed_path, uncompressed_path, uncompressed_bytes) + + def partition(self, partition_index, total_partitions): + return self + + def params(self): + params = self._params + count = self._params.get("variable-queries", 0) + if count > 0: + script_dir = os.path.dirname(os.path.realpath(__file__)) + model_id = '' + with open('model_id.json', 'r') as f: + d = json.loads(f.read()) + model_id = d['model_id'] + with open(script_dir + '/queries.json', 'r') as f: + lines = f.read().splitlines() + line =random.choice(lines) + query_text = json.loads(line)['query'] + 
match_query = random.choice(query_text.split()).lower() + params['body']['query']['hybrid']['queries'][0]['match']['title'] = match_query + params['body']['query']['hybrid']['queries'][1]['neural']['passage_embedding']['model_id'] = model_id + params['body']['query']['hybrid']['queries'][1]['neural']['passage_embedding']['query_text'] = query_text + return params + +def register(registry): + registry.register_param_source("semantic-search-neural-source", QueryParamSourceNeural) + registry.register_param_source("hybrid-query-bm25-neural-search-source", QueryParamSourceHybridBm25Neural) + registry.register_param_source("create-ingest-pipeline", ingest_pipeline_param_source) \ No newline at end of file diff --git a/trec_covid_semantic_search/workload_queries_knn.json b/trec_covid_semantic_search/workload_queries_knn.json new file mode 100644 index 00000000..e9073cfe --- /dev/null +++ b/trec_covid_semantic_search/workload_queries_knn.json @@ -0,0 +1,6 @@ +{ + "base-url": "https://github.com/martin-gaievski/neural-search/releases/download/trec_covid_queries_knn", + "source-file": "queries.json.zip", + "compressed-bytes" : 98855, + "uncompressed-bytes": 260018 +}