Skip to content

Commit

Permalink
Adding semantic search workload that includes vector and bm25 search
Browse files Browse the repository at this point in the history
Signed-off-by: Martin Gaievski <gaievski@amazon.com>
  • Loading branch information
martin-gaievski committed Jul 15, 2024
1 parent 411e304 commit 43bc7ff
Show file tree
Hide file tree
Showing 8 changed files with 920 additions and 0 deletions.
265 changes: 265 additions & 0 deletions trec_covid_semantic_search/README.md

Large diffs are not rendered by default.

46 changes: 46 additions & 0 deletions trec_covid_semantic_search/index.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
{
  "settings": {
    "index.number_of_shards": {{ number_of_shards | default(1) }},
    "index.number_of_replicas": {{ number_of_replicas | default(0) }},
    "index.queries.cache.enabled": {{ query_cache_enabled | default(false) | tojson }},
    "index.requests.cache.enable": {{ requests_cache_enabled | default(false) | tojson }},
    "index.merge.policy.max_merged_segment": "100GB",
    "index.knn": true,
    "default_pipeline": "nlp-ingest-pipeline"
  },
  "mappings": {
    "dynamic": "true",
    "_source": {
      "enabled": {{ source_enabled | default(true) | tojson }}
    },
    "properties": {
      "title": {
        "type": "text"
      },
      "metadata": {
        "type": "nested",
        "properties": {
          "url": {
            "type": "text"
          },
          "pubmed_id": {
            "type": "integer"
          }
        }
      },
      "passage_embedding": {
        "type": "knn_vector",
        "dimension": 768,
        "method": {
          "name": "hnsw",
          "space_type": "innerproduct",
          "engine": "faiss",
          "parameters": {
            "ef_construction": {{ hnsw_ef_construction | default(256) }},
            "m": {{ hnsw_m | default(256) }}
          }
        }
      }
    }
  }
}
207 changes: 207 additions & 0 deletions trec_covid_semantic_search/operations/default.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,207 @@
{
  "name": "index",
  "operation-type": "bulk",
  "bulk-size": {{ bulk_size | default(100) }},
  "ingest-percentage": {{ ingest_percentage | default(100) }}
},
{
  "name": "delete-ingest-pipeline",
  "operation-type": "delete-pipeline",
  "id": "nlp-ingest-pipeline"
},
{
  "name": "create-ingest-pipeline",
  "operation-type": "put-pipeline",
  "param-source": "create-ingest-pipeline",
  "id": "nlp-ingest-pipeline",
  "body": {
    "description": "An NLP ingest pipeline",
    "processors": [
      {
        "text_embedding": {
          "model_id": "",
          "field_map": { "title": "passage_embedding" }
        }
      }
    ]
  }
},
{
  "name": "index-append",
  "operation-type": "bulk",
  "bulk-size": {{ bulk_size | default(100) }},
  "ingest-percentage": {{ ingest_percentage | default(100) }}
},
{
  "name": "default",
  "operation-type": "search",
  "body": {
    "query": { "match_all": {} }
  }
},
{
  "name": "semantic-search-neural",
  "operation-type": "search",
  "variable-queries": {{ variable_queries | default(0) }},
  "param-source": "semantic-search-neural-source",
  "body": {
    "_source": {
      "excludes": ["passage_embedding"]
    },
    "query": {
      "neural": {
        "passage_embedding": {
          "query_text": "what types of rapid testing for Covid-19 have been developed?",
          "model_id": "",
          "k": {{ k | default(10) }}
        }
      }
    }
  }
},
{
  "name": "create-normalization-processor-no-weights-search-pipeline",
  "operation-type": "create-search-pipeline",
  "id": "nlp-min-max-arithmetic-search-pipeline",
  "body": {
    "description": "Post processor for hybrid search with min_max normalization and arithmetic_mean combination",
    "phase_results_processors": [
      {
        "normalization-processor": {
          "normalization": { "technique": "min_max" },
          "combination": { "technique": "arithmetic_mean" }
        }
      }
    ]
  }
},
{
  "name": "semantic-search-hybrid-bm25-and-neural-search",
  "operation-type": "search",
  "request-params": {
    "search_pipeline": "nlp-min-max-arithmetic-search-pipeline"
  },
  "variable-queries": {{ variable_queries | default(0) }},
  "param-source": "hybrid-query-bm25-neural-search-source",
  "body": {
    "_source": {
      "excludes": ["passage_embedding"]
    },
    "query": {
      "hybrid": {
        "queries": [
          {
            "match": { "title": "" }
          },
          {
            "neural": {
              "passage_embedding": {
                "query_text": "what types of rapid testing for Covid-19 have been developed?",
                "model_id": "",
                "k": {{ k | default(10) }}
              }
            }
          }
        ]
      }
    }
  }
},
{
  "name": "semantic-search-hybrid-bm25-and-knn-search",
  "operation-type": "search",
  "request-params": {
    "search_pipeline": "nlp-min-max-arithmetic-search-pipeline"
  },
  "variable-queries": {{ variable_queries | default(0) }},
  "param-source": "hybrid-query-bm25-knn-search-source",
  "body": {
    "_source": {
      "excludes": ["passage_embedding"]
    },
    "query": {
      "hybrid": {
        "queries": [
          {
            "match": { "title": "" }
          },
          {
            "knn": {
              "passage_embedding": {
                "vector": "[1, 2, 3]",
                "k": {{ k | default(100) }}
              }
            }
          }
        ]
      }
    }
  }
},
{
  "name": "semantic-search-hybrid-bm25-range-and-neural-search",
  "operation-type": "search",
  "request-params": {
    "search_pipeline": "nlp-min-max-arithmetic-search-pipeline"
  },
  "variable-queries": {{ variable_queries | default(0) }},
  "param-source": "hybrid-query-bm25-neural-search-source",
  "body": {
    "_source": {
      "excludes": ["passage_embedding"]
    },
    "query": {
      "hybrid": {
        "queries": [
          {
            "match": { "title": "" }
          },
          {
            "neural": {
              "passage_embedding": {
                "query_text": "what types of rapid testing for Covid-19 have been developed?",
                "model_id": "",
                "k": {{ k | default(10) }}
              }
            }
          },
          {
            "nested": {
              "path": "metadata",
              "query": {
                "range": {
                  "metadata.pubmed_id": {
                    "gte": {{ range_gte | default(100) }},
                    "lte": {{ range_lte | default(10000000) }}
                  }
                }
              }
            }
          }
        ]
      }
    }
  }
}
12 changes: 12 additions & 0 deletions trec_covid_semantic_search/params/params.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{
  "bulk_indexing_clients": 2,
  "bulk_size": 100,
  "number_of_replicas": 1,
  "number_of_shards": 8,
  "ingest_percentage": 100,
  "search_clients": 8,
  "warmup_iterations": 20,
  "iterations": 100,
  "variable_queries": 50,
  "k": 100
}
Loading

0 comments on commit 43bc7ff

Please sign in to comment.