-
Notifications
You must be signed in to change notification settings - Fork 0
Elasticsearch Tips
Some handy snippets for Elasticsearch:
GET /gingerbread/_search
GET /gingerbread,gingerbread-v2/_search
GET /gingerbread/posts/_search
{
"query": {
"match": {
"body": "hello"
}
}
}
GET /gingerbread/posts/_search
{
"query": {
"multi_match": {
"query": "hello",
"fields": [ "body", "subject" ]
}
}
}
GET /gingerbread/posts/_search
{
"query": {
"match_phrase": {
"body": "hello world"
}
}
}
Highlighting returns the matched phrase or word wrapped in <em></em> tags
GET /gingerbread/posts/_search
{
"query": {
"match_phrase": {
"body": "hello world"
}
},
"highlight": {
"fields": { "body": {} }
}
}
GET /gingerbread/posts/_search
{
"query": {
"bool": {
"must": {
"match": {
"body": "hello"
}
},
"filter": {
"range": {
"date": { "gte": "2017-01-01" }
}
}
}
}
}
sorts by most recent first rather than by relevance score
GET gingerbread/posts/_search
{
"query": {
"match": {
"body": "hello"
}
},
"sort": {
"date": { "order": "desc" }
}
}
Orders by date first and, if the date fields are the same, then orders by _score
GET gingerbread/posts/_search
{
"query": {
"match": {
"body": "hello"
}
},
"sort": [
{ "date": { "order": "desc" } },
{ "_score": { "order": "desc" } }
]
}
A document must match at least the given percentage of the words in the query to be considered relevant
With three query words, 75% gets rounded down to 66.6%, so two of the three words must appear in the body
GET gingerbread/posts/_search
{
"query": {
"match": {
"body": {
"query": "coping hard difficult",
"minimum_should_match": "75%"
}
}
}
}
if should is the only clause, specifies a minimum number of sub clauses that need to match
GET gingerbread/posts/_search
{
"query": {
"bool": {
"should": [
{ "match": { "title": "difficult" } },
{ "match": { "body": "coping" } },
{ "match": { "body": "tough" } }
],
"minimum_should_match": 2
}
}
}
GET gingerbread/_search
{
"query": {
"bool": {
"must": {
"match": {
"body": {
"query": "hello there",
"operator": "and"
}
}
},
"should": {
"match": {
"body":{
"query": "tough",
"boost": 3
}
}
}
}
}
}
Kind of like a should with OR, but each clause is scored individually and the score of the best-matching clause is used (rather than combining the relevance for hello in body and there in title)
GET gingerbread/posts/_search
{
"query": {
"dis_max": {
"queries": [
{ "match": { "body": "hello there" } },
{ "match": { "title": "hello there" } }
]
}
}
}
Same as above, but rather than just taking the score of the best-matching clause, it multiplies the scores of the other matching clauses by the tie_breaker value and adds them all together
tie_breaker must be a floating-point number between 0 and 1
GET gingerbread/posts/_search
{
"query": {
"dis_max": {
"queries": [
{ "match": { "body": "hello there" } },
{ "match": { "title": "hello there" } }
],
"tie_breaker": 0.3
}
}
}
The above query could be re-written in multi match
GET gingerbread/posts/_search
{
"query": {
"multi_match": {
"query": "hello there",
"type": "best_fields",
"fields": ["title", "body"],
"tie_breaker": 0.3,
"minimum_should_match": "30%"
}
}
}
You can set up sub fields on each field that can be analyzed differently
e.g. you could have an english analyzer on title and a standard analyzer on title.std
PUT gingerbread
{
"mappings": {
"posts": {
"properties": {
"title": {
"type": "text",
"analyzer": "english",
"fields": {
"std": {
"type": "text",
"analyzer": "standard"
}
}
}
}
}
}
}
These can then be used to boost the relevance of exact searches whilst still including normalised words in the english analyzer e.g.:
GET gingerbread/posts/_search
{
"query": {
"multi_match": {
"query": "jumping rabbits",
"type": "most_fields",
"fields": ["title", "title.std"]
}
}
}
Will score "jumping rabbits" higher than "jump rabbit"
Adding a ^2 to a field (e.g. "title.std^2") will boost its relevance relative to other fields
To match specifically on multiple fields (say something like a person's full name, address and phone number), often the best solution is to add a custom field
Use copy_to in the mappings to achieve this
PUT gingerbread
{
"mappings": {
"posts": {
"properties": {
"first_name": {
"type": "text",
"copy_to": "full_name"
},
"last_name": {
"type": "text",
"copy_to": "full_name"
},
"full_name": {
"type": "text"
}
}
}
}
}
This can also be done at the query level using cross_fields
GET gingerbread/posts/_search
{
"query": {
"multi_match": {
"query": "gingerbread poppy",
"type": "cross_fields",
"operator": "and",
"fields": ["first_name", "last_name"]
}
}
}
The slop parameter tells how far apart terms are allowed to be whilst still considering the document a match
GET gingerbread/posts/_search
{
"query": {
"match_phrase": {
"body": {
"query": "just too much",
"slop": 5
}
}
}
}
GET gingerbread/posts/_search
{
"query": {
"match_phrase_prefix": {
"body": "just too much"
}
}
}
However it's more performant to do this at index time if possible (increases size of index but faster queries)
PUT gingerbread
{
"settings": {
"number_of_shards": 1,
"analysis": {
"filter": {
"autocomplete_filter": {
"type": "edge_ngram",
"min_gram": 1,
"max_gram": 20
}
},
"analyzer": {
"autocomplete": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"lowercase",
"autocomplete_filter"
]
}
}
}
}
}
completion suggester is also a more efficient option - https://www.elastic.co/guide/en/elasticsearch/reference/2.4/search-suggesters-completion.html
e.g. unique users
GET /gingerbread/posts/_search
{
"aggs": {
"unique_users": {
"terms": { "field": "username" }
}
}
}
returns users in buckets like:
"aggregations": {
"unique_users": {
"doc_count_error_upper_bound": 81,
"sum_other_doc_count": 21953,
"buckets": [
{
"key": "maisie-:o)",
"doc_count": 383
},
{
"key": "daffodils-and-sheep",
"doc_count": 314
},
{
"key": "louisemc",
"doc_count": 239
},
{
"key": "pocoyo",
"doc_count": 196
},
{
"key": "andyin",
"doc_count": 186
},
{
"key": "markthecarpenter",
"doc_count": 170
},
{
"key": "gingerbread-sitara",
"doc_count": 163
},
{
"key": "lovejoy72",
"doc_count": 153
},
{
"key": "titch!",
"doc_count": 150
},
{
"key": "gingerbreadsuzanne",
"doc_count": 136
}
]
}
}
make the aggregation more specific
GET /gingerbread/posts/_search
{
"query": {
"match": {
"visible": true
}
},
"aggs": {
"unique_users": {
"terms": { "field": "username" }
}
}
}
e.g. divide the forum up into years, get unique users at that time
GET /gingerbread/posts/_search
{
"aggs": {
"forum_time_period": {
"date_histogram": {
"field": "date",
"interval": "year"
},
"aggs": {
"users_at_time": {
"terms": { "field": "username" }
}
}
}
}
}