From 300a5081233b283b2d94af9cd9379e29e6f3d87d Mon Sep 17 00:00:00 2001
From: Joao Rechena
Date: Sat, 13 Jun 2020 12:45:34 +0100
Subject: [PATCH] Changing the index to be a var

---
 deduplicate-elaticsearch.py | 31 +++++++++++++++++--------------
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/deduplicate-elaticsearch.py b/deduplicate-elaticsearch.py
index 566ff24..b70fc07 100644
--- a/deduplicate-elaticsearch.py
+++ b/deduplicate-elaticsearch.py
@@ -1,6 +1,6 @@
 #!/usr/local/bin/python3
 
-# A description and analysis of this code can be found at 
+# A description and analysis of this code can be found at
 # https://alexmarquardt.com/2018/07/23/deduplicating-documents-in-elasticsearch/
 
 import hashlib
@@ -13,6 +13,8 @@
 es = Elasticsearch([ES_HOST], http_auth=(ES_USER, ES_PASSWORD))
 dict_of_duplicate_docs = {}
 
+index_to_search = "stocks"
+
 # The following line defines the fields that will be
 # used to determine if a document is a duplicate
 keys_to_include_in_hash = ["CAC", "FTSE", "SMI"]
@@ -41,7 +43,7 @@ def populate_dict_of_duplicate_docs(hit):
 # Loop over all documents in the index, and populate the
 # dict_of_duplicate_docs data structure.
 def scroll_over_all_docs():
-    for hit in helpers.scan(es, index='stocks'):
+    for hit in helpers.scan(es, index=index_to_search):
         populate_dict_of_duplicate_docs(hit)
 
 
@@ -49,18 +51,19 @@ def loop_over_hashes_and_remove_duplicates():
     # Search through the hash of doc values to see if any
     # duplicate hashes have been found
     for hashval, array_of_ids in dict_of_duplicate_docs.items():
-        if len(array_of_ids) > 1:
-          print("********** Duplicate docs hash=%s **********" % hashval)
-          # Get the documents that have mapped to the current hasval
-          matching_docs = es.mget(index="stocks", doc_type="doc", body={"ids": array_of_ids})
-          for doc in matching_docs['docs']:
-              # In order to remove the possibility of hash collisions,
-              # write code here to check all fields in the docs to
-              # see if they are truly identical - if so, then execute a
-              # DELETE operation on all except one.
-              # In this example, we just print the docs.
-              print("doc=%s\n" % doc)
-
+        if len(array_of_ids) > 1:
+            print("********** Duplicate docs hash=%s **********" % hashval)
+            # Get the documents that have mapped to the current hashval
+            matching_docs = es.mget(
+                index=index_to_search,
+                doc_type="doc", body={"ids": array_of_ids})
+            for doc in matching_docs['docs']:
+                # In order to remove the possibility of hash collisions,
+                # write code here to check all fields in the docs to
+                # see if they are truly identical - if so, then execute a
+                # DELETE operation on all except one.
+                # In this example, we just print the docs.
+                print("doc=%s\n" % doc)
 
 
 def main():
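
A possible follow-up to this change: now that the index name lives in a
single variable, it could be supplied at run time instead of being
hard-coded. Below is a minimal standalone sketch of that pattern using
argparse; the --index flag, its "stocks" default, the host, and the
credentials are illustrative assumptions, not part of this patch or the
original script.

    #!/usr/local/bin/python3
    # Sketch (hypothetical): read the index name from the command line
    # rather than hard-coding it, then scan that index the same way the
    # script's scroll_over_all_docs() does.
    import argparse

    from elasticsearch import Elasticsearch, helpers

    parser = argparse.ArgumentParser(
        description="Scan an Elasticsearch index for deduplication")
    # The default mirrors the "stocks" value used in the patch (assumption)
    parser.add_argument("--index", default="stocks",
                        help="name of the index to scan")
    args = parser.parse_args()

    index_to_search = args.index

    # Placeholder host and credentials; substitute the real ES_HOST,
    # ES_USER, and ES_PASSWORD values for your cluster
    es = Elasticsearch(["localhost:9200"], http_auth=("elastic", "elastic"))

    # Same helpers.scan() loop as the patched script, now driven by the
    # command-line argument instead of a constant in the source
    for hit in helpers.scan(es, index=index_to_search):
        print(hit["_id"])

Invoked as, e.g.:

    python3 sketch.py --index stocks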