From 8a1c7fc338984d4513dd2abd87bda076ac149c04 Mon Sep 17 00:00:00 2001 From: Tiago Ribeiro Date: Thu, 31 Oct 2024 12:35:49 +0100 Subject: [PATCH 1/2] Added support for semantic searches based on strings - Fixed an issue when running the MoreLikeThisQuery - Added a method to query the semantic_embedding API with a string. - run_knn_topic_inferance -> run_knn_similarity_search, to better reflect the purpose of the script. --- ...erance.py => run_knn_similarity_search.py} | 45 +++++++++++++++---- zeeguu/core/elastic/elastic_query_builder.py | 33 +++++++++++++- zeeguu/core/semantic_search/__init__.py | 1 + .../elastic_semantic_search.py | 30 ++++++++++++- zeeguu/core/semantic_vector_api/__init__.py | 6 ++- .../retrieve_embeddings.py | 11 +++++ 6 files changed, 115 insertions(+), 11 deletions(-) rename tools/{run_knn_topic_inferance.py => run_knn_similarity_search.py} (73%) diff --git a/tools/run_knn_topic_inferance.py b/tools/run_knn_similarity_search.py similarity index 73% rename from tools/run_knn_topic_inferance.py rename to tools/run_knn_similarity_search.py index bbe3bf20..1331d863 100644 --- a/tools/run_knn_topic_inferance.py +++ b/tools/run_knn_similarity_search.py @@ -2,15 +2,13 @@ articles_like_this_semantic, add_topics_based_on_semantic_hood_search, articles_like_this_tfidf, + find_articles_based_on_text, ) from zeeguu.core.model.article import Article -from zeeguu.core.model.language import Language - -from zeeguu.core.elastic.settings import ES_CONN_STRING, ES_ZINDEX +from zeeguu.core.elastic.settings import ES_CONN_STRING from elasticsearch import Elasticsearch from collections import Counter -from zeeguu.core.elastic.elastic_query_builder import build_elastic_recommender_query from zeeguu.api.app import create_app import argparse @@ -24,7 +22,8 @@ parser = argparse.ArgumentParser( description="Utilizes the various similar document queries in ES, to analyze the results." ) -parser.add_argument("article_id", type=int, help="article id to search with") +parser.add_argument("-a", "--article_id", type=int, help="article id to search with") +parser.add_argument("-k", "--keyword", type=str, help="keyword to search with") def search_similar_to_article(article_id): @@ -82,10 +81,10 @@ def search_similar_to_article(article_id): neighbouring_topics = [t.new_topic for a in a_found_t for t in a.new_topics] TOPICS_TO_NOT_COUNT = set(["news", "aktuell", "nyheder", "nieuws", "article"]) neighbouring_keywords = [ - t.url_keywords + t.url_keyword for a in a_found_t for t in a.url_keywords - if t.url_keywords.keyword not in TOPICS_TO_NOT_COUNT + if t.url_keyword.keyword not in TOPICS_TO_NOT_COUNT ] print() @@ -105,7 +104,37 @@ def search_similar_to_article(article_id): print(a_found[0].content[:100]) +def search_similar_to_keyword(keyword): + app = create_app() + app.app_context().push() + + es = Elasticsearch(ES_CONN_STRING) + + a_found, hits = find_articles_based_on_text(keyword) + print("------------------------------------------------") + + print("Keyword Searched: ", keyword) + print() + print("Similar articles:") + for hit in hits: + print( + hit["_id"], + hit["_source"]["old_topics"], + hit["_source"]["language"], + f"New Topics: {hit['_source']['topics']}", + hit["_source"].get("url_keywords", []), + hit["_source"].get("url", ""), + hit["_score"], + ) + print("Article list: ") + print(a_found) + + if __name__ == "__main__": args = parser.parse_args() article_id = args.article_id - search_similar_to_article(article_id) + keyword = args.keyword + if article_id: + search_similar_to_article(article_id) + if keyword: + search_similar_to_keyword(keyword) diff --git a/zeeguu/core/elastic/elastic_query_builder.py b/zeeguu/core/elastic/elastic_query_builder.py index 225da517..820252ce 100644 --- a/zeeguu/core/elastic/elastic_query_builder.py +++ b/zeeguu/core/elastic/elastic_query_builder.py @@ -38,7 +38,7 @@ def more_like_this_query(count, article_text, language, page=0): .filter("term", language=language.name.lower()) ) - return {"from": page * count, "size": count, "query": s.to_dict()} + return {"from": page * count, "size": count, "query": s.to_dict()["query"]} def build_elastic_recommender_query( @@ -326,6 +326,37 @@ def build_elastic_semantic_sim_query( return query +def build_elastic_semantic_sim_query_for_text( + count, + text_embedding, + n_candidates=100, + language=None, +): + """ + Similar to build_elastic_semantic_sim_query, but taking a text embedding + """ + s = Search() + # s = s.exclude("match", id=article.id) + if language: + s = s.knn( + field="sem_vec", + k=count, + num_candidates=n_candidates, + query_vector=text_embedding, + filter=(Q("match", language__keyword=language.name)), + ) + else: + s = s.knn( + field="sem_vec", + k=count, + num_candidates=n_candidates, + query_vector=text_embedding, + ) + + query = s.to_dict() + return query + + def build_elastic_semantic_sim_query_for_topic_cls( k_count, article, diff --git a/zeeguu/core/semantic_search/__init__.py b/zeeguu/core/semantic_search/__init__.py index b32059f6..bc8dce4d 100644 --- a/zeeguu/core/semantic_search/__init__.py +++ b/zeeguu/core/semantic_search/__init__.py @@ -2,4 +2,5 @@ articles_like_this_semantic, add_topics_based_on_semantic_hood_search, articles_like_this_tfidf, + find_articles_based_on_text, ) diff --git a/zeeguu/core/semantic_search/elastic_semantic_search.py b/zeeguu/core/semantic_search/elastic_semantic_search.py index e03c2be0..ec6c0891 100644 --- a/zeeguu/core/semantic_search/elastic_semantic_search.py +++ b/zeeguu/core/semantic_search/elastic_semantic_search.py @@ -8,11 +8,15 @@ from zeeguu.core.elastic.elastic_query_builder import ( build_elastic_semantic_sim_query, build_elastic_semantic_sim_query_for_topic_cls, + build_elastic_semantic_sim_query_for_text, more_like_this_query, ) from zeeguu.core.util.timer_logging_decorator import time_this from zeeguu.core.elastic.settings import ES_CONN_STRING, ES_ZINDEX -from zeeguu.core.semantic_vector_api import get_embedding_from_article +from zeeguu.core.semantic_vector_api import ( + get_embedding_from_article, + get_embedding_from_text, +) @time_this @@ -86,6 +90,30 @@ def add_topics_based_on_semantic_hood_search( return [], [] +@time_this +def find_articles_based_on_text(text, k: int = 9): # hood = (slang) neighborhood + query_body = build_elastic_semantic_sim_query_for_text( + k, get_embedding_from_text(text) + ) + final_article_mix = [] + + try: + es = Elasticsearch(ES_CONN_STRING) + res = es.search(index=ES_ZINDEX, body=query_body) + + hit_list = res["hits"].get("hits") + final_article_mix.extend(_to_articles_from_ES_hits(hit_list)) + + return [ + a for a in final_article_mix if a is not None and not a.broken + ], hit_list + except ConnectionError: + print("Could not connect to ES server.") + except Exception as e: + print(f"Error encountered: {e}") + return [], [] + + def _to_articles_from_ES_hits(hits): articles = [] for hit in hits: diff --git a/zeeguu/core/semantic_vector_api/__init__.py b/zeeguu/core/semantic_vector_api/__init__.py index 40ce9e01..8966959a 100644 --- a/zeeguu/core/semantic_vector_api/__init__.py +++ b/zeeguu/core/semantic_vector_api/__init__.py @@ -1 +1,5 @@ -from .retrieve_embeddings import get_embedding_from_article, EMB_API_CONN_STRING \ No newline at end of file +from .retrieve_embeddings import ( + get_embedding_from_article, + get_embedding_from_text, + EMB_API_CONN_STRING, +) diff --git a/zeeguu/core/semantic_vector_api/retrieve_embeddings.py b/zeeguu/core/semantic_vector_api/retrieve_embeddings.py index d28db527..7e7f31b7 100644 --- a/zeeguu/core/semantic_vector_api/retrieve_embeddings.py +++ b/zeeguu/core/semantic_vector_api/retrieve_embeddings.py @@ -6,6 +6,7 @@ "ZEEGUU_EMB_API_CONN_STRING", "http://127.0.0.1:8000" ) + def get_embedding_from_article(a: Article): r = requests.post( url=f"{EMB_API_CONN_STRING}/get_article_embedding", @@ -15,3 +16,13 @@ def get_embedding_from_article(a: Article): }, ) return r.json() + + +def get_embedding_from_text(text: str, language: str = None): + data = { + "article_content": text, + } + if language: + data["article_language"] = language + r = requests.post(url=f"{EMB_API_CONN_STRING}/get_article_embedding", json=data) + return r.json() From b7ae60c172f2b1e853ce8ed852ef221634780d66 Mon Sep 17 00:00:00 2001 From: Tiago Ribeiro Date: Mon, 25 Nov 2024 09:22:59 +0100 Subject: [PATCH 2/2] Update run_knn_similarity_search.py - Improved printing format --- tools/run_knn_similarity_search.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/tools/run_knn_similarity_search.py b/tools/run_knn_similarity_search.py index fac1dfc4..4c990347 100644 --- a/tools/run_knn_similarity_search.py +++ b/tools/run_knn_similarity_search.py @@ -90,8 +90,6 @@ def search_similar_to_keyword(keyword): app = create_app() app.app_context().push() - es = Elasticsearch(ES_CONN_STRING) - a_found, hits = find_articles_based_on_text(keyword) print("------------------------------------------------") @@ -100,13 +98,7 @@ def search_similar_to_keyword(keyword): print("Similar articles:") for hit in hits: print( - hit["_id"], - hit["_source"]["old_topics"], - hit["_source"]["language"], - f"New Topics: {hit['_source']['topics']}", - hit["_source"].get("url_keywords", []), - hit["_source"].get("url", ""), - hit["_score"], + f"{hit["_id"]} {hit["_score"]:.4f} {hit["_source"]["language"]}, Topics: {hit['_source']['topics']} {hit["_source"].get("url_keywords", [])} {hit["_source"].get("url", "")}" ) print("Article list: ") print(a_found) @@ -120,3 +112,5 @@ def search_similar_to_keyword(keyword): search_similar_to_article(article_id) if keyword: search_similar_to_keyword(keyword) + if not keyword and not article_id: + parser.print_help()