Skip to content

Commit

Permalink
Merge pull request #306 from zeeguu/275-add-a-way-to-run-semantic-sea…
Browse files Browse the repository at this point in the history
…rch-on-text

Added a tool to run KNN searches based on keyword or article IDs
  • Loading branch information
mircealungu authored Nov 26, 2024
2 parents 4edfaeb + b7ae60c commit 7f9cad5
Show file tree
Hide file tree
Showing 6 changed files with 106 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@
articles_like_this_semantic,
add_topics_based_on_semantic_hood_search,
articles_like_this_tfidf,
find_articles_based_on_text,
)

from zeeguu.core.model.article import Article
from zeeguu.core.model.language import Language
from zeeguu.core.model.url_keyword import UrlKeyword

from zeeguu.core.elastic.settings import ES_CONN_STRING, ES_ZINDEX
from zeeguu.core.elastic.settings import ES_CONN_STRING
from elasticsearch import Elasticsearch
from collections import Counter

Expand All @@ -24,7 +24,8 @@
parser = argparse.ArgumentParser(
description="Utilizes the various similar document queries in ES, to analyze the results."
)
parser.add_argument("article_id", type=int, help="article id to search with")
parser.add_argument("-a", "--article_id", type=int, help="article id to search with")
parser.add_argument("-k", "--keyword", type=str, help="keyword to search with")


def search_similar_to_article(article_id):
Expand Down Expand Up @@ -85,7 +86,31 @@ def search_similar_to_article(article_id):
print(a_found[0].content[:100])


def search_similar_to_keyword(keyword):
app = create_app()
app.app_context().push()

a_found, hits = find_articles_based_on_text(keyword)
print("------------------------------------------------")

print("Keyword Searched: ", keyword)
print()
print("Similar articles:")
for hit in hits:
print(
f"{hit["_id"]} {hit["_score"]:.4f} {hit["_source"]["language"]}, Topics: {hit['_source']['topics']} {hit["_source"].get("url_keywords", [])} {hit["_source"].get("url", "")}"
)
print("Article list: ")
print(a_found)


if __name__ == "__main__":
args = parser.parse_args()
article_id = args.article_id
search_similar_to_article(article_id)
keyword = args.keyword
if article_id:
search_similar_to_article(article_id)
if keyword:
search_similar_to_keyword(keyword)
if not keyword and not article_id:
parser.print_help()
31 changes: 31 additions & 0 deletions zeeguu/core/elastic/elastic_query_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,37 @@ def build_elastic_semantic_sim_query(
return query


def build_elastic_semantic_sim_query_for_text(
count,
text_embedding,
n_candidates=100,
language=None,
):
"""
Similar to build_elastic_semantic_sim_query, but taking a text embedding
"""
s = Search()
# s = s.exclude("match", id=article.id)
if language:
s = s.knn(
field="sem_vec",
k=count,
num_candidates=n_candidates,
query_vector=text_embedding,
filter=(Q("match", language__keyword=language.name)),
)
else:
s = s.knn(
field="sem_vec",
k=count,
num_candidates=n_candidates,
query_vector=text_embedding,
)

query = s.to_dict()
return query


def build_elastic_semantic_sim_query_for_topic_cls(
k_count,
article,
Expand Down
1 change: 1 addition & 0 deletions zeeguu/core/semantic_search/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@
articles_like_this_semantic,
add_topics_based_on_semantic_hood_search,
articles_like_this_tfidf,
find_articles_based_on_text,
)
30 changes: 29 additions & 1 deletion zeeguu/core/semantic_search/elastic_semantic_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,15 @@
from zeeguu.core.elastic.elastic_query_builder import (
build_elastic_semantic_sim_query,
build_elastic_semantic_sim_query_for_topic_cls,
build_elastic_semantic_sim_query_for_text,
more_like_this_query,
)
from zeeguu.core.util.timer_logging_decorator import time_this
from zeeguu.core.elastic.settings import ES_CONN_STRING, ES_ZINDEX
from zeeguu.core.semantic_vector_api import get_embedding_from_article
from zeeguu.core.semantic_vector_api import (
get_embedding_from_article,
get_embedding_from_text,
)


@time_this
Expand Down Expand Up @@ -86,6 +90,30 @@ def add_topics_based_on_semantic_hood_search(
return [], []


@time_this
def find_articles_based_on_text(text, k: int = 9): # hood = (slang) neighborhood
query_body = build_elastic_semantic_sim_query_for_text(
k, get_embedding_from_text(text)
)
final_article_mix = []

try:
es = Elasticsearch(ES_CONN_STRING)
res = es.search(index=ES_ZINDEX, body=query_body)

hit_list = res["hits"].get("hits")
final_article_mix.extend(_to_articles_from_ES_hits(hit_list))

return [
a for a in final_article_mix if a is not None and not a.broken
], hit_list
except ConnectionError:
print("Could not connect to ES server.")
except Exception as e:
print(f"Error encountered: {e}")
return [], []


def _to_articles_from_ES_hits(hits):
articles = []
for hit in hits:
Expand Down
6 changes: 5 additions & 1 deletion zeeguu/core/semantic_vector_api/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,5 @@
from .retrieve_embeddings import get_embedding_from_article, EMB_API_CONN_STRING
from .retrieve_embeddings import (
get_embedding_from_article,
get_embedding_from_text,
EMB_API_CONN_STRING,
)
11 changes: 11 additions & 0 deletions zeeguu/core/semantic_vector_api/retrieve_embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
"ZEEGUU_EMB_API_CONN_STRING", "http://127.0.0.1:8000"
)


def get_embedding_from_article(a: Article):
r = requests.post(
url=f"{EMB_API_CONN_STRING}/get_article_embedding",
Expand All @@ -15,3 +16,13 @@ def get_embedding_from_article(a: Article):
},
)
return r.json()


def get_embedding_from_text(text: str, language: str = None):
data = {
"article_content": text,
}
if language:
data["article_language"] = language
r = requests.post(url=f"{EMB_API_CONN_STRING}/get_article_embedding", json=data)
return r.json()

0 comments on commit 7f9cad5

Please sign in to comment.