Skip to content

Commit

Permalink
From vector DB, we often get dupes, which means we end up returning (#210
Browse files Browse the repository at this point in the history
)

fewer than top_k elements. So, fetch top_k=(2 * limit) and limit to
just (limit)
  • Loading branch information
cybermaggedon authored Dec 10, 2024
1 parent cd8d0c8 commit 07f9b1f
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,8 @@ def handle(self, msg):

print(f"Handling input {id}...", flush=True)

entities = set()
entity_set = set()
entities = []

for vec in v.vectors:

Expand All @@ -85,20 +86,30 @@ def handle(self, msg):

index = self.pinecone.Index(index_name)

# Heuristic hack, get (2*limit), so that we have more chance
# of getting (limit) entities
results = index.query(
namespace=v.collection,
vector=vec,
top_k=v.limit,
top_k=v.limit * 2,
include_values=False,
include_metadata=True
)

for r in results.matches:

ent = r.metadata["entity"]
entities.add(ent)

# Convert set to list
entities = list(entities)
# De-dupe entities
if ent not in entity_set:
entity_set.add(ent)
entities.append(ent)

# Keep adding entities until limit
if len(entity_set) >= v.limit: break

# Keep adding entities until limit
if len(entity_set) >= v.limit: break

ents2 = []

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,8 @@ def handle(self, msg):

print(f"Handling input {id}...", flush=True)

entities = set()
entity_set = set()
entities = []

for vec in v.vectors:

Expand All @@ -71,19 +72,28 @@ def handle(self, msg):
str(dim)
)

# Heuristic hack, get (2*limit), so that we have more chance
# of getting (limit) entities
search_result = self.client.query_points(
collection_name=collection,
query=vec,
limit=v.limit,
limit=v.limit * 2,
with_payload=True,
).points

for r in search_result:
ent = r.payload["entity"]
entities.add(ent)

# Convert set to list
entities = list(entities)
# De-dupe entities
if ent not in entity_set:
entity_set.add(ent)
entities.append(ent)

# Keep adding entities until limit
if len(entity_set) >= v.limit: break

# Keep adding entities until limit
if len(entity_set) >= v.limit: break

ents2 = []

Expand Down

0 comments on commit 07f9b1f

Please sign in to comment.