From 21d7887accdcd50530b00ae110a44a2aad306e2f Mon Sep 17 00:00:00 2001 From: Daniel O'Connell Date: Thu, 14 Sep 2023 05:33:00 +0200 Subject: [PATCH] Properly batch items to be removed (#188) --- align_data/db/models.py | 1 - align_data/embeddings/pinecone/update_pinecone.py | 8 +++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/align_data/db/models.py b/align_data/db/models.py index d67ebfd..79090e1 100644 --- a/align_data/db/models.py +++ b/align_data/db/models.py @@ -14,7 +14,6 @@ Enum, ForeignKey, String, - Boolean, Text, func, event, diff --git a/align_data/embeddings/pinecone/update_pinecone.py b/align_data/embeddings/pinecone/update_pinecone.py index 92fa726..f1a06f3 100644 --- a/align_data/embeddings/pinecone/update_pinecone.py +++ b/align_data/embeddings/pinecone/update_pinecone.py @@ -74,7 +74,8 @@ def save_batch(self, session: Session, batch: List[Any]): session.rollback() def batch_entries(self, article_stream: Generator[Article, None, None]) -> Iterator[List[Article]]: - while batch := tuple(islice(article_stream, self.batch_size)): + items = iter(article_stream) + while batch := tuple(islice(items, self.batch_size)): yield list(batch) @@ -102,7 +103,8 @@ def process_batch(self, batch: List[Tuple[Article, PineconeEntry | None]]): def batch_entries( self, article_stream: Generator[Article, None, None] ) -> Iterator[List[Tuple[Article, PineconeEntry | None]]]: - while batch := tuple(islice(article_stream, self.batch_size)): + items = iter(article_stream) + while batch := tuple(islice(items, self.batch_size)): yield [(article, self._make_pinecone_entry(article)) for article in batch] def _make_pinecone_entry(self, article: Article) -> PineconeEntry | None: @@ -163,7 +165,7 @@ def _articles_by_id(self, session, ids: List[str], _force_update: bool): def process_batch(self, batch: List[Article]): self.pinecone_db.delete_entries([a.id for a in batch]) - logger.info('removing batch %s', len(batch)) + logger.info('removing batch of %s items', len(batch)) for article in batch: article.pinecone_status = PineconeStatus.absent return batch