From 5ce08e102750301df04e08803bc8d7ba3ed816c7 Mon Sep 17 00:00:00 2001 From: Daniel O'Connell Date: Tue, 12 Sep 2023 05:33:57 +0200 Subject: [PATCH] Properly batch items to be removed --- align_data/db/models.py | 1 - align_data/embeddings/pinecone/update_pinecone.py | 8 +++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/align_data/db/models.py b/align_data/db/models.py index e79da23..ce7e41d 100644 --- a/align_data/db/models.py +++ b/align_data/db/models.py @@ -14,7 +14,6 @@ Enum, ForeignKey, String, - Boolean, Text, func, event, diff --git a/align_data/embeddings/pinecone/update_pinecone.py b/align_data/embeddings/pinecone/update_pinecone.py index 4e6ac03..6fa1064 100644 --- a/align_data/embeddings/pinecone/update_pinecone.py +++ b/align_data/embeddings/pinecone/update_pinecone.py @@ -76,7 +76,8 @@ def save_batch(self, session: Session, batch: List[Any]): session.rollback() def batch_entries(self, article_stream: Generator[Article, None, None]) -> Iterator[List[Article]]: - while batch := tuple(islice(article_stream, self.batch_size)): + items = iter(article_stream) + while batch := tuple(islice(items, self.batch_size)): yield list(batch) @@ -104,7 +105,8 @@ def process_batch(self, batch: List[Tuple[Article, PineconeEntry | None]]): def batch_entries( self, article_stream: Generator[Article, None, None] ) -> Iterator[List[Tuple[Article, PineconeEntry | None]]]: - while batch := tuple(islice(article_stream, self.batch_size)): + items = iter(article_stream) + while batch := tuple(islice(items, self.batch_size)): yield [(article, self._make_pinecone_entry(article)) for article in batch] def _make_pinecone_entry(self, article: Article) -> PineconeEntry | None: @@ -165,7 +167,7 @@ def _articles_by_id(self, session, ids: List[str], _force_update: bool): def process_batch(self, batch: List[Article]): self.pinecone_db.delete_entries([a.id for a in batch]) - logger.info('removing batch %s', len(batch)) + logger.info('removing batch of %s items', len(batch)) for article in batch: article.pinecone_status = PineconeStatus.absent return batch