From 244e18f8b2f57ce44d2bfd59f2d25487e61db455 Mon Sep 17 00:00:00 2001 From: Kasper Fyhn Date: Wed, 27 Nov 2024 13:59:45 +0100 Subject: [PATCH] fixing bug in SafeFastCoref which led to empty texts --- .../docprocessing/coref/safefastcoref.py | 26 +++++++++++-------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/src/conspiracies/docprocessing/coref/safefastcoref.py b/src/conspiracies/docprocessing/coref/safefastcoref.py index 27348e7..993aada 100644 --- a/src/conspiracies/docprocessing/coref/safefastcoref.py +++ b/src/conspiracies/docprocessing/coref/safefastcoref.py @@ -5,6 +5,8 @@ import logging +from spacy.util import minibatch + logging.getLogger("fastcoref").setLevel(logging.WARNING) @@ -15,17 +17,19 @@ def __init__(self, component: FastCorefResolver): def pipe(self, stream: Iterable, batch_size: int = 128): """Wrap the pipe method of the component.""" - try: - yield from self.component.pipe( - stream, - batch_size=batch_size, - resolve_text=True, - ) - except Exception as e: - # Log the error and return the unprocessed documents - logging.error(f"Error in SafeFastCoref pipe: {e}") - for doc in stream: - yield doc # Return the original document + for mb in minibatch(stream, size=batch_size): + try: + yield from self.component.pipe( + mb, + batch_size=batch_size, + resolve_text=True, + ) + except Exception as e: + # Log the error and return the unprocessed documents + logging.error(f"Error in SafeFastCoref pipe: {e}") + for doc in mb: + doc._.resolved_text = doc.text + yield doc # Return the original document def __call__(self, doc): """Wrap the __call__ method of the component."""