From 7c869db982a6753927b15126e68291510a5e8fc1 Mon Sep 17 00:00:00 2001
From: Jakob Niermann
Date: Sun, 11 Dec 2022 01:13:23 +0100
Subject: [PATCH 1/2] fix: flush cache to reduce memory consumption

---
Review note: the added calls were originally spelled `flush_cashe()`, which
is not a pdfplumber method and would raise AttributeError; they are spelled
`flush_cache()` here. The blob ids on the `index` line were not regenerated
after this correction.

 src/layoutparser/io/pdf.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/layoutparser/io/pdf.py b/src/layoutparser/io/pdf.py
index ad35cf4..4763c8e 100644
--- a/src/layoutparser/io/pdf.py
+++ b/src/layoutparser/io/pdf.py
@@ -78,6 +78,8 @@ def extract_words_for_page(
         block_type="rectangle",
     )
 
+    page.flush_cache()
+
     return page_tokens
 
 
@@ -202,6 +204,8 @@ def load_pdf(
 
         all_page_layout.append(page_tokens)
 
+    plumber_pdf_object.flush_cache()
+
     if not load_images:
         return all_page_layout
     else:

From 80f26512170ddea7615aa4a7030a73cc19b6bcfb Mon Sep 17 00:00:00 2001
From: Jakob Niermann
Date: Sun, 11 Dec 2022 01:15:38 +0100
Subject: [PATCH 2/2] fix: use as context manager so that pdf stream is closed afterwards

---
Review note: the trailing context line uses the corrected `flush_cache()`
spelling so that this patch applies on top of PATCH 1/2 as corrected. The
blob ids on the `index` line were not regenerated.

 src/layoutparser/io/pdf.py | 42 ++++++++++++++++++++------------------
 1 file changed, 22 insertions(+), 20 deletions(-)

diff --git a/src/layoutparser/io/pdf.py b/src/layoutparser/io/pdf.py
index 4763c8e..420e516 100644
--- a/src/layoutparser/io/pdf.py
+++ b/src/layoutparser/io/pdf.py
@@ -183,26 +183,28 @@ def load_pdf(
     plumber_pdf_object = pdfplumber.open(filename)
 
     all_page_layout = []
-    for page_id in range(len(plumber_pdf_object.pages)):
-        cur_page = plumber_pdf_object.pages[page_id]
-
-        page_tokens = extract_words_for_page(
-            cur_page,
-            x_tolerance=x_tolerance,
-            y_tolerance=y_tolerance,
-            keep_blank_chars=keep_blank_chars,
-            use_text_flow=use_text_flow,
-            horizontal_ltr=horizontal_ltr,
-            vertical_ttb=vertical_ttb,
-            extra_attrs=extra_attrs,
-        )
-
-        # Adding metadata for the current page
-        page_tokens.page_data["width"] = float(cur_page.width)
-        page_tokens.page_data["height"] = float(cur_page.height)
-        page_tokens.page_data["index"] = page_id
-
-        all_page_layout.append(page_tokens)
+
+    with plumber_pdf_object:
+        for page_id in range(len(plumber_pdf_object.pages)):
+            cur_page = plumber_pdf_object.pages[page_id]
+
+            page_tokens = extract_words_for_page(
+                cur_page,
+                x_tolerance=x_tolerance,
+                y_tolerance=y_tolerance,
+                keep_blank_chars=keep_blank_chars,
+                use_text_flow=use_text_flow,
+                horizontal_ltr=horizontal_ltr,
+                vertical_ttb=vertical_ttb,
+                extra_attrs=extra_attrs,
+            )
+
+            # Adding metadata for the current page
+            page_tokens.page_data["width"] = float(cur_page.width)
+            page_tokens.page_data["height"] = float(cur_page.height)
+            page_tokens.page_data["index"] = page_id
+
+            all_page_layout.append(page_tokens)
 
     plumber_pdf_object.flush_cache()
 