From 2f288c1915aae171aebe526fc390589956c32c17 Mon Sep 17 00:00:00 2001 From: Ceceliachenen Date: Wed, 11 Sep 2024 17:37:26 +0800 Subject: [PATCH] fix no headings case (#211) * fix no headings case * fix no headings case --- .../integrations/readers/pai_pdf_reader.py | 35 +++++++++++++------ 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/src/pai_rag/integrations/readers/pai_pdf_reader.py b/src/pai_rag/integrations/readers/pai_pdf_reader.py index 9436bce9..86cec1f0 100644 --- a/src/pai_rag/integrations/readers/pai_pdf_reader.py +++ b/src/pai_rag/integrations/readers/pai_pdf_reader.py @@ -130,20 +130,33 @@ def combine_images_with_text(markdown_text): output = {} - for i in range(1, len(sections), 3): - title_level = sections[i] - title_text = sections[i + 1] - content = sections[i + 2] if i + 2 < len(sections) else "" + # 没有标题的情况 + if len(sections) == 1: + content = sections[0] content_without_images_url = PaiPDFReader.remove_image_paths(content) - url_pattern = IMAGE_URL_PATTERN images = re.findall(url_pattern, content) - if title_level: - images_url_list = [image[0] for image in images if len(image[0]) > 0] - if len(images_url_list) > 0: - output[ - f"{title_level} {title_text}\n\n{content_without_images_url.strip()}" - ] = images_url_list + images_url_list = [image[0] for image in images if len(image[0]) > 0] + if len(images_url_list) > 0: + output[f"{content_without_images_url.strip()}"] = images_url_list + # 有标题的情况 + else: + for i in range(1, len(sections), 3): + title_level = sections[i] + title_text = sections[i + 1] + content = sections[i + 2] if i + 2 < len(sections) else "" + content_without_images_url = PaiPDFReader.remove_image_paths(content) + + url_pattern = IMAGE_URL_PATTERN + images = re.findall(url_pattern, content) + if title_level: + images_url_list = [ + image[0] for image in images if len(image[0]) > 0 + ] + if len(images_url_list) > 0: + output[ + f"{title_level} {title_text}\n\n{content_without_images_url.strip()}" + ] = images_url_list return output @staticmethod