Skip to content

Commit

Permalink
fix no headings case (#211)
Browse files Browse the repository at this point in the history
* fix no headings case

* fix no headings case
  • Loading branch information
Ceceliachenen authored Sep 11, 2024
1 parent c146ae8 commit 2f288c1
Showing 1 changed file with 24 additions and 11 deletions.
35 changes: 24 additions & 11 deletions src/pai_rag/integrations/readers/pai_pdf_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,20 +130,33 @@ def combine_images_with_text(markdown_text):

output = {}

for i in range(1, len(sections), 3):
title_level = sections[i]
title_text = sections[i + 1]
content = sections[i + 2] if i + 2 < len(sections) else ""
# Case: the document has no headings
if len(sections) == 1:
content = sections[0]
content_without_images_url = PaiPDFReader.remove_image_paths(content)

url_pattern = IMAGE_URL_PATTERN
images = re.findall(url_pattern, content)
if title_level:
images_url_list = [image[0] for image in images if len(image[0]) > 0]
if len(images_url_list) > 0:
output[
f"{title_level} {title_text}\n\n{content_without_images_url.strip()}"
] = images_url_list
images_url_list = [image[0] for image in images if len(image[0]) > 0]
if len(images_url_list) > 0:
output[f"{content_without_images_url.strip()}"] = images_url_list
# Case: the document has headings
else:
for i in range(1, len(sections), 3):
title_level = sections[i]
title_text = sections[i + 1]
content = sections[i + 2] if i + 2 < len(sections) else ""
content_without_images_url = PaiPDFReader.remove_image_paths(content)

url_pattern = IMAGE_URL_PATTERN
images = re.findall(url_pattern, content)
if title_level:
images_url_list = [
image[0] for image in images if len(image[0]) > 0
]
if len(images_url_list) > 0:
output[
f"{title_level} {title_text}\n\n{content_without_images_url.strip()}"
] = images_url_list
return output

@staticmethod
Expand Down

0 comments on commit 2f288c1

Please sign in to comment.