Commit

Make diff more minimal

pamelafox committed Nov 20, 2024
1 parent 4ef5622 commit 7a1c07f

Showing 3 changed files with 27 additions and 24 deletions.
5 changes: 3 additions & 2 deletions app/backend/prepdocslib/blobmanager.py
@@ -142,9 +142,10 @@ async def upload_pdf_blob_images(
                 file_md5 = file.metadata.get("md5")
                 blob_md5 = blob_metadata.get("md5")
                 if file_md5 == blob_md5:
-                    continue  # documemt already uploaded
+                    logger.info("Blob %s exists and md5 values match, skipping upload", blob_name)
+                    continue  # document already uploaded

-            logger.debug("Converting page %s to image and uploading -> %s", i, blob_name)
+            logger.info("Converting page %s to image and uploading -> %s", i, blob_name)

             doc = fitz.open(file.content.name)
             page = doc.load_page(i)
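The new log line makes the md5 skip observable. For context, a minimal sketch of the checksum pattern this hunk relies on: hashing the local file and comparing it against an "md5" entry stored in the blob's metadata. The helper name and signature below are illustrative, not code from this repository:

import hashlib
import logging

logger = logging.getLogger("scripts")  # illustrative logger name

def should_skip_upload(local_bytes: bytes, blob_metadata: dict, blob_name: str) -> bool:
    """Return True when the blob's stored md5 matches the local file's md5."""
    file_md5 = hashlib.md5(local_bytes).hexdigest()
    blob_md5 = blob_metadata.get("md5")
    if file_md5 == blob_md5:
        logger.info("Blob %s exists and md5 values match, skipping upload", blob_name)
        return True
    return False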
42 changes: 22 additions & 20 deletions app/backend/prepdocslib/filestrategy.py
@@ -22,11 +22,11 @@ async def parse_file(
     if processor is None:
         logger.info("Skipping '%s', no parser found.", file.filename())
         return []
-    logger.debug("Ingesting '%s'", file.filename())
+    logger.info("Ingesting '%s'", file.filename())
     pages = [page async for page in processor.parser.parse(content=file.content)]
-    logger.debug("Splitting '%s' into sections", file.filename())
+    logger.info("Splitting '%s' into sections", file.filename())
     if image_embeddings:
-        logger.debug("Each page will be split into smaller chunks of text, but images will be of the entire page.")
+        logger.info("Each page will be split into smaller chunks of text, but images will be of the entire page.")
     sections = [
         Section(split_page, content=file, category=category) for split_page in processor.splitter.split_pages(pages)
     ]
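The hunk above promotes per-file ingestion messages from DEBUG to INFO. Whether such a message surfaces depends on the level configured for the module's logger, as in this minimal sketch; the logger name "scripts" and the setup lines are illustrative assumptions, not code from this diff:

import logging

logging.basicConfig()
logger = logging.getLogger("scripts")
logger.setLevel(logging.INFO)

logger.debug("Ingesting '%s'", "example.pdf")  # suppressed: below INFO
logger.info("Ingesting '%s'", "example.pdf")   # emitted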
@@ -79,41 +79,43 @@ async def run(self):
         search_manager = SearchManager(
             self.search_info, self.search_analyzer_name, self.use_acls, False, self.embeddings
         )
-        doccount = self.list_file_strategy.count_docs()
-        logger.info(f"Processing {doccount} files")
+        doc_count = self.list_file_strategy.count_docs()
+        logger.info("Processing %s files", doc_count)
         processed_count = 0
         if self.document_action == DocumentAction.Add:
             files = self.list_file_strategy.list()
             async for file in files:
                 try:
-                    if self.ignore_checksum or not await search_manager.file_exists(file):
-                        sections = await parse_file(file, self.file_processors, self.category, self.image_embeddings)
-                        if sections:
-                            blob_sas_uris = await self.blob_manager.upload_blob(file)
-                            blob_image_embeddings: Optional[List[List[float]]] = None
-                            if self.image_embeddings and blob_sas_uris:
-                                blob_image_embeddings = await self.image_embeddings.create_embeddings(blob_sas_uris)
-                            await search_manager.update_content(
-                                sections=sections, file=file, image_embeddings=blob_image_embeddings
-                            )
+                    if not self.ignore_checksum and await search_manager.file_exists(file):
+                        logger.info("'%s' has already been processed", file.filename())
+                        continue
+                    sections = await parse_file(file, self.file_processors, self.category, self.image_embeddings)
+                    if sections:
+                        blob_sas_uris = await self.blob_manager.upload_blob(file)
+                        blob_image_embeddings: Optional[List[List[float]]] = None
+                        if self.image_embeddings and blob_sas_uris:
+                            blob_image_embeddings = await self.image_embeddings.create_embeddings(blob_sas_uris)
+                        await search_manager.update_content(
+                            sections=sections, file=file, image_embeddings=blob_image_embeddings
+                        )
                 finally:
                     if file:
                         file.close()
                 processed_count += 1
                 if processed_count % 10 == 0:
-                    remaining = max(doccount - processed_count, 1)
-                    logger.info(f"{processed_count} processed, {remaining} documents remaining")
+                    remaining = max(doc_count - processed_count, 1)
+                    logger.info("%s processed, %s documents remaining", processed_count, remaining)

         elif self.document_action == DocumentAction.Remove:
-            doccount = self.list_file_strategy.count_docs()
+            doc_count = self.list_file_strategy.count_docs()
             paths = self.list_file_strategy.list_paths()
             async for path in paths:
                 await self.blob_manager.remove_blob(path)
                 await search_manager.remove_content(path)
                 processed_count += 1
                 if processed_count % 10 == 0:
-                    remaining = max(doccount - processed_count, 1)
-                    logger.info(f"{processed_count} removed, {remaining} documents remaining")
+                    remaining = max(doc_count - processed_count, 1)
+                    logger.info("%s removed, %s documents remaining", processed_count, remaining)

         elif self.document_action == DocumentAction.RemoveAll:
             await self.blob_manager.remove_blob()
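Besides the early-continue refactor, this hunk renames doccount to doc_count and swaps f-string log messages for %-style placeholders. The distinction is when formatting happens: an f-string is interpolated eagerly at the call site, while the logging module stores the template and arguments and interpolates only if a handler actually emits the record. A small sketch of the difference (logger setup illustrative):

import logging

logging.basicConfig(level=logging.WARNING)  # INFO records will be discarded
logger = logging.getLogger("scripts")

doc_count = 52

# Eager: the message string is built even though the record is dropped.
logger.info(f"Processing {doc_count} files")

# Lazy: interpolation is deferred until a handler emits the record.
logger.info("Processing %s files", doc_count)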
4 changes: 2 additions & 2 deletions app/backend/prepdocslib/pdfparser.py
@@ -21,7 +21,7 @@ class LocalPdfParser(Parser):
     """

     async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
-        logger.debug("Extracting text from '%s' using local PDF parser (pypdf)", content.name)
+        logger.info("Extracting text from '%s' using local PDF parser (pypdf)", content.name)

         reader = PdfReader(content)
         pages = reader.pages
@@ -46,7 +46,7 @@ def __init__(
         self.credential = credential

     async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
-        logger.debug("Extracting text from '%s' using Azure Document Intelligence", content.name)
+        logger.info("Extracting text from '%s' using Azure Document Intelligence", content.name)

         async with DocumentIntelligenceClient(
             endpoint=self.endpoint, credential=self.credential
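Both parse methods above stream pages as an async generator and now log at INFO. Below is a minimal, self-contained sketch of the pypdf path with the same signature; the Page dataclass and its offset bookkeeping are simplified stand-ins for this library's real Page type, not code from the repository:

import logging
from dataclasses import dataclass
from typing import IO, AsyncGenerator

from pypdf import PdfReader

logger = logging.getLogger("scripts")  # illustrative logger name

@dataclass
class Page:
    page_num: int
    offset: int  # character offset of the page within the concatenated text
    text: str

async def parse(content: IO) -> AsyncGenerator[Page, None]:
    logger.info("Extracting text from '%s' using local PDF parser (pypdf)", content.name)
    reader = PdfReader(content)
    offset = 0
    for page_num, pdf_page in enumerate(reader.pages):
        text = pdf_page.extract_text()
        yield Page(page_num=page_num, offset=offset, text=text)
        offset += len(text)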
