[WIP] Trying to fix page numbers (`page_label` metadata) for OCR-extracted PDF pages

This commit is contained in:
zoazhyga
2024-09-05 18:03:43 +02:00
parent cc6091961b
commit f592ae712c

View File

@@ -139,10 +139,11 @@ class IngestionHelper:
text = pytesseract.image_to_string(image, lang="rus") text = pytesseract.image_to_string(image, lang="rus")
doc = StringIterableReader().load_data( doc = StringIterableReader().load_data(
[text], [text],
)[0] )
doc.metadata["page_label"] = i # )[0]
# doc.metadata["page_label"] = str(i + 1)
documents.extend(doc) documents.extend([doc])
except Exception as e: except Exception as e:
logger.error(f"Error extracting images from PDF: {e}") logger.error(f"Error extracting images from PDF: {e}")
raise ValueError(f"No text extracted from PDF={file_name}") raise ValueError(f"No text extracted from PDF={file_name}")