From f592ae712c907fa16ec5455757d62f715412c06d Mon Sep 17 00:00:00 2001 From: zoazhyga Date: Thu, 5 Sep 2024 18:03:43 +0200 Subject: [PATCH] [wip] trying to fix page numbers --- private_gpt/components/ingest/ingest_helper.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/private_gpt/components/ingest/ingest_helper.py b/private_gpt/components/ingest/ingest_helper.py index ca3f750a..8521ea07 100644 --- a/private_gpt/components/ingest/ingest_helper.py +++ b/private_gpt/components/ingest/ingest_helper.py @@ -139,10 +139,11 @@ class IngestionHelper: text = pytesseract.image_to_string(image, lang="rus") doc = StringIterableReader().load_data( [text], - )[0] - doc.metadata["page_label"] = i + ) + # )[0] + # doc.metadata["page_label"] = str(i + 1) - documents.extend(doc) + documents.extend([doc]) except Exception as e: logger.error(f"Error extracting images from PDF: {e}") raise ValueError(f"No text extracted from PDF={file_name}")