mirror of
https://github.com/imartinez/privateGPT.git
synced 2025-09-02 15:55:02 +00:00
[wip] trying to fix page numbers
This commit is contained in:
@@ -139,10 +139,11 @@ class IngestionHelper:
|
|||||||
text = pytesseract.image_to_string(image, lang="rus")
|
text = pytesseract.image_to_string(image, lang="rus")
|
||||||
doc = StringIterableReader().load_data(
|
doc = StringIterableReader().load_data(
|
||||||
[text],
|
[text],
|
||||||
)[0]
|
)
|
||||||
doc.metadata["page_label"] = i
|
# )[0]
|
||||||
|
# doc.metadata["page_label"] = str(i + 1)
|
||||||
|
|
||||||
documents.extend(doc)
|
documents.extend([doc])
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error extracting images from PDF: {e}")
|
logger.error(f"Error extracting images from PDF: {e}")
|
||||||
raise ValueError(f"No text extracted from PDF={file_name}")
|
raise ValueError(f"No text extracted from PDF={file_name}")
|
||||||
|
Reference in New Issue
Block a user