mirror of
https://github.com/imartinez/privateGPT.git
synced 2025-09-02 15:55:02 +00:00
[wip] trying to fix page numbers
This commit is contained in:
@@ -139,10 +139,11 @@ class IngestionHelper:
|
||||
text = pytesseract.image_to_string(image, lang="rus")
|
||||
doc = StringIterableReader().load_data(
|
||||
[text],
|
||||
)[0]
|
||||
doc.metadata["page_label"] = i
|
||||
)
|
||||
# )[0]
|
||||
# doc.metadata["page_label"] = str(i + 1)
|
||||
|
||||
documents.extend(doc)
|
||||
documents.extend([doc])
|
||||
except Exception as e:
|
||||
logger.error(f"Error extracting images from PDF: {e}")
|
||||
raise ValueError(f"No text extracted from PDF={file_name}")
|
||||
|
Reference in New Issue
Block a user