Merge remote-tracking branch 'origin/pprados/06-pdfplumber' into pprados/06-pdfplumber

Philippe Prados 2025-03-07 14:45:44 +01:00
commit 89903c87ee


@@ -10,10 +10,6 @@ import warnings
from datetime import datetime
from pathlib import Path
from tempfile import TemporaryDirectory
-from urllib.parse import urlparse
-import numpy
-import numpy as np
from typing import (
TYPE_CHECKING,
Any,
@@ -27,6 +23,11 @@ from typing import (
Union,
cast,
)
+from urllib.parse import urlparse
+import numpy
+import numpy as np
+from langchain_core.documents import Document
from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob
@@ -34,7 +35,6 @@ from langchain_community.document_loaders.parsers.images import (
BaseImageBlobParser,
RapidOCRBlobParser,
)
-from langchain_core.documents import Document
if TYPE_CHECKING:
import pdfplumber
@@ -1266,8 +1266,8 @@ class PyPDFium2Parser(BaseBlobParser):
self.pages_delimiter = pages_delimiter
def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type]
"""
Lazily parse the blob.
"""Lazily parse the blob.
Insert image, if possible, between two paragraphs.
In this way, a paragraph can be continued on the next page.
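
A note on the lazy_parse contract shown in this hunk: it streams Documents one at a time instead of loading the whole PDF up front. A minimal consumption sketch, assuming constructor defaults and a hypothetical local file path:

from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers import PyPDFium2Parser

parser = PyPDFium2Parser()            # defaults assumed; see pages_delimiter above
blob = Blob.from_path("example.pdf")  # hypothetical local PDF
for doc in parser.lazy_parse(blob):   # Documents are yielded lazily, one at a time
    print(doc.metadata, len(doc.page_content))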
@@ -1469,7 +1469,6 @@ class PDFPlumberParser(BaseBlobParser):
Raises:
ValueError: If the `mode` is not "single" or "page".
ValueError: If the `extract_tables` is not "csv", "markdown" or "html".
"""
super().__init__()
if mode not in ["single", "page"]:
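
For the Raises contract above, a small sketch of how the constructor validation behaves; the accepted values come from the docstring, but the exact error message text is not guaranteed:

from langchain_community.document_loaders.parsers import PDFPlumberParser

PDFPlumberParser(mode="page", extract_tables="markdown")  # documented values: accepted
try:
    PDFPlumberParser(mode="chapter")  # not "single"/"page": expected to raise
except ValueError as exc:
    print(exc)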
@@ -1495,10 +1494,7 @@ class PDFPlumberParser(BaseBlobParser):
}
def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type]
"""
Lazily parse the blob.
Insert image, if possible, between two paragraphs.
In this way, a paragraph can be continued on the next page.
"""Lazily parse the blob.
Args:
blob: The blob to parse.
@@ -1534,8 +1530,8 @@ class PDFPlumberParser(BaseBlobParser):
# the metadatas.
doc_metadata = (
-doc.metadata | # Legacy metdata with...
-_purge_metadata(
+doc.metadata # Legacy metdata with...
+| _purge_metadata(
(
doc.metadata # Add parser metdata
| { # with more keys
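
The `|` used to build doc_metadata is the Python 3.9+ dict merge operator: on duplicate keys the right-hand operand wins, which is what lets the parser-provided keys override the legacy ones. A standalone illustration with made-up values:

legacy = {"source": "example.pdf", "producer": "legacy"}
parser_meta = {"producer": "pdfplumber", "total_pages": 3}
print(legacy | parser_meta)
# {'source': 'example.pdf', 'producer': 'pdfplumber', 'total_pages': 3}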
@@ -1696,23 +1692,19 @@ class PDFPlumberParser(BaseBlobParser):
)
yield new_textmap.to_string()
extract_wordmaps.clear()
-# and yield the table
+# And yield the table
used_arrays[i] = True
# print(f"yield table {i}")
yield tables_content[i]
break
if not is_table:
-# print(f' Add {word["text"]}')
extract_wordmaps.append((word, o))
if extract_wordmaps:
# Text after the array ?
new_wordmap = text.WordMap(tuples=extract_wordmaps)
new_textmap = new_wordmap.to_textmap(
**{k: kwargs[k] for k in text.TEXTMAP_KWARGS if k in kwargs}
)
# print(f"yield {new_textmap.to_string()}")
yield new_textmap.to_string()
-# Add images-
+# Add images
for content in images_content:
yield content
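
The loop above implements a flush-then-yield interleaving: plain words accumulate in a WordMap buffer, and when a word falls inside a not-yet-emitted table, the buffer is flushed first so the table lands between the surrounding text in reading order. A simplified, self-contained sketch of that pattern; the names are illustrative, not the module's API:

from typing import Iterator, Optional

def interleave(words: list[tuple[str, Optional[int]]], tables: list[str]) -> Iterator[str]:
    emitted = [False] * len(tables)
    buffer: list[str] = []
    for word, table_idx in words:       # table_idx is None for plain text
        if table_idx is not None:
            if buffer:                  # flush the text collected so far
                yield " ".join(buffer)
                buffer.clear()
            if not emitted[table_idx]:  # each table is yielded only once
                emitted[table_idx] = True
                yield tables[table_idx]
        else:
            buffer.append(word)
    if buffer:                          # trailing text after the last table
        yield " ".join(buffer)

print(list(interleave([("a", None), ("x", 0), ("b", None)], ["<table>"])))
# ['a', '<table>', 'b']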
@@ -1882,7 +1874,6 @@ class PDFPlumberParser(BaseBlobParser):
output += "|" + "|".join("---" for i in range(col_count)) + "|\n"
# skip first row in details if header is part of the table
-# iterate over detail rows
for row in table:
line = "|"
@@ -2013,8 +2004,7 @@ class AmazonTextractPDFParser(BaseBlobParser):
the blob.data is taken
"""
-url_parse_result = urlparse(
-str(blob.path)) if blob.path else None # type: ignore[attr-defined]
+url_parse_result = urlparse(str(blob.path)) if blob.path else None # type: ignore[attr-defined]
# Either call with S3 path (multi-page) or with bytes (single-page)
if (
url_parse_result
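
The urlparse call is what routes a blob to the S3 (multi-page) branch rather than the bytes (single-page) branch: an s3:// path produces a scheme and netloc, while a plain local path does not. For example, with hypothetical paths:

from urllib.parse import urlparse

r = urlparse("s3://my-bucket/doc.pdf")
print(r.scheme, r.netloc, r.path)       # s3 my-bucket /doc.pdf
print(urlparse("/tmp/doc.pdf").scheme)  # '' -> falls back to blob bytes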
@@ -2060,8 +2050,7 @@ class DocumentIntelligenceParser(BaseBlobParser):
self.client = client
self.model = model
-def _generate_docs(self, blob: Blob, result: Any) -> Iterator[
-Document]: # type: ignore[valid-type]
+def _generate_docs(self, blob: Blob, result: Any) -> Iterator[Document]: # type: ignore[valid-type]
for p in result.pages:
content = " ".join([line.content for line in p.lines])