Merge remote-tracking branch 'origin/pprados/06-pdfplumber' into pprados/06-pdfplumber

Philippe Prados 2025-03-07 14:45:44 +01:00
commit 89903c87ee


@@ -10,10 +10,6 @@ import warnings
 from datetime import datetime
 from pathlib import Path
 from tempfile import TemporaryDirectory
-from urllib.parse import urlparse
-import numpy
-import numpy as np
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -27,6 +23,11 @@ from typing import (
     Union,
     cast,
 )
+from urllib.parse import urlparse
+import numpy
+import numpy as np
+from langchain_core.documents import Document
 from langchain_community.document_loaders.base import BaseBlobParser
 from langchain_community.document_loaders.blob_loaders import Blob
@@ -34,7 +35,6 @@ from langchain_community.document_loaders.parsers.images import (
     BaseImageBlobParser,
     RapidOCRBlobParser,
 )
-from langchain_core.documents import Document
 if TYPE_CHECKING:
     import pdfplumber
@@ -1266,8 +1266,8 @@ class PyPDFium2Parser(BaseBlobParser):
         self.pages_delimiter = pages_delimiter
     def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
-        """
-        Lazily parse the blob.
+        """Lazily parse the blob.
         Insert image, if possible, between two paragraphs.
         In this way, a paragraph can be continued on the next page.
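As an aside (not part of the diff): a minimal sketch of how the parser touched by this hunk is typically driven. It assumes the optional pypdfium2 dependency is installed; the delimiter value and file name are placeholders, and only the pages_delimiter parameter name is taken from the __init__ context shown above.

    from langchain_community.document_loaders.blob_loaders import Blob
    from langchain_community.document_loaders.parsers.pdf import PyPDFium2Parser

    # pages_delimiter matches the attribute stored in __init__ above; the path is hypothetical.
    parser = PyPDFium2Parser(pages_delimiter="\n---\n")
    blob = Blob.from_path("example.pdf")
    for doc in parser.lazy_parse(blob):  # lazily yields Document objects
        print(doc.metadata, doc.page_content[:80])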
@@ -1469,7 +1469,6 @@ class PDFPlumberParser(BaseBlobParser):
         Raises:
             ValueError: If the `mode` is not "single" or "page".
             ValueError: If the `extract_tables` is not "csv", "markdown" or "html".
         """
         super().__init__()
         if mode not in ["single", "page"]:
@@ -1495,10 +1494,7 @@ class PDFPlumberParser(BaseBlobParser):
         }
     def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
-        """
-        Lazily parse the blob.
-        Insert image, if possible, between two paragraphs.
-        In this way, a paragraph can be continued on the next page.
+        """Lazily parse the blob.
         Args:
             blob: The blob to parse.
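Similarly, as an aside (not part of the diff): a hedged sketch of the options this class validates. The "page" and "markdown" values come from the Raises section above; the file name is a placeholder and pdfplumber is assumed to be installed.

    from langchain_community.document_loaders.blob_loaders import Blob
    from langchain_community.document_loaders.parsers.pdf import PDFPlumberParser

    # mode must be "single" or "page"; extract_tables must be "csv", "markdown" or "html".
    parser = PDFPlumberParser(mode="page", extract_tables="markdown")
    blob = Blob.from_path("report.pdf")  # hypothetical input file
    for doc in parser.lazy_parse(blob):
        print(doc.page_content[:80])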
@@ -1534,8 +1530,8 @@ class PDFPlumberParser(BaseBlobParser):
             # the metadatas.
             doc_metadata = (
-                doc.metadata |  # Legacy metdata with...
-                _purge_metadata(
+                doc.metadata  # Legacy metdata with...
+                | _purge_metadata(
                     (
                         doc.metadata  # Add parser metdata
                         | {  # with more keys
@@ -1696,23 +1692,19 @@ class PDFPlumberParser(BaseBlobParser):
                             )
                             yield new_textmap.to_string()
                             extract_wordmaps.clear()
-                        # and yield the table
+                        # And yield the table
                         used_arrays[i] = True
-                        # print(f"yield table {i}")
                         yield tables_content[i]
                         break
             if not is_table:
-                # print(f' Add {word["text"]}')
                 extract_wordmaps.append((word, o))
         if extract_wordmaps:
-            # Text after the array ?
             new_wordmap = text.WordMap(tuples=extract_wordmaps)
             new_textmap = new_wordmap.to_textmap(
                 **{k: kwargs[k] for k in text.TEXTMAP_KWARGS if k in kwargs}
             )
-            # print(f"yield {new_textmap.to_string()}")
             yield new_textmap.to_string()
-        # Add images-
+        # Add images
         for content in images_content:
             yield content
@ -1882,7 +1874,6 @@ class PDFPlumberParser(BaseBlobParser):
output += "|" + "|".join("---" for i in range(col_count)) + "|\n" output += "|" + "|".join("---" for i in range(col_count)) + "|\n"
# skip first row in details if header is part of the table # skip first row in details if header is part of the table
# iterate over detail rows # iterate over detail rows
for row in table: for row in table:
line = "|" line = "|"
@@ -2013,8 +2004,7 @@ class AmazonTextractPDFParser(BaseBlobParser):
             the blob.data is taken
         """
-        url_parse_result = urlparse(
-            str(blob.path)) if blob.path else None  # type: ignore[attr-defined]
+        url_parse_result = urlparse(str(blob.path)) if blob.path else None  # type: ignore[attr-defined]
         # Either call with S3 path (multi-page) or with bytes (single-page)
         if (
             url_parse_result
@@ -2060,8 +2050,7 @@ class DocumentIntelligenceParser(BaseBlobParser):
         self.client = client
         self.model = model
-    def _generate_docs(self, blob: Blob, result: Any) -> Iterator[
-        Document]:  # type: ignore[valid-type]
+    def _generate_docs(self, blob: Blob, result: Any) -> Iterator[Document]:  # type: ignore[valid-type]
         for p in result.pages:
             content = " ".join([line.content for line in p.lines])