mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-22 23:00:00 +00:00
Merge remote-tracking branch 'origin/pprados/06-pdfplumber' into pprados/06-pdfplumber
This commit is contained in:
commit
89903c87ee
@ -10,10 +10,6 @@ import warnings
|
|||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from tempfile import TemporaryDirectory
|
from tempfile import TemporaryDirectory
|
||||||
from urllib.parse import urlparse
|
|
||||||
|
|
||||||
import numpy
|
|
||||||
import numpy as np
|
|
||||||
from typing import (
|
from typing import (
|
||||||
TYPE_CHECKING,
|
TYPE_CHECKING,
|
||||||
Any,
|
Any,
|
||||||
@ -27,6 +23,11 @@ from typing import (
|
|||||||
Union,
|
Union,
|
||||||
cast,
|
cast,
|
||||||
)
|
)
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
import numpy
|
||||||
|
import numpy as np
|
||||||
|
from langchain_core.documents import Document
|
||||||
|
|
||||||
from langchain_community.document_loaders.base import BaseBlobParser
|
from langchain_community.document_loaders.base import BaseBlobParser
|
||||||
from langchain_community.document_loaders.blob_loaders import Blob
|
from langchain_community.document_loaders.blob_loaders import Blob
|
||||||
@ -34,7 +35,6 @@ from langchain_community.document_loaders.parsers.images import (
|
|||||||
BaseImageBlobParser,
|
BaseImageBlobParser,
|
||||||
RapidOCRBlobParser,
|
RapidOCRBlobParser,
|
||||||
)
|
)
|
||||||
from langchain_core.documents import Document
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
import pdfplumber
|
import pdfplumber
|
||||||
@ -1266,8 +1266,8 @@ class PyPDFium2Parser(BaseBlobParser):
|
|||||||
self.pages_delimiter = pages_delimiter
|
self.pages_delimiter = pages_delimiter
|
||||||
|
|
||||||
def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type]
|
def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type]
|
||||||
"""
|
"""Lazily parse the blob.
|
||||||
Lazily parse the blob.
|
|
||||||
Insert image, if possible, between two paragraphs.
|
Insert image, if possible, between two paragraphs.
|
||||||
In this way, a paragraph can be continued on the next page.
|
In this way, a paragraph can be continued on the next page.
|
||||||
|
|
||||||
@ -1469,7 +1469,6 @@ class PDFPlumberParser(BaseBlobParser):
|
|||||||
Raises:
|
Raises:
|
||||||
ValueError: If the `mode` is not "single" or "page".
|
ValueError: If the `mode` is not "single" or "page".
|
||||||
ValueError: If the `extract_tables` is not "csv", "markdown" or "html".
|
ValueError: If the `extract_tables` is not "csv", "markdown" or "html".
|
||||||
|
|
||||||
"""
|
"""
|
||||||
super().__init__()
|
super().__init__()
|
||||||
if mode not in ["single", "page"]:
|
if mode not in ["single", "page"]:
|
||||||
@ -1495,10 +1494,7 @@ class PDFPlumberParser(BaseBlobParser):
|
|||||||
}
|
}
|
||||||
|
|
||||||
def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type]
|
def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type]
|
||||||
"""
|
"""Lazily parse the blob.
|
||||||
Lazily parse the blob.
|
|
||||||
Insert image, if possible, between two paragraphs.
|
|
||||||
In this way, a paragraph can be continued on the next page.
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
blob: The blob to parse.
|
blob: The blob to parse.
|
||||||
@ -1534,8 +1530,8 @@ class PDFPlumberParser(BaseBlobParser):
|
|||||||
# the metadatas.
|
# the metadatas.
|
||||||
|
|
||||||
doc_metadata = (
|
doc_metadata = (
|
||||||
doc.metadata | # Legacy metdata with...
|
doc.metadata # Legacy metdata with...
|
||||||
_purge_metadata(
|
| _purge_metadata(
|
||||||
(
|
(
|
||||||
doc.metadata # Add parser metdata
|
doc.metadata # Add parser metdata
|
||||||
| { # with more keys
|
| { # with more keys
|
||||||
@ -1696,23 +1692,19 @@ class PDFPlumberParser(BaseBlobParser):
|
|||||||
)
|
)
|
||||||
yield new_textmap.to_string()
|
yield new_textmap.to_string()
|
||||||
extract_wordmaps.clear()
|
extract_wordmaps.clear()
|
||||||
# and yield the table
|
# And yield the table
|
||||||
used_arrays[i] = True
|
used_arrays[i] = True
|
||||||
# print(f"yield table {i}")
|
|
||||||
yield tables_content[i]
|
yield tables_content[i]
|
||||||
break
|
break
|
||||||
if not is_table:
|
if not is_table:
|
||||||
# print(f' Add {word["text"]}')
|
|
||||||
extract_wordmaps.append((word, o))
|
extract_wordmaps.append((word, o))
|
||||||
if extract_wordmaps:
|
if extract_wordmaps:
|
||||||
# Text after the array ?
|
|
||||||
new_wordmap = text.WordMap(tuples=extract_wordmaps)
|
new_wordmap = text.WordMap(tuples=extract_wordmaps)
|
||||||
new_textmap = new_wordmap.to_textmap(
|
new_textmap = new_wordmap.to_textmap(
|
||||||
**{k: kwargs[k] for k in text.TEXTMAP_KWARGS if k in kwargs}
|
**{k: kwargs[k] for k in text.TEXTMAP_KWARGS if k in kwargs}
|
||||||
)
|
)
|
||||||
# print(f"yield {new_textmap.to_string()}")
|
|
||||||
yield new_textmap.to_string()
|
yield new_textmap.to_string()
|
||||||
# Add images-
|
# Add images
|
||||||
for content in images_content:
|
for content in images_content:
|
||||||
yield content
|
yield content
|
||||||
|
|
||||||
@ -1882,7 +1874,6 @@ class PDFPlumberParser(BaseBlobParser):
|
|||||||
output += "|" + "|".join("---" for i in range(col_count)) + "|\n"
|
output += "|" + "|".join("---" for i in range(col_count)) + "|\n"
|
||||||
|
|
||||||
# skip first row in details if header is part of the table
|
# skip first row in details if header is part of the table
|
||||||
|
|
||||||
# iterate over detail rows
|
# iterate over detail rows
|
||||||
for row in table:
|
for row in table:
|
||||||
line = "|"
|
line = "|"
|
||||||
@ -2013,8 +2004,7 @@ class AmazonTextractPDFParser(BaseBlobParser):
|
|||||||
the blob.data is taken
|
the blob.data is taken
|
||||||
"""
|
"""
|
||||||
|
|
||||||
url_parse_result = urlparse(
|
url_parse_result = urlparse(str(blob.path)) if blob.path else None # type: ignore[attr-defined]
|
||||||
str(blob.path)) if blob.path else None # type: ignore[attr-defined]
|
|
||||||
# Either call with S3 path (multi-page) or with bytes (single-page)
|
# Either call with S3 path (multi-page) or with bytes (single-page)
|
||||||
if (
|
if (
|
||||||
url_parse_result
|
url_parse_result
|
||||||
@ -2060,8 +2050,7 @@ class DocumentIntelligenceParser(BaseBlobParser):
|
|||||||
self.client = client
|
self.client = client
|
||||||
self.model = model
|
self.model = model
|
||||||
|
|
||||||
def _generate_docs(self, blob: Blob, result: Any) -> Iterator[
|
def _generate_docs(self, blob: Blob, result: Any) -> Iterator[Document]: # type: ignore[valid-type]
|
||||||
Document]: # type: ignore[valid-type]
|
|
||||||
for p in result.pages:
|
for p in result.pages:
|
||||||
content = " ".join([line.content for line in p.lines])
|
content = " ".join([line.content for line in p.lines])
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user