mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-14 15:16:21 +00:00
Fix revue
This commit is contained in:
parent
dd909d2914
commit
38b50e3277
@ -454,9 +454,7 @@ class PyPDFParser(BaseBlobParser):
|
|||||||
image_bytes = io.BytesIO()
|
image_bytes = io.BytesIO()
|
||||||
Image.fromarray(np_image).save(image_bytes, format="PNG")
|
Image.fromarray(np_image).save(image_bytes, format="PNG")
|
||||||
blob = Blob.from_data(image_bytes.getvalue(), mime_type="image/png")
|
blob = Blob.from_data(image_bytes.getvalue(), mime_type="image/png")
|
||||||
image_text = next(
|
image_text = next(self.images_parser.lazy_parse(blob)).page_content
|
||||||
self.images_parser.lazy_parse(blob) # type: ignore
|
|
||||||
).page_content
|
|
||||||
images.append(
|
images.append(
|
||||||
_format_inner_image(blob, image_text, self.images_inner_format)
|
_format_inner_image(blob, image_text, self.images_inner_format)
|
||||||
)
|
)
|
||||||
@ -751,7 +749,7 @@ class PDFMinerParser(BaseBlobParser):
|
|||||||
blob = Blob.from_path(Path(tempdir) / filename)
|
blob = Blob.from_path(Path(tempdir) / filename)
|
||||||
blob.metadata["source"] = "#"
|
blob.metadata["source"] = "#"
|
||||||
image_text = next(
|
image_text = next(
|
||||||
self.images_parser.lazy_parse(blob) # type: ignore
|
self.images_parser.lazy_parse(blob)
|
||||||
).page_content
|
).page_content
|
||||||
|
|
||||||
text_io.write(
|
text_io.write(
|
||||||
@ -1104,9 +1102,7 @@ class PyMuPDFParser(BaseBlobParser):
|
|||||||
blob = Blob.from_data(
|
blob = Blob.from_data(
|
||||||
image_bytes.getvalue(), mime_type="application/x-npy"
|
image_bytes.getvalue(), mime_type="application/x-npy"
|
||||||
)
|
)
|
||||||
image_text = next(
|
image_text = next(self.images_parser.lazy_parse(blob)).page_content
|
||||||
self.images_parser.lazy_parse(blob) # type: ignore
|
|
||||||
).page_content
|
|
||||||
|
|
||||||
images.append(
|
images.append(
|
||||||
_format_inner_image(blob, image_text, self.images_inner_format)
|
_format_inner_image(blob, image_text, self.images_inner_format)
|
||||||
@ -1196,6 +1192,8 @@ class PyPDFium2Parser(BaseBlobParser):
|
|||||||
# password=None,
|
# password=None,
|
||||||
mode="page",
|
mode="page",
|
||||||
pages_delimiter="\n\f",
|
pages_delimiter="\n\f",
|
||||||
|
# extract_images = True,
|
||||||
|
# images_to_text = convert_images_to_text_with_tesseract(),
|
||||||
)
|
)
|
||||||
|
|
||||||
Lazily parse the blob:
|
Lazily parse the blob:
|
||||||
@ -1365,9 +1363,7 @@ class PyPDFium2Parser(BaseBlobParser):
|
|||||||
continue
|
continue
|
||||||
numpy.save(image_bytes, image.get_bitmap().to_numpy())
|
numpy.save(image_bytes, image.get_bitmap().to_numpy())
|
||||||
blob = Blob.from_data(image_bytes.getvalue(), mime_type="application/x-npy")
|
blob = Blob.from_data(image_bytes.getvalue(), mime_type="application/x-npy")
|
||||||
text_from_image = next(
|
text_from_image = next(self.images_parser.lazy_parse(blob)).page_content
|
||||||
self.images_parser.lazy_parse(blob) # type: ignore
|
|
||||||
).page_content
|
|
||||||
str_images.append(
|
str_images.append(
|
||||||
_format_inner_image(blob, text_from_image, self.images_inner_format)
|
_format_inner_image(blob, text_from_image, self.images_inner_format)
|
||||||
)
|
)
|
||||||
@ -1410,6 +1406,7 @@ class PDFPlumberParser(BaseBlobParser):
|
|||||||
mode = "single",
|
mode = "single",
|
||||||
pages_delimiter = "\n\f",
|
pages_delimiter = "\n\f",
|
||||||
# extract_tables="markdown",
|
# extract_tables="markdown",
|
||||||
|
metadata_format="standard",
|
||||||
)
|
)
|
||||||
|
|
||||||
Lazily parse the blob:
|
Lazily parse the blob:
|
||||||
@ -1438,6 +1435,7 @@ class PDFPlumberParser(BaseBlobParser):
|
|||||||
images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
|
images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
|
||||||
extract_tables: Optional[Literal["csv", "markdown", "html"]] = None,
|
extract_tables: Optional[Literal["csv", "markdown", "html"]] = None,
|
||||||
extract_tables_settings: Optional[dict[str, Any]] = None,
|
extract_tables_settings: Optional[dict[str, Any]] = None,
|
||||||
|
metadata_format: Literal["legacy", "standard"] = "legacy",
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Initialize the parser.
|
"""Initialize the parser.
|
||||||
|
|
||||||
@ -1461,6 +1459,8 @@ class PDFPlumberParser(BaseBlobParser):
|
|||||||
dedupe: Avoiding the error of duplicate characters if `dedupe=True`
|
dedupe: Avoiding the error of duplicate characters if `dedupe=True`
|
||||||
extract_tables_settings: Optional dictionary of settings for customizing
|
extract_tables_settings: Optional dictionary of settings for customizing
|
||||||
table extraction.
|
table extraction.
|
||||||
|
metadata_format: Use CamelCase keys with 'legacy'
|
||||||
|
and lower keys with 'standard'.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
This method does not directly return data. Use the `parse` or `lazy_parse`
|
This method does not directly return data. Use the `parse` or `lazy_parse`
|
||||||
@ -1492,6 +1492,19 @@ class PDFPlumberParser(BaseBlobParser):
|
|||||||
"snap_y_tolerance": 5,
|
"snap_y_tolerance": 5,
|
||||||
"intersection_x_tolerance": 15,
|
"intersection_x_tolerance": 15,
|
||||||
}
|
}
|
||||||
|
if metadata_format == "legacy":
|
||||||
|
warnings.warn(
|
||||||
|
"The default value 'legacy' use some CamelCase keys. "
|
||||||
|
"It's will be deprecated in the next major version."
|
||||||
|
)
|
||||||
|
|
||||||
|
self.metadata_format = metadata_format
|
||||||
|
|
||||||
|
def _validate_metadata(self, metadata: dict[str, Any]) -> dict[str, Any]:
|
||||||
|
if self.metadata_format == "legacy":
|
||||||
|
return metadata
|
||||||
|
else:
|
||||||
|
return _validate_metadata(metadata)
|
||||||
|
|
||||||
def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type]
|
def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type]
|
||||||
"""Lazily parse the blob.
|
"""Lazily parse the blob.
|
||||||
@ -1520,19 +1533,8 @@ class PDFPlumberParser(BaseBlobParser):
|
|||||||
contents = []
|
contents = []
|
||||||
# The legacy version, use CreationDate, Creator, etc.
|
# The legacy version, use CreationDate, Creator, etc.
|
||||||
# The new 'standard' version must use lower case key.
|
# The new 'standard' version must use lower case key.
|
||||||
# This next line, merge the legecy keys and standard keys
|
if self.metadata_format == "legacy":
|
||||||
# in the same dictionary.
|
|
||||||
# - The CreationDate is duplicate to `creationdate` with iso format.
|
|
||||||
# - The Creator is duplicate to 'creator', etc.
|
|
||||||
# With this strategy, the legacy code can continue to use CreationDate
|
|
||||||
# or Creator. The new code, can use `creationdate` or `creator`.
|
|
||||||
# _purge_metadata() convert and normalize the name and format of
|
|
||||||
# the metadatas.
|
|
||||||
|
|
||||||
doc_metadata = (
|
doc_metadata = (
|
||||||
doc.metadata # Legacy metdata with...
|
|
||||||
| _purge_metadata(
|
|
||||||
(
|
|
||||||
doc.metadata # Add parser metdata
|
doc.metadata # Add parser metdata
|
||||||
| { # with more keys
|
| { # with more keys
|
||||||
"source": blob.source,
|
"source": blob.source,
|
||||||
@ -1540,6 +1542,15 @@ class PDFPlumberParser(BaseBlobParser):
|
|||||||
"total_pages": len(doc.pages),
|
"total_pages": len(doc.pages),
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
else:
|
||||||
|
doc_metadata = _purge_metadata(
|
||||||
|
(
|
||||||
|
doc.metadata # Add parser metdata
|
||||||
|
| { # with more keys
|
||||||
|
"source": blob.source,
|
||||||
|
"file_path": blob.source,
|
||||||
|
"total_pages": len(doc.pages),
|
||||||
|
}
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -1596,7 +1607,7 @@ class PDFPlumberParser(BaseBlobParser):
|
|||||||
all_text += "\n"
|
all_text += "\n"
|
||||||
yield Document(
|
yield Document(
|
||||||
page_content=all_text,
|
page_content=all_text,
|
||||||
metadata=_validate_metadata(
|
metadata=self._validate_metadata(
|
||||||
doc_metadata
|
doc_metadata
|
||||||
| {
|
| {
|
||||||
"page": page.page_number - 1,
|
"page": page.page_number - 1,
|
||||||
@ -1608,7 +1619,7 @@ class PDFPlumberParser(BaseBlobParser):
|
|||||||
if self.mode == "single":
|
if self.mode == "single":
|
||||||
yield Document(
|
yield Document(
|
||||||
page_content=self.pages_delimiter.join(contents),
|
page_content=self.pages_delimiter.join(contents),
|
||||||
metadata=_validate_metadata(doc_metadata),
|
metadata=self._validate_metadata(doc_metadata),
|
||||||
)
|
)
|
||||||
|
|
||||||
def _process_page_content(self, page: pdfplumber.page.Page) -> str:
|
def _process_page_content(self, page: pdfplumber.page.Page) -> str:
|
||||||
|
@ -1044,6 +1044,7 @@ class PDFPlumberLoader(BasePDFLoader):
|
|||||||
# extract_tables_settings = None,
|
# extract_tables_settings = None,
|
||||||
# text_kwargs = {"use_text_flow": False, "keep_blank_chars": False},
|
# text_kwargs = {"use_text_flow": False, "keep_blank_chars": False},
|
||||||
# dedupe = False,
|
# dedupe = False,
|
||||||
|
metadata_format="standard",
|
||||||
)
|
)
|
||||||
|
|
||||||
Lazy load documents:
|
Lazy load documents:
|
||||||
@ -1082,6 +1083,7 @@ class PDFPlumberLoader(BasePDFLoader):
|
|||||||
pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
|
pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
|
||||||
extract_tables: Optional[Literal["csv", "markdown", "html"]] = None,
|
extract_tables: Optional[Literal["csv", "markdown", "html"]] = None,
|
||||||
extract_tables_settings: Optional[dict[str, Any]] = None,
|
extract_tables_settings: Optional[dict[str, Any]] = None,
|
||||||
|
metadata_format: Literal["legacy", "standard"] = "legacy",
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Initialize with a file path.
|
"""Initialize with a file path.
|
||||||
|
|
||||||
@ -1108,6 +1110,8 @@ class PDFPlumberLoader(BasePDFLoader):
|
|||||||
table extraction.
|
table extraction.
|
||||||
text_kwargs: Keyword arguments to pass to ``pdfplumber.Page.extract_text()``
|
text_kwargs: Keyword arguments to pass to ``pdfplumber.Page.extract_text()``
|
||||||
dedupe: Avoiding the error of duplicate characters if `dedupe=True`
|
dedupe: Avoiding the error of duplicate characters if `dedupe=True`
|
||||||
|
metadata_format: Use CamelCase keys with 'legacy'
|
||||||
|
and lower keys with 'standard'.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
This method does not directly return data. Use the `load`, `lazy_load`,
|
This method does not directly return data. Use the `load`, `lazy_load`,
|
||||||
@ -1129,6 +1133,7 @@ class PDFPlumberLoader(BasePDFLoader):
|
|||||||
text_kwargs=text_kwargs,
|
text_kwargs=text_kwargs,
|
||||||
extract_tables_settings=extract_tables_settings,
|
extract_tables_settings=extract_tables_settings,
|
||||||
dedupe=dedupe,
|
dedupe=dedupe,
|
||||||
|
metadata_format=metadata_format,
|
||||||
)
|
)
|
||||||
|
|
||||||
def lazy_load(
|
def lazy_load(
|
||||||
|
@ -2,19 +2,18 @@
|
|||||||
|
|
||||||
import re
|
import re
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import TYPE_CHECKING, Iterator
|
from typing import TYPE_CHECKING, Iterator, Type
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
import langchain_community.document_loaders.parsers as pdf_parsers
|
|
||||||
from langchain_community.document_loaders.base import BaseBlobParser
|
from langchain_community.document_loaders.base import BaseBlobParser
|
||||||
from langchain_community.document_loaders.blob_loaders import Blob
|
from langchain_community.document_loaders.blob_loaders import Blob
|
||||||
from langchain_community.document_loaders.parsers import (
|
from langchain_community.document_loaders.parsers import (
|
||||||
BaseImageBlobParser,
|
BaseImageBlobParser,
|
||||||
PDFPlumberParser,
|
|
||||||
)
|
)
|
||||||
from langchain_community.document_loaders.parsers.pdf import (
|
from langchain_community.document_loaders.parsers.pdf import (
|
||||||
PDFMinerParser,
|
PDFMinerParser,
|
||||||
|
PDFPlumberParser,
|
||||||
PyMuPDFParser,
|
PyMuPDFParser,
|
||||||
PyPDFium2Parser,
|
PyPDFium2Parser,
|
||||||
PyPDFParser,
|
PyPDFParser,
|
||||||
@ -114,7 +113,7 @@ class EmptyImageBlobParser(BaseImageBlobParser):
|
|||||||
"parser_class,params",
|
"parser_class,params",
|
||||||
[
|
[
|
||||||
(PDFMinerParser, {}),
|
(PDFMinerParser, {}),
|
||||||
(PDFPlumberParser, {}),
|
(PDFPlumberParser, {"metadata_format": "standard"}),
|
||||||
(PyMuPDFParser, {}),
|
(PyMuPDFParser, {}),
|
||||||
(PyPDFium2Parser, {}),
|
(PyPDFium2Parser, {}),
|
||||||
(PyPDFParser, {"extraction_mode": "plain"}),
|
(PyPDFParser, {"extraction_mode": "plain"}),
|
||||||
@ -145,7 +144,7 @@ def test_mode_and_extract_images_variations(
|
|||||||
"parser_class,params",
|
"parser_class,params",
|
||||||
[
|
[
|
||||||
(PDFMinerParser, {}),
|
(PDFMinerParser, {}),
|
||||||
(PDFPlumberParser, {}),
|
(PDFPlumberParser, {"metadata_format": "standard"}),
|
||||||
(PyMuPDFParser, {}),
|
(PyMuPDFParser, {}),
|
||||||
(PyPDFium2Parser, {}),
|
(PyPDFium2Parser, {}),
|
||||||
(PyPDFParser, {"extraction_mode": "plain"}),
|
(PyPDFParser, {"extraction_mode": "plain"}),
|
||||||
@ -254,8 +253,6 @@ def test_parser_with_table(
|
|||||||
mode: str,
|
mode: str,
|
||||||
extract_tables: str,
|
extract_tables: str,
|
||||||
) -> None:
|
) -> None:
|
||||||
parser_class = getattr(pdf_parsers, parser_factory)
|
|
||||||
|
|
||||||
from PIL.Image import Image
|
from PIL.Image import Image
|
||||||
|
|
||||||
from langchain_community.document_loaders.parsers.images import BaseImageBlobParser
|
from langchain_community.document_loaders.parsers.images import BaseImageBlobParser
|
||||||
|
@ -1,15 +1,15 @@
|
|||||||
import os
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Sequence, Union
|
from typing import Sequence, Type, Union
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
import langchain_community.document_loaders as pdf_loaders
|
from langchain_community.document_loaders.pdf import (
|
||||||
from langchain_community.document_loaders import (
|
|
||||||
AmazonTextractPDFLoader,
|
AmazonTextractPDFLoader,
|
||||||
MathpixPDFLoader,
|
MathpixPDFLoader,
|
||||||
PDFMinerLoader,
|
PDFMinerLoader,
|
||||||
PDFMinerPDFasHTMLLoader,
|
PDFMinerPDFasHTMLLoader,
|
||||||
|
PDFPlumberLoader,
|
||||||
PyMuPDFLoader,
|
PyMuPDFLoader,
|
||||||
PyPDFium2Loader,
|
PyPDFium2Loader,
|
||||||
PyPDFLoader,
|
PyPDFLoader,
|
||||||
@ -171,7 +171,7 @@ def test_amazontextract_loader_failures() -> None:
|
|||||||
"loader_class,params",
|
"loader_class,params",
|
||||||
[
|
[
|
||||||
(PDFMinerLoader, {}),
|
(PDFMinerLoader, {}),
|
||||||
(PDFPlumberLoader, {}),
|
(PDFPlumberLoader, {"metadata_format": "standard"}),
|
||||||
(PyMuPDFLoader, {}),
|
(PyMuPDFLoader, {}),
|
||||||
(PyPDFium2Loader, {}),
|
(PyPDFium2Loader, {}),
|
||||||
(PyPDFLoader, {}),
|
(PyPDFLoader, {}),
|
||||||
@ -181,7 +181,6 @@ def test_standard_parameters(
|
|||||||
loader_class: Type,
|
loader_class: Type,
|
||||||
params: dict,
|
params: dict,
|
||||||
) -> None:
|
) -> None:
|
||||||
|
|
||||||
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
|
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
|
||||||
loader = loader_class(file_path)
|
loader = loader_class(file_path)
|
||||||
docs = loader.load()
|
docs = loader.load()
|
||||||
|
@ -2,15 +2,15 @@
|
|||||||
|
|
||||||
import importlib
|
import importlib
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Iterator
|
from typing import Any, Iterator, Type
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
import langchain_community.document_loaders.parsers as pdf_parsers
|
|
||||||
from langchain_community.document_loaders.base import BaseBlobParser
|
from langchain_community.document_loaders.base import BaseBlobParser
|
||||||
from langchain_community.document_loaders.blob_loaders import Blob
|
from langchain_community.document_loaders.blob_loaders import Blob
|
||||||
from langchain_community.document_loaders.parsers.pdf import (
|
from langchain_community.document_loaders.parsers.pdf import (
|
||||||
PDFMinerParser,
|
PDFMinerParser,
|
||||||
|
PDFPlumberParser,
|
||||||
PyMuPDFParser,
|
PyMuPDFParser,
|
||||||
PyPDFium2Parser,
|
PyPDFium2Parser,
|
||||||
PyPDFParser,
|
PyPDFParser,
|
||||||
@ -78,24 +78,25 @@ def _assert_with_parser(parser: BaseBlobParser, *, splits_by_page: bool = True)
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"parser_class,require,params",
|
"parser_class,require,ctr_params,params",
|
||||||
[
|
[
|
||||||
(PDFMinerParser, "pdfminer", {"splits_by_page": False}),
|
(PDFMinerParser, "pdfminer", {}, {"splits_by_page": False}),
|
||||||
(PDFPlumberParser, "pdfplumber", {}),
|
(PDFPlumberParser, "pdfplumber", {"metadata_format": "standard"}, {}),
|
||||||
(PyMuPDFParser, "pymupdf", {}),
|
(PyMuPDFParser, "pymupdf", {}, {}),
|
||||||
(PyPDFParser, "pypdf", {}),
|
(PyPDFParser, "pypdf", {}, {}),
|
||||||
(PyPDFium2Parser, "pypdfium2", {}),
|
(PyPDFium2Parser, "pypdfium2", {}, {}),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_parsers(
|
def test_parsers(
|
||||||
parser_class: Type,
|
parser_class: Type,
|
||||||
require: str,
|
require: str,
|
||||||
|
ctr_params: dict[str, Any],
|
||||||
params: dict[str, Any],
|
params: dict[str, Any],
|
||||||
) -> None:
|
) -> None:
|
||||||
try:
|
try:
|
||||||
require = require.replace("-", "")
|
require = require.replace("-", "")
|
||||||
importlib.import_module(require, package=None)
|
importlib.import_module(require, package=None)
|
||||||
parser = parser_class()
|
parser = parser_class(**ctr_params)
|
||||||
_assert_with_parser(parser, **params)
|
_assert_with_parser(parser, **params)
|
||||||
except ModuleNotFoundError:
|
except ModuleNotFoundError:
|
||||||
pytest.skip(f"{parser_class} skiped. Require '{require}'")
|
pytest.skip(f"{parser_class} skiped. Require '{require}'")
|
||||||
|
Loading…
Reference in New Issue
Block a user