mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-21 14:18:52 +00:00
Amazon Textract as document loader (#8661)
Description: Adding support for [Amazon Textract](https://aws.amazon.com/textract/) as a PDF document loader --------- Co-authored-by: schadem <45048633+schadem@users.noreply.github.com> Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
parent
82ef1f587d
commit
8374367de2
@ -108,6 +108,7 @@ from langchain.document_loaders.onedrive_file import OneDriveFileLoader
|
||||
from langchain.document_loaders.open_city_data import OpenCityDataLoader
|
||||
from langchain.document_loaders.org_mode import UnstructuredOrgModeLoader
|
||||
from langchain.document_loaders.pdf import (
|
||||
AmazonTextractPDFLoader,
|
||||
MathpixPDFLoader,
|
||||
OnlinePDFLoader,
|
||||
PDFMinerLoader,
|
||||
@ -330,4 +331,5 @@ __all__ = [
|
||||
"YoutubeAudioLoader",
|
||||
"YoutubeLoader",
|
||||
"ConcurrentLoader",
|
||||
"AmazonTextractPDFLoader",
|
||||
]
|
||||
|
@ -1,5 +1,6 @@
|
||||
"""Module contains common parsers for PDFs."""
|
||||
from typing import Any, Iterator, Mapping, Optional, Union
|
||||
from typing import Any, Iterator, Mapping, Optional, Sequence, Union
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from langchain.document_loaders.base import BaseBlobParser
|
||||
from langchain.document_loaders.blob_loaders import Blob
|
||||
@ -149,3 +150,97 @@ class PDFPlumberParser(BaseBlobParser):
|
||||
)
|
||||
for page in doc.pages
|
||||
]
|
||||
|
||||
|
||||
class AmazonTextractPDFParser(BaseBlobParser):
    """Sends PDF files to Amazon Textract and parses them to generate Documents.

    For parsing multi-page PDFs, they have to reside on S3.
    """

    def __init__(
        self,
        textract_features: Optional[Sequence[int]] = None,
        client: Optional[Any] = None,
    ) -> None:
        """Initializes the parser.

        Args:
            textract_features: Features to be used for extraction, each feature
                should be passed as an int that conforms to the enum
                `Textract_Features`, see `amazon-textract-caller` pkg
            client: boto3 textract client
        """
        try:
            import textractcaller as tc

            self.tc = tc
            # Map raw ints onto the textractcaller enum; an empty list means
            # plain text extraction with no extra Textract features.
            if textract_features is not None:
                self.textract_features = [
                    tc.Textract_Features(feature) for feature in textract_features
                ]
            else:
                self.textract_features = []
        except ImportError:
            raise ModuleNotFoundError(
                "Could not import amazon-textract-caller python package. "
                "Please install it with `pip install amazon-textract-caller`."
            )

        # Fall back to a default boto3 client when the caller did not supply one.
        if client:
            self.boto3_textract_client = client
        else:
            try:
                import boto3

                self.boto3_textract_client = boto3.client("textract")
            except ImportError:
                raise ModuleNotFoundError(
                    "Could not import boto3 python package. "
                    "Please install it with `pip install boto3`."
                )

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Iterates over the Blob pages and returns an Iterator with a Document
        for each page, like the other parsers If multi-page document, blob.path
        has to be set to the S3 URI and for single page docs the blob.data is taken
        """

        url_parse_result = urlparse(str(blob.path)) if blob.path else None
        # Either call with S3 path (multi-page) or with bytes (single-page)
        is_s3_document = bool(
            url_parse_result
            and url_parse_result.scheme == "s3"
            and url_parse_result.netloc
        )
        if is_s3_document:
            textract_response_json = self.tc.call_textract(
                input_document=str(blob.path),
                features=self.textract_features,
                boto3_textract_client=self.boto3_textract_client,
            )
        else:
            textract_response_json = self.tc.call_textract(
                input_document=blob.as_bytes(),
                features=self.textract_features,
                call_mode=self.tc.Textract_Call_Mode.FORCE_SYNC,
                boto3_textract_client=self.boto3_textract_client,
            )

        # Accumulate text per page; emit a Document each time the page number
        # advances, then flush the final page after the loop.
        current_text = ""
        current_page = 1
        for block in textract_response_json["Blocks"]:
            if "Page" in block and int(block["Page"]) != current_page:
                yield Document(
                    page_content=current_text,
                    metadata={"source": blob.source, "page": current_page},
                )
                current_text = ""
                current_page = int(block["Page"])
            if "Text" in block:
                current_text += block["Text"] + " "

        yield Document(
            page_content=current_text,
            metadata={"source": blob.source, "page": current_page},
        )
|
||||
|
@ -7,7 +7,7 @@ import time
|
||||
from abc import ABC
|
||||
from io import StringIO
|
||||
from pathlib import Path
|
||||
from typing import Any, Iterator, List, Mapping, Optional, Union
|
||||
from typing import Any, Iterator, List, Mapping, Optional, Sequence, Union
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import requests
|
||||
@ -16,6 +16,7 @@ from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
from langchain.document_loaders.blob_loaders import Blob
|
||||
from langchain.document_loaders.parsers.pdf import (
|
||||
AmazonTextractPDFParser,
|
||||
PDFMinerParser,
|
||||
PDFPlumberParser,
|
||||
PyMuPDFParser,
|
||||
@ -71,8 +72,14 @@ class BasePDFLoader(BaseLoader, ABC):
|
||||
if "~" in self.file_path:
|
||||
self.file_path = os.path.expanduser(self.file_path)
|
||||
|
||||
# If the file is a web path, download it to a temporary file, and use that
|
||||
# If the file is a web path or S3, download it to a temporary file, and use that
|
||||
if not os.path.isfile(self.file_path) and self._is_valid_url(self.file_path):
|
||||
self.temp_dir = tempfile.TemporaryDirectory()
|
||||
_, suffix = os.path.splitext(self.file_path)
|
||||
temp_pdf = os.path.join(self.temp_dir.name, f"tmp{suffix}")
|
||||
if self._is_s3_url(self.file_path):
|
||||
self.web_path = self.file_path
|
||||
else:
|
||||
r = requests.get(self.file_path)
|
||||
|
||||
if r.status_code != 200:
|
||||
@ -82,8 +89,6 @@ class BasePDFLoader(BaseLoader, ABC):
|
||||
)
|
||||
|
||||
self.web_path = self.file_path
|
||||
self.temp_dir = tempfile.TemporaryDirectory()
|
||||
temp_pdf = Path(self.temp_dir.name) / "tmp.pdf"
|
||||
with open(temp_pdf, mode="wb") as f:
|
||||
f.write(r.content)
|
||||
self.file_path = str(temp_pdf)
|
||||
@ -100,6 +105,17 @@ class BasePDFLoader(BaseLoader, ABC):
|
||||
parsed = urlparse(url)
|
||||
return bool(parsed.netloc) and bool(parsed.scheme)
|
||||
|
||||
@staticmethod
|
||||
def _is_s3_url(url: str) -> bool:
|
||||
"""check if the url is S3"""
|
||||
try:
|
||||
result = urlparse(url)
|
||||
if result.scheme == "s3" and result.netloc:
|
||||
return True
|
||||
return False
|
||||
except ValueError:
|
||||
return False
|
||||
|
||||
@property
|
||||
def source(self) -> str:
|
||||
return self.web_path if self.web_path is not None else self.file_path
|
||||
@ -440,3 +456,144 @@ class PDFPlumberLoader(BasePDFLoader):
|
||||
parser = PDFPlumberParser(text_kwargs=self.text_kwargs)
|
||||
blob = Blob.from_path(self.file_path)
|
||||
return parser.parse(blob)
|
||||
|
||||
|
||||
class AmazonTextractPDFLoader(BasePDFLoader):
    """Loads a PDF document from local file system, HTTP or S3.

    To authenticate, the AWS client uses the following methods to
    automatically load credentials:
    https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html

    If a specific credential profile should be used, you must pass
    the name of the profile from the ~/.aws/credentials file that is to be used.

    Make sure the credentials / roles used have the required policies to
    access the Amazon Textract service.

    Example:
        .. code-block:: python
            from langchain.document_loaders import AmazonTextractPDFLoader
            loader = AmazonTextractPDFLoader(
                file_path="s3://pdfs/myfile.pdf"
            )
            document = loader.load()
    """

    def __init__(
        self,
        file_path: str,
        textract_features: Optional[Sequence[str]] = None,
        client: Optional[Any] = None,
        credentials_profile_name: Optional[str] = None,
        region_name: Optional[str] = None,
        endpoint_url: Optional[str] = None,
    ) -> None:
        """Initialize the loader.

        Args:
            file_path: A file, url or s3 path for input file
            textract_features: Features to be used for extraction, each feature
                should be passed as a str that conforms to the enum
                `Textract_Features`, see `amazon-textract-caller` pkg
            client: boto3 textract client (Optional)
            credentials_profile_name: AWS profile name, if not default (Optional)
            region_name: AWS region, eg us-east-1 (Optional)
            endpoint_url: endpoint url for the textract service (Optional)

        Raises:
            ModuleNotFoundError: if amazon-textract-caller or boto3 are missing.
            ValueError: if AWS credentials cannot be loaded.
        """
        super().__init__(file_path)

        try:
            import textractcaller as tc  # noqa: F401
        except ImportError:
            raise ModuleNotFoundError(
                "Could not import amazon-textract-caller python package. "
                "Please install it with `pip install amazon-textract-caller`."
            )
        # Translate feature names ("FORMS", "TABLES", ...) into enum members.
        if textract_features:
            features = [tc.Textract_Features[x] for x in textract_features]
        else:
            features = []

        # Only build a session-backed client when the caller asked for specific
        # profile/region/endpoint settings; otherwise pass through the supplied
        # client (or None, letting the parser create a default one).
        if credentials_profile_name or region_name or endpoint_url:
            try:
                import boto3

                if credentials_profile_name is not None:
                    session = boto3.Session(profile_name=credentials_profile_name)
                else:
                    # use default credentials
                    session = boto3.Session()

                client_params = {}
                if region_name:
                    client_params["region_name"] = region_name
                if endpoint_url:
                    client_params["endpoint_url"] = endpoint_url

                client = session.client("textract", **client_params)

            except ImportError:
                raise ModuleNotFoundError(
                    "Could not import boto3 python package. "
                    "Please install it with `pip install boto3`."
                )
            except Exception as e:
                raise ValueError(
                    "Could not load credentials to authenticate with AWS client. "
                    "Please check that credentials in the specified "
                    "profile name are valid."
                ) from e
        self.parser = AmazonTextractPDFParser(textract_features=features, client=client)

    def load(self) -> List[Document]:
        """Load given path as pages."""
        return list(self.lazy_load())

    def lazy_load(
        self,
    ) -> Iterator[Document]:
        """Lazy load documents"""
        # the self.file_path is local, but the blob has to include
        # the S3 location if the file originated from S3 for multi-page documents
        # raises ValueError when multi-page and not on S3
        if self.web_path and self._is_s3_url(self.web_path):
            blob = Blob(path=self.web_path)
        else:
            blob = Blob.from_path(self.file_path)
            if AmazonTextractPDFLoader._get_number_of_pages(blob) > 1:
                raise ValueError(
                    f"the file {blob.path} is a multi-page document, \
                    but not stored on S3. \
                    Textract requires multi-page documents to be on S3."
                )

        yield from self.parser.parse(blob)

    @staticmethod
    def _get_number_of_pages(blob: Blob) -> int:
        """Return the number of pages in *blob* based on its mimetype.

        Raises:
            ModuleNotFoundError: if pypdf or Pillow are not installed.
            ValueError: for unsupported mime types.
        """
        try:
            import pypdf
            from PIL import Image, ImageSequence

        except ImportError:
            # BUG FIX: error message said "Pilloe" instead of "Pillow".
            raise ModuleNotFoundError(
                "Could not import pypdf or Pillow python package. "
                "Please install it with `pip install pypdf Pillow`."
            )
        if blob.mimetype == "application/pdf":
            with blob.as_bytes_io() as input_pdf_file:
                pdf_reader = pypdf.PdfReader(input_pdf_file)
                return len(pdf_reader.pages)
        elif blob.mimetype == "image/tiff":
            # BUG FIX: Image.open() requires a file-like object or path; the
            # original passed raw bytes (blob.as_bytes()), which raises at
            # runtime. Use the file-like accessor, as the PDF branch does, and
            # count frames while the stream is still open (PIL loads lazily).
            with blob.as_bytes_io() as input_img:
                img = Image.open(input_img)
                return sum(1 for _ in ImageSequence.Iterator(img))
        elif blob.mimetype in ["image/png", "image/jpeg"]:
            return 1
        else:
            raise ValueError(f"unsupported mime type: {blob.mimetype}")
|
||||
|
41
libs/langchain/poetry.lock
generated
41
libs/langchain/poetry.lock
generated
@ -338,6 +338,42 @@ files = [
|
||||
{file = "amadeus-8.1.0.tar.gz", hash = "sha256:df31e7c84383a85ee2dce95b11e7a0774fdf31762229f768519b5cb176bc167d"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "amazon-textract-caller"
|
||||
version = "0.0.29"
|
||||
description = "Amazon Textract Caller tools"
|
||||
category = "main"
|
||||
optional = true
|
||||
python-versions = ">=3.6"
|
||||
files = [
|
||||
{file = "amazon-textract-caller-0.0.29.tar.gz", hash = "sha256:53770d82db67d4984a99825a90908a319f8920e64d6d48a45456b18d6ab3771a"},
|
||||
{file = "amazon_textract_caller-0.0.29-py2.py3-none-any.whl", hash = "sha256:c5898fc7e84eea2564a9ececcf9101778b7533fa58e2c8e6eb1daa48869788fc"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
amazon-textract-response-parser = ">=0.1.39"
|
||||
boto3 = ">=1.26.35"
|
||||
botocore = "*"
|
||||
|
||||
[package.extras]
|
||||
testing = ["amazon-textract-response-parser", "pytest"]
|
||||
|
||||
[[package]]
|
||||
name = "amazon-textract-response-parser"
|
||||
version = "1.0.0"
|
||||
description = "Easily parse JSON returned by Amazon Textract."
|
||||
category = "main"
|
||||
optional = true
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "amazon-textract-response-parser-1.0.0.tar.gz", hash = "sha256:52e94e002b714195d678ea83b99ebc11d68ea716c9371852aed03a10e385dd41"},
|
||||
{file = "amazon_textract_response_parser-1.0.0-py2.py3-none-any.whl", hash = "sha256:668ffb4604ed365de9c60d6a77ca9190c2614679997edfba0ce7398e2579c574"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
boto3 = "*"
|
||||
marshmallow = ">=3.14,<4"
|
||||
|
||||
[[package]]
|
||||
name = "anthropic"
|
||||
version = "0.3.2"
|
||||
@ -4702,6 +4738,7 @@ optional = false
|
||||
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, !=3.6.*"
|
||||
files = [
|
||||
{file = "jsonpointer-2.4-py2.py3-none-any.whl", hash = "sha256:15d51bba20eea3165644553647711d150376234112651b4f1811022aecad7d7a"},
|
||||
{file = "jsonpointer-2.4.tar.gz", hash = "sha256:585cee82b70211fa9e6043b7bb89db6e1aa49524340dde8ad6b63206ea689d88"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -13539,7 +13576,7 @@ clarifai = ["clarifai"]
|
||||
cohere = ["cohere"]
|
||||
docarray = ["docarray"]
|
||||
embeddings = ["sentence-transformers"]
|
||||
extended-testing = ["atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "esprima", "feedparser", "geopandas", "gitpython", "gql", "html2text", "jinja2", "jq", "lxml", "mwparserfromhell", "mwxml", "newspaper3k", "openai", "openai", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "requests-toolbelt", "scikit-learn", "streamlit", "sympy", "telethon", "tqdm", "xinference", "zep-python"]
|
||||
extended-testing = ["amazon-textract-caller", "atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "esprima", "feedparser", "geopandas", "gitpython", "gql", "html2text", "jinja2", "jq", "lxml", "mwparserfromhell", "mwxml", "newspaper3k", "openai", "openai", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "requests-toolbelt", "scikit-learn", "streamlit", "sympy", "telethon", "tqdm", "xinference", "zep-python"]
|
||||
javascript = ["esprima"]
|
||||
llms = ["anthropic", "clarifai", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "openllm", "openlm", "torch", "transformers", "xinference"]
|
||||
openai = ["openai", "tiktoken"]
|
||||
@ -13549,4 +13586,4 @@ text-helpers = ["chardet"]
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = ">=3.8.1,<4.0"
|
||||
content-hash = "0708c3b45f59eea36919ff9ff99fa6eddc81bccb654cce183641ef8396ea5290"
|
||||
content-hash = "39305f23d3d69179d247d643631133ac50f5e944d98518c8a56c5f839b8e7a04"
|
||||
|
@ -130,6 +130,7 @@ gitpython = {version = "^3.1.32", optional = true}
|
||||
librosa = {version="^0.10.0.post2", optional = true }
|
||||
feedparser = {version = "^6.0.10", optional = true}
|
||||
newspaper3k = {version = "^0.2.8", optional = true}
|
||||
amazon-textract-caller = {version = "<2", optional = true}
|
||||
|
||||
[tool.poetry.group.test.dependencies]
|
||||
# The only dependencies that should be added are
|
||||
@ -329,6 +330,7 @@ all = [
|
||||
# Please use new-line on formatting to make it easier to add new packages without
|
||||
# merge-conflicts
|
||||
extended_testing = [
|
||||
"amazon-textract-caller",
|
||||
"beautifulsoup4",
|
||||
"bibtexparser",
|
||||
"cassio",
|
||||
|
@ -1,6 +1,10 @@
|
||||
from pathlib import Path
|
||||
from typing import Sequence, Union
|
||||
|
||||
import pytest
|
||||
|
||||
from langchain.document_loaders import (
|
||||
AmazonTextractPDFLoader,
|
||||
MathpixPDFLoader,
|
||||
PDFMinerLoader,
|
||||
PDFMinerPDFasHTMLLoader,
|
||||
@ -136,3 +140,56 @@ def test_mathpix_loader() -> None:
|
||||
docs = loader.load()
|
||||
assert len(docs) == 1
|
||||
print(docs[0].page_content)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "file_path, features, docs_length, create_client",
    [
        (
            (
                "https://amazon-textract-public-content.s3.us-east-2.amazonaws.com"
                "/langchain/alejandro_rosalez_sample_1.jpg"
            ),
            ["FORMS", "TABLES"],
            1,
            False,
        ),
        (str(Path(__file__).parent.parent / "examples/hello.pdf"), ["FORMS"], 1, False),
        (
            "s3://amazon-textract-public-content/langchain/layout-parser-paper.pdf",
            None,
            16,
            True,
        ),
    ],
)
@pytest.mark.skip(reason="Requires AWS credentials to run")
def test_amazontextract_loader(
    file_path: str,
    features: Union[Sequence[str], None],
    docs_length: int,
    create_client: bool,
) -> None:
    """Exercise the loader over HTTP, local, and S3 inputs against Textract."""
    if not create_client:
        loader = AmazonTextractPDFLoader(file_path, textract_features=features)
    else:
        # Also cover the explicit-client code path.
        import boto3

        textract_client = boto3.client("textract", region_name="us-east-2")
        loader = AmazonTextractPDFLoader(
            file_path, textract_features=features, client=textract_client
        )
    docs = loader.load()

    assert len(docs) == docs_length
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="Requires AWS credentials to run")
def test_amazontextract_loader_failures() -> None:
    """A multi-page local file (not on S3) must be rejected with ValueError."""
    # 2-page PDF local file system
    two_page_pdf = Path(__file__).parent.parent / "examples/multi-page-forms-sample-2-page.pdf"
    loader = AmazonTextractPDFLoader(str(two_page_pdf))
    with pytest.raises(ValueError):
        loader.load()
|
||||
|
Binary file not shown.
Loading…
Reference in New Issue
Block a user