mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-21 06:14:37 +00:00
Amazon Textract as document loader (#8661)
Description: Adding support for [Amazon Textract](https://aws.amazon.com/textract/) as a PDF document loader --------- Co-authored-by: schadem <45048633+schadem@users.noreply.github.com> Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
parent
82ef1f587d
commit
8374367de2
@ -108,6 +108,7 @@ from langchain.document_loaders.onedrive_file import OneDriveFileLoader
|
|||||||
from langchain.document_loaders.open_city_data import OpenCityDataLoader
|
from langchain.document_loaders.open_city_data import OpenCityDataLoader
|
||||||
from langchain.document_loaders.org_mode import UnstructuredOrgModeLoader
|
from langchain.document_loaders.org_mode import UnstructuredOrgModeLoader
|
||||||
from langchain.document_loaders.pdf import (
|
from langchain.document_loaders.pdf import (
|
||||||
|
AmazonTextractPDFLoader,
|
||||||
MathpixPDFLoader,
|
MathpixPDFLoader,
|
||||||
OnlinePDFLoader,
|
OnlinePDFLoader,
|
||||||
PDFMinerLoader,
|
PDFMinerLoader,
|
||||||
@ -330,4 +331,5 @@ __all__ = [
|
|||||||
"YoutubeAudioLoader",
|
"YoutubeAudioLoader",
|
||||||
"YoutubeLoader",
|
"YoutubeLoader",
|
||||||
"ConcurrentLoader",
|
"ConcurrentLoader",
|
||||||
|
"AmazonTextractPDFLoader",
|
||||||
]
|
]
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
"""Module contains common parsers for PDFs."""
|
"""Module contains common parsers for PDFs."""
|
||||||
from typing import Any, Iterator, Mapping, Optional, Union
|
from typing import Any, Iterator, Mapping, Optional, Sequence, Union
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
from langchain.document_loaders.base import BaseBlobParser
|
from langchain.document_loaders.base import BaseBlobParser
|
||||||
from langchain.document_loaders.blob_loaders import Blob
|
from langchain.document_loaders.blob_loaders import Blob
|
||||||
@ -149,3 +150,97 @@ class PDFPlumberParser(BaseBlobParser):
|
|||||||
)
|
)
|
||||||
for page in doc.pages
|
for page in doc.pages
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
class AmazonTextractPDFParser(BaseBlobParser):
    """Sends PDF files to Amazon Textract and parses them to generate Documents.

    One Document is produced per page of the Textract response.
    For parsing multi-page PDFs, they have to reside on S3.
    """

    def __init__(
        self,
        textract_features: Optional[Sequence[int]] = None,
        client: Optional[Any] = None,
    ) -> None:
        """Initializes the parser.

        Args:
            textract_features: Features to be used for extraction, each feature
                               should be passed as an int that conforms to the enum
                               `Textract_Features`, see `amazon-textract-caller` pkg
            client: boto3 textract client. When omitted, one is created with
                    `boto3.client("textract")`.

        Raises:
            ModuleNotFoundError: if `amazon-textract-caller` is not installed, or
                if no client was passed and `boto3` is not installed.
        """
        # Keep the try body limited to the import so that only a missing
        # package is reported as "could not import".
        try:
            import textractcaller as tc
        except ImportError as e:
            raise ModuleNotFoundError(
                "Could not import amazon-textract-caller python package. "
                "Please install it with `pip install amazon-textract-caller`."
            ) from e

        # The module is kept on the instance because lazy_parse needs
        # `call_textract` and `Textract_Call_Mode` from it.
        self.tc = tc
        if textract_features is not None:
            self.textract_features = [
                tc.Textract_Features(f) for f in textract_features
            ]
        else:
            self.textract_features = []

        if not client:
            try:
                import boto3
            except ImportError as e:
                raise ModuleNotFoundError(
                    "Could not import boto3 python package. "
                    "Please install it with `pip install boto3`."
                ) from e
            self.boto3_textract_client = boto3.client("textract")
        else:
            self.boto3_textract_client = client

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Iterates over the Blob pages and returns an Iterator with a Document
        for each page, like the other parsers.

        If multi-page document, blob.path has to be set to the S3 URI and for
        single page docs the blob.data is taken.

        Args:
            blob: the document to send to Textract.

        Yields:
            One Document per page, with `source` and `page` in its metadata.
        """
        url_parse_result = urlparse(str(blob.path)) if blob.path else None
        is_s3_path = bool(
            url_parse_result
            and url_parse_result.scheme == "s3"
            and url_parse_result.netloc
        )

        # Either call with S3 path (multi-page) or with bytes (single-page)
        if is_s3_path:
            textract_response_json = self.tc.call_textract(
                input_document=str(blob.path),
                features=self.textract_features,
                boto3_textract_client=self.boto3_textract_client,
            )
        else:
            textract_response_json = self.tc.call_textract(
                input_document=blob.as_bytes(),
                features=self.textract_features,
                call_mode=self.tc.Textract_Call_Mode.FORCE_SYNC,
                boto3_textract_client=self.boto3_textract_client,
            )

        page_chunks = []
        current_page = 1
        for block in textract_response_json["Blocks"]:
            # A block that reports a different page number closes the current page.
            if "Page" in block and int(block["Page"]) != current_page:
                yield Document(
                    page_content="".join(page_chunks),
                    metadata={"source": blob.source, "page": current_page},
                )
                page_chunks = []
                current_page = int(block["Page"])
            # Textract reports recognized text on LINE blocks and again on the
            # WORD blocks that make up each line; skipping WORD blocks keeps the
            # page text from containing every word twice.
            if "Text" in block and block.get("BlockType") != "WORD":
                page_chunks.append(block["Text"] + " ")

        # Emit whatever was collected for the last (or only) page.
        yield Document(
            page_content="".join(page_chunks),
            metadata={"source": blob.source, "page": current_page},
        )
|
||||||
|
@ -7,7 +7,7 @@ import time
|
|||||||
from abc import ABC
|
from abc import ABC
|
||||||
from io import StringIO
|
from io import StringIO
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Iterator, List, Mapping, Optional, Union
|
from typing import Any, Iterator, List, Mapping, Optional, Sequence, Union
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
@ -16,6 +16,7 @@ from langchain.docstore.document import Document
|
|||||||
from langchain.document_loaders.base import BaseLoader
|
from langchain.document_loaders.base import BaseLoader
|
||||||
from langchain.document_loaders.blob_loaders import Blob
|
from langchain.document_loaders.blob_loaders import Blob
|
||||||
from langchain.document_loaders.parsers.pdf import (
|
from langchain.document_loaders.parsers.pdf import (
|
||||||
|
AmazonTextractPDFParser,
|
||||||
PDFMinerParser,
|
PDFMinerParser,
|
||||||
PDFPlumberParser,
|
PDFPlumberParser,
|
||||||
PyMuPDFParser,
|
PyMuPDFParser,
|
||||||
@ -71,22 +72,26 @@ class BasePDFLoader(BaseLoader, ABC):
|
|||||||
if "~" in self.file_path:
|
if "~" in self.file_path:
|
||||||
self.file_path = os.path.expanduser(self.file_path)
|
self.file_path = os.path.expanduser(self.file_path)
|
||||||
|
|
||||||
# If the file is a web path, download it to a temporary file, and use that
|
# If the file is a web path or S3, download it to a temporary file, and use that
|
||||||
if not os.path.isfile(self.file_path) and self._is_valid_url(self.file_path):
|
if not os.path.isfile(self.file_path) and self._is_valid_url(self.file_path):
|
||||||
r = requests.get(self.file_path)
|
|
||||||
|
|
||||||
if r.status_code != 200:
|
|
||||||
raise ValueError(
|
|
||||||
"Check the url of your file; returned status code %s"
|
|
||||||
% r.status_code
|
|
||||||
)
|
|
||||||
|
|
||||||
self.web_path = self.file_path
|
|
||||||
self.temp_dir = tempfile.TemporaryDirectory()
|
self.temp_dir = tempfile.TemporaryDirectory()
|
||||||
temp_pdf = Path(self.temp_dir.name) / "tmp.pdf"
|
_, suffix = os.path.splitext(self.file_path)
|
||||||
with open(temp_pdf, mode="wb") as f:
|
temp_pdf = os.path.join(self.temp_dir.name, f"tmp{suffix}")
|
||||||
f.write(r.content)
|
if self._is_s3_url(self.file_path):
|
||||||
self.file_path = str(temp_pdf)
|
self.web_path = self.file_path
|
||||||
|
else:
|
||||||
|
r = requests.get(self.file_path)
|
||||||
|
|
||||||
|
if r.status_code != 200:
|
||||||
|
raise ValueError(
|
||||||
|
"Check the url of your file; returned status code %s"
|
||||||
|
% r.status_code
|
||||||
|
)
|
||||||
|
|
||||||
|
self.web_path = self.file_path
|
||||||
|
with open(temp_pdf, mode="wb") as f:
|
||||||
|
f.write(r.content)
|
||||||
|
self.file_path = str(temp_pdf)
|
||||||
elif not os.path.isfile(self.file_path):
|
elif not os.path.isfile(self.file_path):
|
||||||
raise ValueError("File path %s is not a valid file or url" % self.file_path)
|
raise ValueError("File path %s is not a valid file or url" % self.file_path)
|
||||||
|
|
||||||
@ -100,6 +105,17 @@ class BasePDFLoader(BaseLoader, ABC):
|
|||||||
parsed = urlparse(url)
|
parsed = urlparse(url)
|
||||||
return bool(parsed.netloc) and bool(parsed.scheme)
|
return bool(parsed.netloc) and bool(parsed.scheme)
|
||||||
|
|
||||||
|
@staticmethod
def _is_s3_url(url: str) -> bool:
    """Report whether ``url`` uses the ``s3`` scheme and has a network location.

    A string that ``urlparse`` rejects with ``ValueError`` is not an S3 URL.
    """
    try:
        parts = urlparse(url)
    except ValueError:
        return False
    return bool(parts.scheme == "s3" and parts.netloc)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def source(self) -> str:
|
def source(self) -> str:
|
||||||
return self.web_path if self.web_path is not None else self.file_path
|
return self.web_path if self.web_path is not None else self.file_path
|
||||||
@ -440,3 +456,144 @@ class PDFPlumberLoader(BasePDFLoader):
|
|||||||
parser = PDFPlumberParser(text_kwargs=self.text_kwargs)
|
parser = PDFPlumberParser(text_kwargs=self.text_kwargs)
|
||||||
blob = Blob.from_path(self.file_path)
|
blob = Blob.from_path(self.file_path)
|
||||||
return parser.parse(blob)
|
return parser.parse(blob)
|
||||||
|
|
||||||
|
|
||||||
|
class AmazonTextractPDFLoader(BasePDFLoader):
    """Loads a PDF document from local file system, HTTP or S3.

    To authenticate, the AWS client uses the following methods to
    automatically load credentials:
    https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html

    If a specific credential profile should be used, you must pass
    the name of the profile from the ~/.aws/credentials file that is to be used.

    Make sure the credentials / roles used have the required policies to
    access the Amazon Textract service.

    Example:
        .. code-block:: python
            from langchain.document_loaders import AmazonTextractPDFLoader
            loader = AmazonTextractPDFLoader(
                file_path="s3://pdfs/myfile.pdf"
            )
            document = loader.load()
    """

    def __init__(
        self,
        file_path: str,
        textract_features: Optional[Sequence[str]] = None,
        client: Optional[Any] = None,
        credentials_profile_name: Optional[str] = None,
        region_name: Optional[str] = None,
        endpoint_url: Optional[str] = None,
    ) -> None:
        """Initialize the loader.

        Args:
            file_path: A file, url or s3 path for input file
            textract_features: Features to be used for extraction, each feature
                               should be passed as a str that conforms to the enum
                               `Textract_Features`, see `amazon-textract-caller` pkg
            client: boto3 textract client (Optional)
            credentials_profile_name: AWS profile name, if not default (Optional)
            region_name: AWS region, eg us-east-1 (Optional)
            endpoint_url: endpoint url for the textract service (Optional)

        Raises:
            ModuleNotFoundError: if `amazon-textract-caller` is not installed, or
                if a client has to be built and `boto3` is not installed.
            ValueError: if building the boto3 session or client fails.
        """
        super().__init__(file_path)

        try:
            import textractcaller as tc  # noqa: F401
        except ImportError as e:
            raise ModuleNotFoundError(
                "Could not import amazon-textract-caller python package. "
                "Please install it with `pip install amazon-textract-caller`."
            ) from e

        # Feature names are looked up by member name on the enum.
        if textract_features:
            features = [tc.Textract_Features[x] for x in textract_features]
        else:
            features = []

        # NOTE: when any of these three settings is given, a new client is built
        # from them and replaces the `client` argument.
        if credentials_profile_name or region_name or endpoint_url:
            try:
                import boto3

                if credentials_profile_name is not None:
                    session = boto3.Session(profile_name=credentials_profile_name)
                else:
                    # use default credentials
                    session = boto3.Session()

                client_params = {}
                if region_name:
                    client_params["region_name"] = region_name
                if endpoint_url:
                    client_params["endpoint_url"] = endpoint_url

                client = session.client("textract", **client_params)

            except ImportError as e:
                raise ModuleNotFoundError(
                    "Could not import boto3 python package. "
                    "Please install it with `pip install boto3`."
                ) from e
            except Exception as e:
                raise ValueError(
                    "Could not load credentials to authenticate with AWS client. "
                    "Please check that credentials in the specified "
                    "profile name are valid."
                ) from e
        self.parser = AmazonTextractPDFParser(textract_features=features, client=client)

    def load(self) -> List[Document]:
        """Load given path as pages."""
        return list(self.lazy_load())

    def lazy_load(
        self,
    ) -> Iterator[Document]:
        """Lazy load documents.

        Yields:
            The Documents produced by the Textract parser for this file.

        Raises:
            ValueError: when the file is a multi-page document that is not
                stored on S3.
        """
        # the self.file_path is local, but the blob has to include
        # the S3 location if the file originated from S3 for multi-page documents
        if self.web_path and self._is_s3_url(self.web_path):
            blob = Blob(path=self.web_path)
        else:
            blob = Blob.from_path(self.file_path)
            if AmazonTextractPDFLoader._get_number_of_pages(blob) > 1:
                # Implicit string concatenation keeps source indentation out of
                # the message (backslash continuations inside a literal do not).
                raise ValueError(
                    f"the file {blob.path} is a multi-page document, "
                    "but not stored on S3. "
                    "Textract requires multi-page documents to be on S3."
                )

        # lazy_parse streams the pages instead of building the full list first.
        yield from self.parser.lazy_parse(blob)

    @staticmethod
    def _get_number_of_pages(blob: Blob) -> int:
        """Return the number of pages (or image frames) contained in the blob.

        Args:
            blob: a PDF, TIFF, PNG or JPEG blob.

        Raises:
            ModuleNotFoundError: if `pypdf` or `Pillow` is not installed.
            ValueError: if the blob's mime type is not one of the supported ones.
        """
        try:
            import pypdf
            from PIL import Image, ImageSequence

        except ImportError as e:
            raise ModuleNotFoundError(
                "Could not import pypdf or Pillow python package. "
                "Please install it with `pip install pypdf Pillow`."
            ) from e
        if blob.mimetype == "application/pdf":
            with blob.as_bytes_io() as input_pdf_file:
                pdf_reader = pypdf.PdfReader(input_pdf_file)
                return len(pdf_reader.pages)
        elif blob.mimetype == "image/tiff":
            # Image.open expects a file name or a file object, not raw bytes,
            # so read the image through the blob's byte stream.
            with blob.as_bytes_io() as input_tiff_file:
                img = Image.open(input_tiff_file)
                return sum(1 for _ in ImageSequence.Iterator(img))
        elif blob.mimetype in ["image/png", "image/jpeg"]:
            return 1
        else:
            raise ValueError(f"unsupported mime type: {blob.mimetype}")
|
||||||
|
41
libs/langchain/poetry.lock
generated
41
libs/langchain/poetry.lock
generated
@ -338,6 +338,42 @@ files = [
|
|||||||
{file = "amadeus-8.1.0.tar.gz", hash = "sha256:df31e7c84383a85ee2dce95b11e7a0774fdf31762229f768519b5cb176bc167d"},
|
{file = "amadeus-8.1.0.tar.gz", hash = "sha256:df31e7c84383a85ee2dce95b11e7a0774fdf31762229f768519b5cb176bc167d"},
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "amazon-textract-caller"
|
||||||
|
version = "0.0.29"
|
||||||
|
description = "Amazon Textract Caller tools"
|
||||||
|
category = "main"
|
||||||
|
optional = true
|
||||||
|
python-versions = ">=3.6"
|
||||||
|
files = [
|
||||||
|
{file = "amazon-textract-caller-0.0.29.tar.gz", hash = "sha256:53770d82db67d4984a99825a90908a319f8920e64d6d48a45456b18d6ab3771a"},
|
||||||
|
{file = "amazon_textract_caller-0.0.29-py2.py3-none-any.whl", hash = "sha256:c5898fc7e84eea2564a9ececcf9101778b7533fa58e2c8e6eb1daa48869788fc"},
|
||||||
|
]
|
||||||
|
|
||||||
|
[package.dependencies]
|
||||||
|
amazon-textract-response-parser = ">=0.1.39"
|
||||||
|
boto3 = ">=1.26.35"
|
||||||
|
botocore = "*"
|
||||||
|
|
||||||
|
[package.extras]
|
||||||
|
testing = ["amazon-textract-response-parser", "pytest"]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "amazon-textract-response-parser"
|
||||||
|
version = "1.0.0"
|
||||||
|
description = "Easily parse JSON returned by Amazon Textract."
|
||||||
|
category = "main"
|
||||||
|
optional = true
|
||||||
|
python-versions = ">=3.8"
|
||||||
|
files = [
|
||||||
|
{file = "amazon-textract-response-parser-1.0.0.tar.gz", hash = "sha256:52e94e002b714195d678ea83b99ebc11d68ea716c9371852aed03a10e385dd41"},
|
||||||
|
{file = "amazon_textract_response_parser-1.0.0-py2.py3-none-any.whl", hash = "sha256:668ffb4604ed365de9c60d6a77ca9190c2614679997edfba0ce7398e2579c574"},
|
||||||
|
]
|
||||||
|
|
||||||
|
[package.dependencies]
|
||||||
|
boto3 = "*"
|
||||||
|
marshmallow = ">=3.14,<4"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "anthropic"
|
name = "anthropic"
|
||||||
version = "0.3.2"
|
version = "0.3.2"
|
||||||
@ -4702,6 +4738,7 @@ optional = false
|
|||||||
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, !=3.6.*"
|
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, !=3.6.*"
|
||||||
files = [
|
files = [
|
||||||
{file = "jsonpointer-2.4-py2.py3-none-any.whl", hash = "sha256:15d51bba20eea3165644553647711d150376234112651b4f1811022aecad7d7a"},
|
{file = "jsonpointer-2.4-py2.py3-none-any.whl", hash = "sha256:15d51bba20eea3165644553647711d150376234112651b4f1811022aecad7d7a"},
|
||||||
|
{file = "jsonpointer-2.4.tar.gz", hash = "sha256:585cee82b70211fa9e6043b7bb89db6e1aa49524340dde8ad6b63206ea689d88"},
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@ -13539,7 +13576,7 @@ clarifai = ["clarifai"]
|
|||||||
cohere = ["cohere"]
|
cohere = ["cohere"]
|
||||||
docarray = ["docarray"]
|
docarray = ["docarray"]
|
||||||
embeddings = ["sentence-transformers"]
|
embeddings = ["sentence-transformers"]
|
||||||
extended-testing = ["atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "esprima", "feedparser", "geopandas", "gitpython", "gql", "html2text", "jinja2", "jq", "lxml", "mwparserfromhell", "mwxml", "newspaper3k", "openai", "openai", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "requests-toolbelt", "scikit-learn", "streamlit", "sympy", "telethon", "tqdm", "xinference", "zep-python"]
|
extended-testing = ["amazon-textract-caller", "atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "esprima", "feedparser", "geopandas", "gitpython", "gql", "html2text", "jinja2", "jq", "lxml", "mwparserfromhell", "mwxml", "newspaper3k", "openai", "openai", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "requests-toolbelt", "scikit-learn", "streamlit", "sympy", "telethon", "tqdm", "xinference", "zep-python"]
|
||||||
javascript = ["esprima"]
|
javascript = ["esprima"]
|
||||||
llms = ["anthropic", "clarifai", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "openllm", "openlm", "torch", "transformers", "xinference"]
|
llms = ["anthropic", "clarifai", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "openllm", "openlm", "torch", "transformers", "xinference"]
|
||||||
openai = ["openai", "tiktoken"]
|
openai = ["openai", "tiktoken"]
|
||||||
@ -13549,4 +13586,4 @@ text-helpers = ["chardet"]
|
|||||||
[metadata]
|
[metadata]
|
||||||
lock-version = "2.0"
|
lock-version = "2.0"
|
||||||
python-versions = ">=3.8.1,<4.0"
|
python-versions = ">=3.8.1,<4.0"
|
||||||
content-hash = "0708c3b45f59eea36919ff9ff99fa6eddc81bccb654cce183641ef8396ea5290"
|
content-hash = "39305f23d3d69179d247d643631133ac50f5e944d98518c8a56c5f839b8e7a04"
|
||||||
|
@ -130,6 +130,7 @@ gitpython = {version = "^3.1.32", optional = true}
|
|||||||
librosa = {version="^0.10.0.post2", optional = true }
|
librosa = {version="^0.10.0.post2", optional = true }
|
||||||
feedparser = {version = "^6.0.10", optional = true}
|
feedparser = {version = "^6.0.10", optional = true}
|
||||||
newspaper3k = {version = "^0.2.8", optional = true}
|
newspaper3k = {version = "^0.2.8", optional = true}
|
||||||
|
amazon-textract-caller = {version = "<2", optional = true}
|
||||||
|
|
||||||
[tool.poetry.group.test.dependencies]
|
[tool.poetry.group.test.dependencies]
|
||||||
# The only dependencies that should be added are
|
# The only dependencies that should be added are
|
||||||
@ -329,6 +330,7 @@ all = [
|
|||||||
# Please use new-line on formatting to make it easier to add new packages without
|
# Please use new-line on formatting to make it easier to add new packages without
|
||||||
# merge-conflicts
|
# merge-conflicts
|
||||||
extended_testing = [
|
extended_testing = [
|
||||||
|
"amazon-textract-caller",
|
||||||
"beautifulsoup4",
|
"beautifulsoup4",
|
||||||
"bibtexparser",
|
"bibtexparser",
|
||||||
"cassio",
|
"cassio",
|
||||||
|
@ -1,6 +1,10 @@
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from typing import Sequence, Union
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
from langchain.document_loaders import (
|
from langchain.document_loaders import (
|
||||||
|
AmazonTextractPDFLoader,
|
||||||
MathpixPDFLoader,
|
MathpixPDFLoader,
|
||||||
PDFMinerLoader,
|
PDFMinerLoader,
|
||||||
PDFMinerPDFasHTMLLoader,
|
PDFMinerPDFasHTMLLoader,
|
||||||
@ -136,3 +140,56 @@ def test_mathpix_loader() -> None:
|
|||||||
docs = loader.load()
|
docs = loader.load()
|
||||||
assert len(docs) == 1
|
assert len(docs) == 1
|
||||||
print(docs[0].page_content)
|
print(docs[0].page_content)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    "file_path, features, docs_length, create_client",
    [
        (
            (
                "https://amazon-textract-public-content.s3.us-east-2.amazonaws.com"
                "/langchain/alejandro_rosalez_sample_1.jpg"
            ),
            ["FORMS", "TABLES"],
            1,
            False,
        ),
        (str(Path(__file__).parent.parent / "examples/hello.pdf"), ["FORMS"], 1, False),
        (
            "s3://amazon-textract-public-content/langchain/layout-parser-paper.pdf",
            None,
            16,
            True,
        ),
    ],
)
@pytest.mark.skip(reason="Requires AWS credentials to run")
def test_amazontextract_loader(
    file_path: str,
    features: Union[Sequence[str], None],
    docs_length: int,
    create_client: bool,
) -> None:
    """Load each sample input and check how many Documents come back."""
    # An explicit Textract client is only passed for the cases that ask for one;
    # otherwise the loader is left to create its own.
    extra_kwargs = {}
    if create_client:
        import boto3

        extra_kwargs["client"] = boto3.client("textract", region_name="us-east-2")

    loader = AmazonTextractPDFLoader(
        file_path, textract_features=features, **extra_kwargs
    )
    loaded = loader.load()

    assert len(loaded) == docs_length
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skip(reason="Requires AWS credentials to run")
def test_amazontextract_loader_failures() -> None:
    """A local two-page PDF must make ``load`` raise ``ValueError``."""
    tests_root = Path(__file__).parent.parent
    sample_path = tests_root / "examples/multi-page-forms-sample-2-page.pdf"
    # The loader is built outside the `raises` block so that only `load()`
    # is expected to fail.
    loader = AmazonTextractPDFLoader(str(sample_path))
    with pytest.raises(ValueError):
        loader.load()
|
||||||
|
Binary file not shown.
Loading…
Reference in New Issue
Block a user