diff --git a/libs/langchain/langchain/document_loaders/__init__.py b/libs/langchain/langchain/document_loaders/__init__.py
index af974bc4089..ccc59d3f0e9 100644
--- a/libs/langchain/langchain/document_loaders/__init__.py
+++ b/libs/langchain/langchain/document_loaders/__init__.py
@@ -108,6 +108,7 @@ from langchain.document_loaders.onedrive_file import OneDriveFileLoader
 from langchain.document_loaders.open_city_data import OpenCityDataLoader
 from langchain.document_loaders.org_mode import UnstructuredOrgModeLoader
 from langchain.document_loaders.pdf import (
+    AmazonTextractPDFLoader,
     MathpixPDFLoader,
     OnlinePDFLoader,
     PDFMinerLoader,
@@ -330,4 +331,5 @@ __all__ = [
     "YoutubeAudioLoader",
     "YoutubeLoader",
     "ConcurrentLoader",
+    "AmazonTextractPDFLoader",
 ]
diff --git a/libs/langchain/langchain/document_loaders/parsers/pdf.py b/libs/langchain/langchain/document_loaders/parsers/pdf.py
index 464a928e2b0..8298253f6c6 100644
--- a/libs/langchain/langchain/document_loaders/parsers/pdf.py
+++ b/libs/langchain/langchain/document_loaders/parsers/pdf.py
@@ -1,5 +1,6 @@
 """Module contains common parsers for PDFs."""
-from typing import Any, Iterator, Mapping, Optional, Union
+from typing import Any, Iterator, Mapping, Optional, Sequence, Union
+from urllib.parse import urlparse
 
 from langchain.document_loaders.base import BaseBlobParser
 from langchain.document_loaders.blob_loaders import Blob
@@ -149,3 +150,97 @@ class PDFPlumberParser(BaseBlobParser):
             )
             for page in doc.pages
         ]
+
+
+class AmazonTextractPDFParser(BaseBlobParser):
+    """Sends PDF files to Amazon Textract and parses them to generate Documents.
+
+    For parsing multi-page PDFs, they have to reside on S3.
+    """
+
+    def __init__(
+        self,
+        textract_features: Optional[Sequence[int]] = None,
+        client: Optional[Any] = None,
+    ) -> None:
+        """Initializes the parser.
+
+        Args:
+            textract_features: Features to be used for extraction; each feature
+                should be passed as an int that conforms to the enum
+                `Textract_Features` from the `amazon-textract-caller` package
+            client: boto3 textract client
+        """
+
+        try:
+            import textractcaller as tc
+
+            self.tc = tc
+            if textract_features is not None:
+                self.textract_features = [
+                    tc.Textract_Features(f) for f in textract_features
+                ]
+            else:
+                self.textract_features = []
+        except ImportError:
+            raise ModuleNotFoundError(
+                "Could not import amazon-textract-caller python package. "
+                "Please install it with `pip install amazon-textract-caller`."
+            )
+
+        if not client:
+            try:
+                import boto3
+
+                self.boto3_textract_client = boto3.client("textract")
+            except ImportError:
+                raise ModuleNotFoundError(
+                    "Could not import boto3 python package. "
+                    "Please install it with `pip install boto3`."
+                )
+        else:
+            self.boto3_textract_client = client
+
+    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
+        """Iterates over the Blob pages and returns an Iterator with a Document
+        for each page, like the other parsers. For multi-page documents, blob.path
+        has to be set to the S3 URI; for single-page documents, blob.data is used.
+        """
+
+        url_parse_result = urlparse(str(blob.path)) if blob.path else None
+        # Either call with S3 path (multi-page) or with bytes (single-page)
+        if (
+            url_parse_result
+            and url_parse_result.scheme == "s3"
+            and url_parse_result.netloc
+        ):
+            textract_response_json = self.tc.call_textract(
+                input_document=str(blob.path),
+                features=self.textract_features,
+                boto3_textract_client=self.boto3_textract_client,
+            )
+        else:
+            textract_response_json = self.tc.call_textract(
+                input_document=blob.as_bytes(),
+                features=self.textract_features,
+                call_mode=self.tc.Textract_Call_Mode.FORCE_SYNC,
+                boto3_textract_client=self.boto3_textract_client,
+            )
+
+        current_text = ""
+        current_page = 1
+        for block in textract_response_json["Blocks"]:
+            if "Page" in block and int(block["Page"]) != current_page:
+                yield Document(
+                    page_content=current_text,
+                    metadata={"source": blob.source, "page": current_page},
+                )
+                current_text = ""
+                current_page = int(block["Page"])
+            if "Text" in block:
+                current_text += block["Text"] + " "
+
+        yield Document(
+            page_content=current_text,
+            metadata={"source": blob.source, "page": current_page},
+        )
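For reviewers who want to exercise the new parser in isolation, here is a minimal sketch of driving `AmazonTextractPDFParser` directly with a `Blob`. The S3 bucket and key are placeholders, and AWS credentials with Textract access are assumed to be configured:

```python
# Hedged sketch: drive the new parser directly, outside the loader.
# Assumes configured AWS credentials with Textract permissions;
# the S3 URI below is a placeholder, not a real object.
from langchain.document_loaders.blob_loaders import Blob
from langchain.document_loaders.parsers.pdf import AmazonTextractPDFParser

parser = AmazonTextractPDFParser()  # builds a default boto3 "textract" client

# Multi-page documents must reside on S3; the parser forwards the URI to Textract.
blob = Blob(path="s3://my-bucket/my-multi-page.pdf")
for doc in parser.lazy_parse(blob):
    print(doc.metadata["page"], len(doc.page_content))
```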
diff --git a/libs/langchain/langchain/document_loaders/pdf.py b/libs/langchain/langchain/document_loaders/pdf.py
index 530aac097e8..d8eba3d9819 100644
--- a/libs/langchain/langchain/document_loaders/pdf.py
+++ b/libs/langchain/langchain/document_loaders/pdf.py
@@ -7,7 +7,7 @@ import time
 from abc import ABC
 from io import StringIO
 from pathlib import Path
-from typing import Any, Iterator, List, Mapping, Optional, Union
+from typing import Any, Iterator, List, Mapping, Optional, Sequence, Union
 from urllib.parse import urlparse
 
 import requests
@@ -16,6 +16,7 @@ from langchain.docstore.document import Document
 from langchain.document_loaders.base import BaseLoader
 from langchain.document_loaders.blob_loaders import Blob
 from langchain.document_loaders.parsers.pdf import (
+    AmazonTextractPDFParser,
     PDFMinerParser,
     PDFPlumberParser,
     PyMuPDFParser,
@@ -71,22 +72,26 @@ class BasePDFLoader(BaseLoader, ABC):
         if "~" in self.file_path:
             self.file_path = os.path.expanduser(self.file_path)
 
-        # If the file is a web path, download it to a temporary file, and use that
+        # If the file is a web path, download it to a temp file; S3 paths are used as-is
         if not os.path.isfile(self.file_path) and self._is_valid_url(self.file_path):
-            r = requests.get(self.file_path)
-
-            if r.status_code != 200:
-                raise ValueError(
-                    "Check the url of your file; returned status code %s"
-                    % r.status_code
-                )
-
-            self.web_path = self.file_path
             self.temp_dir = tempfile.TemporaryDirectory()
-            temp_pdf = Path(self.temp_dir.name) / "tmp.pdf"
-            with open(temp_pdf, mode="wb") as f:
-                f.write(r.content)
-            self.file_path = str(temp_pdf)
+            _, suffix = os.path.splitext(self.file_path)
+            temp_pdf = os.path.join(self.temp_dir.name, f"tmp{suffix}")
+            if self._is_s3_url(self.file_path):
+                self.web_path = self.file_path
+            else:
+                r = requests.get(self.file_path)
+
+                if r.status_code != 200:
+                    raise ValueError(
+                        "Check the url of your file; returned status code %s"
+                        % r.status_code
+                    )
+
+                self.web_path = self.file_path
+                with open(temp_pdf, mode="wb") as f:
+                    f.write(r.content)
+
+                self.file_path = str(temp_pdf)
         elif not os.path.isfile(self.file_path):
             raise ValueError("File path %s is not a valid file or url" % self.file_path)
@@ -100,6 +105,17 @@ class BasePDFLoader(BaseLoader, ABC):
         parsed = urlparse(url)
         return bool(parsed.netloc) and bool(parsed.scheme)
 
+    @staticmethod
+    def _is_s3_url(url: str) -> bool:
+        """Check if the url is an S3 path."""
+        try:
+            result = urlparse(url)
+            if result.scheme == "s3" and result.netloc:
+                return True
+            return False
+        except ValueError:
+            return False
+
     @property
     def source(self) -> str:
         return self.web_path if self.web_path is not None else self.file_path
@@ -440,3 +456,144 @@ class PDFPlumberLoader(BasePDFLoader):
         parser = PDFPlumberParser(text_kwargs=self.text_kwargs)
         blob = Blob.from_path(self.file_path)
         return parser.parse(blob)
+
+
+class AmazonTextractPDFLoader(BasePDFLoader):
+    """Loads a PDF document from local file system, HTTP or S3.
+
+    To authenticate, the AWS client uses the following methods to
+    automatically load credentials:
+    https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html
+
+    If a specific credential profile should be used, you must pass
+    the name of the profile from the ~/.aws/credentials file that is to be used.
+
+    Make sure the credentials / roles used have the required policies to
+    access the Amazon Textract service.
+
+    Example:
+        .. code-block:: python
+
+            from langchain.document_loaders import AmazonTextractPDFLoader
+
+            loader = AmazonTextractPDFLoader(
+                file_path="s3://pdfs/myfile.pdf"
+            )
+            document = loader.load()
+    """
+
+    def __init__(
+        self,
+        file_path: str,
+        textract_features: Optional[Sequence[str]] = None,
+        client: Optional[Any] = None,
+        credentials_profile_name: Optional[str] = None,
+        region_name: Optional[str] = None,
+        endpoint_url: Optional[str] = None,
+    ) -> None:
+        """Initialize the loader.
+
+        Args:
+            file_path: A file, url or s3 path for the input file
+            textract_features: Features to be used for extraction; each feature
+                should be passed as a str that conforms to the enum
+                `Textract_Features` from the `amazon-textract-caller` package
+            client: boto3 textract client (Optional)
+            credentials_profile_name: AWS profile name, if not default (Optional)
+            region_name: AWS region, e.g. us-east-1 (Optional)
+            endpoint_url: endpoint url for the textract service (Optional)
+
+        """
+        super().__init__(file_path)
+
+        try:
+            import textractcaller as tc
+        except ImportError:
+            raise ModuleNotFoundError(
+                "Could not import amazon-textract-caller python package. "
+                "Please install it with `pip install amazon-textract-caller`."
+            )
+        if textract_features:
+            features = [tc.Textract_Features[x] for x in textract_features]
+        else:
+            features = []
+
+        if credentials_profile_name or region_name or endpoint_url:
+            try:
+                import boto3
+
+                if credentials_profile_name is not None:
+                    session = boto3.Session(profile_name=credentials_profile_name)
+                else:
+                    # use default credentials
+                    session = boto3.Session()
+
+                client_params = {}
+                if region_name:
+                    client_params["region_name"] = region_name
+                if endpoint_url:
+                    client_params["endpoint_url"] = endpoint_url
+
+                client = session.client("textract", **client_params)
+
+            except ImportError:
+                raise ModuleNotFoundError(
+                    "Could not import boto3 python package. "
+                    "Please install it with `pip install boto3`."
+                )
+            except Exception as e:
+                raise ValueError(
+                    "Could not load credentials to authenticate with AWS client. "
+                    "Please check that credentials in the specified "
+                    "profile name are valid."
+                ) from e
+        self.parser = AmazonTextractPDFParser(textract_features=features, client=client)
+
+    def load(self) -> List[Document]:
+        """Load given path as pages."""
+        return list(self.lazy_load())
+
+    def lazy_load(
+        self,
+    ) -> Iterator[Document]:
+        """Lazy load documents."""
+        # self.file_path is local, but the blob has to include the S3 location
+        # if the file originated from S3, since Textract needs it for multi-page
+        # documents; raises ValueError when multi-page and not on S3
+
+        if self.web_path and self._is_s3_url(self.web_path):
+            blob = Blob(path=self.web_path)
+        else:
+            blob = Blob.from_path(self.file_path)
+            if AmazonTextractPDFLoader._get_number_of_pages(blob) > 1:
+                raise ValueError(
+                    f"the file {blob.path} is a multi-page document, "
+                    "but not stored on S3. "
+                    "Textract requires multi-page documents to be on S3."
+                )
+
+        yield from self.parser.parse(blob)
+
+    @staticmethod
+    def _get_number_of_pages(blob: Blob) -> int:
+        try:
+            import pypdf
+            from PIL import Image, ImageSequence
+
+        except ImportError:
+            raise ModuleNotFoundError(
+                "Could not import pypdf or Pillow python packages. "
+                "Please install them with `pip install pypdf Pillow`."
+            )
+        if blob.mimetype == "application/pdf":
+            with blob.as_bytes_io() as input_pdf_file:
+                pdf_reader = pypdf.PdfReader(input_pdf_file)
+                return len(pdf_reader.pages)
+        elif blob.mimetype == "image/tiff":
+            with blob.as_bytes_io() as input_tiff_file:
+                img = Image.open(input_tiff_file)
+                return sum(1 for _ in ImageSequence.Iterator(img))
+        elif blob.mimetype in ["image/png", "image/jpeg"]:
+            return 1
+        else:
+            raise ValueError(f"unsupported mime type: {blob.mimetype}")
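The loader-level knobs compose in two ways: either the loader builds a Textract client from the profile/region/endpoint kwargs, or the caller hands in a pre-built client. A hedged usage sketch follows; the S3 URI and region are placeholders, and credentials are assumed to be set up as the class docstring describes:

```python
import boto3

from langchain.document_loaders import AmazonTextractPDFLoader

# Option 1: let the loader build the Textract client from keyword arguments.
loader = AmazonTextractPDFLoader(
    "s3://my-bucket/reports/q2.pdf",        # placeholder S3 URI
    textract_features=["FORMS", "TABLES"],  # names from the Textract_Features enum
    region_name="us-east-2",
)

# Option 2: pass a pre-built boto3 client instead.
client = boto3.client("textract", region_name="us-east-2")
loader = AmazonTextractPDFLoader("s3://my-bucket/reports/q2.pdf", client=client)

# lazy_load yields one Document per page; load() collects them into a list.
for doc in loader.lazy_load():
    print(doc.metadata)
```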
diff --git a/libs/langchain/poetry.lock b/libs/langchain/poetry.lock
index 8673a60e3ff..535393cdb9b 100644
--- a/libs/langchain/poetry.lock
+++ b/libs/langchain/poetry.lock
@@ -338,6 +338,42 @@ files = [
     {file = "amadeus-8.1.0.tar.gz", hash = "sha256:df31e7c84383a85ee2dce95b11e7a0774fdf31762229f768519b5cb176bc167d"},
 ]
 
+[[package]]
+name = "amazon-textract-caller"
+version = "0.0.29"
+description = "Amazon Textract Caller tools"
+category = "main"
+optional = true
+python-versions = ">=3.6"
+files = [
+    {file = "amazon-textract-caller-0.0.29.tar.gz", hash = "sha256:53770d82db67d4984a99825a90908a319f8920e64d6d48a45456b18d6ab3771a"},
+    {file = "amazon_textract_caller-0.0.29-py2.py3-none-any.whl", hash = "sha256:c5898fc7e84eea2564a9ececcf9101778b7533fa58e2c8e6eb1daa48869788fc"},
+]
+
+[package.dependencies]
+amazon-textract-response-parser = ">=0.1.39"
+boto3 = ">=1.26.35"
+botocore = "*"
+
+[package.extras]
+testing = ["amazon-textract-response-parser", "pytest"]
+
+[[package]]
+name = "amazon-textract-response-parser"
+version = "1.0.0"
+description = "Easily parse JSON returned by Amazon Textract."
+category = "main"
+optional = true
+python-versions = ">=3.8"
+files = [
+    {file = "amazon-textract-response-parser-1.0.0.tar.gz", hash = "sha256:52e94e002b714195d678ea83b99ebc11d68ea716c9371852aed03a10e385dd41"},
+    {file = "amazon_textract_response_parser-1.0.0-py2.py3-none-any.whl", hash = "sha256:668ffb4604ed365de9c60d6a77ca9190c2614679997edfba0ce7398e2579c574"},
+]
+
+[package.dependencies]
+boto3 = "*"
+marshmallow = ">=3.14,<4"
+
 [[package]]
 name = "anthropic"
 version = "0.3.2"
@@ -4702,6 +4738,7 @@ optional = false
 python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, !=3.6.*"
 files = [
     {file = "jsonpointer-2.4-py2.py3-none-any.whl", hash = "sha256:15d51bba20eea3165644553647711d150376234112651b4f1811022aecad7d7a"},
+    {file = "jsonpointer-2.4.tar.gz", hash = "sha256:585cee82b70211fa9e6043b7bb89db6e1aa49524340dde8ad6b63206ea689d88"},
 ]
 
 [[package]]
@@ -13539,7 +13576,7 @@ clarifai = ["clarifai"]
 cohere = ["cohere"]
 docarray = ["docarray"]
 embeddings = ["sentence-transformers"]
-extended-testing = ["atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "esprima", "feedparser", "geopandas", "gitpython", "gql", "html2text", "jinja2", "jq", "lxml", "mwparserfromhell", "mwxml", "newspaper3k", "openai", "openai", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "requests-toolbelt", "scikit-learn", "streamlit", "sympy", "telethon", "tqdm", "xinference", "zep-python"]
+extended-testing = ["amazon-textract-caller", "atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "esprima", "feedparser", "geopandas", "gitpython", "gql", "html2text", "jinja2", "jq", "lxml", "mwparserfromhell", "mwxml", "newspaper3k", "openai", "openai", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "requests-toolbelt", "scikit-learn", "streamlit", "sympy", "telethon", "tqdm", "xinference", "zep-python"]
 javascript = ["esprima"]
 llms = ["anthropic", "clarifai", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "openllm", "openlm", "torch", "transformers", "xinference"]
 openai = ["openai", "tiktoken"]
@@ -13549,4 +13586,4 @@ text-helpers = ["chardet"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.8.1,<4.0"
-content-hash = "0708c3b45f59eea36919ff9ff99fa6eddc81bccb654cce183641ef8396ea5290"
+content-hash = "39305f23d3d69179d247d643631133ac50f5e944d98518c8a56c5f839b8e7a04"
diff --git a/libs/langchain/pyproject.toml b/libs/langchain/pyproject.toml
index e04f5a3ee44..4b0d9c83e4c 100644
--- a/libs/langchain/pyproject.toml
+++ b/libs/langchain/pyproject.toml
@@ -130,6 +130,7 @@ gitpython = {version = "^3.1.32", optional = true}
 librosa = {version="^0.10.0.post2", optional = true }
 feedparser = {version = "^6.0.10", optional = true}
 newspaper3k = {version = "^0.2.8", optional = true}
+amazon-textract-caller = {version = "<2", optional = true}
 
 [tool.poetry.group.test.dependencies]
 # The only dependencies that should be added are
@@ -329,6 +330,7 @@ all = [
 # Please use new-line on formatting to make it easier to add new packages without
 # merge-conflicts
 extended_testing = [
+    "amazon-textract-caller",
     "beautifulsoup4",
     "bibtexparser",
     "cassio",
diff --git a/libs/langchain/tests/integration_tests/document_loaders/test_pdf.py b/libs/langchain/tests/integration_tests/document_loaders/test_pdf.py
index 324a2e02120..aea75c11fda 100644
--- a/libs/langchain/tests/integration_tests/document_loaders/test_pdf.py
+++ b/libs/langchain/tests/integration_tests/document_loaders/test_pdf.py
@@ -1,6 +1,10 @@
 from pathlib import Path
+from typing import Sequence, Union
+
+import pytest
 
 from langchain.document_loaders import (
+    AmazonTextractPDFLoader,
     MathpixPDFLoader,
     PDFMinerLoader,
     PDFMinerPDFasHTMLLoader,
@@ -136,3 +140,56 @@ def test_mathpix_loader() -> None:
     docs = loader.load()
     assert len(docs) == 1
     print(docs[0].page_content)
+
+
+@pytest.mark.parametrize(
+    "file_path, features, docs_length, create_client",
+    [
+        (
+            (
+                "https://amazon-textract-public-content.s3.us-east-2.amazonaws.com"
+                "/langchain/alejandro_rosalez_sample_1.jpg"
+            ),
+            ["FORMS", "TABLES"],
+            1,
+            False,
+        ),
+        (str(Path(__file__).parent.parent / "examples/hello.pdf"), ["FORMS"], 1, False),
+        (
+            "s3://amazon-textract-public-content/langchain/layout-parser-paper.pdf",
+            None,
+            16,
+            True,
+        ),
+    ],
+)
+@pytest.mark.skip(reason="Requires AWS credentials to run")
+def test_amazontextract_loader(
+    file_path: str,
+    features: Union[Sequence[str], None],
+    docs_length: int,
+    create_client: bool,
+) -> None:
+    if create_client:
+        import boto3
+
+        textract_client = boto3.client("textract", region_name="us-east-2")
+        loader = AmazonTextractPDFLoader(
+            file_path, textract_features=features, client=textract_client
+        )
+    else:
+        loader = AmazonTextractPDFLoader(file_path, textract_features=features)
+    docs = loader.load()
+
+    assert len(docs) == docs_length
+
+
+@pytest.mark.skip(reason="Requires AWS credentials to run")
+def test_amazontextract_loader_failures() -> None:
+    # 2-page PDF local file system
+    two_page_pdf = str(
+        Path(__file__).parent.parent / "examples/multi-page-forms-sample-2-page.pdf"
+    )
+    loader = AmazonTextractPDFLoader(two_page_pdf)
+    with pytest.raises(ValueError):
+        loader.load()
diff --git a/libs/langchain/tests/integration_tests/examples/multi-page-forms-sample-2-page.pdf b/libs/langchain/tests/integration_tests/examples/multi-page-forms-sample-2-page.pdf
new file mode 100644
index 00000000000..de6ddd0f7e8
Binary files /dev/null and b/libs/langchain/tests/integration_tests/examples/multi-page-forms-sample-2-page.pdf differ
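The failure-path test pins down the intended contract: a multi-page document that is not on S3 fails fast rather than being silently truncated to one page. Roughly, callers should expect behavior like this sketch (both paths are hypothetical; the S3 variant assumes the same two-page file has been uploaded there):

```python
from langchain.document_loaders import AmazonTextractPDFLoader

# A local two-page PDF: _get_number_of_pages() sees more than one page and
# there is no S3 web_path, so lazy_load raises before calling Textract.
loader = AmazonTextractPDFLoader("/tmp/two-pages.pdf")  # hypothetical local file
try:
    loader.load()
except ValueError as err:
    print(err)  # "...Textract requires multi-page documents to be on S3."

# The same document addressed by its S3 URI loads page by page.
loader = AmazonTextractPDFLoader("s3://my-bucket/two-pages.pdf")  # hypothetical
assert len(loader.load()) == 2
```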