"""Module contains common parsers for PDFs.""" from __future__ import annotations import html import io import logging import threading import warnings from datetime import datetime from pathlib import Path from tempfile import TemporaryDirectory from urllib.parse import urlparse import numpy import numpy as np from typing import ( TYPE_CHECKING, Any, BinaryIO, Iterable, Iterator, Literal, Mapping, Optional, Sequence, Union, cast, ) from langchain_community.document_loaders.base import BaseBlobParser from langchain_community.document_loaders.blob_loaders import Blob from langchain_community.document_loaders.parsers.images import ( BaseImageBlobParser, RapidOCRBlobParser, ) from langchain_core.documents import Document if TYPE_CHECKING: import pdfplumber import pymupdf import pypdf import pypdfium2 from textractor.data.text_linearization_config import TextLinearizationConfig _PDF_FILTER_WITH_LOSS = ["DCTDecode", "DCT", "JPXDecode"] _PDF_FILTER_WITHOUT_LOSS = [ "LZWDecode", "LZW", "FlateDecode", "Fl", "ASCII85Decode", "A85", "ASCIIHexDecode", "AHx", "RunLengthDecode", "RL", "CCITTFaxDecode", "CCF", "JBIG2Decode", ] def extract_from_images_with_rapidocr( images: Sequence[Union[Iterable[np.ndarray], bytes]], ) -> str: """Extract text from images with RapidOCR. Args: images: Images to extract text from. Returns: Text extracted from images. Raises: ImportError: If `rapidocr-onnxruntime` package is not installed. """ try: from rapidocr_onnxruntime import RapidOCR except ImportError: raise ImportError( "`rapidocr-onnxruntime` package not found, please install it with " "`pip install rapidocr-onnxruntime`" ) ocr = RapidOCR() text = "" for img in images: result, _ = ocr(img) if result: result = [text[1] for text in result] text += "\n".join(result) return text logger = logging.getLogger(__name__) _FORMAT_IMAGE_STR = "\n\n{image_text}\n\n" _JOIN_IMAGES = "\n" _JOIN_TABLES = "\n" _DEFAULT_PAGES_DELIMITER = "\n\f" _STD_METADATA_KEYS = {"source", "total_pages", "creationdate", "creator", "producer"} def _format_inner_image(blob: Blob, content: str, format: str) -> str: """Format the content of the image with the source of the blob. blob: The blob containing the image. format:: The format for the parsed output. - "text" = return the content as is - "markdown-img" = wrap the content into an image markdown link, w/ link pointing to (`![body)(#)`] - "html-img" = wrap the content as the `alt` text of an tag and link to (`{body}`) """ if content: source = blob.source or "#" if format == "markdown-img": content = content.replace("]", r"\\]") content = f"![{content}]({source})" elif format == "html-img": content = f'{html.escape(content, quote=True)} src=' return content def _validate_metadata(metadata: dict[str, Any]) -> dict[str, Any]: """Validate that the metadata has all the standard keys and the page is an integer. The standard keys are: - source - page (if mode='page') - total_page - creationdate - creator - producer Validate that page is an integer if it is present. """ if not _STD_METADATA_KEYS.issubset(metadata.keys()): raise ValueError("The PDF parser must valorize the standard metadata.") if not isinstance(metadata.get("page", 0), int): raise ValueError("The PDF metadata page must be a integer.") return metadata def _purge_metadata(metadata: dict[str, Any]) -> dict[str, Any]: """Purge metadata from unwanted keys and normalize key names. Args: metadata: The original metadata dictionary. Returns: The cleaned and normalized the key format of metadata dictionary. """ new_metadata: dict[str, Any] = {} map_key = { "page_count": "total_pages", "file_path": "source", } for k, v in metadata.items(): if type(v) not in [str, int]: v = str(v) if k.startswith("/"): k = k[1:] k = k.lower() if k in ["creationdate", "moddate"]: try: new_metadata[k] = datetime.strptime( v.replace("'", ""), "D:%Y%m%d%H%M%S%z" ).isoformat("T") except ValueError: new_metadata[k] = v elif k in map_key: # Normalize key with others PDF parser new_metadata[map_key[k]] = v new_metadata[k] = v elif isinstance(v, str): new_metadata[k] = v.strip() elif isinstance(v, int): new_metadata[k] = v return new_metadata _PARAGRAPH_DELIMITER = [ "\n\n\n", "\n\n", ] # To insert images or table in the middle of the page. def _merge_text_and_extras(extras: list[str], text_from_page: str) -> str: """Insert extras such as image/table in a text between two paragraphs if possible, else at the end of the text. Args: extras: List of extra content (images/tables) to insert. text_from_page: The text content from the page. Returns: The merged text with extras inserted. """ def _recurs_merge_text_and_extras( extras: list[str], text_from_page: str, recurs: bool ) -> Optional[str]: if extras: for delim in _PARAGRAPH_DELIMITER: pos = text_from_page.rfind(delim) if pos != -1: # search penultimate, to bypass an error in footer previous_text = None if recurs: previous_text = _recurs_merge_text_and_extras( extras, text_from_page[:pos], False ) if previous_text: all_text = previous_text + text_from_page[pos:] else: all_extras = "" str_extras = "\n\n".join(filter(lambda x: x, extras)) if str_extras: all_extras = delim + str_extras all_text = ( text_from_page[:pos] + all_extras + text_from_page[pos:] ) break else: all_text = None else: all_text = text_from_page return all_text all_text = _recurs_merge_text_and_extras(extras, text_from_page, True) if not all_text: all_extras = "" str_extras = "\n\n".join(filter(lambda x: x, extras)) if str_extras: all_extras = _PARAGRAPH_DELIMITER[-1] + str_extras all_text = text_from_page + all_extras return all_text class PyPDFParser(BaseBlobParser): """Parse a blob from a PDF using `pypdf` library. This class provides methods to parse a blob from a PDF document, supporting various configurations such as handling password-protected PDFs, extracting images. It integrates the 'pypdf' library for PDF processing and offers synchronous blob parsing. Examples: Setup: .. code-block:: bash pip install -U langchain-community pypdf Load a blob from a PDF file: .. code-block:: python from langchain_core.documents.base import Blob blob = Blob.from_path("./example_data/layout-parser-paper.pdf") Instantiate the parser: .. code-block:: python from langchain_community.document_loaders.parsers import PyPDFParser parser = PyPDFParser( # password = None, mode = "single", pages_delimiter = "\n\f", # images_parser = TesseractBlobParser(), ) Lazily parse the blob: .. code-block:: python docs = [] docs_lazy = parser.lazy_parse(blob) for doc in docs_lazy: docs.append(doc) print(docs[0].page_content[:100]) print(docs[0].metadata) """ def __init__( self, password: Optional[Union[str, bytes]] = None, extract_images: bool = False, *, mode: Literal["single", "page"] = "page", pages_delimiter: str = _DEFAULT_PAGES_DELIMITER, images_parser: Optional[BaseImageBlobParser] = None, images_inner_format: Literal["text", "markdown-img", "html-img"] = "text", extraction_mode: Literal["plain", "layout"] = "plain", extraction_kwargs: Optional[dict[str, Any]] = None, ): """Initialize a parser based on PyPDF. Args: password: Optional password for opening encrypted PDFs. extract_images: Whether to extract images from the PDF. mode: The extraction mode, either "single" for the entire document or "page" for page-wise extraction. pages_delimiter: A string delimiter to separate pages in single-mode extraction. images_parser: Optional image blob parser. images_inner_format: The format for the parsed output. - "text" = return the content as is - "markdown-img" = wrap the content into an image markdown link, w/ link pointing to (`![body)(#)`] - "html-img" = wrap the content as the `alt` text of an tag and link to (`{body}`) extraction_mode: “plain” for legacy functionality, “layout” extract text in a fixed width format that closely adheres to the rendered layout in the source pdf. extraction_kwargs: Optional additional parameters for the extraction process. Raises: ValueError: If the `mode` is not "single" or "page". """ super().__init__() if mode not in ["single", "page"]: raise ValueError("mode must be single or page") self.extract_images = extract_images if extract_images and not images_parser: images_parser = RapidOCRBlobParser() self.images_parser = images_parser self.images_inner_format = images_inner_format self.password = password self.mode = mode self.pages_delimiter = pages_delimiter self.extraction_mode = extraction_mode self.extraction_kwargs = extraction_kwargs or {} def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type] """ Lazily parse the blob. Insert image, if possible, between two paragraphs. In this way, a paragraph can be continued on the next page. Args: blob: The blob to parse. Raises: ImportError: If the `pypdf` package is not found. Yield: An iterator over the parsed documents. """ try: import pypdf except ImportError: raise ImportError( "`pypdf` package not found, please install it with `pip install pypdf`" ) def _extract_text_from_page(page: pypdf.PageObject) -> str: """ Extract text from image given the version of pypdf. Args: page: The page object to extract text from. Returns: str: The extracted text. """ if pypdf.__version__.startswith("3"): return page.extract_text() else: return page.extract_text( extraction_mode=self.extraction_mode, **self.extraction_kwargs, ) with blob.as_bytes_io() as pdf_file_obj: # type: ignore[attr-defined] pdf_reader = pypdf.PdfReader(pdf_file_obj, password=self.password) doc_metadata = _purge_metadata( {"producer": "PyPDF", "creator": "PyPDF", "creationdate": ""} | cast(dict, pdf_reader.metadata or {}) | { "source": blob.source, "total_pages": len(pdf_reader.pages), } ) single_texts = [] for page_number, page in enumerate(pdf_reader.pages): text_from_page = _extract_text_from_page(page=page) images_from_page = self.extract_images_from_page(page) all_text = _merge_text_and_extras( [images_from_page], text_from_page ).strip() if self.mode == "page": yield Document( page_content=all_text, metadata=_validate_metadata( doc_metadata | { "page": page_number, "page_label": pdf_reader.page_labels[page_number], } ), ) else: single_texts.append(all_text) if self.mode == "single": yield Document( page_content=self.pages_delimiter.join(single_texts), metadata=_validate_metadata(doc_metadata), ) def extract_images_from_page(self, page: pypdf._page.PageObject) -> str: """Extract images from a PDF page and get the text using images_to_text. Args: page: The page object from which to extract images. Returns: str: The extracted text from the images on the page. """ if not self.images_parser: return "" from PIL import Image if "/XObject" not in cast(dict, page["/Resources"]).keys(): return "" xObject = page["/Resources"]["/XObject"].get_object() # type: ignore[index] images = [] for obj in xObject: np_image: Any = None if xObject[obj]["/Subtype"] == "/Image": if xObject[obj]["/Filter"][1:] in _PDF_FILTER_WITHOUT_LOSS: height, width = xObject[obj]["/Height"], xObject[obj]["/Width"] np_image = np.frombuffer( xObject[obj].get_data(), dtype=np.uint8 ).reshape(height, width, -1) elif xObject[obj]["/Filter"][1:] in _PDF_FILTER_WITH_LOSS: np_image = np.array(Image.open(io.BytesIO(xObject[obj].get_data()))) else: logger.warning("Unknown PDF Filter!") if np_image is not None: image_bytes = io.BytesIO() Image.fromarray(np_image).save(image_bytes, format="PNG") blob = Blob.from_data(image_bytes.getvalue(), mime_type="image/png") image_text = next( self.images_parser.lazy_parse(blob) # type: ignore ).page_content images.append( _format_inner_image(blob, image_text, self.images_inner_format) ) return _FORMAT_IMAGE_STR.format( image_text=_JOIN_IMAGES.join(filter(None, images)) ) class PDFMinerParser(BaseBlobParser): """Parse a blob from a PDF using `pdfminer.six` library. This class provides methods to parse a blob from a PDF document, supporting various configurations such as handling password-protected PDFs, extracting images, and defining extraction mode. It integrates the 'pdfminer.six' library for PDF processing and offers synchronous blob parsing. Examples: Setup: .. code-block:: bash pip install -U langchain-community pdfminer.six pillow Load a blob from a PDF file: .. code-block:: python from langchain_core.documents.base import Blob blob = Blob.from_path("./example_data/layout-parser-paper.pdf") Instantiate the parser: .. code-block:: python from langchain_community.document_loaders.parsers import PDFMinerParser parser = PDFMinerParser( # password = None, mode = "single", pages_delimiter = "\n\f", # extract_images = True, # images_to_text = convert_images_to_text_with_tesseract(), ) Lazily parse the blob: .. code-block:: python docs = [] docs_lazy = parser.lazy_parse(blob) for doc in docs_lazy: docs.append(doc) print(docs[0].page_content[:100]) print(docs[0].metadata) """ _warn_concatenate_pages = False def __init__( self, extract_images: bool = False, *, password: Optional[str] = None, mode: Literal["single", "page"] = "single", pages_delimiter: str = _DEFAULT_PAGES_DELIMITER, images_parser: Optional[BaseImageBlobParser] = None, images_inner_format: Literal["text", "markdown-img", "html-img"] = "text", concatenate_pages: Optional[bool] = None, ): """Initialize a parser based on PDFMiner. Args: password: Optional password for opening encrypted PDFs. mode: Extraction mode to use. Either "single" or "page" for page-wise extraction. pages_delimiter: A string delimiter to separate pages in single-mode extraction. extract_images: Whether to extract images from PDF. images_inner_format: The format for the parsed output. - "text" = return the content as is - "markdown-img" = wrap the content into an image markdown link, w/ link pointing to (`![body)(#)`] - "html-img" = wrap the content as the `alt` text of an tag and link to (`{body}`) concatenate_pages: Deprecated. If True, concatenate all PDF pages into one a single document. Otherwise, return one document per page. Returns: This method does not directly return data. Use the `parse` or `lazy_parse` methods to retrieve parsed documents with content and metadata. Raises: ValueError: If the `mode` is not "single" or "page". Warnings: `concatenate_pages` parameter is deprecated. Use `mode='single' or 'page' instead. """ super().__init__() if mode not in ["single", "page"]: raise ValueError("mode must be single or page") if extract_images and not images_parser: images_parser = RapidOCRBlobParser() self.extract_images = extract_images self.images_parser = images_parser self.images_inner_format = images_inner_format self.password = password self.mode = mode self.pages_delimiter = pages_delimiter if concatenate_pages is not None: if not PDFMinerParser._warn_concatenate_pages: PDFMinerParser._warn_concatenate_pages = True logger.warning( "`concatenate_pages` parameter is deprecated. " "Use `mode='single' or 'page'` instead." ) self.mode = "single" if concatenate_pages else "page" @staticmethod def decode_text(s: Union[bytes, str]) -> str: """ Decodes a PDFDocEncoding string to Unicode. Adds py3 compatibility to pdfminer's version. Args: s: The string to decode. Returns: str: The decoded Unicode string. """ from pdfminer.utils import PDFDocEncoding if isinstance(s, bytes) and s.startswith(b"\xfe\xff"): return str(s[2:], "utf-16be", "ignore") try: ords = (ord(c) if isinstance(c, str) else c for c in s) return "".join(PDFDocEncoding[o] for o in ords) except IndexError: return str(s) @staticmethod def resolve_and_decode(obj: Any) -> Any: """ Recursively resolve the metadata values. Args: obj: The object to resolve and decode. It can be of any type. Returns: The resolved and decoded object. """ from pdfminer.psparser import PSLiteral if hasattr(obj, "resolve"): obj = obj.resolve() if isinstance(obj, list): return list(map(PDFMinerParser.resolve_and_decode, obj)) elif isinstance(obj, PSLiteral): return PDFMinerParser.decode_text(obj.name) elif isinstance(obj, (str, bytes)): return PDFMinerParser.decode_text(obj) elif isinstance(obj, dict): for k, v in obj.items(): obj[k] = PDFMinerParser.resolve_and_decode(v) return obj return obj def _get_metadata( self, fp: BinaryIO, password: str = "", caching: bool = True, ) -> dict[str, Any]: """ Extract metadata from a PDF file. Args: fp: The file pointer to the PDF file. password: The password for the PDF file, if encrypted. Defaults to an empty string. caching: Whether to cache the PDF structure. Defaults to True. Returns: Metadata of the PDF file. """ from pdfminer.pdfpage import PDFDocument, PDFPage, PDFParser # Create a PDF parser object associated with the file object. parser = PDFParser(fp) # Create a PDF document object that stores the document structure. doc = PDFDocument(parser, password=password, caching=caching) metadata = {} for info in doc.info: metadata.update(info) for k, v in metadata.items(): try: metadata[k] = PDFMinerParser.resolve_and_decode(v) except Exception as e: # pragma: nocover # This metadata value could not be parsed. Instead of failing the PDF # read, treat it as a warning only if `strict_metadata=False`. logger.warning( '[WARNING] Metadata key "%s" could not be parsed due to ' "exception: %s", k, str(e), ) # Count number of pages. metadata["total_pages"] = len(list(PDFPage.create_pages(doc))) return metadata def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type] """ Lazily parse the blob. Insert image, if possible, between two paragraphs. In this way, a paragraph can be continued on the next page. Args: blob: The blob to parse. Raises: ImportError: If the `pdfminer.six` or `pillow` package is not found. Yield: An iterator over the parsed documents. """ try: import pdfminer from pdfminer.converter import PDFLayoutAnalyzer from pdfminer.layout import ( LAParams, LTContainer, LTImage, LTItem, LTPage, LTText, LTTextBox, ) from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager from pdfminer.pdfpage import PDFPage if int(pdfminer.__version__) < 20201018: raise ImportError( "This parser is tested with pdfminer.six version 20201018 or " "later. Remove pdfminer, and install pdfminer.six with " "`pip uninstall pdfminer && pip install pdfminer.six`." ) except ImportError: raise ImportError( "pdfminer package not found, please install it " "with `pip install pdfminer.six`" ) with blob.as_bytes_io() as pdf_file_obj, TemporaryDirectory() as tempdir: pages = PDFPage.get_pages(pdf_file_obj, password=self.password or "") rsrcmgr = PDFResourceManager() doc_metadata = _purge_metadata( self._get_metadata(pdf_file_obj, password=self.password or "") ) doc_metadata["source"] = blob.source class Visitor(PDFLayoutAnalyzer): def __init__( self, rsrcmgr: PDFResourceManager, pageno: int = 1, laparams: Optional[LAParams] = None, ) -> None: super().__init__(rsrcmgr, pageno=pageno, laparams=laparams) def receive_layout(me, ltpage: LTPage) -> None: def render(item: LTItem) -> None: if isinstance(item, LTContainer): for child in item: render(child) elif isinstance(item, LTText): text_io.write(item.get_text()) if isinstance(item, LTTextBox): text_io.write("\n") elif isinstance(item, LTImage): if self.images_parser: from pdfminer.image import ImageWriter image_writer = ImageWriter(tempdir) filename = image_writer.export_image(item) blob = Blob.from_path(Path(tempdir) / filename) blob.metadata["source"] = "#" image_text = next( self.images_parser.lazy_parse(blob) # type: ignore ).page_content text_io.write( _format_inner_image( blob, image_text, self.images_inner_format ) ) else: pass render(ltpage) text_io = io.StringIO() visitor_for_all = PDFPageInterpreter( rsrcmgr, Visitor(rsrcmgr, laparams=LAParams()) ) all_content = [] for i, page in enumerate(pages): text_io.truncate(0) text_io.seek(0) visitor_for_all.process_page(page) all_text = text_io.getvalue() # For legacy compatibility, net strip() all_text = all_text.strip() if self.mode == "page": text_io.truncate(0) text_io.seek(0) yield Document( page_content=all_text, metadata=_validate_metadata(doc_metadata | {"page": i}), ) else: if all_text.endswith("\f"): all_text = all_text[:-1] all_content.append(all_text) if self.mode == "single": # Add pages_delimiter between pages document_content = self.pages_delimiter.join(all_content) yield Document( page_content=document_content, metadata=_validate_metadata(doc_metadata), ) class PyMuPDFParser(BaseBlobParser): """Parse a blob from a PDF using `PyMuPDF` library. This class provides methods to parse a blob from a PDF document, supporting various configurations such as handling password-protected PDFs, extracting images, and defining extraction mode. It integrates the 'PyMuPDF' library for PDF processing and offers synchronous blob parsing. Examples: Setup: .. code-block:: bash pip install -U langchain-community pymupdf Load a blob from a PDF file: .. code-block:: python from langchain_core.documents.base import Blob blob = Blob.from_path("./example_data/layout-parser-paper.pdf") Instantiate the parser: .. code-block:: python from langchain_community.document_loaders.parsers import PyMuPDFParser parser = PyMuPDFParser( # password = None, mode = "single", pages_delimiter = "\n\f", # images_parser = TesseractBlobParser(), # extract_tables="markdown", # extract_tables_settings=None, # text_kwargs=None, ) Lazily parse the blob: .. code-block:: python docs = [] docs_lazy = parser.lazy_parse(blob) for doc in docs_lazy: docs.append(doc) print(docs[0].page_content[:100]) print(docs[0].metadata) """ # PyMuPDF is not thread safe. # See https://pymupdf.readthedocs.io/en/latest/recipes-multiprocessing.html _lock = threading.Lock() def __init__( self, text_kwargs: Optional[dict[str, Any]] = None, extract_images: bool = False, *, password: Optional[str] = None, mode: Literal["single", "page"] = "page", pages_delimiter: str = _DEFAULT_PAGES_DELIMITER, images_parser: Optional[BaseImageBlobParser] = None, images_inner_format: Literal["text", "markdown-img", "html-img"] = "text", extract_tables: Union[Literal["csv", "markdown", "html"], None] = None, extract_tables_settings: Optional[dict[str, Any]] = None, ) -> None: """Initialize a parser based on PyMuPDF. Args: password: Optional password for opening encrypted PDFs. mode: The extraction mode, either "single" for the entire document or "page" for page-wise extraction. pages_delimiter: A string delimiter to separate pages in single-mode extraction. extract_images: Whether to extract images from the PDF. images_parser: Optional image blob parser. images_inner_format: The format for the parsed output. - "text" = return the content as is - "markdown-img" = wrap the content into an image markdown link, w/ link pointing to (`![body)(#)`] - "html-img" = wrap the content as the `alt` text of an tag and link to (`{body}`) extract_tables: Whether to extract tables in a specific format, such as "csv", "markdown", or "html". extract_tables_settings: Optional dictionary of settings for customizing table extraction. Returns: This method does not directly return data. Use the `parse` or `lazy_parse` methods to retrieve parsed documents with content and metadata. Raises: ValueError: If the mode is not "single" or "page". ValueError: If the extract_tables format is not "markdown", "html", or "csv". """ super().__init__() if mode not in ["single", "page"]: raise ValueError("mode must be single or page") if extract_tables and extract_tables not in ["markdown", "html", "csv"]: raise ValueError("mode must be markdown") self.mode = mode self.pages_delimiter = pages_delimiter self.password = password self.text_kwargs = text_kwargs or {} if extract_images and not images_parser: images_parser = RapidOCRBlobParser() self.extract_images = extract_images self.images_inner_format = images_inner_format self.images_parser = images_parser self.extract_tables = extract_tables self.extract_tables_settings = extract_tables_settings def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type] return self._lazy_parse( blob, ) def _lazy_parse( self, blob: Blob, # text-kwargs is present for backwards compatibility. # Users should not use it directly. text_kwargs: Optional[dict[str, Any]] = None, ) -> Iterator[Document]: # type: ignore[valid-type] """Lazily parse the blob. Insert image, if possible, between two paragraphs. In this way, a paragraph can be continued on the next page. Args: blob: The blob to parse. text_kwargs: Optional keyword arguments to pass to the `get_text` method. If provided at run time, it will override the default text_kwargs. Raises: ImportError: If the `pypdf` package is not found. Yield: An iterator over the parsed documents. """ try: import pymupdf text_kwargs = text_kwargs or self.text_kwargs if not self.extract_tables_settings: from pymupdf.table import ( DEFAULT_JOIN_TOLERANCE, DEFAULT_MIN_WORDS_HORIZONTAL, DEFAULT_MIN_WORDS_VERTICAL, DEFAULT_SNAP_TOLERANCE, ) self.extract_tables_settings = { # See https://pymupdf.readthedocs.io/en/latest/page.html#Page.find_tables "clip": None, "vertical_strategy": "lines", "horizontal_strategy": "lines", "vertical_lines": None, "horizontal_lines": None, "snap_tolerance": DEFAULT_SNAP_TOLERANCE, "snap_x_tolerance": None, "snap_y_tolerance": None, "join_tolerance": DEFAULT_JOIN_TOLERANCE, "join_x_tolerance": None, "join_y_tolerance": None, "edge_min_length": 3, "min_words_vertical": DEFAULT_MIN_WORDS_VERTICAL, "min_words_horizontal": DEFAULT_MIN_WORDS_HORIZONTAL, "intersection_tolerance": 3, "intersection_x_tolerance": None, "intersection_y_tolerance": None, "text_tolerance": 3, "text_x_tolerance": 3, "text_y_tolerance": 3, "strategy": None, # offer abbreviation "add_lines": None, # optional user-specified lines } except ImportError: raise ImportError( "pymupdf package not found, please install it " "with `pip install pymupdf`" ) with PyMuPDFParser._lock: with blob.as_bytes_io() as file_path: # type: ignore[attr-defined] if blob.data is None: # type: ignore[attr-defined] doc = pymupdf.open(file_path) else: doc = pymupdf.open(stream=file_path, filetype="pdf") if doc.is_encrypted: doc.authenticate(self.password) doc_metadata = self._extract_metadata(doc, blob) full_content = [] for page in doc: all_text = self._get_page_content(doc, page, text_kwargs).strip() if self.mode == "page": yield Document( page_content=all_text, metadata=_validate_metadata( doc_metadata | {"page": page.number} ), ) else: full_content.append(all_text) if self.mode == "single": yield Document( page_content=self.pages_delimiter.join(full_content), metadata=_validate_metadata(doc_metadata), ) def _get_page_content( self, doc: pymupdf.Document, page: pymupdf.Page, text_kwargs: dict[str, Any], ) -> str: """Get the text of the page using PyMuPDF and RapidOCR and issue a warning if it is empty. Args: doc: The PyMuPDF document object. page: The PyMuPDF page object. blob: The blob being parsed. Returns: str: The text content of the page. """ text_from_page = page.get_text(**{**self.text_kwargs, **text_kwargs}) images_from_page = self._extract_images_from_page(doc, page) tables_from_page = self._extract_tables_from_page(page) extras = [] if images_from_page: extras.append(images_from_page) if tables_from_page: extras.append(tables_from_page) all_text = _merge_text_and_extras(extras, text_from_page) return all_text def _extract_metadata(self, doc: pymupdf.Document, blob: Blob) -> dict: """Extract metadata from the document and page. Args: doc: The PyMuPDF document object. blob: The blob being parsed. Returns: dict: The extracted metadata. """ metadata = _purge_metadata( { **{ "producer": "PyMuPDF", "creator": "PyMuPDF", "creationdate": "", "source": blob.source, # type: ignore[attr-defined] "file_path": blob.source, # type: ignore[attr-defined] "total_pages": len(doc), }, **{ k: doc.metadata[k] for k in doc.metadata if isinstance(doc.metadata[k], (str, int)) }, } ) for k in ("modDate", "creationDate"): if k in doc.metadata: metadata[k] = doc.metadata[k] return metadata def _extract_images_from_page( self, doc: pymupdf.Document, page: pymupdf.Page ) -> str: """Extract images from a PDF page and get the text using images_to_text. Args: doc: The PyMuPDF document object. page: The PyMuPDF page object. Returns: str: The extracted text from the images on the page. """ if not self.images_parser: return "" import pymupdf img_list = page.get_images() images = [] for img in img_list: if self.images_parser: xref = img[0] pix = pymupdf.Pixmap(doc, xref) image = np.frombuffer(pix.samples, dtype=np.uint8).reshape( pix.height, pix.width, -1 ) image_bytes = io.BytesIO() numpy.save(image_bytes, image) blob = Blob.from_data( image_bytes.getvalue(), mime_type="application/x-npy" ) image_text = next( self.images_parser.lazy_parse(blob) # type: ignore ).page_content images.append( _format_inner_image(blob, image_text, self.images_inner_format) ) return _FORMAT_IMAGE_STR.format( image_text=_JOIN_IMAGES.join(filter(None, images)) ) def _extract_tables_from_page(self, page: pymupdf.Page) -> str: """Extract tables from a PDF page. Args: page: The PyMuPDF page object. Returns: str: The extracted tables in the specified format. """ if self.extract_tables is None: return "" import pymupdf tables_list = list( pymupdf.table.find_tables(page, **self.extract_tables_settings) ) if tables_list: if self.extract_tables == "markdown": return _JOIN_TABLES.join([table.to_markdown() for table in tables_list]) elif self.extract_tables == "html": return _JOIN_TABLES.join( [ table.to_pandas().to_html( header=False, index=False, bold_rows=False, ) for table in tables_list ] ) elif self.extract_tables == "csv": return _JOIN_TABLES.join( [ table.to_pandas().to_csv( header=False, index=False, ) for table in tables_list ] ) else: raise ValueError( f"extract_tables {self.extract_tables} not implemented" ) return "" class PyPDFium2Parser(BaseBlobParser): """Parse a blob from a PDF using `PyPDFium2` library. This class provides methods to parse a blob from a PDF document, supporting various configurations such as handling password-protected PDFs, extracting images, and defining extraction mode. It integrates the 'PyPDFium2' library for PDF processing and offers synchronous blob parsing. Examples: Setup: .. code-block:: bash pip install -U langchain-community pypdfium2 Load a blob from a PDF file: .. code-block:: python from langchain_core.documents.base import Blob blob = Blob.from_path("./example_data/layout-parser-paper.pdf") Instantiate the parser: .. code-block:: python from langchain_community.document_loaders.parsers import PyPDFium2Parser parser = PyPDFium2Parser( # password=None, mode="page", pages_delimiter="\n\f", ) Lazily parse the blob: .. code-block:: python docs = [] docs_lazy = parser.lazy_parse(blob) for doc in docs_lazy: docs.append(doc) print(docs[0].page_content[:100]) print(docs[0].metadata) """ # PyPDFium2 is not thread safe. # See https://pypdfium2.readthedocs.io/en/stable/python_api.html#thread-incompatibility _lock = threading.Lock() def __init__( self, extract_images: bool = False, *, password: Optional[str] = None, mode: Literal["single", "page"] = "page", pages_delimiter: str = _DEFAULT_PAGES_DELIMITER, images_parser: Optional[BaseImageBlobParser] = None, images_inner_format: Literal["text", "markdown-img", "html-img"] = "text", ) -> None: """Initialize a parser based on PyPDFium2. Args: password: Optional password for opening encrypted PDFs. mode: The extraction mode, either "single" for the entire document or "page" for page-wise extraction. pages_delimiter: A string delimiter to separate pages in single-mode extraction. extract_images: Whether to extract images from the PDF. images_parser: Optional image blob parser. images_inner_format: The format for the parsed output. - "text" = return the content as is - "markdown-img" = wrap the content into an image markdown link, w/ link pointing to (`![body)(#)`] - "html-img" = wrap the content as the `alt` text of an tag and link to (`{body}`) extraction_mode: “plain” for legacy functionality, “layout” for experimental layout mode functionality extraction_kwargs: Optional additional parameters for the extraction process. Returns: This method does not directly return data. Use the `parse` or `lazy_parse` methods to retrieve parsed documents with content and metadata. Raises: ValueError: If the mode is not "single" or "page". """ super().__init__() if mode not in ["single", "page"]: raise ValueError("mode must be single or page") self.extract_images = extract_images if extract_images and not images_parser: images_parser = RapidOCRBlobParser() self.images_parser = images_parser self.images_inner_format = images_inner_format self.password = password self.mode = mode self.pages_delimiter = pages_delimiter def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type] """ Lazily parse the blob. Insert image, if possible, between two paragraphs. In this way, a paragraph can be continued on the next page. Args: blob: The blob to parse. Raises: ImportError: If the `pypdf` package is not found. Yield: An iterator over the parsed documents. """ try: import pypdfium2 except ImportError: raise ImportError( "pypdfium2 package not found, please install it with" " `pip install pypdfium2`" ) # pypdfium2 is really finicky with respect to closing things, # if done incorrectly creates seg faults. with PyPDFium2Parser._lock: with blob.as_bytes_io() as file_path: # type: ignore[attr-defined] pdf_reader = None try: pdf_reader = pypdfium2.PdfDocument( file_path, password=self.password, autoclose=True ) full_content = [] doc_metadata = _purge_metadata(pdf_reader.get_metadata_dict()) doc_metadata["source"] = blob.source doc_metadata["total_pages"] = len(pdf_reader) for page_number, page in enumerate(pdf_reader): text_page = page.get_textpage() text_from_page = "\n".join( text_page.get_text_range().splitlines() ) # Replace \r\n text_page.close() image_from_page = self._extract_images_from_page(page) all_text = _merge_text_and_extras( [image_from_page], text_from_page ).strip() page.close() if self.mode == "page": # For legacy compatibility, add the last '\n' if not all_text.endswith("\n"): all_text += "\n" yield Document( page_content=all_text, metadata=_validate_metadata( { **doc_metadata, "page": page_number, } ), ) else: full_content.append(all_text) if self.mode == "single": yield Document( page_content=self.pages_delimiter.join(full_content), metadata=_validate_metadata(doc_metadata), ) finally: if pdf_reader: pdf_reader.close() def _extract_images_from_page(self, page: pypdfium2._helpers.page.PdfPage) -> str: """Extract images from a PDF page and get the text using images_to_text. Args: page: The page object from which to extract images. Returns: str: The extracted text from the images on the page. """ if not self.images_parser: return "" import pypdfium2.raw as pdfium_c images = list(page.get_objects(filter=(pdfium_c.FPDF_PAGEOBJ_IMAGE,))) if not images: return "" str_images = [] for image in images: image_bytes = io.BytesIO() np_image = image.get_bitmap().to_numpy() if np_image.size < 3: continue numpy.save(image_bytes, image.get_bitmap().to_numpy()) blob = Blob.from_data(image_bytes.getvalue(), mime_type="application/x-npy") text_from_image = next( self.images_parser.lazy_parse(blob) # type: ignore ).page_content str_images.append( _format_inner_image(blob, text_from_image, self.images_inner_format) ) image.close() return _FORMAT_IMAGE_STR.format(image_text=_JOIN_IMAGES.join(str_images)) class PDFPlumberParser(BaseBlobParser): """Parse a blob from a PDF using `pdfplumber` library. This class provides methods to parse a blob from a PDF document, supporting various configurations such as handling password-protected PDFs, extracting images, and defining extraction mode. It integrates the 'pdfplumber' library for PDF processing and offers synchronous blob parsing. Examples: Setup: .. code-block:: bash pip install -U langchain-community pdfplumber Load a blob from a PDF file: .. code-block:: python from langchain_core.documents.base import Blob blob = Blob.from_path("./example_data/layout-parser-paper.pdf") Instantiate the parser: .. code-block:: python from langchain_community.document_loaders.parsers import PDFPlumberParser parser = PDFPlumberParser( # password = None, mode = "single", pages_delimiter = "\n\f", # extract_tables="markdown", ) Lazily parse the blob: .. code-block:: python docs = [] docs_lazy = parser.lazy_parse(blob) for doc in docs_lazy: docs.append(doc) print(docs[0].page_content[:100]) print(docs[0].metadata) """ def __init__( self, text_kwargs: Optional[Mapping[str, Any]] = None, dedupe: bool = False, extract_images: bool = False, *, password: Optional[str] = None, mode: Literal["single", "page"] = "page", pages_delimiter: str = _DEFAULT_PAGES_DELIMITER, images_parser: Optional[BaseImageBlobParser] = None, images_inner_format: Literal["text", "markdown-img", "html-img"] = "text", extract_tables: Optional[Literal["csv", "markdown", "html"]] = None, extract_tables_settings: Optional[dict[str, Any]] = None, ) -> None: """Initialize the parser. Args: password: Optional password for opening encrypted PDFs. mode: The extraction mode, either "single" for the entire document or "page" for page-wise extraction. pages_delimiter: A string delimiter to separate pages in single-mode extraction. extract_images: Whether to extract images from the PDF. images_parser: Optional image blob parser. images_inner_format: The format for the parsed output. - "text" = return the content as is - "markdown-img" = wrap the content into an image markdown link, w/ link pointing to (`![body)(#)`] - "html-img" = wrap the content as the `alt` text of an tag and link to (`{body}`) extract_tables: Whether to extract images from the PDF in a specific format, such as "csv", "markdown" or "html". text_kwargs: Keyword arguments to pass to ``pdfplumber.Page.extract_text()`` dedupe: Avoiding the error of duplicate characters if `dedupe=True` extract_tables_settings: Optional dictionary of settings for customizing table extraction. Returns: This method does not directly return data. Use the `parse` or `lazy_parse` methods to retrieve parsed documents with content and metadata. Raises: ValueError: If the `mode` is not "single" or "page". ValueError: If the `extract_tables` is not "csv", "markdown" or "html". """ super().__init__() if mode not in ["single", "page"]: raise ValueError("mode must be single or page") if extract_tables and extract_tables not in ["csv", "markdown", "html"]: raise ValueError("mode must be csv, markdown or html") if extract_images and not images_parser: images_parser = RapidOCRBlobParser() self.password = password self.extract_images = extract_images self.images_parser = images_parser self.images_inner_format = images_inner_format self.mode = mode self.pages_delimiter = pages_delimiter self.dedupe = dedupe self.text_kwargs = text_kwargs or {} self.extract_tables = extract_tables self.extract_tables_settings = extract_tables_settings or { "vertical_strategy": "lines", "horizontal_strategy": "lines", "snap_y_tolerance": 5, "intersection_x_tolerance": 15, } def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type] """ Lazily parse the blob. Insert image, if possible, between two paragraphs. In this way, a paragraph can be continued on the next page. Args: blob: The blob to parse. Raises: ImportError: If the `pypdf` package is not found. Yield: An iterator over the parsed documents. """ try: import pdfplumber except ImportError: raise ImportError( "pdfplumber package not found, please install it " "with `pip install pdfplumber`" ) with blob.as_bytes_io() as file_path: # type: ignore[attr-defined] doc = pdfplumber.open(file_path, password=self.password) # open document from pdfplumber.utils import geometry contents = [] # The legacy version, use CreationDate, Creator, etc. # The new 'standard' version must use lower case key. # This next line, merge the legecy keys and standard keys # in the same dictionary. # - The CreationDate is duplicate to `creationdate` with iso format. # - The Creator is duplicate to 'creator', etc. # With this strategy, the legacy code can continue to use CreationDate # or Creator. The new code, can use `creationdate` or `creator`. # _purge_metadata() convert and normalize the name and format of # the metadatas. doc_metadata = ( doc.metadata | # Legacy metdata with... _purge_metadata( ( doc.metadata # Add parser metdata | { # with more keys "source": blob.source, "file_path": blob.source, "total_pages": len(doc.pages), } ) ) ) for page in doc.pages: tables_bbox: list[tuple[float, float, float, float]] = ( self._extract_tables_bbox_from_page(page) ) tables_content = self._extract_tables_from_page(page) images_bbox = [geometry.obj_to_bbox(image) for image in page.images] image_from_page = self._extract_images_from_page(page) page_text = [] extras = [] for content in self._split_page_content( page, tables_bbox, tables_content, images_bbox, image_from_page, ): if isinstance(content, str): # Text page_text.append(content) elif isinstance(content, list): # Table page_text.append(_JOIN_TABLES + self._convert_table(content)) else: # Image if self.images_parser: try: from PIL import Image as Img Img.fromarray(content) # Check if image is valid image_bytes = io.BytesIO() numpy.save(image_bytes, content) blob = Blob.from_data( image_bytes.getvalue(), mime_type="application/x-npy", ) text_from_image = next( self.images_parser.lazy_parse(blob) # type: ignore ).page_content extras.append( _format_inner_image( blob, text_from_image, self.images_inner_format ) ) except TypeError: pass except EOFError: pass all_text = _merge_text_and_extras(extras, "".join(page_text).strip()) if self.mode == "page": # For legacy compatibility, add the last '\n'_ if not all_text.endswith("\n"): all_text += "\n" yield Document( page_content=all_text, metadata=_validate_metadata( doc_metadata | { "page": page.page_number - 1, } ), ) else: contents.append(all_text) if self.mode == "single": yield Document( page_content=self.pages_delimiter.join(contents), metadata=_validate_metadata(doc_metadata), ) def _process_page_content(self, page: pdfplumber.page.Page) -> str: """Process the page content based on dedupe. Args: page: The PDF page to process. Returns: The extracted text from the page. """ if self.dedupe: return page.dedupe_chars().extract_text(**self.text_kwargs) return page.extract_text(**self.text_kwargs) def _split_page_content( self, page: pdfplumber.page.Page, tables_bbox: list[tuple[float, float, float, float]], tables_content: list[list[list[Any]]], images_bbox: list[tuple[float, float, float, float]], images_content: list[np.ndarray], **kwargs: Any, ) -> Iterator[Union[str, list[list[str]], np.ndarray]]: """Split the page content into text, tables, and images. Args: page: The PDF page to process. tables_bbox: Bounding boxes of tables on the page. tables_content: Content of tables on the page. images_bbox: Bounding boxes of images on the page. images_content: Content of images on the page. **kwargs: Additional keyword arguments. Yields: An iterator over the split content (text, tables, images). """ from pdfplumber.utils import ( geometry, text, ) # Iterate over words. If a word is in a table, # yield the accumulated text, and the table # A the word is in a previously see table, ignore it # Finish with the accumulated text kwargs.update( { "keep_blank_chars": True, # "use_text_flow": True, "presorted": True, "layout_bbox": kwargs.get("layout_bbox") or page.cropbox, } ) chars = page.dedupe_chars().objects["char"] if self.dedupe else page.chars extractor = text.WordExtractor( **{k: kwargs[k] for k in text.WORD_EXTRACTOR_KWARGS if k in kwargs} ) wordmap = extractor.extract_wordmap(chars) extract_wordmaps: list[Any] = [] used_arrays = [False] * len(tables_bbox) for word, o in wordmap.tuples: is_table = False word_bbox = geometry.obj_to_bbox(word) for i, table_bbox in enumerate(tables_bbox): if geometry.get_bbox_overlap(word_bbox, table_bbox): # Find a world in a table is_table = True if not used_arrays[i]: # First time I see a word in this array # Yield the previous part if extract_wordmaps: new_wordmap = text.WordMap(tuples=extract_wordmaps) new_textmap = new_wordmap.to_textmap( **{ k: kwargs[k] for k in text.TEXTMAP_KWARGS if k in kwargs } ) yield new_textmap.to_string() extract_wordmaps.clear() # and yield the table used_arrays[i] = True # print(f"yield table {i}") yield tables_content[i] break if not is_table: # print(f' Add {word["text"]}') extract_wordmaps.append((word, o)) if extract_wordmaps: # Text after the array ? new_wordmap = text.WordMap(tuples=extract_wordmaps) new_textmap = new_wordmap.to_textmap( **{k: kwargs[k] for k in text.TEXTMAP_KWARGS if k in kwargs} ) # print(f"yield {new_textmap.to_string()}") yield new_textmap.to_string() # Add images- for content in images_content: yield content def _extract_images_from_page(self, page: pdfplumber.page.Page) -> list[np.ndarray]: """Extract images from a PDF page. Args: page: The PDF page to extract images from. Returns: A list of extracted images as numpy arrays. """ from PIL import Image if not self.images_parser: return [] images = [] for img in page.images: if "Filter" in img["stream"]: if img["stream"]["Filter"].name in _PDF_FILTER_WITHOUT_LOSS: images.append( np.frombuffer(img["stream"].get_data(), dtype=np.uint8).reshape( img["stream"]["Height"], img["stream"]["Width"], -1 ) ) elif img["stream"]["Filter"].name in _PDF_FILTER_WITH_LOSS: buf = np.frombuffer(img["stream"].get_data(), dtype=np.uint8) images.append( np.array(Image.open(io.BytesIO(buf.tobytes()))) # type: ignore ) else: logger.warning("Unknown PDF Filter!") return images def _extract_tables_bbox_from_page( self, page: pdfplumber.page.Page, ) -> list[tuple[float, float, float, float]]: """Extract bounding boxes of tables from a PDF page. Args: page: The PDF page to extract table bounding boxes from. Returns: A list of bounding boxes for tables on the page. """ if not self.extract_tables: return [] from pdfplumber.table import TableSettings table_settings = self.extract_tables_settings tset = TableSettings.resolve(table_settings) return [table.bbox for table in page.find_tables(tset)] def _extract_tables_from_page( self, page: pdfplumber.page.Page, ) -> list[list[list[Any]]]: """Extract tables from a PDF page. Args: page: The PDF page to extract tables from. Returns: A list of tables, where each table is a list of rows, and each row is a list of cell values. """ if not self.extract_tables: return [] table_settings = self.extract_tables_settings tables_list = page.extract_tables(table_settings) return tables_list def _convert_table(self, table: list[list[str]]) -> str: """Convert a table to the specified format. Args: table: The table to convert. Returns: The table content as a string in the specified format. """ format = self.extract_tables if format is None: return "" if format == "markdown": return self._convert_table_to_markdown(table) elif format == "html": return self._convert_table_to_html(table) elif format == "csv": return self._convert_table_to_csv(table) else: raise ValueError(f"Unknown table format: {format}") def _convert_table_to_csv(self, table: list[list[str]]) -> str: """Convert a table to CSV format. Args: table: The table to convert. Returns: The table content as a string in CSV format. Replace "\n" with " ". """ if not table: return "" output = ["\n\n"] # iterate over detail rows for row in table: line = "" for i, cell in enumerate(row): # output None cells with empty string cell = "" if cell is None else cell.replace("\n", " ") line += cell + "," output.append(line) return "\n".join(output) + "\n\n" def _convert_table_to_html(self, table: list[list[str]]) -> str: """ Convert table content as a string in HTML format. If clean is true, markdown syntax is removed from cell content. Args: table: The table to convert. Returns: The table content as a string in HTML format. """ if not len(table): return "" output = "\n" clean = True # iterate over detail rows for row in table: line = "" for i, cell in enumerate(row): # output None cells with empty string cell = "" if cell is None else cell.replace("\n", " ") if clean: # remove sensitive syntax cell = html.escape(cell.replace("-", "-")) line += "" line += "\n" output += line return output + "
" + cell + "
\n" def _convert_table_to_markdown(self, table: list[list[str]]) -> str: """Convert table content as a string in Github-markdown format. Args: table: The table to convert. Returns: The table content as a string in Markdown format. Replace "-" to "-" and "\n" to " ". """ clean = False if not table: return "" col_count = len(table[0]) output = "|" + "|".join("" for i in range(col_count)) + "|\n" output += "|" + "|".join("---" for i in range(col_count)) + "|\n" # skip first row in details if header is part of the table # iterate over detail rows for row in table: line = "|" for i, cell in enumerate(row): # output None cells with empty string cell = "" if cell is None else cell.replace("\n", " ") if clean: # remove sensitive syntax cell = html.escape(cell.replace("-", "-")) line += cell + "|" line += "\n" output += line return output + "\n" class AmazonTextractPDFParser(BaseBlobParser): """Send `PDF` files to `Amazon Textract` and parse them. For parsing multi-page PDFs, they have to reside on S3. The AmazonTextractPDFLoader calls the [Amazon Textract Service](https://aws.amazon.com/textract/) to convert PDFs into a Document structure. Single and multi-page documents are supported with up to 3000 pages and 512 MB of size. For the call to be successful an AWS account is required, similar to the [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html) requirements. Besides the AWS configuration, it is very similar to the other PDF loaders, while also supporting JPEG, PNG and TIFF and non-native PDF formats. ```python from langchain_community.document_loaders import AmazonTextractPDFLoader loader=AmazonTextractPDFLoader("example_data/alejandro_rosalez_sample-small.jpeg") documents = loader.load() ``` One feature is the linearization of the output. When using the features LAYOUT, FORMS or TABLES together with Textract ```python from langchain_community.document_loaders import AmazonTextractPDFLoader # you can mix and match each of the features loader=AmazonTextractPDFLoader( "example_data/alejandro_rosalez_sample-small.jpeg", textract_features=["TABLES", "LAYOUT"]) documents = loader.load() ``` it will generate output that formats the text in reading order and try to output the information in a tabular structure or output the key/value pairs with a colon (key: value). This helps most LLMs to achieve better accuracy when processing these texts. """ def __init__( self, textract_features: Optional[Sequence[int]] = None, client: Optional[Any] = None, *, linearization_config: Optional[TextLinearizationConfig] = None, ) -> None: """Initializes the parser. Args: textract_features: Features to be used for extraction, each feature should be passed as an int that conforms to the enum `Textract_Features`, see `amazon-textract-caller` pkg client: boto3 textract client linearization_config: Config to be used for linearization of the output should be an instance of TextLinearizationConfig from the `textractor` pkg """ try: import textractcaller as tc import textractor.entities.document as textractor self.tc = tc self.textractor = textractor if textract_features is not None: self.textract_features = [ tc.Textract_Features(f) for f in textract_features ] else: self.textract_features = [] if linearization_config is not None: self.linearization_config = linearization_config else: self.linearization_config = self.textractor.TextLinearizationConfig( hide_figure_layout=True, title_prefix="# ", section_header_prefix="## ", list_element_prefix="*", ) except ImportError: raise ImportError( "Could not import amazon-textract-caller or " "amazon-textract-textractor python package. Please install it " "with `pip install amazon-textract-caller` & " "`pip install amazon-textract-textractor`." ) if not client: try: import boto3 self.boto3_textract_client = boto3.client("textract") except ImportError: raise ImportError( "Could not import boto3 python package. " "Please install it with `pip install boto3`." ) else: self.boto3_textract_client = client def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type] """Iterates over the Blob pages and returns an Iterator with a Document for each page, like the other parsers If multi-page document, blob.path has to be set to the S3 URI and for single page docs the blob.data is taken """ url_parse_result = urlparse( str(blob.path)) if blob.path else None # type: ignore[attr-defined] # Either call with S3 path (multi-page) or with bytes (single-page) if ( url_parse_result and url_parse_result.scheme == "s3" and url_parse_result.netloc ): textract_response_json = self.tc.call_textract( input_document=str(blob.path), # type: ignore[attr-defined] features=self.textract_features, boto3_textract_client=self.boto3_textract_client, ) else: textract_response_json = self.tc.call_textract( input_document=blob.as_bytes(), # type: ignore[attr-defined] features=self.textract_features, call_mode=self.tc.Textract_Call_Mode.FORCE_SYNC, boto3_textract_client=self.boto3_textract_client, ) document = self.textractor.Document.open(textract_response_json) for idx, page in enumerate(document.pages): yield Document( page_content=page.get_text(config=self.linearization_config), metadata={"source": blob.source, "page": idx + 1}, # type: ignore[attr-defined] ) class DocumentIntelligenceParser(BaseBlobParser): """Loads a PDF with Azure Document Intelligence (formerly Form Recognizer) and chunks at character level.""" def __init__(self, client: Any, model: str): warnings.warn( "langchain_community.document_loaders.parsers.pdf.DocumentIntelligenceParser" "and langchain_community.document_loaders.pdf.DocumentIntelligenceLoader" " are deprecated. Please upgrade to " "langchain_community.document_loaders.DocumentIntelligenceLoader " "for any file parsing purpose using Azure Document Intelligence " "service." ) self.client = client self.model = model def _generate_docs(self, blob: Blob, result: Any) -> Iterator[ Document]: # type: ignore[valid-type] for p in result.pages: content = " ".join([line.content for line in p.lines]) d = Document( page_content=content, metadata={ "source": blob.source, # type: ignore[attr-defined] "page": p.page_number, }, ) yield d def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type] """Lazily parse the blob.""" with blob.as_bytes_io() as file_obj: # type: ignore[attr-defined] poller = self.client.begin_analyze_document(self.model, file_obj) result = poller.result() docs = self._generate_docs(blob, result) yield from docs