"""Module contains common parsers for PDFs.""" from __future__ import annotations import html import io import logging import threading import warnings from datetime import datetime from typing import ( TYPE_CHECKING, Any, Iterable, Iterator, Literal, Mapping, Optional, Sequence, Union, cast, ) from urllib.parse import urlparse import numpy import numpy as np from langchain_core.documents import Document from langchain_community.document_loaders.base import BaseBlobParser from langchain_community.document_loaders.blob_loaders import Blob from langchain_community.document_loaders.parsers.images import ( BaseImageBlobParser, RapidOCRBlobParser, ) if TYPE_CHECKING: import pdfminer import pdfplumber import pymupdf import pypdf import pypdfium2 from textractor.data.text_linearization_config import TextLinearizationConfig _PDF_FILTER_WITH_LOSS = ["DCTDecode", "DCT", "JPXDecode"] _PDF_FILTER_WITHOUT_LOSS = [ "LZWDecode", "LZW", "FlateDecode", "Fl", "ASCII85Decode", "A85", "ASCIIHexDecode", "AHx", "RunLengthDecode", "RL", "CCITTFaxDecode", "CCF", "JBIG2Decode", ] def extract_from_images_with_rapidocr( images: Sequence[Union[Iterable[np.ndarray], bytes]], ) -> str: """Extract text from images with RapidOCR. Args: images: Images to extract text from. Returns: Text extracted from images. Raises: ImportError: If `rapidocr-onnxruntime` package is not installed. """ try: from rapidocr_onnxruntime import RapidOCR except ImportError: raise ImportError( "`rapidocr-onnxruntime` package not found, please install it with " "`pip install rapidocr-onnxruntime`" ) ocr = RapidOCR() text = "" for img in images: result, _ = ocr(img) if result: result = [text[1] for text in result] text += "\n".join(result) return text logger = logging.getLogger(__name__) _FORMAT_IMAGE_STR = "\n\n{image_text}\n\n" _JOIN_IMAGES = "\n" _JOIN_TABLES = "\n" _DEFAULT_PAGES_DELIMITER = "\n\f" _STD_METADATA_KEYS = {"source", "total_pages", "creationdate", "creator", "producer"} def _format_inner_image(blob: Blob, content: str, format: str) -> str: """Format the content of the image with the source of the blob. blob: The blob containing the image. format:: The format for the parsed output. - "text" = return the content as is - "markdown-img" = wrap the content into an image markdown link, w/ link pointing to (`![body)(#)`] - "html-img" = wrap the content as the `alt` text of an tag and link to (`{body}`) """ if content: source = blob.source or "#" if format == "markdown-img": content = content.replace("]", r"\\]") content = f"![{content}]({source})" elif format == "html-img": content = ( f'{html.escape(content, quote=True)} ' f'src=' ) return content def _validate_metadata(metadata: dict[str, Any]) -> dict[str, Any]: """Validate that the metadata has all the standard keys and the page is an integer. The standard keys are: - source - total_page - creationdate - creator - producer Validate that page is an integer if it is present. """ if not _STD_METADATA_KEYS.issubset(metadata.keys()): raise ValueError("The PDF parser must valorize the standard metadata.") if not isinstance(metadata.get("page", 0), int): raise ValueError("The PDF metadata page must be a integer.") return metadata def _purge_metadata(metadata: dict[str, Any]) -> dict[str, Any]: """Purge metadata from unwanted keys and normalize key names. Args: metadata: The original metadata dictionary. Returns: The cleaned and normalized the key format of metadata dictionary. """ new_metadata: dict[str, Any] = {} map_key = { "page_count": "total_pages", "file_path": "source", } for k, v in metadata.items(): if type(v) not in [str, int]: v = str(v) if k.startswith("/"): k = k[1:] k = k.lower() if k in ["creationdate", "moddate"]: try: new_metadata[k] = datetime.strptime( v.replace("'", ""), "D:%Y%m%d%H%M%S%z" ).isoformat("T") except ValueError: new_metadata[k] = v elif k in map_key: # Normalize key with others PDF parser new_metadata[map_key[k]] = v new_metadata[k] = v elif isinstance(v, str): new_metadata[k] = v.strip() elif isinstance(v, int): new_metadata[k] = v return new_metadata _PARAGRAPH_DELIMITER = [ "\n\n\n", "\n\n", ] # To insert images or table in the middle of the page. def _merge_text_and_extras(extras: list[str], text_from_page: str) -> str: """Insert extras such as image/table in a text between two paragraphs if possible, else at the end of the text. Args: extras: List of extra content (images/tables) to insert. text_from_page: The text content from the page. Returns: The merged text with extras inserted. """ def _recurs_merge_text_and_extras( extras: list[str], text_from_page: str, recurs: bool ) -> Optional[str]: if extras: for delim in _PARAGRAPH_DELIMITER: pos = text_from_page.rfind(delim) if pos != -1: # search penultimate, to bypass an error in footer previous_text = None if recurs: previous_text = _recurs_merge_text_and_extras( extras, text_from_page[:pos], False ) if previous_text: all_text = previous_text + text_from_page[pos:] else: all_extras = "" str_extras = "\n\n".join(filter(lambda x: x, extras)) if str_extras: all_extras = delim + str_extras all_text = ( text_from_page[:pos] + all_extras + text_from_page[pos:] ) break else: all_text = None else: all_text = text_from_page return all_text all_text = _recurs_merge_text_and_extras(extras, text_from_page, True) if not all_text: all_extras = "" str_extras = "\n\n".join(filter(lambda x: x, extras)) if str_extras: all_extras = _PARAGRAPH_DELIMITER[-1] + str_extras all_text = text_from_page + all_extras return all_text class PyPDFParser(BaseBlobParser): """Parse a blob from a PDF using `pypdf` library. This class provides methods to parse a blob from a PDF document, supporting various configurations such as handling password-protected PDFs, extracting images. It integrates the 'pypdf' library for PDF processing and offers synchronous blob parsing. Examples: Setup: .. code-block:: bash pip install -U langchain-community pypdf Load a blob from a PDF file: .. code-block:: python from langchain_core.documents.base import Blob blob = Blob.from_path("./example_data/layout-parser-paper.pdf") Instantiate the parser: .. code-block:: python from langchain_community.document_loaders.parsers import PyPDFParser parser = PyPDFParser( # password = None, mode = "single", pages_delimiter = "\n\f", # extract_images = True, # images_parser = TesseractBlobParser(), ) Lazily parse the blob: .. code-block:: python docs = [] docs_lazy = parser.lazy_parse(blob) for doc in docs_lazy: docs.append(doc) print(docs[0].page_content[:100]) print(docs[0].metadata) """ def __init__( self, password: Optional[Union[str, bytes]] = None, extract_images: bool = False, *, mode: Literal["single", "page"] = "page", pages_delimiter: str = _DEFAULT_PAGES_DELIMITER, images_parser: Optional[BaseImageBlobParser] = None, images_inner_format: Literal["text", "markdown-img", "html-img"] = "text", extraction_mode: Literal["plain", "layout"] = "plain", extraction_kwargs: Optional[dict[str, Any]] = None, ): """Initialize a parser based on PyPDF. Args: password: Optional password for opening encrypted PDFs. extract_images: Whether to extract images from the PDF. mode: The extraction mode, either "single" for the entire document or "page" for page-wise extraction. pages_delimiter: A string delimiter to separate pages in single-mode extraction. images_parser: Optional image blob parser. images_inner_format: The format for the parsed output. - "text" = return the content as is - "markdown-img" = wrap the content into an image markdown link, w/ link pointing to (`![body)(#)`] - "html-img" = wrap the content as the `alt` text of an tag and link to (`{body}`) extraction_mode: “plain” for legacy functionality, “layout” extract text in a fixed width format that closely adheres to the rendered layout in the source pdf. extraction_kwargs: Optional additional parameters for the extraction process. Raises: ValueError: If the `mode` is not "single" or "page". """ super().__init__() if mode not in ["single", "page"]: raise ValueError("mode must be single or page") self.extract_images = extract_images if extract_images and not images_parser: images_parser = RapidOCRBlobParser() self.images_parser = images_parser self.images_inner_format = images_inner_format self.password = password self.mode = mode self.pages_delimiter = pages_delimiter self.extraction_mode = extraction_mode self.extraction_kwargs = extraction_kwargs or {} def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type] """ Lazily parse the blob. Insert image, if possible, between two paragraphs. In this way, a paragraph can be continued on the next page. Args: blob: The blob to parse. Raises: ImportError: If the `pypdf` package is not found. Yield: An iterator over the parsed documents. """ try: import pypdf except ImportError: raise ImportError( "pypdf package not found, please install it with `pip install pypdf`" ) def _extract_text_from_page(page: pypdf.PageObject) -> str: """ Extract text from image given the version of pypdf. Args: page: The page object to extract text from. Returns: str: The extracted text. """ if pypdf.__version__.startswith("3"): return page.extract_text() else: return page.extract_text( extraction_mode=self.extraction_mode, **self.extraction_kwargs, ) with blob.as_bytes_io() as pdf_file_obj: # type: ignore[attr-defined] pdf_reader = pypdf.PdfReader(pdf_file_obj, password=self.password) doc_metadata = _purge_metadata( {"producer": "PyPDF", "creator": "PyPDF", "creationdate": ""} | cast(dict, pdf_reader.metadata or {}) | { "source": blob.source, "total_pages": len(pdf_reader.pages), } ) single_texts = [] for page_number, page in enumerate(pdf_reader.pages): text_from_page = _extract_text_from_page(page=page) images_from_page = self.extract_images_from_page(page) all_text = _merge_text_and_extras( [images_from_page], text_from_page ).strip() if self.mode == "page": yield Document( page_content=all_text, metadata=_validate_metadata( doc_metadata | { "page": page_number, "page_label": pdf_reader.page_labels[page_number], } ), ) else: single_texts.append(all_text) if self.mode == "single": yield Document( page_content=self.pages_delimiter.join(single_texts), metadata=_validate_metadata(doc_metadata), ) def extract_images_from_page(self, page: pypdf._page.PageObject) -> str: """Extract images from a PDF page and get the text using images_to_text. Args: page: The page object from which to extract images. Returns: str: The extracted text from the images on the page. """ if not self.images_parser: return "" from PIL import Image if "/XObject" not in cast(dict, page["/Resources"]).keys(): return "" xObject = page["/Resources"]["/XObject"].get_object() # type: ignore[index] images = [] for obj in xObject: np_image: Any = None if xObject[obj]["/Subtype"] == "/Image": if xObject[obj]["/Filter"][1:] in _PDF_FILTER_WITHOUT_LOSS: height, width = xObject[obj]["/Height"], xObject[obj]["/Width"] np_image = np.frombuffer( xObject[obj].get_data(), dtype=np.uint8 ).reshape(height, width, -1) elif xObject[obj]["/Filter"][1:] in _PDF_FILTER_WITH_LOSS: np_image = np.array(Image.open(io.BytesIO(xObject[obj].get_data()))) else: logger.warning("Unknown PDF Filter!") if np_image is not None: image_bytes = io.BytesIO() Image.fromarray(np_image).save(image_bytes, format="PNG") blob = Blob.from_data(image_bytes.getvalue(), mime_type="image/png") image_text = next(self.images_parser.lazy_parse(blob)).page_content images.append( _format_inner_image(blob, image_text, self.images_inner_format) ) return _FORMAT_IMAGE_STR.format( image_text=_JOIN_IMAGES.join(filter(None, images)) ) class PDFMinerParser(BaseBlobParser): """Parse `PDF` using `PDFMiner`.""" def __init__(self, extract_images: bool = False, *, concatenate_pages: bool = True): """Initialize a parser based on PDFMiner. Args: extract_images: Whether to extract images from PDF. concatenate_pages: If True, concatenate all PDF pages into one a single document. Otherwise, return one document per page. """ self.extract_images = extract_images self.concatenate_pages = concatenate_pages def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type] """Lazily parse the blob.""" if not self.extract_images: try: from pdfminer.high_level import extract_text except ImportError: raise ImportError( "`pdfminer` package not found, please install it with " "`pip install pdfminer.six`" ) with blob.as_bytes_io() as pdf_file_obj: # type: ignore[attr-defined] if self.concatenate_pages: text = extract_text(pdf_file_obj) metadata = {"source": blob.source} # type: ignore[attr-defined] yield Document(page_content=text, metadata=metadata) else: from pdfminer.pdfpage import PDFPage pages = PDFPage.get_pages(pdf_file_obj) for i, _ in enumerate(pages): text = extract_text(pdf_file_obj, page_numbers=[i]) metadata = {"source": blob.source, "page": str(i)} # type: ignore[attr-defined] yield Document(page_content=text, metadata=metadata) else: import io from pdfminer.converter import PDFPageAggregator, TextConverter from pdfminer.layout import LAParams from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager from pdfminer.pdfpage import PDFPage text_io = io.StringIO() with blob.as_bytes_io() as pdf_file_obj: # type: ignore[attr-defined] pages = PDFPage.get_pages(pdf_file_obj) rsrcmgr = PDFResourceManager() device_for_text = TextConverter(rsrcmgr, text_io, laparams=LAParams()) device_for_image = PDFPageAggregator(rsrcmgr, laparams=LAParams()) interpreter_for_text = PDFPageInterpreter(rsrcmgr, device_for_text) interpreter_for_image = PDFPageInterpreter(rsrcmgr, device_for_image) for i, page in enumerate(pages): interpreter_for_text.process_page(page) interpreter_for_image.process_page(page) content = text_io.getvalue() + self._extract_images_from_page( device_for_image.get_result() ) text_io.truncate(0) text_io.seek(0) metadata = {"source": blob.source, "page": str(i)} # type: ignore[attr-defined] yield Document(page_content=content, metadata=metadata) def _extract_images_from_page(self, page: pdfminer.layout.LTPage) -> str: """Extract images from page and get the text with RapidOCR.""" import pdfminer def get_image(layout_object: Any) -> Any: if isinstance(layout_object, pdfminer.layout.LTImage): return layout_object if isinstance(layout_object, pdfminer.layout.LTContainer): for child in layout_object: return get_image(child) else: return None images = [] for img in filter(bool, map(get_image, page)): img_filter = img.stream["Filter"] if isinstance(img_filter, list): filter_names = [f.name for f in img_filter] else: filter_names = [img_filter.name] without_loss = any( name in _PDF_FILTER_WITHOUT_LOSS for name in filter_names ) with_loss = any(name in _PDF_FILTER_WITH_LOSS for name in filter_names) non_matching = {name for name in filter_names} - { *_PDF_FILTER_WITHOUT_LOSS, *_PDF_FILTER_WITH_LOSS, } if without_loss and with_loss: warnings.warn( "Image has both lossy and lossless filters. Defaulting to lossless" ) if non_matching: warnings.warn(f"Unknown PDF Filter(s): {non_matching}") if without_loss: images.append( np.frombuffer(img.stream.get_data(), dtype=np.uint8).reshape( img.stream["Height"], img.stream["Width"], -1 ) ) elif with_loss: images.append(img.stream.get_data()) return extract_from_images_with_rapidocr(images) class PyMuPDFParser(BaseBlobParser): """Parse a blob from a PDF using `PyMuPDF` library. This class provides methods to parse a blob from a PDF document, supporting various configurations such as handling password-protected PDFs, extracting images, and defining extraction mode. It integrates the 'PyMuPDF' library for PDF processing and offers synchronous blob parsing. Examples: Setup: .. code-block:: bash pip install -U langchain-community pymupdf Load a blob from a PDF file: .. code-block:: python from langchain_core.documents.base import Blob blob = Blob.from_path("./example_data/layout-parser-paper.pdf") Instantiate the parser: .. code-block:: python from langchain_community.document_loaders.parsers import PyMuPDFParser parser = PyMuPDFParser( # password = None, mode = "single", pages_delimiter = "\n\f", # extract_images = True, # images_parser = TesseractBlobParser(), # extract_tables="markdown", # extract_tables_settings=None, # text_kwargs=None, ) Lazily parse the blob: .. code-block:: python docs = [] docs_lazy = parser.lazy_parse(blob) for doc in docs_lazy: docs.append(doc) print(docs[0].page_content[:100]) print(docs[0].metadata) """ # PyMuPDF is not thread safe. # See https://pymupdf.readthedocs.io/en/latest/recipes-multiprocessing.html _lock = threading.Lock() def __init__( self, text_kwargs: Optional[dict[str, Any]] = None, extract_images: bool = False, *, password: Optional[str] = None, mode: Literal["single", "page"] = "page", pages_delimiter: str = _DEFAULT_PAGES_DELIMITER, images_parser: Optional[BaseImageBlobParser] = None, images_inner_format: Literal["text", "markdown-img", "html-img"] = "text", extract_tables: Union[Literal["csv", "markdown", "html"], None] = None, extract_tables_settings: Optional[dict[str, Any]] = None, ) -> None: """Initialize a parser based on PyMuPDF. Args: password: Optional password for opening encrypted PDFs. mode: The extraction mode, either "single" for the entire document or "page" for page-wise extraction. pages_delimiter: A string delimiter to separate pages in single-mode extraction. extract_images: Whether to extract images from the PDF. images_parser: Optional image blob parser. images_inner_format: The format for the parsed output. - "text" = return the content as is - "markdown-img" = wrap the content into an image markdown link, w/ link pointing to (`![body)(#)`] - "html-img" = wrap the content as the `alt` text of an tag and link to (`{body}`) extract_tables: Whether to extract tables in a specific format, such as "csv", "markdown", or "html". extract_tables_settings: Optional dictionary of settings for customizing table extraction. Returns: This method does not directly return data. Use the `parse` or `lazy_parse` methods to retrieve parsed documents with content and metadata. Raises: ValueError: If the mode is not "single" or "page". ValueError: If the extract_tables format is not "markdown", "html", or "csv". """ super().__init__() if mode not in ["single", "page"]: raise ValueError("mode must be single or page") if extract_tables and extract_tables not in ["markdown", "html", "csv"]: raise ValueError("mode must be markdown") self.mode = mode self.pages_delimiter = pages_delimiter self.password = password self.text_kwargs = text_kwargs or {} if extract_images and not images_parser: images_parser = RapidOCRBlobParser() self.extract_images = extract_images self.images_inner_format = images_inner_format self.images_parser = images_parser self.extract_tables = extract_tables self.extract_tables_settings = extract_tables_settings def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type] return self._lazy_parse( blob, ) def _lazy_parse( self, blob: Blob, # text-kwargs is present for backwards compatibility. # Users should not use it directly. text_kwargs: Optional[dict[str, Any]] = None, ) -> Iterator[Document]: # type: ignore[valid-type] """Lazily parse the blob. Insert image, if possible, between two paragraphs. In this way, a paragraph can be continued on the next page. Args: blob: The blob to parse. text_kwargs: Optional keyword arguments to pass to the `get_text` method. If provided at run time, it will override the default text_kwargs. Raises: ImportError: If the `pypdf` package is not found. Yield: An iterator over the parsed documents. """ try: import pymupdf text_kwargs = text_kwargs or self.text_kwargs if not self.extract_tables_settings: from pymupdf.table import ( DEFAULT_JOIN_TOLERANCE, DEFAULT_MIN_WORDS_HORIZONTAL, DEFAULT_MIN_WORDS_VERTICAL, DEFAULT_SNAP_TOLERANCE, ) self.extract_tables_settings = { # See https://pymupdf.readthedocs.io/en/latest/page.html#Page.find_tables "clip": None, "vertical_strategy": "lines", "horizontal_strategy": "lines", "vertical_lines": None, "horizontal_lines": None, "snap_tolerance": DEFAULT_SNAP_TOLERANCE, "snap_x_tolerance": None, "snap_y_tolerance": None, "join_tolerance": DEFAULT_JOIN_TOLERANCE, "join_x_tolerance": None, "join_y_tolerance": None, "edge_min_length": 3, "min_words_vertical": DEFAULT_MIN_WORDS_VERTICAL, "min_words_horizontal": DEFAULT_MIN_WORDS_HORIZONTAL, "intersection_tolerance": 3, "intersection_x_tolerance": None, "intersection_y_tolerance": None, "text_tolerance": 3, "text_x_tolerance": 3, "text_y_tolerance": 3, "strategy": None, # offer abbreviation "add_lines": None, # optional user-specified lines } except ImportError: raise ImportError( "pymupdf package not found, please install it " "with `pip install pymupdf`" ) with PyMuPDFParser._lock: with blob.as_bytes_io() as file_path: # type: ignore[attr-defined] if blob.data is None: # type: ignore[attr-defined] doc = pymupdf.open(file_path) else: doc = pymupdf.open(stream=file_path, filetype="pdf") if doc.is_encrypted: doc.authenticate(self.password) doc_metadata = self._extract_metadata(doc, blob) full_content = [] for page in doc: all_text = self._get_page_content(doc, page, text_kwargs).strip() if self.mode == "page": yield Document( page_content=all_text, metadata=_validate_metadata( doc_metadata | {"page": page.number} ), ) else: full_content.append(all_text) if self.mode == "single": yield Document( page_content=self.pages_delimiter.join(full_content), metadata=_validate_metadata(doc_metadata), ) def _get_page_content( self, doc: pymupdf.Document, page: pymupdf.Page, text_kwargs: dict[str, Any], ) -> str: """Get the text of the page using PyMuPDF and RapidOCR and issue a warning if it is empty. Args: doc: The PyMuPDF document object. page: The PyMuPDF page object. blob: The blob being parsed. Returns: str: The text content of the page. """ text_from_page = page.get_text(**{**self.text_kwargs, **text_kwargs}) images_from_page = self._extract_images_from_page(doc, page) tables_from_page = self._extract_tables_from_page(page) extras = [] if images_from_page: extras.append(images_from_page) if tables_from_page: extras.append(tables_from_page) all_text = _merge_text_and_extras(extras, text_from_page) return all_text def _extract_metadata(self, doc: pymupdf.Document, blob: Blob) -> dict: """Extract metadata from the document and page. Args: doc: The PyMuPDF document object. blob: The blob being parsed. Returns: dict: The extracted metadata. """ return _purge_metadata( dict( { "producer": "PyMuPDF", "creator": "PyMuPDF", "creationdate": "", "source": blob.source, # type: ignore[attr-defined] "file_path": blob.source, # type: ignore[attr-defined] "total_pages": len(doc), }, **{ k: doc.metadata[k] for k in doc.metadata if isinstance(doc.metadata[k], (str, int)) }, ) ) def _extract_images_from_page( self, doc: pymupdf.Document, page: pymupdf.Page ) -> str: """Extract images from a PDF page and get the text using images_to_text. Args: doc: The PyMuPDF document object. page: The PyMuPDF page object. Returns: str: The extracted text from the images on the page. """ if not self.images_parser: return "" import pymupdf img_list = page.get_images() images = [] for img in img_list: if self.images_parser: xref = img[0] pix = pymupdf.Pixmap(doc, xref) image = np.frombuffer(pix.samples, dtype=np.uint8).reshape( pix.height, pix.width, -1 ) image_bytes = io.BytesIO() numpy.save(image_bytes, image) blob = Blob.from_data( image_bytes.getvalue(), mime_type="application/x-npy" ) image_text = next(self.images_parser.lazy_parse(blob)).page_content images.append( _format_inner_image(blob, image_text, self.images_inner_format) ) return _FORMAT_IMAGE_STR.format( image_text=_JOIN_IMAGES.join(filter(None, images)) ) def _extract_tables_from_page(self, page: pymupdf.Page) -> str: """Extract tables from a PDF page. Args: page: The PyMuPDF page object. Returns: str: The extracted tables in the specified format. """ if self.extract_tables is None: return "" import pymupdf tables_list = list( pymupdf.table.find_tables(page, **self.extract_tables_settings) ) if tables_list: if self.extract_tables == "markdown": return _JOIN_TABLES.join([table.to_markdown() for table in tables_list]) elif self.extract_tables == "html": return _JOIN_TABLES.join( [ table.to_pandas().to_html( header=False, index=False, bold_rows=False, ) for table in tables_list ] ) elif self.extract_tables == "csv": return _JOIN_TABLES.join( [ table.to_pandas().to_csv( header=False, index=False, ) for table in tables_list ] ) else: raise ValueError( f"extract_tables {self.extract_tables} not implemented" ) return "" class PyPDFium2Parser(BaseBlobParser): """Parse `PDF` with `PyPDFium2`.""" def __init__(self, extract_images: bool = False) -> None: """Initialize the parser.""" try: import pypdfium2 # noqa:F401 except ImportError: raise ImportError( "pypdfium2 package not found, please install it with" " `pip install pypdfium2`" ) self.extract_images = extract_images def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type] """Lazily parse the blob.""" import pypdfium2 # pypdfium2 is really finicky with respect to closing things, # if done incorrectly creates seg faults. with blob.as_bytes_io() as file_path: # type: ignore[attr-defined] pdf_reader = pypdfium2.PdfDocument(file_path, autoclose=True) try: for page_number, page in enumerate(pdf_reader): text_page = page.get_textpage() content = text_page.get_text_range() text_page.close() content += "\n" + self._extract_images_from_page(page) page.close() metadata = {"source": blob.source, "page": page_number} # type: ignore[attr-defined] yield Document(page_content=content, metadata=metadata) finally: pdf_reader.close() def _extract_images_from_page(self, page: pypdfium2._helpers.page.PdfPage) -> str: """Extract images from page and get the text with RapidOCR.""" if not self.extract_images: return "" import pypdfium2.raw as pdfium_c images = list(page.get_objects(filter=(pdfium_c.FPDF_PAGEOBJ_IMAGE,))) images = list(map(lambda x: x.get_bitmap().to_numpy(), images)) return extract_from_images_with_rapidocr(images) class PDFPlumberParser(BaseBlobParser): """Parse `PDF` with `PDFPlumber`.""" def __init__( self, text_kwargs: Optional[Mapping[str, Any]] = None, dedupe: bool = False, extract_images: bool = False, ) -> None: """Initialize the parser. Args: text_kwargs: Keyword arguments to pass to ``pdfplumber.Page.extract_text()`` dedupe: Avoiding the error of duplicate characters if `dedupe=True`. """ try: import PIL # noqa:F401 except ImportError: raise ImportError( "pillow package not found, please install it with" " `pip install pillow`" ) self.text_kwargs = text_kwargs or {} self.dedupe = dedupe self.extract_images = extract_images def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type] """Lazily parse the blob.""" import pdfplumber with blob.as_bytes_io() as file_path: # type: ignore[attr-defined] doc = pdfplumber.open(file_path) # open document yield from [ Document( page_content=self._process_page_content(page) + "\n" + self._extract_images_from_page(page), metadata=dict( { "source": blob.source, # type: ignore[attr-defined] "file_path": blob.source, # type: ignore[attr-defined] "page": page.page_number - 1, "total_pages": len(doc.pages), }, **{ k: doc.metadata[k] for k in doc.metadata if type(doc.metadata[k]) in [str, int] }, ), ) for page in doc.pages ] def _process_page_content(self, page: pdfplumber.page.Page) -> str: """Process the page content based on dedupe.""" if self.dedupe: return page.dedupe_chars().extract_text(**self.text_kwargs) return page.extract_text(**self.text_kwargs) def _extract_images_from_page(self, page: pdfplumber.page.Page) -> str: """Extract images from page and get the text with RapidOCR.""" from PIL import Image if not self.extract_images: return "" images = [] for img in page.images: if img["stream"]["Filter"].name in _PDF_FILTER_WITHOUT_LOSS: if img["stream"]["BitsPerComponent"] == 1: images.append( np.array( Image.frombytes( "1", (img["stream"]["Width"], img["stream"]["Height"]), img["stream"].get_data(), ).convert("L") ) ) else: images.append( np.frombuffer(img["stream"].get_data(), dtype=np.uint8).reshape( img["stream"]["Height"], img["stream"]["Width"], -1 ) ) elif img["stream"]["Filter"].name in _PDF_FILTER_WITH_LOSS: images.append(img["stream"].get_data()) else: warnings.warn("Unknown PDF Filter!") return extract_from_images_with_rapidocr(images) class AmazonTextractPDFParser(BaseBlobParser): """Send `PDF` files to `Amazon Textract` and parse them. For parsing multi-page PDFs, they have to reside on S3. The AmazonTextractPDFLoader calls the [Amazon Textract Service](https://aws.amazon.com/textract/) to convert PDFs into a Document structure. Single and multi-page documents are supported with up to 3000 pages and 512 MB of size. For the call to be successful an AWS account is required, similar to the [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html) requirements. Besides the AWS configuration, it is very similar to the other PDF loaders, while also supporting JPEG, PNG and TIFF and non-native PDF formats. ```python from langchain_community.document_loaders import AmazonTextractPDFLoader loader=AmazonTextractPDFLoader("example_data/alejandro_rosalez_sample-small.jpeg") documents = loader.load() ``` One feature is the linearization of the output. When using the features LAYOUT, FORMS or TABLES together with Textract ```python from langchain_community.document_loaders import AmazonTextractPDFLoader # you can mix and match each of the features loader=AmazonTextractPDFLoader( "example_data/alejandro_rosalez_sample-small.jpeg", textract_features=["TABLES", "LAYOUT"]) documents = loader.load() ``` it will generate output that formats the text in reading order and try to output the information in a tabular structure or output the key/value pairs with a colon (key: value). This helps most LLMs to achieve better accuracy when processing these texts. """ def __init__( self, textract_features: Optional[Sequence[int]] = None, client: Optional[Any] = None, *, linearization_config: Optional[TextLinearizationConfig] = None, ) -> None: """Initializes the parser. Args: textract_features: Features to be used for extraction, each feature should be passed as an int that conforms to the enum `Textract_Features`, see `amazon-textract-caller` pkg client: boto3 textract client linearization_config: Config to be used for linearization of the output should be an instance of TextLinearizationConfig from the `textractor` pkg """ try: import textractcaller as tc import textractor.entities.document as textractor self.tc = tc self.textractor = textractor if textract_features is not None: self.textract_features = [ tc.Textract_Features(f) for f in textract_features ] else: self.textract_features = [] if linearization_config is not None: self.linearization_config = linearization_config else: self.linearization_config = self.textractor.TextLinearizationConfig( hide_figure_layout=True, title_prefix="# ", section_header_prefix="## ", list_element_prefix="*", ) except ImportError: raise ImportError( "Could not import amazon-textract-caller or " "amazon-textract-textractor python package. Please install it " "with `pip install amazon-textract-caller` & " "`pip install amazon-textract-textractor`." ) if not client: try: import boto3 self.boto3_textract_client = boto3.client("textract") except ImportError: raise ImportError( "Could not import boto3 python package. " "Please install it with `pip install boto3`." ) else: self.boto3_textract_client = client def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type] """Iterates over the Blob pages and returns an Iterator with a Document for each page, like the other parsers If multi-page document, blob.path has to be set to the S3 URI and for single page docs the blob.data is taken """ url_parse_result = urlparse(str(blob.path)) if blob.path else None # type: ignore[attr-defined] # Either call with S3 path (multi-page) or with bytes (single-page) if ( url_parse_result and url_parse_result.scheme == "s3" and url_parse_result.netloc ): textract_response_json = self.tc.call_textract( input_document=str(blob.path), # type: ignore[attr-defined] features=self.textract_features, boto3_textract_client=self.boto3_textract_client, ) else: textract_response_json = self.tc.call_textract( input_document=blob.as_bytes(), # type: ignore[attr-defined] features=self.textract_features, call_mode=self.tc.Textract_Call_Mode.FORCE_SYNC, boto3_textract_client=self.boto3_textract_client, ) document = self.textractor.Document.open(textract_response_json) for idx, page in enumerate(document.pages): yield Document( page_content=page.get_text(config=self.linearization_config), metadata={"source": blob.source, "page": idx + 1}, # type: ignore[attr-defined] ) class DocumentIntelligenceParser(BaseBlobParser): """Loads a PDF with Azure Document Intelligence (formerly Form Recognizer) and chunks at character level.""" def __init__(self, client: Any, model: str): warnings.warn( "langchain_community.document_loaders.parsers.pdf.DocumentIntelligenceParser" "and langchain_community.document_loaders.pdf.DocumentIntelligenceLoader" " are deprecated. Please upgrade to " "langchain_community.document_loaders.DocumentIntelligenceLoader " "for any file parsing purpose using Azure Document Intelligence " "service." ) self.client = client self.model = model def _generate_docs(self, blob: Blob, result: Any) -> Iterator[Document]: # type: ignore[valid-type] for p in result.pages: content = " ".join([line.content for line in p.lines]) d = Document( page_content=content, metadata={ "source": blob.source, # type: ignore[attr-defined] "page": p.page_number, }, ) yield d def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type] """Lazily parse the blob.""" with blob.as_bytes_io() as file_obj: # type: ignore[attr-defined] poller = self.client.begin_analyze_document(self.model, file_obj) result = poller.result() docs = self._generate_docs(blob, result) yield from docs