community[major], core[patch], langchain[patch], experimental[patch]: Create langchain-community (#14463)
Moved the following modules to new package langchain-community in a backwards compatible fashion:

```
mv langchain/langchain/adapters community/langchain_community
mv langchain/langchain/callbacks community/langchain_community/callbacks
mv langchain/langchain/chat_loaders community/langchain_community
mv langchain/langchain/chat_models community/langchain_community
mv langchain/langchain/document_loaders community/langchain_community
mv langchain/langchain/docstore community/langchain_community
mv langchain/langchain/document_transformers community/langchain_community
mv langchain/langchain/embeddings community/langchain_community
mv langchain/langchain/graphs community/langchain_community
mv langchain/langchain/llms community/langchain_community
mv langchain/langchain/memory/chat_message_histories community/langchain_community
mv langchain/langchain/retrievers community/langchain_community
mv langchain/langchain/storage community/langchain_community
mv langchain/langchain/tools community/langchain_community
mv langchain/langchain/utilities community/langchain_community
mv langchain/langchain/vectorstores community/langchain_community
mv langchain/langchain/agents/agent_toolkits community/langchain_community
mv langchain/langchain/cache.py community/langchain_community
```

Moved the following to core:

```
mv langchain/langchain/utils/json_schema.py core/langchain_core/utils
mv langchain/langchain/utils/html.py core/langchain_core/utils
mv langchain/langchain/utils/strings.py core/langchain_core/utils
cat langchain/langchain/utils/env.py >> core/langchain_core/utils/env.py
rm langchain/langchain/utils/env.py
```

See .scripts/community_split/script_integrations.sh for all changes
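Because the move is backwards compatible, existing imports from `langchain` keep working while the new `langchain_community` paths become the canonical ones. A minimal sketch of the before/after import paths (the chat model class is just an illustrative example):

```python
# New canonical location after the split
from langchain_community.chat_models import ChatOpenAI

# Old location still resolves for backwards compatibility (re-exported from langchain)
from langchain.chat_models import ChatOpenAI  # noqa: F811
```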
@@ -0,0 +1,573 @@
"""Module contains common parsers for PDFs."""
from __future__ import annotations

import warnings
from typing import (
    TYPE_CHECKING,
    Any,
    Iterable,
    Iterator,
    Mapping,
    Optional,
    Sequence,
    Union,
)
from urllib.parse import urlparse

import numpy as np
from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob

if TYPE_CHECKING:
    import fitz.fitz
    import pdfminer.layout
    import pdfplumber.page
    import pypdf._page
    import pypdfium2._helpers.page


_PDF_FILTER_WITH_LOSS = ["DCTDecode", "DCT", "JPXDecode"]
_PDF_FILTER_WITHOUT_LOSS = [
    "LZWDecode",
    "LZW",
    "FlateDecode",
    "Fl",
    "ASCII85Decode",
    "A85",
    "ASCIIHexDecode",
    "AHx",
    "RunLengthDecode",
    "RL",
    "CCITTFaxDecode",
    "CCF",
    "JBIG2Decode",
]


def extract_from_images_with_rapidocr(
    images: Sequence[Union[Iterable[np.ndarray], bytes]],
) -> str:
    """Extract text from images with RapidOCR.

    Args:
        images: Images to extract text from.

    Returns:
        Text extracted from images.

    Raises:
        ImportError: If `rapidocr-onnxruntime` package is not installed.
    """
    try:
        from rapidocr_onnxruntime import RapidOCR
    except ImportError:
        raise ImportError(
            "`rapidocr-onnxruntime` package not found, please install it with "
            "`pip install rapidocr-onnxruntime`"
        )
    ocr = RapidOCR()
    text = ""
    for img in images:
        result, _ = ocr(img)
        if result:
            result = [text[1] for text in result]
            text += "\n".join(result)
    return text
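A minimal usage sketch for the helper above, not part of the module itself. The image path is hypothetical, and `rapidocr-onnxruntime` must be installed:

```python
from langchain_community.document_loaders.parsers.pdf import (
    extract_from_images_with_rapidocr,
)

# Hypothetical example: OCR a single image passed as raw bytes.
with open("example_data/receipt.png", "rb") as f:
    image_bytes = f.read()

text = extract_from_images_with_rapidocr([image_bytes])
print(text)
```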

class PyPDFParser(BaseBlobParser):
    """Load `PDF` using `pypdf`"""

    def __init__(
        self, password: Optional[Union[str, bytes]] = None, extract_images: bool = False
    ):
        self.password = password
        self.extract_images = extract_images

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob."""
        import pypdf

        with blob.as_bytes_io() as pdf_file_obj:
            pdf_reader = pypdf.PdfReader(pdf_file_obj, password=self.password)
            yield from [
                Document(
                    page_content=page.extract_text()
                    + self._extract_images_from_page(page),
                    metadata={"source": blob.source, "page": page_number},
                )
                for page_number, page in enumerate(pdf_reader.pages)
            ]

    def _extract_images_from_page(self, page: pypdf._page.PageObject) -> str:
        """Extract images from page and get the text with RapidOCR."""
        if not self.extract_images or "/XObject" not in page["/Resources"].keys():
            return ""

        xObject = page["/Resources"]["/XObject"].get_object()  # type: ignore
        images = []
        for obj in xObject:
            if xObject[obj]["/Subtype"] == "/Image":
                if xObject[obj]["/Filter"][1:] in _PDF_FILTER_WITHOUT_LOSS:
                    height, width = xObject[obj]["/Height"], xObject[obj]["/Width"]

                    images.append(
                        np.frombuffer(xObject[obj].get_data(), dtype=np.uint8).reshape(
                            height, width, -1
                        )
                    )
                elif xObject[obj]["/Filter"][1:] in _PDF_FILTER_WITH_LOSS:
                    images.append(xObject[obj].get_data())
                else:
                    warnings.warn("Unknown PDF Filter!")
        return extract_from_images_with_rapidocr(images)
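A minimal usage sketch for `PyPDFParser`, not part of the module. The PDF path is a hypothetical example, and `pypdf` must be installed:

```python
from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers.pdf import PyPDFParser

parser = PyPDFParser(extract_images=False)
blob = Blob.from_path("example_data/layout-parser-paper.pdf")  # hypothetical path
for doc in parser.lazy_parse(blob):
    print(doc.metadata["page"], len(doc.page_content))
```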

class PDFMinerParser(BaseBlobParser):
    """Parse `PDF` using `PDFMiner`."""

    def __init__(self, extract_images: bool = False, *, concatenate_pages: bool = True):
        """Initialize a parser based on PDFMiner.

        Args:
            extract_images: Whether to extract images from PDF.
            concatenate_pages: If True, concatenate all PDF pages into a single
                document. Otherwise, return one document per page.
        """
        self.extract_images = extract_images
        self.concatenate_pages = concatenate_pages

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob."""

        if not self.extract_images:
            from pdfminer.high_level import extract_text

            with blob.as_bytes_io() as pdf_file_obj:
                if self.concatenate_pages:
                    text = extract_text(pdf_file_obj)
                    metadata = {"source": blob.source}
                    yield Document(page_content=text, metadata=metadata)
                else:
                    from pdfminer.pdfpage import PDFPage

                    pages = PDFPage.get_pages(pdf_file_obj)
                    for i, _ in enumerate(pages):
                        text = extract_text(pdf_file_obj, page_numbers=[i])
                        metadata = {"source": blob.source, "page": str(i)}
                        yield Document(page_content=text, metadata=metadata)
        else:
            import io

            from pdfminer.converter import PDFPageAggregator, TextConverter
            from pdfminer.layout import LAParams
            from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
            from pdfminer.pdfpage import PDFPage

            text_io = io.StringIO()
            with blob.as_bytes_io() as pdf_file_obj:
                pages = PDFPage.get_pages(pdf_file_obj)
                rsrcmgr = PDFResourceManager()
                device_for_text = TextConverter(rsrcmgr, text_io, laparams=LAParams())
                device_for_image = PDFPageAggregator(rsrcmgr, laparams=LAParams())
                interpreter_for_text = PDFPageInterpreter(rsrcmgr, device_for_text)
                interpreter_for_image = PDFPageInterpreter(rsrcmgr, device_for_image)
                for i, page in enumerate(pages):
                    interpreter_for_text.process_page(page)
                    interpreter_for_image.process_page(page)
                    content = text_io.getvalue() + self._extract_images_from_page(
                        device_for_image.get_result()
                    )
                    text_io.truncate(0)
                    text_io.seek(0)
                    metadata = {"source": blob.source, "page": str(i)}
                    yield Document(page_content=content, metadata=metadata)

    def _extract_images_from_page(self, page: pdfminer.layout.LTPage) -> str:
        """Extract images from page and get the text with RapidOCR."""
        import pdfminer

        def get_image(layout_object: Any) -> Any:
            if isinstance(layout_object, pdfminer.layout.LTImage):
                return layout_object
            if isinstance(layout_object, pdfminer.layout.LTContainer):
                for child in layout_object:
                    return get_image(child)
            else:
                return None

        images = []

        for img in list(filter(bool, map(get_image, page))):
            if img.stream["Filter"].name in _PDF_FILTER_WITHOUT_LOSS:
                images.append(
                    np.frombuffer(img.stream.get_data(), dtype=np.uint8).reshape(
                        img.stream["Height"], img.stream["Width"], -1
                    )
                )
            elif img.stream["Filter"].name in _PDF_FILTER_WITH_LOSS:
                images.append(img.stream.get_data())
            else:
                warnings.warn("Unknown PDF Filter!")
        return extract_from_images_with_rapidocr(images)
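A sketch of the `concatenate_pages` switch described above (not part of the module; the path is hypothetical and `pdfminer.six` must be installed). The default `True` yields a single Document; `False` yields one Document per page:

```python
from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers.pdf import PDFMinerParser

blob = Blob.from_path("example_data/layout-parser-paper.pdf")  # hypothetical path

single_doc = list(PDFMinerParser().lazy_parse(blob))  # one Document for the whole PDF
per_page = list(PDFMinerParser(concatenate_pages=False).lazy_parse(blob))  # one per page
```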

class PyMuPDFParser(BaseBlobParser):
    """Parse `PDF` using `PyMuPDF`."""

    def __init__(
        self,
        text_kwargs: Optional[Mapping[str, Any]] = None,
        extract_images: bool = False,
    ) -> None:
        """Initialize the parser.

        Args:
            text_kwargs: Keyword arguments to pass to ``fitz.Page.get_text()``.
            extract_images: Whether to extract images from the PDF and OCR them.
        """
        self.text_kwargs = text_kwargs or {}
        self.extract_images = extract_images

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob."""
        import fitz

        with blob.as_bytes_io() as file_path:
            if blob.data is None:
                doc = fitz.open(file_path)
            else:
                doc = fitz.open(stream=file_path, filetype="pdf")

            yield from [
                Document(
                    page_content=page.get_text(**self.text_kwargs)
                    + self._extract_images_from_page(doc, page),
                    metadata=dict(
                        {
                            "source": blob.source,
                            "file_path": blob.source,
                            "page": page.number,
                            "total_pages": len(doc),
                        },
                        **{
                            k: doc.metadata[k]
                            for k in doc.metadata
                            if type(doc.metadata[k]) in [str, int]
                        },
                    ),
                )
                for page in doc
            ]

    def _extract_images_from_page(
        self, doc: fitz.fitz.Document, page: fitz.fitz.Page
    ) -> str:
        """Extract images from page and get the text with RapidOCR."""
        if not self.extract_images:
            return ""
        import fitz

        img_list = page.get_images()
        imgs = []
        for img in img_list:
            xref = img[0]
            pix = fitz.Pixmap(doc, xref)
            imgs.append(
                np.frombuffer(pix.samples, dtype=np.uint8).reshape(
                    pix.height, pix.width, -1
                )
            )
        return extract_from_images_with_rapidocr(imgs)
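A sketch showing `text_kwargs` being forwarded to `fitz.Page.get_text()` (not part of the module; the `sort` flag is an assumed PyMuPDF option used purely as an illustration, and the path is hypothetical):

```python
from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers.pdf import PyMuPDFParser

# Keyword arguments here are passed straight through to fitz.Page.get_text().
parser = PyMuPDFParser(text_kwargs={"sort": True})
docs = parser.parse(Blob.from_path("example_data/layout-parser-paper.pdf"))
```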

class PyPDFium2Parser(BaseBlobParser):
    """Parse `PDF` with `PyPDFium2`."""

    def __init__(self, extract_images: bool = False) -> None:
        """Initialize the parser."""
        try:
            import pypdfium2  # noqa:F401
        except ImportError:
            raise ImportError(
                "pypdfium2 package not found, please install it with"
                " `pip install pypdfium2`"
            )
        self.extract_images = extract_images

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob."""
        import pypdfium2

        # pypdfium2 is really finicky with respect to closing things,
        # if done incorrectly creates seg faults.
        with blob.as_bytes_io() as file_path:
            pdf_reader = pypdfium2.PdfDocument(file_path, autoclose=True)
            try:
                for page_number, page in enumerate(pdf_reader):
                    text_page = page.get_textpage()
                    content = text_page.get_text_range()
                    text_page.close()
                    content += "\n" + self._extract_images_from_page(page)
                    page.close()
                    metadata = {"source": blob.source, "page": page_number}
                    yield Document(page_content=content, metadata=metadata)
            finally:
                pdf_reader.close()

    def _extract_images_from_page(self, page: pypdfium2._helpers.page.PdfPage) -> str:
        """Extract images from page and get the text with RapidOCR."""
        if not self.extract_images:
            return ""

        import pypdfium2.raw as pdfium_c

        images = list(page.get_objects(filter=(pdfium_c.FPDF_PAGEOBJ_IMAGE,)))

        images = list(map(lambda x: x.get_bitmap().to_numpy(), images))
        return extract_from_images_with_rapidocr(images)

class PDFPlumberParser(BaseBlobParser):
    """Parse `PDF` with `PDFPlumber`."""

    def __init__(
        self,
        text_kwargs: Optional[Mapping[str, Any]] = None,
        dedupe: bool = False,
        extract_images: bool = False,
    ) -> None:
        """Initialize the parser.

        Args:
            text_kwargs: Keyword arguments to pass to ``pdfplumber.Page.extract_text()``
            dedupe: Avoid duplicate characters if `dedupe=True`.
            extract_images: Whether to extract images from the PDF and OCR them.
        """
        self.text_kwargs = text_kwargs or {}
        self.dedupe = dedupe
        self.extract_images = extract_images

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob."""
        import pdfplumber

        with blob.as_bytes_io() as file_path:
            doc = pdfplumber.open(file_path)  # open document

            yield from [
                Document(
                    page_content=self._process_page_content(page)
                    + "\n"
                    + self._extract_images_from_page(page),
                    metadata=dict(
                        {
                            "source": blob.source,
                            "file_path": blob.source,
                            "page": page.page_number - 1,
                            "total_pages": len(doc.pages),
                        },
                        **{
                            k: doc.metadata[k]
                            for k in doc.metadata
                            if type(doc.metadata[k]) in [str, int]
                        },
                    ),
                )
                for page in doc.pages
            ]

    def _process_page_content(self, page: pdfplumber.page.Page) -> str:
        """Process the page content based on dedupe."""
        if self.dedupe:
            return page.dedupe_chars().extract_text(**self.text_kwargs)
        return page.extract_text(**self.text_kwargs)

    def _extract_images_from_page(self, page: pdfplumber.page.Page) -> str:
        """Extract images from page and get the text with RapidOCR."""
        if not self.extract_images:
            return ""

        images = []
        for img in page.images:
            if img["stream"]["Filter"].name in _PDF_FILTER_WITHOUT_LOSS:
                images.append(
                    np.frombuffer(img["stream"].get_data(), dtype=np.uint8).reshape(
                        img["stream"]["Height"], img["stream"]["Width"], -1
                    )
                )
            elif img["stream"]["Filter"].name in _PDF_FILTER_WITH_LOSS:
                images.append(img["stream"].get_data())
            else:
                warnings.warn("Unknown PDF Filter!")

        return extract_from_images_with_rapidocr(images)
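A sketch of the `dedupe` option (not part of the module; the path is hypothetical and `pdfplumber` must be installed). When `dedupe=True`, extraction goes through `page.dedupe_chars()`, which can fix PDFs that render each character twice:

```python
from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers.pdf import PDFPlumberParser

# dedupe=True routes extraction through page.dedupe_chars() before extract_text().
parser = PDFPlumberParser(dedupe=True)
docs = parser.parse(Blob.from_path("example_data/layout-parser-paper.pdf"))
```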

class AmazonTextractPDFParser(BaseBlobParser):
    """Send `PDF` files to `Amazon Textract` and parse them.

    For parsing multi-page PDFs, they have to reside on S3.

    The AmazonTextractPDFLoader calls the
    [Amazon Textract Service](https://aws.amazon.com/textract/)
    to convert PDFs into a Document structure.
    Single and multi-page documents are supported with up to 3000 pages
    and 512 MB of size.

    For the call to be successful an AWS account is required,
    similar to the
    [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html)
    requirements.

    Besides the AWS configuration, it is very similar to the other PDF
    loaders, while also supporting JPEG, PNG and TIFF and non-native
    PDF formats.

    ```python
    from langchain_community.document_loaders import AmazonTextractPDFLoader
    loader=AmazonTextractPDFLoader("example_data/alejandro_rosalez_sample-small.jpeg")
    documents = loader.load()
    ```

    One feature is the linearization of the output.
    When using the features LAYOUT, FORMS or TABLES together with Textract

    ```python
    from langchain_community.document_loaders import AmazonTextractPDFLoader
    # you can mix and match each of the features
    loader=AmazonTextractPDFLoader(
        "example_data/alejandro_rosalez_sample-small.jpeg",
        textract_features=["TABLES", "LAYOUT"])
    documents = loader.load()
    ```

    it will generate output that formats the text in reading order and
    tries to output the information in a tabular structure or
    output the key/value pairs with a colon (key: value).
    This helps most LLMs to achieve better accuracy when
    processing these texts.

    """

    def __init__(
        self,
        textract_features: Optional[Sequence[int]] = None,
        client: Optional[Any] = None,
    ) -> None:
        """Initializes the parser.

        Args:
            textract_features: Features to be used for extraction, each feature
                should be passed as an int that conforms to the enum
                `Textract_Features`, see `amazon-textract-caller` pkg
            client: boto3 textract client
        """

        try:
            import textractcaller as tc
            import textractor.entities.document as textractor

            self.tc = tc
            self.textractor = textractor

            if textract_features is not None:
                self.textract_features = [
                    tc.Textract_Features(f) for f in textract_features
                ]
            else:
                self.textract_features = []
        except ImportError:
            raise ImportError(
                "Could not import amazon-textract-caller or "
                "amazon-textract-textractor python package. Please install it "
                "with `pip install amazon-textract-caller` & "
                "`pip install amazon-textract-textractor`."
            )

        if not client:
            try:
                import boto3

                self.boto3_textract_client = boto3.client("textract")
            except ImportError:
                raise ImportError(
                    "Could not import boto3 python package. "
                    "Please install it with `pip install boto3`."
                )
        else:
            self.boto3_textract_client = client

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Iterates over the Blob pages and returns an Iterator with a Document
        for each page, like the other parsers. For multi-page documents, blob.path
        has to be set to the S3 URI; for single-page docs the blob.data is used.
        """

        url_parse_result = urlparse(str(blob.path)) if blob.path else None
        # Either call with S3 path (multi-page) or with bytes (single-page)
        if (
            url_parse_result
            and url_parse_result.scheme == "s3"
            and url_parse_result.netloc
        ):
            textract_response_json = self.tc.call_textract(
                input_document=str(blob.path),
                features=self.textract_features,
                boto3_textract_client=self.boto3_textract_client,
            )
        else:
            textract_response_json = self.tc.call_textract(
                input_document=blob.as_bytes(),
                features=self.textract_features,
                call_mode=self.tc.Textract_Call_Mode.FORCE_SYNC,
                boto3_textract_client=self.boto3_textract_client,
            )

        document = self.textractor.Document.open(textract_response_json)

        linearizer_config = self.textractor.TextLinearizationConfig(
            hide_figure_layout=True,
            title_prefix="# ",
            section_header_prefix="## ",
            list_element_prefix="*",
        )
        for idx, page in enumerate(document.pages):
            yield Document(
                page_content=page.get_text(config=linearizer_config),
                metadata={"source": blob.source, "page": idx + 1},
            )
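Since multi-page PDFs must reside on S3 (per the docstring above), here is a hedged sketch of that path using the documented loader wrapper. The bucket, key, and feature list are made-up examples, and AWS credentials must be configured:

```python
from langchain_community.document_loaders import AmazonTextractPDFLoader

# Hypothetical S3 URI: multi-page PDFs are read by Textract directly from S3.
loader = AmazonTextractPDFLoader(
    "s3://my-example-bucket/reports/annual-report.pdf",
    textract_features=["LAYOUT", "TABLES"],
)
documents = loader.load()  # one Document per page
```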

class DocumentIntelligenceParser(BaseBlobParser):
    """Loads a PDF with Azure Document Intelligence
    (formerly Form Recognizer) and chunks at character level."""

    def __init__(self, client: Any, model: str):
        self.client = client
        self.model = model

    def _generate_docs(self, blob: Blob, result: Any) -> Iterator[Document]:
        for p in result.pages:
            content = " ".join([line.content for line in p.lines])

            d = Document(
                page_content=content,
                metadata={
                    "source": blob.source,
                    "page": p.page_number,
                },
            )
            yield d

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob."""

        with blob.as_bytes_io() as file_obj:
            poller = self.client.begin_analyze_document(self.model, file_obj)
            result = poller.result()

            docs = self._generate_docs(blob, result)

            yield from docs
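A minimal sketch of wiring this parser to an Azure client (not part of the module). The endpoint, key, and file path are placeholders, and the `prebuilt-layout` model name is an assumption; the parser only requires a client exposing `begin_analyze_document`:

```python
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential

from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers.pdf import DocumentIntelligenceParser

client = DocumentAnalysisClient(
    endpoint="https://<resource-name>.cognitiveservices.azure.com/",  # placeholder
    credential=AzureKeyCredential("<api-key>"),  # placeholder
)
parser = DocumentIntelligenceParser(client=client, model="prebuilt-layout")
docs = parser.parse(Blob.from_path("example_data/layout-parser-paper.pdf"))
```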