community[patch]: import flattening fix (#20110)

This PR should make it easier for linters to type-check the code and for IDEs to jump to symbol definitions.
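To illustrate what the flattening looks like, here is a minimal sketch of the `__init__.py` pattern these PRs converge on, using `SolarChat` from this PR as the example (the exact layout of the real file is assumed, not copied):

```python
# Sketch of a flattened package __init__.py (e.g. langchain_community/chat_models).
import importlib
from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
    # Statically visible import: mypy can type-check it and IDEs can
    # resolve "jump to definition" without executing any code.
    from langchain_community.chat_models.solar import SolarChat

__all__ = ["SolarChat"]

# Maps each exported name to the module that actually defines it.
_module_lookup = {
    "SolarChat": "langchain_community.chat_models.solar",
}


def __getattr__(name: str) -> Any:
    """Lazily import a symbol on first attribute access (PEP 562)."""
    if name in _module_lookup:
        module = importlib.import_module(_module_lookup[name])
        return getattr(module, name)
    raise AttributeError(f"module {__name__} has no attribute {name}")
```

At runtime nothing changes: imports stay lazy via `__getattr__`. The `TYPE_CHECKING` block exists only for static tooling.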

See #20050 as a template for this PR.
- As a byproduct, added 3 missing `test_imports` tests.
- Added the missing `SolarChat` to `__init__.py` and to the `test_imports`
  unit test (sketched after this list).
- Added `# type: ignore` comments to fix linting. It is not clear why the
  linting errors appear after the above changes.
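A minimal sketch of the kind of `test_imports` check referenced above; the real test enumerates every public symbol, abbreviated here to the one added in this PR:

```python
from langchain_community.chat_models import __all__

# Abbreviated: the real EXPECTED_ALL lists every exported chat model.
EXPECTED_ALL = [
    "SolarChat",
]


def test_all_imports() -> None:
    # Every expected symbol must be re-exported from the package root.
    assert set(EXPECTED_ALL).issubset(set(__all__))
```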

---------

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
Author:       Leonid Ganeline
Date:         2024-04-10 10:01:19 -07:00
Committed by: GitHub
Commit:       4cb5f4c353 (parent 12190ad728)

60 changed files with 2973 additions and 163 deletions

File: libs/community/langchain_community/document_loaders/parsers/msword.py

@@ -9,7 +9,7 @@ from langchain_community.document_loaders.blob_loaders import Blob
 class MsWordParser(BaseBlobParser):
     """Parse the Microsoft Word documents from a blob."""
 
-    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
+    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
         """Parse a Microsoft Word document into the Document iterator.
 
         Args:
@@ -33,13 +33,13 @@ class MsWordParser(BaseBlobParser):
                 partition_docx
             ),
         }
-        if blob.mimetype not in (
+        if blob.mimetype not in (  # type: ignore[attr-defined]
            "application/msword",
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        ):
            raise ValueError("This blob type is not supported for this parser.")
-        with blob.as_bytes_io() as word_document:
-            elements = mime_type_parser[blob.mimetype](file=word_document)
+        with blob.as_bytes_io() as word_document:  # type: ignore[attr-defined]
+            elements = mime_type_parser[blob.mimetype](file=word_document)  # type: ignore[attr-defined]
             text = "\n\n".join([str(el) for el in elements])
-            metadata = {"source": blob.source}
+            metadata = {"source": blob.source}  # type: ignore[attr-defined]
             yield Document(page_content=text, metadata=metadata)

File: libs/community/langchain_community/document_loaders/parsers/pdf.py

@@ -87,17 +87,17 @@ class PyPDFParser(BaseBlobParser):
         self.password = password
         self.extract_images = extract_images
 
-    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
+    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
         """Lazily parse the blob."""
         import pypdf
 
-        with blob.as_bytes_io() as pdf_file_obj:
+        with blob.as_bytes_io() as pdf_file_obj:  # type: ignore[attr-defined]
             pdf_reader = pypdf.PdfReader(pdf_file_obj, password=self.password)
             yield from [
                 Document(
                     page_content=page.extract_text()
                     + self._extract_images_from_page(page),
-                    metadata={"source": blob.source, "page": page_number},
+                    metadata={"source": blob.source, "page": page_number},  # type: ignore[attr-defined]
                 )
                 for page_number, page in enumerate(pdf_reader.pages)
             ]
@@ -140,16 +140,16 @@ class PDFMinerParser(BaseBlobParser):
         self.extract_images = extract_images
         self.concatenate_pages = concatenate_pages
 
-    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
+    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
         """Lazily parse the blob."""
         if not self.extract_images:
             from pdfminer.high_level import extract_text
 
-            with blob.as_bytes_io() as pdf_file_obj:
+            with blob.as_bytes_io() as pdf_file_obj:  # type: ignore[attr-defined]
                 if self.concatenate_pages:
                     text = extract_text(pdf_file_obj)
-                    metadata = {"source": blob.source}
+                    metadata = {"source": blob.source}  # type: ignore[attr-defined]
                     yield Document(page_content=text, metadata=metadata)
                 else:
                     from pdfminer.pdfpage import PDFPage
@@ -157,7 +157,7 @@ class PDFMinerParser(BaseBlobParser):
                     pages = PDFPage.get_pages(pdf_file_obj)
                     for i, _ in enumerate(pages):
                         text = extract_text(pdf_file_obj, page_numbers=[i])
-                        metadata = {"source": blob.source, "page": str(i)}
+                        metadata = {"source": blob.source, "page": str(i)}  # type: ignore[attr-defined]
                         yield Document(page_content=text, metadata=metadata)
         else:
             import io
@@ -168,7 +168,7 @@ class PDFMinerParser(BaseBlobParser):
             from pdfminer.pdfpage import PDFPage
 
             text_io = io.StringIO()
-            with blob.as_bytes_io() as pdf_file_obj:
+            with blob.as_bytes_io() as pdf_file_obj:  # type: ignore[attr-defined]
                 pages = PDFPage.get_pages(pdf_file_obj)
                 rsrcmgr = PDFResourceManager()
                 device_for_text = TextConverter(rsrcmgr, text_io, laparams=LAParams())
@@ -183,7 +183,7 @@ class PDFMinerParser(BaseBlobParser):
                 )
                 text_io.truncate(0)
                 text_io.seek(0)
-                metadata = {"source": blob.source, "page": str(i)}
+                metadata = {"source": blob.source, "page": str(i)}  # type: ignore[attr-defined]
                 yield Document(page_content=content, metadata=metadata)
 
     def _extract_images_from_page(self, page: pdfminer.layout.LTPage) -> str:
@@ -231,12 +231,12 @@ class PyMuPDFParser(BaseBlobParser):
         self.text_kwargs = text_kwargs or {}
         self.extract_images = extract_images
 
-    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
+    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
         """Lazily parse the blob."""
         import fitz
 
-        with blob.as_bytes_io() as file_path:
-            if blob.data is None:
+        with blob.as_bytes_io() as file_path:  # type: ignore[attr-defined]
+            if blob.data is None:  # type: ignore[attr-defined]
                 doc = fitz.open(file_path)
             else:
                 doc = fitz.open(stream=file_path, filetype="pdf")
@@ -247,8 +247,8 @@ class PyMuPDFParser(BaseBlobParser):
                     + self._extract_images_from_page(doc, page),
                     metadata=dict(
                         {
-                            "source": blob.source,
-                            "file_path": blob.source,
+                            "source": blob.source,  # type: ignore[attr-defined]
+                            "file_path": blob.source,  # type: ignore[attr-defined]
                             "page": page.number,
                             "total_pages": len(doc),
                         },
@@ -297,13 +297,13 @@ class PyPDFium2Parser(BaseBlobParser):
             )
         self.extract_images = extract_images
 
-    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
+    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
         """Lazily parse the blob."""
         import pypdfium2
 
         # pypdfium2 is really finicky with respect to closing things,
         # if done incorrectly creates seg faults.
-        with blob.as_bytes_io() as file_path:
+        with blob.as_bytes_io() as file_path:  # type: ignore[attr-defined]
             pdf_reader = pypdfium2.PdfDocument(file_path, autoclose=True)
             try:
                 for page_number, page in enumerate(pdf_reader):
@@ -312,7 +312,7 @@ class PyPDFium2Parser(BaseBlobParser):
                     text_page.close()
                     content += "\n" + self._extract_images_from_page(page)
                     page.close()
-                    metadata = {"source": blob.source, "page": page_number}
+                    metadata = {"source": blob.source, "page": page_number}  # type: ignore[attr-defined]
                     yield Document(page_content=content, metadata=metadata)
             finally:
                 pdf_reader.close()
@@ -349,11 +349,11 @@ class PDFPlumberParser(BaseBlobParser):
         self.dedupe = dedupe
         self.extract_images = extract_images
 
-    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
+    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
         """Lazily parse the blob."""
         import pdfplumber
 
-        with blob.as_bytes_io() as file_path:
+        with blob.as_bytes_io() as file_path:  # type: ignore[attr-defined]
             doc = pdfplumber.open(file_path)  # open document
 
             yield from [
@@ -363,8 +363,8 @@ class PDFPlumberParser(BaseBlobParser):
                     + self._extract_images_from_page(page),
                     metadata=dict(
                         {
-                            "source": blob.source,
-                            "file_path": blob.source,
+                            "source": blob.source,  # type: ignore[attr-defined]
+                            "file_path": blob.source,  # type: ignore[attr-defined]
                             "page": page.page_number - 1,
                             "total_pages": len(doc.pages),
                         },
@@ -514,14 +514,14 @@ class AmazonTextractPDFParser(BaseBlobParser):
         else:
             self.boto3_textract_client = client
 
-    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
+    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
         """Iterates over the Blob pages and returns an Iterator with a Document
         for each page, like the other parsers If multi-page document, blob.path
         has to be set to the S3 URI and for single page docs
         the blob.data is taken
         """
 
-        url_parse_result = urlparse(str(blob.path)) if blob.path else None
+        url_parse_result = urlparse(str(blob.path)) if blob.path else None  # type: ignore[attr-defined]
         # Either call with S3 path (multi-page) or with bytes (single-page)
         if (
             url_parse_result
@@ -529,13 +529,13 @@ class AmazonTextractPDFParser(BaseBlobParser):
             and url_parse_result.netloc
         ):
             textract_response_json = self.tc.call_textract(
-                input_document=str(blob.path),
+                input_document=str(blob.path),  # type: ignore[attr-defined]
                 features=self.textract_features,
                 boto3_textract_client=self.boto3_textract_client,
             )
         else:
             textract_response_json = self.tc.call_textract(
-                input_document=blob.as_bytes(),
+                input_document=blob.as_bytes(),  # type: ignore[attr-defined]
                 features=self.textract_features,
                 call_mode=self.tc.Textract_Call_Mode.FORCE_SYNC,
                 boto3_textract_client=self.boto3_textract_client,
@@ -546,7 +546,7 @@ class AmazonTextractPDFParser(BaseBlobParser):
         for idx, page in enumerate(document.pages):
             yield Document(
                 page_content=page.get_text(config=self.linearization_config),
-                metadata={"source": blob.source, "page": idx + 1},
+                metadata={"source": blob.source, "page": idx + 1},  # type: ignore[attr-defined]
             )
@@ -566,23 +566,23 @@ class DocumentIntelligenceParser(BaseBlobParser):
         self.client = client
         self.model = model
 
-    def _generate_docs(self, blob: Blob, result: Any) -> Iterator[Document]:
+    def _generate_docs(self, blob: Blob, result: Any) -> Iterator[Document]:  # type: ignore[valid-type]
         for p in result.pages:
             content = " ".join([line.content for line in p.lines])
 
             d = Document(
                 page_content=content,
                 metadata={
-                    "source": blob.source,
+                    "source": blob.source,  # type: ignore[attr-defined]
                     "page": p.page_number,
                 },
             )
             yield d
 
-    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
+    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
         """Lazily parse the blob."""
 
-        with blob.as_bytes_io() as file_obj:
+        with blob.as_bytes_io() as file_obj:  # type: ignore[attr-defined]
             poller = self.client.begin_analyze_document(self.model, file_obj)
             result = poller.result()

File: libs/community/langchain_community/document_loaders/parsers/txt.py

@@ -10,6 +10,6 @@ from langchain_community.document_loaders.blob_loaders import Blob
 class TextParser(BaseBlobParser):
     """Parser for text blobs."""
 
-    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
+    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
         """Lazily parse the blob."""
-        yield Document(page_content=blob.as_string(), metadata={"source": blob.source})
+        yield Document(page_content=blob.as_string(), metadata={"source": blob.source})  # type: ignore[attr-defined]