community[major], core[patch], langchain[patch], experimental[patch]: Create langchain-community (#14463)

Moved the following modules to new package langchain-community in a backwards compatible fashion:

```
mv langchain/langchain/adapters community/langchain_community
mv langchain/langchain/callbacks community/langchain_community/callbacks
mv langchain/langchain/chat_loaders community/langchain_community
mv langchain/langchain/chat_models community/langchain_community
mv langchain/langchain/document_loaders community/langchain_community
mv langchain/langchain/docstore community/langchain_community
mv langchain/langchain/document_transformers community/langchain_community
mv langchain/langchain/embeddings community/langchain_community
mv langchain/langchain/graphs community/langchain_community
mv langchain/langchain/llms community/langchain_community
mv langchain/langchain/memory/chat_message_histories community/langchain_community
mv langchain/langchain/retrievers community/langchain_community
mv langchain/langchain/storage community/langchain_community
mv langchain/langchain/tools community/langchain_community
mv langchain/langchain/utilities community/langchain_community
mv langchain/langchain/vectorstores community/langchain_community
mv langchain/langchain/agents/agent_toolkits community/langchain_community
mv langchain/langchain/cache.py community/langchain_community
```

Moved the following to langchain-core:
```
mv langchain/langchain/utils/json_schema.py core/langchain_core/utils
mv langchain/langchain/utils/html.py core/langchain_core/utils
mv langchain/langchain/utils/strings.py core/langchain_core/utils
cat langchain/langchain/utils/env.py >> core/langchain_core/utils/env.py
rm langchain/langchain/utils/env.py
```

See .scripts/community_split/script_integrations.sh for all changes
This commit is contained in:
Bagatur
2023-12-11 13:53:30 -08:00
committed by GitHub
parent c0f4b95aa9
commit ed58eeb9c5
2446 changed files with 171805 additions and 137118 deletions

View File

@@ -0,0 +1,49 @@
from langchain_community.document_loaders.parsers.language.cobol import CobolSegmenter
EXAMPLE_CODE = """
IDENTIFICATION DIVISION.
PROGRAM-ID. SampleProgram.
DATA DIVISION.
WORKING-STORAGE SECTION.
01 SAMPLE-VAR PIC X(20) VALUE 'Sample Value'.
PROCEDURE DIVISION.
A000-INITIALIZE-PARA.
DISPLAY 'Initialization Paragraph'.
MOVE 'New Value' TO SAMPLE-VAR.
A100-PROCESS-PARA.
DISPLAY SAMPLE-VAR.
STOP RUN.
"""
def test_extract_functions_classes() -> None:
    """Each PROCEDURE DIVISION paragraph is extracted as a separate unit."""
    expected = [
        "A000-INITIALIZE-PARA.\n    "
        "DISPLAY 'Initialization Paragraph'.\n    "
        "MOVE 'New Value' TO SAMPLE-VAR.",
        "A100-PROCESS-PARA.\n    DISPLAY SAMPLE-VAR.\n    STOP RUN.",
    ]
    segmenter = CobolSegmenter(EXAMPLE_CODE)
    assert segmenter.extract_functions_classes() == expected
def test_simplify_code() -> None:
    """Paragraph bodies are collapsed into '* OMITTED CODE *' markers."""
    expected = (
        "IDENTIFICATION DIVISION.\n"
        "PROGRAM-ID. SampleProgram.\n"
        "DATA DIVISION.\n"
        "WORKING-STORAGE SECTION.\n"
        "* OMITTED CODE *\n"
        "PROCEDURE DIVISION.\n"
        "A000-INITIALIZE-PARA.\n"
        "* OMITTED CODE *\n"
        "A100-PROCESS-PARA.\n"
        "* OMITTED CODE *\n"
    )
    actual = CobolSegmenter(EXAMPLE_CODE).simplify_code()
    # Surrounding whitespace differs between fixture and output; compare stripped.
    assert actual.strip() == expected.strip()

View File

@@ -0,0 +1,48 @@
import unittest
import pytest
from langchain_community.document_loaders.parsers.language.javascript import (
JavaScriptSegmenter,
)
@pytest.mark.requires("esprima")
class TestJavaScriptSegmenter(unittest.TestCase):
    """Exercise JavaScriptSegmenter against a small fixture program."""

    def setUp(self) -> None:
        """Define the JS fixture and the outputs expected from it."""
        self.example_code = """const os = require('os');
function hello(text) {
  console.log(text);
}
class Simple {
  constructor() {
    this.a = 1;
  }
}
hello("Hello!");"""
        self.expected_simplified_code = """const os = require('os');
// Code for: function hello(text) {
// Code for: class Simple {
hello("Hello!");"""
        self.expected_extracted_code = [
            "function hello(text) {\n  console.log(text);\n}",
            "class Simple {\n  constructor() {\n    this.a = 1;\n  }\n}",
        ]

    def test_extract_functions_classes(self) -> None:
        """The function and the class are extracted verbatim."""
        segmenter = JavaScriptSegmenter(self.example_code)
        self.assertEqual(
            segmenter.extract_functions_classes(), self.expected_extracted_code
        )

    def test_simplify_code(self) -> None:
        """Definitions collapse to '// Code for:' placeholder comments."""
        segmenter = JavaScriptSegmenter(self.example_code)
        self.assertEqual(segmenter.simplify_code(), self.expected_simplified_code)

View File

@@ -0,0 +1,40 @@
import unittest
from langchain_community.document_loaders.parsers.language.python import PythonSegmenter
class TestPythonSegmenter(unittest.TestCase):
    """Exercise PythonSegmenter on a small fixture module."""

    def setUp(self) -> None:
        """Define the Python fixture and the outputs expected from it."""
        self.example_code = """import os
def hello(text):
    print(text)
class Simple:
    def __init__(self):
        self.a = 1
hello("Hello!")"""
        self.expected_simplified_code = """import os
# Code for: def hello(text):
# Code for: class Simple:
hello("Hello!")"""
        self.expected_extracted_code = [
            "def hello(text):\n    print(text)",
            "class Simple:\n    def __init__(self):\n        self.a = 1",
        ]

    def test_extract_functions_classes(self) -> None:
        """The function and the class are extracted verbatim."""
        segmenter = PythonSegmenter(self.example_code)
        self.assertEqual(
            segmenter.extract_functions_classes(), self.expected_extracted_code
        )

    def test_simplify_code(self) -> None:
        """Definitions collapse to '# Code for:' placeholder comments."""
        segmenter = PythonSegmenter(self.example_code)
        self.assertEqual(segmenter.simplify_code(), self.expected_simplified_code)

View File

@@ -0,0 +1,95 @@
"""Module to test generic parsers."""
from typing import Iterator
import pytest
from langchain_core.documents import Document
from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers.generic import MimeTypeBasedParser
class TestMimeBasedParser:
    """Tests for routing blobs to sub-parsers by mime type."""

    def test_without_fallback_parser(self) -> None:
        """Known mime types are dispatched; unknown ones raise ValueError."""

        class FirstCharParser(BaseBlobParser):
            def lazy_parse(self, blob: Blob) -> Iterator[Document]:
                """Extract the first character of a blob."""
                yield Document(page_content=blob.as_string()[0])

        class SecondCharParser(BaseBlobParser):
            def lazy_parse(self, blob: Blob) -> Iterator[Document]:
                """Extract the second character of a blob."""
                yield Document(page_content=blob.as_string()[1])

        parser = MimeTypeBasedParser(
            handlers={
                "text/plain": FirstCharParser(),
                "text/html": SecondCharParser(),
            },
        )

        # text/plain is routed to FirstCharParser.
        docs = parser.parse(Blob(data=b"Hello World", mimetype="text/plain"))
        assert len(docs) == 1
        assert docs[0].page_content == "H"

        # text/html is routed to SecondCharParser.
        docs = parser.parse(Blob(data=b"Hello World", mimetype="text/html"))
        assert len(docs) == 1
        assert docs[0].page_content == "e"

        # With no fallback configured, an unknown mime type is an error.
        unknown_blob = Blob(data=b"Hello World", mimetype="text/csv")
        with pytest.raises(ValueError, match="Unsupported mime type"):
            parser.parse(unknown_blob)

    def test_with_fallback_parser(self) -> None:
        """Unknown mime types are routed to the configured fallback parser."""

        class FirstCharParser(BaseBlobParser):
            def lazy_parse(self, blob: Blob) -> Iterator[Document]:
                """Extract the first character of a blob."""
                yield Document(page_content=blob.as_string()[0])

        class SecondCharParser(BaseBlobParser):
            def lazy_parse(self, blob: Blob) -> Iterator[Document]:
                """Extract the second character of a blob."""
                yield Document(page_content=blob.as_string()[1])

        class ThirdCharParser(BaseBlobParser):
            def lazy_parse(self, blob: Blob) -> Iterator[Document]:
                """Extract the third character of a blob."""
                yield Document(page_content=blob.as_string()[2])

        parser = MimeTypeBasedParser(
            handlers={
                "text/plain": FirstCharParser(),
                "text/html": SecondCharParser(),
            },
            fallback_parser=ThirdCharParser(),
        )

        # Registered handlers still win over the fallback.
        docs = parser.parse(Blob(data=b"Hello World", mimetype="text/plain"))
        assert len(docs) == 1
        assert docs[0].page_content == "H"

        docs = parser.parse(Blob(data=b"Hello World", mimetype="text/html"))
        assert len(docs) == 1
        assert docs[0].page_content == "e"

        # Unregistered mime type falls through to ThirdCharParser.
        docs = parser.parse(Blob(data=b"Hello World", mimetype="text/csv"))
        assert len(docs) == 1
        assert docs[0].page_content == "l"

View File

@@ -0,0 +1,28 @@
"""Tests for the HTML parsers."""
from pathlib import Path
import pytest
from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers.html import BS4HTMLParser
# Directory containing this test file.
HERE = Path(__file__).parent
# HTML fixtures are shared with the integration tests' examples directory.
EXAMPLES = HERE.parent.parent.parent / "integration_tests" / "examples"
@pytest.mark.requires("bs4", "lxml")
def test_bs_html_loader() -> None:
    """Parse the example HTML file and check metadata and separator output."""
    file_path = EXAMPLES / "example.html"
    parser = BS4HTMLParser(get_text_separator="|")
    docs = list(parser.lazy_parse(Blob.from_path(file_path)))
    assert isinstance(docs, list)
    assert len(docs) == 1
    doc = docs[0]
    # The parser records the page title and the originating path in metadata.
    assert doc.metadata["title"] == "Chew dad's slippers"
    assert doc.metadata["source"] == str(file_path)
    # The custom text separator shows up in the extracted content.
    assert doc.page_content[:2] == "\n|"

View File

@@ -0,0 +1,96 @@
"""Tests for the various PDF parsers."""
from pathlib import Path
from typing import Iterator
import pytest
from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers.pdf import (
PDFMinerParser,
PyMuPDFParser,
PyPDFium2Parser,
PyPDFParser,
)
# Root of the test tree: three directory levels above this file's directory.
_THIS_DIR = Path(__file__).parents[3]
_EXAMPLES_DIR = _THIS_DIR / "examples"
# Paths to test PDF files
HELLO_PDF = _EXAMPLES_DIR / "hello.pdf"  # one-page "Hello world!" document
LAYOUT_PARSER_PAPER_PDF = _EXAMPLES_DIR / "layout-parser-paper.pdf"  # 16-page paper
def _assert_with_parser(parser: BaseBlobParser, splits_by_page: bool = True) -> None:
    """Run the standard checks against ``parser``.

    Args:
        parser: The blob parser under test.
        splits_by_page: Whether the parser emits one document per page
            by default.
    """
    # A one-page "Hello world!" document always parses to exactly one doc.
    hello_blob = Blob.from_path(HELLO_PDF)
    produced = parser.lazy_parse(hello_blob)
    assert isinstance(produced, Iterator)
    produced = list(produced)
    assert len(produced) == 1
    assert isinstance(produced[0].page_content, str)
    # Parsers differ in surrounding whitespace, so only check the prefix.
    assert produced[0].page_content.startswith("Hello world!")

    # A 16-page paper: page-splitting parsers yield 16 docs, others just 1.
    paper_blob = Blob.from_path(LAYOUT_PARSER_PAPER_PDF)
    produced = parser.lazy_parse(paper_blob)
    assert isinstance(produced, Iterator)
    produced = list(produced)
    assert len(produced) == (16 if splits_by_page else 1)
    # Each parser yields slightly different text for this page, so only
    # check for a token that all of them preserve.
    assert "LayoutParser" in produced[0].page_content
    first_meta = produced[0].metadata
    assert first_meta["source"] == str(LAYOUT_PARSER_PAPER_PDF)
    if splits_by_page:
        assert int(first_meta["page"]) == 0
@pytest.mark.requires("pypdf")
def test_pypdf_parser() -> None:
    """PyPDFParser passes the standard checks with default per-page splitting."""
    parser = PyPDFParser()
    _assert_with_parser(parser)
@pytest.mark.requires("pdfminer")
def test_pdfminer_parser() -> None:
    """PDFMinerParser emits the whole PDF as a single document by default."""
    parser = PDFMinerParser()
    _assert_with_parser(parser, splits_by_page=False)
@pytest.mark.requires("fitz")  # package is PyMuPDF
def test_pymupdf_loader() -> None:
    """PyMuPDFParser passes the standard checks with default per-page splitting."""
    parser = PyMuPDFParser()
    _assert_with_parser(parser)
@pytest.mark.requires("pypdfium2")
def test_pypdfium2_parser() -> None:
    """PyPDFium2Parser passes the standard checks."""
    # NOTE(review): the original comment claimed this parser does not split
    # by page, yet splits_by_page is left at its True default here — confirm
    # which is intended.
    parser = PyPDFium2Parser()
    _assert_with_parser(parser)
@pytest.mark.requires("rapidocr_onnxruntime")
def test_extract_images_text_from_pdf() -> None:
    """With extract_images=True, every parser still passes the standard checks."""
    for parser in (
        PyPDFParser(extract_images=True),
        PDFMinerParser(extract_images=True),
        PyMuPDFParser(extract_images=True),
        PyPDFium2Parser(extract_images=True),
    ):
        _assert_with_parser(parser)

View File

@@ -0,0 +1,17 @@
from langchain_community.document_loaders.parsers import __all__
def test_parsers_public_api_correct() -> None:
    """Guard the parsers package's public surface against accidental changes."""
    expected = {
        "BS4HTMLParser",
        "DocAIParser",
        "GrobidParser",
        "LanguageParser",
        "OpenAIWhisperParser",
        "PyPDFParser",
        "PDFMinerParser",
        "PyMuPDFParser",
        "PyPDFium2Parser",
        "PDFPlumberParser",
    }
    assert set(__all__) == expected