community[major], core[patch], langchain[patch], experimental[patch]: Create langchain-community (#14463)

Moved the following modules to new package langchain-community in a backwards compatible fashion:

```
mv langchain/langchain/adapters community/langchain_community
mv langchain/langchain/callbacks community/langchain_community/callbacks
mv langchain/langchain/chat_loaders community/langchain_community
mv langchain/langchain/chat_models community/langchain_community
mv langchain/langchain/document_loaders community/langchain_community
mv langchain/langchain/docstore community/langchain_community
mv langchain/langchain/document_transformers community/langchain_community
mv langchain/langchain/embeddings community/langchain_community
mv langchain/langchain/graphs community/langchain_community
mv langchain/langchain/llms community/langchain_community
mv langchain/langchain/memory/chat_message_histories community/langchain_community
mv langchain/langchain/retrievers community/langchain_community
mv langchain/langchain/storage community/langchain_community
mv langchain/langchain/tools community/langchain_community
mv langchain/langchain/utilities community/langchain_community
mv langchain/langchain/vectorstores community/langchain_community
mv langchain/langchain/agents/agent_toolkits community/langchain_community
mv langchain/langchain/cache.py community/langchain_community
```

Moved the following to langchain-core:
```
mv langchain/langchain/utils/json_schema.py core/langchain_core/utils
mv langchain/langchain/utils/html.py core/langchain_core/utils
mv langchain/langchain/utils/strings.py core/langchain_core/utils
cat langchain/langchain/utils/env.py >> core/langchain_core/utils/env.py
rm langchain/langchain/utils/env.py
```

See .scripts/community_split/script_integrations.sh for all changes
This commit is contained in:
Bagatur
2023-12-11 13:53:30 -08:00
committed by GitHub
parent c0f4b95aa9
commit ed58eeb9c5
2446 changed files with 171805 additions and 137118 deletions

View File

@@ -0,0 +1,49 @@
from langchain_community.document_loaders.parsers.language.cobol import CobolSegmenter
EXAMPLE_CODE = """
IDENTIFICATION DIVISION.
PROGRAM-ID. SampleProgram.
DATA DIVISION.
WORKING-STORAGE SECTION.
01 SAMPLE-VAR PIC X(20) VALUE 'Sample Value'.
PROCEDURE DIVISION.
A000-INITIALIZE-PARA.
DISPLAY 'Initialization Paragraph'.
MOVE 'New Value' TO SAMPLE-VAR.
A100-PROCESS-PARA.
DISPLAY SAMPLE-VAR.
STOP RUN.
"""
def test_extract_functions_classes() -> None:
    """Each PROCEDURE DIVISION paragraph is extracted as a separate unit."""
    expected = [
        "A000-INITIALIZE-PARA.\n    "
        "DISPLAY 'Initialization Paragraph'.\n    "
        "MOVE 'New Value' TO SAMPLE-VAR.",
        "A100-PROCESS-PARA.\n    DISPLAY SAMPLE-VAR.\n    STOP RUN.",
    ]
    segmenter = CobolSegmenter(EXAMPLE_CODE)
    assert segmenter.extract_functions_classes() == expected
def test_simplify_code() -> None:
    """Paragraph bodies are collapsed into '* OMITTED CODE *' markers."""
    expected = (
        "IDENTIFICATION DIVISION.\n"
        "PROGRAM-ID. SampleProgram.\n"
        "DATA DIVISION.\n"
        "WORKING-STORAGE SECTION.\n"
        "* OMITTED CODE *\n"
        "PROCEDURE DIVISION.\n"
        "A000-INITIALIZE-PARA.\n"
        "* OMITTED CODE *\n"
        "A100-PROCESS-PARA.\n"
        "* OMITTED CODE *\n"
    )
    actual = CobolSegmenter(EXAMPLE_CODE).simplify_code()
    # Surrounding whitespace differs between fixture and output; compare stripped.
    assert actual.strip() == expected.strip()

View File

@@ -0,0 +1,48 @@
import unittest
import pytest
from langchain_community.document_loaders.parsers.language.javascript import (
JavaScriptSegmenter,
)
@pytest.mark.requires("esprima")
class TestJavaScriptSegmenter(unittest.TestCase):
    """Exercise JavaScriptSegmenter against a small fixture program."""

    def setUp(self) -> None:
        """Define the JS fixture and the outputs expected from it."""
        self.example_code = """const os = require('os');
function hello(text) {
  console.log(text);
}
class Simple {
  constructor() {
    this.a = 1;
  }
}
hello("Hello!");"""
        self.expected_simplified_code = """const os = require('os');
// Code for: function hello(text) {
// Code for: class Simple {
hello("Hello!");"""
        self.expected_extracted_code = [
            "function hello(text) {\n  console.log(text);\n}",
            "class Simple {\n  constructor() {\n    this.a = 1;\n  }\n}",
        ]

    def test_extract_functions_classes(self) -> None:
        """The function and the class are extracted verbatim."""
        segmenter = JavaScriptSegmenter(self.example_code)
        self.assertEqual(
            segmenter.extract_functions_classes(), self.expected_extracted_code
        )

    def test_simplify_code(self) -> None:
        """Definitions collapse to '// Code for:' placeholder comments."""
        segmenter = JavaScriptSegmenter(self.example_code)
        self.assertEqual(segmenter.simplify_code(), self.expected_simplified_code)

View File

@@ -0,0 +1,40 @@
import unittest
from langchain_community.document_loaders.parsers.language.python import PythonSegmenter
class TestPythonSegmenter(unittest.TestCase):
    """Exercise PythonSegmenter on a small fixture module."""

    def setUp(self) -> None:
        """Define the Python fixture and the outputs expected from it."""
        self.example_code = """import os
def hello(text):
    print(text)
class Simple:
    def __init__(self):
        self.a = 1
hello("Hello!")"""
        self.expected_simplified_code = """import os
# Code for: def hello(text):
# Code for: class Simple:
hello("Hello!")"""
        self.expected_extracted_code = [
            "def hello(text):\n    print(text)",
            "class Simple:\n    def __init__(self):\n        self.a = 1",
        ]

    def test_extract_functions_classes(self) -> None:
        """The function and the class are extracted verbatim."""
        segmenter = PythonSegmenter(self.example_code)
        self.assertEqual(
            segmenter.extract_functions_classes(), self.expected_extracted_code
        )

    def test_simplify_code(self) -> None:
        """Definitions collapse to '# Code for:' placeholder comments."""
        segmenter = PythonSegmenter(self.example_code)
        self.assertEqual(segmenter.simplify_code(), self.expected_simplified_code)

View File

@@ -0,0 +1,95 @@
"""Module to test generic parsers."""
from typing import Iterator
import pytest
from langchain_core.documents import Document
from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers.generic import MimeTypeBasedParser
class TestMimeBasedParser:
    """Tests for routing blobs to sub-parsers by mime type."""

    def test_without_fallback_parser(self) -> None:
        """Known mime types are dispatched; unknown ones raise ValueError."""

        class FirstCharParser(BaseBlobParser):
            def lazy_parse(self, blob: Blob) -> Iterator[Document]:
                """Extract the first character of a blob."""
                yield Document(page_content=blob.as_string()[0])

        class SecondCharParser(BaseBlobParser):
            def lazy_parse(self, blob: Blob) -> Iterator[Document]:
                """Extract the second character of a blob."""
                yield Document(page_content=blob.as_string()[1])

        parser = MimeTypeBasedParser(
            handlers={
                "text/plain": FirstCharParser(),
                "text/html": SecondCharParser(),
            },
        )

        # text/plain is routed to FirstCharParser.
        docs = parser.parse(Blob(data=b"Hello World", mimetype="text/plain"))
        assert len(docs) == 1
        assert docs[0].page_content == "H"

        # text/html is routed to SecondCharParser.
        docs = parser.parse(Blob(data=b"Hello World", mimetype="text/html"))
        assert len(docs) == 1
        assert docs[0].page_content == "e"

        # With no fallback configured, an unknown mime type is an error.
        unknown_blob = Blob(data=b"Hello World", mimetype="text/csv")
        with pytest.raises(ValueError, match="Unsupported mime type"):
            parser.parse(unknown_blob)

    def test_with_fallback_parser(self) -> None:
        """Unknown mime types are routed to the configured fallback parser."""

        class FirstCharParser(BaseBlobParser):
            def lazy_parse(self, blob: Blob) -> Iterator[Document]:
                """Extract the first character of a blob."""
                yield Document(page_content=blob.as_string()[0])

        class SecondCharParser(BaseBlobParser):
            def lazy_parse(self, blob: Blob) -> Iterator[Document]:
                """Extract the second character of a blob."""
                yield Document(page_content=blob.as_string()[1])

        class ThirdCharParser(BaseBlobParser):
            def lazy_parse(self, blob: Blob) -> Iterator[Document]:
                """Extract the third character of a blob."""
                yield Document(page_content=blob.as_string()[2])

        parser = MimeTypeBasedParser(
            handlers={
                "text/plain": FirstCharParser(),
                "text/html": SecondCharParser(),
            },
            fallback_parser=ThirdCharParser(),
        )

        # Registered handlers still win over the fallback.
        docs = parser.parse(Blob(data=b"Hello World", mimetype="text/plain"))
        assert len(docs) == 1
        assert docs[0].page_content == "H"

        docs = parser.parse(Blob(data=b"Hello World", mimetype="text/html"))
        assert len(docs) == 1
        assert docs[0].page_content == "e"

        # Unregistered mime type falls through to ThirdCharParser.
        docs = parser.parse(Blob(data=b"Hello World", mimetype="text/csv"))
        assert len(docs) == 1
        assert docs[0].page_content == "l"

View File

@@ -0,0 +1,28 @@
"""Tests for the HTML parsers."""
from pathlib import Path
import pytest
from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers.html import BS4HTMLParser
# Directory containing this test file.
HERE = Path(__file__).parent
# HTML fixtures are shared with the integration tests' examples directory.
EXAMPLES = HERE.parent.parent.parent / "integration_tests" / "examples"
@pytest.mark.requires("bs4", "lxml")
def test_bs_html_loader() -> None:
    """Parse the example HTML file and check metadata and separator output."""
    file_path = EXAMPLES / "example.html"
    parser = BS4HTMLParser(get_text_separator="|")
    docs = list(parser.lazy_parse(Blob.from_path(file_path)))
    assert isinstance(docs, list)
    assert len(docs) == 1
    doc = docs[0]
    # The parser records the page title and the originating path in metadata.
    assert doc.metadata["title"] == "Chew dad's slippers"
    assert doc.metadata["source"] == str(file_path)
    # The custom text separator shows up in the extracted content.
    assert doc.page_content[:2] == "\n|"

View File

@@ -0,0 +1,96 @@
"""Tests for the various PDF parsers."""
from pathlib import Path
from typing import Iterator
import pytest
from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers.pdf import (
PDFMinerParser,
PyMuPDFParser,
PyPDFium2Parser,
PyPDFParser,
)
# Root of the test tree: three directory levels above this file's directory.
_THIS_DIR = Path(__file__).parents[3]
_EXAMPLES_DIR = _THIS_DIR / "examples"
# Paths to test PDF files
HELLO_PDF = _EXAMPLES_DIR / "hello.pdf"  # one-page "Hello world!" document
LAYOUT_PARSER_PAPER_PDF = _EXAMPLES_DIR / "layout-parser-paper.pdf"  # 16-page paper
def _assert_with_parser(parser: BaseBlobParser, splits_by_page: bool = True) -> None:
    """Run the standard checks against ``parser``.

    Args:
        parser: The blob parser under test.
        splits_by_page: Whether the parser emits one document per page
            by default.
    """
    # A one-page "Hello world!" document always parses to exactly one doc.
    hello_blob = Blob.from_path(HELLO_PDF)
    produced = parser.lazy_parse(hello_blob)
    assert isinstance(produced, Iterator)
    produced = list(produced)
    assert len(produced) == 1
    assert isinstance(produced[0].page_content, str)
    # Parsers differ in surrounding whitespace, so only check the prefix.
    assert produced[0].page_content.startswith("Hello world!")

    # A 16-page paper: page-splitting parsers yield 16 docs, others just 1.
    paper_blob = Blob.from_path(LAYOUT_PARSER_PAPER_PDF)
    produced = parser.lazy_parse(paper_blob)
    assert isinstance(produced, Iterator)
    produced = list(produced)
    assert len(produced) == (16 if splits_by_page else 1)
    # Each parser yields slightly different text for this page, so only
    # check for a token that all of them preserve.
    assert "LayoutParser" in produced[0].page_content
    first_meta = produced[0].metadata
    assert first_meta["source"] == str(LAYOUT_PARSER_PAPER_PDF)
    if splits_by_page:
        assert int(first_meta["page"]) == 0
@pytest.mark.requires("pypdf")
def test_pypdf_parser() -> None:
    """PyPDFParser passes the standard checks with default per-page splitting."""
    parser = PyPDFParser()
    _assert_with_parser(parser)
@pytest.mark.requires("pdfminer")
def test_pdfminer_parser() -> None:
    """PDFMinerParser emits the whole PDF as a single document by default."""
    parser = PDFMinerParser()
    _assert_with_parser(parser, splits_by_page=False)
@pytest.mark.requires("fitz")  # package is PyMuPDF
def test_pymupdf_loader() -> None:
    """PyMuPDFParser passes the standard checks with default per-page splitting."""
    parser = PyMuPDFParser()
    _assert_with_parser(parser)
@pytest.mark.requires("pypdfium2")
def test_pypdfium2_parser() -> None:
    """PyPDFium2Parser passes the standard checks."""
    # NOTE(review): the original comment claimed this parser does not split
    # by page, yet splits_by_page is left at its True default here — confirm
    # which is intended.
    parser = PyPDFium2Parser()
    _assert_with_parser(parser)
@pytest.mark.requires("rapidocr_onnxruntime")
def test_extract_images_text_from_pdf() -> None:
    """With extract_images=True, every parser still passes the standard checks."""
    for parser in (
        PyPDFParser(extract_images=True),
        PDFMinerParser(extract_images=True),
        PyMuPDFParser(extract_images=True),
        PyPDFium2Parser(extract_images=True),
    ):
        _assert_with_parser(parser)

View File

@@ -0,0 +1,17 @@
from langchain_community.document_loaders.parsers import __all__
def test_parsers_public_api_correct() -> None:
    """Guard the parsers package's public surface against accidental changes."""
    expected = {
        "BS4HTMLParser",
        "DocAIParser",
        "GrobidParser",
        "LanguageParser",
        "OpenAIWhisperParser",
        "PyPDFParser",
        "PDFMinerParser",
        "PyMuPDFParser",
        "PyPDFium2Parser",
        "PDFPlumberParser",
    }
    assert set(__all__) == expected