mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-15 22:44:36 +00:00
community[major], core[patch], langchain[patch], experimental[patch]: Create langchain-community (#14463)
Moved the following modules to new package langchain-community in a backwards compatible fashion: ``` mv langchain/langchain/adapters community/langchain_community mv langchain/langchain/callbacks community/langchain_community/callbacks mv langchain/langchain/chat_loaders community/langchain_community mv langchain/langchain/chat_models community/langchain_community mv langchain/langchain/document_loaders community/langchain_community mv langchain/langchain/docstore community/langchain_community mv langchain/langchain/document_transformers community/langchain_community mv langchain/langchain/embeddings community/langchain_community mv langchain/langchain/graphs community/langchain_community mv langchain/langchain/llms community/langchain_community mv langchain/langchain/memory/chat_message_histories community/langchain_community mv langchain/langchain/retrievers community/langchain_community mv langchain/langchain/storage community/langchain_community mv langchain/langchain/tools community/langchain_community mv langchain/langchain/utilities community/langchain_community mv langchain/langchain/vectorstores community/langchain_community mv langchain/langchain/agents/agent_toolkits community/langchain_community mv langchain/langchain/cache.py community/langchain_community mv langchain/langchain/adapters community/langchain_community mv langchain/langchain/callbacks community/langchain_community/callbacks mv langchain/langchain/chat_loaders community/langchain_community mv langchain/langchain/chat_models community/langchain_community mv langchain/langchain/document_loaders community/langchain_community mv langchain/langchain/docstore community/langchain_community mv langchain/langchain/document_transformers community/langchain_community mv langchain/langchain/embeddings community/langchain_community mv langchain/langchain/graphs community/langchain_community mv langchain/langchain/llms community/langchain_community mv langchain/langchain/memory/chat_message_histories 
community/langchain_community mv langchain/langchain/retrievers community/langchain_community mv langchain/langchain/storage community/langchain_community mv langchain/langchain/tools community/langchain_community mv langchain/langchain/utilities community/langchain_community mv langchain/langchain/vectorstores community/langchain_community mv langchain/langchain/agents/agent_toolkits community/langchain_community mv langchain/langchain/cache.py community/langchain_community ``` Moved the following to core ``` mv langchain/langchain/utils/json_schema.py core/langchain_core/utils mv langchain/langchain/utils/html.py core/langchain_core/utils mv langchain/langchain/utils/strings.py core/langchain_core/utils cat langchain/langchain/utils/env.py >> core/langchain_core/utils/env.py rm langchain/langchain/utils/env.py ``` See .scripts/community_split/script_integrations.sh for all changes
This commit is contained in:
@@ -0,0 +1,49 @@
|
||||
from langchain_community.document_loaders.parsers.language.cobol import CobolSegmenter
|
||||
|
||||
EXAMPLE_CODE = """
|
||||
IDENTIFICATION DIVISION.
|
||||
PROGRAM-ID. SampleProgram.
|
||||
DATA DIVISION.
|
||||
WORKING-STORAGE SECTION.
|
||||
01 SAMPLE-VAR PIC X(20) VALUE 'Sample Value'.
|
||||
|
||||
PROCEDURE DIVISION.
|
||||
A000-INITIALIZE-PARA.
|
||||
DISPLAY 'Initialization Paragraph'.
|
||||
MOVE 'New Value' TO SAMPLE-VAR.
|
||||
|
||||
A100-PROCESS-PARA.
|
||||
DISPLAY SAMPLE-VAR.
|
||||
STOP RUN.
|
||||
"""
|
||||
|
||||
|
||||
def test_extract_functions_classes() -> None:
    """Each COBOL paragraph should be extracted as one self-contained unit."""
    # Expected: the two PROCEDURE DIVISION paragraphs, each with its full body.
    expected = [
        "A000-INITIALIZE-PARA.\n    "
        "DISPLAY 'Initialization Paragraph'.\n    "
        "MOVE 'New Value' TO SAMPLE-VAR.",
        "A100-PROCESS-PARA.\n    DISPLAY SAMPLE-VAR.\n    STOP RUN.",
    ]
    assert CobolSegmenter(EXAMPLE_CODE).extract_functions_classes() == expected
|
||||
|
||||
|
||||
def test_simplify_code() -> None:
    """Simplification keeps division/paragraph headers and omits their bodies."""
    # One entry per surviving line; paragraph bodies collapse to the
    # "* OMITTED CODE *" marker.
    expected_lines = [
        "IDENTIFICATION DIVISION.",
        "PROGRAM-ID. SampleProgram.",
        "DATA DIVISION.",
        "WORKING-STORAGE SECTION.",
        "* OMITTED CODE *",
        "PROCEDURE DIVISION.",
        "A000-INITIALIZE-PARA.",
        "* OMITTED CODE *",
        "A100-PROCESS-PARA.",
        "* OMITTED CODE *",
    ]
    expected_simplified_code = "\n".join(expected_lines) + "\n"
    segmenter = CobolSegmenter(EXAMPLE_CODE)
    assert segmenter.simplify_code().strip() == expected_simplified_code.strip()
|
@@ -0,0 +1,48 @@
|
||||
import unittest
|
||||
|
||||
import pytest
|
||||
|
||||
from langchain_community.document_loaders.parsers.language.javascript import (
|
||||
JavaScriptSegmenter,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.requires("esprima")
class TestJavaScriptSegmenter(unittest.TestCase):
    """Tests for the esprima-based JavaScript code segmenter."""

    def setUp(self) -> None:
        """Build a small JS fixture plus its expected segmenter outputs."""
        # Fixture: a require() statement, a function, a class, and a call.
        # NOTE(review): indentation inside these string fixtures was mangled
        # in transit — confirm the 2-space JS indent against the segmenter.
        self.example_code = """const os = require('os');

function hello(text) {
  console.log(text);
}

class Simple {
  constructor() {
    this.a = 1;
  }
}

hello("Hello!");"""

        # Simplified form: definitions collapsed to "// Code for:" stubs,
        # top-level statements kept verbatim.
        self.expected_simplified_code = """const os = require('os');

// Code for: function hello(text) {

// Code for: class Simple {

hello("Hello!");"""

        # Extracted form: the full source of each function/class definition.
        self.expected_extracted_code = [
            "function hello(text) {\n  console.log(text);\n}",
            "class Simple {\n  constructor() {\n    this.a = 1;\n  }\n}",
        ]

    def test_extract_functions_classes(self) -> None:
        """Functions and classes are extracted with their full bodies."""
        segmenter = JavaScriptSegmenter(self.example_code)
        extracted_code = segmenter.extract_functions_classes()
        self.assertEqual(extracted_code, self.expected_extracted_code)

    def test_simplify_code(self) -> None:
        """Definitions are replaced by '// Code for:' placeholder comments."""
        segmenter = JavaScriptSegmenter(self.example_code)
        simplified_code = segmenter.simplify_code()
        self.assertEqual(simplified_code, self.expected_simplified_code)
|
@@ -0,0 +1,40 @@
|
||||
import unittest
|
||||
|
||||
from langchain_community.document_loaders.parsers.language.python import PythonSegmenter
|
||||
|
||||
|
||||
class TestPythonSegmenter(unittest.TestCase):
    """Tests for the Python code segmenter."""

    def setUp(self) -> None:
        """Build a small Python fixture plus its expected segmenter outputs."""
        # Fixture: an import, a function, a class, and a top-level call.
        # NOTE(review): indentation inside these string fixtures was mangled
        # in transit — confirm the 4-space indent against the segmenter.
        self.example_code = """import os

def hello(text):
    print(text)

class Simple:
    def __init__(self):
        self.a = 1

hello("Hello!")"""

        # Simplified form: definitions collapsed to "# Code for:" stubs,
        # top-level statements kept verbatim.
        self.expected_simplified_code = """import os

# Code for: def hello(text):

# Code for: class Simple:

hello("Hello!")"""

        # Extracted form: the full source of each function/class definition.
        self.expected_extracted_code = [
            "def hello(text):\n" "    print(text)",
            "class Simple:\n" "    def __init__(self):\n" "        self.a = 1",
        ]

    def test_extract_functions_classes(self) -> None:
        """Functions and classes are extracted with their full bodies."""
        segmenter = PythonSegmenter(self.example_code)
        extracted_code = segmenter.extract_functions_classes()
        self.assertEqual(extracted_code, self.expected_extracted_code)

    def test_simplify_code(self) -> None:
        """Definitions are replaced by '# Code for:' placeholder comments."""
        segmenter = PythonSegmenter(self.example_code)
        simplified_code = segmenter.simplify_code()
        self.assertEqual(simplified_code, self.expected_simplified_code)
|
@@ -0,0 +1,95 @@
|
||||
"""Module to test generic parsers."""
|
||||
|
||||
from typing import Iterator
|
||||
|
||||
import pytest
|
||||
from langchain_core.documents import Document
|
||||
|
||||
from langchain_community.document_loaders.base import BaseBlobParser
|
||||
from langchain_community.document_loaders.blob_loaders import Blob
|
||||
from langchain_community.document_loaders.parsers.generic import MimeTypeBasedParser
|
||||
|
||||
|
||||
class TestMimeBasedParser:
    """Tests for MimeTypeBasedParser handler dispatch."""

    def test_without_fallback_parser(self) -> None:
        """Unregistered mimetypes must raise when no fallback parser is set."""

        class PickFirstChar(BaseBlobParser):
            def lazy_parse(self, blob: Blob) -> Iterator[Document]:
                """Yield a document containing the blob's first character."""
                yield Document(page_content=blob.as_string()[0])

        class PickSecondChar(BaseBlobParser):
            def lazy_parse(self, blob: Blob) -> Iterator[Document]:
                """Yield a document containing the blob's second character."""
                yield Document(page_content=blob.as_string()[1])

        parser = MimeTypeBasedParser(
            handlers={
                "text/plain": PickFirstChar(),
                "text/html": PickSecondChar(),
            },
        )

        # text/plain dispatches to the first-char handler ("H").
        plain_docs = parser.parse(Blob(data=b"Hello World", mimetype="text/plain"))
        assert len(plain_docs) == 1
        assert plain_docs[0].page_content == "H"

        # text/html dispatches to the second-char handler ("e").
        html_docs = parser.parse(Blob(data=b"Hello World", mimetype="text/html"))
        assert len(html_docs) == 1
        assert html_docs[0].page_content == "e"

        # No handler is registered for text/csv and no fallback exists,
        # so parsing must fail with an unsupported-mime-type error.
        csv_blob = Blob(data=b"Hello World", mimetype="text/csv")
        with pytest.raises(ValueError, match="Unsupported mime type"):
            parser.parse(csv_blob)

    def test_with_fallback_parser(self) -> None:
        """The fallback parser handles mimetypes without a dedicated handler."""

        class PickFirstChar(BaseBlobParser):
            def lazy_parse(self, blob: Blob) -> Iterator[Document]:
                """Yield a document containing the blob's first character."""
                yield Document(page_content=blob.as_string()[0])

        class PickSecondChar(BaseBlobParser):
            def lazy_parse(self, blob: Blob) -> Iterator[Document]:
                """Yield a document containing the blob's second character."""
                yield Document(page_content=blob.as_string()[1])

        class PickThirdChar(BaseBlobParser):
            def lazy_parse(self, blob: Blob) -> Iterator[Document]:
                """Yield a document containing the blob's third character."""
                yield Document(page_content=blob.as_string()[2])

        parser = MimeTypeBasedParser(
            handlers={
                "text/plain": PickFirstChar(),
                "text/html": PickSecondChar(),
            },
            fallback_parser=PickThirdChar(),
        )

        # Registered handlers win over the fallback.
        plain_docs = parser.parse(Blob(data=b"Hello World", mimetype="text/plain"))
        assert len(plain_docs) == 1
        assert plain_docs[0].page_content == "H"

        html_docs = parser.parse(Blob(data=b"Hello World", mimetype="text/html"))
        assert len(html_docs) == 1
        assert html_docs[0].page_content == "e"

        # Unregistered mimetype falls through to the fallback parser,
        # which yields "l" (the third character of "Hello World").
        csv_docs = parser.parse(Blob(data=b"Hello World", mimetype="text/csv"))
        assert len(csv_docs) == 1
        assert csv_docs[0].page_content == "l"
|
@@ -0,0 +1,28 @@
|
||||
"""Tests for the HTML parsers."""
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from langchain_community.document_loaders.blob_loaders import Blob
|
||||
from langchain_community.document_loaders.parsers.html import BS4HTMLParser
|
||||
|
||||
# Directory containing this test file.
HERE = Path(__file__).parent
# Shared example fixtures live under the integration-test examples folder,
# three directories above this one.
EXAMPLES = HERE.parent.parent.parent / "integration_tests" / "examples"
|
||||
|
||||
|
||||
@pytest.mark.requires("bs4", "lxml")
def test_bs_html_loader() -> None:
    """Parse the example HTML file and check its metadata and text output."""
    html_path = EXAMPLES / "example.html"
    parser = BS4HTMLParser(get_text_separator="|")
    parsed = list(parser.lazy_parse(Blob.from_path(html_path)))

    # Exactly one document comes out of a single HTML blob.
    assert isinstance(parsed, list)
    assert len(parsed) == 1

    document = parsed[0]
    # Title comes from the HTML <title>; source is the originating file path.
    assert document.metadata["title"] == "Chew dad's slippers"
    assert document.metadata["source"] == str(html_path)
    # The configured "|" separator appears between extracted text fragments.
    assert document.page_content[:2] == "\n|"
|
@@ -0,0 +1,96 @@
|
||||
"""Tests for the various PDF parsers."""
|
||||
from pathlib import Path
|
||||
from typing import Iterator
|
||||
|
||||
import pytest
|
||||
|
||||
from langchain_community.document_loaders.base import BaseBlobParser
|
||||
from langchain_community.document_loaders.blob_loaders import Blob
|
||||
from langchain_community.document_loaders.parsers.pdf import (
|
||||
PDFMinerParser,
|
||||
PyMuPDFParser,
|
||||
PyPDFium2Parser,
|
||||
PyPDFParser,
|
||||
)
|
||||
|
||||
# Ancestor directory (parents[3] of this file) that contains the shared
# examples folder.
_THIS_DIR = Path(__file__).parents[3]

_EXAMPLES_DIR = _THIS_DIR / "examples"

# Paths to test PDF files
HELLO_PDF = _EXAMPLES_DIR / "hello.pdf"  # single page starting "Hello world!"
LAYOUT_PARSER_PAPER_PDF = _EXAMPLES_DIR / "layout-parser-paper.pdf"  # 16 pages
|
||||
|
||||
|
||||
def _assert_with_parser(parser: BaseBlobParser, splits_by_page: bool = True) -> None:
    """Standard tests to verify that the given parser works.

    Args:
        parser: The parser to test.
        splits_by_page: Whether the parser splits by page or not by default.
    """
    # Single-page sanity check against the "hello" fixture.
    lazy_docs = parser.lazy_parse(Blob.from_path(HELLO_PDF))
    assert isinstance(lazy_docs, Iterator)
    hello_docs = list(lazy_docs)
    assert len(hello_docs) == 1
    assert isinstance(hello_docs[0].page_content, str)
    # Parsers differ in trailing whitespace, so only pin the prefix.
    assert hello_docs[0].page_content.startswith("Hello world!")

    # Multi-page document: one doc per page, or one doc for the whole file.
    lazy_docs = parser.lazy_parse(Blob.from_path(LAYOUT_PARSER_PAPER_PDF))
    assert isinstance(lazy_docs, Iterator)
    paper_docs = list(lazy_docs)
    assert len(paper_docs) == (16 if splits_by_page else 1)
    # Parsers yield slightly different text for this page, so only check
    # that a known keyword survived extraction.
    assert "LayoutParser" in paper_docs[0].page_content

    first_meta = paper_docs[0].metadata
    assert first_meta["source"] == str(LAYOUT_PARSER_PAPER_PDF)
    if splits_by_page:
        # Page numbering starts at 0 for page-splitting parsers.
        assert int(first_meta["page"]) == 0
|
||||
|
||||
|
||||
@pytest.mark.requires("pypdf")
def test_pypdf_parser() -> None:
    """Test PyPDF parser."""
    # Uses the default splits_by_page=True expectation (one doc per page).
    _assert_with_parser(PyPDFParser())
|
||||
|
||||
|
||||
@pytest.mark.requires("pdfminer")
def test_pdfminer_parser() -> None:
    """Test PDFMiner parser."""
    # PDFMiner yields the whole document as a single doc, so disable the
    # default one-doc-per-page expectation.
    _assert_with_parser(PDFMinerParser(), splits_by_page=False)
|
||||
|
||||
|
||||
@pytest.mark.requires("fitz")  # package is PyMuPDF
def test_pymupdf_loader() -> None:
    """Test PyMuPDF loader."""
    # Uses the default splits_by_page=True expectation (one doc per page).
    _assert_with_parser(PyMuPDFParser())
|
||||
|
||||
|
||||
@pytest.mark.requires("pypdfium2")
def test_pypdfium2_parser() -> None:
    """Test PyPDFium2 parser."""
    # NOTE(review): this call uses the default splits_by_page=True, so the
    # original "does not follow defaults to split by page" comment was a
    # stale copy-paste from the PDFMiner test — removed.
    _assert_with_parser(PyPDFium2Parser())
|
||||
|
||||
|
||||
@pytest.mark.requires("rapidocr_onnxruntime")
def test_extract_images_text_from_pdf() -> None:
    """Test extracting images from a PDF and recognizing text with RapidOCR."""
    # Run the standard parser checks with image extraction (OCR) enabled
    # for every supported PDF parser variant.
    _assert_with_parser(PyPDFParser(extract_images=True))
    _assert_with_parser(PDFMinerParser(extract_images=True))
    _assert_with_parser(PyMuPDFParser(extract_images=True))
    _assert_with_parser(PyPDFium2Parser(extract_images=True))
|
@@ -0,0 +1,17 @@
|
||||
from langchain_community.document_loaders.parsers import __all__
|
||||
|
||||
|
||||
def test_parsers_public_api_correct() -> None:
    """Test public API of parsers for breaking changes."""
    # Snapshot of the exported names; any drift is a breaking API change.
    expected_exports = {
        "BS4HTMLParser",
        "DocAIParser",
        "GrobidParser",
        "LanguageParser",
        "OpenAIWhisperParser",
        "PyPDFParser",
        "PDFMinerParser",
        "PyMuPDFParser",
        "PyPDFium2Parser",
        "PDFPlumberParser",
    }
    assert set(__all__) == expected_exports
|
Reference in New Issue
Block a user