mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-16 23:13:31 +00:00
Harrison/document cleanup (#2062)
Co-authored-by: Delip Rao <delip@users.noreply.github.com>
This commit is contained in:
@@ -1,59 +0,0 @@
|
||||
"""Test document functionality."""
|
||||
from langchain.docstore.document import Document
|
||||
|
||||
_PAGE_CONTENT = """This is a page about LangChain.
|
||||
|
||||
It is a really cool framework.
|
||||
|
||||
What isn't there to love about langchain?
|
||||
|
||||
Made in 2022."""
|
||||
|
||||
|
||||
def test_document_summary() -> None:
    """Check that the summary of the sample page is its opening line."""
    expected_summary = "This is a page about LangChain."
    document = Document(page_content=_PAGE_CONTENT)
    assert document.summary == expected_summary
|
||||
|
||||
|
||||
def test_document_lookup() -> None:
    """Walk through a sequence of lookups on one document.

    The lookups run in order against the same ``Document`` instance, because
    the expected result counters depend on which term was looked up before.
    """
    document = Document(page_content=_PAGE_CONTENT)

    # (term, expected output) pairs, executed strictly in this order.
    lookup_steps = [
        # Start with lookup on "LangChain".
        ("LangChain", "(Result 1/2) This is a page about LangChain."),
        # Now switch to looking up "framework".
        ("framework", "(Result 1/1) It is a really cool framework."),
        # Switching back to "LangChain" should reset to the first mention.
        ("LangChain", "(Result 1/2) This is a page about LangChain."),
        # Looking up "LangChain" again should move on to the next mention.
        ("LangChain", "(Result 2/2) What isn't there to love about langchain?"),
    ]

    for term, expected in lookup_steps:
        output = document.lookup(term)
        assert output == expected
|
||||
|
||||
|
||||
def test_document_lookups_dont_exist() -> None:
    """Check the output when the looked-up term is absent from the page."""
    document = Document(page_content=_PAGE_CONTENT)

    # "harrison" does not occur anywhere in the sample page.
    missing_term = "harrison"
    assert document.lookup(missing_term) == "No Results"
|
||||
|
||||
|
||||
def test_document_lookups_too_many() -> None:
    """Check the output when a term is looked up more often than it occurs."""
    document = Document(page_content=_PAGE_CONTENT)
    term = "framework"

    # The first lookup returns the only mention of the term.
    first_output = document.lookup(term)
    assert first_output == "(Result 1/1) It is a really cool framework."

    # A second lookup of the same term has nothing left to return.
    second_output = document.lookup(term)
    assert second_output == "No More Results"
|
43
tests/unit_tests/document_loader/test_dataframe.py
Normal file
43
tests/unit_tests/document_loader/test_dataframe.py
Normal file
@@ -0,0 +1,43 @@
|
||||
import pandas as pd
|
||||
import pytest
|
||||
|
||||
from langchain.document_loaders import DataFrameLoader
|
||||
from langchain.schema import Document
|
||||
|
||||
|
||||
@pytest.fixture
def sample_data_frame() -> pd.DataFrame:
    """Provide a two-row frame with "text", "author" and "date" columns."""
    return pd.DataFrame(
        {
            "text": ["Hello", "World"],
            "author": ["Alice", "Bob"],
            "date": ["2022-01-01", "2022-01-02"],
        }
    )
|
||||
|
||||
|
||||
def test_load_returns_list_of_documents(sample_data_frame: pd.DataFrame) -> None:
    """Check that ``load`` yields a list holding one ``Document`` per row."""
    loaded = DataFrameLoader(sample_data_frame).load()
    assert isinstance(loaded, list)
    for item in loaded:
        assert isinstance(item, Document)
    # The fixture frame has exactly two rows.
    assert len(loaded) == 2
|
||||
|
||||
|
||||
def test_load_converts_dataframe_columns_to_document_metadata(
    sample_data_frame: pd.DataFrame,
) -> None:
    """Check that "author" and "date" values land in each document's metadata.

    Each loaded document is compared against the frame row at the same
    position, using the fixture's default integer index labels.
    """
    loader = DataFrameLoader(sample_data_frame)
    docs = loader.load()

    # Guard against a vacuous pass: if no documents came back, the loop below
    # would never execute a single assertion.
    assert len(docs) == len(sample_data_frame)

    for i, doc in enumerate(docs):
        for column in ("author", "date"):
            assert doc.metadata[column] == sample_data_frame.loc[i, column]
|
||||
|
||||
|
||||
def test_load_uses_page_content_column_to_create_document_text(
    sample_data_frame: pd.DataFrame,
) -> None:
    """Check that ``page_content_column`` selects the column used as text."""
    content_column = "dummy_test_column"
    # Move the text to a differently named column so the loader has to be
    # told explicitly where the page content lives.
    renamed_frame = sample_data_frame.rename(columns={"text": content_column})

    loader = DataFrameLoader(renamed_frame, page_content_column=content_column)
    first_doc, second_doc = loader.load()[:2]

    assert first_doc.page_content == "Hello"
    assert second_doc.page_content == "World"
|
Reference in New Issue
Block a user