Harrison/document cleanup (#2062)

Co-authored-by: Delip Rao <delip@users.noreply.github.com>
2025-09-16 23:13:31 +00:00 · 2023-03-27 16:32:55 -07:00
parent a0cd6672aa
commit 30e3b31b04
10 changed files with 545 additions and 259 deletions
--- a/tests/unit_tests/docstore/test_document.py
+++ b/tests/unit_tests/docstore/test_document.py
@@ -1,59 +0,0 @@
-"""Test document functionality."""
-from langchain.docstore.document import Document
-
-_PAGE_CONTENT = """This is a page about LangChain.
-
-It is a really cool framework.
-
-What isn't there to love about langchain?
-
-Made in 2022."""
-
-
-def test_document_summary() -> None:
-    """Test that we extract the summary okay."""
-    page = Document(page_content=_PAGE_CONTENT)
-    assert page.summary == "This is a page about LangChain."
-
-
-def test_document_lookup() -> None:
-    """Test that can lookup things okay."""
-    page = Document(page_content=_PAGE_CONTENT)
-
-    # Start with lookup on "LangChain".
-    output = page.lookup("LangChain")
-    assert output == "(Result 1/2) This is a page about LangChain."
-
-    # Now switch to looking up "framework".
-    output = page.lookup("framework")
-    assert output == "(Result 1/1) It is a really cool framework."
-
-    # Now switch back to looking up "LangChain", should reset.
-    output = page.lookup("LangChain")
-    assert output == "(Result 1/2) This is a page about LangChain."
-
-    # Lookup "LangChain" again, should go to the next mention.
-    output = page.lookup("LangChain")
-    assert output == "(Result 2/2) What isn't there to love about langchain?"
-
-
-def test_document_lookups_dont_exist() -> None:
-    """Test lookup on term that doesn't exist in the document."""
-    page = Document(page_content=_PAGE_CONTENT)
-
-    # Start with lookup on "harrison".
-    output = page.lookup("harrison")
-    assert output == "No Results"
-
-
-def test_document_lookups_too_many() -> None:
-    """Test lookup on term too many times."""
-    page = Document(page_content=_PAGE_CONTENT)
-
-    # Start with lookup on "framework".
-    output = page.lookup("framework")
-    assert output == "(Result 1/1) It is a really cool framework."
-
-    # Now try again, should be exhausted.
-    output = page.lookup("framework")
-    assert output == "No More Results"
--- a/tests/unit_tests/document_loader/test_dataframe.py
+++ b/tests/unit_tests/document_loader/test_dataframe.py
@@ -0,0 +1,43 @@
+import pandas as pd
+import pytest
+
+from langchain.document_loaders import DataFrameLoader
+from langchain.schema import Document
+
+
+@pytest.fixture
+def sample_data_frame() -> pd.DataFrame:
+    """Build a two-row frame with "text", "author" and "date" columns."""
+    texts = ["Hello", "World"]
+    authors = ["Alice", "Bob"]
+    dates = ["2022-01-01", "2022-01-02"]
+    frame = pd.DataFrame({"text": texts, "author": authors, "date": dates})
+    return frame
+
+
+def test_load_returns_list_of_documents(sample_data_frame: pd.DataFrame) -> None:
+    """load() returns a list of two Document objects for the two-row fixture."""
+    loaded = DataFrameLoader(sample_data_frame).load()
+    assert isinstance(loaded, list)
+    assert all(isinstance(item, Document) for item in loaded)
+    assert len(loaded) == 2
+
+
+def test_load_converts_dataframe_columns_to_document_metadata(
+    sample_data_frame: pd.DataFrame,
+) -> None:
+    """Each Document's metadata carries its row's "author" and "date" values."""
+    docs = DataFrameLoader(sample_data_frame).load()
+    for idx, doc in enumerate(docs):
+        for column in ("author", "date"):
+            assert doc.metadata[column] == sample_data_frame.loc[idx, column]
+
+
+def test_load_uses_page_content_column_to_create_document_text(
+    sample_data_frame: pd.DataFrame,
+) -> None:
+    """page_content is taken from the column named by page_content_column."""
+    column = "dummy_test_column"
+    renamed = sample_data_frame.rename(columns={"text": column})
+    docs = DataFrameLoader(renamed, page_content_column=column).load()
+    assert (docs[0].page_content, docs[1].page_content) == ("Hello", "World")