tfidf retriever (#5114)

Co-authored-by: vempaliakhil96 <vempaliakhil96@gmail.com>
2025-09-15 22:44:36 +00:00 · 2023-05-24 10:02:09 -07:00
parent b00c77dc62
commit 2b2176a3c1
5 changed files with 80 additions and 26 deletions
--- a/tests/integration_tests/retrievers/test_tfidf.py
+++ b/tests/integration_tests/retrievers/test_tfidf.py
@@ -1,6 +1,10 @@
+import pytest
+
 from langchain.retrievers.tfidf import TFIDFRetriever
+from langchain.schema import Document


+@pytest.mark.requires("sklearn")
 def test_from_texts() -> None:
    input_texts = ["I have a pen.", "Do you have a pen?", "I have a bag."]
    tfidf_retriever = TFIDFRetriever.from_texts(texts=input_texts)
@@ -8,6 +12,7 @@ def test_from_texts() -> None:
    assert tfidf_retriever.tfidf_array.toarray().shape == (3, 5)


+@pytest.mark.requires("sklearn")
 def test_from_texts_with_tfidf_params() -> None:
    input_texts = ["I have a pen.", "Do you have a pen?", "I have a bag."]
    tfidf_retriever = TFIDFRetriever.from_texts(
@@ -15,3 +20,15 @@ def test_from_texts_with_tfidf_params() -> None:
    )
    # should count only multiple words (have, pen)
    assert tfidf_retriever.tfidf_array.toarray().shape == (3, 2)
+
+
+@pytest.mark.requires("sklearn")
+def test_from_documents() -> None:
+    input_docs = [
+        Document(page_content="I have a pen."),
+        Document(page_content="Do you have a pen?"),
+        Document(page_content="I have a bag."),
+    ]
+    tfidf_retriever = TFIDFRetriever.from_documents(documents=input_docs)
+    assert len(tfidf_retriever.docs) == 3
+    assert tfidf_retriever.tfidf_array.toarray().shape == (3, 5)