# langchain/libs/community/tests/unit_tests/retrievers/test_bm25.py

import pytest
from langchain_core.documents import Document

from langchain_community.retrievers.bm25 import BM25Retriever


@pytest.mark.requires("rank_bm25")
def test_from_texts() -> None:
    """Build a retriever from raw strings and inspect what it stored.

    Expects one stored document per input text, and a vectorizer ``doc_len``
    of ``[4, 5, 4]`` (the whitespace-separated word counts of the inputs).
    """
    corpus = ["I have a pen.", "Do you have a pen?", "I have a bag."]
    retriever = BM25Retriever.from_texts(texts=corpus)

    stored_docs = retriever.docs
    assert len(stored_docs) == 3

    lengths = retriever.vectorizer.doc_len
    assert lengths == [4, 5, 4]


@pytest.mark.requires("rank_bm25")
def test_from_texts_with_bm25_params() -> None:
    """Check that ``bm25_params`` are forwarded to the vectorizer.

    A non-default ``epsilon`` is passed through ``bm25_params`` and must be
    readable back from the retriever's vectorizer.
    """
    input_texts = ["I have a pen.", "Do you have a pen?", "I have a bag."]
    bm25_retriever = BM25Retriever.from_texts(
        texts=input_texts, bm25_params={"epsilon": 10}
    )
    # The value supplied via bm25_params should be set on the vectorizer.
    assert bm25_retriever.vectorizer.epsilon == 10


@pytest.mark.requires("rank_bm25")
def test_from_documents() -> None:
    """Build a retriever from ``Document`` objects and inspect what it stored.

    Expects one stored document per input document, and a vectorizer
    ``doc_len`` of ``[4, 5, 4]`` (the whitespace-separated word counts of the
    page contents).
    """
    contents = ("I have a pen.", "Do you have a pen?", "I have a bag.")
    documents = [Document(page_content=text) for text in contents]
    retriever = BM25Retriever.from_documents(documents=documents)

    stored_docs = retriever.docs
    assert len(stored_docs) == 3

    lengths = retriever.vectorizer.doc_len
    assert lengths == [4, 5, 4]


@pytest.mark.requires("rank_bm25")
def test_repr() -> None:
    """Check that the retriever's ``repr`` does not contain document text."""
    contents = ("I have a pen.", "Do you have a pen?", "I have a bag.")
    documents = [Document(page_content=text) for text in contents]
    retriever = BM25Retriever.from_documents(documents=documents)

    rendered = repr(retriever)
    assert "I have a pen" not in rendered


@pytest.mark.requires("rank_bm25")
def test_doc_id() -> None:
    """Check that document IDs survive ``BM25Retriever.from_documents``.

    Three corpora are covered: every document has an ID, no document has an
    ID, and only some documents have an ID. For each retriever the stored
    documents are compared against a complete ``page_content -> id`` mapping,
    so the test fails (rather than passing vacuously) if documents are
    missing, and it verifies the exact ID values rather than mere presence.
    """
    docs_with_ids = [
        Document(page_content="I have a pen.", id="1"),
        Document(page_content="Do you have a pen?", id="2"),
        Document(page_content="I have a bag.", id="3"),
    ]
    docs_without_ids = [
        Document(page_content="I have a pen."),
        Document(page_content="Do you have a pen?"),
        Document(page_content="I have a bag."),
    ]
    docs_with_some_ids = [
        Document(page_content="I have a pen.", id="1"),
        Document(page_content="Do you have a pen?"),
        Document(page_content="I have a bag.", id="3"),
    ]
    bm25_retriever_with_ids = BM25Retriever.from_documents(documents=docs_with_ids)
    bm25_retriever_without_ids = BM25Retriever.from_documents(
        documents=docs_without_ids
    )
    bm25_retriever_with_some_ids = BM25Retriever.from_documents(
        documents=docs_with_some_ids
    )

    # Every supplied ID must be kept, attached to the right document.
    assert {doc.page_content: doc.id for doc in bm25_retriever_with_ids.docs} == {
        "I have a pen.": "1",
        "Do you have a pen?": "2",
        "I have a bag.": "3",
    }
    # Documents created without an ID must not gain one.
    assert {doc.page_content: doc.id for doc in bm25_retriever_without_ids.docs} == {
        "I have a pen.": None,
        "Do you have a pen?": None,
        "I have a bag.": None,
    }
    # Mixed input: present IDs are kept, absent ones stay None.
    assert {
        doc.page_content: doc.id for doc in bm25_retriever_with_some_ids.docs
    } == {
        "I have a pen.": "1",
        "Do you have a pen?": None,
        "I have a bag.": "3",
    }