diff --git a/docs/modules/indexes/retrievers/examples/contextual-compression.ipynb b/docs/modules/indexes/retrievers/examples/contextual-compression.ipynb
index 1aba01f27f9..c804e43aaec 100644
--- a/docs/modules/indexes/retrievers/examples/contextual-compression.ipynb
+++ b/docs/modules/indexes/retrievers/examples/contextual-compression.ipynb
@@ -12,7 +12,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 2,
    "id": "28e8dc12",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -32,7 +32,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 3,
    "id": "9fbcc58f",
    "metadata": {},
    "outputs": [
@@ -124,22 +124,22 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 4,
    "id": "9a658023",
    "metadata": {},
    "outputs": [],
    "source": [
     "from langchain.llms import OpenAI\n",
     "from langchain.retrievers import ContextualCompressionRetriever\n",
-    "from langchain.retrievers.document_filters import LLMChainDocumentCompressor\n",
+    "from langchain.retrievers.document_filters import LLMChainExtractionDocumentFilter\n",
     "\n",
-    "_filter = LLMChainDocumentCompressor.from_llm(OpenAI(temperature=0))\n",
+    "_filter = LLMChainExtractionDocumentFilter.from_llm(OpenAI(temperature=0))\n",
     "compression_retriever = ContextualCompressionRetriever(base_filter=_filter, base_retriever=retriever)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 5,
    "id": "398622c5",
    "metadata": {},
    "outputs": [
@@ -175,7 +175,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 6,
    "id": "2a150a63",
    "metadata": {},
    "outputs": [],
@@ -207,7 +207,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 7,
    "id": "3ceab64a",
    "metadata": {},
    "outputs": [
@@ -245,17 +245,9 @@
    "id": "87dcc583",
    "metadata": {},
    "source": [
-    "\n",
+    "# Results\n",
     "Here we create a sequence where we first split the initial documents into smaller documents, then we drop redundant documents, and finally we drop any documents not relevant to the query. The results aren't quite as good as the LLM-powered filter above, but we were able to do all this filtering much more quickly and cheaply by only using Embedding models."
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "fdb63b80",
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
diff --git a/langchain/retrievers/document_filters/__init__.py b/langchain/retrievers/document_filters/__init__.py
index 70413307e14..47f91e9d7b9 100644
--- a/langchain/retrievers/document_filters/__init__.py
+++ b/langchain/retrievers/document_filters/__init__.py
@@ -1,21 +1,23 @@
-from langchain.retrievers.document_filters.compression_chain import (
-    LLMChainDocumentCompressor,
+from langchain.retrievers.document_filters.chain_extract import (
+    LLMChainExtractionDocumentFilter,
 )
-from langchain.retrievers.document_filters.pipeline import DocumentFilterPipeline
-from langchain.retrievers.document_filters.redundant_embeddings import (
+from langchain.retrievers.document_filters.chain_relevant import (
+    LLMChainRelevancyDocumentFilter,
+)
+from langchain.retrievers.document_filters.embeddings_redundant import (
     EmbeddingRedundantDocumentFilter,
 )
-from langchain.retrievers.document_filters.relevant_chain import LLMChainDocumentFilter
-from langchain.retrievers.document_filters.relevant_embeddings import (
+from langchain.retrievers.document_filters.embeddings_relevant import (
     EmbeddingRelevancyDocumentFilter,
 )
+from langchain.retrievers.document_filters.pipeline import DocumentFilterPipeline
 from langchain.retrievers.document_filters.text_splitter import SplitterDocumentFilter

 __all__ = [
     "DocumentFilterPipeline",
     "EmbeddingRedundantDocumentFilter",
     "EmbeddingRelevancyDocumentFilter",
-    "LLMChainDocumentCompressor",
-    "LLMChainDocumentFilter",
+    "LLMChainExtractionDocumentFilter",
+    "LLMChainRelevancyDocumentFilter",
     "SplitterDocumentFilter",
 ]
diff --git a/langchain/retrievers/document_filters/compression_chain.py b/langchain/retrievers/document_filters/chain_extract.py
similarity index 93%
rename from langchain/retrievers/document_filters/compression_chain.py
rename to langchain/retrievers/document_filters/chain_extract.py
index a66b5828669..73acbfc46f0 100644
--- a/langchain/retrievers/document_filters/compression_chain.py
+++ b/langchain/retrievers/document_filters/chain_extract.py
@@ -6,7 +6,7 @@ from langchain.retrievers.document_filters.base import (
     BaseDocumentFilter,
     _RetrievedDocument,
 )
-from langchain.retrievers.document_filters.compression_chain_prompt import (
+from langchain.retrievers.document_filters.chain_extract_prompt import (
     prompt_template,
 )
 from langchain.schema import BaseLanguageModel, BaseOutputParser, Document
@@ -39,7 +39,7 @@ def _get_default_chain_prompt() -> PromptTemplate:
     )


-class LLMChainDocumentCompressor(BaseDocumentFilter):
+class LLMChainExtractionDocumentFilter(BaseDocumentFilter):
     llm_chain: LLMChain
     """LLM wrapper to use for compressing documents."""
@@ -72,7 +72,7 @@ class LLMChainDocumentCompressor(BaseDocumentFilter):
         llm: BaseLanguageModel,
         prompt: Optional[PromptTemplate] = None,
         get_input: Optional[Callable[[str, Document], str]] = None,
-    ) -> "LLMChainDocumentCompressor":
+    ) -> "LLMChainExtractionDocumentFilter":
         """Initialize from LLM."""
         _prompt = prompt if prompt is not None else _get_default_chain_prompt()
         _get_input = get_input if get_input is not None else default_get_input
diff --git a/langchain/retrievers/document_filters/compression_chain_prompt.py b/langchain/retrievers/document_filters/chain_extract_prompt.py
similarity index 100%
rename from langchain/retrievers/document_filters/compression_chain_prompt.py
rename to langchain/retrievers/document_filters/chain_extract_prompt.py
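For context, a minimal, self-contained sketch of the renamed extraction filter wired into a retriever, adapted from the notebook cells and the `from_llm` constructor shown above. The FAISS vector store, the embeddings model, and the sample texts are assumptions added here for illustration; they are not part of this diff.

```python
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_filters import LLMChainExtractionDocumentFilter
from langchain.vectorstores import FAISS

# Base retriever over a small in-memory vector store (assumed setup, not from this diff).
texts = [
    "The president nominated Ketanji Brown Jackson to the Supreme Court.",
    "The weather in San Francisco is usually mild.",
]
retriever = FAISS.from_texts(texts, OpenAIEmbeddings()).as_retriever()

# LLM-backed filter that extracts only the query-relevant parts of each retrieved document.
_filter = LLMChainExtractionDocumentFilter.from_llm(OpenAI(temperature=0))
compression_retriever = ContextualCompressionRetriever(
    base_filter=_filter, base_retriever=retriever
)

docs = compression_retriever.get_relevant_documents(
    "What did the president say about Ketanji Brown Jackson?"
)
for doc in docs:
    print(doc.page_content)
```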
diff --git a/langchain/retrievers/document_filters/relevant_chain.py b/langchain/retrievers/document_filters/chain_relevant.py
similarity index 92%
rename from langchain/retrievers/document_filters/relevant_chain.py
rename to langchain/retrievers/document_filters/chain_relevant.py
index 794e7fe6cb3..e8706db595a 100644
--- a/langchain/retrievers/document_filters/relevant_chain.py
+++ b/langchain/retrievers/document_filters/chain_relevant.py
@@ -7,7 +7,7 @@ from langchain.retrievers.document_filters.base import (
     BaseDocumentFilter,
     _RetrievedDocument,
 )
-from langchain.retrievers.document_filters.relevant_chain_prompt import prompt_template
+from langchain.retrievers.document_filters.chain_relevant_prompt import prompt_template
 from langchain.schema import BaseLanguageModel, Document
@@ -24,7 +24,7 @@ def default_get_input(query: str, doc: Document) -> Dict[str, Any]:
     return {"question": query, "context": doc.page_content}


-class LLMChainDocumentFilter(BaseDocumentFilter):
+class LLMChainRelevancyDocumentFilter(BaseDocumentFilter):
     """Filter that drops documents that aren't relevant to the query."""

     llm_chain: LLMChain
@@ -58,7 +58,7 @@ class LLMChainDocumentFilter(BaseDocumentFilter):
         llm: BaseLanguageModel,
         prompt: Optional[BasePromptTemplate] = None,
         **kwargs: Any
-    ) -> "LLMChainDocumentFilter":
+    ) -> "LLMChainRelevancyDocumentFilter":
         _prompt = prompt if prompt is not None else _get_default_chain_prompt()
         llm_chain = LLMChain(llm=llm, prompt=_prompt)
         return cls(llm_chain=llm_chain, **kwargs)
diff --git a/langchain/retrievers/document_filters/relevant_chain_prompt.py b/langchain/retrievers/document_filters/chain_relevant_prompt.py
similarity index 100%
rename from langchain/retrievers/document_filters/relevant_chain_prompt.py
rename to langchain/retrievers/document_filters/chain_relevant_prompt.py
diff --git a/langchain/retrievers/document_filters/redundant_embeddings.py b/langchain/retrievers/document_filters/embeddings_redundant.py
similarity index 100%
rename from langchain/retrievers/document_filters/redundant_embeddings.py
rename to langchain/retrievers/document_filters/embeddings_redundant.py
diff --git a/langchain/retrievers/document_filters/relevant_embeddings.py b/langchain/retrievers/document_filters/embeddings_relevant.py
similarity index 100%
rename from langchain/retrievers/document_filters/relevant_embeddings.py
rename to langchain/retrievers/document_filters/embeddings_relevant.py
diff --git a/tests/integration_tests/retrievers/document_filters/test_compression_chain.py b/tests/integration_tests/retrievers/document_filters/test_compression_chain.py
index 6e0bc470cdd..a83066d8ae6 100644
--- a/tests/integration_tests/retrievers/document_filters/test_compression_chain.py
+++ b/tests/integration_tests/retrievers/document_filters/test_compression_chain.py
@@ -1,6 +1,6 @@
 """Integration test for LLMChainCompressor."""
 from langchain.chat_models import ChatOpenAI
-from langchain.retrievers.document_filters import LLMChainDocumentCompressor
+from langchain.retrievers.document_filters import LLMChainExtractionDocumentFilter
 from langchain.retrievers.document_filters.base import _RetrievedDocument


@@ -14,7 +14,7 @@ def test_llm_chain_compressor_filter() -> None:
         "Let's go to Olive Garden!",
     ]
     doc = _RetrievedDocument(page_content=" ".join(texts))
-    compressor = LLMChainDocumentCompressor.from_llm(ChatOpenAI())
+    compressor = LLMChainExtractionDocumentFilter.from_llm(ChatOpenAI())
     actual = compressor.filter([doc], "Tell me about the Roman Empire")[0].page_content
     expected_returned = [0, 2, 4]
     expected_not_returned = [1, 3, 5]
@@ -29,6 +29,6 @@ def test_llm_chain_compressor_filter_empty() -> None:
         "Let's go to Olive Garden!",
     ]
     doc = _RetrievedDocument(page_content=" ".join(texts))
-    compressor = LLMChainDocumentCompressor.from_llm(ChatOpenAI())
+    compressor = LLMChainExtractionDocumentFilter.from_llm(ChatOpenAI())
     actual = compressor.filter([doc], "Tell me about the Roman Empire")
     assert len(actual) == 0
diff --git a/tests/integration_tests/retrievers/document_filters/test_relevant_chain.py b/tests/integration_tests/retrievers/document_filters/test_relevant_chain.py
index 877ab597c79..3cdd14ed671 100644
--- a/tests/integration_tests/retrievers/document_filters/test_relevant_chain.py
+++ b/tests/integration_tests/retrievers/document_filters/test_relevant_chain.py
@@ -1,6 +1,6 @@
 """Integration test for llm-based relevant doc filtering."""
 from langchain.chat_models import ChatOpenAI
-from langchain.retrievers.document_filters import LLMChainDocumentFilter
+from langchain.retrievers.document_filters import LLMChainRelevancyDocumentFilter
 from langchain.retrievers.document_filters.base import _RetrievedDocument


@@ -11,7 +11,7 @@ def test_llm_chain_document_filter() -> None:
         "My favorite color is green",
     ]
     docs = [_RetrievedDocument(page_content=t) for t in texts]
-    relevant_filter = LLMChainDocumentFilter.from_llm(llm=ChatOpenAI())
+    relevant_filter = LLMChainRelevancyDocumentFilter.from_llm(llm=ChatOpenAI())
     actual = relevant_filter.filter(docs, "Things I said related to food")
     assert len(actual) == 2
     assert len(set(texts[:2]).intersection([d.page_content for d in actual])) == 2
diff --git a/tests/unit_tests/retrievers/document_filters/test_redundant_embeddings.py b/tests/unit_tests/retrievers/document_filters/test_redundant_embeddings.py
index 594ee1e297a..24e70de5cf3 100644
--- a/tests/unit_tests/retrievers/document_filters/test_redundant_embeddings.py
+++ b/tests/unit_tests/retrievers/document_filters/test_redundant_embeddings.py
@@ -1,6 +1,6 @@
 """Unit tests for redundant embedding filtering."""
 from langchain.math_utils import cosine_similarity
-from langchain.retrievers.document_filters.redundant_embeddings import (
+from langchain.retrievers.document_filters.embeddings_redundant import (
     _filter_similar_embeddings,
 )
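As a companion to the integration tests above, here is a small sketch of calling the renamed relevancy filter directly, modeled on `test_llm_chain_document_filter`. The first two sample sentences are illustrative stand-ins (only the third appears in this diff), and a configured OpenAI API key is assumed.

```python
from langchain.chat_models import ChatOpenAI
from langchain.retrievers.document_filters import LLMChainRelevancyDocumentFilter
from langchain.retrievers.document_filters.base import _RetrievedDocument

# Wrap raw strings in the internal retrieved-document type the filters operate on.
texts = [
    "I had pasta with garlic bread for dinner.",          # illustrative, food-related
    "My favorite snack is popcorn with too much butter.",  # illustrative, food-related
    "My favorite color is green",
]
docs = [_RetrievedDocument(page_content=t) for t in texts]

# The LLM judges each document's relevance to the query; irrelevant ones are dropped.
relevant_filter = LLMChainRelevancyDocumentFilter.from_llm(llm=ChatOpenAI())
relevant_docs = relevant_filter.filter(docs, "Things I said related to food")
print([d.page_content for d in relevant_docs])
```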