From 02ef9164b5091ffe2459786cbd8832024f350c82 Mon Sep 17 00:00:00 2001
From: Bagatur <22008038+baskaryan@users.noreply.github.com>
Date: Thu, 8 Feb 2024 16:07:18 -0800
Subject: [PATCH] langchain[patch]: expose cohere rerank score, add parent doc
 param (#16887)

---
 .../vectorstores/elasticsearch.py             |  2 +-
 .../chains/query_constructor/__init__.py      |  3 +
 .../chains/query_constructor/base.py          |  3 +-
 .../document_compressors/cohere_rerank.py     | 73 ++++++++++++-------
 .../retrievers/parent_document_retriever.py   | 23 ++++--
 .../langchain_openai/chat_models/base.py      | 18 ++---
 6 files changed, 76 insertions(+), 46 deletions(-)

diff --git a/libs/community/langchain_community/vectorstores/elasticsearch.py b/libs/community/langchain_community/vectorstores/elasticsearch.py
index 2f8a9e2469b..7b6eef40fea 100644
--- a/libs/community/langchain_community/vectorstores/elasticsearch.py
+++ b/libs/community/langchain_community/vectorstores/elasticsearch.py
@@ -484,8 +484,8 @@ class ElasticsearchStore(VectorStore):
             from langchain_community.vectorstores.utils import DistanceStrategy

             vectorstore = ElasticsearchStore(
+                "langchain-demo",
                 embedding=OpenAIEmbeddings(),
-                index_name="langchain-demo",
                 es_url="http://localhost:9200",
                 distance_strategy="DOT_PRODUCT"
             )
diff --git a/libs/langchain/langchain/chains/query_constructor/__init__.py b/libs/langchain/langchain/chains/query_constructor/__init__.py
index e69de29bb2d..9d08ca0e080 100644
--- a/libs/langchain/langchain/chains/query_constructor/__init__.py
+++ b/libs/langchain/langchain/chains/query_constructor/__init__.py
@@ -0,0 +1,3 @@
+from langchain.chains.query_constructor.base import load_query_constructor_runnable
+
+__all__ = ["load_query_constructor_runnable"]
diff --git a/libs/langchain/langchain/chains/query_constructor/base.py b/libs/langchain/langchain/chains/query_constructor/base.py
index d99c046abfa..c08e74f20da 100644
--- a/libs/langchain/langchain/chains/query_constructor/base.py
+++ b/libs/langchain/langchain/chains/query_constructor/base.py
@@ -323,7 +323,8 @@ def load_query_constructor_runnable(

     Args:
         llm: BaseLanguageModel to use for the chain.
-        document_contents: The contents of the document to be queried.
+        document_contents: Description of the page contents of the document to be
+            queried.
         attribute_info: Sequence of attributes in the document.
         examples: Optional list of examples to use for the chain.
         allowed_comparators: Sequence of allowed comparators. Defaults to all
diff --git a/libs/langchain/langchain/retrievers/document_compressors/cohere_rerank.py b/libs/langchain/langchain/retrievers/document_compressors/cohere_rerank.py
index c767f86bde6..b36ea305c78 100644
--- a/libs/langchain/langchain/retrievers/document_compressors/cohere_rerank.py
+++ b/libs/langchain/langchain/retrievers/document_compressors/cohere_rerank.py
@@ -1,6 +1,7 @@
 from __future__ import annotations

-from typing import TYPE_CHECKING, Dict, Optional, Sequence
+from copy import deepcopy
+from typing import Any, Dict, List, Optional, Sequence, Union

 from langchain_core.documents import Document
 from langchain_core.pydantic_v1 import Extra, root_validator
@@ -9,23 +10,13 @@ from langchain.callbacks.manager import Callbacks
 from langchain.retrievers.document_compressors.base import BaseDocumentCompressor
 from langchain.utils import get_from_dict_or_env

-if TYPE_CHECKING:
-    from cohere import Client
-else:
-    # We do to avoid pydantic annotation issues when actually instantiating
-    # while keeping this import optional
-    try:
-        from cohere import Client
-    except ImportError:
-        pass
-

 class CohereRerank(BaseDocumentCompressor):
     """Document compressor that uses `Cohere Rerank API`."""

-    client: Client
+    client: Any
     """Cohere client to use for compressing documents."""
-    top_n: int = 3
+    top_n: Optional[int] = 3
     """Number of documents to return."""
     model: str = "rerank-english-v2.0"
     """Model to use for reranking."""
@@ -57,6 +48,42 @@ class CohereRerank(BaseDocumentCompressor):
         values["client"] = cohere.Client(cohere_api_key, client_name=client_name)
         return values

+    def rerank(
+        self,
+        documents: Sequence[Union[str, Document, dict]],
+        query: str,
+        *,
+        model: Optional[str] = None,
+        top_n: Optional[int] = -1,
+        max_chunks_per_doc: Optional[int] = None,
+    ) -> List[Dict[str, Any]]:
+        """Returns an ordered list of documents sorted by their relevance to the provided query.
+
+        Args:
+            query: The query to use for reranking.
+            documents: A sequence of documents to rerank.
+            model: The model to use for re-ranking. Defaults to self.model.
+            top_n: The number of results to return. If None returns all results.
+                Defaults to self.top_n.
+            max_chunks_per_doc: The maximum number of chunks derived from a document.
+        """  # noqa: E501
+        if len(documents) == 0:  # to avoid empty api call
+            return []
+        docs = [
+            doc.page_content if isinstance(doc, Document) else doc for doc in documents
+        ]
+        model = model or self.model
+        top_n = top_n if (top_n is None or top_n > 0) else self.top_n
+        results = self.client.rerank(
+            query, docs, model, top_n=top_n, max_chunks_per_doc=max_chunks_per_doc
+        )
+        result_dicts = []
+        for res in results:
+            result_dicts.append(
+                {"index": res.index, "relevance_score": res.relevance_score}
+            )
+        return result_dicts
+
     def compress_documents(
         self,
         documents: Sequence[Document],
@@ -74,16 +101,10 @@ class CohereRerank(BaseDocumentCompressor):
         Returns:
             A sequence of compressed documents.
""" - if len(documents) == 0: # to avoid empty api call - return [] - doc_list = list(documents) - _docs = [d.page_content for d in doc_list] - results = self.client.rerank( - model=self.model, query=query, documents=_docs, top_n=self.top_n - ) - final_results = [] - for r in results: - doc = doc_list[r.index] - doc.metadata["relevance_score"] = r.relevance_score - final_results.append(doc) - return final_results + compressed = [] + for res in self.rerank(documents, query): + doc = documents[res["index"]] + doc_copy = Document(doc.page_content, metadata=deepcopy(doc.metadata)) + doc_copy.metadata["relevance_score"] = res["relevance_score"] + compressed.append(doc_copy) + return compressed diff --git a/libs/langchain/langchain/retrievers/parent_document_retriever.py b/libs/langchain/langchain/retrievers/parent_document_retriever.py index 86e4cef1868..5095c13540e 100644 --- a/libs/langchain/langchain/retrievers/parent_document_retriever.py +++ b/libs/langchain/langchain/retrievers/parent_document_retriever.py @@ -1,5 +1,5 @@ import uuid -from typing import List, Optional +from typing import List, Optional, Sequence from langchain_core.documents import Document @@ -31,17 +31,16 @@ class ParentDocumentRetriever(MultiVectorRetriever): .. code-block:: python - # Imports - from langchain_community.vectorstores import Chroma from langchain_community.embeddings import OpenAIEmbeddings + from langchain_community.vectorstores import Chroma from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain.storage import InMemoryStore # This text splitter is used to create the parent documents - parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000) + parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, add_start_index=True) # This text splitter is used to create the child documents # It should create documents smaller than the parent - child_splitter = RecursiveCharacterTextSplitter(chunk_size=400) + child_splitter = RecursiveCharacterTextSplitter(chunk_size=400, add_start_index=True) # The vectorstore to use to index the child chunks vectorstore = Chroma(embedding_function=OpenAIEmbeddings()) # The storage layer for the parent documents @@ -54,7 +53,7 @@ class ParentDocumentRetriever(MultiVectorRetriever): child_splitter=child_splitter, parent_splitter=parent_splitter, ) - """ + """ # noqa: E501 child_splitter: TextSplitter """The text splitter to use to create child documents.""" @@ -65,6 +64,11 @@ class ParentDocumentRetriever(MultiVectorRetriever): """The text splitter to use to create parent documents. If none, then the parent documents will be the raw documents passed in.""" + child_metadata_fields: Optional[Sequence[str]] = None + """Metadata fields to leave in child documents. If None, leave all parent document + metadata. + """ + def add_documents( self, documents: List[Document], @@ -76,7 +80,7 @@ class ParentDocumentRetriever(MultiVectorRetriever): Args: documents: List of documents to add ids: Optional list of ids for documents. If provided should be the same - length as the list of documents. Can provided if parent documents + length as the list of documents. Can be provided if parent documents are already in the document store and you don't want to re-add to the docstore. If not provided, random UUIDs will be used as ids. 
@@ -106,6 +110,11 @@ class ParentDocumentRetriever(MultiVectorRetriever):
         for i, doc in enumerate(documents):
             _id = doc_ids[i]
             sub_docs = self.child_splitter.split_documents([doc])
+            if self.child_metadata_fields is not None:
+                for _doc in sub_docs:
+                    _doc.metadata = {
+                        k: _doc.metadata[k] for k in self.child_metadata_fields
+                    }
             for _doc in sub_docs:
                 _doc.metadata[self.id_key] = _id
             docs.extend(sub_docs)
diff --git a/libs/partners/openai/langchain_openai/chat_models/base.py b/libs/partners/openai/langchain_openai/chat_models/base.py
index ed33b919bdd..fc1430e2425 100644
--- a/libs/partners/openai/langchain_openai/chat_models/base.py
+++ b/libs/partners/openai/langchain_openai/chat_models/base.py
@@ -649,7 +649,7 @@ class ChatOpenAI(BaseChatModel):
                 Must be the name of the single provided function or
                 "auto" to automatically determine which function to call
                 (if any).
-            kwargs: Any additional parameters to pass to the
+            **kwargs: Any additional parameters to pass to the
                 :class:`~langchain.runnable.Runnable` constructor.
         """
@@ -701,22 +701,21 @@ class ChatOpenAI(BaseChatModel):
                 "auto" to automatically determine which function to call
                 (if any), or a dict of the form:
                 {"type": "function", "function": {"name": <>}}.
-            kwargs: Any additional parameters to pass to the
+            **kwargs: Any additional parameters to pass to the
                 :class:`~langchain.runnable.Runnable` constructor.
         """
         formatted_tools = [convert_to_openai_tool(tool) for tool in tools]
         if tool_choice is not None:
-            if isinstance(tool_choice, str) and tool_choice not in ("auto", "none"):
+            if isinstance(tool_choice, str) and (tool_choice not in ("auto", "none")):
                 tool_choice = {"type": "function", "function": {"name": tool_choice}}
-            if isinstance(tool_choice, dict) and len(formatted_tools) != 1:
+            if isinstance(tool_choice, dict) and (len(formatted_tools) != 1):
                 raise ValueError(
                     "When specifying `tool_choice`, you must provide exactly one "
                     f"tool. Received {len(formatted_tools)} tools."
                 )
-            if (
-                isinstance(tool_choice, dict)
-                and formatted_tools[0]["function"]["name"]
+            if isinstance(tool_choice, dict) and (
+                formatted_tools[0]["function"]["name"]
                 != tool_choice["function"]["name"]
             ):
                 raise ValueError(
                     f"Tool choice {tool_choice} was specified, but the only "
                     f"provided tool was {formatted_tools[0]['function']['name']}."
                 )
             kwargs["tool_choice"] = tool_choice
-        return super().bind(
-            tools=formatted_tools,
-            **kwargs,
-        )
+        return super().bind(tools=formatted_tools, **kwargs)
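
A minimal usage sketch of the reranking API this patch introduces. The query and
document texts are illustrative, and it assumes the `cohere` package is installed
with COHERE_API_KEY set in the environment:

    from langchain_core.documents import Document

    from langchain.retrievers.document_compressors import CohereRerank

    reranker = CohereRerank(top_n=2)  # needs `pip install cohere` + COHERE_API_KEY
    docs = [
        Document(page_content="Paris is the capital of France."),
        Document(page_content="The Eiffel Tower is in Paris."),
        Document(page_content="Penguins live in Antarctica."),
    ]

    # rerank() returns [{"index": ..., "relevance_score": ...}, ...] sorted by score.
    scores = reranker.rerank(docs, "Where is the Eiffel Tower?")

    # compress_documents() now copies each selected Document and records the score
    # in metadata, instead of mutating the caller's documents in place.
    for doc in reranker.compress_documents(docs, "Where is the Eiffel Tower?"):
        print(doc.metadata["relevance_score"], doc.page_content)

The deepcopy-based Document copies are what make the score exposure safe: the
previous implementation wrote relevance_score into the original documents' metadata.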
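
And a sketch of the new child_metadata_fields parameter on ParentDocumentRetriever.
The "source" metadata key and the embedding setup are assumptions for illustration;
note the dict comprehension in add_documents raises KeyError if a parent document
lacks a listed key:

    from langchain_community.embeddings import OpenAIEmbeddings
    from langchain_community.vectorstores import Chroma
    from langchain_core.documents import Document

    from langchain.retrievers import ParentDocumentRetriever
    from langchain.storage import InMemoryStore
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    retriever = ParentDocumentRetriever(
        vectorstore=Chroma(embedding_function=OpenAIEmbeddings()),
        docstore=InMemoryStore(),
        child_splitter=RecursiveCharacterTextSplitter(chunk_size=400),
        # Keep only "source" on child chunks; all other parent metadata is dropped.
        child_metadata_fields=["source"],
    )
    retriever.add_documents(
        [
            Document(
                page_content="Some parent document long enough to be split.",
                metadata={"source": "faq.md", "editor": "jo"},
            )
        ]
    )
    # Indexed child chunks now carry only {"source": "faq.md", "doc_id": <parent id>};
    # "doc_id" is the default id_key linking each chunk back to its parent.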