From 02ef9164b5091ffe2459786cbd8832024f350c82 Mon Sep 17 00:00:00 2001
From: Bagatur <22008038+baskaryan@users.noreply.github.com>
Date: Thu, 8 Feb 2024 16:07:18 -0800
Subject: [PATCH] langchain[patch]: expose cohere rerank score, add parent doc
 param (#16887)

---
 .../vectorstores/elasticsearch.py             |  2 +-
 .../chains/query_constructor/__init__.py      |  3 +
 .../chains/query_constructor/base.py          |  3 +-
 .../document_compressors/cohere_rerank.py     | 73 ++++++++++++-------
 .../retrievers/parent_document_retriever.py   | 23 ++++--
 .../langchain_openai/chat_models/base.py      | 18 ++---
 6 files changed, 76 insertions(+), 46 deletions(-)

diff --git a/libs/community/langchain_community/vectorstores/elasticsearch.py b/libs/community/langchain_community/vectorstores/elasticsearch.py
index 2f8a9e2469b..7b6eef40fea 100644
--- a/libs/community/langchain_community/vectorstores/elasticsearch.py
+++ b/libs/community/langchain_community/vectorstores/elasticsearch.py
@@ -484,8 +484,8 @@ class ElasticsearchStore(VectorStore):
             from langchain_community.vectorstores.utils import DistanceStrategy

             vectorstore = ElasticsearchStore(
+                "langchain-demo",
                 embedding=OpenAIEmbeddings(),
-                index_name="langchain-demo",
                 es_url="http://localhost:9200",
                 distance_strategy="DOT_PRODUCT"
             )
diff --git a/libs/langchain/langchain/chains/query_constructor/__init__.py b/libs/langchain/langchain/chains/query_constructor/__init__.py
index e69de29bb2d..9d08ca0e080 100644
--- a/libs/langchain/langchain/chains/query_constructor/__init__.py
+++ b/libs/langchain/langchain/chains/query_constructor/__init__.py
@@ -0,0 +1,3 @@
+from langchain.chains.query_constructor.base import load_query_constructor_runnable
+
+__all__ = ["load_query_constructor_runnable"]
diff --git a/libs/langchain/langchain/chains/query_constructor/base.py b/libs/langchain/langchain/chains/query_constructor/base.py
index d99c046abfa..c08e74f20da 100644
--- a/libs/langchain/langchain/chains/query_constructor/base.py
+++ b/libs/langchain/langchain/chains/query_constructor/base.py
@@ -323,7 +323,8 @@ def load_query_constructor_runnable(

     Args:
         llm: BaseLanguageModel to use for the chain.
-        document_contents: The contents of the document to be queried.
+        document_contents: Description of the page contents of the document to be
+            queried.
         attribute_info: Sequence of attributes in the document.
         examples: Optional list of examples to use for the chain.
         allowed_comparators: Sequence of allowed comparators. Defaults to all
diff --git a/libs/langchain/langchain/retrievers/document_compressors/cohere_rerank.py b/libs/langchain/langchain/retrievers/document_compressors/cohere_rerank.py
index c767f86bde6..b36ea305c78 100644
--- a/libs/langchain/langchain/retrievers/document_compressors/cohere_rerank.py
+++ b/libs/langchain/langchain/retrievers/document_compressors/cohere_rerank.py
@@ -1,6 +1,7 @@
 from __future__ import annotations

-from typing import TYPE_CHECKING, Dict, Optional, Sequence
+from copy import deepcopy
+from typing import Any, Dict, List, Optional, Sequence, Union

 from langchain_core.documents import Document
 from langchain_core.pydantic_v1 import Extra, root_validator
@@ -9,23 +10,13 @@ from langchain.callbacks.manager import Callbacks
 from langchain.retrievers.document_compressors.base import BaseDocumentCompressor
 from langchain.utils import get_from_dict_or_env

-if TYPE_CHECKING:
-    from cohere import Client
-else:
-    # We do to avoid pydantic annotation issues when actually instantiating
-    # while keeping this import optional
-    try:
-        from cohere import Client
-    except ImportError:
-        pass
-

 class CohereRerank(BaseDocumentCompressor):
     """Document compressor that uses `Cohere Rerank API`."""

-    client: Client
+    client: Any
     """Cohere client to use for compressing documents."""
-    top_n: int = 3
+    top_n: Optional[int] = 3
     """Number of documents to return."""
     model: str = "rerank-english-v2.0"
     """Model to use for reranking."""
@@ -57,6 +48,42 @@ class CohereRerank(BaseDocumentCompressor):
         values["client"] = cohere.Client(cohere_api_key, client_name=client_name)
         return values

+    def rerank(
+        self,
+        documents: Sequence[Union[str, Document, dict]],
+        query: str,
+        *,
+        model: Optional[str] = None,
+        top_n: Optional[int] = -1,
+        max_chunks_per_doc: Optional[int] = None,
+    ) -> List[Dict[str, Any]]:
+        """Returns an ordered list of documents sorted by their relevance to the provided query.
+
+        Args:
+            query: The query to use for reranking.
+            documents: A sequence of documents to rerank.
+            model: The model to use for re-ranking. Defaults to self.model.
+            top_n: The number of results to return. If None returns all results.
+                Defaults to self.top_n.
+            max_chunks_per_doc: The maximum number of chunks derived from a document.
+        """  # noqa: E501
+        if len(documents) == 0:  # to avoid empty api call
+            return []
+        docs = [
+            doc.page_content if isinstance(doc, Document) else doc for doc in documents
+        ]
+        model = model or self.model
+        top_n = top_n if (top_n is None or top_n > 0) else self.top_n
+        results = self.client.rerank(
+            query, docs, model, top_n=top_n, max_chunks_per_doc=max_chunks_per_doc
+        )
+        result_dicts = []
+        for res in results:
+            result_dicts.append(
+                {"index": res.index, "relevance_score": res.relevance_score}
+            )
+        return result_dicts
+
     def compress_documents(
         self,
         documents: Sequence[Document],
@@ -74,16 +101,10 @@ class CohereRerank(BaseDocumentCompressor):
         Returns:
             A sequence of compressed documents.
""" - if len(documents) == 0: # to avoid empty api call - return [] - doc_list = list(documents) - _docs = [d.page_content for d in doc_list] - results = self.client.rerank( - model=self.model, query=query, documents=_docs, top_n=self.top_n - ) - final_results = [] - for r in results: - doc = doc_list[r.index] - doc.metadata["relevance_score"] = r.relevance_score - final_results.append(doc) - return final_results + compressed = [] + for res in self.rerank(documents, query): + doc = documents[res["index"]] + doc_copy = Document(doc.page_content, metadata=deepcopy(doc.metadata)) + doc_copy.metadata["relevance_score"] = res["relevance_score"] + compressed.append(doc_copy) + return compressed diff --git a/libs/langchain/langchain/retrievers/parent_document_retriever.py b/libs/langchain/langchain/retrievers/parent_document_retriever.py index 86e4cef1868..5095c13540e 100644 --- a/libs/langchain/langchain/retrievers/parent_document_retriever.py +++ b/libs/langchain/langchain/retrievers/parent_document_retriever.py @@ -1,5 +1,5 @@ import uuid -from typing import List, Optional +from typing import List, Optional, Sequence from langchain_core.documents import Document @@ -31,17 +31,16 @@ class ParentDocumentRetriever(MultiVectorRetriever): .. code-block:: python - # Imports - from langchain_community.vectorstores import Chroma from langchain_community.embeddings import OpenAIEmbeddings + from langchain_community.vectorstores import Chroma from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain.storage import InMemoryStore # This text splitter is used to create the parent documents - parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000) + parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, add_start_index=True) # This text splitter is used to create the child documents # It should create documents smaller than the parent - child_splitter = RecursiveCharacterTextSplitter(chunk_size=400) + child_splitter = RecursiveCharacterTextSplitter(chunk_size=400, add_start_index=True) # The vectorstore to use to index the child chunks vectorstore = Chroma(embedding_function=OpenAIEmbeddings()) # The storage layer for the parent documents @@ -54,7 +53,7 @@ class ParentDocumentRetriever(MultiVectorRetriever): child_splitter=child_splitter, parent_splitter=parent_splitter, ) - """ + """ # noqa: E501 child_splitter: TextSplitter """The text splitter to use to create child documents.""" @@ -65,6 +64,11 @@ class ParentDocumentRetriever(MultiVectorRetriever): """The text splitter to use to create parent documents. If none, then the parent documents will be the raw documents passed in.""" + child_metadata_fields: Optional[Sequence[str]] = None + """Metadata fields to leave in child documents. If None, leave all parent document + metadata. + """ + def add_documents( self, documents: List[Document], @@ -76,7 +80,7 @@ class ParentDocumentRetriever(MultiVectorRetriever): Args: documents: List of documents to add ids: Optional list of ids for documents. If provided should be the same - length as the list of documents. Can provided if parent documents + length as the list of documents. Can be provided if parent documents are already in the document store and you don't want to re-add to the docstore. If not provided, random UUIDs will be used as ids. 
@@ -106,6 +110,11 @@ class ParentDocumentRetriever(MultiVectorRetriever):
         for i, doc in enumerate(documents):
             _id = doc_ids[i]
             sub_docs = self.child_splitter.split_documents([doc])
+            if self.child_metadata_fields is not None:
+                for _doc in sub_docs:
+                    _doc.metadata = {
+                        k: _doc.metadata[k] for k in self.child_metadata_fields
+                    }
             for _doc in sub_docs:
                 _doc.metadata[self.id_key] = _id
             docs.extend(sub_docs)
diff --git a/libs/partners/openai/langchain_openai/chat_models/base.py b/libs/partners/openai/langchain_openai/chat_models/base.py
index ed33b919bdd..fc1430e2425 100644
--- a/libs/partners/openai/langchain_openai/chat_models/base.py
+++ b/libs/partners/openai/langchain_openai/chat_models/base.py
@@ -649,7 +649,7 @@ class ChatOpenAI(BaseChatModel):
                 Must be the name of the single provided function or
                 "auto" to automatically determine which function to call
                 (if any).
-            kwargs: Any additional parameters to pass to the
+            **kwargs: Any additional parameters to pass to the
                 :class:`~langchain.runnable.Runnable` constructor.
         """
@@ -701,22 +701,21 @@ class ChatOpenAI(BaseChatModel):
                 "auto" to automatically determine which function to call
                 (if any), or a dict of the form:
                 {"type": "function", "function": {"name": <>}}.
-            kwargs: Any additional parameters to pass to the
+            **kwargs: Any additional parameters to pass to the
                 :class:`~langchain.runnable.Runnable` constructor.
         """
         formatted_tools = [convert_to_openai_tool(tool) for tool in tools]
         if tool_choice is not None:
-            if isinstance(tool_choice, str) and tool_choice not in ("auto", "none"):
+            if isinstance(tool_choice, str) and (tool_choice not in ("auto", "none")):
                 tool_choice = {"type": "function", "function": {"name": tool_choice}}
-            if isinstance(tool_choice, dict) and len(formatted_tools) != 1:
+            if isinstance(tool_choice, dict) and (len(formatted_tools) != 1):
                 raise ValueError(
                     "When specifying `tool_choice`, you must provide exactly one "
                     f"tool. Received {len(formatted_tools)} tools."
                 )
-            if (
-                isinstance(tool_choice, dict)
-                and formatted_tools[0]["function"]["name"]
+            if isinstance(tool_choice, dict) and (
+                formatted_tools[0]["function"]["name"]
                 != tool_choice["function"]["name"]
             ):
                 raise ValueError(
                     f"Tool choice {tool_choice} was specified, but the only "
                     f"provided tool was {formatted_tools[0]['function']['name']}."
                 )
             kwargs["tool_choice"] = tool_choice
-        return super().bind(
-            tools=formatted_tools,
-            **kwargs,
-        )
+        return super().bind(tools=formatted_tools, **kwargs)
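
A minimal usage sketch of the reranking API this patch introduces. The query and
document texts are illustrative, and it assumes the `cohere` package is installed
with COHERE_API_KEY set in the environment:

    from langchain_core.documents import Document

    from langchain.retrievers.document_compressors import CohereRerank

    reranker = CohereRerank(top_n=2)  # needs `pip install cohere` + COHERE_API_KEY
    docs = [
        Document(page_content="Paris is the capital of France."),
        Document(page_content="The Eiffel Tower is in Paris."),
        Document(page_content="Penguins live in Antarctica."),
    ]

    # rerank() returns [{"index": ..., "relevance_score": ...}, ...] sorted by score.
    scores = reranker.rerank(docs, "Where is the Eiffel Tower?")

    # compress_documents() now copies each selected Document and records the score
    # in metadata, instead of mutating the caller's documents in place.
    for doc in reranker.compress_documents(docs, "Where is the Eiffel Tower?"):
        print(doc.metadata["relevance_score"], doc.page_content)

The deepcopy-based Document copies are what make the score exposure safe: the
previous implementation wrote relevance_score into the original documents' metadata.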
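
And a sketch of the new child_metadata_fields parameter on ParentDocumentRetriever.
The "source" metadata key and the embedding setup are assumptions for illustration;
note the dict comprehension in add_documents raises KeyError if a parent document
lacks a listed key:

    from langchain_community.embeddings import OpenAIEmbeddings
    from langchain_community.vectorstores import Chroma
    from langchain_core.documents import Document

    from langchain.retrievers import ParentDocumentRetriever
    from langchain.storage import InMemoryStore
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    retriever = ParentDocumentRetriever(
        vectorstore=Chroma(embedding_function=OpenAIEmbeddings()),
        docstore=InMemoryStore(),
        child_splitter=RecursiveCharacterTextSplitter(chunk_size=400),
        # Keep only "source" on child chunks; all other parent metadata is dropped.
        child_metadata_fields=["source"],
    )
    retriever.add_documents(
        [
            Document(
                page_content="Some parent document long enough to be split.",
                metadata={"source": "faq.md", "editor": "jo"},
            )
        ]
    )
    # Indexed child chunks now carry only {"source": "faq.md", "doc_id": <parent id>};
    # "doc_id" is the default id_key linking each chunk back to its parent.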