core[patch]: Update documentation for base retriever (#20345)

Updating in-code documentation for base retriever to direct folks toward
the .invoke and .ainvoke methods, and to explain how to implement a custom retriever
This commit is contained in:
Eugene Yurtsev 2024-04-11 16:20:14 -04:00 committed by GitHub
parent d2f4153fe6
commit 2900720cd3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -51,12 +51,48 @@ RetrieverOutputLike = Runnable[Any, RetrieverOutput]
class BaseRetriever(RunnableSerializable[RetrieverInput, RetrieverOutput], ABC): class BaseRetriever(RunnableSerializable[RetrieverInput, RetrieverOutput], ABC):
"""Abstract base class for a Document retrieval system. """Abstract base class for a Document retrieval system.
A retrieval system is defined as something that can take string queries and return
the most 'relevant' Documents from some source.
Example: A retrieval system is defined as something that can take string queries and return
the most 'relevant' Documents from some source.
Usage:
A retriever follows the standard Runnable interface, and should be used
via the standard runnable methods of `invoke`, `ainvoke`, `batch`, `abatch`.
Implementation:
When implementing a custom retriever, the class should implement
the `_get_relevant_documents` method to define the logic for retrieving documents.
Optionally, an async native implementation can be provided by overriding the
`_aget_relevant_documents` method.
Example: A retriever that returns the first 5 documents from a list of documents
.. code-block:: python .. code-block:: python
from langchain_core.documents import Document
from langchain_core.retrievers import BaseRetriever
from typing import List
class SimpleRetriever(BaseRetriever):
docs: List[Document]
k: int = 5
def _get_relevant_documents(self, query: str) -> List[Document]:
\"\"\"Return the first k documents from the list of documents\"\"\"
return self.docs[:self.k]
async def _aget_relevant_documents(self, query: str) -> List[Document]:
\"\"\"(Optional) async native implementation.\"\"\"
return self.docs[:self.k]
Example: A simple retriever based on a scikit-learn vectorizer
.. code-block:: python
from sklearn.metrics.pairwise import cosine_similarity
class TFIDFRetriever(BaseRetriever, BaseModel): class TFIDFRetriever(BaseRetriever, BaseModel):
vectorizer: Any vectorizer: Any
docs: List[Document] docs: List[Document]
@ -66,9 +102,7 @@ class BaseRetriever(RunnableSerializable[RetrieverInput, RetrieverOutput], ABC):
class Config: class Config:
arbitrary_types_allowed = True arbitrary_types_allowed = True
def get_relevant_documents(self, query: str) -> List[Document]: def _get_relevant_documents(self, query: str) -> List[Document]:
from sklearn.metrics.pairwise import cosine_similarity
# Ip -- (n_docs,x), Op -- (n_docs,n_Feats) # Ip -- (n_docs,x), Op -- (n_docs,n_Feats)
query_vec = self.vectorizer.transform([query]) query_vec = self.vectorizer.transform([query])
# Op -- (n_docs,1) -- Cosine Sim with each doc # Op -- (n_docs,1) -- Cosine Sim with each doc
@ -137,6 +171,24 @@ class BaseRetriever(RunnableSerializable[RetrieverInput, RetrieverOutput], ABC):
def invoke( def invoke(
self, input: str, config: Optional[RunnableConfig] = None, **kwargs: Any self, input: str, config: Optional[RunnableConfig] = None, **kwargs: Any
) -> List[Document]: ) -> List[Document]:
"""Invoke the retriever to get relevant documents.
Main entry point for synchronous retriever invocations.
Args:
input: The query string
config: Configuration for the retriever
**kwargs: Additional arguments to pass to the retriever
Returns:
List of relevant documents
Examples:
.. code-block:: python
retriever.invoke("query")
"""
config = ensure_config(config) config = ensure_config(config)
return self.get_relevant_documents( return self.get_relevant_documents(
input, input,
@ -153,6 +205,24 @@ class BaseRetriever(RunnableSerializable[RetrieverInput, RetrieverOutput], ABC):
config: Optional[RunnableConfig] = None, config: Optional[RunnableConfig] = None,
**kwargs: Any, **kwargs: Any,
) -> List[Document]: ) -> List[Document]:
"""Asynchronously invoke the retriever to get relevant documents.
Main entry point for asynchronous retriever invocations.
Args:
input: The query string
config: Configuration for the retriever
**kwargs: Additional arguments to pass to the retriever
Returns:
List of relevant documents
Examples:
.. code-block:: python
await retriever.ainvoke("query")
"""
config = ensure_config(config) config = ensure_config(config)
return await self.aget_relevant_documents( return await self.aget_relevant_documents(
input, input,
@ -203,6 +273,10 @@ class BaseRetriever(RunnableSerializable[RetrieverInput, RetrieverOutput], ABC):
**kwargs: Any, **kwargs: Any,
) -> List[Document]: ) -> List[Document]:
"""Retrieve documents relevant to a query. """Retrieve documents relevant to a query.
Users should favor using `.invoke` or `.batch` rather than
`get_relevant_documents` directly.
Args: Args:
query: string to find relevant documents for query: string to find relevant documents for
callbacks: Callback manager or list of callbacks callbacks: Callback manager or list of callbacks
@ -212,6 +286,8 @@ class BaseRetriever(RunnableSerializable[RetrieverInput, RetrieverOutput], ABC):
metadata: Optional metadata associated with the retriever. Defaults to None metadata: Optional metadata associated with the retriever. Defaults to None
This metadata will be associated with each call to this retriever, This metadata will be associated with each call to this retriever,
and passed as arguments to the handlers defined in `callbacks`. and passed as arguments to the handlers defined in `callbacks`.
run_name: Optional name for the run.
Returns: Returns:
List of relevant documents List of relevant documents
""" """
@ -260,6 +336,10 @@ class BaseRetriever(RunnableSerializable[RetrieverInput, RetrieverOutput], ABC):
**kwargs: Any, **kwargs: Any,
) -> List[Document]: ) -> List[Document]:
"""Asynchronously get documents relevant to a query. """Asynchronously get documents relevant to a query.
Users should favor using `.ainvoke` or `.abatch` rather than
`aget_relevant_documents` directly.
Args: Args:
query: string to find relevant documents for query: string to find relevant documents for
callbacks: Callback manager or list of callbacks callbacks: Callback manager or list of callbacks
@ -269,6 +349,8 @@ class BaseRetriever(RunnableSerializable[RetrieverInput, RetrieverOutput], ABC):
metadata: Optional metadata associated with the retriever. Defaults to None metadata: Optional metadata associated with the retriever. Defaults to None
This metadata will be associated with each call to this retriever, This metadata will be associated with each call to this retriever,
and passed as arguments to the handlers defined in `callbacks`. and passed as arguments to the handlers defined in `callbacks`.
run_name: Optional name for the run.
Returns: Returns:
List of relevant documents List of relevant documents
""" """