diff --git a/libs/core/langchain_core/retrievers.py b/libs/core/langchain_core/retrievers.py index b7c847150b7..cf158da4ced 100644 --- a/libs/core/langchain_core/retrievers.py +++ b/libs/core/langchain_core/retrievers.py @@ -51,12 +51,48 @@ RetrieverOutputLike = Runnable[Any, RetrieverOutput] class BaseRetriever(RunnableSerializable[RetrieverInput, RetrieverOutput], ABC): """Abstract base class for a Document retrieval system. - A retrieval system is defined as something that can take string queries and return - the most 'relevant' Documents from some source. - Example: + A retrieval system is defined as something that can take string queries and return + the most 'relevant' Documents from some source. + + Usage: + + A retriever follows the standard Runnable interface, and should be used + via the standard runnable methods of `invoke`, `ainvoke`, `batch`, `abatch`. + + Implementation: + + When implementing a custom retriever, the class should implement + the `_get_relevant_documents` method to define the logic for retrieving documents. + + Optionally, an async native implementations can be provided by overriding the + `_aget_relevant_documents` method. + + Example: A retriever that returns the first 5 documents from a list of documents + .. code-block:: python + from langchain_core import Document, BaseRetriever + from typing import List + + class SimpleRetriever(BaseRetriever): + docs: List[Document] + k: int = 5 + + def _get_relevant_documents(self, query: str) -> List[Document]: + \"\"\"Return the first k documents from the list of documents\"\"\" + return self.docs[:self.k] + + async def _aget_relevant_documents(self, query: str) -> List[Document]: + \"\"\"(Optional) async native implementation.\"\"\" + return self.docs[:self.k] + + Example: A simple retriever based on a scitkit learn vectorizer + + .. code-block:: python + + from sklearn.metrics.pairwise import cosine_similarity + class TFIDFRetriever(BaseRetriever, BaseModel): vectorizer: Any docs: List[Document] @@ -66,9 +102,7 @@ class BaseRetriever(RunnableSerializable[RetrieverInput, RetrieverOutput], ABC): class Config: arbitrary_types_allowed = True - def get_relevant_documents(self, query: str) -> List[Document]: - from sklearn.metrics.pairwise import cosine_similarity - + def _get_relevant_documents(self, query: str) -> List[Document]: # Ip -- (n_docs,x), Op -- (n_docs,n_Feats) query_vec = self.vectorizer.transform([query]) # Op -- (n_docs,1) -- Cosine Sim with each doc @@ -137,6 +171,24 @@ class BaseRetriever(RunnableSerializable[RetrieverInput, RetrieverOutput], ABC): def invoke( self, input: str, config: Optional[RunnableConfig] = None, **kwargs: Any ) -> List[Document]: + """Invoke the retriever to get relevant documents. + + Main entry point for synchronous retriever invocations. + + Args: + input: The query string + config: Configuration for the retriever + **kwargs: Additional arguments to pass to the retriever + + Returns: + List of relevant documents + + Examples: + + .. code-block:: python + + retriever.invoke("query") + """ config = ensure_config(config) return self.get_relevant_documents( input, @@ -153,6 +205,24 @@ class BaseRetriever(RunnableSerializable[RetrieverInput, RetrieverOutput], ABC): config: Optional[RunnableConfig] = None, **kwargs: Any, ) -> List[Document]: + """Asynchronously invoke the retriever to get relevant documents. + + Main entry point for asynchronous retriever invocations. + + Args: + input: The query string + config: Configuration for the retriever + **kwargs: Additional arguments to pass to the retriever + + Returns: + List of relevant documents + + Examples: + + .. code-block:: python + + await retriever.ainvoke("query") + """ config = ensure_config(config) return await self.aget_relevant_documents( input, @@ -203,6 +273,10 @@ class BaseRetriever(RunnableSerializable[RetrieverInput, RetrieverOutput], ABC): **kwargs: Any, ) -> List[Document]: """Retrieve documents relevant to a query. + + Users should favor using `.invoke` or `.batch` rather than + `get_relevant_documents directly`. + Args: query: string to find relevant documents for callbacks: Callback manager or list of callbacks @@ -212,6 +286,8 @@ class BaseRetriever(RunnableSerializable[RetrieverInput, RetrieverOutput], ABC): metadata: Optional metadata associated with the retriever. Defaults to None This metadata will be associated with each call to this retriever, and passed as arguments to the handlers defined in `callbacks`. + run_name: Optional name for the run. + Returns: List of relevant documents """ @@ -260,6 +336,10 @@ class BaseRetriever(RunnableSerializable[RetrieverInput, RetrieverOutput], ABC): **kwargs: Any, ) -> List[Document]: """Asynchronously get documents relevant to a query. + + Users should favor using `.ainvoke` or `.abatch` rather than + `aget_relevant_documents directly`. + Args: query: string to find relevant documents for callbacks: Callback manager or list of callbacks @@ -269,6 +349,8 @@ class BaseRetriever(RunnableSerializable[RetrieverInput, RetrieverOutput], ABC): metadata: Optional metadata associated with the retriever. Defaults to None This metadata will be associated with each call to this retriever, and passed as arguments to the handlers defined in `callbacks`. + run_name: Optional name for the run. + Returns: List of relevant documents """