Allow base_store to be used directly with MultiVectorRetriever (#14202)

Allow users to pass a generic `BaseStore[str, bytes]` to
MultiVectorRetriever via the new `base_store` argument, removing the need
to call `create_kv_docstore` themselves. The retriever now wraps the byte
store into a document store internally.

@rlancemartin @eyurtsev

---------

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
This commit is contained in:
Jacob Lee 2023-12-04 14:43:32 -08:00 committed by GitHub
parent 67662564f3
commit a26c4a0930
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 63 additions and 48 deletions

View File

@ -88,7 +88,7 @@
"# The retriever (empty to start)\n", "# The retriever (empty to start)\n",
"retriever = MultiVectorRetriever(\n", "retriever = MultiVectorRetriever(\n",
" vectorstore=vectorstore,\n", " vectorstore=vectorstore,\n",
" docstore=store,\n", " base_store=store,\n",
" id_key=id_key,\n", " id_key=id_key,\n",
")\n", ")\n",
"import uuid\n", "import uuid\n",
@ -143,7 +143,7 @@
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"Document(page_content='Tonight, Id like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \\n\\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court.', metadata={'doc_id': '455205f7-bb7d-4c36-b442-d1d6f9f701ed', 'source': '../../state_of_the_union.txt'})" "Document(page_content='Tonight, Id like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \\n\\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court.', metadata={'doc_id': '59899493-92a0-41cb-b6ba-a854730ad74a', 'source': '../../state_of_the_union.txt'})"
] ]
}, },
"execution_count": 8, "execution_count": 8,
@ -188,7 +188,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 15, "execution_count": 10,
"id": "36739460-a737-4a8e-b70f-50bf8c8eaae7", "id": "36739460-a737-4a8e-b70f-50bf8c8eaae7",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -198,7 +198,7 @@
"9875" "9875"
] ]
}, },
"execution_count": 15, "execution_count": 10,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -223,7 +223,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 10, "execution_count": 11,
"id": "1433dff4", "id": "1433dff4",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -238,7 +238,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 11, "execution_count": 12,
"id": "35b30390", "id": "35b30390",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -253,7 +253,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 12, "execution_count": 13,
"id": "41a2a738", "id": "41a2a738",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -263,7 +263,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 13, "execution_count": 14,
"id": "7ac5e4b1", "id": "7ac5e4b1",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -276,7 +276,7 @@
"# The retriever (empty to start)\n", "# The retriever (empty to start)\n",
"retriever = MultiVectorRetriever(\n", "retriever = MultiVectorRetriever(\n",
" vectorstore=vectorstore,\n", " vectorstore=vectorstore,\n",
" docstore=store,\n", " base_store=store,\n",
" id_key=id_key,\n", " id_key=id_key,\n",
")\n", ")\n",
"doc_ids = [str(uuid.uuid4()) for _ in docs]" "doc_ids = [str(uuid.uuid4()) for _ in docs]"
@ -338,7 +338,7 @@
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"Document(page_content=\"The document is a transcript of a speech given by the President of the United States. The President discusses several important issues and initiatives, including the nomination of a Supreme Court Justice, border security and immigration reform, protecting women's rights, advancing LGBTQ+ equality, bipartisan legislation, addressing the opioid epidemic and mental health, supporting veterans, investigating the health effects of burn pits on military personnel, ending cancer, and the strength and resilience of the American people.\", metadata={'doc_id': '79fa2e9f-28d9-4372-8af3-2caf4f1de312'})" "Document(page_content=\"The document is a speech given by the President of the United States. The President discusses various important issues and goals for the country, including nominating a Supreme Court Justice, securing the border and fixing the immigration system, protecting women's rights, supporting veterans, addressing the opioid epidemic, improving mental health care, and ending cancer. The President emphasizes the unity and strength of the American people and expresses optimism for the future of the nation.\", metadata={'doc_id': '8fdf4009-628c-400d-949c-1d3f4daf1e66'})"
] ]
}, },
"execution_count": 19, "execution_count": 19,
@ -393,7 +393,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 26, "execution_count": 22,
"id": "5219b085", "id": "5219b085",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -418,7 +418,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 32, "execution_count": 23,
"id": "523deb92", "id": "523deb92",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -429,7 +429,7 @@
" {\"doc\": lambda x: x.page_content}\n", " {\"doc\": lambda x: x.page_content}\n",
" # Only asking for 3 hypothetical questions, but this could be adjusted\n", " # Only asking for 3 hypothetical questions, but this could be adjusted\n",
" | ChatPromptTemplate.from_template(\n", " | ChatPromptTemplate.from_template(\n",
" \"Generate a list of 3 hypothetical questions that the below document could be used to answer:\\n\\n{doc}\"\n", " \"Generate a list of exactly 3 hypothetical questions that the below document could be used to answer:\\n\\n{doc}\"\n",
" )\n", " )\n",
" | ChatOpenAI(max_retries=0, model=\"gpt-4\").bind(\n", " | ChatOpenAI(max_retries=0, model=\"gpt-4\").bind(\n",
" functions=functions, function_call={\"name\": \"hypothetical_questions\"}\n", " functions=functions, function_call={\"name\": \"hypothetical_questions\"}\n",
@ -440,19 +440,19 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 33, "execution_count": 24,
"id": "11d30554", "id": "11d30554",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"[\"What was the author's initial impression of philosophy as a field of study, and how did it change when they got to college?\",\n", "[\"What were the author's initial areas of interest before college?\",\n",
" 'Why did the author decide to switch their focus to Artificial Intelligence (AI)?',\n", " \"What was the author's experience with programming in his early years?\",\n",
" \"What led to the author's disillusionment with the field of AI as it was practiced at the time?\"]" " 'Why did the author switch his focus from AI to Lisp?']"
] ]
}, },
"execution_count": 33, "execution_count": 24,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -463,7 +463,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 34, "execution_count": 25,
"id": "3eb2e48c", "id": "3eb2e48c",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -473,7 +473,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 67, "execution_count": 26,
"id": "b2cd6e75", "id": "b2cd6e75",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -488,7 +488,7 @@
"# The retriever (empty to start)\n", "# The retriever (empty to start)\n",
"retriever = MultiVectorRetriever(\n", "retriever = MultiVectorRetriever(\n",
" vectorstore=vectorstore,\n", " vectorstore=vectorstore,\n",
" docstore=store,\n", " base_store=store,\n",
" id_key=id_key,\n", " id_key=id_key,\n",
")\n", ")\n",
"doc_ids = [str(uuid.uuid4()) for _ in docs]" "doc_ids = [str(uuid.uuid4()) for _ in docs]"
@ -496,7 +496,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 68, "execution_count": 27,
"id": "18831b3b", "id": "18831b3b",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -510,7 +510,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 69, "execution_count": 28,
"id": "224b24c5", "id": "224b24c5",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -521,7 +521,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 70, "execution_count": 29,
"id": "7b442b90", "id": "7b442b90",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -531,20 +531,20 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 71, "execution_count": 30,
"id": "089b5ad0", "id": "089b5ad0",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"[Document(page_content=\"What is the President's stance on immigration reform?\", metadata={'doc_id': '505d73e3-8350-46ec-a58e-3af032f04ab3'}),\n", "[Document(page_content='What made Robert Morris advise the author to leave Y Combinator?', metadata={'doc_id': '740e484e-d67c-45f7-989d-9928aaf51c28'}),\n",
" Document(page_content=\"What is the President's stance on immigration reform?\", metadata={'doc_id': '1c9618f0-7660-4b4f-a37c-509cbbbf6dba'}),\n", " Document(page_content=\"How did the author's mother's illness affect his decision to leave Y Combinator?\", metadata={'doc_id': '740e484e-d67c-45f7-989d-9928aaf51c28'}),\n",
" Document(page_content=\"What is the President's stance on immigration reform?\", metadata={'doc_id': '82c08209-b904-46a8-9532-edd2380950b7'}),\n", " Document(page_content='What led the author to start publishing essays online?', metadata={'doc_id': '675ccee3-ce0b-4d5d-892c-b8942370babd'}),\n",
" Document(page_content='What measures is the President proposing to protect the rights of LGBTQ+ Americans?', metadata={'doc_id': '82c08209-b904-46a8-9532-edd2380950b7'})]" " Document(page_content='What measures are being taken to secure the border and fix the immigration system?', metadata={'doc_id': '2d51f010-969e-48a9-9e82-6b12bc7ab3d4'})]"
] ]
}, },
"execution_count": 71, "execution_count": 30,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -555,7 +555,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 72, "execution_count": 31,
"id": "7594b24e", "id": "7594b24e",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -565,17 +565,17 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 73, "execution_count": 32,
"id": "4c120c65", "id": "4c120c65",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"9194" "9844"
] ]
}, },
"execution_count": 73, "execution_count": 32,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -583,14 +583,6 @@
"source": [ "source": [
"len(retrieved_docs[0].page_content)" "len(retrieved_docs[0].page_content)"
] ]
},
{
"cell_type": "code",
"execution_count": null,
"id": "616cfeeb",
"metadata": {},
"outputs": [],
"source": []
} }
], ],
"metadata": { "metadata": {
@ -609,7 +601,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.9.16" "version": "3.10.5"
} }
}, },
"nbformat": 4, "nbformat": 4,

View File

@ -1,13 +1,13 @@
from enum import Enum from enum import Enum
from typing import List from typing import List, Optional
from langchain_core.documents import Document from langchain_core.documents import Document
from langchain_core.pydantic_v1 import Field
from langchain_core.retrievers import BaseRetriever from langchain_core.retrievers import BaseRetriever
from langchain_core.stores import BaseStore from langchain_core.stores import BaseStore
from langchain_core.vectorstores import VectorStore from langchain_core.vectorstores import VectorStore
from langchain.callbacks.manager import CallbackManagerForRetrieverRun from langchain.callbacks.manager import CallbackManagerForRetrieverRun
from langchain.storage._lc_store import create_kv_docstore
class SearchType(str, Enum): class SearchType(str, Enum):
@ -27,12 +27,35 @@ class MultiVectorRetriever(BaseRetriever):
and their embedding vectors""" and their embedding vectors"""
docstore: BaseStore[str, Document] docstore: BaseStore[str, Document]
"""The storage layer for the parent documents""" """The storage layer for the parent documents"""
id_key: str = "doc_id" id_key: str
search_kwargs: dict = Field(default_factory=dict) search_kwargs: dict
"""Keyword arguments to pass to the search function.""" """Keyword arguments to pass to the search function."""
search_type: SearchType = SearchType.similarity search_type: SearchType
"""Type of search to perform (similarity / mmr)""" """Type of search to perform (similarity / mmr)"""
def __init__(
    self,
    *,
    vectorstore: VectorStore,
    docstore: Optional[BaseStore[str, Document]] = None,
    base_store: Optional[BaseStore[str, bytes]] = None,
    id_key: str = "doc_id",
    search_kwargs: Optional[dict] = None,
    search_type: SearchType = SearchType.similarity,
) -> None:
    """Create the retriever from a vector store and a parent-document store.

    Exactly one of ``base_store`` / ``docstore`` is needed. If both are
    supplied, ``base_store`` wins and the ``docstore`` argument is ignored.

    Args:
        vectorstore: Forwarded unchanged to the base-class constructor.
        docstore: Store mapping string keys to ``Document`` objects, used
            as-is when ``base_store`` is not given.
        base_store: Store mapping string keys to bytes. When given, it is
            wrapped with ``create_kv_docstore`` and the result is used as
            the ``docstore`` field.
        id_key: Forwarded to the base-class constructor; defaults to
            ``"doc_id"``.
        search_kwargs: Keyword arguments to pass to the search function.
            ``None`` is replaced by a new empty dict for each instance.
        search_type: Type of search to perform (similarity / mmr).

    Raises:
        ValueError: If neither ``base_store`` nor ``docstore`` is provided.
    """
    if base_store is not None:
        # A byte store takes precedence: wrap it so the rest of the class
        # can keep working against the ``docstore`` field.
        docstore = create_kv_docstore(base_store)
    elif docstore is None:
        # ValueError is still caught by callers using ``except Exception``.
        raise ValueError(
            "You must pass either a `base_store` or a `docstore` parameter."
        )

    super().__init__(
        vectorstore=vectorstore,
        docstore=docstore,
        id_key=id_key,
        # Build the dict here rather than in the signature so instances
        # never share one mutable default.
        search_kwargs=search_kwargs if search_kwargs is not None else {},
        search_type=search_type,
    )
def _get_relevant_documents( def _get_relevant_documents(
self, query: str, *, run_manager: CallbackManagerForRetrieverRun self, query: str, *, run_manager: CallbackManagerForRetrieverRun
) -> List[Document]: ) -> List[Document]: