From 9963b32e5965e908d2deef798ba4ab4e336474a5 Mon Sep 17 00:00:00 2001
From: Harrison Chase
Date: Thu, 24 Aug 2023 06:42:42 -0700
Subject: [PATCH] Harrison/multi vector (#9700)

---
 .../retrievers/multi_vector.ipynb           | 366 ++++++++++++++++++
 .../parent_document_retriever.ipynb         |   3 +-
 .../langchain/retrievers/__init__.py        |   2 +
 .../langchain/retrievers/multi_vector.py    |  39 ++
 .../retrievers/parent_document_retriever.py |  36 +-
 5 files changed, 411 insertions(+), 35 deletions(-)
 create mode 100644 docs/extras/modules/data_connection/retrievers/multi_vector.ipynb
 create mode 100644 libs/langchain/langchain/retrievers/multi_vector.py

diff --git a/docs/extras/modules/data_connection/retrievers/multi_vector.ipynb b/docs/extras/modules/data_connection/retrievers/multi_vector.ipynb
new file mode 100644
index 00000000000..4a758756222
--- /dev/null
+++ b/docs/extras/modules/data_connection/retrievers/multi_vector.ipynb
@@ -0,0 +1,366 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "d9172545",
+   "metadata": {},
+   "source": [
+    "# MultiVector Retriever\n",
+    "\n",
+    "It can often be beneficial to store multiple vectors per document. LangChain has a base `MultiVectorRetriever` which makes querying this type of setup easy. Much of the complexity lies in how to create the multiple vectors per document. This notebook covers some of the common ways to create those vectors and how to use the `MultiVectorRetriever`.\n",
+    "\n",
+    "The methods to create multiple vectors per document include:\n",
+    "\n",
+    "- Smaller chunks: split a document into smaller chunks, and embed those (this is what the `ParentDocumentRetriever` does).\n",
+    "- Summary: create a summary for each document, and embed that along with (or instead of) the document.\n",
+    "- Hypothetical questions: create hypothetical questions that each document would be appropriate to answer, and embed those along with (or instead of) the document."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "eed469be",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.retrievers.multi_vector import MultiVectorRetriever"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "18c1421a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.vectorstores import Chroma\n",
+    "from langchain.embeddings import OpenAIEmbeddings\n",
+    "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
+    "from langchain.storage import InMemoryStore\n",
+    "from langchain.document_loaders import TextLoader"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "6d869496",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loaders = [\n",
+    "    TextLoader('../../paul_graham_essay.txt'),\n",
+    "    TextLoader('../../state_of_the_union.txt'),\n",
+    "]\n",
+    "docs = []\n",
+    "for loader in loaders:\n",
+    "    docs.extend(loader.load())\n",
+    "text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000)\n",
+    "docs = text_splitter.split_documents(docs)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "fa17beda",
+   "metadata": {},
+   "source": [
+    "## Smaller chunks\n",
+    "\n",
+    "Oftentimes it can be useful to retrieve larger chunks of information, but embed smaller chunks. This lets the embeddings capture the semantic meaning as closely as possible, while still passing as much context as possible downstream. NOTE: this is what the `ParentDocumentRetriever` does. Here we show what is going on under the hood."
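+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f3b1c2d4",
+   "metadata": {},
+   "source": [
+    "At query time, the retriever searches the vectorstore over the small chunks only, then uses the id stored in each chunk's metadata to look up the full parent document in the docstore. A minimal sketch of that lookup, mirroring `MultiVectorRetriever._get_relevant_documents` (here `vectorstore`, `store`, `id_key`, and `query` stand in for the objects configured in the cells below):\n",
+    "\n",
+    "```python\n",
+    "sub_docs = vectorstore.similarity_search(query)  # search over the small chunks\n",
+    "ids = []\n",
+    "for d in sub_docs:  # preserve result order, dedupe parent ids\n",
+    "    if d.metadata[id_key] not in ids:\n",
+    "        ids.append(d.metadata[id_key])\n",
+    "parents = store.mget(ids)  # fetch the full parent documents\n",
+    "relevant = [d for d in parents if d is not None]\n",
+    "```"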
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "0e7b6b45", + "metadata": {}, + "outputs": [], + "source": [ + "# The vectorstore to use to index the child chunks\n", + "vectorstore = Chroma(\n", + " collection_name=\"full_documents\",\n", + " embedding_function=OpenAIEmbeddings()\n", + ")\n", + "# The storage layer for the parent documents\n", + "store = InMemoryStore()\n", + "id_key = \"doc_id\"\n", + "# The retriever (empty to start)\n", + "retriever = MultiVectorRetriever(\n", + " vectorstore=vectorstore, \n", + " docstore=store, \n", + " id_key=id_key,\n", + ")\n", + "import uuid\n", + "doc_ids = [str(uuid.uuid4()) for _ in docs]" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "72a36491", + "metadata": {}, + "outputs": [], + "source": [ + "# The splitter to use to create smaller chunks\n", + "child_text_splitter = RecursiveCharacterTextSplitter(chunk_size=400)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d23247d", + "metadata": {}, + "outputs": [], + "source": [ + "sub_docs = []\n", + "for i, doc in enumerate(docs):\n", + " _id = doc_ids[i]\n", + " _sub_docs = child_text_splitter.split_documents([doc])\n", + " for _doc in _sub_docs:\n", + " _doc.metadata[id_key] = _id\n", + " sub_docs.extend(_sub_docs)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "92ed5861", + "metadata": {}, + "outputs": [], + "source": [ + "retriever.vectorstore.add_documents(sub_docs)\n", + "retriever.docstore.mset(list(zip(doc_ids, docs)))" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "8afed60c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Document(page_content='Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \\n\\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court.', metadata={'doc_id': 'b4ca7817-e3fe-4103-ac81-574fb41439ef', 'source': '../../state_of_the_union.txt'})" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Vectorstore alone retrieves the small chunks\n", + "retriever.vectorstore.similarity_search(\"justice breyer\")[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "3c9017f1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "9874" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Retriever returns larger chunks\n", + "len(retriever.get_relevant_documents(\"justice breyer\")[0].page_content)" + ] + }, + { + "cell_type": "markdown", + "id": "d6a7ae0d", + "metadata": {}, + "source": [ + "## Summary\n", + "\n", + "Oftentimes a summary may be able to distill more accurately what a chunk is about, leading to better retrieval. Here we show how to create summaries, and then embed those." 
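+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b5e6f7a8",
+   "metadata": {},
+   "source": [
+    "The cells below build a small chain that maps each document to a summary, then invoke it once per document. Since each summary is independent, the same chain could also be run concurrently; a sketch, assuming the `chain` defined below:\n",
+    "\n",
+    "```python\n",
+    "# Summarize up to 5 documents concurrently\n",
+    "summaries = chain.batch(docs, {\"max_concurrency\": 5})\n",
+    "```"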
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "1433dff4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.chat_models import ChatOpenAI\n",
+    "from langchain.prompts import ChatPromptTemplate\n",
+    "from langchain.schema.output_parser import StrOutputParser\n",
+    "import uuid\n",
+    "from langchain.schema.document import Document"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "35b30390",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "chain = (\n",
+    "    {\"doc\": lambda x: x.page_content}\n",
+    "    | ChatPromptTemplate.from_template(\"Summarize the following document:\\n\\n{doc}\")\n",
+    "    | ChatOpenAI(max_retries=0)\n",
+    "    | StrOutputParser()\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "41a2a738",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "summaries = [chain.invoke(d) for d in docs]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "7ac5e4b1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# The vectorstore to use to index the summaries\n",
+    "vectorstore = Chroma(\n",
+    "    collection_name=\"summaries\",\n",
+    "    embedding_function=OpenAIEmbeddings()\n",
+    ")\n",
+    "# The storage layer for the parent documents\n",
+    "store = InMemoryStore()\n",
+    "id_key = \"doc_id\"\n",
+    "# The retriever (empty to start)\n",
+    "retriever = MultiVectorRetriever(\n",
+    "    vectorstore=vectorstore,\n",
+    "    docstore=store,\n",
+    "    id_key=id_key,\n",
+    ")\n",
+    "doc_ids = [str(uuid.uuid4()) for _ in docs]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "0d93309f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "summary_docs = [\n",
+    "    Document(page_content=s, metadata={id_key: doc_ids[i]})\n",
+    "    for i, s in enumerate(summaries)\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "6d5edf0d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "retriever.vectorstore.add_documents(summary_docs)\n",
+    "retriever.docstore.mset(list(zip(doc_ids, docs)))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "299232d6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sub_docs = vectorstore.similarity_search(\"justice breyer\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "10e404c0",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Document(page_content='The document discusses various topics and proposals put forth by the President in a State of the Union address. These include the nomination of a judge for the Supreme Court, securing the border and fixing the immigration system, advancing liberty and justice for women and LGBTQ+ individuals, passing bipartisan legislation, addressing the opioid epidemic and mental health issues, supporting veterans, and ending cancer. The President expresses optimism about the future of the country and emphasizes the strength of the American people.', metadata={'doc_id': '8c7a707d-615d-42d5-919d-bc5178dd1ae4'})"
+      ]
+     },
+     "execution_count": 20,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "sub_docs[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "e4cce5c2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "retrieved_docs = retriever.get_relevant_documents(\"justice breyer\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "c8570dbb",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "9194"
+      ]
+     },
+     "execution_count": 24,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(retrieved_docs[0].page_content)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "097a5396",
+   "metadata": {},
+   "source": [
+    "## Hypothetical Queries\n",
+    "\n",
+    "An LLM can also be used to generate a list of hypothetical questions that could be asked of a particular document. These questions can then be embedded along with (or instead of) the document, in the same way as the summaries above."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/docs/extras/modules/data_connection/retrievers/parent_document_retriever.ipynb b/docs/extras/modules/data_connection/retrievers/parent_document_retriever.ipynb
index 4b166bc1473..206ecbfbad8 100644
--- a/docs/extras/modules/data_connection/retrievers/parent_document_retriever.ipynb
+++ b/docs/extras/modules/data_connection/retrievers/parent_document_retriever.ipynb
@@ -83,7 +83,6 @@
    "outputs": [],
    "source": [
     "# This text splitter is used to create the child documents\n",
-    "\n",
     "child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)\n",
     "# The vectorstore to use to index the child chunks\n",
     "vectorstore = Chroma(\n",
@@ -432,7 +431,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.5"
+   "version": "3.10.1"
   }
  },
  "nbformat": 4,
diff --git a/libs/langchain/langchain/retrievers/__init__.py b/libs/langchain/langchain/retrievers/__init__.py
index 1e5497c93ce..c666d9103f4 100644
--- a/libs/langchain/langchain/retrievers/__init__.py
+++ b/libs/langchain/langchain/retrievers/__init__.py
@@ -40,6 +40,7 @@ from langchain.retrievers.merger_retriever import MergerRetriever
 from langchain.retrievers.metal import MetalRetriever
 from langchain.retrievers.milvus import MilvusRetriever
 from langchain.retrievers.multi_query import MultiQueryRetriever
+from langchain.retrievers.multi_vector import MultiVectorRetriever
 from langchain.retrievers.parent_document_retriever import ParentDocumentRetriever
 from langchain.retrievers.pinecone_hybrid_search import PineconeHybridSearchRetriever
 from langchain.retrievers.pubmed import PubMedRetriever
@@ -92,4 +93,5 @@ __all__ = [
     "WebResearchRetriever",
     "EnsembleRetriever",
     "ParentDocumentRetriever",
+    "MultiVectorRetriever",
 ]
diff --git a/libs/langchain/langchain/retrievers/multi_vector.py b/libs/langchain/langchain/retrievers/multi_vector.py
new file mode 100644
index 00000000000..92d537189b9
--- /dev/null
+++ b/libs/langchain/langchain/retrievers/multi_vector.py
@@ -0,0 +1,39 @@
+from typing import List
+
+from langchain.pydantic_v1 import Field
+
+from langchain.callbacks.manager import CallbackManagerForRetrieverRun
+from langchain.schema import BaseRetriever, BaseStore, Document
+from langchain.vectorstores import VectorStore
+
+
+class MultiVectorRetriever(BaseRetriever):
+    """Retrieve from a set of multiple embeddings for the same document."""
+
+    vectorstore: VectorStore
+    """The underlying vectorstore to use to store small chunks
+    and their embedding vectors"""
+    docstore: BaseStore[str, Document]
+    """The storage layer for the parent documents"""
+    id_key: str = "doc_id"
+    search_kwargs: dict = Field(default_factory=dict)
+    """Keyword arguments to pass to the search function."""
+
+    def _get_relevant_documents(
+        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
+    ) -> List[Document]:
+        """Get documents relevant to a query.
+        Args:
+            query: String to find relevant documents for
+            run_manager: The callbacks handler to use
+        Returns:
+            List of relevant documents
+        """
+        sub_docs = self.vectorstore.similarity_search(query, **self.search_kwargs)
+        # We do this to maintain the order of the ids that are returned
+        ids = []
+        for d in sub_docs:
+            if d.metadata[self.id_key] not in ids:
+                ids.append(d.metadata[self.id_key])
+        docs = self.docstore.mget(ids)
+        return [d for d in docs if d is not None]
diff --git a/libs/langchain/langchain/retrievers/parent_document_retriever.py b/libs/langchain/langchain/retrievers/parent_document_retriever.py
index 6757ba3ddb3..dd5aa720675 100644
--- a/libs/langchain/langchain/retrievers/parent_document_retriever.py
+++ b/libs/langchain/langchain/retrievers/parent_document_retriever.py
@@ -1,16 +1,12 @@
 import uuid
 from typing import List, Optional
 
-from langchain.callbacks.manager import CallbackManagerForRetrieverRun
-from langchain.pydantic_v1 import Field
+from langchain.retrievers import MultiVectorRetriever
 from langchain.schema.document import Document
-from langchain.schema.retriever import BaseRetriever
-from langchain.schema.storage import BaseStore
 from langchain.text_splitter import TextSplitter
-from langchain.vectorstores.base import VectorStore
 
 
-class ParentDocumentRetriever(BaseRetriever):
+class ParentDocumentRetriever(MultiVectorRetriever):
     """Retrieve small chunks then retrieve their parent documents.
 
     When splitting documents for retrieval, there are often conflicting desires:
@@ -59,40 +55,14 @@ class ParentDocumentRetriever(BaseRetriever):
     )
     """
 
-    vectorstore: VectorStore
-    """The underlying vectorstore to use to store small chunks
-    and their embedding vectors"""
-    docstore: BaseStore[str, Document]
-    """The storage layer for the parent documents"""
     child_splitter: TextSplitter
     """The text splitter to use to create child documents."""
-    id_key: str = "doc_id"
-    """The key to use to track the parent id. This will be stored in the metadata of child documents."""
     parent_splitter: Optional[TextSplitter] = None
     """The text splitter to use to create parent documents.
     If none, then the parent documents will be the raw documents passed in."""
-    search_kwargs: dict = Field(default_factory=dict)
-    """Keyword arguments to pass to the search function."""
-
-    def _get_relevant_documents(
-        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
-    ) -> List[Document]:
-        """Get documents relevant to a query.
- Args: - query: String to find relevant documents for - run_manager: The callbacks handler to use - Returns: - List of relevant documents - """ - sub_docs = self.vectorstore.similarity_search(query, **self.search_kwargs) - # We do this to maintain the order of the ids that are returned - ids = [] - for d in sub_docs: - if d.metadata[self.id_key] not in ids: - ids.append(d.metadata[self.id_key]) - docs = self.docstore.mget(ids) - return [d for d in docs if d is not None] def add_documents( self,