Add dense proposals (#14719)

Indexing strategy based on decomposing documents into candidate propositions at indexing time.
William FH 2023-12-14 09:21:45 -08:00 committed by GitHub
parent bc3ec78a38
commit 79ae6c2a9e
14 changed files with 3371 additions and 0 deletions


@@ -0,0 +1,3 @@
docs/img_*.jpg
chroma_db_proposals
multi_vector_retriever_metadata


@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2023 LangChain, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


@@ -0,0 +1,81 @@
# rag-chroma-dense-retrieval
This template demonstrates the multi-vector indexing strategy proposed by Chen et al. in [Dense X Retrieval: What Retrieval Granularity Should We Use?](https://arxiv.org/abs/2312.06648). The prompt, which you can [try out on the hub](https://smith.langchain.com/hub/wfh/proposal-indexing), directs an LLM to generate de-contextualized "propositions" that can be vectorized to improve retrieval accuracy. You can see the full prompt and chain definition in `proposal_chain.py`.
![Retriever Diagram](./_images/retriever_diagram.png)
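The proposition generator is exported as `proposition_chain`, so you can try it on its own. A minimal sketch (the input sentence and the shown output are illustrative, and an `OPENAI_API_KEY` must be set):

```python
from rag_chroma_dense_retrieval import proposition_chain

# Decompose a passage into standalone, de-contextualized propositions
props = proposition_chain.invoke(
    {"input": "The Transformer, introduced in 2017, relies entirely on attention."}
)
# e.g. ["The Transformer was introduced in 2017.",
#       "The Transformer relies entirely on attention."]
```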
## Storage
For this demo, we index a simple academic paper (the *Attention Is All You Need* paper) using the `RecursiveUrlLoader`, and store all retriever information locally (a Chroma vector store and a byte store persisted on the local filesystem). You can modify the storage layer in `storage.py`.
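At query time, the retriever embeds the question, matches it against the proposition vectors in Chroma, and returns the parent chunks referenced by the `doc_id` metadata of the matching propositions. A minimal sketch of that lookup (assumes the index has already been built by `ingest.py`):

```python
from rag_chroma_dense_retrieval.constants import DOCSTORE_ID_KEY
from rag_chroma_dense_retrieval.storage import get_multi_vector_retriever

# Proposition vectors live in Chroma; the full parent chunks live in the local byte store
retriever = get_multi_vector_retriever(DOCSTORE_ID_KEY)
parent_docs = retriever.get_relevant_documents("What is multi-head attention?")
```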
## Environment Setup
Set the `OPENAI_API_KEY` environment variable to access the OpenAI models (`gpt-3.5-turbo-16k` for proposition generation and `gpt-4-1106-preview` for answer synthesis) and the OpenAI embeddings.
## Indexing
Create the index by running the following:
```shell
poetry install
poetry run python rag_chroma_dense_retrieval/ingest.py
```
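Ingestion downloads the *Attention Is All You Need* paper with the `RecursiveUrlLoader`, splits it into 8000-character chunks, generates propositions for each chunk, and persists everything locally. As a quick sanity check once it finishes (directory names as created by `storage.py`):

```shell
ls chroma_db_proposals multi_vector_retriever_metadata
```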
## Usage
To use this package, you should first have the LangChain CLI installed:
```shell
pip install -U langchain-cli
```
To create a new LangChain project and install this as the only package, you can do:
```shell
langchain app new my-app --package rag-chroma-dense-retrieval
```
If you want to add this to an existing project, you can just run:
```shell
langchain app add rag-chroma-dense-retrieval
```
And add the following code to your `server.py` file:
```python
from rag_chroma_dense_retrieval import chain
add_routes(app, chain, path="/rag-chroma-dense-retrieval")
```
(Optional) Let's now configure LangSmith.
LangSmith will help us trace, monitor and debug LangChain applications.
LangSmith is currently in private beta; you can sign up [here](https://smith.langchain.com/).
If you don't have access, you can skip this section.
```shell
export LANGCHAIN_TRACING_V2=true
export LANGCHAIN_API_KEY=<your-api-key>
export LANGCHAIN_PROJECT=<your-project> # if not specified, defaults to "default"
```
If you are inside this directory, then you can spin up a LangServe instance directly by:
```shell
langchain serve
```
This will start the FastAPI app with a server running locally at [http://localhost:8000](http://localhost:8000).
We can see all templates at [http://127.0.0.1:8000/docs](http://127.0.0.1:8000/docs)
We can access the playground at [http://127.0.0.1:8000/rag-chroma-dense-retrieval/playground](http://127.0.0.1:8000/rag-chroma-dense-retrieval/playground)
We can access the template from code with:
```python
from langserve.client import RemoteRunnable
runnable = RemoteRunnable("http://localhost:8000/rag-chroma-dense-retrieval")
```
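The chain accepts a plain question string (see the `Question` input type in `chain.py`), so a call might look like:

```python
runnable.invoke("What is multi-head attention?")
```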

Binary file not shown (image, 375 KiB).

File diff suppressed because it is too large.


@@ -0,0 +1,35 @@
[tool.poetry]
name = "rag-chroma-dense-retrieval"
version = "0.1.0"
description = "Dense retrieval using vectorized propositions.s"
authors = [
"William Fu-Hinthorn <will@langchain.dev>",
]
readme = "README.md"
[tool.poetry.dependencies]
python = ">=3.8.1,<4.0"
langchain = ">=0.0.350"
openai = "<2"
tiktoken = ">=0.5.1"
chromadb = ">=0.4.14"
bs4 = "^0.0.1"
[tool.poetry.group.dev.dependencies]
langchain-cli = ">=0.0.15"
[tool.langserve]
export_module = "rag_chroma_dense_retrieval"
export_attr = "chain"
[tool.templates-hub]
use-case = "rag"
author = "LangChain"
integrations = ["OpenAI", "Chroma"]
tags = ["vectordbs"]
[build-system]
requires = [
"poetry-core",
]
build-backend = "poetry.core.masonry.api"


@@ -0,0 +1,68 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"id": "681a5d1e",
"metadata": {},
"source": [
"## Run Template\n",
"\n",
"In `server.py`, set -\n",
"```\n",
"from fastapi import FastAPI\n",
"from langserve import add_routes\n",
"from rag_chroma_dense_retrieval import chain\n",
"\n",
"app = FastAPI(\n",
" title=\"LangChain Server\",\n",
" version=\"1.0\",\n",
" description=\"Retriever and Generator for RAG Chroma Dense Retrieval\",\n",
")\n",
"\n",
"add_routes(app, chain, path=\"/rag-chroma-dense-retrieval\")\n",
"\n",
"if __name__ == \"__main__\":\n",
" import uvicorn\n",
"\n",
" uvicorn.run(app, host=\"localhost\", port=8000)\n",
"\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d774be2a",
"metadata": {},
"outputs": [],
"source": [
"from langserve.client import RemoteRunnable\n",
"\n",
"rag_app = RemoteRunnable(\"http://localhost:8001/rag-chroma-dense-retrieval\")\n",
"rag_app.invoke(\"How are transformers related to convolutional neural networks?\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}


@@ -0,0 +1,4 @@
from rag_chroma_dense_retrieval.chain import chain
from rag_chroma_dense_retrieval.proposal_chain import proposition_chain
__all__ = ["chain", "proposition_chain"]


@@ -0,0 +1,67 @@
from langchain_community.chat_models import ChatOpenAI
from langchain_core.load import load
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel
from langchain_core.runnables import RunnablePassthrough
from rag_chroma_dense_retrieval.constants import DOCSTORE_ID_KEY
from rag_chroma_dense_retrieval.storage import get_multi_vector_retriever


def format_docs(docs: list) -> str:
    """Deserialize the retrieved parent documents (if needed) and format them
    as tagged blocks for the prompt context."""
loaded_docs = [load(doc) for doc in docs]
return "\n".join(
[
f"<Document id={i}>\n{doc.page_content}\n</Document>"
for i, doc in enumerate(loaded_docs)
]
)
def rag_chain(retriever):
"""
The RAG chain
:param retriever: A function that retrieves the necessary context for the model.
:return: A chain of functions representing the multi-modal RAG process.
"""
model = ChatOpenAI(temperature=0, model="gpt-4-1106-preview", max_tokens=1024)
prompt = ChatPromptTemplate.from_messages(
[
(
"system",
"You are an AI assistant. Answer based on the retrieved documents:"
"\n<Documents>\n{context}\n</Documents>",
),
("user", "{question}?"),
]
)
# Define the RAG pipeline
chain = (
{
"context": retriever | format_docs,
"question": RunnablePassthrough(),
}
| prompt
| model
| StrOutputParser()
)
return chain
# Create the multi-vector retriever
retriever = get_multi_vector_retriever(DOCSTORE_ID_KEY)
# Create RAG chain
chain = rag_chain(retriever)
# Add typing for input
class Question(BaseModel):
__root__: str
chain = chain.with_types(input_type=Question)
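# A minimal local usage sketch (assumes the index has already been built by
# running `rag_chroma_dense_retrieval/ingest.py` and that OPENAI_API_KEY is set);
# the question below is only an example:
#
#   chain.invoke("What is multi-head attention?")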


@@ -0,0 +1 @@
DOCSTORE_ID_KEY = "doc_id"


@@ -0,0 +1,87 @@
import logging
import uuid
from typing import Sequence
from langchain_core.documents import Document
from langchain_core.runnables import Runnable
from rag_chroma_dense_retrieval.constants import DOCSTORE_ID_KEY
from rag_chroma_dense_retrieval.proposal_chain import proposition_chain
from rag_chroma_dense_retrieval.storage import get_multi_vector_retriever
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def add_documents(
retriever,
propositions: Sequence[Sequence[str]],
docs: Sequence[Document],
id_key: str = DOCSTORE_ID_KEY,
):
    """Embed and index each proposition, keyed back to its parent document."""
doc_ids = [
str(uuid.uuid5(uuid.NAMESPACE_DNS, doc.metadata["source"])) for doc in docs
]
prop_docs = [
Document(page_content=prop, metadata={id_key: doc_ids[i]})
for i, props in enumerate(propositions)
for prop in props
if prop
]
retriever.vectorstore.add_documents(prop_docs)
retriever.docstore.mset(list(zip(doc_ids, docs)))
def create_index(
docs: Sequence[Document],
indexer: Runnable,
docstore_id_key: str = DOCSTORE_ID_KEY,
):
"""
Create retriever that indexes docs and their propositions
:param docs: Documents to index
:param indexer: Runnable creates additional propositions per doc
:param docstore_id_key: Key to use to store the docstore id
:return: Retriever
"""
logger.info("Creating multi-vector retriever")
retriever = get_multi_vector_retriever(docstore_id_key)
propositions = indexer.batch([{"input": doc.page_content} for doc in docs])
add_documents(
retriever,
propositions,
docs,
id_key=docstore_id_key,
)
return retriever
if __name__ == "__main__":
# For our example, we'll load docs from the web
from langchain.text_splitter import RecursiveCharacterTextSplitter # noqa
from langchain_community.document_loaders.recursive_url_loader import (
RecursiveUrlLoader,
) # noqa
# The attention is all you need paper
# Could add more parsing here, as it's very raw.
loader = RecursiveUrlLoader("https://ar5iv.labs.arxiv.org/html/1706.03762")
data = loader.load()
logger.info(f"Loaded {len(data)} documents")
# Split
text_splitter = RecursiveCharacterTextSplitter(chunk_size=8000, chunk_overlap=0)
all_splits = text_splitter.split_documents(data)
logger.info(f"Split into {len(all_splits)} documents")
# Create retriever
retriever_multi_vector_img = create_index(
all_splits,
proposition_chain,
DOCSTORE_ID_KEY,
)


@@ -0,0 +1,107 @@
import logging
from langchain.output_parsers.openai_tools import JsonOutputToolsParser
from langchain_community.chat_models import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Modified from the paper to be more robust to benign prompt injection
# https://arxiv.org/abs/2312.06648
# @misc{chen2023dense,
# title={Dense X Retrieval: What Retrieval Granularity Should We Use?},
# author={Tong Chen and Hongwei Wang and Sihao Chen and Wenhao Yu and Kaixin Ma
# and Xinran Zhao and Hongming Zhang and Dong Yu},
# year={2023},
# eprint={2312.06648},
# archivePrefix={arXiv},
# primaryClass={cs.CL}
# }
PROMPT = ChatPromptTemplate.from_messages(
[
(
"system",
"""Decompose the "Content" into clear and simple propositions, ensuring they are interpretable out of
context.
1. Split compound sentence into simple sentences. Maintain the original phrasing from the input
whenever possible.
2. For any named entity that is accompanied by additional descriptive information, separate this
information into its own distinct proposition.
3. Decontextualize the proposition by adding necessary modifier to nouns or entire sentences
and replacing pronouns (e.g., "it", "he", "she", "they", "this", "that") with the full name of the
entities they refer to.
4. Present the results as a list of strings, formatted in JSON.
Example:
Input: Title: Ēostre. Section: Theories and interpretations, Connection to Easter Hares. Content:
The earliest evidence for the Easter Hare (Osterhase) was recorded in south-west Germany in
1678 by the professor of medicine Georg Franck von Franckenau, but it remained unknown in
other parts of Germany until the 18th century. Scholar Richard Sermon writes that "hares were
frequently seen in gardens in spring, and thus may have served as a convenient explanation for the
origin of the colored eggs hidden there for children. Alternatively, there is a European tradition
that hares laid eggs, since a hares scratch or form and a lapwings nest look very similar, and
both occur on grassland and are first seen in the spring. In the nineteenth century the influence
of Easter cards, toys, and books was to make the Easter Hare/Rabbit popular throughout Europe.
German immigrants then exported the custom to Britain and America where it evolved into the
Easter Bunny."
Output: [ "The earliest evidence for the Easter Hare was recorded in south-west Germany in
1678 by Georg Franck von Franckenau.", "Georg Franck von Franckenau was a professor of
medicine.", "The evidence for the Easter Hare remained unknown in other parts of Germany until
the 18th century.", "Richard Sermon was a scholar.", "Richard Sermon writes a hypothesis about
the possible explanation for the connection between hares and the tradition during Easter", "Hares
were frequently seen in gardens in spring.", "Hares may have served as a convenient explanation
for the origin of the colored eggs hidden in gardens for children.", "There is a European tradition
that hares laid eggs.", "A hares scratch or form and a lapwings nest look very similar.", "Both
hares and lapwings nests occur on grassland and are first seen in the spring.", "In the nineteenth
century the influence of Easter cards, toys, and books was to make the Easter Hare/Rabbit popular
throughout Europe.", "German immigrants exported the custom of the Easter Hare/Rabbit to
Britain and America.", "The custom of the Easter Hare/Rabbit evolved into the Easter Bunny in
Britain and America."]""", # noqa
),
("user", "Decompose the following:\n{input}"),
]
)


def get_propositions(tool_calls: list) -> list:
    """Extract the list of propositions from the model's tool call."""
if not tool_calls:
raise ValueError("No tool calls found")
return tool_calls[0]["args"]["propositions"]
def empty_proposals(x):
# Model couldn't generate proposals
return []
proposition_chain = (
PROMPT
| ChatOpenAI(model="gpt-3.5-turbo-16k").bind(
tools=[
{
"type": "function",
"function": {
"name": "decompose_content",
"description": "Return the decomposed propositions",
"parameters": {
"type": "object",
"properties": {
"propositions": {
"type": "array",
"items": {"type": "string"},
}
},
"required": ["propositions"],
},
},
}
],
tool_choice={"type": "function", "function": {"name": "decompose_content"}},
)
| JsonOutputToolsParser()
| get_propositions
).with_fallbacks([RunnableLambda(empty_proposals)])
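# Illustrative behaviour (the input text and output are examples, not guaranteed):
#
#   proposition_chain.invoke({"input": "The Transformer was introduced in 2017."})
#   # -> ["The Transformer was introduced in 2017."]
#
# If the model does not return a valid `decompose_content` tool call, the
# fallback (`empty_proposals`) yields an empty list for that chunk.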


@@ -0,0 +1,38 @@
import logging
from pathlib import Path
from langchain.embeddings import OpenAIEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import LocalFileStore
from langchain_community.vectorstores import Chroma
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def get_multi_vector_retriever(docstore_id_key: str):
"""Create the composed retriever object."""
vectorstore = get_vectorstore()
store = get_docstore()
return MultiVectorRetriever(
vectorstore=vectorstore,
byte_store=store,
id_key=docstore_id_key,
)
def get_vectorstore(collection_name: str = "proposals"):
"""Get the vectorstore used for this example."""
return Chroma(
collection_name=collection_name,
persist_directory=str(Path(__file__).parent.parent / "chroma_db_proposals"),
embedding_function=OpenAIEmbeddings(),
)
def get_docstore():
"""Get the metadata store used for this example."""
return LocalFileStore(
str(Path(__file__).parent.parent / "multi_vector_retriever_metadata")
)