Mirror of https://github.com/hwchase17/langchain.git (synced 2025-05-22 07:27:45 +00:00)
Add dense proposals (#14719)

An indexing strategy that decomposes documents into candidate propositions at indexing time.

Parent: bc3ec78a38
Commit: 79ae6c2a9e
3
templates/rag-chroma-dense-retrieval/.gitignore
vendored
Normal file
@@ -0,0 +1,3 @@
docs/img_*.jpg
chroma_db_proposals
multi_vector_retriever_metadata
21
templates/rag-chroma-dense-retrieval/LICENSE
Normal file
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2023 LangChain, Inc.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
81
templates/rag-chroma-dense-retrieval/README.md
Normal file
@@ -0,0 +1,81 @@
# rag-chroma-dense-retrieval

This template demonstrates the multi-vector indexing strategy proposed in Chen et al.'s [Dense X Retrieval: What Retrieval Granularity Should We Use?](https://arxiv.org/abs/2312.06648). The prompt, which you can [try out on the hub](https://smith.langchain.com/hub/wfh/proposal-indexing), directs an LLM to generate de-contextualized "propositions" that can be vectorized to increase retrieval accuracy. You can see the full definition in `proposal_chain.py`.



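If you just want to experiment with that prompt outside of this package, one option is to pull it from the hub. This is a sketch, not part of the template, and it assumes the separate `langchainhub` package is installed:

```python
from langchain import hub

# Pull the proposal-indexing prompt referenced above (requires `langchainhub`).
proposal_prompt = hub.pull("wfh/proposal-indexing")
print(proposal_prompt)
```
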
## Storage

For this demo, we index a short academic paper using the `RecursiveUrlLoader` and store all retriever information locally (using Chroma and a byte store on the local filesystem). You can modify the storage layer in `storage.py`; see the sketch below for one way to swap it out.

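For example, here is a minimal sketch of an alternative, purely in-memory storage layer. It is illustrative only (not part of the template), uses the generic multi-vector retriever APIs, and assumes `OPENAI_API_KEY` is set:

```python
from langchain.embeddings import OpenAIEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryStore
from langchain_community.vectorstores import Chroma


def get_in_memory_retriever(docstore_id_key: str = "doc_id") -> MultiVectorRetriever:
    """Compose a retriever whose proposition index and docstore both live in memory."""
    vectorstore = Chroma(
        collection_name="proposals",
        embedding_function=OpenAIEmbeddings(),
    )
    return MultiVectorRetriever(
        vectorstore=vectorstore,   # propositions are embedded and searched here
        docstore=InMemoryStore(),  # parent documents are looked up here by id
        id_key=docstore_id_key,
    )
```
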
## Environment Setup

Set the `OPENAI_API_KEY` environment variable to access `gpt-3.5` and the OpenAI embeddings classes.

## Indexing

Create the index by running the following:

```shell
poetry install
poetry run python rag_chroma_dense_retrieval/ingest.py
```

## Usage

To use this package, you should first have the LangChain CLI installed:

```shell
pip install -U langchain-cli
```

To create a new LangChain project and install this as the only package, you can do:

```shell
langchain app new my-app --package rag-chroma-dense-retrieval
```

If you want to add this to an existing project, you can just run:

```shell
langchain app add rag-chroma-dense-retrieval
```

And add the following code to your `server.py` file:

```python
from rag_chroma_dense_retrieval import chain

add_routes(app, chain, path="/rag-chroma-dense-retrieval")
```

(Optional) Let's now configure LangSmith. LangSmith will help us trace, monitor, and debug LangChain applications. LangSmith is currently in private beta; you can sign up [here](https://smith.langchain.com/). If you don't have access, you can skip this section.

```shell
export LANGCHAIN_TRACING_V2=true
export LANGCHAIN_API_KEY=<your-api-key>
export LANGCHAIN_PROJECT=<your-project>  # if not specified, defaults to "default"
```

If you are inside this directory, then you can spin up a LangServe instance directly by:

```shell
langchain serve
```

This will start the FastAPI app with a server running locally at
[http://localhost:8000](http://localhost:8000)

We can see all templates at [http://127.0.0.1:8000/docs](http://127.0.0.1:8000/docs)
We can access the playground at [http://127.0.0.1:8000/rag-chroma-dense-retrieval/playground](http://127.0.0.1:8000/rag-chroma-dense-retrieval/playground)

We can access the template from code with:

```python
from langserve.client import RemoteRunnable

runnable = RemoteRunnable("http://localhost:8000/rag-chroma-dense-retrieval")
```
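Once the server is running, the remote runnable behaves like the local chain; for example (the question is illustrative):

```python
answer = runnable.invoke("How are transformers related to convolutional neural networks?")
print(answer)
```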
New binary file (image, 375 KiB); contents not shown.
2859
templates/rag-chroma-dense-retrieval/poetry.lock
generated
Normal file

File diff suppressed because it is too large.
35
templates/rag-chroma-dense-retrieval/pyproject.toml
Normal file
@@ -0,0 +1,35 @@
[tool.poetry]
name = "rag-chroma-dense-retrieval"
version = "0.1.0"
description = "Dense retrieval using vectorized propositions."
authors = [
    "William Fu-Hinthorn <will@langchain.dev>",
]
readme = "README.md"

[tool.poetry.dependencies]
python = ">=3.8.1,<4.0"
langchain = ">=0.0.350"
openai = "<2"
tiktoken = ">=0.5.1"
chromadb = ">=0.4.14"
bs4 = "^0.0.1"

[tool.poetry.group.dev.dependencies]
langchain-cli = ">=0.0.15"

[tool.langserve]
export_module = "rag_chroma_dense_retrieval"
export_attr = "chain"

[tool.templates-hub]
use-case = "rag"
author = "LangChain"
integrations = ["OpenAI", "Chroma"]
tags = ["vectordbs"]

[build-system]
requires = [
    "poetry-core",
]
build-backend = "poetry.core.masonry.api"
@@ -0,0 +1,68 @@
{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "681a5d1e",
   "metadata": {},
   "source": [
    "## Run Template\n",
    "\n",
    "In `server.py`, set:\n",
    "```\n",
    "from fastapi import FastAPI\n",
    "from langserve import add_routes\n",
    "from rag_chroma_dense_retrieval import chain\n",
    "\n",
    "app = FastAPI(\n",
    "    title=\"LangChain Server\",\n",
    "    version=\"1.0\",\n",
    "    description=\"Retriever and Generator for RAG Chroma Dense Retrieval\",\n",
    ")\n",
    "\n",
    "add_routes(app, chain, path=\"/rag-chroma-dense-retrieval\")\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    import uvicorn\n",
    "\n",
    "    uvicorn.run(app, host=\"localhost\", port=8000)\n",
    "\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d774be2a",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langserve.client import RemoteRunnable\n",
    "\n",
    "rag_app = RemoteRunnable(\"http://localhost:8000/rag-chroma-dense-retrieval\")\n",
    "rag_app.invoke(\"How are transformers related to convolutional neural networks?\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
4
templates/rag-chroma-dense-retrieval/rag_chroma_dense_retrieval/__init__.py
Normal file
@@ -0,0 +1,4 @@
from rag_chroma_dense_retrieval.chain import chain
from rag_chroma_dense_retrieval.proposal_chain import proposition_chain

__all__ = ["chain", "proposition_chain"]
67
templates/rag-chroma-dense-retrieval/rag_chroma_dense_retrieval/chain.py
Normal file
@@ -0,0 +1,67 @@
from langchain_community.chat_models import ChatOpenAI
from langchain_core.load import load
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel
from langchain_core.runnables import RunnablePassthrough

from rag_chroma_dense_retrieval.constants import DOCSTORE_ID_KEY
from rag_chroma_dense_retrieval.storage import get_multi_vector_retriever


def format_docs(docs: list) -> str:
    loaded_docs = [load(doc) for doc in docs]
    return "\n".join(
        [
            f"<Document id={i}>\n{doc.page_content}\n</Document>"
            for i, doc in enumerate(loaded_docs)
        ]
    )


def rag_chain(retriever):
    """
    The RAG chain

    :param retriever: A retriever that fetches the necessary context for the model.
    :return: A chain of runnables representing the RAG process.
    """
    model = ChatOpenAI(temperature=0, model="gpt-4-1106-preview", max_tokens=1024)
    prompt = ChatPromptTemplate.from_messages(
        [
            (
                "system",
                "You are an AI assistant. Answer based on the retrieved documents:"
                "\n<Documents>\n{context}\n</Documents>",
            ),
            ("user", "{question}?"),
        ]
    )

    # Define the RAG pipeline
    chain = (
        {
            "context": retriever | format_docs,
            "question": RunnablePassthrough(),
        }
        | prompt
        | model
        | StrOutputParser()
    )

    return chain


# Create the multi-vector retriever
retriever = get_multi_vector_retriever(DOCSTORE_ID_KEY)

# Create RAG chain
chain = rag_chain(retriever)


# Add typing for input
class Question(BaseModel):
    __root__: str


chain = chain.with_types(input_type=Question)
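As a usage sketch (not part of the diff): once the index has been built with `ingest.py` and `OPENAI_API_KEY` is set, the exported `chain` can be invoked directly with a question string, since its input type is the bare-string `Question` model above:

```python
from rag_chroma_dense_retrieval import chain

# Illustrative question against the indexed "Attention Is All You Need" paper.
answer = chain.invoke("What is multi-head attention?")
print(answer)
```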
1
templates/rag-chroma-dense-retrieval/rag_chroma_dense_retrieval/constants.py
Normal file
@@ -0,0 +1 @@
DOCSTORE_ID_KEY = "doc_id"
87
templates/rag-chroma-dense-retrieval/rag_chroma_dense_retrieval/ingest.py
Normal file
@@ -0,0 +1,87 @@
import logging
import uuid
from typing import Sequence

from langchain_core.documents import Document
from langchain_core.runnables import Runnable

from rag_chroma_dense_retrieval.constants import DOCSTORE_ID_KEY
from rag_chroma_dense_retrieval.proposal_chain import proposition_chain
from rag_chroma_dense_retrieval.storage import get_multi_vector_retriever

logging.basicConfig(level=logging.INFO)

logger = logging.getLogger(__name__)


def add_documents(
    retriever,
    propositions: Sequence[Sequence[str]],
    docs: Sequence[Document],
    id_key: str = DOCSTORE_ID_KEY,
):
    doc_ids = [
        str(uuid.uuid5(uuid.NAMESPACE_DNS, doc.metadata["source"])) for doc in docs
    ]
    prop_docs = [
        Document(page_content=prop, metadata={id_key: doc_ids[i]})
        for i, props in enumerate(propositions)
        for prop in props
        if prop
    ]
    retriever.vectorstore.add_documents(prop_docs)
    retriever.docstore.mset(list(zip(doc_ids, docs)))


def create_index(
    docs: Sequence[Document],
    indexer: Runnable,
    docstore_id_key: str = DOCSTORE_ID_KEY,
):
    """
    Create retriever that indexes docs and their propositions

    :param docs: Documents to index
    :param indexer: Runnable that creates additional propositions per doc
    :param docstore_id_key: Key to use to store the docstore id
    :return: Retriever
    """
    logger.info("Creating multi-vector retriever")
    retriever = get_multi_vector_retriever(docstore_id_key)
    propositions = indexer.batch([{"input": doc.page_content} for doc in docs])

    add_documents(
        retriever,
        propositions,
        docs,
        id_key=docstore_id_key,
    )

    return retriever


if __name__ == "__main__":
    # For our example, we'll load docs from the web
    from langchain.text_splitter import RecursiveCharacterTextSplitter  # noqa
    from langchain_community.document_loaders.recursive_url_loader import (
        RecursiveUrlLoader,
    )  # noqa

    # The "Attention Is All You Need" paper
    # Could add more parsing here, as it's very raw.
    loader = RecursiveUrlLoader("https://ar5iv.labs.arxiv.org/html/1706.03762")
    data = loader.load()
    logger.info(f"Loaded {len(data)} documents")

    # Split
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=8000, chunk_overlap=0)
    all_splits = text_splitter.split_documents(data)
    logger.info(f"Split into {len(all_splits)} documents")

    # Create retriever
    retriever_multi_vector_img = create_index(
        all_splits,
        proposition_chain,
        DOCSTORE_ID_KEY,
    )
107
templates/rag-chroma-dense-retrieval/rag_chroma_dense_retrieval/proposal_chain.py
Normal file
@@ -0,0 +1,107 @@
import logging

from langchain.output_parsers.openai_tools import JsonOutputToolsParser
from langchain_community.chat_models import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda

logging.basicConfig(level=logging.INFO)

logger = logging.getLogger(__name__)

# Modified from the paper to be more robust to benign prompt injection
# https://arxiv.org/abs/2312.06648
# @misc{chen2023dense,
#   title={Dense X Retrieval: What Retrieval Granularity Should We Use?},
#   author={Tong Chen and Hongwei Wang and Sihao Chen and Wenhao Yu and Kaixin Ma
#           and Xinran Zhao and Hongming Zhang and Dong Yu},
#   year={2023},
#   eprint={2312.06648},
#   archivePrefix={arXiv},
#   primaryClass={cs.CL}
# }
PROMPT = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """Decompose the "Content" into clear and simple propositions, ensuring they are interpretable out of
context.
1. Split compound sentence into simple sentences. Maintain the original phrasing from the input
whenever possible.
2. For any named entity that is accompanied by additional descriptive information, separate this
information into its own distinct proposition.
3. Decontextualize the proposition by adding necessary modifier to nouns or entire sentences
and replacing pronouns (e.g., "it", "he", "she", "they", "this", "that") with the full name of the
entities they refer to.
4. Present the results as a list of strings, formatted in JSON.

Example:

Input: Title: Ēostre. Section: Theories and interpretations, Connection to Easter Hares. Content:
The earliest evidence for the Easter Hare (Osterhase) was recorded in south-west Germany in
1678 by the professor of medicine Georg Franck von Franckenau, but it remained unknown in
other parts of Germany until the 18th century. Scholar Richard Sermon writes that "hares were
frequently seen in gardens in spring, and thus may have served as a convenient explanation for the
origin of the colored eggs hidden there for children. Alternatively, there is a European tradition
that hares laid eggs, since a hare’s scratch or form and a lapwing’s nest look very similar, and
both occur on grassland and are first seen in the spring. In the nineteenth century the influence
of Easter cards, toys, and books was to make the Easter Hare/Rabbit popular throughout Europe.
German immigrants then exported the custom to Britain and America where it evolved into the
Easter Bunny."
Output: [ "The earliest evidence for the Easter Hare was recorded in south-west Germany in
1678 by Georg Franck von Franckenau.", "Georg Franck von Franckenau was a professor of
medicine.", "The evidence for the Easter Hare remained unknown in other parts of Germany until
the 18th century.", "Richard Sermon was a scholar.", "Richard Sermon writes a hypothesis about
the possible explanation for the connection between hares and the tradition during Easter", "Hares
were frequently seen in gardens in spring.", "Hares may have served as a convenient explanation
for the origin of the colored eggs hidden in gardens for children.", "There is a European tradition
that hares laid eggs.", "A hare’s scratch or form and a lapwing’s nest look very similar.", "Both
hares and lapwing’s nests occur on grassland and are first seen in the spring.", "In the nineteenth
century the influence of Easter cards, toys, and books was to make the Easter Hare/Rabbit popular
throughout Europe.", "German immigrants exported the custom of the Easter Hare/Rabbit to
Britain and America.", "The custom of the Easter Hare/Rabbit evolved into the Easter Bunny in
Britain and America."]""",  # noqa
        ),
        ("user", "Decompose the following:\n{input}"),
    ]
)


def get_propositions(tool_calls: list) -> list:
    if not tool_calls:
        raise ValueError("No tool calls found")
    return tool_calls[0]["args"]["propositions"]


def empty_proposals(x):
    # Model couldn't generate proposals
    return []


proposition_chain = (
    PROMPT
    | ChatOpenAI(model="gpt-3.5-turbo-16k").bind(
        tools=[
            {
                "type": "function",
                "function": {
                    "name": "decompose_content",
                    "description": "Return the decomposed propositions",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "propositions": {
                                "type": "array",
                                "items": {"type": "string"},
                            }
                        },
                        "required": ["propositions"],
                    },
                },
            }
        ],
        tool_choice={"type": "function", "function": {"name": "decompose_content"}},
    )
    | JsonOutputToolsParser()
    | get_propositions
).with_fallbacks([RunnableLambda(empty_proposals)])
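A quick sketch of what `proposition_chain` produces (not part of the diff; the passage is illustrative and an OpenAI key is required). The chain expects an `input` key, matching the `indexer.batch(...)` call in `ingest.py`, and returns a list of de-contextualized proposition strings, or `[]` via the fallback when the model emits no tool call:

```python
from rag_chroma_dense_retrieval.proposal_chain import proposition_chain

propositions = proposition_chain.invoke(
    {"input": "The Transformer relies on multi-head attention instead of recurrence."}
)
print(propositions)  # a list of short, standalone statements
```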
38
templates/rag-chroma-dense-retrieval/rag_chroma_dense_retrieval/storage.py
Normal file
@@ -0,0 +1,38 @@
import logging
from pathlib import Path

from langchain.embeddings import OpenAIEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import LocalFileStore
from langchain_community.vectorstores import Chroma

logging.basicConfig(level=logging.INFO)

logger = logging.getLogger(__name__)


def get_multi_vector_retriever(docstore_id_key: str):
    """Create the composed retriever object."""
    vectorstore = get_vectorstore()
    store = get_docstore()
    return MultiVectorRetriever(
        vectorstore=vectorstore,
        byte_store=store,
        id_key=docstore_id_key,
    )


def get_vectorstore(collection_name: str = "proposals"):
    """Get the vectorstore used for this example."""
    return Chroma(
        collection_name=collection_name,
        persist_directory=str(Path(__file__).parent.parent / "chroma_db_proposals"),
        embedding_function=OpenAIEmbeddings(),
    )


def get_docstore():
    """Get the metadata store used for this example."""
    return LocalFileStore(
        str(Path(__file__).parent.parent / "multi_vector_retriever_metadata")
    )