Update propositional-retrieval template (#14766)

More descriptive name. Add parser in ingest. Update image link
This commit is contained in:
William FH 2023-12-15 07:57:45 -08:00 committed by GitHub
parent 4855964332
commit 65091ebe50
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
15 changed files with 32 additions and 26 deletions

View File

@ -1,8 +1,8 @@
# rag-chroma-dense-retrieval # propositional-retrieval
This template demonstrates the multi-vector indexing strategy proposed in Chen et al.'s [Dense X Retrieval: What Retrieval Granularity Should We Use?](https://arxiv.org/abs/2312.06648). The prompt, which you can [try out on the hub](https://smith.langchain.com/hub/wfh/proposal-indexing), directs an LLM to generate de-contextualized "propositions" which can be vectorized to increase the retrieval accuracy. You can see the full definition in `proposal_chain.py`. This template demonstrates the multi-vector indexing strategy proposed in Chen et al.'s [Dense X Retrieval: What Retrieval Granularity Should We Use?](https://arxiv.org/abs/2312.06648). The prompt, which you can [try out on the hub](https://smith.langchain.com/hub/wfh/proposal-indexing), directs an LLM to generate de-contextualized "propositions" which can be vectorized to increase the retrieval accuracy. You can see the full definition in `proposal_chain.py`.
![Retriever Diagram](./_images/retriever_diagram.png) ![Retriever Diagram](https://github.com/langchain-ai/langchain/raw/master/templates/propositional-retrieval/_images/retriever_diagram.png)
## Storage ## Storage
@ -18,7 +18,7 @@ Create the index by running the following:
```shell ```shell
poetry install poetry install
poetry run python rag_chroma_dense_retrieval/ingest.py poetry run python propositional_retrieval/ingest.py
``` ```
## Usage ## Usage
@ -32,21 +32,21 @@ pip install -U langchain-cli
To create a new LangChain project and install this as the only package, you can do: To create a new LangChain project and install this as the only package, you can do:
```shell ```shell
langchain app new my-app --package rag-chroma-dense-retrieval langchain app new my-app --package propositional-retrieval
``` ```
If you want to add this to an existing project, you can just run: If you want to add this to an existing project, you can just run:
```shell ```shell
langchain app add rag-chroma-dense-retrieval langchain app add propositional-retrieval
``` ```
And add the following code to your `server.py` file: And add the following code to your `server.py` file:
```python ```python
from rag_chroma_dense_retrieval import chain from propositional_retrieval import chain
add_routes(app, chain, path="/rag-chroma-dense-retrieval") add_routes(app, chain, path="/propositional-retrieval")
``` ```
(Optional) Let's now configure LangSmith. (Optional) Let's now configure LangSmith.
@ -70,12 +70,12 @@ This will start the FastAPI app with a server running locally at
[http://localhost:8000](http://localhost:8000) [http://localhost:8000](http://localhost:8000)
We can see all templates at [http://127.0.0.1:8000/docs](http://127.0.0.1:8000/docs) We can see all templates at [http://127.0.0.1:8000/docs](http://127.0.0.1:8000/docs)
We can access the playground at [http://127.0.0.1:8000/rag-chroma-dense-retrieval/playground](http://127.0.0.1:8000/rag-chroma-dense-retrieval/playground) We can access the playground at [http://127.0.0.1:8000/propositional-retrieval/playground](http://127.0.0.1:8000/propositional-retrieval/playground)
We can access the template from code with: We can access the template from code with:
```python ```python
from langserve.client import RemoteRunnable from langserve.client import RemoteRunnable
runnable = RemoteRunnable("http://localhost:8000/rag-chroma-dense-retrieval") runnable = RemoteRunnable("http://localhost:8000/propositional-retrieval")
``` ```

View File

Before

Width:  |  Height:  |  Size: 375 KiB

After

Width:  |  Height:  |  Size: 375 KiB

View File

@ -12,7 +12,7 @@
"```\n", "```\n",
"from fastapi import FastAPI\n", "from fastapi import FastAPI\n",
"from langserve import add_routes\n", "from langserve import add_routes\n",
"from rag_chroma_dense_retrieval import chain\n", "from propositional_retrieval import chain\n",
"\n", "\n",
"app = FastAPI(\n", "app = FastAPI(\n",
" title=\"LangChain Server\",\n", " title=\"LangChain Server\",\n",
@ -20,7 +20,7 @@
" description=\"Retriever and Generator for RAG Chroma Dense Retrieval\",\n", " description=\"Retriever and Generator for RAG Chroma Dense Retrieval\",\n",
")\n", ")\n",
"\n", "\n",
"add_routes(app, chain, path=\"/rag-chroma-dense-retrieval\")\n", "add_routes(app, chain, path=\"/propositional-retrieval\")\n",
"\n", "\n",
"if __name__ == \"__main__\":\n", "if __name__ == \"__main__\":\n",
" import uvicorn\n", " import uvicorn\n",
@ -39,7 +39,7 @@
"source": [ "source": [
"from langserve.client import RemoteRunnable\n", "from langserve.client import RemoteRunnable\n",
"\n", "\n",
"rag_app = RemoteRunnable(\"http://localhost:8001/rag-chroma-dense-retrieval\")\n", "rag_app = RemoteRunnable(\"http://localhost:8001/propositional-retrieval\")\n",
"rag_app.invoke(\"How are transformers related to convolutional neural networks?\")" "rag_app.invoke(\"How are transformers related to convolutional neural networks?\")"
] ]
} }

View File

@ -0,0 +1,4 @@
from propositional_retrieval.chain import chain
from propositional_retrieval.proposal_chain import proposition_chain
__all__ = ["chain", "proposition_chain"]

View File

@ -5,8 +5,8 @@ from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel from langchain_core.pydantic_v1 import BaseModel
from langchain_core.runnables import RunnablePassthrough from langchain_core.runnables import RunnablePassthrough
from rag_chroma_dense_retrieval.constants import DOCSTORE_ID_KEY from propositional_retrieval.constants import DOCSTORE_ID_KEY
from rag_chroma_dense_retrieval.storage import get_multi_vector_retriever from propositional_retrieval.storage import get_multi_vector_retriever
def format_docs(docs: list) -> str: def format_docs(docs: list) -> str:

View File

@ -2,12 +2,13 @@ import logging
import uuid import uuid
from typing import Sequence from typing import Sequence
from bs4 import BeautifulSoup as Soup
from langchain_core.documents import Document from langchain_core.documents import Document
from langchain_core.runnables import Runnable from langchain_core.runnables import Runnable
from rag_chroma_dense_retrieval.constants import DOCSTORE_ID_KEY from propositional_retrieval.constants import DOCSTORE_ID_KEY
from rag_chroma_dense_retrieval.proposal_chain import proposition_chain from propositional_retrieval.proposal_chain import proposition_chain
from rag_chroma_dense_retrieval.storage import get_multi_vector_retriever from propositional_retrieval.storage import get_multi_vector_retriever
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)
@ -48,7 +49,9 @@ def create_index(
""" """
logger.info("Creating multi-vector retriever") logger.info("Creating multi-vector retriever")
retriever = get_multi_vector_retriever(docstore_id_key) retriever = get_multi_vector_retriever(docstore_id_key)
propositions = indexer.batch([{"input": doc.page_content} for doc in docs]) propositions = indexer.batch(
[{"input": doc.page_content} for doc in docs], {"max_concurrency": 10}
)
add_documents( add_documents(
retriever, retriever,
@ -69,12 +72,15 @@ if __name__ == "__main__":
# The attention is all you need paper # The attention is all you need paper
# Could add more parsing here, as it's very raw. # Could add more parsing here, as it's very raw.
loader = RecursiveUrlLoader("https://ar5iv.labs.arxiv.org/html/1706.03762") loader = RecursiveUrlLoader(
"https://ar5iv.labs.arxiv.org/html/1706.03762",
max_depth=2,
extractor=lambda x: Soup(x, "html.parser").text,
)
data = loader.load() data = loader.load()
logger.info(f"Loaded {len(data)} documents") logger.info(f"Loaded {len(data)} documents")
# Split # Split
text_splitter = RecursiveCharacterTextSplitter(chunk_size=8000, chunk_overlap=0) text_splitter = RecursiveCharacterTextSplitter(chunk_size=8000, chunk_overlap=0)
all_splits = text_splitter.split_documents(data) all_splits = text_splitter.split_documents(data)
logger.info(f"Split into {len(all_splits)} documents") logger.info(f"Split into {len(all_splits)} documents")

View File

@ -1,7 +1,7 @@
[tool.poetry] [tool.poetry]
name = "rag-chroma-dense-retrieval" name = "propositional-retrieval"
version = "0.1.0" version = "0.1.0"
description = "Dense retrieval using vectorized propositions.s" description = "Dense retrieval using vectorized propositions."
authors = [ authors = [
"William Fu-Hinthorn <will@langchain.dev>", "William Fu-Hinthorn <will@langchain.dev>",
] ]

View File

@ -1,4 +0,0 @@
from rag_chroma_dense_retrieval.chain import chain
from rag_chroma_dense_retrieval.proposal_chain import proposition_chain
__all__ = ["chain", "proposition_chain"]