community[minor]: Qdrant sparse vector retriever (#14814)

## Description

This PR intends to add support for Qdrant's new [sparse vector
retrieval](https://qdrant.tech/articles/sparse-vectors/) by introducing
a new retriever class, `QdrantSparseVectorRetriever`.

Necessary usage docs and integration tests have been added for the
retriever.

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
Anush 2023-12-20 12:52:19 +05:30 committed by GitHub
parent c53fab63a3
commit 60c70effe9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 636 additions and 0 deletions

View File

@ -0,0 +1,255 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "ce0f17b9",
"metadata": {},
"source": [
"# Qdrant Sparse Vector Retriever\n",
"\n",
">[Qdrant](https://qdrant.tech/) is an open-source, high-performance vector search engine/database.\n",
"\n",
"\n",
">`QdrantSparseVectorRetriever` uses [sparse vectors](https://qdrant.tech/articles/sparse-vectors/) introduced in Qdrant [v1.7.0](https://qdrant.tech/articles/qdrant-1.7.x/) for document retrieval.\n"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "c307b082",
"metadata": {},
"source": [
"Install the 'qdrant_client' package:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bba863a2-977c-4add-b5f4-bfc33a80eae5",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"%pip install qdrant_client"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "c10dd962",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from qdrant_client import QdrantClient, models\n",
"\n",
"client = QdrantClient(location=\":memory:\")\n",
"collection_name = \"sparse_collection\"\n",
"vector_name = \"sparse_vector\"\n",
"\n",
"client.create_collection(\n",
" collection_name,\n",
" vectors_config={},\n",
" sparse_vectors_config={\n",
" vector_name: models.SparseVectorParams(\n",
" index=models.SparseIndexParams(\n",
" on_disk=False,\n",
" )\n",
" )\n",
" },\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "f47a2bfe",
"metadata": {},
"outputs": [],
"source": [
"from langchain_community.retrievers import QdrantSparseVectorRetriever\n",
"from langchain_core.documents import Document"
]
},
{
"cell_type": "markdown",
"id": "41baa0d1",
"metadata": {},
"source": [
"Create a demo encoder function:"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "f2eff08e",
"metadata": {},
"outputs": [],
"source": [
"import random\n",
"\n",
"\n",
"def demo_encoder(_: str) -> tuple[list[int], list[float]]:\n",
" return (\n",
" sorted(random.sample(range(100), 100)),\n",
" [random.uniform(0.1, 1.0) for _ in range(100)],\n",
" )\n",
"\n",
"\n",
"# Create a retriever with a demo encoder\n",
"retriever = QdrantSparseVectorRetriever(\n",
" client=client,\n",
" collection_name=collection_name,\n",
" sparse_vector_name=vector_name,\n",
" sparse_encoder=demo_encoder,\n",
")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "b68debff",
"metadata": {},
"source": [
"Create some documents:"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "cd8a7b17",
"metadata": {},
"outputs": [],
"source": [
"docs = [\n",
" Document(\n",
" metadata={\n",
" \"title\": \"Beyond Horizons: AI Chronicles\",\n",
" \"author\": \"Dr. Cassandra Mitchell\",\n",
" },\n",
" page_content=\"An in-depth exploration of the fascinating journey of artificial intelligence, narrated by Dr. Mitchell. This captivating account spans the historical roots, current advancements, and speculative futures of AI, offering a gripping narrative that intertwines technology, ethics, and societal implications.\",\n",
" ),\n",
" Document(\n",
" metadata={\n",
" \"title\": \"Synergy Nexus: Merging Minds with Machines\",\n",
" \"author\": \"Prof. Benjamin S. Anderson\",\n",
" },\n",
" page_content=\"Professor Anderson delves into the synergistic possibilities of human-machine collaboration in 'Synergy Nexus.' The book articulates a vision where humans and AI seamlessly coalesce, creating new dimensions of productivity, creativity, and shared intelligence.\",\n",
" ),\n",
" Document(\n",
" metadata={\n",
" \"title\": \"AI Dilemmas: Navigating the Unknown\",\n",
" \"author\": \"Dr. Elena Rodriguez\",\n",
" },\n",
" page_content=\"Dr. Rodriguez pens an intriguing narrative in 'AI Dilemmas,' probing the uncharted territories of ethical quandaries arising from AI advancements. The book serves as a compass, guiding readers through the complex terrain of moral decisions confronting developers, policymakers, and society as AI evolves.\",\n",
" ),\n",
" Document(\n",
" metadata={\n",
" \"title\": \"Sentient Threads: Weaving AI Consciousness\",\n",
" \"author\": \"Prof. Alexander J. Bennett\",\n",
" },\n",
" page_content=\"In 'Sentient Threads,' Professor Bennett unravels the enigma of AI consciousness, presenting a tapestry of arguments that scrutinize the very essence of machine sentience. The book ignites contemplation on the ethical and philosophical dimensions surrounding the quest for true AI awareness.\",\n",
" ),\n",
" Document(\n",
" metadata={\n",
" \"title\": \"Silent Alchemy: Unseen AI Alleviations\",\n",
" \"author\": \"Dr. Emily Foster\",\n",
" },\n",
" page_content=\"Building upon her previous work, Dr. Foster unveils 'Silent Alchemy,' a profound examination of the covert presence of AI in our daily lives. This illuminating piece reveals the subtle yet impactful ways in which AI invisibly shapes our routines, emphasizing the need for heightened awareness in our technology-driven world.\",\n",
" ),\n",
"]"
]
},
{
"cell_type": "markdown",
"id": "a5e673fa",
"metadata": {},
"source": [
"Add the documents to the collection, then perform a retrieval:"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "3c5970db",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['1a3e0d292e6444d39451d0588ce746dc',\n",
" '19b180dd31e749359d49967e5d5dcab7',\n",
" '8de69e56086f47748e32c9e379e6865b',\n",
" 'f528fac385954e46b89cf8607bf0ee5a',\n",
" 'c1a6249d005d4abd9192b1d0b829cebe']"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"retriever.add_documents(docs)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "4fffd0af",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content=\"In 'Sentient Threads,' Professor Bennett unravels the enigma of AI consciousness, presenting a tapestry of arguments that scrutinize the very essence of machine sentience. The book ignites contemplation on the ethical and philosophical dimensions surrounding the quest for true AI awareness.\", metadata={'title': 'Sentient Threads: Weaving AI Consciousness', 'author': 'Prof. Alexander J. Bennett'}),\n",
" Document(page_content=\"Dr. Rodriguez pens an intriguing narrative in 'AI Dilemmas,' probing the uncharted territories of ethical quandaries arising from AI advancements. The book serves as a compass, guiding readers through the complex terrain of moral decisions confronting developers, policymakers, and society as AI evolves.\", metadata={'title': 'AI Dilemmas: Navigating the Unknown', 'author': 'Dr. Elena Rodriguez'}),\n",
" Document(page_content=\"Professor Anderson delves into the synergistic possibilities of human-machine collaboration in 'Synergy Nexus.' The book articulates a vision where humans and AI seamlessly coalesce, creating new dimensions of productivity, creativity, and shared intelligence.\", metadata={'title': 'Synergy Nexus: Merging Minds with Machines', 'author': 'Prof. Benjamin S. Anderson'}),\n",
" Document(page_content='An in-depth exploration of the fascinating journey of artificial intelligence, narrated by Dr. Mitchell. This captivating account spans the historical roots, current advancements, and speculative futures of AI, offering a gripping narrative that intertwines technology, ethics, and societal implications.', metadata={'title': 'Beyond Horizons: AI Chronicles', 'author': 'Dr. Cassandra Mitchell'})]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"retriever.get_relevant_documents(\n",
" \"Life and ethical dilemmas of AI\",\n",
")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -57,6 +57,9 @@ from langchain_community.retrievers.pinecone_hybrid_search import (
PineconeHybridSearchRetriever,
)
from langchain_community.retrievers.pubmed import PubMedRetriever
from langchain_community.retrievers.qdrant_sparse_vector_retriever import (
QdrantSparseVectorRetriever,
)
from langchain_community.retrievers.remote_retriever import RemoteLangChainRetriever
from langchain_community.retrievers.svm import SVMRetriever
from langchain_community.retrievers.tavily_search_api import TavilySearchAPIRetriever
@ -93,6 +96,7 @@ __all__ = [
"OutlineRetriever",
"PineconeHybridSearchRetriever",
"PubMedRetriever",
"QdrantSparseVectorRetriever",
"RemoteLangChainRetriever",
"SVMRetriever",
"TavilySearchAPIRetriever",

View File

@ -0,0 +1,206 @@
import uuid
from itertools import islice
from typing import (
Any,
Callable,
Dict,
Generator,
Iterable,
List,
Optional,
Sequence,
Tuple,
cast,
)
from langchain_core.callbacks import CallbackManagerForRetrieverRun
from langchain_core.documents import Document
from langchain_core.pydantic_v1 import Extra, root_validator
from langchain_core.retrievers import BaseRetriever
from langchain_community.vectorstores.qdrant import Qdrant, QdrantException
class QdrantSparseVectorRetriever(BaseRetriever):
    """Retriever that searches a Qdrant collection using sparse vectors.

    Texts are turned into ``(indices, values)`` pairs by the user-supplied
    ``sparse_encoder`` and stored in / queried against the named sparse vector
    ``sparse_vector_name`` of ``collection_name``.
    """

    client: Any
    """'qdrant_client' instance to use."""
    collection_name: str
    """Qdrant collection name."""
    sparse_vector_name: str
    """Name of the sparse vector to use."""
    sparse_encoder: Callable[[str], Tuple[List[int], List[float]]]
    """Sparse encoder function to use."""
    k: int = 4
    """Number of documents to return per query. Defaults to 4."""
    filter: Optional[Any] = None
    """Qdrant qdrant_client.models.Filter to use for queries. Defaults to None."""
    content_payload_key: str = "content"
    """Payload field containing the document content. Defaults to 'content'."""
    metadata_payload_key: str = "metadata"
    """Payload field containing the document metadata. Defaults to 'metadata'."""
    search_options: Dict[str, Any] = {}
    """Additional search options to pass to qdrant_client.QdrantClient.search()."""

    class Config:
        """Configuration for this pydantic object."""

        extra = Extra.forbid
        arbitrary_types_allowed = True

    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate the client, the filter and the target collection.

        Args:
            values: Field values collected by pydantic for this model.

        Returns:
            The unchanged ``values`` mapping.

        Raises:
            ImportError: If the Qdrant client dependencies cannot be imported.
            ValueError: If ``client`` is not a ``QdrantClient`` or ``filter`` is
                neither ``None`` nor a ``qdrant_client.models.Filter``.
            QdrantException: If the collection cannot be fetched, or it has no
                sparse vector named ``sparse_vector_name``.
        """
        try:
            from grpc import RpcError
            from qdrant_client import QdrantClient, models
            from qdrant_client.http.exceptions import UnexpectedResponse
        except ImportError:
            raise ImportError(
                "Could not import qdrant-client python package. "
                "Please install it with `pip install qdrant-client`."
            )

        client = values["client"]
        if not isinstance(client, QdrantClient):
            raise ValueError(
                f"client should be an instance of qdrant_client.QdrantClient, "
                f"got {type(client)}"
            )

        query_filter = values["filter"]
        if query_filter is not None and not isinstance(query_filter, models.Filter):
            raise ValueError(
                f"filter should be an instance of qdrant_client.models.Filter, "
                f"got {type(query_filter)}"
            )

        client = cast(QdrantClient, client)
        collection_name = values["collection_name"]
        sparse_vector_name = values["sparse_vector_name"]

        # Only the lookup itself is guarded, so that the "missing sparse vector"
        # error raised below can never be rewritten into "does not exist".
        try:
            collection_info = client.get_collection(collection_name)
        except (UnexpectedResponse, RpcError, ValueError) as e:
            raise QdrantException(
                f"Qdrant collection {collection_name} does not exist."
            ) from e

        # Fall back to an empty mapping when no sparse vector config is reported,
        # so the membership test cannot fail with a TypeError on None.
        sparse_vectors_config = collection_info.config.params.sparse_vectors or {}
        if sparse_vector_name not in sparse_vectors_config:
            available = ", ".join(sparse_vectors_config.keys())
            hint = f" Did you mean one of {available}?" if available else ""
            raise QdrantException(
                f"Existing Qdrant collection {collection_name} does not "
                f"contain sparse vector named {sparse_vector_name}.{hint}"
            )
        return values

    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
    ) -> List[Document]:
        """Encode ``query`` and return the top ``k`` matching documents.

        Args:
            query: Text to encode with ``sparse_encoder`` and search for.
            run_manager: Callback manager for this retriever run (unused here).

        Returns:
            Documents built from the scored points returned by the search.
        """
        from qdrant_client import QdrantClient, models

        client = cast(QdrantClient, self.client)
        query_indices, query_values = self.sparse_encoder(query)
        results = client.search(
            self.collection_name,
            query_filter=self.filter,
            query_vector=models.NamedSparseVector(
                name=self.sparse_vector_name,
                vector=models.SparseVector(
                    indices=query_indices,
                    values=query_values,
                ),
            ),
            limit=self.k,
            with_vectors=False,
            **self.search_options,
        )
        return [
            Qdrant._document_from_scored_point(
                point, self.content_payload_key, self.metadata_payload_key
            )
            for point in results
        ]

    def add_documents(self, documents: List[Document], **kwargs: Any) -> List[str]:
        """Encode documents with the sparse encoder and add them to the collection.

        Args:
            documents (List[Document]): Documents to add to the collection.
            **kwargs: Forwarded unchanged to ``add_texts``.

        Returns:
            List[str]: List of IDs of the added texts.
        """
        texts = [doc.page_content for doc in documents]
        metadatas = [doc.metadata for doc in documents]
        return self.add_texts(texts, metadatas, **kwargs)

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        ids: Optional[Sequence[str]] = None,
        batch_size: int = 64,
        **kwargs: Any,
    ) -> List[str]:
        """Encode texts with the sparse encoder and upsert them in batches.

        Args:
            texts: Texts to add; may be any iterable, including a generator.
            metadatas: Optional metadata dicts, matched to ``texts`` by position.
            ids: Optional point ids; random UUID hex strings are used when omitted.
            batch_size: Number of texts sent per upsert call. Defaults to 64.
            **kwargs: Forwarded unchanged to ``QdrantClient.upsert``.

        Returns:
            List[str]: IDs of the upserted points, in insertion order.
        """
        from qdrant_client import QdrantClient

        added_ids = []
        client = cast(QdrantClient, self.client)
        for batch_ids, points in self._generate_rest_batches(
            texts, metadatas, ids, batch_size
        ):
            client.upsert(self.collection_name, points=points, **kwargs)
            added_ids.extend(batch_ids)
        return added_ids

    def _generate_rest_batches(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        ids: Optional[Sequence[str]] = None,
        batch_size: int = 64,
    ) -> Generator[Tuple[List[str], List[Any]], None, None]:
        """Yield ``(ids, points)`` pairs covering ``texts`` in batches.

        Args:
            texts: Texts to encode; consumed exactly once.
            metadatas: Optional metadata dicts, matched to ``texts`` by position.
            ids: Optional point ids; generated per batch when omitted or empty.
            batch_size: Maximum number of texts per yielded batch.

        Yields:
            A tuple of the batch's ids and the ``PointStruct`` objects to upsert.
        """
        from qdrant_client import models as rest

        texts_iterator = iter(texts)
        metadatas_iterator = iter(metadatas or [])
        # Default ids are created per batch below. Pre-generating them by looping
        # over `texts` would exhaust a one-shot iterable before it is encoded.
        ids_iterator = iter(ids) if ids else None
        while batch_texts := list(islice(texts_iterator, batch_size)):
            # Take the corresponding metadata and id for each text in a batch
            batch_metadatas = list(islice(metadatas_iterator, batch_size)) or None
            if ids_iterator is None:
                batch_ids = [uuid.uuid4().hex for _ in batch_texts]
            else:
                # Never take more ids than there are texts in this batch.
                batch_ids = list(islice(ids_iterator, len(batch_texts)))
            # Generate the sparse embeddings for all the texts in a batch
            batch_embeddings: List[Tuple[List[int], List[float]]] = [
                self.sparse_encoder(text) for text in batch_texts
            ]
            points = [
                rest.PointStruct(
                    id=point_id,
                    vector={
                        self.sparse_vector_name: rest.SparseVector(
                            indices=sparse_vector[0],
                            values=sparse_vector[1],
                        )
                    },
                    payload=payload,
                )
                for point_id, sparse_vector, payload in zip(
                    batch_ids,
                    batch_embeddings,
                    Qdrant._build_payloads(
                        batch_texts,
                        batch_metadatas,
                        self.content_payload_key,
                        self.metadata_payload_key,
                    ),
                )
            ]
            yield batch_ids, points

View File

@ -0,0 +1,170 @@
import random
import uuid
from typing import List, Tuple
import pytest
from langchain_core.documents import Document
from langchain_community.retrievers import QdrantSparseVectorRetriever
from langchain_community.vectorstores.qdrant import QdrantException
def consistent_fake_sparse_encoder(
    query: str, size: int = 100, density: float = 0.7
) -> Tuple[List[int], List[float]]:
    """
    Generates a consistent fake sparse vector.

    The same query always produces the same vector, in any process, and the
    module-level ``random`` generator is left untouched.

    Parameters:
    - query (str): The query string to make the function deterministic.
    - size (int): The size of the vector to generate.
    - density (float): The density of the vector to generate.

    Returns:
    - indices (list): List of indices where the non-zero elements are located.
    - values (list): List of corresponding float values at the non-zero indices.
    """
    # Ensure density is within the valid range [0, 1]
    density = max(0.0, min(1.0, density))

    # Seed a private generator with the query text itself. `hash(str)` is salted
    # per interpreter process, and `random.seed()` would clobber global RNG state.
    rng = random.Random(query)

    # Calculate the number of non-zero elements based on density
    num_non_zero_elements = int(size * density)

    # Generate random indices without replacement
    indices = sorted(rng.sample(range(size), num_non_zero_elements))

    # Generate random float values for the non-zero elements
    values = [rng.uniform(0.0, 1.0) for _ in range(num_non_zero_elements)]

    return indices, values
@pytest.fixture
def retriever() -> QdrantSparseVectorRetriever:
    """Build a retriever backed by a fresh in-memory sparse-only collection."""
    from qdrant_client import QdrantClient, models

    in_memory_client = QdrantClient(location=":memory:")

    # Random names keep every test isolated from the others.
    collection = uuid.uuid4().hex
    sparse_name = uuid.uuid4().hex

    sparse_params = models.SparseVectorParams(
        index=models.SparseIndexParams(on_disk=False)
    )
    in_memory_client.recreate_collection(
        collection,
        vectors_config={},
        sparse_vectors_config={sparse_name: sparse_params},
    )

    return QdrantSparseVectorRetriever(
        client=in_memory_client,
        collection_name=collection,
        sparse_vector_name=sparse_name,
        sparse_encoder=consistent_fake_sparse_encoder,
    )
def test_invalid_collection_name(retriever: QdrantSparseVectorRetriever) -> None:
    """Constructing a retriever for an unknown collection must be rejected."""
    constructor_kwargs = dict(
        client=retriever.client,
        collection_name="invalid collection",
        sparse_vector_name=retriever.sparse_vector_name,
        sparse_encoder=consistent_fake_sparse_encoder,
    )
    with pytest.raises(QdrantException, match="does not exist"):
        QdrantSparseVectorRetriever(**constructor_kwargs)
def test_invalid_sparse_vector_name(retriever: QdrantSparseVectorRetriever) -> None:
    """Constructing a retriever for an unknown sparse vector must be rejected."""
    constructor_kwargs = dict(
        client=retriever.client,
        collection_name=retriever.collection_name,
        sparse_vector_name="invalid sparse vector",
        sparse_encoder=consistent_fake_sparse_encoder,
    )
    with pytest.raises(QdrantException, match="does not contain sparse vector"):
        QdrantSparseVectorRetriever(**constructor_kwargs)
def test_add_documents(retriever: QdrantSparseVectorRetriever) -> None:
    """Documents with and without metadata are stored and their ids returned."""
    documents = [
        Document(page_content="hello world", metadata={"a": 1}),
        Document(page_content="foo bar", metadata={"b": 2}),
        Document(page_content="baz qux", metadata={"c": 3}),
    ]

    ids = retriever.add_documents(documents)

    # The first call's return value was previously assigned but never checked.
    assert len(ids) == 3
    assert retriever.client.count(retriever.collection_name, exact=True).count == 3

    documents = [
        Document(page_content="hello world"),
        Document(page_content="foo bar"),
        Document(page_content="baz qux"),
    ]

    ids = retriever.add_documents(documents)

    assert len(ids) == 3
    assert retriever.client.count(retriever.collection_name, exact=True).count == 6
def test_add_texts(retriever: QdrantSparseVectorRetriever) -> None:
    """Texts can be added both with and without accompanying metadata."""
    sample_texts = ["hello world", "foo bar", "baz qux"]

    def stored_points() -> int:
        # Exact count of points currently held by the test collection.
        return retriever.client.count(retriever.collection_name, exact=True).count

    retriever.add_texts(sample_texts, [{"a": 1}, {"b": 2}, {"c": 3}])
    assert stored_points() == 3

    retriever.add_texts(sample_texts)
    assert stored_points() == 6
def test_get_relevant_documents(retriever: QdrantSparseVectorRetriever) -> None:
    """Querying with a stored text returns that text as the single best hit."""
    query = "Hai there!"
    retriever.add_texts([query, "Hello world!", "Foo bar baz!"])
    retriever.k = 1
    expected = [Document(page_content=query)]

    results = retriever.get_relevant_documents(query)

    assert len(results) == retriever.k
    assert results == expected
    # A second identical query must yield the same answer.
    assert retriever.get_relevant_documents(query) == expected
def test_get_relevant_documents_with_filter(
    retriever: QdrantSparseVectorRetriever,
) -> None:
    """A metadata filter restricts the hits to the matching document."""
    from qdrant_client import models

    texts = ["Hai there!", "Hello world!", "Foo bar baz!"]
    metadatas = [{"value": number} for number in (1, 2, 3)]
    retriever.add_texts(texts, metadatas)

    value_is_two = models.FieldCondition(
        key="metadata.value", match=models.MatchValue(value=2)
    )
    retriever.filter = models.Filter(must=[value_is_two])

    top_hit = retriever.get_relevant_documents("Some query")[0]

    assert top_hit == Document(page_content="Hello world!", metadata={"value": 2})

View File

@ -24,6 +24,7 @@ EXPECTED_ALL = [
"OutlineRetriever",
"PineconeHybridSearchRetriever",
"PubMedRetriever",
"QdrantSparseVectorRetriever",
"RemoteLangChainRetriever",
"SVMRetriever",
"TavilySearchAPIRetriever",