mirror of https://github.com/hwchase17/langchain.git
synced 2025-06-21 14:18:52 +00:00
community[minor]: Qdrant sparse vector retriever (#14814)
## Description

This PR adds support for Qdrant's new [sparse vector retrieval](https://qdrant.tech/articles/sparse-vectors/) by introducing a new retriever class, `QdrantSparseVectorRetriever`. Usage docs and integration tests have been added for the retriever.

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
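For a quick sense of the API this PR introduces, here is a condensed version of the usage shown in the notebook added below. The `demo_encoder` (taken from that notebook) just emits random weights over 100 indices; it stands in for a real sparse embedding model, and any `Callable[[str], Tuple[List[int], List[float]]]` returning parallel (indices, values) lists will work:

```python
import random

from langchain_community.retrievers import QdrantSparseVectorRetriever
from langchain_core.documents import Document
from qdrant_client import QdrantClient, models

client = QdrantClient(location=":memory:")
client.create_collection(
    "sparse_collection",
    vectors_config={},
    sparse_vectors_config={
        "sparse_vector": models.SparseVectorParams(
            index=models.SparseIndexParams(on_disk=False)
        )
    },
)


def demo_encoder(_: str) -> tuple[list[int], list[float]]:
    # Demo encoder from the docs notebook: random weights over 100 indices.
    return (
        sorted(random.sample(range(100), 100)),
        [random.uniform(0.1, 1.0) for _ in range(100)],
    )


retriever = QdrantSparseVectorRetriever(
    client=client,
    collection_name="sparse_collection",
    sparse_vector_name="sparse_vector",
    sparse_encoder=demo_encoder,
)
retriever.add_documents([Document(page_content="Qdrant 1.7 adds sparse vectors")])
print(retriever.get_relevant_documents("sparse vectors"))
```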
This commit is contained in:
parent c53fab63a3
commit 60c70effe9
255 docs/docs/integrations/retrievers/qdrant-sparse.ipynb
Normal file
@@ -0,0 +1,255 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "ce0f17b9",
   "metadata": {},
   "source": [
    "# Qdrant Sparse Vector Retriever\n",
    "\n",
    ">[Qdrant](https://qdrant.tech/) is an open-source, high-performance vector search engine/database.\n",
    "\n",
    "\n",
    ">`QdrantSparseVectorRetriever` uses [sparse vectors](https://qdrant.tech/articles/sparse-vectors/) introduced in Qdrant [v1.7.0](https://qdrant.tech/articles/qdrant-1.7.x/) for document retrieval.\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "c307b082",
   "metadata": {},
   "source": [
    "Install the 'qdrant_client' package:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bba863a2-977c-4add-b5f4-bfc33a80eae5",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "%pip install qdrant_client"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "c10dd962",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from qdrant_client import QdrantClient, models\n",
    "\n",
    "client = QdrantClient(location=\":memory:\")\n",
    "collection_name = \"sparse_collection\"\n",
    "vector_name = \"sparse_vector\"\n",
    "\n",
    "client.create_collection(\n",
    "    collection_name,\n",
    "    vectors_config={},\n",
    "    sparse_vectors_config={\n",
    "        vector_name: models.SparseVectorParams(\n",
    "            index=models.SparseIndexParams(\n",
    "                on_disk=False,\n",
    "            )\n",
    "        )\n",
    "    },\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "f47a2bfe",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain_community.retrievers import QdrantSparseVectorRetriever\n",
    "from langchain_core.documents import Document"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "41baa0d1",
   "metadata": {},
   "source": [
    "Create a demo encoder function:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "f2eff08e",
   "metadata": {},
   "outputs": [],
   "source": [
    "import random\n",
    "\n",
    "\n",
    "def demo_encoder(_: str) -> tuple[list[int], list[float]]:\n",
    "    return (\n",
    "        sorted(random.sample(range(100), 100)),\n",
    "        [random.uniform(0.1, 1.0) for _ in range(100)],\n",
    "    )\n",
    "\n",
    "\n",
    "# Create a retriever with a demo encoder\n",
    "retriever = QdrantSparseVectorRetriever(\n",
    "    client=client,\n",
    "    collection_name=collection_name,\n",
    "    sparse_vector_name=vector_name,\n",
    "    sparse_encoder=demo_encoder,\n",
    ")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "b68debff",
   "metadata": {},
   "source": [
    "Add some documents:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "cd8a7b17",
   "metadata": {},
   "outputs": [],
   "source": [
    "docs = [\n",
    "    Document(\n",
    "        metadata={\n",
    "            \"title\": \"Beyond Horizons: AI Chronicles\",\n",
    "            \"author\": \"Dr. Cassandra Mitchell\",\n",
    "        },\n",
    "        page_content=\"An in-depth exploration of the fascinating journey of artificial intelligence, narrated by Dr. Mitchell. This captivating account spans the historical roots, current advancements, and speculative futures of AI, offering a gripping narrative that intertwines technology, ethics, and societal implications.\",\n",
    "    ),\n",
    "    Document(\n",
    "        metadata={\n",
    "            \"title\": \"Synergy Nexus: Merging Minds with Machines\",\n",
    "            \"author\": \"Prof. Benjamin S. Anderson\",\n",
    "        },\n",
    "        page_content=\"Professor Anderson delves into the synergistic possibilities of human-machine collaboration in 'Synergy Nexus.' The book articulates a vision where humans and AI seamlessly coalesce, creating new dimensions of productivity, creativity, and shared intelligence.\",\n",
    "    ),\n",
    "    Document(\n",
    "        metadata={\n",
    "            \"title\": \"AI Dilemmas: Navigating the Unknown\",\n",
    "            \"author\": \"Dr. Elena Rodriguez\",\n",
    "        },\n",
    "        page_content=\"Dr. Rodriguez pens an intriguing narrative in 'AI Dilemmas,' probing the uncharted territories of ethical quandaries arising from AI advancements. The book serves as a compass, guiding readers through the complex terrain of moral decisions confronting developers, policymakers, and society as AI evolves.\",\n",
    "    ),\n",
    "    Document(\n",
    "        metadata={\n",
    "            \"title\": \"Sentient Threads: Weaving AI Consciousness\",\n",
    "            \"author\": \"Prof. Alexander J. Bennett\",\n",
    "        },\n",
    "        page_content=\"In 'Sentient Threads,' Professor Bennett unravels the enigma of AI consciousness, presenting a tapestry of arguments that scrutinize the very essence of machine sentience. The book ignites contemplation on the ethical and philosophical dimensions surrounding the quest for true AI awareness.\",\n",
    "    ),\n",
    "    Document(\n",
    "        metadata={\n",
    "            \"title\": \"Silent Alchemy: Unseen AI Alleviations\",\n",
    "            \"author\": \"Dr. Emily Foster\",\n",
    "        },\n",
    "        page_content=\"Building upon her previous work, Dr. Foster unveils 'Silent Alchemy,' a profound examination of the covert presence of AI in our daily lives. This illuminating piece reveals the subtle yet impactful ways in which AI invisibly shapes our routines, emphasizing the need for heightened awareness in our technology-driven world.\",\n",
    "    ),\n",
    "]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a5e673fa",
   "metadata": {},
   "source": [
    "Perform a retrieval:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "3c5970db",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['1a3e0d292e6444d39451d0588ce746dc',\n",
       " '19b180dd31e749359d49967e5d5dcab7',\n",
       " '8de69e56086f47748e32c9e379e6865b',\n",
       " 'f528fac385954e46b89cf8607bf0ee5a',\n",
       " 'c1a6249d005d4abd9192b1d0b829cebe']"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "retriever.add_documents(docs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "4fffd0af",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Document(page_content=\"In 'Sentient Threads,' Professor Bennett unravels the enigma of AI consciousness, presenting a tapestry of arguments that scrutinize the very essence of machine sentience. The book ignites contemplation on the ethical and philosophical dimensions surrounding the quest for true AI awareness.\", metadata={'title': 'Sentient Threads: Weaving AI Consciousness', 'author': 'Prof. Alexander J. Bennett'}),\n",
       " Document(page_content=\"Dr. Rodriguez pens an intriguing narrative in 'AI Dilemmas,' probing the uncharted territories of ethical quandaries arising from AI advancements. The book serves as a compass, guiding readers through the complex terrain of moral decisions confronting developers, policymakers, and society as AI evolves.\", metadata={'title': 'AI Dilemmas: Navigating the Unknown', 'author': 'Dr. Elena Rodriguez'}),\n",
       " Document(page_content=\"Professor Anderson delves into the synergistic possibilities of human-machine collaboration in 'Synergy Nexus.' The book articulates a vision where humans and AI seamlessly coalesce, creating new dimensions of productivity, creativity, and shared intelligence.\", metadata={'title': 'Synergy Nexus: Merging Minds with Machines', 'author': 'Prof. Benjamin S. Anderson'}),\n",
       " Document(page_content='An in-depth exploration of the fascinating journey of artificial intelligence, narrated by Dr. Mitchell. This captivating account spans the historical roots, current advancements, and speculative futures of AI, offering a gripping narrative that intertwines technology, ethics, and societal implications.', metadata={'title': 'Beyond Horizons: AI Chronicles', 'author': 'Dr. Cassandra Mitchell'})]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "retriever.get_relevant_documents(\n",
    "    \"Life and ethical dilemmas of AI\",\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
@@ -57,6 +57,9 @@ from langchain_community.retrievers.pinecone_hybrid_search import (
    PineconeHybridSearchRetriever,
)
from langchain_community.retrievers.pubmed import PubMedRetriever
from langchain_community.retrievers.qdrant_sparse_vector_retriever import (
    QdrantSparseVectorRetriever,
)
from langchain_community.retrievers.remote_retriever import RemoteLangChainRetriever
from langchain_community.retrievers.svm import SVMRetriever
from langchain_community.retrievers.tavily_search_api import TavilySearchAPIRetriever
@@ -93,6 +96,7 @@ __all__ = [
    "OutlineRetriever",
    "PineconeHybridSearchRetriever",
    "PubMedRetriever",
    "QdrantSparseVectorRetriever",
    "RemoteLangChainRetriever",
    "SVMRetriever",
    "TavilySearchAPIRetriever",
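With the re-export above in place, the new class is importable from the package root, which is the path the notebook and tests use:

```python
from langchain_community.retrievers import QdrantSparseVectorRetriever
```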
langchain_community/retrievers/qdrant_sparse_vector_retriever.py (new file)
@@ -0,0 +1,206 @@
import uuid
from itertools import islice
from typing import (
    Any,
    Callable,
    Dict,
    Generator,
    Iterable,
    List,
    Optional,
    Sequence,
    Tuple,
    cast,
)

from langchain_core.callbacks import CallbackManagerForRetrieverRun
from langchain_core.documents import Document
from langchain_core.pydantic_v1 import Extra, root_validator
from langchain_core.retrievers import BaseRetriever

from langchain_community.vectorstores.qdrant import Qdrant, QdrantException


class QdrantSparseVectorRetriever(BaseRetriever):
    """Qdrant sparse vector retriever."""

    client: Any
    """'qdrant_client' instance to use."""
    collection_name: str
    """Qdrant collection name."""
    sparse_vector_name: str
    """Name of the sparse vector to use."""
    sparse_encoder: Callable[[str], Tuple[List[int], List[float]]]
    """Sparse encoder function to use."""
    k: int = 4
    """Number of documents to return per query. Defaults to 4."""
    filter: Optional[Any] = None
    """Qdrant qdrant_client.models.Filter to use for queries. Defaults to None."""
    content_payload_key: str = "content"
    """Payload field containing the document content. Defaults to 'content'."""
    metadata_payload_key: str = "metadata"
    """Payload field containing the document metadata. Defaults to 'metadata'."""
    search_options: Dict[str, Any] = {}
    """Additional search options to pass to qdrant_client.QdrantClient.search()."""

    class Config:
        """Configuration for this pydantic object."""

        extra = Extra.forbid
        arbitrary_types_allowed = True

    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that 'qdrant_client' python package exists in environment."""
        try:
            from grpc import RpcError
            from qdrant_client import QdrantClient, models
            from qdrant_client.http.exceptions import UnexpectedResponse
        except ImportError:
            raise ImportError(
                "Could not import qdrant-client python package. "
                "Please install it with `pip install qdrant-client`."
            )

        client = values["client"]
        if not isinstance(client, QdrantClient):
            raise ValueError(
                f"client should be an instance of qdrant_client.QdrantClient, "
                f"got {type(client)}"
            )

        filter = values["filter"]
        if filter is not None and not isinstance(filter, models.Filter):
            raise ValueError(
                f"filter should be an instance of qdrant_client.models.Filter, "
                f"got {type(filter)}"
            )

        client = cast(QdrantClient, client)

        collection_name = values["collection_name"]
        sparse_vector_name = values["sparse_vector_name"]

        try:
            # Confirm the collection exists and has the named sparse vector
            # configured before accepting the retriever's configuration.
            collection_info = client.get_collection(collection_name)
            sparse_vectors_config = collection_info.config.params.sparse_vectors

            if sparse_vector_name not in sparse_vectors_config:
                raise QdrantException(
                    f"Existing Qdrant collection {collection_name} does not "
                    f"contain sparse vector named {sparse_vector_name}. "
                    f"Did you mean one of {', '.join(sparse_vectors_config.keys())}?"
                )
        except (UnexpectedResponse, RpcError, ValueError):
            raise QdrantException(
                f"Qdrant collection {collection_name} does not exist."
            )
        return values

    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
    ) -> List[Document]:
        from qdrant_client import QdrantClient, models

        client = cast(QdrantClient, self.client)
        # Encode the query into parallel (indices, values) lists and run a
        # named sparse-vector search against the collection.
        query_indices, query_values = self.sparse_encoder(query)
        results = client.search(
            self.collection_name,
            query_filter=self.filter,
            query_vector=models.NamedSparseVector(
                name=self.sparse_vector_name,
                vector=models.SparseVector(
                    indices=query_indices,
                    values=query_values,
                ),
            ),
            limit=self.k,
            with_vectors=False,
            **self.search_options,
        )
        return [
            Qdrant._document_from_scored_point(
                point, self.content_payload_key, self.metadata_payload_key
            )
            for point in results
        ]

    def add_documents(self, documents: List[Document], **kwargs: Any) -> List[str]:
        """Run more documents through the embeddings and add to the vectorstore.

        Args:
            documents (List[Document]): Documents to add to the vectorstore.

        Returns:
            List[str]: List of IDs of the added texts.
        """
        texts = [doc.page_content for doc in documents]
        metadatas = [doc.metadata for doc in documents]
        return self.add_texts(texts, metadatas, **kwargs)

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        ids: Optional[Sequence[str]] = None,
        batch_size: int = 64,
        **kwargs: Any,
    ) -> List[str]:
        from qdrant_client import QdrantClient

        added_ids = []
        client = cast(QdrantClient, self.client)
        for batch_ids, points in self._generate_rest_batches(
            texts, metadatas, ids, batch_size
        ):
            client.upsert(self.collection_name, points=points, **kwargs)
            added_ids.extend(batch_ids)

        return added_ids

    def _generate_rest_batches(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        ids: Optional[Sequence[str]] = None,
        batch_size: int = 64,
    ) -> Generator[Tuple[List[str], List[Any]], None, None]:
        from qdrant_client import models as rest

        texts_iterator = iter(texts)
        metadatas_iterator = iter(metadatas or [])
        ids_iterator = iter(ids or [uuid.uuid4().hex for _ in iter(texts)])
        while batch_texts := list(islice(texts_iterator, batch_size)):
            # Take the corresponding metadata and id for each text in a batch
            batch_metadatas = list(islice(metadatas_iterator, batch_size)) or None
            batch_ids = list(islice(ids_iterator, batch_size))

            # Generate the sparse embeddings for all the texts in a batch
            batch_embeddings: List[Tuple[List[int], List[float]]] = [
                self.sparse_encoder(text) for text in batch_texts
            ]

            points = [
                rest.PointStruct(
                    id=point_id,
                    vector={
                        self.sparse_vector_name: rest.SparseVector(
                            indices=sparse_vector[0],
                            values=sparse_vector[1],
                        )
                    },
                    payload=payload,
                )
                for point_id, sparse_vector, payload in zip(
                    batch_ids,
                    batch_embeddings,
                    Qdrant._build_payloads(
                        batch_texts,
                        batch_metadatas,
                        self.content_payload_key,
                        self.metadata_payload_key,
                    ),
                )
            ]

            yield batch_ids, points
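The `sparse_encoder` contract above is just `Callable[[str], Tuple[List[int], List[float]]]` returning parallel lists of indices and values, so any sparse model can be plugged in. As a minimal, dependency-free sketch (illustrative only, not part of this PR; a real deployment would use a learned sparse model such as SPLADE):

```python
import hashlib
from collections import Counter
from typing import List, Tuple


def hashed_bow_encoder(text: str, dim: int = 2**16) -> Tuple[List[int], List[float]]:
    """Hash each whitespace token to an index; values are term frequencies."""
    counts = Counter(
        int(hashlib.md5(token.encode()).hexdigest(), 16) % dim
        for token in text.lower().split()
    )
    indices = sorted(counts)
    return indices, [float(counts[i]) for i in indices]
```

Because identical tokens hash to identical indices, queries and documents that share words get overlapping sparse dimensions, which is what the retriever's dot-product search scores.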
integration tests for the retriever (new file)
@@ -0,0 +1,170 @@
import random
import uuid
from typing import List, Tuple

import pytest
from langchain_core.documents import Document

from langchain_community.retrievers import QdrantSparseVectorRetriever
from langchain_community.vectorstores.qdrant import QdrantException


def consistent_fake_sparse_encoder(
    query: str, size: int = 100, density: float = 0.7
) -> Tuple[List[int], List[float]]:
    """
    Generates a consistent fake sparse vector.

    Parameters:
    - query (str): The query string to make the function deterministic.
    - size (int): The size of the vector to generate.
    - density (float): The density of the vector to generate.

    Returns:
    - indices (list): List of indices where the non-zero elements are located.
    - values (list): List of corresponding float values at the non-zero indices.
    """
    # Ensure density is within the valid range [0, 1]
    density = max(0.0, min(1.0, density))

    # Use a deterministic seed based on the query
    seed = hash(query)
    random.seed(seed)

    # Calculate the number of non-zero elements based on density
    num_non_zero_elements = int(size * density)

    # Generate random indices without replacement
    indices = sorted(random.sample(range(size), num_non_zero_elements))

    # Generate random float values for the non-zero elements
    values = [random.uniform(0.0, 1.0) for _ in range(num_non_zero_elements)]

    return indices, values


@pytest.fixture
def retriever() -> QdrantSparseVectorRetriever:
    from qdrant_client import QdrantClient, models

    client = QdrantClient(location=":memory:")

    collection_name = uuid.uuid4().hex
    vector_name = uuid.uuid4().hex

    client.recreate_collection(
        collection_name,
        vectors_config={},
        sparse_vectors_config={
            vector_name: models.SparseVectorParams(
                index=models.SparseIndexParams(
                    on_disk=False,
                )
            )
        },
    )

    return QdrantSparseVectorRetriever(
        client=client,
        collection_name=collection_name,
        sparse_vector_name=vector_name,
        sparse_encoder=consistent_fake_sparse_encoder,
    )


def test_invalid_collection_name(retriever: QdrantSparseVectorRetriever) -> None:
    with pytest.raises(QdrantException) as e:
        QdrantSparseVectorRetriever(
            client=retriever.client,
            collection_name="invalid collection",
            sparse_vector_name=retriever.sparse_vector_name,
            sparse_encoder=consistent_fake_sparse_encoder,
        )
    assert "does not exist" in str(e.value)


def test_invalid_sparse_vector_name(retriever: QdrantSparseVectorRetriever) -> None:
    with pytest.raises(QdrantException) as e:
        QdrantSparseVectorRetriever(
            client=retriever.client,
            collection_name=retriever.collection_name,
            sparse_vector_name="invalid sparse vector",
            sparse_encoder=consistent_fake_sparse_encoder,
        )

    assert "does not contain sparse vector" in str(e.value)


def test_add_documents(retriever: QdrantSparseVectorRetriever) -> None:
    documents = [
        Document(page_content="hello world", metadata={"a": 1}),
        Document(page_content="foo bar", metadata={"b": 2}),
        Document(page_content="baz qux", metadata={"c": 3}),
    ]

    ids = retriever.add_documents(documents)

    assert len(ids) == 3
    assert retriever.client.count(retriever.collection_name, exact=True).count == 3

    documents = [
        Document(page_content="hello world"),
        Document(page_content="foo bar"),
        Document(page_content="baz qux"),
    ]

    ids = retriever.add_documents(documents)

    assert len(ids) == 3
    assert retriever.client.count(retriever.collection_name, exact=True).count == 6


def test_add_texts(retriever: QdrantSparseVectorRetriever) -> None:
    retriever.add_texts(
        ["hello world", "foo bar", "baz qux"], [{"a": 1}, {"b": 2}, {"c": 3}]
    )

    assert retriever.client.count(retriever.collection_name, exact=True).count == 3

    retriever.add_texts(["hello world", "foo bar", "baz qux"])

    assert retriever.client.count(retriever.collection_name, exact=True).count == 6


def test_get_relevant_documents(retriever: QdrantSparseVectorRetriever) -> None:
    retriever.add_texts(["Hai there!", "Hello world!", "Foo bar baz!"])

    expected = [Document(page_content="Hai there!")]

    retriever.k = 1
    results = retriever.get_relevant_documents("Hai there!")

    assert len(results) == retriever.k
    assert results == expected
    assert retriever.get_relevant_documents("Hai there!") == expected


def test_get_relevant_documents_with_filter(
    retriever: QdrantSparseVectorRetriever,
) -> None:
    from qdrant_client import models

    retriever.add_texts(
        ["Hai there!", "Hello world!", "Foo bar baz!"],
        [
            {"value": 1},
            {"value": 2},
            {"value": 3},
        ],
    )

    retriever.filter = models.Filter(
        must=[
            models.FieldCondition(
                key="metadata.value", match=models.MatchValue(value=2)
            )
        ]
    )
    results = retriever.get_relevant_documents("Some query")

    assert results[0] == Document(page_content="Hello world!", metadata={"value": 2})
@@ -24,6 +24,7 @@ EXPECTED_ALL = [
    "OutlineRetriever",
    "PineconeHybridSearchRetriever",
    "PubMedRetriever",
    "QdrantSparseVectorRetriever",
    "RemoteLangChainRetriever",
    "SVMRetriever",
    "TavilySearchAPIRetriever",