community[patch]: Upstash Vector Store Namespace Support (#22251)

This PR introduces namespace support for Upstash Vector Store, which
would allow users to partition their data in the vector index.

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com>
This commit is contained in:
Fahreddin Özcan 2024-06-04 02:30:56 +02:00 committed by GitHub
parent 25cf1a74d5
commit 0061ded002
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 154 additions and 19 deletions

View File

@ -61,6 +61,22 @@ store = UpstashVectorStore(
See [Upstash Vector documentation](https://upstash.com/docs/vector/features/embeddingmodels)
for more detail on embedding models.
## Namespaces
You can use namespaces to partition your data in the index. Namespaces are useful when you want to query over huge amount of data, and you want to partition the data to make the queries faster. When you use namespaces, there won't be post-filtering on the results which will make the query results more precise.
```python
from langchain_community.vectorstores.upstash import UpstashVectorStore
import os
os.environ["UPSTASH_VECTOR_REST_URL"] = "<UPSTASH_VECTOR_REST_URL>"
os.environ["UPSTASH_VECTOR_REST_TOKEN"] = "<UPSTASH_VECTOR_REST_TOKEN>"
store = UpstashVectorStore(
embedding=embeddings
namespace="my_namespace"
)
```
### Inserting Vectors
```python

View File

@ -2,7 +2,7 @@ from __future__ import annotations
import logging
import uuid
from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Tuple, Union
from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Tuple, Union, cast
import numpy as np
from langchain_core.documents import Document
@ -64,6 +64,8 @@ class UpstashVectorStore(VectorStore):
index_url: Optional[str] = None,
index_token: Optional[str] = None,
embedding: Optional[Union[Embeddings, bool]] = None,
*,
namespace: str = "",
):
"""
Constructor for UpstashVectorStore.
@ -83,6 +85,7 @@ class UpstashVectorStore(VectorStore):
is applied. If true, Upstash embeddings are used. When Upstash
embeddings are used, text is sent directly to Upstash and
embedding is applied there instead of embedding in Langchain.
namespace: Namespace to use from the index.
Example:
.. code-block:: python
@ -94,7 +97,8 @@ class UpstashVectorStore(VectorStore):
vectorstore = UpstashVectorStore(
embedding=embeddings,
index_url="...",
index_token="..."
index_token="...",
namespace="..."
)
# With an existing index
@ -103,7 +107,8 @@ class UpstashVectorStore(VectorStore):
index = Index(url="...", token="...")
vectorstore = UpstashVectorStore(
embedding=embeddings,
index=index
index=index,
namespace="..."
)
"""
@ -145,6 +150,7 @@ class UpstashVectorStore(VectorStore):
self._embeddings = embedding
self._text_key = text_key
self._namespace = namespace
@property
def embeddings(self) -> Optional[Union[Embeddings, bool]]: # type: ignore
@ -187,6 +193,8 @@ class UpstashVectorStore(VectorStore):
ids: Optional[List[str]] = None,
batch_size: int = 32,
embedding_chunk_size: int = 1000,
*,
namespace: Optional[str] = None,
**kwargs: Any,
) -> List[str]:
"""
@ -202,6 +210,7 @@ class UpstashVectorStore(VectorStore):
batch_size: Batch size to use when upserting the embeddings.
Upstash supports at max 1000 vectors per request.
embedding_batch_size: Chunk size to use when embedding the texts.
namespace: Namespace to use from the index.
Returns:
List of ids from adding the texts into the vectorstore.
@ -216,6 +225,7 @@ class UpstashVectorStore(VectorStore):
batch_size=batch_size,
ids=ids,
embedding_chunk_size=embedding_chunk_size,
namespace=namespace,
**kwargs,
)
@ -225,6 +235,8 @@ class UpstashVectorStore(VectorStore):
ids: Optional[List[str]] = None,
batch_size: int = 32,
embedding_chunk_size: int = 1000,
*,
namespace: Optional[str] = None,
**kwargs: Any,
) -> List[str]:
"""
@ -240,6 +252,7 @@ class UpstashVectorStore(VectorStore):
batch_size: Batch size to use when upserting the embeddings.
Upstash supports at max 1000 vectors per request.
embedding_batch_size: Chunk size to use when embedding the texts.
namespace: Namespace to use from the index.
Returns:
List of ids from adding the texts into the vectorstore.
@ -254,6 +267,7 @@ class UpstashVectorStore(VectorStore):
ids=ids,
batch_size=batch_size,
embedding_chunk_size=embedding_chunk_size,
namespace=namespace,
**kwargs,
)
@ -264,6 +278,8 @@ class UpstashVectorStore(VectorStore):
ids: Optional[List[str]] = None,
batch_size: int = 32,
embedding_chunk_size: int = 1000,
*,
namespace: Optional[str] = None,
**kwargs: Any,
) -> List[str]:
"""
@ -281,11 +297,15 @@ class UpstashVectorStore(VectorStore):
batch_size: Batch size to use when upserting the embeddings.
Upstash supports at max 1000 vectors per request.
embedding_batch_size: Chunk size to use when embedding the texts.
namespace: Namespace to use from the index.
Returns:
List of ids from adding the texts into the vectorstore.
"""
if namespace is None:
namespace = self._namespace
texts = list(texts)
ids = ids or [str(uuid.uuid4()) for _ in texts]
@ -308,7 +328,9 @@ class UpstashVectorStore(VectorStore):
for batch in batch_iterate(
batch_size, zip(chunk_ids, embeddings, chunk_metadatas)
):
self._index.upsert(vectors=batch, **kwargs)
self._index.upsert(
vectors=batch, namespace=cast(str, namespace), **kwargs
)
return ids
@ -319,6 +341,8 @@ class UpstashVectorStore(VectorStore):
ids: Optional[List[str]] = None,
batch_size: int = 32,
embedding_chunk_size: int = 1000,
*,
namespace: Optional[str] = None,
**kwargs: Any,
) -> List[str]:
"""
@ -336,11 +360,15 @@ class UpstashVectorStore(VectorStore):
batch_size: Batch size to use when upserting the embeddings.
Upstash supports at max 1000 vectors per request.
embedding_batch_size: Chunk size to use when embedding the texts.
namespace: Namespace to use from the index.
Returns:
List of ids from adding the texts into the vectorstore.
"""
if namespace is None:
namespace = self._namespace
texts = list(texts)
ids = ids or [str(uuid.uuid4()) for _ in texts]
@ -363,7 +391,9 @@ class UpstashVectorStore(VectorStore):
for batch in batch_iterate(
batch_size, zip(chunk_ids, embeddings, chunk_metadatas)
):
await self._async_index.upsert(vectors=batch, **kwargs)
await self._async_index.upsert(
vectors=batch, namespace=cast(str, namespace), **kwargs
)
return ids
@ -372,6 +402,8 @@ class UpstashVectorStore(VectorStore):
query: str,
k: int = 4,
filter: Optional[str] = None,
*,
namespace: Optional[str] = None,
**kwargs: Any,
) -> List[Tuple[Document, float]]:
"""Retrieve texts most similar to query and
@ -381,12 +413,13 @@ class UpstashVectorStore(VectorStore):
query: Text to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
filter: Optional metadata filter in str format
namespace: Namespace to use from the index.
Returns:
List of Documents most similar to the query and score for each
"""
return self.similarity_search_by_vector_with_score(
self._embed_query(query), k=k, filter=filter, **kwargs
self._embed_query(query), k=k, filter=filter, namespace=namespace, **kwargs
)
async def asimilarity_search_with_score(
@ -394,6 +427,8 @@ class UpstashVectorStore(VectorStore):
query: str,
k: int = 4,
filter: Optional[str] = None,
*,
namespace: Optional[str] = None,
**kwargs: Any,
) -> List[Tuple[Document, float]]:
"""Retrieve texts most similar to query and
@ -403,12 +438,13 @@ class UpstashVectorStore(VectorStore):
query: Text to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
filter: Optional metadata filter in str format
namespace: Namespace to use from the index.
Returns:
List of Documents most similar to the query and score for each
"""
return await self.asimilarity_search_by_vector_with_score(
self._embed_query(query), k=k, filter=filter, **kwargs
self._embed_query(query), k=k, filter=filter, namespace=namespace, **kwargs
)
def _process_results(self, results: List) -> List[Tuple[Document, float]]:
@ -430,15 +466,25 @@ class UpstashVectorStore(VectorStore):
embedding: Union[List[float], str],
k: int = 4,
filter: Optional[str] = None,
*,
namespace: Optional[str] = None,
**kwargs: Any,
) -> List[Tuple[Document, float]]:
"""Return texts whose embedding is closest to the given embedding"""
filter = filter or ""
if namespace is None:
namespace = self._namespace
if isinstance(embedding, str):
results = self._index.query(
data=embedding, top_k=k, include_metadata=True, filter=filter, **kwargs
data=embedding,
top_k=k,
include_metadata=True,
filter=filter,
namespace=namespace,
**kwargs,
)
else:
results = self._index.query(
@ -446,6 +492,7 @@ class UpstashVectorStore(VectorStore):
top_k=k,
include_metadata=True,
filter=filter,
namespace=namespace,
**kwargs,
)
@ -456,15 +503,25 @@ class UpstashVectorStore(VectorStore):
embedding: Union[List[float], str],
k: int = 4,
filter: Optional[str] = None,
*,
namespace: Optional[str] = None,
**kwargs: Any,
) -> List[Tuple[Document, float]]:
"""Return texts whose embedding is closest to the given embedding"""
filter = filter or ""
if namespace is None:
namespace = self._namespace
if isinstance(embedding, str):
results = await self._async_index.query(
data=embedding, top_k=k, include_metadata=True, filter=filter, **kwargs
data=embedding,
top_k=k,
include_metadata=True,
filter=filter,
namespace=namespace,
**kwargs,
)
else:
results = await self._async_index.query(
@ -472,6 +529,7 @@ class UpstashVectorStore(VectorStore):
top_k=k,
include_metadata=True,
filter=filter,
namespace=namespace,
**kwargs,
)
@ -482,6 +540,8 @@ class UpstashVectorStore(VectorStore):
query: str,
k: int = 4,
filter: Optional[str] = None,
*,
namespace: Optional[str] = None,
**kwargs: Any,
) -> List[Document]:
"""Return documents most similar to query.
@ -490,12 +550,13 @@ class UpstashVectorStore(VectorStore):
query: Text to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
filter: Optional metadata filter in str format
namespace: Namespace to use from the index.
Returns:
List of Documents most similar to the query and score for each
"""
docs_and_scores = self.similarity_search_with_score(
query, k=k, filter=filter, **kwargs
query, k=k, filter=filter, namespace=namespace, **kwargs
)
return [doc for doc, _ in docs_and_scores]
@ -504,6 +565,8 @@ class UpstashVectorStore(VectorStore):
query: str,
k: int = 4,
filter: Optional[str] = None,
*,
namespace: Optional[str] = None,
**kwargs: Any,
) -> List[Document]:
"""Return documents most similar to query.
@ -512,12 +575,13 @@ class UpstashVectorStore(VectorStore):
query: Text to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
filter: Optional metadata filter in str format
namespace: Namespace to use from the index.
Returns:
List of Documents most similar to the query
"""
docs_and_scores = await self.asimilarity_search_with_score(
query, k=k, filter=filter, **kwargs
query, k=k, filter=filter, namespace=namespace, **kwargs
)
return [doc for doc, _ in docs_and_scores]
@ -526,6 +590,8 @@ class UpstashVectorStore(VectorStore):
embedding: Union[List[float], str],
k: int = 4,
filter: Optional[str] = None,
*,
namespace: Optional[str] = None,
**kwargs: Any,
) -> List[Document]:
"""Return documents closest to the given embedding.
@ -534,12 +600,13 @@ class UpstashVectorStore(VectorStore):
embedding: Embedding to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
filter: Optional metadata filter in str format
namespace: Namespace to use from the index.
Returns:
List of Documents most similar to the query
"""
docs_and_scores = self.similarity_search_by_vector_with_score(
embedding, k=k, filter=filter, **kwargs
embedding, k=k, filter=filter, namespace=namespace, **kwargs
)
return [doc for doc, _ in docs_and_scores]
@ -548,6 +615,8 @@ class UpstashVectorStore(VectorStore):
embedding: Union[List[float], str],
k: int = 4,
filter: Optional[str] = None,
*,
namespace: Optional[str] = None,
**kwargs: Any,
) -> List[Document]:
"""Return documents closest to the given embedding.
@ -556,12 +625,13 @@ class UpstashVectorStore(VectorStore):
embedding: Embedding to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
filter: Optional metadata filter in str format
namespace: Namespace to use from the index.
Returns:
List of Documents most similar to the query
"""
docs_and_scores = await self.asimilarity_search_by_vector_with_score(
embedding, k=k, filter=filter, **kwargs
embedding, k=k, filter=filter, namespace=namespace, **kwargs
)
return [doc for doc, _ in docs_and_scores]
@ -570,25 +640,31 @@ class UpstashVectorStore(VectorStore):
query: str,
k: int = 4,
filter: Optional[str] = None,
*,
namespace: Optional[str] = None,
**kwargs: Any,
) -> List[Tuple[Document, float]]:
"""
Since Upstash always returns relevance scores, default implementation is used.
"""
return self.similarity_search_with_score(query, k=k, filter=filter, **kwargs)
return self.similarity_search_with_score(
query, k=k, filter=filter, namespace=namespace, **kwargs
)
async def _asimilarity_search_with_relevance_scores(
self,
query: str,
k: int = 4,
filter: Optional[str] = None,
*,
namespace: Optional[str] = None,
**kwargs: Any,
) -> List[Tuple[Document, float]]:
"""
Since Upstash always returns relevance scores, default implementation is used.
"""
return await self.asimilarity_search_with_score(
query, k=k, filter=filter, **kwargs
query, k=k, filter=filter, namespace=namespace, **kwargs
)
def max_marginal_relevance_search_by_vector(
@ -598,6 +674,8 @@ class UpstashVectorStore(VectorStore):
fetch_k: int = 20,
lambda_mult: float = 0.5,
filter: Optional[str] = None,
*,
namespace: Optional[str] = None,
**kwargs: Any,
) -> List[Document]:
"""Return docs selected using the maximal marginal relevance.
@ -614,10 +692,14 @@ class UpstashVectorStore(VectorStore):
to maximum diversity and 1 to minimum diversity.
Defaults to 0.5.
filter: Optional metadata filter in str format
namespace: Namespace to use from the index.
Returns:
List of Documents selected by maximal marginal relevance.
"""
if namespace is None:
namespace = self._namespace
assert isinstance(self.embeddings, Embeddings)
if isinstance(embedding, str):
results = self._index.query(
@ -626,6 +708,7 @@ class UpstashVectorStore(VectorStore):
include_vectors=True,
include_metadata=True,
filter=filter or "",
namespace=namespace,
**kwargs,
)
else:
@ -635,6 +718,7 @@ class UpstashVectorStore(VectorStore):
include_vectors=True,
include_metadata=True,
filter=filter or "",
namespace=namespace,
**kwargs,
)
@ -657,6 +741,8 @@ class UpstashVectorStore(VectorStore):
fetch_k: int = 20,
lambda_mult: float = 0.5,
filter: Optional[str] = None,
*,
namespace: Optional[str] = None,
**kwargs: Any,
) -> List[Document]:
"""Return docs selected using the maximal marginal relevance.
@ -673,10 +759,15 @@ class UpstashVectorStore(VectorStore):
to maximum diversity and 1 to minimum diversity.
Defaults to 0.5.
filter: Optional metadata filter in str format
namespace: Namespace to use from the index.
Returns:
List of Documents selected by maximal marginal relevance.
"""
if namespace is None:
namespace = self._namespace
assert isinstance(self.embeddings, Embeddings)
if isinstance(embedding, str):
results = await self._async_index.query(
@ -685,6 +776,7 @@ class UpstashVectorStore(VectorStore):
include_vectors=True,
include_metadata=True,
filter=filter or "",
namespace=namespace,
**kwargs,
)
else:
@ -694,6 +786,7 @@ class UpstashVectorStore(VectorStore):
include_vectors=True,
include_metadata=True,
filter=filter or "",
namespace=namespace,
**kwargs,
)
@ -716,6 +809,8 @@ class UpstashVectorStore(VectorStore):
fetch_k: int = 20,
lambda_mult: float = 0.5,
filter: Optional[str] = None,
*,
namespace: Optional[str] = None,
**kwargs: Any,
) -> List[Document]:
"""Return docs selected using the maximal marginal relevance.
@ -732,6 +827,7 @@ class UpstashVectorStore(VectorStore):
to maximum diversity and 1 to minimum diversity.
Defaults to 0.5.
filter: Optional metadata filter in str format
namespace: Namespace to use from the index.
Returns:
List of Documents selected by maximal marginal relevance.
@ -743,6 +839,7 @@ class UpstashVectorStore(VectorStore):
fetch_k=fetch_k,
lambda_mult=lambda_mult,
filter=filter,
namespace=namespace,
**kwargs,
)
@ -753,6 +850,8 @@ class UpstashVectorStore(VectorStore):
fetch_k: int = 20,
lambda_mult: float = 0.5,
filter: Optional[str] = None,
*,
namespace: Optional[str] = None,
**kwargs: Any,
) -> List[Document]:
"""Return docs selected using the maximal marginal relevance.
@ -769,6 +868,7 @@ class UpstashVectorStore(VectorStore):
to maximum diversity and 1 to minimum diversity.
Defaults to 0.5.
filter: Optional metadata filter in str format
namespace: Namespace to use from the index.
Returns:
List of Documents selected by maximal marginal relevance.
@ -780,6 +880,7 @@ class UpstashVectorStore(VectorStore):
fetch_k=fetch_k,
lambda_mult=lambda_mult,
filter=filter,
namespace=namespace,
**kwargs,
)
@ -797,6 +898,8 @@ class UpstashVectorStore(VectorStore):
async_index: Optional[AsyncIndex] = None,
index_url: Optional[str] = None,
index_token: Optional[str] = None,
*,
namespace: str = "",
**kwargs: Any,
) -> UpstashVectorStore:
"""Create a new UpstashVectorStore from a list of texts.
@ -819,6 +922,7 @@ class UpstashVectorStore(VectorStore):
async_index=async_index,
index_url=index_url,
index_token=index_token,
namespace=namespace,
**kwargs,
)
@ -828,6 +932,7 @@ class UpstashVectorStore(VectorStore):
ids=ids,
batch_size=batch_size,
embedding_chunk_size=embedding_chunk_size,
namespace=namespace,
)
return vector_store
@ -845,6 +950,8 @@ class UpstashVectorStore(VectorStore):
async_index: Optional[AsyncIndex] = None,
index_url: Optional[str] = None,
index_token: Optional[str] = None,
*,
namespace: str = "",
**kwargs: Any,
) -> UpstashVectorStore:
"""Create a new UpstashVectorStore from a list of texts.
@ -865,6 +972,7 @@ class UpstashVectorStore(VectorStore):
text_key=text_key,
index=index,
async_index=async_index,
namespace=namespace,
index_url=index_url,
index_token=index_token,
**kwargs,
@ -875,6 +983,7 @@ class UpstashVectorStore(VectorStore):
metadatas=metadatas,
ids=ids,
batch_size=batch_size,
namespace=namespace,
embedding_chunk_size=embedding_chunk_size,
)
return vector_store
@ -884,6 +993,8 @@ class UpstashVectorStore(VectorStore):
ids: Optional[List[str]] = None,
delete_all: Optional[bool] = None,
batch_size: Optional[int] = 1000,
*,
namespace: Optional[str] = None,
**kwargs: Any,
) -> None:
"""Delete by vector IDs
@ -892,14 +1003,17 @@ class UpstashVectorStore(VectorStore):
ids: List of ids to delete.
delete_all: Delete all vectors in the index.
batch_size: Batch size to use when deleting the embeddings.
namespace: Namespace to use from the index.
Upstash supports at max 1000 deletions per request.
"""
if namespace is None:
namespace = self._namespace
if delete_all:
self._index.reset()
self._index.reset(namespace=namespace)
elif ids is not None:
for batch in batch_iterate(batch_size, ids):
self._index.delete(ids=batch)
self._index.delete(ids=batch, namespace=namespace)
else:
raise ValueError("Either ids or delete_all should be provided")
@ -910,6 +1024,8 @@ class UpstashVectorStore(VectorStore):
ids: Optional[List[str]] = None,
delete_all: Optional[bool] = None,
batch_size: Optional[int] = 1000,
*,
namespace: Optional[str] = None,
**kwargs: Any,
) -> None:
"""Delete by vector IDs
@ -918,14 +1034,17 @@ class UpstashVectorStore(VectorStore):
ids: List of ids to delete.
delete_all: Delete all vectors in the index.
batch_size: Batch size to use when deleting the embeddings.
namespace: Namespace to use from the index.
Upstash supports at max 1000 deletions per request.
"""
if namespace is None:
namespace = self._namespace
if delete_all:
await self._async_index.reset()
await self._async_index.reset(namespace=namespace)
elif ids is not None:
for batch in batch_iterate(batch_size, ids):
await self._async_index.delete(ids=batch)
await self._async_index.delete(ids=batch, namespace=namespace)
else:
raise ValueError("Either ids or delete_all should be provided")