Revert pinecone v4 support (#7566)

Revert 9d13dcd
This commit is contained in:
Bagatur 2023-07-11 20:58:59 -04:00 committed by GitHub
parent e811c5e8c6
commit 2babe3069f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 129 additions and 38 deletions

View File

@ -40,6 +40,7 @@ class Pinecone(VectorStore):
index: Any,
embedding_function: Callable,
text_key: str,
namespace: Optional[str] = None,
distance_strategy: Optional[DistanceStrategy] = DistanceStrategy.COSINE,
):
"""Initialize with Pinecone client."""
@ -58,6 +59,7 @@ class Pinecone(VectorStore):
self._index = index
self._embedding_function = embedding_function
self._text_key = text_key
self._namespace = namespace
self.distance_strategy = distance_strategy
def add_texts(
@ -65,6 +67,7 @@ class Pinecone(VectorStore):
texts: Iterable[str],
metadatas: Optional[List[dict]] = None,
ids: Optional[List[str]] = None,
namespace: Optional[str] = None,
batch_size: int = 32,
**kwargs: Any,
) -> List[str]:
@ -74,11 +77,14 @@ class Pinecone(VectorStore):
texts: Iterable of strings to add to the vectorstore.
metadatas: Optional list of metadatas associated with the texts.
ids: Optional list of ids to associate with the texts.
namespace: Optional pinecone namespace to add the texts to.
Returns:
List of ids from adding the texts into the vectorstore.
"""
if namespace is None:
namespace = self._namespace
# Embed and create the documents
docs = []
ids = ids or [str(uuid.uuid4()) for _ in texts]
@ -88,7 +94,7 @@ class Pinecone(VectorStore):
metadata[self._text_key] = text
docs.append((ids[i], embedding, metadata))
# upsert to Pinecone
self._index.upsert(vectors=docs, batch_size=batch_size)
self._index.upsert(vectors=docs, namespace=namespace, batch_size=batch_size)
return ids
def similarity_search_with_score(
@ -96,6 +102,7 @@ class Pinecone(VectorStore):
query: str,
k: int = 4,
filter: Optional[dict] = None,
namespace: Optional[str] = None,
) -> List[Tuple[Document, float]]:
"""Return pinecone documents most similar to query, along with scores.
@ -103,16 +110,20 @@ class Pinecone(VectorStore):
query: Text to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
filter: Dictionary of argument(s) to filter on metadata
namespace: Namespace to search in. Default will search in '' namespace.
Returns:
List of Documents most similar to the query and score for each
"""
if namespace is None:
namespace = self._namespace
query_obj = self._embedding_function(query)
docs = []
results = self._index.query(
[query_obj],
top_k=k,
include_metadata=True,
namespace=namespace,
filter=filter,
)
for res in results["matches"]:
@ -132,6 +143,7 @@ class Pinecone(VectorStore):
query: str,
k: int = 4,
filter: Optional[dict] = None,
namespace: Optional[str] = None,
**kwargs: Any,
) -> List[Document]:
"""Return pinecone documents most similar to query.
@ -140,12 +152,13 @@ class Pinecone(VectorStore):
query: Text to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
filter: Dictionary of argument(s) to filter on metadata
namespace: Namespace to search in. Default will search in '' namespace.
Returns:
List of Documents most similar to the query and score for each
"""
docs_and_scores = self.similarity_search_with_score(
query, k=k, filter=filter, **kwargs
query, k=k, filter=filter, namespace=namespace, **kwargs
)
return [doc for doc, _ in docs_and_scores]
@ -178,6 +191,7 @@ class Pinecone(VectorStore):
fetch_k: int = 20,
lambda_mult: float = 0.5,
filter: Optional[dict] = None,
namespace: Optional[str] = None,
**kwargs: Any,
) -> List[Document]:
"""Return docs selected using the maximal marginal relevance.
@ -196,11 +210,14 @@ class Pinecone(VectorStore):
Returns:
List of Documents selected by maximal marginal relevance.
"""
if namespace is None:
namespace = self._namespace
results = self._index.query(
[embedding],
top_k=fetch_k,
include_values=True,
include_metadata=True,
namespace=namespace,
filter=filter,
)
mmr_selected = maximal_marginal_relevance(
@ -222,6 +239,7 @@ class Pinecone(VectorStore):
fetch_k: int = 20,
lambda_mult: float = 0.5,
filter: Optional[dict] = None,
namespace: Optional[str] = None,
**kwargs: Any,
) -> List[Document]:
"""Return docs selected using the maximal marginal relevance.
@ -242,7 +260,7 @@ class Pinecone(VectorStore):
"""
embedding = self._embedding_function(query)
return self.max_marginal_relevance_search_by_vector(
embedding, k, fetch_k, lambda_mult, filter
embedding, k, fetch_k, lambda_mult, filter, namespace
)
@classmethod
@ -255,6 +273,7 @@ class Pinecone(VectorStore):
batch_size: int = 32,
text_key: str = "text",
index_name: Optional[str] = None,
namespace: Optional[str] = None,
**kwargs: Any,
) -> Pinecone:
"""Construct Pinecone wrapper from raw documents.
@ -327,8 +346,8 @@ class Pinecone(VectorStore):
to_upsert = zip(ids_batch, embeds, metadata)
# upsert to Pinecone
index.upsert(vectors=list(to_upsert))
return cls(index, embedding.embed_query, text_key)
index.upsert(vectors=list(to_upsert), namespace=namespace)
return cls(index, embedding.embed_query, text_key, namespace)
@classmethod
def from_existing_index(
@ -336,6 +355,7 @@ class Pinecone(VectorStore):
index_name: str,
embedding: Embeddings,
text_key: str = "text",
namespace: Optional[str] = None,
) -> Pinecone:
"""Load pinecone vectorstore from index name."""
try:
@ -345,21 +365,38 @@ class Pinecone(VectorStore):
"Could not import pinecone python package. "
"Please install it with `pip install pinecone-client`."
)
return cls(pinecone.Index(index_name), embedding.embed_query, text_key)
return cls(
pinecone.Index(index_name), embedding.embed_query, text_key, namespace
)
def delete(
self,
ids: Optional[List[str]] = None,
delete_all: Optional[bool] = None,
namespace: Optional[str] = None,
filter: Optional[dict] = None,
**kwargs: Any,
) -> None:
"""Delete by vector IDs
"""Delete by vector IDs or filter.
Args:
ids: List of ids to delete.
filter: Dictionary of conditions to filter vectors to delete.
"""
if ids is None:
raise ValueError("Ids must be provided.")
chunk_size = 1000
for i in range(0, len(ids), chunk_size):
chunk = ids[i : i + chunk_size]
self._index.delete(ids=chunk, **kwargs)
if namespace is None:
namespace = self._namespace
if delete_all:
self._index.delete(delete_all=True, namespace=namespace, **kwargs)
elif ids is not None:
chunk_size = 1000
for i in range(0, len(ids), chunk_size):
chunk = ids[i : i + chunk_size]
self._index.delete(ids=chunk, namespace=namespace, **kwargs)
elif filter is not None:
self._index.delete(filter=filter, namespace=namespace, **kwargs)
else:
raise ValueError("Either ids, delete_all, or filter must be provided.")
return None

View File

@ -13,6 +13,7 @@ from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.pinecone import Pinecone
index_name = "langchain-test-index" # name of the index
namespace_name = "langchain-test-namespace" # name of the namespace
dimension = 1536 # dimension of the embeddings
@ -40,28 +41,40 @@ class TestPinecone:
cls.index = pinecone.Index(index_name)
if index_name in pinecone.list_indexes():
pinecone.delete_index(index_name)
index_stats = cls.index.describe_index_stats()
if index_stats["dimension"] == dimension:
# delete all the vectors in the index if the dimension is the same
# from all namespaces
index_stats = cls.index.describe_index_stats()
for _namespace_name in index_stats["namespaces"].keys():
cls.index.delete(delete_all=True, namespace=_namespace_name)
pinecone.create_index(name=index_name, dimension=dimension)
else:
pinecone.delete_index(index_name)
pinecone.create_index(name=index_name, dimension=dimension)
else:
pinecone.create_index(name=index_name, dimension=dimension)
# insure the index is empty
index_stats = cls.index.describe_index_stats()
assert index_stats["dimension"] == dimension
assert index_stats["total_vector_count"] == 0
if index_stats["namespaces"].get(namespace_name) is not None:
assert index_stats["namespaces"][namespace_name]["vector_count"] == 0
@classmethod
def teardown_class(cls) -> None:
if index_name in pinecone.list_indexes():
pinecone.delete_index(index_name)
pinecone.create_index(index_name, dimension=dimension)
index_stats = cls.index.describe_index_stats()
for _namespace_name in index_stats["namespaces"].keys():
cls.index.delete(delete_all=True, namespace=_namespace_name)
reset_pinecone()
@pytest.fixture(autouse=True)
def setup(self) -> None:
if index_name in pinecone.list_indexes():
pinecone.delete_index(index_name)
pinecone.create_index(index_name, dimension=dimension)
# delete all the vectors in the index
index_stats = self.index.describe_index_stats()
for _namespace_name in index_stats["namespaces"].keys():
self.index.delete(delete_all=True, namespace=_namespace_name)
reset_pinecone()
@ -75,11 +88,12 @@ class TestPinecone:
texts.insert(0, needs)
docsearch = Pinecone.from_texts(
texts=texts, embedding=embedding_openai, index_name=index_name
texts=texts,
embedding=embedding_openai,
index_name=index_name,
namespace=namespace_name,
)
# wait for the index to be ready
time.sleep(20)
output = docsearch.similarity_search(unique_id, k=1)
output = docsearch.similarity_search(unique_id, k=1, namespace=namespace_name)
assert output == [Document(page_content=needs)]
@pytest.mark.vcr()
@ -98,10 +112,9 @@ class TestPinecone:
embedding_openai,
index_name=index_name,
metadatas=metadatas,
namespace=namespace_name,
)
# wait for the index to be ready
time.sleep(20)
output = docsearch.similarity_search(needs, k=1)
output = docsearch.similarity_search(needs, k=1, namespace=namespace_name)
# TODO: why metadata={"page": 0.0}) instead of {"page": 0}?
assert output == [Document(page_content=needs, metadata={"page": 0.0})]
@ -116,10 +129,11 @@ class TestPinecone:
embedding_openai,
index_name=index_name,
metadatas=metadatas,
namespace=namespace_name,
)
output = docsearch.similarity_search_with_score(
"foo", k=3, namespace=namespace_name
)
# wait for the index to be ready
time.sleep(20)
output = docsearch.similarity_search_with_score("foo", k=3)
docs = [o[0] for o in output]
scores = [o[1] for o in output]
sorted_documents = sorted(docs, key=lambda x: x.metadata["page"])
@ -132,17 +146,57 @@ class TestPinecone:
]
assert scores[0] > scores[1] > scores[2]
def test_from_existing_index_with_namespaces(
self, embedding_openai: OpenAIEmbeddings
) -> None:
"""Test that namespaces are properly handled."""
# Create two indexes with the same name but different namespaces
texts_1 = ["foo", "bar", "baz"]
metadatas = [{"page": i} for i in range(len(texts_1))]
Pinecone.from_texts(
texts_1,
embedding_openai,
index_name=index_name,
metadatas=metadatas,
namespace=f"{index_name}-1",
)
texts_2 = ["foo2", "bar2", "baz2"]
metadatas = [{"page": i} for i in range(len(texts_2))]
Pinecone.from_texts(
texts_2,
embedding_openai,
index_name=index_name,
metadatas=metadatas,
namespace=f"{index_name}-2",
)
# Search with namespace
docsearch = Pinecone.from_existing_index(
index_name=index_name,
embedding=embedding_openai,
namespace=f"{index_name}-1",
)
output = docsearch.similarity_search("foo", k=20, namespace=f"{index_name}-1")
# check that we don't get results from the other namespace
page_contents = sorted(set([o.page_content for o in output]))
assert all(content in ["foo", "bar", "baz"] for content in page_contents)
assert all(content not in ["foo2", "bar2", "baz2"] for content in page_contents)
def test_add_documents_with_ids(
self, texts: List[str], embedding_openai: OpenAIEmbeddings
) -> None:
ids = [uuid.uuid4().hex for _ in range(len(texts))]
Pinecone.from_texts(
texts=texts, ids=ids, embedding=embedding_openai, index_name=index_name
texts=texts,
ids=ids,
embedding=embedding_openai,
index_name=index_name,
namespace=index_name,
)
# wait for the index to be ready
time.sleep(20)
index_stats = self.index.describe_index_stats()
assert index_stats["total_vector_count"] == len(texts)
assert index_stats["namespaces"][index_name]["vector_count"] == len(texts)
ids_1 = [uuid.uuid4().hex for _ in range(len(texts))]
Pinecone.from_texts(
@ -150,10 +204,10 @@ class TestPinecone:
ids=ids_1,
embedding=embedding_openai,
index_name=index_name,
namespace=index_name,
)
# wait for the index to be ready
time.sleep(20)
index_stats = self.index.describe_index_stats()
assert index_stats["namespaces"][index_name]["vector_count"] == len(texts) * 2
assert index_stats["total_vector_count"] == len(texts) * 2
@pytest.mark.vcr()