diff --git a/langchain/vectorstores/pinecone.py b/langchain/vectorstores/pinecone.py index baeb69f20df..a57bf36379b 100644 --- a/langchain/vectorstores/pinecone.py +++ b/langchain/vectorstores/pinecone.py @@ -40,6 +40,7 @@ class Pinecone(VectorStore): index: Any, embedding_function: Callable, text_key: str, + namespace: Optional[str] = None, distance_strategy: Optional[DistanceStrategy] = DistanceStrategy.COSINE, ): """Initialize with Pinecone client.""" @@ -58,6 +59,7 @@ class Pinecone(VectorStore): self._index = index self._embedding_function = embedding_function self._text_key = text_key + self._namespace = namespace self.distance_strategy = distance_strategy def add_texts( @@ -65,6 +67,7 @@ class Pinecone(VectorStore): texts: Iterable[str], metadatas: Optional[List[dict]] = None, ids: Optional[List[str]] = None, + namespace: Optional[str] = None, batch_size: int = 32, **kwargs: Any, ) -> List[str]: @@ -74,11 +77,14 @@ class Pinecone(VectorStore): texts: Iterable of strings to add to the vectorstore. metadatas: Optional list of metadatas associated with the texts. ids: Optional list of ids to associate with the texts. + namespace: Optional pinecone namespace to add the texts to. Returns: List of ids from adding the texts into the vectorstore. """ + if namespace is None: + namespace = self._namespace # Embed and create the documents docs = [] ids = ids or [str(uuid.uuid4()) for _ in texts] @@ -88,7 +94,7 @@ class Pinecone(VectorStore): metadata[self._text_key] = text docs.append((ids[i], embedding, metadata)) # upsert to Pinecone - self._index.upsert(vectors=docs, batch_size=batch_size) + self._index.upsert(vectors=docs, namespace=namespace, batch_size=batch_size) return ids def similarity_search_with_score( @@ -96,6 +102,7 @@ class Pinecone(VectorStore): query: str, k: int = 4, filter: Optional[dict] = None, + namespace: Optional[str] = None, ) -> List[Tuple[Document, float]]: """Return pinecone documents most similar to query, along with scores. @@ -103,16 +110,20 @@ class Pinecone(VectorStore): query: Text to look up documents similar to. k: Number of Documents to return. Defaults to 4. filter: Dictionary of argument(s) to filter on metadata + namespace: Namespace to search in. Default will search in '' namespace. Returns: List of Documents most similar to the query and score for each """ + if namespace is None: + namespace = self._namespace query_obj = self._embedding_function(query) docs = [] results = self._index.query( [query_obj], top_k=k, include_metadata=True, + namespace=namespace, filter=filter, ) for res in results["matches"]: @@ -132,6 +143,7 @@ class Pinecone(VectorStore): query: str, k: int = 4, filter: Optional[dict] = None, + namespace: Optional[str] = None, **kwargs: Any, ) -> List[Document]: """Return pinecone documents most similar to query. @@ -140,12 +152,13 @@ class Pinecone(VectorStore): query: Text to look up documents similar to. k: Number of Documents to return. Defaults to 4. filter: Dictionary of argument(s) to filter on metadata + namespace: Namespace to search in. Default will search in '' namespace. Returns: List of Documents most similar to the query and score for each """ docs_and_scores = self.similarity_search_with_score( - query, k=k, filter=filter, **kwargs + query, k=k, filter=filter, namespace=namespace, **kwargs ) return [doc for doc, _ in docs_and_scores] @@ -178,6 +191,7 @@ class Pinecone(VectorStore): fetch_k: int = 20, lambda_mult: float = 0.5, filter: Optional[dict] = None, + namespace: Optional[str] = None, **kwargs: Any, ) -> List[Document]: """Return docs selected using the maximal marginal relevance. @@ -196,11 +210,14 @@ class Pinecone(VectorStore): Returns: List of Documents selected by maximal marginal relevance. """ + if namespace is None: + namespace = self._namespace results = self._index.query( [embedding], top_k=fetch_k, include_values=True, include_metadata=True, + namespace=namespace, filter=filter, ) mmr_selected = maximal_marginal_relevance( @@ -222,6 +239,7 @@ class Pinecone(VectorStore): fetch_k: int = 20, lambda_mult: float = 0.5, filter: Optional[dict] = None, + namespace: Optional[str] = None, **kwargs: Any, ) -> List[Document]: """Return docs selected using the maximal marginal relevance. @@ -242,7 +260,7 @@ class Pinecone(VectorStore): """ embedding = self._embedding_function(query) return self.max_marginal_relevance_search_by_vector( - embedding, k, fetch_k, lambda_mult, filter + embedding, k, fetch_k, lambda_mult, filter, namespace ) @classmethod @@ -255,6 +273,7 @@ class Pinecone(VectorStore): batch_size: int = 32, text_key: str = "text", index_name: Optional[str] = None, + namespace: Optional[str] = None, **kwargs: Any, ) -> Pinecone: """Construct Pinecone wrapper from raw documents. @@ -327,8 +346,8 @@ class Pinecone(VectorStore): to_upsert = zip(ids_batch, embeds, metadata) # upsert to Pinecone - index.upsert(vectors=list(to_upsert)) - return cls(index, embedding.embed_query, text_key) + index.upsert(vectors=list(to_upsert), namespace=namespace) + return cls(index, embedding.embed_query, text_key, namespace) @classmethod def from_existing_index( @@ -336,6 +355,7 @@ class Pinecone(VectorStore): index_name: str, embedding: Embeddings, text_key: str = "text", + namespace: Optional[str] = None, ) -> Pinecone: """Load pinecone vectorstore from index name.""" try: @@ -345,21 +365,38 @@ class Pinecone(VectorStore): "Could not import pinecone python package. " "Please install it with `pip install pinecone-client`." ) - return cls(pinecone.Index(index_name), embedding.embed_query, text_key) + + return cls( + pinecone.Index(index_name), embedding.embed_query, text_key, namespace + ) def delete( self, ids: Optional[List[str]] = None, + delete_all: Optional[bool] = None, + namespace: Optional[str] = None, + filter: Optional[dict] = None, **kwargs: Any, ) -> None: - """Delete by vector IDs + """Delete by vector IDs or filter. Args: ids: List of ids to delete. + filter: Dictionary of conditions to filter vectors to delete. """ - if ids is None: - raise ValueError("Ids must be provided.") - chunk_size = 1000 - for i in range(0, len(ids), chunk_size): - chunk = ids[i : i + chunk_size] - self._index.delete(ids=chunk, **kwargs) + if namespace is None: + namespace = self._namespace + + if delete_all: + self._index.delete(delete_all=True, namespace=namespace, **kwargs) + elif ids is not None: + chunk_size = 1000 + for i in range(0, len(ids), chunk_size): + chunk = ids[i : i + chunk_size] + self._index.delete(ids=chunk, namespace=namespace, **kwargs) + elif filter is not None: + self._index.delete(filter=filter, namespace=namespace, **kwargs) + else: + raise ValueError("Either ids, delete_all, or filter must be provided.") + + return None diff --git a/tests/integration_tests/vectorstores/test_pinecone.py b/tests/integration_tests/vectorstores/test_pinecone.py index d1c6fa0c56f..cc380150cbe 100644 --- a/tests/integration_tests/vectorstores/test_pinecone.py +++ b/tests/integration_tests/vectorstores/test_pinecone.py @@ -13,6 +13,7 @@ from langchain.embeddings import OpenAIEmbeddings from langchain.vectorstores.pinecone import Pinecone index_name = "langchain-test-index" # name of the index +namespace_name = "langchain-test-namespace" # name of the namespace dimension = 1536 # dimension of the embeddings @@ -40,28 +41,40 @@ class TestPinecone: cls.index = pinecone.Index(index_name) if index_name in pinecone.list_indexes(): - pinecone.delete_index(index_name) + index_stats = cls.index.describe_index_stats() + if index_stats["dimension"] == dimension: + # delete all the vectors in the index if the dimension is the same + # from all namespaces + index_stats = cls.index.describe_index_stats() + for _namespace_name in index_stats["namespaces"].keys(): + cls.index.delete(delete_all=True, namespace=_namespace_name) - pinecone.create_index(name=index_name, dimension=dimension) + else: + pinecone.delete_index(index_name) + pinecone.create_index(name=index_name, dimension=dimension) + else: + pinecone.create_index(name=index_name, dimension=dimension) # insure the index is empty index_stats = cls.index.describe_index_stats() assert index_stats["dimension"] == dimension - assert index_stats["total_vector_count"] == 0 + if index_stats["namespaces"].get(namespace_name) is not None: + assert index_stats["namespaces"][namespace_name]["vector_count"] == 0 @classmethod def teardown_class(cls) -> None: - if index_name in pinecone.list_indexes(): - pinecone.delete_index(index_name) - pinecone.create_index(index_name, dimension=dimension) + index_stats = cls.index.describe_index_stats() + for _namespace_name in index_stats["namespaces"].keys(): + cls.index.delete(delete_all=True, namespace=_namespace_name) reset_pinecone() @pytest.fixture(autouse=True) def setup(self) -> None: - if index_name in pinecone.list_indexes(): - pinecone.delete_index(index_name) - pinecone.create_index(index_name, dimension=dimension) + # delete all the vectors in the index + index_stats = self.index.describe_index_stats() + for _namespace_name in index_stats["namespaces"].keys(): + self.index.delete(delete_all=True, namespace=_namespace_name) reset_pinecone() @@ -75,11 +88,12 @@ class TestPinecone: texts.insert(0, needs) docsearch = Pinecone.from_texts( - texts=texts, embedding=embedding_openai, index_name=index_name + texts=texts, + embedding=embedding_openai, + index_name=index_name, + namespace=namespace_name, ) - # wait for the index to be ready - time.sleep(20) - output = docsearch.similarity_search(unique_id, k=1) + output = docsearch.similarity_search(unique_id, k=1, namespace=namespace_name) assert output == [Document(page_content=needs)] @pytest.mark.vcr() @@ -98,10 +112,9 @@ class TestPinecone: embedding_openai, index_name=index_name, metadatas=metadatas, + namespace=namespace_name, ) - # wait for the index to be ready - time.sleep(20) - output = docsearch.similarity_search(needs, k=1) + output = docsearch.similarity_search(needs, k=1, namespace=namespace_name) # TODO: why metadata={"page": 0.0}) instead of {"page": 0}? assert output == [Document(page_content=needs, metadata={"page": 0.0})] @@ -116,10 +129,11 @@ class TestPinecone: embedding_openai, index_name=index_name, metadatas=metadatas, + namespace=namespace_name, + ) + output = docsearch.similarity_search_with_score( + "foo", k=3, namespace=namespace_name ) - # wait for the index to be ready - time.sleep(20) - output = docsearch.similarity_search_with_score("foo", k=3) docs = [o[0] for o in output] scores = [o[1] for o in output] sorted_documents = sorted(docs, key=lambda x: x.metadata["page"]) @@ -132,17 +146,57 @@ class TestPinecone: ] assert scores[0] > scores[1] > scores[2] + def test_from_existing_index_with_namespaces( + self, embedding_openai: OpenAIEmbeddings + ) -> None: + """Test that namespaces are properly handled.""" + # Create two indexes with the same name but different namespaces + texts_1 = ["foo", "bar", "baz"] + metadatas = [{"page": i} for i in range(len(texts_1))] + Pinecone.from_texts( + texts_1, + embedding_openai, + index_name=index_name, + metadatas=metadatas, + namespace=f"{index_name}-1", + ) + + texts_2 = ["foo2", "bar2", "baz2"] + metadatas = [{"page": i} for i in range(len(texts_2))] + + Pinecone.from_texts( + texts_2, + embedding_openai, + index_name=index_name, + metadatas=metadatas, + namespace=f"{index_name}-2", + ) + + # Search with namespace + docsearch = Pinecone.from_existing_index( + index_name=index_name, + embedding=embedding_openai, + namespace=f"{index_name}-1", + ) + output = docsearch.similarity_search("foo", k=20, namespace=f"{index_name}-1") + # check that we don't get results from the other namespace + page_contents = sorted(set([o.page_content for o in output])) + assert all(content in ["foo", "bar", "baz"] for content in page_contents) + assert all(content not in ["foo2", "bar2", "baz2"] for content in page_contents) + def test_add_documents_with_ids( self, texts: List[str], embedding_openai: OpenAIEmbeddings ) -> None: ids = [uuid.uuid4().hex for _ in range(len(texts))] Pinecone.from_texts( - texts=texts, ids=ids, embedding=embedding_openai, index_name=index_name + texts=texts, + ids=ids, + embedding=embedding_openai, + index_name=index_name, + namespace=index_name, ) - # wait for the index to be ready - time.sleep(20) index_stats = self.index.describe_index_stats() - assert index_stats["total_vector_count"] == len(texts) + assert index_stats["namespaces"][index_name]["vector_count"] == len(texts) ids_1 = [uuid.uuid4().hex for _ in range(len(texts))] Pinecone.from_texts( @@ -150,10 +204,10 @@ class TestPinecone: ids=ids_1, embedding=embedding_openai, index_name=index_name, + namespace=index_name, ) - # wait for the index to be ready - time.sleep(20) index_stats = self.index.describe_index_stats() + assert index_stats["namespaces"][index_name]["vector_count"] == len(texts) * 2 assert index_stats["total_vector_count"] == len(texts) * 2 @pytest.mark.vcr()