From a5ee7de650aa89f0a2b98f52b53b0a8076ef9574 Mon Sep 17 00:00:00 2001 From: Harrison Chase Date: Thu, 12 Jan 2023 06:08:47 -0800 Subject: [PATCH] pinecone changes (#590) Co-authored-by: Smit Shah Co-authored-by: iocuydi <46613640+iocuydi@users.noreply.github.com> --- langchain/vectorstores/pinecone.py | 77 +++++++++++++++++++++++++++--- 1 file changed, 70 insertions(+), 7 deletions(-) diff --git a/langchain/vectorstores/pinecone.py b/langchain/vectorstores/pinecone.py index 8f3b94966d2..d395ceec866 100644 --- a/langchain/vectorstores/pinecone.py +++ b/langchain/vectorstores/pinecone.py @@ -2,7 +2,7 @@ from __future__ import annotations import uuid -from typing import Any, Callable, Iterable, List, Optional +from typing import Any, Callable, Iterable, List, Optional, Tuple from langchain.docstore.document import Document from langchain.embeddings.base import Embeddings @@ -46,16 +46,21 @@ class Pinecone(VectorStore): self._text_key = text_key def add_texts( - self, texts: Iterable[str], metadatas: Optional[List[dict]] = None + self, + texts: Iterable[str], + metadatas: Optional[List[dict]] = None, + namespace: Optional[str] = None, ) -> List[str]: """Run more texts through the embeddings and add to the vectorstore. Args: texts: Iterable of strings to add to the vectorstore. metadatas: Optional list of metadatas associated with the texts. + namespace: Optional pinecone namespace to add the texts to. Returns: List of ids from adding the texts into the vectorstore. 
+ """ # Embed and create the documents docs = [] @@ -68,14 +73,57 @@ class Pinecone(VectorStore): docs.append((id, embedding, metadata)) ids.append(id) # upsert to Pinecone - self._index.upsert(vectors=docs) + self._index.upsert(vectors=docs, namespace=namespace) return ids - def similarity_search(self, query: str, k: int = 5) -> List[Document]: - """Look up similar documents in pinecone.""" + def similarity_search_with_score( + self, + query: str, + k: int = 5, + namespace: Optional[str] = None, + ) -> List[Tuple[Document, float]]: + """Return pinecone documents most similar to query, along with scores. + + Args: + query: Text to look up documents similar to. + k: Number of Documents to return. Defaults to 5. + namespace: Namespace to search in. Default will search in '' namespace. + + Returns: + List of Documents most similar to the query and score for each + """ query_obj = self._embedding_function(query) docs = [] - results = self._index.query([query_obj], top_k=k, include_metadata=True) + results = self._index.query( + [query_obj], top_k=k, include_metadata=True, namespace=namespace + ) + for res in results["matches"]: + metadata = res["metadata"] + text = metadata.pop(self._text_key) + docs.append((Document(page_content=text, metadata=metadata), res["score"])) + return docs + + def similarity_search( + self, + query: str, + k: int = 5, + namespace: Optional[str] = None, + ) -> List[Document]: + """Return pinecone documents most similar to query. + + Args: + query: Text to look up documents similar to. + k: Number of Documents to return. Defaults to 5. + namespace: Namespace to search in. Default will search in '' namespace. 
+ + Returns: + List of Documents most similar to the query + """ + query_obj = self._embedding_function(query) + docs = [] + results = self._index.query( + [query_obj], top_k=k, include_metadata=True, namespace=namespace + ) for res in results["matches"]: metadata = res["metadata"] text = metadata.pop(self._text_key) @@ -132,7 +180,7 @@ class Pinecone(VectorStore): i_end = min(i + batch_size, len(texts)) # get batch of texts and ids lines_batch = texts[i : i + batch_size] - ids_batch = [str(n) for n in range(i, i_end)] + ids_batch = [str(uuid.uuid4()) for n in range(i, i_end)] # create embeddings embeds = embedding.embed_documents(lines_batch) # prep metadata and upsert batch @@ -150,3 +198,18 @@ class Pinecone(VectorStore): # upsert to Pinecone index.upsert(vectors=list(to_upsert), namespace=namespace) return cls(index, embedding.embed_query, text_key) + + @classmethod + def from_existing_index( + cls, index_name: str, embedding: Embeddings, text_key: str = "text" + ) -> Pinecone: + """Load pinecone vectorstore from index name.""" + try: + import pinecone + except ImportError: + raise ValueError( + "Could not import pinecone python package. " + "Please install it with `pip install pinecone-client`." + ) + + return cls(pinecone.Index(index_name), embedding.embed_query, text_key)