pinecone changes (#590)

Co-authored-by: Smit Shah <who828@gmail.com>
Co-authored-by: iocuydi <46613640+iocuydi@users.noreply.github.com>
This commit is contained in:
Harrison Chase 2023-01-12 06:08:47 -08:00 committed by GitHub
parent 7b6e7f6e12
commit a5ee7de650
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -2,7 +2,7 @@
from __future__ import annotations from __future__ import annotations
import uuid import uuid
from typing import Any, Callable, Iterable, List, Optional from typing import Any, Callable, Iterable, List, Optional, Tuple
from langchain.docstore.document import Document from langchain.docstore.document import Document
from langchain.embeddings.base import Embeddings from langchain.embeddings.base import Embeddings
@ -46,16 +46,21 @@ class Pinecone(VectorStore):
self._text_key = text_key self._text_key = text_key
def add_texts( def add_texts(
self, texts: Iterable[str], metadatas: Optional[List[dict]] = None self,
texts: Iterable[str],
metadatas: Optional[List[dict]] = None,
namespace: Optional[str] = None,
) -> List[str]: ) -> List[str]:
"""Run more texts through the embeddings and add to the vectorstore. """Run more texts through the embeddings and add to the vectorstore.
Args: Args:
texts: Iterable of strings to add to the vectorstore. texts: Iterable of strings to add to the vectorstore.
metadatas: Optional list of metadatas associated with the texts. metadatas: Optional list of metadatas associated with the texts.
namespace: Optional pinecone namespace to add the texts to.
Returns: Returns:
List of ids from adding the texts into the vectorstore. List of ids from adding the texts into the vectorstore.
""" """
# Embed and create the documents # Embed and create the documents
docs = [] docs = []
@ -68,14 +73,57 @@ class Pinecone(VectorStore):
docs.append((id, embedding, metadata)) docs.append((id, embedding, metadata))
ids.append(id) ids.append(id)
# upsert to Pinecone # upsert to Pinecone
self._index.upsert(vectors=docs) self._index.upsert(vectors=docs, namespace=namespace)
return ids return ids
def similarity_search(self, query: str, k: int = 5) -> List[Document]: def similarity_search_with_score(
"""Look up similar documents in pinecone.""" self,
query: str,
k: int = 5,
namespace: Optional[str] = None,
) -> List[Tuple[Document, float]]:
"""Return pinecone documents most similar to query, along with scores.
Args:
query: Text to look up documents similar to.
k: Number of Documents to return. Defaults to 5.
namespace: Namespace to search in. Default will search in '' namespace.
Returns:
List of Documents most similar to the query and score for each
"""
query_obj = self._embedding_function(query) query_obj = self._embedding_function(query)
docs = [] docs = []
results = self._index.query([query_obj], top_k=k, include_metadata=True) results = self._index.query(
[query_obj], top_k=k, include_metadata=True, namespace=namespace
)
for res in results["matches"]:
metadata = res["metadata"]
text = metadata.pop(self._text_key)
docs.append((Document(page_content=text, metadata=metadata), res["score"]))
return docs
def similarity_search(
self,
query: str,
k: int = 5,
namespace: Optional[str] = None,
) -> List[Document]:
"""Return pinecone documents most similar to query.
Args:
query: Text to look up documents similar to.
k: Number of Documents to return. Defaults to 5.
namespace: Namespace to search in. Default will search in '' namespace.
Returns:
List of Documents most similar to the query.
"""
query_obj = self._embedding_function(query)
docs = []
results = self._index.query(
[query_obj], top_k=k, include_metadata=True, namespace=namespace
)
for res in results["matches"]: for res in results["matches"]:
metadata = res["metadata"] metadata = res["metadata"]
text = metadata.pop(self._text_key) text = metadata.pop(self._text_key)
@ -132,7 +180,7 @@ class Pinecone(VectorStore):
i_end = min(i + batch_size, len(texts)) i_end = min(i + batch_size, len(texts))
# get batch of texts and ids # get batch of texts and ids
lines_batch = texts[i : i + batch_size] lines_batch = texts[i : i + batch_size]
ids_batch = [str(n) for n in range(i, i_end)] ids_batch = [str(uuid.uuid4()) for n in range(i, i_end)]
# create embeddings # create embeddings
embeds = embedding.embed_documents(lines_batch) embeds = embedding.embed_documents(lines_batch)
# prep metadata and upsert batch # prep metadata and upsert batch
@ -150,3 +198,18 @@ class Pinecone(VectorStore):
# upsert to Pinecone # upsert to Pinecone
index.upsert(vectors=list(to_upsert), namespace=namespace) index.upsert(vectors=list(to_upsert), namespace=namespace)
return cls(index, embedding.embed_query, text_key) return cls(index, embedding.embed_query, text_key)
@classmethod
def from_existing_index(
    cls, index_name: str, embedding: Embeddings, text_key: str = "text"
) -> Pinecone:
    """Load a pinecone vectorstore from the name of an existing index.

    Args:
        index_name: Name handed to ``pinecone.Index`` to obtain the index.
        embedding: Embeddings object; its ``embed_query`` method is passed
            to the constructor as the query embedding function.
        text_key: Value forwarded to the constructor as the text key.
            Defaults to "text".

    Returns:
        A new instance of ``cls`` wrapping the named index.

    Raises:
        ValueError: If the ``pinecone`` package cannot be imported. The
            original ``ImportError`` is attached as ``__cause__``.
    """
    # Imported lazily so pinecone is only required when this method is used.
    try:
        import pinecone
    except ImportError as err:
        # Keep ValueError for callers that already catch it, but chain the
        # underlying ImportError so the real failure is not hidden.
        raise ValueError(
            "Could not import pinecone python package. "
            "Please install it with `pip install pinecone-client`."
        ) from err
    return cls(pinecone.Index(index_name), embedding.embed_query, text_key)