optimize pgvector add_texts (#7185)

- Description: At the moment, inserting new embeddings into pgvector
queries all existing embeddings every time, because the `embeddings`
relationship is defined with the default parameters, which set
`lazy="select"`. This change drastically improves insert performance and
adds a few additional cleanups:
* remove `collection.embeddings.append`, which was querying all
embeddings on insert, and replace it with the `collection_id` param
* centralize the storing logic in the `add_embeddings` function to reduce
duplication
* remove boilerplate

- Issue: No issue was opened.
- Dependencies: None.
- Tag maintainer: this is a vectorstore update, so I think
@rlancemartin, @eyurtsev
- Twitter handle: @falmannaa
This commit is contained in:
Feras Almannaa 2023-07-05 23:19:42 +03:00 committed by GitHub
parent 6711854e30
commit 79b59a8e06
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -207,12 +207,6 @@ class PGVector(VectorStore):
pre_delete_collection: bool = False, pre_delete_collection: bool = False,
**kwargs: Any, **kwargs: Any,
) -> PGVector: ) -> PGVector:
if ids is None:
ids = [str(uuid.uuid1()) for _ in texts]
if not metadatas:
metadatas = [{} for _ in texts]
connection_string = cls.get_connection_string(kwargs) connection_string = cls.get_connection_string(kwargs)
store = cls( store = cls(
@ -231,12 +225,12 @@ class PGVector(VectorStore):
def add_embeddings( def add_embeddings(
self, self,
texts: List[str], texts: Iterable[str],
embeddings: List[List[float]], embeddings: List[List[float]],
metadatas: List[dict], metadatas: Optional[List[dict]] = None,
ids: List[str], ids: Optional[List[str]] = None,
**kwargs: Any, **kwargs: Any,
) -> None: ) -> List[str]:
"""Add embeddings to the vectorstore. """Add embeddings to the vectorstore.
Args: Args:
@ -245,6 +239,12 @@ class PGVector(VectorStore):
metadatas: List of metadatas associated with the texts. metadatas: List of metadatas associated with the texts.
kwargs: vectorstore specific parameters kwargs: vectorstore specific parameters
""" """
if ids is None:
ids = [str(uuid.uuid1()) for _ in texts]
if not metadatas:
metadatas = [{} for _ in texts]
with Session(self._conn) as session: with Session(self._conn) as session:
collection = self.get_collection(session) collection = self.get_collection(session)
if not collection: if not collection:
@ -255,11 +255,13 @@ class PGVector(VectorStore):
document=text, document=text,
cmetadata=metadata, cmetadata=metadata,
custom_id=id, custom_id=id,
collection_id=collection.uuid,
) )
collection.embeddings.append(embedding_store)
session.add(embedding_store) session.add(embedding_store)
session.commit() session.commit()
return ids
def add_texts( def add_texts(
self, self,
texts: Iterable[str], texts: Iterable[str],
@ -277,30 +279,10 @@ class PGVector(VectorStore):
Returns: Returns:
List of ids from adding the texts into the vectorstore. List of ids from adding the texts into the vectorstore.
""" """
if ids is None:
ids = [str(uuid.uuid1()) for _ in texts]
embeddings = self.embedding_function.embed_documents(list(texts)) embeddings = self.embedding_function.embed_documents(list(texts))
return self.add_embeddings(
if not metadatas: texts=texts, embeddings=embeddings, metadatas=metadatas, ids=ids, **kwargs
metadatas = [{} for _ in texts] )
with Session(self._conn) as session:
collection = self.get_collection(session)
if not collection:
raise ValueError("Collection not found")
for text, metadata, embedding, id in zip(texts, metadatas, embeddings, ids):
embedding_store = EmbeddingStore(
embedding=embedding,
document=text,
cmetadata=metadata,
custom_id=id,
)
collection.embeddings.append(embedding_store)
session.add(embedding_store)
session.commit()
return ids
def similarity_search( def similarity_search(
self, self,