AstraDB VectorStore: implement pre_delete_collection (#13780)

- **Description:** some vector stores have a flag to try deleting the
collection before creating it (such as `pgvector`). This is a useful
flag when prototyping indexing pipelines and also for integration tests.
Added the bool flag `pre_delete_collection` to the constructor (default
False).
  - **Tag maintainer:** @hemidactylus 
  - **Twitter handle:** nicoloboschi

---------

Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
This commit is contained in:
Nicolò Boschi 2023-12-03 21:06:20 +01:00 committed by GitHub
parent 2780d2d4dd
commit e204657b3c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 79 additions and 36 deletions

View File

@ -78,43 +78,46 @@ class AstraDB(VectorStore):
vectorstore.add_texts(["Giraffes", "All good here"]) vectorstore.add_texts(["Giraffes", "All good here"])
results = vectorstore.similarity_search("Everything's ok", k=1) results = vectorstore.similarity_search("Everything's ok", k=1)
Constructor args (only keyword-arguments accepted): Constructor Args (only keyword-arguments accepted):
embedding (Embeddings): embedding function to use. embedding (Embeddings): embedding function to use.
collection_name (str): name of the Astra DB collection to create/use. collection_name (str): name of the Astra DB collection to create/use.
token (Optional[str]): API token for Astra DB usage. token (Optional[str]): API token for Astra DB usage.
api_endpoint (Optional[str]): full URL to the API endpoint, api_endpoint (Optional[str]): full URL to the API endpoint,
such as "https://<DB-ID>-us-east1.apps.astra.datastax.com". such as "https://<DB-ID>-us-east1.apps.astra.datastax.com".
astra_db_client (Optional[Any]): *alternative to token+api_endpoint*, astra_db_client (Optional[Any]): *alternative to token+api_endpoint*,
you can pass an already-created 'astrapy.db.AstraDB' instance. you can pass an already-created 'astrapy.db.AstraDB' instance.
namespace (Optional[str]): namespace (aka keyspace) where the namespace (Optional[str]): namespace (aka keyspace) where the
collection is created. Defaults to the database's "default namespace". collection is created. Defaults to the database's "default namespace".
metric (Optional[str]): similarity function to use out of those metric (Optional[str]): similarity function to use out of those
available in Astra DB. If left out, it will use Astra DB API's available in Astra DB. If left out, it will use Astra DB API's
defaults (i.e. "cosine" - but, for performance reasons, defaults (i.e. "cosine" - but, for performance reasons,
"dot_product" is suggested if embeddings are normalized to one). "dot_product" is suggested if embeddings are normalized to one).
Advanced arguments (coming with sensible defaults): Advanced arguments (coming with sensible defaults):
batch_size (Optional[int]): Size of batches for bulk insertions. batch_size (Optional[int]): Size of batches for bulk insertions.
bulk_insert_batch_concurrency (Optional[int]): Number of threads bulk_insert_batch_concurrency (Optional[int]): Number of threads
to insert batches concurrently. to insert batches concurrently.
bulk_insert_overwrite_concurrency (Optional[int]): Number of bulk_insert_overwrite_concurrency (Optional[int]): Number of
threads in a batch to insert pre-existing entries. threads in a batch to insert pre-existing entries.
bulk_delete_concurrency (Optional[int]): Number of threads bulk_delete_concurrency (Optional[int]): Number of threads
(for deleting multiple rows concurrently). (for deleting multiple rows concurrently).
pre_delete_collection (Optional[bool]): whether to delete the collection
before creating it. If False and the collection already exists,
the collection will be used as is.
A note on concurrency: as a rule of thumb, on a typical client machine A note on concurrency: as a rule of thumb, on a typical client machine
it is suggested to keep the quantity it is suggested to keep the quantity
bulk_insert_batch_concurrency * bulk_insert_overwrite_concurrency bulk_insert_batch_concurrency * bulk_insert_overwrite_concurrency
much below 1000 to avoid exhausting the client multithreading/networking much below 1000 to avoid exhausting the client multithreading/networking
resources. The hardcoded defaults are somewhat conservative to meet resources. The hardcoded defaults are somewhat conservative to meet
most machines' specs, but a sensible choice to test may be: most machines' specs, but a sensible choice to test may be:
bulk_insert_batch_concurrency = 80 bulk_insert_batch_concurrency = 80
bulk_insert_overwrite_concurrency = 10 bulk_insert_overwrite_concurrency = 10
A bit of experimentation is required to nail the best results here, A bit of experimentation is required to nail the best results here,
depending on both the machine/network specs and the expected workload depending on both the machine/network specs and the expected workload
(specifically, how often a write is an update of an existing id). (specifically, how often a write is an update of an existing id).
Remember you can pass concurrency settings to individual calls to Remember you can pass concurrency settings to individual calls to
add_texts and add_documents as well. add_texts and add_documents as well.
""" """
@staticmethod @staticmethod
@ -138,6 +141,7 @@ class AstraDB(VectorStore):
bulk_insert_batch_concurrency: Optional[int] = None, bulk_insert_batch_concurrency: Optional[int] = None,
bulk_insert_overwrite_concurrency: Optional[int] = None, bulk_insert_overwrite_concurrency: Optional[int] = None,
bulk_delete_concurrency: Optional[int] = None, bulk_delete_concurrency: Optional[int] = None,
pre_delete_collection: bool = False,
) -> None: ) -> None:
""" """
Create an AstraDB vector store object. See class docstring for help. Create an AstraDB vector store object. See class docstring for help.
@ -154,6 +158,7 @@ class AstraDB(VectorStore):
"Could not import a recent astrapy python package. " "Could not import a recent astrapy python package. "
"Please install it with `pip install --upgrade astrapy`." "Please install it with `pip install --upgrade astrapy`."
) )
# Conflicting-arg checks: # Conflicting-arg checks:
if astra_db_client is not None: if astra_db_client is not None:
if token is not None or api_endpoint is not None: if token is not None or api_endpoint is not None:
@ -191,7 +196,10 @@ class AstraDB(VectorStore):
api_endpoint=self.api_endpoint, api_endpoint=self.api_endpoint,
namespace=self.namespace, namespace=self.namespace,
) )
self._provision_collection() if not pre_delete_collection:
self._provision_collection()
else:
self.clear()
self.collection = LibAstraDBCollection( self.collection = LibAstraDBCollection(
collection_name=self.collection_name, collection_name=self.collection_name,

View File

@ -148,6 +148,41 @@ class TestAstraDB:
) )
v_store_2.delete_collection() v_store_2.delete_collection()
def test_astradb_vectorstore_pre_delete_collection(self) -> None:
    """Check that pre_delete_collection=True yields an empty collection.

    A store is first created without the flag and one text is inserted
    and found by similarity search. A second store is then opened on the
    same collection name with ``pre_delete_collection=True``; a search
    through it is expected to return nothing. The collection is dropped
    at the end regardless of the outcome.
    """
    embedding = SomeEmbeddings(dimension=2)
    # Connection secrets come from the environment; both stores share them.
    token = os.environ["ASTRA_DB_APPLICATION_TOKEN"]
    api_endpoint = os.environ["ASTRA_DB_API_ENDPOINT"]
    namespace = os.environ.get("ASTRA_DB_KEYSPACE")

    # First store: flag not passed, so the collection is used as it is.
    v_store = AstraDB(
        embedding=embedding,
        collection_name="lc_test_pre_del",
        token=token,
        api_endpoint=api_endpoint,
        namespace=namespace,
    )
    try:
        v_store.add_texts(
            texts=["aa"],
            metadatas=[{"k": "a", "ord": 0}],
            ids=["a"],
        )
        hits_before = v_store.similarity_search("aa", k=5)
        assert len(hits_before) == 1

        # Second store on the same collection, this time asking for a
        # pre-delete: the entry inserted above must no longer be found.
        v_store = AstraDB(
            embedding=embedding,
            pre_delete_collection=True,
            collection_name="lc_test_pre_del",
            token=token,
            api_endpoint=api_endpoint,
            namespace=namespace,
        )
        hits_after = v_store.similarity_search("aa", k=5)
        assert len(hits_after) == 0
    finally:
        # v_store is whichever store was created last.
        v_store.delete_collection()
def test_astradb_vectorstore_from_x(self) -> None: def test_astradb_vectorstore_from_x(self) -> None:
"""from_texts and from_documents methods.""" """from_texts and from_documents methods."""
emb = SomeEmbeddings(dimension=2) emb = SomeEmbeddings(dimension=2)