AstraDB VectorStore: implement pre_delete_collection (#13780)

- **Description:** Some vector stores have a flag to try deleting the collection before creating it (such as `pgvector`). This is a useful flag when prototyping indexing pipelines and also for integration tests. Added the bool flag `pre_delete_collection` to the constructor (default False). - **Tag maintainer:** @hemidactylus - **Twitter handle:** nicoloboschi --------- Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
2025-09-17 15:35:14 +00:00 · 2023-12-03 21:06:20 +01:00
parent 2780d2d4dd
commit e204657b3c
2 changed files with 79 additions and 36 deletions
--- a/libs/langchain/langchain/vectorstores/astradb.py
+++ b/libs/langchain/langchain/vectorstores/astradb.py
@@ -78,43 +78,46 @@ class AstraDB(VectorStore):
                vectorstore.add_texts(["Giraffes", "All good here"])
                results = vectorstore.similarity_search("Everything's ok", k=1)

-    Constructor args (only keyword-arguments accepted):
-        embedding (Embeddings): embedding function to use.
-        collection_name (str): name of the Astra DB collection to create/use.
-        token (Optional[str]): API token for Astra DB usage.
-        api_endpoint (Optional[str]): full URL to the API endpoint,
-            such as "https://<DB-ID>-us-east1.apps.astra.datastax.com".
-        astra_db_client (Optional[Any]): *alternative to token+api_endpoint*,
-            you can pass an already-created 'astrapy.db.AstraDB' instance.
-        namespace (Optional[str]): namespace (aka keyspace) where the
-            collection is created. Defaults to the database's "default namespace".
-        metric (Optional[str]): similarity function to use out of those
-            available in Astra DB. If left out, it will use Astra DB API's
-            defaults (i.e. "cosine" - but, for performance reasons,
-            "dot_product" is suggested if embeddings are normalized to one).
+      Constructor Args (only keyword-arguments accepted):
+          embedding (Embeddings): embedding function to use.
+          collection_name (str): name of the Astra DB collection to create/use.
+          token (Optional[str]): API token for Astra DB usage.
+          api_endpoint (Optional[str]): full URL to the API endpoint,
+              such as "https://<DB-ID>-us-east1.apps.astra.datastax.com".
+          astra_db_client (Optional[Any]): *alternative to token+api_endpoint*,
+              you can pass an already-created 'astrapy.db.AstraDB' instance.
+          namespace (Optional[str]): namespace (aka keyspace) where the
+              collection is created. Defaults to the database's "default namespace".
+          metric (Optional[str]): similarity function to use out of those
+              available in Astra DB. If left out, it will use Astra DB API's
+              defaults (i.e. "cosine" - but, for performance reasons,
+              "dot_product" is suggested if embeddings are normalized to one).

-    Advanced arguments (coming with sensible defaults):
-        batch_size (Optional[int]): Size of batches for bulk insertions.
-        bulk_insert_batch_concurrency (Optional[int]): Number of threads
-            to insert batches concurrently.
-        bulk_insert_overwrite_concurrency (Optional[int]): Number of
-            threads in a batch to insert pre-existing entries.
-        bulk_delete_concurrency (Optional[int]): Number of threads
-            (for deleting multiple rows concurrently).
+      Advanced arguments (coming with sensible defaults):
+          batch_size (Optional[int]): Size of batches for bulk insertions.
+          bulk_insert_batch_concurrency (Optional[int]): Number of threads
+              to insert batches concurrently.
+          bulk_insert_overwrite_concurrency (Optional[int]): Number of
+              threads in a batch to insert pre-existing entries.
+          bulk_delete_concurrency (Optional[int]): Number of threads
+              (for deleting multiple rows concurrently).
+          pre_delete_collection (Optional[bool]): whether to delete the collection
+              before creating it. If False and the collection already exists,
+              the collection will be used as is.

-    A note on concurrency: as a rule of thumb, on a typical client machine
-    it is suggested to keep the quantity
-        bulk_insert_batch_concurrency * bulk_insert_overwrite_concurrency
-    much below 1000 to avoid exhausting the client multithreading/networking
-    resources. The hardcoded defaults are somewhat conservative to meet
-    most machines' specs, but a sensible choice to test may be:
-        bulk_insert_batch_concurrency = 80
-        bulk_insert_overwrite_concurrency = 10
-    A bit of experimentation is required to nail the best results here,
-    depending on both the machine/network specs and the expected workload
-    (specifically, how often a write is an update of an existing id).
-    Remember you can pass concurrency settings to individual calls to
-    add_texts and add_documents as well.
+      A note on concurrency: as a rule of thumb, on a typical client machine
+      it is suggested to keep the quantity
+          bulk_insert_batch_concurrency * bulk_insert_overwrite_concurrency
+      much below 1000 to avoid exhausting the client multithreading/networking
+      resources. The hardcoded defaults are somewhat conservative to meet
+      most machines' specs, but a sensible choice to test may be:
+          bulk_insert_batch_concurrency = 80
+          bulk_insert_overwrite_concurrency = 10
+      A bit of experimentation is required to nail the best results here,
+      depending on both the machine/network specs and the expected workload
+      (specifically, how often a write is an update of an existing id).
+      Remember you can pass concurrency settings to individual calls to
+      add_texts and add_documents as well.
    """

    @staticmethod
@@ -138,6 +141,7 @@ class AstraDB(VectorStore):
        bulk_insert_batch_concurrency: Optional[int] = None,
        bulk_insert_overwrite_concurrency: Optional[int] = None,
        bulk_delete_concurrency: Optional[int] = None,
+        pre_delete_collection: bool = False,
    ) -> None:
        """
        Create an AstraDB vector store object. See class docstring for help.
@@ -154,6 +158,7 @@ class AstraDB(VectorStore):
                "Could not import a recent astrapy python package. "
                "Please install it with `pip install --upgrade astrapy`."
            )
+
        # Conflicting-arg checks:
        if astra_db_client is not None:
            if token is not None or api_endpoint is not None:
@@ -191,7 +196,10 @@ class AstraDB(VectorStore):
                api_endpoint=self.api_endpoint,
                namespace=self.namespace,
            )
-        self._provision_collection()
+        if not pre_delete_collection:
+            self._provision_collection()
+        else:
+            self.clear()

        self.collection = LibAstraDBCollection(
            collection_name=self.collection_name,
--- a/libs/langchain/tests/integration_tests/vectorstores/test_astradb.py
+++ b/libs/langchain/tests/integration_tests/vectorstores/test_astradb.py
@@ -148,6 +148,41 @@ class TestAstraDB:
        )
        v_store_2.delete_collection()

+    def test_astradb_vectorstore_pre_delete_collection(self) -> None:
+        """Create and delete."""
+        emb = SomeEmbeddings(dimension=2)
+        # creation by passing the connection secrets
+
+        v_store = AstraDB(
+            embedding=emb,
+            collection_name="lc_test_pre_del",
+            token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
+            api_endpoint=os.environ["ASTRA_DB_API_ENDPOINT"],
+            namespace=os.environ.get("ASTRA_DB_KEYSPACE"),
+        )
+        try:
+            v_store.add_texts(
+                texts=["aa"],
+                metadatas=[
+                    {"k": "a", "ord": 0},
+                ],
+                ids=["a"],
+            )
+            res1 = v_store.similarity_search("aa", k=5)
+            assert len(res1) == 1
+            v_store = AstraDB(
+                embedding=emb,
+                pre_delete_collection=True,
+                collection_name="lc_test_pre_del",
+                token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
+                api_endpoint=os.environ["ASTRA_DB_API_ENDPOINT"],
+                namespace=os.environ.get("ASTRA_DB_KEYSPACE"),
+            )
+            res1 = v_store.similarity_search("aa", k=5)
+            assert len(res1) == 0
+        finally:
+            v_store.delete_collection()
+
    def test_astradb_vectorstore_from_x(self) -> None:
        """from_texts and from_documents methods."""
        emb = SomeEmbeddings(dimension=2)