diff --git a/libs/langchain/langchain/vectorstores/astradb.py b/libs/langchain/langchain/vectorstores/astradb.py index 0eacbd2d926..0b5274c90ba 100644 --- a/libs/langchain/langchain/vectorstores/astradb.py +++ b/libs/langchain/langchain/vectorstores/astradb.py @@ -78,43 +78,46 @@ class AstraDB(VectorStore): vectorstore.add_texts(["Giraffes", "All good here"]) results = vectorstore.similarity_search("Everything's ok", k=1) - Constructor args (only keyword-arguments accepted): - embedding (Embeddings): embedding function to use. - collection_name (str): name of the Astra DB collection to create/use. - token (Optional[str]): API token for Astra DB usage. - api_endpoint (Optional[str]): full URL to the API endpoint, - such as "https://-us-east1.apps.astra.datastax.com". - astra_db_client (Optional[Any]): *alternative to token+api_endpoint*, - you can pass an already-created 'astrapy.db.AstraDB' instance. - namespace (Optional[str]): namespace (aka keyspace) where the - collection is created. Defaults to the database's "default namespace". - metric (Optional[str]): similarity function to use out of those - available in Astra DB. If left out, it will use Astra DB API's - defaults (i.e. "cosine" - but, for performance reasons, - "dot_product" is suggested if embeddings are normalized to one). + Constructor Args (only keyword-arguments accepted): + embedding (Embeddings): embedding function to use. + collection_name (str): name of the Astra DB collection to create/use. + token (Optional[str]): API token for Astra DB usage. + api_endpoint (Optional[str]): full URL to the API endpoint, + such as "https://-us-east1.apps.astra.datastax.com". + astra_db_client (Optional[Any]): *alternative to token+api_endpoint*, + you can pass an already-created 'astrapy.db.AstraDB' instance. + namespace (Optional[str]): namespace (aka keyspace) where the + collection is created. Defaults to the database's "default namespace". + metric (Optional[str]): similarity function to use out of those + available in Astra DB. If left out, it will use Astra DB API's + defaults (i.e. "cosine" - but, for performance reasons, + "dot_product" is suggested if embeddings are normalized to one). - Advanced arguments (coming with sensible defaults): - batch_size (Optional[int]): Size of batches for bulk insertions. - bulk_insert_batch_concurrency (Optional[int]): Number of threads - to insert batches concurrently. - bulk_insert_overwrite_concurrency (Optional[int]): Number of - threads in a batch to insert pre-existing entries. - bulk_delete_concurrency (Optional[int]): Number of threads - (for deleting multiple rows concurrently). + Advanced arguments (coming with sensible defaults): + batch_size (Optional[int]): Size of batches for bulk insertions. + bulk_insert_batch_concurrency (Optional[int]): Number of threads + to insert batches concurrently. + bulk_insert_overwrite_concurrency (Optional[int]): Number of + threads in a batch to insert pre-existing entries. + bulk_delete_concurrency (Optional[int]): Number of threads + (for deleting multiple rows concurrently). + pre_delete_collection (Optional[bool]): whether to delete the collection + before creating it. If False and the collection already exists, + the collection will be used as is. - A note on concurrency: as a rule of thumb, on a typical client machine - it is suggested to keep the quantity - bulk_insert_batch_concurrency * bulk_insert_overwrite_concurrency - much below 1000 to avoid exhausting the client multithreading/networking - resources. The hardcoded defaults are somewhat conservative to meet - most machines' specs, but a sensible choice to test may be: - bulk_insert_batch_concurrency = 80 - bulk_insert_overwrite_concurrency = 10 - A bit of experimentation is required to nail the best results here, - depending on both the machine/network specs and the expected workload - (specifically, how often a write is an update of an existing id). - Remember you can pass concurrency settings to individual calls to - add_texts and add_documents as well. + A note on concurrency: as a rule of thumb, on a typical client machine + it is suggested to keep the quantity + bulk_insert_batch_concurrency * bulk_insert_overwrite_concurrency + much below 1000 to avoid exhausting the client multithreading/networking + resources. The hardcoded defaults are somewhat conservative to meet + most machines' specs, but a sensible choice to test may be: + bulk_insert_batch_concurrency = 80 + bulk_insert_overwrite_concurrency = 10 + A bit of experimentation is required to nail the best results here, + depending on both the machine/network specs and the expected workload + (specifically, how often a write is an update of an existing id). + Remember you can pass concurrency settings to individual calls to + add_texts and add_documents as well. """ @staticmethod @@ -138,6 +141,7 @@ class AstraDB(VectorStore): bulk_insert_batch_concurrency: Optional[int] = None, bulk_insert_overwrite_concurrency: Optional[int] = None, bulk_delete_concurrency: Optional[int] = None, + pre_delete_collection: bool = False, ) -> None: """ Create an AstraDB vector store object. See class docstring for help. @@ -154,6 +158,7 @@ class AstraDB(VectorStore): "Could not import a recent astrapy python package. " "Please install it with `pip install --upgrade astrapy`." ) + # Conflicting-arg checks: if astra_db_client is not None: if token is not None or api_endpoint is not None: @@ -191,7 +196,10 @@ class AstraDB(VectorStore): api_endpoint=self.api_endpoint, namespace=self.namespace, ) - self._provision_collection() + if not pre_delete_collection: + self._provision_collection() + else: + self.clear() self.collection = LibAstraDBCollection( collection_name=self.collection_name, diff --git a/libs/langchain/tests/integration_tests/vectorstores/test_astradb.py b/libs/langchain/tests/integration_tests/vectorstores/test_astradb.py index 8b1703f5cc9..fdcc9ab1367 100644 --- a/libs/langchain/tests/integration_tests/vectorstores/test_astradb.py +++ b/libs/langchain/tests/integration_tests/vectorstores/test_astradb.py @@ -148,6 +148,41 @@ class TestAstraDB: ) v_store_2.delete_collection() + def test_astradb_vectorstore_pre_delete_collection(self) -> None: + """Create and delete.""" + emb = SomeEmbeddings(dimension=2) + # creation by passing the connection secrets + + v_store = AstraDB( + embedding=emb, + collection_name="lc_test_pre_del", + token=os.environ["ASTRA_DB_APPLICATION_TOKEN"], + api_endpoint=os.environ["ASTRA_DB_API_ENDPOINT"], + namespace=os.environ.get("ASTRA_DB_KEYSPACE"), + ) + try: + v_store.add_texts( + texts=["aa"], + metadatas=[ + {"k": "a", "ord": 0}, + ], + ids=["a"], + ) + res1 = v_store.similarity_search("aa", k=5) + assert len(res1) == 1 + v_store = AstraDB( + embedding=emb, + pre_delete_collection=True, + collection_name="lc_test_pre_del", + token=os.environ["ASTRA_DB_APPLICATION_TOKEN"], + api_endpoint=os.environ["ASTRA_DB_API_ENDPOINT"], + namespace=os.environ.get("ASTRA_DB_KEYSPACE"), + ) + res1 = v_store.similarity_search("aa", k=5) + assert len(res1) == 0 + finally: + v_store.delete_collection() + def test_astradb_vectorstore_from_x(self) -> None: """from_texts and from_documents methods.""" emb = SomeEmbeddings(dimension=2)