mirror of
https://github.com/hwchase17/langchain.git
synced 2025-05-25 08:57:48 +00:00
AstraDB VectorStore: implement pre_delete_collection (#13780)
- **Description:** Some vector stores have a flag to try deleting the collection before creating it (such as `pgvector`). This is a useful flag when prototyping indexing pipelines and also for integration tests. Added the bool flag `pre_delete_collection` to the constructor (default `False`). - **Tag maintainer:** @hemidactylus - **Twitter handle:** nicoloboschi --------- Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
This commit is contained in:
parent
2780d2d4dd
commit
e204657b3c
@ -78,43 +78,46 @@ class AstraDB(VectorStore):
|
|||||||
vectorstore.add_texts(["Giraffes", "All good here"])
|
vectorstore.add_texts(["Giraffes", "All good here"])
|
||||||
results = vectorstore.similarity_search("Everything's ok", k=1)
|
results = vectorstore.similarity_search("Everything's ok", k=1)
|
||||||
|
|
||||||
Constructor args (only keyword-arguments accepted):
|
Constructor Args (only keyword-arguments accepted):
|
||||||
embedding (Embeddings): embedding function to use.
|
embedding (Embeddings): embedding function to use.
|
||||||
collection_name (str): name of the Astra DB collection to create/use.
|
collection_name (str): name of the Astra DB collection to create/use.
|
||||||
token (Optional[str]): API token for Astra DB usage.
|
token (Optional[str]): API token for Astra DB usage.
|
||||||
api_endpoint (Optional[str]): full URL to the API endpoint,
|
api_endpoint (Optional[str]): full URL to the API endpoint,
|
||||||
such as "https://<DB-ID>-us-east1.apps.astra.datastax.com".
|
such as "https://<DB-ID>-us-east1.apps.astra.datastax.com".
|
||||||
astra_db_client (Optional[Any]): *alternative to token+api_endpoint*,
|
astra_db_client (Optional[Any]): *alternative to token+api_endpoint*,
|
||||||
you can pass an already-created 'astrapy.db.AstraDB' instance.
|
you can pass an already-created 'astrapy.db.AstraDB' instance.
|
||||||
namespace (Optional[str]): namespace (aka keyspace) where the
|
namespace (Optional[str]): namespace (aka keyspace) where the
|
||||||
collection is created. Defaults to the database's "default namespace".
|
collection is created. Defaults to the database's "default namespace".
|
||||||
metric (Optional[str]): similarity function to use out of those
|
metric (Optional[str]): similarity function to use out of those
|
||||||
available in Astra DB. If left out, it will use Astra DB API's
|
available in Astra DB. If left out, it will use Astra DB API's
|
||||||
defaults (i.e. "cosine" - but, for performance reasons,
|
defaults (i.e. "cosine" - but, for performance reasons,
|
||||||
"dot_product" is suggested if embeddings are normalized to one).
|
"dot_product" is suggested if embeddings are normalized to one).
|
||||||
|
|
||||||
Advanced arguments (coming with sensible defaults):
|
Advanced arguments (coming with sensible defaults):
|
||||||
batch_size (Optional[int]): Size of batches for bulk insertions.
|
batch_size (Optional[int]): Size of batches for bulk insertions.
|
||||||
bulk_insert_batch_concurrency (Optional[int]): Number of threads
|
bulk_insert_batch_concurrency (Optional[int]): Number of threads
|
||||||
to insert batches concurrently.
|
to insert batches concurrently.
|
||||||
bulk_insert_overwrite_concurrency (Optional[int]): Number of
|
bulk_insert_overwrite_concurrency (Optional[int]): Number of
|
||||||
threads in a batch to insert pre-existing entries.
|
threads in a batch to insert pre-existing entries.
|
||||||
bulk_delete_concurrency (Optional[int]): Number of threads
|
bulk_delete_concurrency (Optional[int]): Number of threads
|
||||||
(for deleting multiple rows concurrently).
|
(for deleting multiple rows concurrently).
|
||||||
|
pre_delete_collection (bool): whether to delete the collection
|
||||||
|
before creating it. If False and the collection already exists,
|
||||||
|
the collection will be used as is.
|
||||||
|
|
||||||
A note on concurrency: as a rule of thumb, on a typical client machine
|
A note on concurrency: as a rule of thumb, on a typical client machine
|
||||||
it is suggested to keep the quantity
|
it is suggested to keep the quantity
|
||||||
bulk_insert_batch_concurrency * bulk_insert_overwrite_concurrency
|
bulk_insert_batch_concurrency * bulk_insert_overwrite_concurrency
|
||||||
much below 1000 to avoid exhausting the client multithreading/networking
|
much below 1000 to avoid exhausting the client multithreading/networking
|
||||||
resources. The hardcoded defaults are somewhat conservative to meet
|
resources. The hardcoded defaults are somewhat conservative to meet
|
||||||
most machines' specs, but a sensible choice to test may be:
|
most machines' specs, but a sensible choice to test may be:
|
||||||
bulk_insert_batch_concurrency = 80
|
bulk_insert_batch_concurrency = 80
|
||||||
bulk_insert_overwrite_concurrency = 10
|
bulk_insert_overwrite_concurrency = 10
|
||||||
A bit of experimentation is required to nail the best results here,
|
A bit of experimentation is required to nail the best results here,
|
||||||
depending on both the machine/network specs and the expected workload
|
depending on both the machine/network specs and the expected workload
|
||||||
(specifically, how often a write is an update of an existing id).
|
(specifically, how often a write is an update of an existing id).
|
||||||
Remember you can pass concurrency settings to individual calls to
|
Remember you can pass concurrency settings to individual calls to
|
||||||
add_texts and add_documents as well.
|
add_texts and add_documents as well.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@ -138,6 +141,7 @@ class AstraDB(VectorStore):
|
|||||||
bulk_insert_batch_concurrency: Optional[int] = None,
|
bulk_insert_batch_concurrency: Optional[int] = None,
|
||||||
bulk_insert_overwrite_concurrency: Optional[int] = None,
|
bulk_insert_overwrite_concurrency: Optional[int] = None,
|
||||||
bulk_delete_concurrency: Optional[int] = None,
|
bulk_delete_concurrency: Optional[int] = None,
|
||||||
|
pre_delete_collection: bool = False,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""
|
"""
|
||||||
Create an AstraDB vector store object. See class docstring for help.
|
Create an AstraDB vector store object. See class docstring for help.
|
||||||
@ -154,6 +158,7 @@ class AstraDB(VectorStore):
|
|||||||
"Could not import a recent astrapy python package. "
|
"Could not import a recent astrapy python package. "
|
||||||
"Please install it with `pip install --upgrade astrapy`."
|
"Please install it with `pip install --upgrade astrapy`."
|
||||||
)
|
)
|
||||||
|
|
||||||
# Conflicting-arg checks:
|
# Conflicting-arg checks:
|
||||||
if astra_db_client is not None:
|
if astra_db_client is not None:
|
||||||
if token is not None or api_endpoint is not None:
|
if token is not None or api_endpoint is not None:
|
||||||
@ -191,7 +196,10 @@ class AstraDB(VectorStore):
|
|||||||
api_endpoint=self.api_endpoint,
|
api_endpoint=self.api_endpoint,
|
||||||
namespace=self.namespace,
|
namespace=self.namespace,
|
||||||
)
|
)
|
||||||
self._provision_collection()
|
if not pre_delete_collection:
|
||||||
|
self._provision_collection()
|
||||||
|
else:
|
||||||
|
self.clear()
|
||||||
|
|
||||||
self.collection = LibAstraDBCollection(
|
self.collection = LibAstraDBCollection(
|
||||||
collection_name=self.collection_name,
|
collection_name=self.collection_name,
|
||||||
|
@ -148,6 +148,41 @@ class TestAstraDB:
|
|||||||
)
|
)
|
||||||
v_store_2.delete_collection()
|
v_store_2.delete_collection()
|
||||||
|
|
||||||
|
def test_astradb_vectorstore_pre_delete_collection(self) -> None:
|
||||||
|
"""Create and delete."""
|
||||||
|
emb = SomeEmbeddings(dimension=2)
|
||||||
|
# creation by passing the connection secrets
|
||||||
|
|
||||||
|
v_store = AstraDB(
|
||||||
|
embedding=emb,
|
||||||
|
collection_name="lc_test_pre_del",
|
||||||
|
token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
|
||||||
|
api_endpoint=os.environ["ASTRA_DB_API_ENDPOINT"],
|
||||||
|
namespace=os.environ.get("ASTRA_DB_KEYSPACE"),
|
||||||
|
)
|
||||||
|
try:
|
||||||
|
v_store.add_texts(
|
||||||
|
texts=["aa"],
|
||||||
|
metadatas=[
|
||||||
|
{"k": "a", "ord": 0},
|
||||||
|
],
|
||||||
|
ids=["a"],
|
||||||
|
)
|
||||||
|
res1 = v_store.similarity_search("aa", k=5)
|
||||||
|
assert len(res1) == 1
|
||||||
|
v_store = AstraDB(
|
||||||
|
embedding=emb,
|
||||||
|
pre_delete_collection=True,
|
||||||
|
collection_name="lc_test_pre_del",
|
||||||
|
token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
|
||||||
|
api_endpoint=os.environ["ASTRA_DB_API_ENDPOINT"],
|
||||||
|
namespace=os.environ.get("ASTRA_DB_KEYSPACE"),
|
||||||
|
)
|
||||||
|
res1 = v_store.similarity_search("aa", k=5)
|
||||||
|
assert len(res1) == 0
|
||||||
|
finally:
|
||||||
|
v_store.delete_collection()
|
||||||
|
|
||||||
def test_astradb_vectorstore_from_x(self) -> None:
|
def test_astradb_vectorstore_from_x(self) -> None:
|
||||||
"""from_texts and from_documents methods."""
|
"""from_texts and from_documents methods."""
|
||||||
emb = SomeEmbeddings(dimension=2)
|
emb = SomeEmbeddings(dimension=2)
|
||||||
|
Loading…
Reference in New Issue
Block a user