mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-04 12:18:24 +00:00
docs: Improved deeplake.py init documentation (#17549)
**Description:** Updated documentation for the DeepLake init method. The exec_option docs especially needed improvement, but I also did a general cleanup while I was looking at it. **Issue:** n/a **Dependencies:** None --------- Co-authored-by: Nathan Voxland <nathan@voxland.net>
This commit is contained in:
parent
29ee0496b6
commit
9ece134d45
@ -60,7 +60,7 @@ class DeepLake(VectorStore):
|
||||
embedding: Optional[Embeddings] = None,
|
||||
embedding_function: Optional[Embeddings] = None,
|
||||
read_only: bool = False,
|
||||
ingestion_batch_size: int = 1000,
|
||||
ingestion_batch_size: int = 1024,
|
||||
num_workers: int = 0,
|
||||
verbose: bool = True,
|
||||
exec_option: Optional[str] = None,
|
||||
@ -85,8 +85,19 @@ class DeepLake(VectorStore):
|
||||
... )
|
||||
|
||||
Args:
|
||||
dataset_path (str): Path to existing dataset or where to create
|
||||
a new one. Defaults to _LANGCHAIN_DEFAULT_DEEPLAKE_PATH.
|
||||
dataset_path (str): The full path for storing to the Deep Lake
|
||||
Vector Store. It can be:
|
||||
- a Deep Lake cloud path of the form ``hub://org_id/dataset_name``.
|
||||
Requires registration with Deep Lake.
|
||||
- an s3 path of the form ``s3://bucketname/path/to/dataset``.
|
||||
Credentials are required in either the environment or passed to
|
||||
the creds argument.
|
||||
- a local file system path of the form ``./path/to/dataset``
|
||||
or ``~/path/to/dataset`` or ``path/to/dataset``.
|
||||
- a memory path of the form ``mem://path/to/dataset`` which doesn't
|
||||
save the dataset but keeps it in memory instead.
|
||||
Should be used only for testing as it does not persist.
|
||||
Defaults to _LANGCHAIN_DEFAULT_DEEPLAKE_PATH.
|
||||
token (str, optional): Activeloop token, for fetching credentials
|
||||
to the dataset at path if it is a Deep Lake dataset.
|
||||
Tokens are normally autogenerated. Optional.
|
||||
@ -98,25 +109,29 @@ class DeepLake(VectorStore):
|
||||
read_only (bool): Open dataset in read-only mode. Default is False.
|
||||
ingestion_batch_size (int): During data ingestion, data is divided
|
||||
into batches. Batch size is the size of each batch.
|
||||
Default is 1000.
|
||||
Default is 1024.
|
||||
num_workers (int): Number of workers to use during data ingestion.
|
||||
Default is 0.
|
||||
verbose (bool): Print dataset summary after each operation.
|
||||
Default is True.
|
||||
exec_option (str, optional): DeepLakeVectorStore supports 3 ways to perform
|
||||
searching - "python", "compute_engine", "tensor_db" and auto.
|
||||
Default is None.
|
||||
exec_option (str, optional): Default method for search execution.
|
||||
It could be either ``"auto"``, ``"python"``, ``"compute_engine"``
|
||||
or ``"tensor_db"``. Defaults to ``"auto"``.
|
||||
If None, it's set to "auto".
|
||||
- ``auto`` - Selects the best execution method based on the storage
|
||||
location of the Vector Store. It is the default option.
|
||||
- ``python`` - Pure-python implementation that runs on the client.
|
||||
WARNING: using this with big datasets can lead to memory
|
||||
issues. Data can be stored anywhere.
|
||||
- ``compute_engine`` - C++ implementation of the Deep Lake Compute
|
||||
Engine that runs on the client. Can be used for any data stored in
|
||||
or connected to Deep Lake. Not for in-memory or local datasets.
|
||||
- ``tensor_db`` - Hosted Managed Tensor Database that is
|
||||
responsible for storage and query execution. Only for data stored in
|
||||
the Deep Lake Managed Database. Use runtime = {"db_engine": True}
|
||||
- ``python`` - Pure-python implementation that runs on the client and
|
||||
can be used for data stored anywhere. WARNING: using this option
|
||||
with big datasets is discouraged because it can lead to
|
||||
memory issues.
|
||||
- ``compute_engine`` - Performant C++ implementation of the Deep Lake
|
||||
Compute Engine that runs on the client and can be used for any data
|
||||
stored in or connected to Deep Lake. It cannot be used with
|
||||
in-memory or local datasets.
|
||||
- ``tensor_db`` - Performant and fully-hosted Managed Tensor Database
|
||||
that is responsible for storage and query execution. Only available
|
||||
for data stored in the Deep Lake Managed Database. Store datasets
|
||||
in this database by specifying runtime = {"tensor_db": True}
|
||||
during dataset creation.
|
||||
runtime (Dict, optional): Parameters for creating the Vector Store in
|
||||
Deep Lake's Managed Tensor Database. Not applicable when loading an
|
||||
|
Loading…
Reference in New Issue
Block a user