mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-04 04:07:54 +00:00
docs: Improved deeplake.py init documentation (#17549)
**Description:** Updated documentation for DeepLake init method. Especially the exec_option docs needed improvement, but did a general cleanup while I was looking at it. **Issue:** n/a **Dependencies:** None --------- Co-authored-by: Nathan Voxland <nathan@voxland.net>
This commit is contained in:
parent
29ee0496b6
commit
9ece134d45
@ -60,7 +60,7 @@ class DeepLake(VectorStore):
|
|||||||
embedding: Optional[Embeddings] = None,
|
embedding: Optional[Embeddings] = None,
|
||||||
embedding_function: Optional[Embeddings] = None,
|
embedding_function: Optional[Embeddings] = None,
|
||||||
read_only: bool = False,
|
read_only: bool = False,
|
||||||
ingestion_batch_size: int = 1000,
|
ingestion_batch_size: int = 1024,
|
||||||
num_workers: int = 0,
|
num_workers: int = 0,
|
||||||
verbose: bool = True,
|
verbose: bool = True,
|
||||||
exec_option: Optional[str] = None,
|
exec_option: Optional[str] = None,
|
||||||
@ -85,8 +85,19 @@ class DeepLake(VectorStore):
|
|||||||
... )
|
... )
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
dataset_path (str): Path to existing dataset or where to create
|
dataset_path (str): The full path for storing the Deep Lake
|
||||||
a new one. Defaults to _LANGCHAIN_DEFAULT_DEEPLAKE_PATH.
|
Vector Store. It can be:
|
||||||
|
- a Deep Lake cloud path of the form ``hub://org_id/dataset_name``.
|
||||||
|
Requires registration with Deep Lake.
|
||||||
|
- an s3 path of the form ``s3://bucketname/path/to/dataset``.
|
||||||
|
Credentials are required in either the environment or passed to
|
||||||
|
the creds argument.
|
||||||
|
- a local file system path of the form ``./path/to/dataset``
|
||||||
|
or ``~/path/to/dataset`` or ``path/to/dataset``.
|
||||||
|
- a memory path of the form ``mem://path/to/dataset`` which doesn't
|
||||||
|
save the dataset but keeps it in memory instead.
|
||||||
|
Should be used only for testing as it does not persist.
|
||||||
|
Defaults to _LANGCHAIN_DEFAULT_DEEPLAKE_PATH.
|
||||||
token (str, optional): Activeloop token, for fetching credentials
|
token (str, optional): Activeloop token, for fetching credentials
|
||||||
to the dataset at path if it is a Deep Lake dataset.
|
to the dataset at path if it is a Deep Lake dataset.
|
||||||
Tokens are normally autogenerated. Optional.
|
Tokens are normally autogenerated. Optional.
|
||||||
@ -98,25 +109,29 @@ class DeepLake(VectorStore):
|
|||||||
read_only (bool): Open dataset in read-only mode. Default is False.
|
read_only (bool): Open dataset in read-only mode. Default is False.
|
||||||
ingestion_batch_size (int): During data ingestion, data is divided
|
ingestion_batch_size (int): During data ingestion, data is divided
|
||||||
into batches. Batch size is the size of each batch.
|
into batches. Batch size is the size of each batch.
|
||||||
Default is 1000.
|
Default is 1024.
|
||||||
num_workers (int): Number of workers to use during data ingestion.
|
num_workers (int): Number of workers to use during data ingestion.
|
||||||
Default is 0.
|
Default is 0.
|
||||||
verbose (bool): Print dataset summary after each operation.
|
verbose (bool): Print dataset summary after each operation.
|
||||||
Default is True.
|
Default is True.
|
||||||
exec_option (str, optional): DeepLakeVectorStore supports 3 ways to perform
|
exec_option (str, optional): Default method for search execution.
|
||||||
searching - "python", "compute_engine", "tensor_db" and auto.
|
It can be one of ``"auto"``, ``"python"``, ``"compute_engine"``
|
||||||
Default is None.
|
or ``"tensor_db"``. Defaults to ``"auto"``.
|
||||||
|
If None, it's set to "auto".
|
||||||
- ``auto`` - Selects the best execution method based on the storage
|
- ``auto`` - Selects the best execution method based on the storage
|
||||||
location of the Vector Store. It is the default option.
|
location of the Vector Store. It is the default option.
|
||||||
- ``python`` - Pure-python implementation that runs on the client.
|
- ``python`` - Pure-python implementation that runs on the client and
|
||||||
WARNING: using this with big datasets can lead to memory
|
can be used for data stored anywhere. WARNING: using this option
|
||||||
issues. Data can be stored anywhere.
|
with big datasets is discouraged because it can lead to
|
||||||
- ``compute_engine`` - C++ implementation of the Deep Lake Compute
|
memory issues.
|
||||||
Engine that runs on the client. Can be used for any data stored in
|
- ``compute_engine`` - Performant C++ implementation of the Deep Lake
|
||||||
or connected to Deep Lake. Not for in-memory or local datasets.
|
Compute Engine that runs on the client and can be used for any data
|
||||||
- ``tensor_db`` - Hosted Managed Tensor Database that is
|
stored in or connected to Deep Lake. It cannot be used with
|
||||||
responsible for storage and query execution. Only for data stored in
|
in-memory or local datasets.
|
||||||
the Deep Lake Managed Database. Use runtime = {"db_engine": True}
|
- ``tensor_db`` - Performant and fully-hosted Managed Tensor Database
|
||||||
|
that is responsible for storage and query execution. Only available
|
||||||
|
for data stored in the Deep Lake Managed Database. Store datasets
|
||||||
|
in this database by specifying runtime = {"tensor_db": True}
|
||||||
during dataset creation.
|
during dataset creation.
|
||||||
runtime (Dict, optional): Parameters for creating the Vector Store in
|
runtime (Dict, optional): Parameters for creating the Vector Store in
|
||||||
Deep Lake's Managed Tensor Database. Not applicable when loading an
|
Deep Lake's Managed Tensor Database. Not applicable when loading an
|
||||||
|
Loading…
Reference in New Issue
Block a user