community[major], core[patch], langchain[patch], experimental[patch]: Create langchain-community (#14463)
Moved the following modules to the new package langchain-community in a backwards compatible fashion:

```
mv langchain/langchain/adapters community/langchain_community
mv langchain/langchain/callbacks community/langchain_community/callbacks
mv langchain/langchain/chat_loaders community/langchain_community
mv langchain/langchain/chat_models community/langchain_community
mv langchain/langchain/document_loaders community/langchain_community
mv langchain/langchain/docstore community/langchain_community
mv langchain/langchain/document_transformers community/langchain_community
mv langchain/langchain/embeddings community/langchain_community
mv langchain/langchain/graphs community/langchain_community
mv langchain/langchain/llms community/langchain_community
mv langchain/langchain/memory/chat_message_histories community/langchain_community
mv langchain/langchain/retrievers community/langchain_community
mv langchain/langchain/storage community/langchain_community
mv langchain/langchain/tools community/langchain_community
mv langchain/langchain/utilities community/langchain_community
mv langchain/langchain/vectorstores community/langchain_community
mv langchain/langchain/agents/agent_toolkits community/langchain_community
mv langchain/langchain/cache.py community/langchain_community
```

Moved the following to core:

```
mv langchain/langchain/utils/json_schema.py core/langchain_core/utils
mv langchain/langchain/utils/html.py core/langchain_core/utils
mv langchain/langchain/utils/strings.py core/langchain_core/utils
cat langchain/langchain/utils/env.py >> core/langchain_core/utils/env.py
rm langchain/langchain/utils/env.py
```

See .scripts/community_split/script_integrations.sh for all changes
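The moves are backwards compatible because the old `langchain.*` import paths keep working as thin re-export shims over `langchain_community`. A minimal sketch of the shim pattern, assuming a hypothetical `langchain/vectorstores/deeplake.py` stub (the real shims are generated by the script referenced above):

```
# langchain/vectorstores/deeplake.py -- hypothetical re-export shim, for illustration
from langchain_community.vectorstores.deeplake import DeepLake

__all__ = ["DeepLake"]
```

Existing `from langchain.vectorstores import DeepLake` imports keep resolving, while new code can import from `langchain_community` directly.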
901 libs/community/langchain_community/vectorstores/deeplake.py Normal file
@@ -0,0 +1,901 @@
from __future__ import annotations

import logging
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union

import numpy as np

try:
    import deeplake
    from deeplake import VectorStore as DeepLakeVectorStore
    from deeplake.core.fast_forwarding import version_compare

    _DEEPLAKE_INSTALLED = True
except ImportError:
    _DEEPLAKE_INSTALLED = False

from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.vectorstores import VectorStore

from langchain_community.vectorstores.utils import maximal_marginal_relevance

logger = logging.getLogger(__name__)


class DeepLake(VectorStore):
    """`Activeloop Deep Lake` vector store.

    This integration exposes Deep Lake's similarity search and filtering for
    fast prototyping, and supports Tensor Query Language (TQL) for production
    use cases over billions of rows.

    Why Deep Lake?

    - Not only stores embeddings, but also the original data with version control.
    - Serverless, doesn't require another service and can be used with major
        cloud providers (S3, GCS, etc.)
    - More than just a multi-modal vector store. You can use the dataset
        to fine-tune your own LLM models.

    To use, you should have the ``deeplake`` python package installed.

    Example:
        .. code-block:: python

            from langchain_community.vectorstores import DeepLake
            from langchain_community.embeddings.openai import OpenAIEmbeddings

            embeddings = OpenAIEmbeddings()
            vectorstore = DeepLake("langchain_store", embeddings.embed_query)
    """
    _LANGCHAIN_DEFAULT_DEEPLAKE_PATH = "./deeplake/"

    def __init__(
        self,
        dataset_path: str = _LANGCHAIN_DEFAULT_DEEPLAKE_PATH,
        token: Optional[str] = None,
        embedding: Optional[Embeddings] = None,
        embedding_function: Optional[Embeddings] = None,
        read_only: bool = False,
        ingestion_batch_size: int = 1000,
        num_workers: int = 0,
        verbose: bool = True,
        exec_option: Optional[str] = None,
        runtime: Optional[Dict] = None,
        index_params: Optional[Dict[str, Union[int, str]]] = None,
        **kwargs: Any,
    ) -> None:
        """Creates an empty DeepLakeVectorStore or loads an existing one.

        The DeepLakeVectorStore is located at the specified ``dataset_path``.

        Examples:
            >>> # Create a vector store with default tensors
            >>> deeplake_vectorstore = DeepLake(
            ...     dataset_path = <path_for_storing_data>,
            ... )
            >>>
            >>> # Create a vector store in the Deep Lake Managed Tensor Database
            >>> data = DeepLake(
            ...     dataset_path = "hub://org_id/dataset_name",
            ...     runtime = {"tensor_db": True},
            ... )

        Args:
            dataset_path (str): Path to existing dataset or where to create
                a new one. Defaults to _LANGCHAIN_DEFAULT_DEEPLAKE_PATH.
            token (str, optional): Activeloop token, for fetching credentials
                to the dataset at path if it is a Deep Lake dataset.
                Tokens are normally autogenerated. Optional.
            embedding (Embeddings, optional): Function to convert
                either documents or query. Optional.
            embedding_function (Embeddings, optional): Function to convert
                either documents or query. Optional. Deprecated: keeping this
                parameter for backwards compatibility.
            read_only (bool): Open dataset in read-only mode. Default is False.
            ingestion_batch_size (int): During data ingestion, data is divided
                into batches. Batch size is the size of each batch.
                Default is 1000.
            num_workers (int): Number of workers to use during data ingestion.
                Default is 0.
            verbose (bool): Print dataset summary after each operation.
                Default is True.
            exec_option (str, optional): DeepLakeVectorStore supports 4 ways to
                perform searching - "auto", "python", "compute_engine" and
                "tensor_db". Default is None.
                - ``auto`` - Selects the best execution method based on the storage
                    location of the Vector Store. It is the default option.
                - ``python`` - Pure-python implementation that runs on the client.
                    WARNING: using this with big datasets can lead to memory
                    issues. Data can be stored anywhere.
                - ``compute_engine`` - C++ implementation of the Deep Lake Compute
                    Engine that runs on the client. Can be used for any data stored in
                    or connected to Deep Lake. Not for in-memory or local datasets.
                - ``tensor_db`` - Hosted Managed Tensor Database that is
                    responsible for storage and query execution. Only for data stored in
                    the Deep Lake Managed Database. Use runtime = {"tensor_db": True}
                    during dataset creation.
            runtime (Dict, optional): Parameters for creating the Vector Store in
                Deep Lake's Managed Tensor Database. Not applicable when loading an
                existing Vector Store. To create a Vector Store in the Managed Tensor
                Database, set `runtime = {"tensor_db": True}`.
            index_params (Optional[Dict[str, Union[int, str]]], optional): Dictionary
                containing information about the vector index that will be created.
                Defaults to None, which will utilize ``DEFAULT_VECTORSTORE_INDEX_PARAMS``
                from ``deeplake.constants``. The specified key-values override the
                default ones.
                - threshold: The threshold for the dataset size above which an index
                    will be created for the embedding tensor. When the threshold value
                    is set to -1, index creation is turned off. Defaults to -1, which
                    turns off the index.
                - distance_metric: This key specifies the method of calculating the
                    distance between vectors when creating the vector database (VDB)
                    index. It can either be a string that corresponds to a member of
                    the DistanceType enumeration, or the string value itself.
                    - If no value is provided, it defaults to "L2".
                    - "L2" corresponds to DistanceType.L2_NORM.
                    - "COS" corresponds to DistanceType.COSINE_SIMILARITY.
                - additional_params: Additional parameters for fine-tuning the index.
            **kwargs: Other optional keyword arguments.

        Raises:
            ImportError: if the ``deeplake`` package is not installed, or is too
                old to support the requested ``runtime`` option.
        """
        self.ingestion_batch_size = ingestion_batch_size
        self.num_workers = num_workers
        self.verbose = verbose

        if not _DEEPLAKE_INSTALLED:
            raise ImportError(
                "Could not import deeplake python package. "
                "Please install it with `pip install deeplake[enterprise]`."
            )

        if (
            runtime == {"tensor_db": True}
            and version_compare(deeplake.__version__, "3.6.7") == -1
        ):
            raise ImportError(
                "To use tensor_db option you need to update deeplake to `3.6.7` or "
                "higher. "
                f"Currently installed deeplake version is {deeplake.__version__}. "
            )

        self.dataset_path = dataset_path

        if embedding_function:
            logger.warning(
                "Using `embedding_function` is deprecated and will be removed "
                "in the future. Please use `embedding` instead."
            )

        self.vectorstore = DeepLakeVectorStore(
            path=self.dataset_path,
            embedding_function=embedding_function or embedding,
            read_only=read_only,
            token=token,
            exec_option=exec_option,
            verbose=verbose,
            runtime=runtime,
            index_params=index_params,
            **kwargs,
        )

        self._embedding_function = embedding_function or embedding
        self._id_tensor_name = "ids" if "ids" in self.vectorstore.tensors() else "id"
    @property
    def embeddings(self) -> Optional[Embeddings]:
        return self._embedding_function

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Run more texts through the embeddings and add to the vectorstore.

        Examples:
            >>> ids = deeplake_vectorstore.add_texts(
            ...     texts = <list_of_texts>,
            ...     metadatas = <list_of_metadata_jsons>,
            ...     ids = <list_of_ids>,
            ... )

        Args:
            texts (Iterable[str]): Texts to add to the vectorstore.
            metadatas (Optional[List[dict]], optional): Optional list of metadatas.
            ids (Optional[List[str]], optional): Optional list of IDs.
            **kwargs (Any): Additional keyword arguments are not supported
                by this method.

        Returns:
            List[str]: List of IDs of the added texts.
        """
        if kwargs:
            unsupported_items = "`, `".join(set(kwargs.keys()))
            raise TypeError(
                f"`{unsupported_items}` is/are not a valid argument to add_texts method"
            )

        kwargs = {}
        if ids:
            if self._id_tensor_name == "ids":  # for backwards compatibility
                kwargs["ids"] = ids
            else:
                kwargs["id"] = ids

        if texts is None:
            raise ValueError("`texts` parameter shouldn't be None.")

        # Materialize the iterable once so it is not exhausted before use.
        if not isinstance(texts, list):
            texts = list(texts)

        if len(texts) == 0:
            raise ValueError("`texts` parameter shouldn't be empty.")

        if metadatas is None:
            metadatas = [{}] * len(texts)

        return self.vectorstore.add(
            text=texts,
            metadata=metadatas,
            embedding_data=texts,
            embedding_tensor="embedding",
            embedding_function=self._embedding_function.embed_documents,  # type: ignore
            return_ids=True,
            **kwargs,
        )
    def _search_tql(
        self,
        tql: Optional[str],
        exec_option: Optional[str] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """Perform a search using a raw TQL query.

        Args:
            tql (str): TQL Query string for direct evaluation.
                Available only for `compute_engine` and `tensor_db`.
            exec_option (str, optional): Supports 3 ways to search.
                Could be "python", "compute_engine" or "tensor_db".
                Default is "python".
                - ``python`` - Pure-python implementation for the client.
                    WARNING: not recommended for big datasets due to potential
                    memory issues.
                - ``compute_engine`` - C++ implementation of Deep Lake Compute
                    Engine for the client. Not for in-memory or local datasets.
                - ``tensor_db`` - Hosted Managed Tensor Database for storage
                    and query execution. Only for data in Deep Lake Managed Database.
                    Use runtime = {"tensor_db": True} during dataset creation.
            return_score (bool): Return score with document. Default is False.

        Returns:
            List[Document]: Documents matching the TQL query.

        Raises:
            ValueError: if a search option other than the TQL string itself is
                passed (for example return_score=True), since such options are
                not supported with TQL search.
        """
        # Validate unsupported search options before running the query.
        for unsupported_argument, value in kwargs.items():
            if value is not False and value is not None:
                raise ValueError(
                    f"specifying {unsupported_argument} is "
                    "not supported with tql search."
                )

        result = self.vectorstore.search(
            query=tql,
            exec_option=exec_option,
        )
        metadatas = result["metadata"]
        texts = result["text"]

        docs = [
            Document(
                page_content=text,
                metadata=metadata,
            )
            for text, metadata in zip(texts, metadatas)
        ]

        return docs
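
    # Editorial sketch (not part of the original commit): TQL bypasses the
    # embedding pipeline entirely and is reached via the public search methods;
    # the query string below is an assumed example, not taken from this diff.
    #   docs = vector_store.similarity_search(
    #       query=None,
    #       tql="SELECT * WHERE id == '<id>'",
    #       exec_option="compute_engine",
    #   )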
    def _search(
        self,
        query: Optional[str] = None,
        embedding: Optional[Union[List[float], np.ndarray]] = None,
        embedding_function: Optional[Callable] = None,
        k: int = 4,
        distance_metric: Optional[str] = None,
        use_maximal_marginal_relevance: bool = False,
        fetch_k: Optional[int] = 20,
        filter: Optional[Union[Dict, Callable]] = None,
        return_score: bool = False,
        exec_option: Optional[str] = None,
        deep_memory: bool = False,
        **kwargs: Any,
    ) -> Union[List[Document], List[Tuple[Document, float]]]:
        """
        Return docs similar to query.

        Args:
            query (str, optional): Text to look up similar docs.
            embedding (Union[List[float], np.ndarray], optional): Query's embedding.
            embedding_function (Callable, optional): Function to convert `query`
                into embedding.
            k (int): Number of Documents to return.
            distance_metric (Optional[str], optional): `L2` for Euclidean, `L1` for
                Manhattan, `max` for L-infinity distance, `cos` for cosine similarity,
                `dot` for dot product.
            filter (Union[Dict, Callable], optional): Additional filter prior
                to the embedding search.
                - ``Dict`` - Key-value search on tensors of htype json, on an
                    AND basis (a sample must satisfy all key-value filters to be True)
                    Dict = {"tensor_name_1": {"key": value},
                    "tensor_name_2": {"key": value}}
                - ``Function`` - Any function compatible with `deeplake.filter`.
            use_maximal_marginal_relevance (bool): Use maximal marginal relevance.
            fetch_k (int): Number of Documents for MMR algorithm.
            return_score (bool): Return the score.
            exec_option (str, optional): Supports 3 ways to perform searching.
                Could be "python", "compute_engine" or "tensor_db".
                - ``python`` - Pure-python implementation for the client.
                    WARNING: not recommended for big datasets.
                - ``compute_engine`` - C++ implementation of Deep Lake Compute
                    Engine for the client. Not for in-memory or local datasets.
                - ``tensor_db`` - Hosted Managed Tensor Database for storage
                    and query execution. Only for data in Deep Lake Managed Database.
                    Use runtime = {"tensor_db": True} during dataset creation.
            deep_memory (bool): Whether to use the Deep Memory model for improving
                search results. Defaults to False if deep_memory is not specified in
                the Vector Store initialization. If True, the distance metric is set
                to "deepmemory_distance", which represents the metric with which the
                model was trained. The search is performed using the Deep Memory model.
                If False, the distance metric is set to "COS" or whatever distance
                metric the user specifies.
            **kwargs: Additional keyword arguments, including `tql` (str) to run a
                raw TQL query instead of an embedding search.

        Returns:
            List of Documents by the specified distance metric; if return_score is
            True, a list of (Document, score) tuples instead.

        Raises:
            ValueError: if neither `embedding` nor `embedding_function` is specified.
        """
        if kwargs.get("tql"):
            return self._search_tql(
                tql=kwargs["tql"],
                exec_option=exec_option,
                return_score=return_score,
                embedding=embedding,
                embedding_function=embedding_function,
                distance_metric=distance_metric,
                use_maximal_marginal_relevance=use_maximal_marginal_relevance,
                filter=filter,
            )

        if embedding_function:
            if isinstance(embedding_function, Embeddings):
                _embedding_function = embedding_function.embed_query
            else:
                _embedding_function = embedding_function
        elif self._embedding_function:
            _embedding_function = self._embedding_function.embed_query
        else:
            _embedding_function = None

        if embedding is None:
            if _embedding_function is None:
                raise ValueError(
                    "Either `embedding` or `embedding_function` needs to be"
                    " specified."
                )

            embedding = _embedding_function(query) if query else None

        if isinstance(embedding, list):
            embedding = np.array(embedding, dtype=np.float32)
            if len(embedding.shape) > 1:
                embedding = embedding[0]

        result = self.vectorstore.search(
            embedding=embedding,
            k=fetch_k if use_maximal_marginal_relevance else k,
            distance_metric=distance_metric,
            filter=filter,
            exec_option=exec_option,
            return_tensors=["embedding", "metadata", "text", self._id_tensor_name],
            deep_memory=deep_memory,
        )

        scores = result["score"]
        embeddings = result["embedding"]
        metadatas = result["metadata"]
        texts = result["text"]

        if use_maximal_marginal_relevance:
            lambda_mult = kwargs.get("lambda_mult", 0.5)
            indices = maximal_marginal_relevance(  # type: ignore
                embedding,  # type: ignore
                embeddings,
                k=min(k, len(texts)),
                lambda_mult=lambda_mult,
            )

            scores = [scores[i] for i in indices]
            texts = [texts[i] for i in indices]
            metadatas = [metadatas[i] for i in indices]

        docs = [
            Document(
                page_content=text,
                metadata=metadata,
            )
            for text, metadata in zip(texts, metadatas)
        ]

        if return_score:
            return list(zip(docs, scores))

        return docs
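
    # Editorial note (not part of the original commit): _search is the shared
    # engine behind all of the public search methods below; with
    # return_score=True it returns (Document, score) tuples rather than bare
    # Documents.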
    def similarity_search(
        self,
        query: str,
        k: int = 4,
        **kwargs: Any,
    ) -> List[Document]:
        """
        Return docs most similar to query.

        Examples:
            >>> # Search using an embedding
            >>> data = vector_store.similarity_search(
            ...     query=<your_query>,
            ...     k=<num_items>,
            ...     exec_option=<preferred_exec_option>,
            ... )
            >>> # Run tql search:
            >>> data = vector_store.similarity_search(
            ...     query=None,
            ...     tql="SELECT * WHERE id == <id>",
            ...     exec_option="compute_engine",
            ... )

        Args:
            query (str): Text to look up similar documents.
            k (int): Number of Documents to return. Defaults to 4.
            **kwargs: Additional keyword arguments include:
                embedding (Callable): Embedding function to use. Defaults to None.
                distance_metric (str): `L2` for Euclidean, `L1` for Manhattan,
                    `max` for L-infinity, `cos` for cosine, `dot` for dot product.
                    Defaults to `L2`.
                filter (Union[Dict, Callable], optional): Additional filter
                    before embedding search.
                    - Dict: Key-value search on tensors of htype json,
                        (sample must satisfy all key-value filters)
                        Dict = {"tensor_1": {"key": value}, "tensor_2": {"key": value}}
                    - Function: Compatible with `deeplake.filter`.
                    Defaults to None.
                exec_option (str): Supports 3 ways to perform searching.
                    'python', 'compute_engine', or 'tensor_db'. Defaults to 'python'.
                    - 'python': Pure-python implementation for the client.
                        WARNING: not recommended for big datasets.
                    - 'compute_engine': C++ implementation of the Compute Engine for
                        the client. Not for in-memory or local datasets.
                    - 'tensor_db': Managed Tensor Database for storage and query.
                        Only for data in Deep Lake Managed Database.
                        Use `runtime = {"tensor_db": True}` during dataset creation.
                deep_memory (bool): Whether to use the Deep Memory model for improving
                    search results. Defaults to False if deep_memory is not specified
                    in the Vector Store initialization. If True, the distance metric
                    is set to "deepmemory_distance", which represents the metric with
                    which the model was trained. The search is performed using the Deep
                    Memory model. If False, the distance metric is set to "COS" or
                    whatever distance metric the user specifies.

        Returns:
            List[Document]: List of Documents most similar to the query vector.
        """
        return self._search(
            query=query,
            k=k,
            use_maximal_marginal_relevance=False,
            return_score=False,
            **kwargs,
        )
    def similarity_search_by_vector(
        self,
        embedding: Union[List[float], np.ndarray],
        k: int = 4,
        **kwargs: Any,
    ) -> List[Document]:
        """
        Return docs most similar to embedding vector.

        Examples:
            >>> # Search using an embedding
            >>> data = vector_store.similarity_search_by_vector(
            ...     embedding=<your_embedding>,
            ...     k=<num_items_to_return>,
            ...     exec_option=<preferred_exec_option>,
            ... )

        Args:
            embedding (Union[List[float], np.ndarray]):
                Embedding to find similar docs.
            k (int): Number of Documents to return. Defaults to 4.
            **kwargs: Additional keyword arguments including:
                filter (Union[Dict, Callable], optional):
                    Additional filter before embedding search.
                    - ``Dict`` - Key-value search on tensors of htype json. True
                        if all key-value filters are satisfied.
                        Dict = {"tensor_name_1": {"key": value},
                        "tensor_name_2": {"key": value}}
                    - ``Function`` - Any function compatible with
                        `deeplake.filter`.
                    Defaults to None.
                exec_option (str): Options for search execution include
                    "python", "compute_engine", or "tensor_db". Defaults to
                    "python".
                    - "python" - Pure-python implementation running on the client.
                        Can be used for data stored anywhere. WARNING: using this
                        option with big datasets is discouraged due to potential
                        memory issues.
                    - "compute_engine" - Performant C++ implementation of the Deep
                        Lake Compute Engine. Runs on the client and can be used for
                        any data stored in or connected to Deep Lake. It cannot be
                        used with in-memory or local datasets.
                    - "tensor_db" - Performant, fully-hosted Managed Tensor Database.
                        Responsible for storage and query execution. Only available
                        for data stored in the Deep Lake Managed Database.
                        To store datasets in this database, specify
                        `runtime = {"tensor_db": True}` during dataset creation.
                distance_metric (str): `L2` for Euclidean, `L1` for Manhattan,
                    `max` for L-infinity distance, `cos` for cosine similarity,
                    `dot` for dot product. Defaults to `L2`.
                deep_memory (bool): Whether to use the Deep Memory model for improving
                    search results. Defaults to False if deep_memory is not specified
                    in the Vector Store initialization. If True, the distance metric
                    is set to "deepmemory_distance", which represents the metric with
                    which the model was trained. The search is performed using the Deep
                    Memory model. If False, the distance metric is set to "COS" or
                    whatever distance metric the user specifies.

        Returns:
            List[Document]: List of Documents most similar to the query vector.
        """
        return self._search(
            embedding=embedding,
            k=k,
            use_maximal_marginal_relevance=False,
            return_score=False,
            **kwargs,
        )
    def similarity_search_with_score(
        self,
        query: str,
        k: int = 4,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """
        Run similarity search with Deep Lake with distance returned.

        Examples:
            >>> data = vector_store.similarity_search_with_score(
            ...     query=<your_query>,
            ...     embedding=<your_embedding_function>,
            ...     k=<number_of_items_to_return>,
            ...     exec_option=<preferred_exec_option>,
            ... )

        Args:
            query (str): Query text to search for.
            k (int): Number of results to return. Defaults to 4.
            **kwargs: Additional keyword arguments. Some of these arguments are:
                distance_metric: `L2` for Euclidean, `L1` for Manhattan, `max` for
                    L-infinity distance, `cos` for cosine similarity, `dot` for dot
                    product. Defaults to `L2`.
                filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to
                    None.
                embedding_function (Callable): Embedding function to use. Defaults
                    to None.
                exec_option (str): DeepLakeVectorStore supports 3 ways to perform
                    searching. It could be either "python", "compute_engine" or
                    "tensor_db". Defaults to "python".
                    - "python" - Pure-python implementation running on the client.
                        Can be used for data stored anywhere. WARNING: using this
                        option with big datasets is discouraged due to potential
                        memory issues.
                    - "compute_engine" - Performant C++ implementation of the Deep
                        Lake Compute Engine. Runs on the client and can be used for
                        any data stored in or connected to Deep Lake. It cannot be used
                        with in-memory or local datasets.
                    - "tensor_db" - Performant, fully-hosted Managed Tensor Database.
                        Responsible for storage and query execution. Only available for
                        data stored in the Deep Lake Managed Database. To store datasets
                        in this database, specify `runtime = {"tensor_db": True}`
                        during dataset creation.
                deep_memory (bool): Whether to use the Deep Memory model for improving
                    search results. Defaults to False if deep_memory is not specified
                    in the Vector Store initialization. If True, the distance metric
                    is set to "deepmemory_distance", which represents the metric with
                    which the model was trained. The search is performed using the Deep
                    Memory model. If False, the distance metric is set to "COS" or
                    whatever distance metric the user specifies.

        Returns:
            List[Tuple[Document, float]]: List of documents most similar to the query
            text, each paired with its distance as a float.
        """
        return self._search(
            query=query,
            k=k,
            return_score=True,
            **kwargs,
        )
    def max_marginal_relevance_search_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        exec_option: Optional[str] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """
        Return docs selected using the maximal marginal relevance. Maximal marginal
        relevance optimizes for similarity to query AND diversity among selected docs.

        Examples:
            >>> data = vector_store.max_marginal_relevance_search_by_vector(
            ...     embedding=<your_embedding>,
            ...     fetch_k=<elements_to_fetch_before_mmr_search>,
            ...     k=<number_of_items_to_return>,
            ...     exec_option=<preferred_exec_option>,
            ... )

        Args:
            embedding: Embedding to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            fetch_k: Number of Documents to fetch for MMR algorithm.
            lambda_mult: Number between 0 and 1 determining the degree of diversity.
                0 corresponds to max diversity and 1 to min diversity. Defaults to 0.5.
            exec_option (str): DeepLakeVectorStore supports 3 ways for searching.
                Could be "python", "compute_engine" or "tensor_db". Defaults to
                "python".
                - "python" - Pure-python implementation running on the client.
                    Can be used for data stored anywhere. WARNING: using this
                    option with big datasets is discouraged due to potential
                    memory issues.
                - "compute_engine" - Performant C++ implementation of the Deep
                    Lake Compute Engine. Runs on the client and can be used for
                    any data stored in or connected to Deep Lake. It cannot be used
                    with in-memory or local datasets.
                - "tensor_db" - Performant, fully-hosted Managed Tensor Database.
                    Responsible for storage and query execution. Only available for
                    data stored in the Deep Lake Managed Database. To store datasets
                    in this database, specify `runtime = {"tensor_db": True}`
                    during dataset creation.
            deep_memory (bool): Whether to use the Deep Memory model for improving
                search results. Defaults to False if deep_memory is not specified
                in the Vector Store initialization. If True, the distance metric
                is set to "deepmemory_distance", which represents the metric with
                which the model was trained. The search is performed using the Deep
                Memory model. If False, the distance metric is set to "COS" or
                whatever distance metric the user specifies.
            **kwargs: Additional keyword arguments.

        Returns:
            List[Document]: A list of documents.
        """
        return self._search(
            embedding=embedding,
            k=k,
            fetch_k=fetch_k,
            use_maximal_marginal_relevance=True,
            lambda_mult=lambda_mult,
            exec_option=exec_option,
            **kwargs,
        )
    def max_marginal_relevance_search(
        self,
        query: str,
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        exec_option: Optional[str] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """Return docs selected using maximal marginal relevance.

        Maximal marginal relevance optimizes for similarity to query AND diversity
        among selected documents.

        Examples:
            >>> # Search using an embedding
            >>> data = vector_store.max_marginal_relevance_search(
            ...     query = <query_to_search>,
            ...     embedding_function = <embedding_function_for_query>,
            ...     k = <number_of_items_to_return>,
            ...     exec_option = <preferred_exec_option>,
            ... )

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            fetch_k: Number of Documents for MMR algorithm.
            lambda_mult: Value between 0 and 1. 0 corresponds
                to maximum diversity and 1 to minimum.
                Defaults to 0.5.
            exec_option (str): Supports 3 ways to perform searching.
                - "python" - Pure-python implementation running on the client.
                    Can be used for data stored anywhere. WARNING: using this
                    option with big datasets is discouraged due to potential
                    memory issues.
                - "compute_engine" - Performant C++ implementation of the Deep
                    Lake Compute Engine. Runs on the client and can be used for
                    any data stored in or connected to Deep Lake. It cannot be
                    used with in-memory or local datasets.
                - "tensor_db" - Performant, fully-hosted Managed Tensor Database.
                    Responsible for storage and query execution. Only available
                    for data stored in the Deep Lake Managed Database. To store
                    datasets in this database, specify
                    `runtime = {"tensor_db": True}` during dataset creation.
            deep_memory (bool): Whether to use the Deep Memory model for improving
                search results. Defaults to False if deep_memory is not specified
                in the Vector Store initialization. If True, the distance metric
                is set to "deepmemory_distance", which represents the metric with
                which the model was trained. The search is performed using the Deep
                Memory model. If False, the distance metric is set to "COS" or
                whatever distance metric the user specifies.
            **kwargs: Additional keyword arguments.

        Returns:
            List of Documents selected by maximal marginal relevance.

        Raises:
            ValueError: when MMR search is requested but no embedding function
                was specified.
        """
        embedding_function = kwargs.get("embedding") or self._embedding_function
        if embedding_function is None:
            raise ValueError(
                "For MMR search, you must specify an embedding function at"
                " creation time or during an add call."
            )
        return self._search(
            query=query,
            k=k,
            fetch_k=fetch_k,
            use_maximal_marginal_relevance=True,
            lambda_mult=lambda_mult,
            exec_option=exec_option,
            embedding_function=embedding_function,  # type: ignore
            **kwargs,
        )
    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Optional[Embeddings] = None,
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None,
        dataset_path: str = _LANGCHAIN_DEFAULT_DEEPLAKE_PATH,
        **kwargs: Any,
    ) -> DeepLake:
        """Create a Deep Lake dataset from raw documents.

        If a dataset_path is specified, the dataset will be persisted in that
        location, otherwise by default at `./deeplake`

        Examples:
            >>> # Create a vector store from texts
            >>> vector_store = DeepLake.from_texts(
            ...     texts = <the_texts_that_you_want_to_embed>,
            ...     embedding = <embedding_function>,
            ...     dataset_path = <path_for_storing_data>,
            ... )

        Args:
            dataset_path (str): The full path to the dataset. Can be:
                - Deep Lake cloud path of the form ``hub://username/dataset_name``.
                    To write to Deep Lake cloud datasets,
                    ensure that you are logged in to Deep Lake
                    (use 'activeloop login' from command line)
                - AWS S3 path of the form ``s3://bucketname/path/to/dataset``.
                    Credentials are required in either the environment
                - Google Cloud Storage path of the form
                    ``gcs://bucketname/path/to/dataset`` Credentials are required
                    in either the environment
                - Local file system path of the form ``./path/to/dataset`` or
                    ``~/path/to/dataset`` or ``path/to/dataset``.
                - In-memory path of the form ``mem://path/to/dataset`` which doesn't
                    save the dataset, but keeps it in memory instead.
                    Should be used only for testing as it does not persist.
            texts (List[str]): List of texts to add.
            embedding (Optional[Embeddings]): Embedding function. Defaults to None.
                Note, in other places, it is called embedding_function.
            metadatas (Optional[List[dict]]): List of metadatas. Defaults to None.
            ids (Optional[List[str]]): List of document IDs. Defaults to None.
            **kwargs: Additional keyword arguments.

        Returns:
            DeepLake: Deep Lake dataset.
        """
        deeplake_dataset = cls(dataset_path=dataset_path, embedding=embedding, **kwargs)
        deeplake_dataset.add_texts(
            texts=texts,
            metadatas=metadatas,
            ids=ids,
        )
        return deeplake_dataset
    def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> bool:
        """Delete the entities in the dataset.

        Args:
            ids (Optional[List[str]], optional): The document_ids to delete.
                Defaults to None.
            **kwargs: Other keyword arguments that subclasses might use.
                - filter (Optional[Dict[str, str]], optional): The filter to delete by.
                - delete_all (Optional[bool], optional): Whether to drop the dataset.

        Returns:
            bool: Whether the delete operation was successful.
        """
        filter = kwargs.get("filter")
        delete_all = kwargs.get("delete_all")

        self.vectorstore.delete(ids=ids, filter=filter, delete_all=delete_all)

        return True
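
    # Editorial sketch (not part of the original commit): deletion can target
    # explicit ids or a metadata filter; the values below are assumed examples.
    #   vector_store.delete(ids=["<id_1>", "<id_2>"])
    #   vector_store.delete(filter={"metadata": {"source": "demo"}})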
    @classmethod
    def force_delete_by_path(cls, path: str) -> None:
        """Force delete dataset by path.

        Args:
            path (str): path of the dataset to delete.

        Raises:
            ValueError: if deeplake is not installed.
        """
        try:
            import deeplake
        except ImportError:
            raise ValueError(
                "Could not import deeplake python package. "
                "Please install it with `pip install deeplake`."
            )
        deeplake.delete(path, large_ok=True, force=True)

    def delete_dataset(self) -> None:
        """Delete the collection."""
        self.delete(delete_all=True)

    def ds(self) -> Any:
        logger.warning(
            "This method is deprecated and will be removed in the future. "
            "Please use `db.vectorstore.dataset` instead."
        )
        return self.vectorstore.dataset
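
A minimal end-to-end sketch of the relocated class, assuming OpenAI credentials are configured; the texts, queries, and `k`/`fetch_k` values are illustrative only:

```
from langchain_community.embeddings.openai import OpenAIEmbeddings
from langchain_community.vectorstores import DeepLake

# Create (or load) a local dataset and embed a few texts.
embeddings = OpenAIEmbeddings()
vector_store = DeepLake.from_texts(
    texts=["deep lake stores raw data", "it also stores embeddings"],
    embedding=embeddings,
    dataset_path="./deeplake/",
)

# Plain similarity search, then a diversity-aware MMR search.
docs = vector_store.similarity_search("what does deep lake store?", k=2)
mmr_docs = vector_store.max_marginal_relevance_search(
    "what does deep lake store?", k=2, fetch_k=10
)
```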