diff --git a/libs/community/langchain_community/vectorstores/deeplake.py b/libs/community/langchain_community/vectorstores/deeplake.py index 7051988b92b..52fe55fa7ce 100644 --- a/libs/community/langchain_community/vectorstores/deeplake.py +++ b/libs/community/langchain_community/vectorstores/deeplake.py @@ -51,6 +51,7 @@ class DeepLake(VectorStore): """ _LANGCHAIN_DEFAULT_DEEPLAKE_PATH = "./deeplake/" + _valid_search_kwargs = ["lambda_mult"] def __init__( self, @@ -219,11 +220,7 @@ class DeepLake(VectorStore): Returns: List[str]: List of IDs of the added texts. """ - if kwargs: - unsupported_items = "`, `".join(set(kwargs.keys())) - raise TypeError( - f"`{unsupported_items}` is/are not a valid argument to add_text method" - ) + self._validate_kwargs(kwargs, "add_texts") kwargs = {} if ids: @@ -371,6 +368,9 @@ class DeepLake(VectorStore): Raises: ValueError: if both `embedding` and `embedding_function` are not specified. """ + if kwargs.get("tql_query"): + logger.warning("`tql_query` is deprecated. Please use `tql` instead.") + kwargs["tql"] = kwargs.pop("tql_query") if kwargs.get("tql"): return self._search_tql( @@ -384,6 +384,8 @@ class DeepLake(VectorStore): filter=filter, ) + self._validate_kwargs(kwargs, "search") + if embedding_function: if isinstance(embedding_function, Embeddings): _embedding_function = embedding_function.embed_query @@ -417,7 +419,6 @@ class DeepLake(VectorStore): return_tensors=["embedding", "metadata", "text", self._id_tensor_name], deep_memory=deep_memory, ) - scores = result["score"] embeddings = result["embedding"] metadatas = result["metadata"] @@ -445,6 +446,9 @@ class DeepLake(VectorStore): ] if return_score: + if not isinstance(scores, list): + scores = [scores] + return [(doc, score) for doc, score in zip(docs, scores)] return docs @@ -899,3 +903,30 @@ class DeepLake(VectorStore): "better to use `db.vectorstore.dataset` instead." 
) return self.vectorstore.dataset + + @classmethod + def _validate_kwargs(cls, kwargs, method_name): + if kwargs: + valid_items = cls._get_valid_args(method_name) + unsupported_items = cls._get_unsupported_items(kwargs, valid_items) + + if unsupported_items: + raise TypeError( + f"`{unsupported_items}` are not a valid " + f"argument to {method_name} method" + ) + + @classmethod + def _get_valid_args(cls, method_name): + if method_name == "search": + return cls._valid_search_kwargs + else: + return [] + + @staticmethod + def _get_unsupported_items(kwargs, valid_items): + kwargs = {k: v for k, v in kwargs.items() if k not in valid_items} + unsupported_items = None + if kwargs: + unsupported_items = "`, `".join(set(kwargs.keys())) + return unsupported_items diff --git a/libs/community/tests/integration_tests/vectorstores/test_deeplake.py b/libs/community/tests/integration_tests/vectorstores/test_deeplake.py index e15fdb9694e..99d4d6def2e 100644 --- a/libs/community/tests/integration_tests/vectorstores/test_deeplake.py +++ b/libs/community/tests/integration_tests/vectorstores/test_deeplake.py @@ -18,7 +18,9 @@ def deeplake_datastore() -> DeepLake: embedding_function=FakeEmbeddings(), overwrite=True, ) - return docsearch + yield docsearch + + docsearch.delete_dataset() @pytest.fixture(params=["L1", "L2", "max", "cos"]) @@ -50,27 +52,14 @@ def test_deeplake_with_metadatas() -> None: assert output == [Document(page_content="foo", metadata={"page": "0"})] -def test_deeplakewith_persistence() -> None: +def test_deeplake_with_persistence(deeplake_datastore) -> None: """Test end to end construction and search, with persistence.""" - import deeplake - - dataset_path = "./tests/persist_dir" - if deeplake.exists(dataset_path): - deeplake.delete(dataset_path) - - texts = ["foo", "bar", "baz"] - docsearch = DeepLake.from_texts( - dataset_path=dataset_path, - texts=texts, - embedding=FakeEmbeddings(), - ) - - output = docsearch.similarity_search("foo", k=1) - assert output == 
[Document(page_content="foo")] + output = deeplake_datastore.similarity_search("foo", k=1) + assert output == [Document(page_content="foo", metadata={"page": "0"})] # Get a new VectorStore from the persisted directory docsearch = DeepLake( - dataset_path=dataset_path, + dataset_path=deeplake_datastore.vectorstore.dataset_handler.path, embedding_function=FakeEmbeddings(), ) output = docsearch.similarity_search("foo", k=1) @@ -83,22 +72,12 @@ def test_deeplakewith_persistence() -> None: # Or on program exit -def test_deeplake_overwrite_flag() -> None: +def test_deeplake_overwrite_flag(deeplake_datastore) -> None: """Test overwrite behavior""" - import deeplake + dataset_path = deeplake_datastore.vectorstore.dataset_handler.path - dataset_path = "./tests/persist_dir" - if deeplake.exists(dataset_path): - deeplake.delete(dataset_path) - - texts = ["foo", "bar", "baz"] - docsearch = DeepLake.from_texts( - dataset_path=dataset_path, - texts=texts, - embedding=FakeEmbeddings(), - ) - output = docsearch.similarity_search("foo", k=1) - assert output == [Document(page_content="foo")] + output = deeplake_datastore.similarity_search("foo", k=1) + assert output == [Document(page_content="foo", metadata={"page": "0"})] # Get a new VectorStore from the persisted directory, with no overwrite (implicit) docsearch = DeepLake( @@ -107,7 +86,7 @@ def test_deeplake_overwrite_flag() -> None: ) output = docsearch.similarity_search("foo", k=1) # assert page still present - assert output == [Document(page_content="foo")] + assert output == [Document(page_content="foo", metadata={"page": "0"})] # Get a new VectorStore from the persisted directory, with no overwrite (explicit) docsearch = DeepLake( @@ -117,7 +96,7 @@ def test_deeplake_overwrite_flag() -> None: ) output = docsearch.similarity_search("foo", k=1) # assert page still present - assert output == [Document(page_content="foo")] + assert output == [Document(page_content="foo", metadata={"page": "0"})] # Get a new VectorStore from the 
persisted directory, with overwrite docsearch = DeepLake( @@ -129,8 +108,9 @@ def test_deeplake_overwrite_flag() -> None: output = docsearch.similarity_search("foo", k=1) -def test_similarity_search(deeplake_datastore: DeepLake, distance_metric: str) -> None: +def test_similarity_search(deeplake_datastore) -> None: """Test similarity search.""" + distance_metric = "cos" output = deeplake_datastore.similarity_search( "foo", k=1, distance_metric=distance_metric ) @@ -145,7 +125,6 @@ def test_similarity_search(deeplake_datastore: DeepLake, distance_metric: str) - query="foo", tql_query=tql_query, k=1, distance_metric=distance_metric ) assert len(output) == 1 - deeplake_datastore.delete_dataset() def test_similarity_search_by_vector( @@ -164,6 +143,7 @@ def test_similarity_search_with_score( deeplake_datastore: DeepLake, distance_metric: str ) -> None: """Test similarity search with score.""" + deeplake_datastore.vectorstore.summary() output, score = deeplake_datastore.similarity_search_with_score( "foo", k=1, distance_metric=distance_metric )[0] @@ -281,3 +261,11 @@ def test_ids_backwards_compatibility() -> None: ) output = db.similarity_search("foo", k=1) assert len(output) == 1 + + +def test_similarity_search_should_error_out_when_not_supported_kwargs_are_provided( + deeplake_datastore: DeepLake, +) -> None: + """Test that similarity_search raises TypeError on unsupported kwargs.""" + with pytest.raises(TypeError): + deeplake_datastore.similarity_search("foo", k=1, not_supported_kwarg=True)