From e192f6b6eba2aafdab52d2e981e1abfe317b6a84 Mon Sep 17 00:00:00 2001
From: mwmajewsk <5279578+mmajewsk@users.noreply.github.com>
Date: Fri, 1 Mar 2024 20:21:21 +0100
Subject: [PATCH] community[patch]: fix, better error message in deeplake vectoriser (#18397)

If the document loader receives a pathlib `Path` instead of a `str`, it reads the file correctly, but the problem begins when the document is added to DeepLake. It arises because the path is never cast to `str`, so the raw `Path` object ends up in the document metadata.

```python
from pathlib import Path

from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Chroma, DeepLake

# ds_path, embeddings and activeloop_token are defined elsewhere
deeplake = True
fname = Path('./lorem_ipsum.txt')
# passing a Path instead of a str is what triggers the failure below
loader = TextLoader(fname, encoding="utf-8")
docs = loader.load_and_split()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = text_splitter.split_documents(docs)
if deeplake:
    db = DeepLake(dataset_path=ds_path, embedding=embeddings, token=activeloop_token)
    db.add_documents(chunks)
else:
    db = Chroma.from_documents(docs, embeddings)
```

With this snippet, the error message for DeepLake looks like this:

```
[part of error message omitted]
Traceback (most recent call last):
  File "/home/mwm/repositories/sources/fixing_langchain/main.py", line 53, in <module>
    db.add_documents(chunks)
  File "/home/mwm/repositories/sources/langchain/libs/core/langchain_core/vectorstores.py", line 139, in add_documents
    return self.add_texts(texts, metadatas, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/mwm/repositories/sources/langchain/libs/community/langchain_community/vectorstores/deeplake.py", line 258, in add_texts
    return self.vectorstore.add(
           ^^^^^^^^^^^^^^^^^^^^^
  File "/home/mwm/anaconda3/envs/langchain/lib/python3.11/site-packages/deeplake/core/vectorstore/deeplake_vectorstore.py", line 226, in add
    return self.dataset_handler.add(
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/mwm/anaconda3/envs/langchain/lib/python3.11/site-packages/deeplake/core/vectorstore/dataset_handlers/client_side_dataset_handler.py", line 139, in add
    dataset_utils.extend_or_ingest_dataset(
  File "/home/mwm/anaconda3/envs/langchain/lib/python3.11/site-packages/deeplake/core/vectorstore/vector_search/dataset/dataset.py", line 544, in extend_or_ingest_dataset
    extend(
  File "/home/mwm/anaconda3/envs/langchain/lib/python3.11/site-packages/deeplake/core/vectorstore/vector_search/dataset/dataset.py", line 505, in extend
    dataset.extend(batched_processed_tensors, progressbar=False)
  File "/home/mwm/anaconda3/envs/langchain/lib/python3.11/site-packages/deeplake/core/dataset/dataset.py", line 3247, in extend
    raise SampleExtendError(str(e)) from e.__cause__
deeplake.util.exceptions.SampleExtendError: Failed to append a sample to the tensor 'metadata'. See more details in the traceback. If you wish to skip the samples that cause errors, please specify `ignore_errors=True`.
```

This does not explain the error well enough.
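(For completeness: the failure can be avoided on the caller's side by casting the path to `str` before it reaches the loader — a minimal sketch of that workaround, assuming the same setup as the snippet above.)

```python
from pathlib import Path

from langchain_community.document_loaders import TextLoader

fname = Path('./lorem_ipsum.txt')

# Casting the path to str keeps the "source" metadata value a plain string,
# which both DeepLake and Chroma can serialize without raising.
loader = TextLoader(str(fname), encoding="utf-8")
docs = loader.load_and_split()
```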
The same error for Chroma looks like this:

```
During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/mwm/repositories/sources/fixing_langchain/main.py", line 56, in <module>
    db = Chroma.from_documents(docs, embeddings)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/mwm/repositories/sources/langchain/libs/community/langchain_community/vectorstores/chroma.py", line 778, in from_documents
    return cls.from_texts(
           ^^^^^^^^^^^^^^^
  File "/home/mwm/repositories/sources/langchain/libs/community/langchain_community/vectorstores/chroma.py", line 736, in from_texts
    chroma_collection.add_texts(
  File "/home/mwm/repositories/sources/langchain/libs/community/langchain_community/vectorstores/chroma.py", line 309, in add_texts
    raise ValueError(e.args[0] + "\n\n" + msg)
ValueError: Expected metadata value to be a str, int, float or bool, got lorem_ipsum.txt which is a <class 'pathlib.PosixPath'>

Try filtering complex metadata from the document using langchain_community.vectorstores.utils.filter_complex_metadata.
```

This is far more user friendly, so I added information about the possible type mismatch to the DeepLake error message, the same way it is handled in Chroma:
https://github.com/langchain-ai/langchain/blob/master/libs/community/langchain_community/vectorstores/chroma.py#L224
---
 .../vectorstores/deeplake.py | 29 +++++++++++++------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/libs/community/langchain_community/vectorstores/deeplake.py b/libs/community/langchain_community/vectorstores/deeplake.py
index 7240d166f67..5ece84dc557 100644
--- a/libs/community/langchain_community/vectorstores/deeplake.py
+++ b/libs/community/langchain_community/vectorstores/deeplake.py
@@ -9,6 +9,7 @@ try:
     import deeplake
     from deeplake import VectorStore as DeepLakeVectorStore
     from deeplake.core.fast_forwarding import version_compare
+    from deeplake.util.exceptions import SampleExtendError

     _DEEPLAKE_INSTALLED = True
 except ImportError:
@@ -255,15 +256,25 @@ class DeepLake(VectorStore):
         elif len(texts) == 0:
             raise ValueError("`texts` parameter shouldn't be empty.")

-        return self.vectorstore.add(
-            text=texts,
-            metadata=metadatas,
-            embedding_data=texts,
-            embedding_tensor="embedding",
-            embedding_function=self._embedding_function.embed_documents,  # type: ignore
-            return_ids=True,
-            **kwargs,
-        )
+        try:
+            return self.vectorstore.add(
+                text=texts,
+                metadata=metadatas,
+                embedding_data=texts,
+                embedding_tensor="embedding",
+                embedding_function=self._embedding_function.embed_documents,  # type: ignore
+                return_ids=True,
+                **kwargs,
+            )
+        except SampleExtendError as e:
+            if "Failed to append a sample to the tensor 'metadata'" in str(e):
+                msg = (
+                    "**Hint: You might be using invalid type of argument in "
+                    "document loader (e.g. 'pathlib.PosixPath' instead of 'str')"
+                )
+                raise ValueError(e.args[0] + "\n\n" + msg)
+            else:
+                raise e

     def _search_tql(
         self,
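
As a usage note (not part of the patch): when the new hint fires, the caller-side fix is to make sure every metadata value is a plain scalar before calling `add_documents`. A small, hypothetical helper sketching that — `stringify_path_metadata` is illustrative only and assumes documents with a standard `metadata` dict:

```python
from pathlib import Path

from langchain_core.documents import Document


def stringify_path_metadata(docs: list[Document]) -> list[Document]:
    """Hypothetical helper (not part of this patch): cast pathlib.Path metadata
    values to str so vector stores that expect plain scalars can store them."""
    for doc in docs:
        doc.metadata = {
            key: str(value) if isinstance(value, Path) else value
            for key, value in doc.metadata.items()
        }
    return docs


# e.g. db.add_documents(stringify_path_metadata(chunks))
```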