mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-29 09:58:44 +00:00
community[patch]: fix, better error message in deeplake vectoriser (#18397)
If the document loader receives a `pathlib.Path` instead of a `str`, it reads the file correctly, but the problem begins when the document is added to Deeplake. The problem arises because the path is stored in the metadata as a `pathlib.Path` object instead of being cast to `str`. ```python deeplake = True fname = Path('./lorem_ipsum.txt') loader = TextLoader(fname, encoding="utf-8") docs = loader.load_and_split() text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100) chunks= text_splitter.split_documents(docs) if deeplake: db = DeepLake(dataset_path=ds_path, embedding=embeddings, token=activeloop_token) db.add_documents(chunks) else: db = Chroma.from_documents(docs, embeddings) ``` With this snippet, the error message for Deeplake looks like this: ``` [part of error message omitted] Traceback (most recent call last): File "/home/mwm/repositories/sources/fixing_langchain/main.py", line 53, in <module> db.add_documents(chunks) File "/home/mwm/repositories/sources/langchain/libs/core/langchain_core/vectorstores.py", line 139, in add_documents return self.add_texts(texts, metadatas, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/mwm/repositories/sources/langchain/libs/community/langchain_community/vectorstores/deeplake.py", line 258, in add_texts return self.vectorstore.add( ^^^^^^^^^^^^^^^^^^^^^ File "/home/mwm/anaconda3/envs/langchain/lib/python3.11/site-packages/deeplake/core/vectorstore/deeplake_vectorstore.py", line 226, in add return self.dataset_handler.add( ^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/mwm/anaconda3/envs/langchain/lib/python3.11/site-packages/deeplake/core/vectorstore/dataset_handlers/client_side_dataset_handler.py", line 139, in add dataset_utils.extend_or_ingest_dataset( File "/home/mwm/anaconda3/envs/langchain/lib/python3.11/site-packages/deeplake/core/vectorstore/vector_search/dataset/dataset.py", line 544, in extend_or_ingest_dataset extend( File 
"/home/mwm/anaconda3/envs/langchain/lib/python3.11/site-packages/deeplake/core/vectorstore/vector_search/dataset/dataset.py", line 505, in extend dataset.extend(batched_processed_tensors, progressbar=False) File "/home/mwm/anaconda3/envs/langchain/lib/python3.11/site-packages/deeplake/core/dataset/dataset.py", line 3247, in extend raise SampleExtendError(str(e)) from e.__cause__ deeplake.util.exceptions.SampleExtendError: Failed to append a sample to the tensor 'metadata'. See more details in the traceback. If you wish to skip the samples that cause errors, please specify `ignore_errors=True`. ``` This does not explain the error well enough. The same error for Chroma looks like this: ``` During handling of the above exception, another exception occurred: Traceback (most recent call last): File "/home/mwm/repositories/sources/fixing_langchain/main.py", line 56, in <module> db = Chroma.from_documents(docs, embeddings) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/mwm/repositories/sources/langchain/libs/community/langchain_community/vectorstores/chroma.py", line 778, in from_documents return cls.from_texts( ^^^^^^^^^^^^^^^ File "/home/mwm/repositories/sources/langchain/libs/community/langchain_community/vectorstores/chroma.py", line 736, in from_texts chroma_collection.add_texts( File "/home/mwm/repositories/sources/langchain/libs/community/langchain_community/vectorstores/chroma.py", line 309, in add_texts raise ValueError(e.args[0] + "\n\n" + msg) ValueError: Expected metadata value to be a str, int, float or bool, got lorem_ipsum.txt which is a <class 'pathlib.PosixPath'> Try filtering complex metadata from the document using langchain_community.vectorstores.utils.filter_complex_metadata. 
``` This is far more user-friendly, so I added a hint about a possible type mismatch to the Deeplake error message, the same way it is handled in Chroma: https://github.com/langchain-ai/langchain/blob/master/libs/community/langchain_community/vectorstores/chroma.py#L224
This commit is contained in:
parent
7d962278f6
commit
e192f6b6eb
@ -9,6 +9,7 @@ try:
|
|||||||
import deeplake
|
import deeplake
|
||||||
from deeplake import VectorStore as DeepLakeVectorStore
|
from deeplake import VectorStore as DeepLakeVectorStore
|
||||||
from deeplake.core.fast_forwarding import version_compare
|
from deeplake.core.fast_forwarding import version_compare
|
||||||
|
from deeplake.util.exceptions import SampleExtendError
|
||||||
|
|
||||||
_DEEPLAKE_INSTALLED = True
|
_DEEPLAKE_INSTALLED = True
|
||||||
except ImportError:
|
except ImportError:
|
||||||
@ -255,6 +256,7 @@ class DeepLake(VectorStore):
|
|||||||
elif len(texts) == 0:
|
elif len(texts) == 0:
|
||||||
raise ValueError("`texts` parameter shouldn't be empty.")
|
raise ValueError("`texts` parameter shouldn't be empty.")
|
||||||
|
|
||||||
|
try:
|
||||||
return self.vectorstore.add(
|
return self.vectorstore.add(
|
||||||
text=texts,
|
text=texts,
|
||||||
metadata=metadatas,
|
metadata=metadatas,
|
||||||
@ -264,6 +266,15 @@ class DeepLake(VectorStore):
|
|||||||
return_ids=True,
|
return_ids=True,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
except SampleExtendError as e:
|
||||||
|
if "Failed to append a sample to the tensor 'metadata'" in str(e):
|
||||||
|
msg = (
|
||||||
|
"**Hint: You might be using invalid type of argument in "
|
||||||
|
"document loader (e.g. 'pathlib.PosixPath' instead of 'str')"
|
||||||
|
)
|
||||||
|
raise ValueError(e.args[0] + "\n\n" + msg)
|
||||||
|
else:
|
||||||
|
raise e
|
||||||
|
|
||||||
def _search_tql(
|
def _search_tql(
|
||||||
self,
|
self,
|
||||||
|
Loading…
Reference in New Issue
Block a user