Mirror of https://github.com/hwchase17/langchain.git, synced 2025-08-06 11:37:12 +00:00.
Commit: Bagatur/filter metadata (#9015)
Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io>
This commit is contained in:
parent
a429145420
commit
3b754b5461
@ -205,12 +205,22 @@ class Chroma(VectorStore):
|
|||||||
[embeddings[idx] for idx in non_empty_ids] if embeddings else None
|
[embeddings[idx] for idx in non_empty_ids] if embeddings else None
|
||||||
)
|
)
|
||||||
ids_with_metadata = [ids[idx] for idx in non_empty_ids]
|
ids_with_metadata = [ids[idx] for idx in non_empty_ids]
|
||||||
self._collection.upsert(
|
try:
|
||||||
metadatas=metadatas,
|
self._collection.upsert(
|
||||||
embeddings=embeddings_with_metadatas,
|
metadatas=metadatas,
|
||||||
documents=texts_with_metadatas,
|
embeddings=embeddings_with_metadatas,
|
||||||
ids=ids_with_metadata,
|
documents=texts_with_metadatas,
|
||||||
)
|
ids=ids_with_metadata,
|
||||||
|
)
|
||||||
|
except ValueError as e:
|
||||||
|
if "Expected metadata value to be" in str(e):
|
||||||
|
msg = (
|
||||||
|
"Try filtering complex metadata from the document using "
|
||||||
|
"langchain.vectorstore.utils.filter_complex_metadata."
|
||||||
|
)
|
||||||
|
raise ValueError(e.args[0] + "\n\n" + msg)
|
||||||
|
else:
|
||||||
|
raise e
|
||||||
if empty_ids:
|
if empty_ids:
|
||||||
texts_without_metadatas = [texts[j] for j in empty_ids]
|
texts_without_metadatas = [texts[j] for j in empty_ids]
|
||||||
embeddings_without_metadatas = (
|
embeddings_without_metadatas = (
|
||||||
|
@ -1,10 +1,11 @@
|
|||||||
"""Utility functions for working with vectors and vectorstores."""
|
"""Utility functions for working with vectors and vectorstores."""
|
||||||
|
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from typing import List
|
from typing import List, Tuple, Type
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
|
from langchain.docstore.document import Document
|
||||||
from langchain.utils.math import cosine_similarity
|
from langchain.utils.math import cosine_similarity
|
||||||
|
|
||||||
|
|
||||||
@ -51,3 +52,23 @@ def maximal_marginal_relevance(
|
|||||||
idxs.append(idx_to_add)
|
idxs.append(idx_to_add)
|
||||||
selected = np.append(selected, [embedding_list[idx_to_add]], axis=0)
|
selected = np.append(selected, [embedding_list[idx_to_add]], axis=0)
|
||||||
return idxs
|
return idxs
|
||||||
|
|
||||||
|
|
||||||
|
def filter_complex_metadata(
    documents: List[Document],
    *,
    allowed_types: Tuple[Type, ...] = (str, bool, int, float)
) -> List[Document]:
    """Filter out metadata types that are not supported for a vector store.

    Each document's ``metadata`` dict is replaced (in place) with a copy that
    keeps only the entries whose value is an instance of one of
    ``allowed_types``; complex values (lists, sets, dicts, ...) are dropped.
    The same document objects are returned as a new list.
    """
    kept: List[Document] = []
    for doc in documents:
        # Rebuild the metadata keeping only scalar values the store accepts.
        doc.metadata = {
            key: value
            for key, value in doc.metadata.items()
            if isinstance(value, allowed_types)
        }
        kept.append(doc)
    return kept
|
@ -1,7 +1,11 @@
|
|||||||
"""Test vector store utility functions."""
|
"""Test vector store utility functions."""
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from langchain.vectorstores.utils import maximal_marginal_relevance
|
from langchain.docstore.document import Document
|
||||||
|
from langchain.vectorstores.utils import (
|
||||||
|
filter_complex_metadata,
|
||||||
|
maximal_marginal_relevance,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_maximal_marginal_relevance_lambda_zero() -> None:
|
def test_maximal_marginal_relevance_lambda_zero() -> None:
|
||||||
@ -52,3 +56,70 @@ def test_maximal_marginal_relevance_query_dim() -> None:
|
|||||||
first = maximal_marginal_relevance(query_embedding, embedding_list)
|
first = maximal_marginal_relevance(query_embedding, embedding_list)
|
||||||
second = maximal_marginal_relevance(query_embedding_2d, embedding_list)
|
second = maximal_marginal_relevance(query_embedding_2d, embedding_list)
|
||||||
assert first == second
|
assert first == second
|
||||||
|
|
||||||
|
|
||||||
|
def test_filter_list_metadata() -> None:
    """Complex metadata values (list/set/dict) are stripped; scalars survive."""
    raw_metadatas = [
        {"key1": "this is a string!", "key2": ["a", "list", "of", "strings"]},
        {"key1": "this is another string!", "key2": {"foo"}},
        {"key1": "this is another string!", "key2": {"foo": "bar"}},
        {"key1": "this is another string!", "key2": True},
        {"key1": "this is another string!", "key2": 1},
        {"key1": "this is another string!", "key2": 1.0},
        {"key1": "this is another string!", "key2": "foo"},
    ]
    documents = [Document(page_content="", metadata=meta) for meta in raw_metadatas]

    updated_documents = filter_complex_metadata(documents)
    filtered_metadata = [doc.metadata for doc in updated_documents]

    assert filtered_metadata == [
        {"key1": "this is a string!"},
        {"key1": "this is another string!"},
        {"key1": "this is another string!"},
        {"key1": "this is another string!", "key2": True},
        {"key1": "this is another string!", "key2": 1},
        {"key1": "this is another string!", "key2": 1.0},
        {"key1": "this is another string!", "key2": "foo"},
    ]
|
Loading…
Reference in New Issue
Block a user