core[patch]: Add additional hashing options to indexing API, warn on SHA-1 (#31649)
Add additional hashing options to the indexing API, warn on SHA-1.

Requires:
- Bumping langchain-core version
- Bumping min langchain-core in langchain

Co-authored-by: ccurme <chester.curme@gmail.com>
This commit is contained in:
parent cc4f5269b1
commit 9164e6f906
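For orientation, a minimal usage sketch of the new `key_encoder` parameter (not part of the commit; it assumes a langchain-core release that includes this change, e.g. 0.3.66 or later, and uses the in-memory record manager, vector store, and fake embeddings purely as stand-ins):

from langchain_core.documents import Document
from langchain_core.embeddings import DeterministicFakeEmbedding
from langchain_core.indexing import InMemoryRecordManager, index
from langchain_core.vectorstores import InMemoryVectorStore

record_manager = InMemoryRecordManager(namespace="demo")
record_manager.create_schema()
vector_store = InMemoryVectorStore(DeterministicFakeEmbedding(size=8))

docs = [Document(page_content="hello world", metadata={"source": "1"})]

# "sha256", "sha512", or "blake2b" avoids the new SHA-1 warning; a callable
# taking a Document and returning a string may also be passed.
result = index(docs, record_manager, vector_store, key_encoder="sha256")
print(result)  # IndexingResult: counts of added / updated / skipped / deleted docs

Note that, per the docstrings added below, changing `key_encoder` changes every document fingerprint, so the index should be rebuilt (or a fresh collection used) to avoid duplicated documents.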
@@ -5,6 +5,7 @@ from __future__ import annotations
 import hashlib
 import json
 import uuid
+import warnings
 from collections.abc import AsyncIterable, AsyncIterator, Iterable, Iterator, Sequence
 from itertools import islice
 from typing import (
@@ -18,8 +19,6 @@ from typing import (
     cast,
 )
 
-from pydantic import model_validator
-
 from langchain_core.document_loaders.base import BaseLoader
 from langchain_core.documents import Document
 from langchain_core.exceptions import LangChainException
@@ -35,94 +34,51 @@ NAMESPACE_UUID = uuid.UUID(int=1984)
 T = TypeVar("T")
 
 
-def _hash_string_to_uuid(input_string: str) -> uuid.UUID:
+def _hash_string_to_uuid(input_string: str) -> str:
     """Hashes a string and returns the corresponding UUID."""
     hash_value = hashlib.sha1(
         input_string.encode("utf-8"), usedforsecurity=False
     ).hexdigest()
-    return uuid.uuid5(NAMESPACE_UUID, hash_value)
-
-
-def _hash_nested_dict_to_uuid(data: dict[Any, Any]) -> uuid.UUID:
-    """Hashes a nested dictionary and returns the corresponding UUID."""
-    serialized_data = json.dumps(data, sort_keys=True)
-    hash_value = hashlib.sha1(
-        serialized_data.encode("utf-8"), usedforsecurity=False
-    ).hexdigest()
-    return uuid.uuid5(NAMESPACE_UUID, hash_value)
-
-
-class _HashedDocument(Document):
-    """A hashed document with a unique ID."""
-
-    uid: str
-    hash_: str
-    """The hash of the document including content and metadata."""
-    content_hash: str
-    """The hash of the document content."""
-    metadata_hash: str
-    """The hash of the document metadata."""
-
-    @classmethod
-    def is_lc_serializable(cls) -> bool:
-        return False
-
-    @model_validator(mode="before")
-    @classmethod
-    def calculate_hashes(cls, values: dict[str, Any]) -> Any:
-        """Root validator to calculate content and metadata hash."""
-        content = values.get("page_content", "")
-        metadata = values.get("metadata", {})
-
-        forbidden_keys = ("hash_", "content_hash", "metadata_hash")
-
-        for key in forbidden_keys:
-            if key in metadata:
-                msg = (
-                    f"Metadata cannot contain key {key} as it "
-                    f"is reserved for internal use."
-                )
-                raise ValueError(msg)
-
-        content_hash = str(_hash_string_to_uuid(content))
-
-        try:
-            metadata_hash = str(_hash_nested_dict_to_uuid(metadata))
-        except Exception as e:
-            msg = (
-                f"Failed to hash metadata: {e}. "
-                f"Please use a dict that can be serialized using json."
-            )
-            raise ValueError(msg) from e
-
-        values["content_hash"] = content_hash
-        values["metadata_hash"] = metadata_hash
-        values["hash_"] = str(_hash_string_to_uuid(content_hash + metadata_hash))
-
-        _uid = values.get("uid")
-
-        if _uid is None:
-            values["uid"] = values["hash_"]
-        return values
-
-    def to_document(self) -> Document:
-        """Return a Document object."""
-        return Document(
-            id=self.uid,
-            page_content=self.page_content,
-            metadata=self.metadata,
-        )
-
-    @classmethod
-    def from_document(
-        cls, document: Document, *, uid: Optional[str] = None
-    ) -> _HashedDocument:
-        """Create a HashedDocument from a Document."""
-        return cls(  # type: ignore[call-arg]
-            uid=uid,  # type: ignore[arg-type]
-            page_content=document.page_content,
-            metadata=document.metadata,
-        )
+    return str(uuid.uuid5(NAMESPACE_UUID, hash_value))
+
+
+_WARNED_ABOUT_SHA1: bool = False
+
+
+def _warn_about_sha1() -> None:
+    """Emit a one-time warning about SHA-1 collision weaknesses."""
+    # Global variable OK in this case
+    global _WARNED_ABOUT_SHA1  # noqa: PLW0603
+    if not _WARNED_ABOUT_SHA1:
+        warnings.warn(
+            "Using SHA-1 for document hashing. SHA-1 is *not* "
+            "collision-resistant; a motivated attacker can construct distinct inputs "
+            "that map to the same fingerprint. If this matters in your "
+            "threat model, switch to a stronger algorithm such "
+            "as 'blake2b', 'sha256', or 'sha512' by specifying "
+            "the `key_encoder` parameter in the `index` or `aindex` function.",
+            category=UserWarning,
+            stacklevel=2,
+        )
+        _WARNED_ABOUT_SHA1 = True
+
+
+def _hash_string(
+    input_string: str, *, algorithm: Literal["sha1", "sha256", "sha512", "blake2b"]
+) -> uuid.UUID:
+    """Hash *input_string* to a deterministic UUID using the configured algorithm."""
+    if algorithm == "sha1":
+        _warn_about_sha1()
+    hash_value = _calculate_hash(input_string, algorithm)
+    return uuid.uuid5(NAMESPACE_UUID, hash_value)
+
+
+def _hash_nested_dict(
+    data: dict[Any, Any], *, algorithm: Literal["sha1", "sha256", "sha512", "blake2b"]
+) -> uuid.UUID:
+    """Hash a nested dictionary to a UUID using the configured algorithm."""
+    serialized_data = json.dumps(data, sort_keys=True)
+    return _hash_string(serialized_data, algorithm=algorithm)
 
 
 def _batch(size: int, iterable: Iterable[T]) -> Iterator[list[T]]:
@@ -168,14 +124,16 @@ def _get_source_id_assigner(
 
 
 def _deduplicate_in_order(
-    hashed_documents: Iterable[_HashedDocument],
-) -> Iterator[_HashedDocument]:
+    hashed_documents: Iterable[Document],
+) -> Iterator[Document]:
     """Deduplicate a list of hashed documents while preserving order."""
     seen: set[str] = set()
 
     for hashed_doc in hashed_documents:
-        if hashed_doc.hash_ not in seen:
-            seen.add(hashed_doc.hash_)
+        if hashed_doc.id not in seen:
+            # At this stage, the id is guaranteed to be a string.
+            # Avoiding unnecessary run time checks.
+            seen.add(cast("str", hashed_doc.id))
             yield hashed_doc
 
 
@@ -183,6 +141,94 @@ class IndexingException(LangChainException):
     """Raised when an indexing operation fails."""
 
 
+def _calculate_hash(
+    text: str, algorithm: Literal["sha1", "sha256", "sha512", "blake2b"]
+) -> str:
+    """Return a hexadecimal digest of *text* using *algorithm*."""
+    if algorithm == "sha1":
+        # Calculate the SHA-1 hash and return it as a UUID.
+        digest = hashlib.sha1(text.encode("utf-8"), usedforsecurity=False).hexdigest()
+        return str(uuid.uuid5(NAMESPACE_UUID, digest))
+    if algorithm == "blake2b":
+        return hashlib.blake2b(text.encode("utf-8")).hexdigest()
+    if algorithm == "sha256":
+        return hashlib.sha256(text.encode("utf-8")).hexdigest()
+    if algorithm == "sha512":
+        return hashlib.sha512(text.encode("utf-8")).hexdigest()
+    msg = f"Unsupported hashing algorithm: {algorithm}"
+    raise ValueError(msg)
+
+
+def _get_document_with_hash(
+    document: Document,
+    *,
+    key_encoder: Union[
+        Callable[[Document], str], Literal["sha1", "sha256", "sha512", "blake2b"]
+    ],
+) -> Document:
+    """Calculate a hash of the document, and assign it to the uid.
+
+    When using one of the predefined hashing algorithms, the hash is calculated
+    by hashing the content and the metadata of the document.
+
+    Args:
+        document: Document to hash.
+        key_encoder: Hashing algorithm to use for hashing the document.
+            If not provided, a default encoder using SHA-1 will be used.
+            SHA-1 is not collision-resistant, and a motivated attacker
+            could craft two different texts that hash to the
+            same cache key.
+
+            New applications should use one of the alternative encoders
+            or provide a custom and strong key encoder function to avoid this risk.
+
+            When changing the key encoder, you must change the
+            index as well to avoid duplicated documents in the cache.
+
+    Returns:
+        Document with a unique identifier based on the hash of the content and metadata.
+    """
+    metadata: dict[str, Any] = dict(document.metadata or {})
+
+    if callable(key_encoder):
+        # If key_encoder is a callable, we use it to generate the hash.
+        hash_ = key_encoder(document)
+    else:
+        # The hashes are calculated separately for the content and the metadata.
+        content_hash = _calculate_hash(document.page_content, algorithm=key_encoder)
+        try:
+            serialized_meta = json.dumps(metadata, sort_keys=True)
+        except Exception as e:
+            msg = (
+                f"Failed to hash metadata: {e}. "
+                f"Please use a dict that can be serialized using json."
+            )
+            raise ValueError(msg) from e
+        metadata_hash = _calculate_hash(serialized_meta, algorithm=key_encoder)
+        hash_ = _calculate_hash(content_hash + metadata_hash, algorithm=key_encoder)
+
+    return Document(
+        # Assign a unique identifier based on the hash.
+        id=hash_,
+        page_content=document.page_content,
+        metadata=document.metadata,
+    )
+
+
+# This internal abstraction was imported by the langchain package internally, so
+# we keep it here for backwards compatibility.
+class _HashedDocument:
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        """Raise an error if this class is instantiated."""
+        msg = (
+            "_HashedDocument is an internal abstraction that was deprecated in "
+            "langchain-core 0.3.63. This abstraction is marked as private and "
+            "should not have been used directly. If you are seeing this error, "
+            "please update your code appropriately."
+        )
+        raise NotImplementedError(msg)
+
+
 def _delete(
     vector_store: Union[VectorStore, DocumentIndex],
     ids: list[str],
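As a rough illustration (not part of the diff) of how the predefined encoders above compose: the page content and the JSON-serialized, key-sorted metadata are hashed separately, then the two digests are concatenated and hashed again to form the document id. The helper name below is hypothetical; for "sha1" the real code additionally maps the digest to a UUIDv5, as shown above.

import hashlib
import json

def fingerprint(page_content: str, metadata: dict, algorithm: str = "sha256") -> str:
    # Mirrors _get_document_with_hash for the non-SHA-1 algorithms.
    hasher = getattr(hashlib, algorithm)  # e.g. hashlib.sha256, hashlib.blake2b
    content_hash = hasher(page_content.encode("utf-8")).hexdigest()
    metadata_hash = hasher(json.dumps(metadata, sort_keys=True).encode("utf-8")).hexdigest()
    # The combined digest becomes the deterministic document id.
    return hasher((content_hash + metadata_hash).encode("utf-8")).hexdigest()

print(fingerprint("Lorem ipsum dolor sit amet", {"key": "value"}))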
@@ -231,6 +277,9 @@ def index(
     source_id_key: Union[str, Callable[[Document], str], None] = None,
     cleanup_batch_size: int = 1_000,
     force_update: bool = False,
+    key_encoder: Union[
+        Literal["sha1", "sha256", "sha512", "blake2b"], Callable[[Document], str]
+    ] = "sha1",
     upsert_kwargs: Optional[dict[str, Any]] = None,
 ) -> IndexingResult:
     """Index data from the loader into the vector store.
@@ -291,6 +340,23 @@ def index(
         force_update: Force update documents even if they are present in the
             record manager. Useful if you are re-indexing with updated embeddings.
             Default is False.
+        key_encoder: Hashing algorithm to use for hashing the document content and
+            metadata. Default is "sha1".
+            Other options include "blake2b", "sha256", and "sha512".
+
+            .. versionadded:: 0.3.66
+
+        key_encoder: Hashing algorithm to use for hashing the document.
+            If not provided, a default encoder using SHA-1 will be used.
+            SHA-1 is not collision-resistant, and a motivated attacker
+            could craft two different texts that hash to the
+            same cache key.
+
+            New applications should use one of the alternative encoders
+            or provide a custom and strong key encoder function to avoid this risk.
+
+            When changing the key encoder, you must change the
+            index as well to avoid duplicated documents in the cache.
         upsert_kwargs: Additional keyword arguments to pass to the add_documents
             method of the VectorStore or the upsert method of the
             DocumentIndex. For example, you can use this to
@@ -313,6 +379,11 @@ def index(
 
         * Added `scoped_full` cleanup mode.
     """
+    # Behavior is deprecated, but we keep it for backwards compatibility.
+    # Warn only once per process.
+    if key_encoder == "sha1":
+        _warn_about_sha1()
+
     if cleanup not in {"incremental", "full", "scoped_full", None}:
         msg = (
             f"cleanup should be one of 'incremental', 'full', 'scoped_full' or None. "
@@ -375,12 +446,15 @@ def index(
     for doc_batch in _batch(batch_size, doc_iterator):
         hashed_docs = list(
             _deduplicate_in_order(
-                [_HashedDocument.from_document(doc) for doc in doc_batch]
+                [
+                    _get_document_with_hash(doc, key_encoder=key_encoder)
+                    for doc in doc_batch
+                ]
             )
         )
 
         source_ids: Sequence[Optional[str]] = [
-            source_id_assigner(doc) for doc in hashed_docs
+            source_id_assigner(hashed_doc) for hashed_doc in hashed_docs
         ]
 
         if cleanup in {"incremental", "scoped_full"}:
@@ -391,8 +465,8 @@ def index(
                         f"Source ids are required when cleanup mode is "
                         f"incremental or scoped_full. "
                         f"Document that starts with "
-                        f"content: {hashed_doc.page_content[:100]} was not assigned "
-                        f"as source id."
+                        f"content: {hashed_doc.page_content[:100]} "
+                        f"was not assigned as source id."
                     )
                     raise ValueError(msg)
             if cleanup == "scoped_full":
@@ -400,7 +474,9 @@ def index(
                 # source ids cannot be None after for loop above.
                 source_ids = cast("Sequence[str]", source_ids)
 
-        exists_batch = record_manager.exists([doc.uid for doc in hashed_docs])
+        exists_batch = record_manager.exists(
+            cast("Sequence[str]", [doc.id for doc in hashed_docs])
+        )
 
         # Filter out documents that already exist in the record store.
         uids = []
@@ -408,14 +484,15 @@ def index(
         uids_to_refresh = []
         seen_docs: set[str] = set()
         for hashed_doc, doc_exists in zip(hashed_docs, exists_batch):
+            hashed_id = cast("str", hashed_doc.id)
             if doc_exists:
                 if force_update:
-                    seen_docs.add(hashed_doc.uid)
+                    seen_docs.add(hashed_id)
                 else:
-                    uids_to_refresh.append(hashed_doc.uid)
+                    uids_to_refresh.append(hashed_id)
                     continue
-            uids.append(hashed_doc.uid)
-            docs_to_index.append(hashed_doc.to_document())
+            uids.append(hashed_id)
+            docs_to_index.append(hashed_doc)
 
         # Update refresh timestamp
         if uids_to_refresh:
@@ -445,7 +522,7 @@ def index(
         # Update ALL records, even if they already exist since we want to refresh
         # their timestamp.
         record_manager.update(
-            [doc.uid for doc in hashed_docs],
+            cast("Sequence[str]", [doc.id for doc in hashed_docs]),
             group_ids=source_ids,
             time_at_least=index_start_dt,
         )
@@ -453,7 +530,6 @@ def index(
         # If source IDs are provided, we can do the deletion incrementally!
         if cleanup == "incremental":
             # Get the uids of the documents that were not returned by the loader.
-
             # mypy isn't good enough to determine that source ids cannot be None
             # here due to a check that's happening above, so we check again.
             for source_id in source_ids:
@@ -537,6 +613,9 @@ async def aindex(
     source_id_key: Union[str, Callable[[Document], str], None] = None,
     cleanup_batch_size: int = 1_000,
     force_update: bool = False,
+    key_encoder: Union[
+        Literal["sha1", "sha256", "sha512", "blake2b"], Callable[[Document], str]
+    ] = "sha1",
     upsert_kwargs: Optional[dict[str, Any]] = None,
 ) -> IndexingResult:
     """Async index data from the loader into the vector store.
@@ -596,6 +675,17 @@ async def aindex(
         force_update: Force update documents even if they are present in the
             record manager. Useful if you are re-indexing with updated embeddings.
             Default is False.
+        key_encoder: Hashing algorithm to use for hashing the document.
+            If not provided, a default encoder using SHA-1 will be used.
+            SHA-1 is not collision-resistant, and a motivated attacker
+            could craft two different texts that hash to the
+            same cache key.
+
+            New applications should use one of the alternative encoders
+            or provide a custom and strong key encoder function to avoid this risk.
+
+            When changing the key encoder, you must change the
+            index as well to avoid duplicated documents in the cache.
         upsert_kwargs: Additional keyword arguments to pass to the aadd_documents
             method of the VectorStore or the aupsert method of the
             DocumentIndex. For example, you can use this to
@@ -618,6 +708,11 @@ async def aindex(
 
         * Added `scoped_full` cleanup mode.
     """
+    # Behavior is deprecated, but we keep it for backwards compatibility.
+    # Warn only once per process.
+    if key_encoder == "sha1":
+        _warn_about_sha1()
+
     if cleanup not in {"incremental", "full", "scoped_full", None}:
         msg = (
             f"cleanup should be one of 'incremental', 'full', 'scoped_full' or None. "
@@ -691,7 +786,10 @@ async def aindex(
     async for doc_batch in _abatch(batch_size, async_doc_iterator):
         hashed_docs = list(
             _deduplicate_in_order(
-                [_HashedDocument.from_document(doc) for doc in doc_batch]
+                [
+                    _get_document_with_hash(doc, key_encoder=key_encoder)
+                    for doc in doc_batch
+                ]
             )
         )
 
@@ -707,8 +805,8 @@ async def aindex(
                         f"Source ids are required when cleanup mode is "
                         f"incremental or scoped_full. "
                         f"Document that starts with "
-                        f"content: {hashed_doc.page_content[:100]} was not assigned "
-                        f"as source id."
+                        f"content: {hashed_doc.page_content[:100]} "
+                        f"was not assigned as source id."
                     )
                     raise ValueError(msg)
             if cleanup == "scoped_full":
@@ -716,7 +814,9 @@ async def aindex(
                 # source ids cannot be None after for loop above.
                 source_ids = cast("Sequence[str]", source_ids)
 
-        exists_batch = await record_manager.aexists([doc.uid for doc in hashed_docs])
+        exists_batch = await record_manager.aexists(
+            cast("Sequence[str]", [doc.id for doc in hashed_docs])
+        )
 
         # Filter out documents that already exist in the record store.
         uids: list[str] = []
@@ -724,14 +824,15 @@ async def aindex(
         uids_to_refresh = []
         seen_docs: set[str] = set()
         for hashed_doc, doc_exists in zip(hashed_docs, exists_batch):
+            hashed_id = cast("str", hashed_doc.id)
             if doc_exists:
                 if force_update:
-                    seen_docs.add(hashed_doc.uid)
+                    seen_docs.add(hashed_id)
                 else:
-                    uids_to_refresh.append(hashed_doc.uid)
+                    uids_to_refresh.append(hashed_id)
                     continue
-            uids.append(hashed_doc.uid)
-            docs_to_index.append(hashed_doc.to_document())
+            uids.append(hashed_id)
+            docs_to_index.append(hashed_doc)
 
         if uids_to_refresh:
             # Must be updated to refresh timestamp.
@@ -760,7 +861,7 @@ async def aindex(
         # Update ALL records, even if they already exist since we want to refresh
         # their timestamp.
         await record_manager.aupdate(
-            [doc.uid for doc in hashed_docs],
+            cast("Sequence[str]", [doc.id for doc in hashed_docs]),
             group_ids=source_ids,
             time_at_least=index_start_dt,
         )
@@ -1,50 +1,65 @@
-import pytest
+from typing import Literal
 
 from langchain_core.documents import Document
-from langchain_core.indexing.api import _HashedDocument
+from langchain_core.indexing.api import _get_document_with_hash
 
 
 def test_hashed_document_hashing() -> None:
-    hashed_document = _HashedDocument(  # type: ignore[call-arg]
+    document = Document(
         uid="123", page_content="Lorem ipsum dolor sit amet", metadata={"key": "value"}
     )
-    assert isinstance(hashed_document.hash_, str)
-
-
-def test_hashing_with_missing_content() -> None:
-    """Check that ValueError is raised if page_content is missing."""
-    with pytest.raises(TypeError):
-        _HashedDocument(
-            metadata={"key": "value"},
-        )  # type: ignore[call-arg]
-
-
-def test_uid_auto_assigned_to_hash() -> None:
-    """Test uid is auto-assigned to the hashed_document hash."""
-    hashed_document = _HashedDocument(  # type: ignore[call-arg]
-        page_content="Lorem ipsum dolor sit amet", metadata={"key": "value"}
-    )
-    assert hashed_document.uid == hashed_document.hash_
+    hashed_document = _get_document_with_hash(document, key_encoder="sha1")
+    assert isinstance(hashed_document.id, str)
 
 
 def test_to_document() -> None:
     """Test to_document method."""
-    hashed_document = _HashedDocument(  # type: ignore[call-arg]
+    original_doc = Document(
         page_content="Lorem ipsum dolor sit amet", metadata={"key": "value"}
     )
-    doc = hashed_document.to_document()
-    assert isinstance(doc, Document)
-    assert doc.page_content == "Lorem ipsum dolor sit amet"
-    assert doc.metadata == {"key": "value"}
+    hashed_doc = _get_document_with_hash(original_doc, key_encoder="sha1")
+    assert isinstance(hashed_doc, Document)
+    assert hashed_doc is not original_doc
+    assert hashed_doc.page_content == "Lorem ipsum dolor sit amet"
+    assert hashed_doc.metadata["key"] == "value"
 
 
-def test_from_document() -> None:
+def test_hashing() -> None:
     """Test from document class method."""
     document = Document(
         page_content="Lorem ipsum dolor sit amet", metadata={"key": "value"}
     )
-
-    hashed_document = _HashedDocument.from_document(document)
+    hashed_document = _get_document_with_hash(document, key_encoder="sha1")
     # hash should be deterministic
-    assert hashed_document.hash_ == "fd1dc827-051b-537d-a1fe-1fa043e8b276"
-    assert hashed_document.uid == hashed_document.hash_
+    assert hashed_document.id == "fd1dc827-051b-537d-a1fe-1fa043e8b276"
+
+    # Verify that hashing with sha1 is deterministic
+    another_hashed_document = _get_document_with_hash(document, key_encoder="sha1")
+    assert another_hashed_document.id == hashed_document.id
+
+    # Verify that the result is different from SHA256, SHA512, blake2b
+    values: list[Literal["sha256", "sha512", "blake2b"]] = [
+        "sha256",
+        "sha512",
+        "blake2b",
+    ]
+
+    for key_encoder in values:
+        different_hashed_document = _get_document_with_hash(
+            document, key_encoder=key_encoder
+        )
+        assert different_hashed_document.id != hashed_document.id
+
+
+def test_hashing_custom_key_encoder() -> None:
+    """Test hashing with a custom key encoder."""
+
+    def custom_key_encoder(doc: Document) -> str:
+        return f"quack-{doc.metadata['key']}"
+
+    document = Document(
+        page_content="Lorem ipsum dolor sit amet", metadata={"key": "like a duck"}
+    )
+    hashed_document = _get_document_with_hash(document, key_encoder=custom_key_encoder)
+    assert hashed_document.id == "quack-like a duck"
+    assert isinstance(hashed_document.id, str)
@@ -13,7 +13,11 @@ from langchain_core.document_loaders.base import BaseLoader
 from langchain_core.documents import Document
 from langchain_core.embeddings import DeterministicFakeEmbedding
 from langchain_core.indexing import InMemoryRecordManager, aindex, index
-from langchain_core.indexing.api import IndexingException, _abatch, _HashedDocument
+from langchain_core.indexing.api import (
+    IndexingException,
+    _abatch,
+    _get_document_with_hash,
+)
 from langchain_core.indexing.in_memory import InMemoryDocumentIndex
 from langchain_core.vectorstores import InMemoryVectorStore, VectorStore
 
@@ -2222,7 +2226,7 @@ def test_indexing_custom_batch_size(
             metadata={"source": "1"},
         ),
     ]
-    ids = [_HashedDocument.from_document(doc).uid for doc in docs]
+    ids = [_get_document_with_hash(doc, key_encoder="sha256").id for doc in docs]
 
     batch_size = 1
 
@@ -2232,7 +2236,13 @@ def test_indexing_custom_batch_size(
     mock_add_documents = MagicMock()
     vector_store.add_documents = mock_add_documents  # type: ignore[method-assign]
 
-    index(docs, record_manager, vector_store, batch_size=batch_size)
+    index(
+        docs,
+        record_manager,
+        vector_store,
+        batch_size=batch_size,
+        key_encoder="sha256",
+    )
     args, kwargs = mock_add_documents.call_args
     doc_with_id = Document(
         id=ids[0], page_content="This is a test document.", metadata={"source": "1"}
@@ -2253,7 +2263,7 @@ async def test_aindexing_custom_batch_size(
             metadata={"source": "1"},
         ),
     ]
-    ids = [_HashedDocument.from_document(doc).uid for doc in docs]
+    ids = [_get_document_with_hash(doc, key_encoder="sha256").id for doc in docs]
 
     batch_size = 1
     mock_add_documents = AsyncMock()
@@ -2261,7 +2271,9 @@ async def test_aindexing_custom_batch_size(
         id=ids[0], page_content="This is a test document.", metadata={"source": "1"}
     )
     vector_store.aadd_documents = mock_add_documents  # type: ignore[method-assign]
-    await aindex(docs, arecord_manager, vector_store, batch_size=batch_size)
+    await aindex(
+        docs, arecord_manager, vector_store, batch_size=batch_size, key_encoder="sha256"
+    )
     args, kwargs = mock_add_documents.call_args
     assert args == ([doc_with_id],)
    assert kwargs == {"ids": ids, "batch_size": batch_size}
@@ -1,50 +0,0 @@
-import pytest
-from langchain_core.documents import Document
-
-from langchain.indexes._api import _HashedDocument
-
-
-def test_hashed_document_hashing() -> None:
-    hashed_document = _HashedDocument(  # type: ignore[call-arg]
-        uid="123", page_content="Lorem ipsum dolor sit amet", metadata={"key": "value"}
-    )
-    assert isinstance(hashed_document.hash_, str)
-
-
-def test_hashing_with_missing_content() -> None:
-    """Check that ValueError is raised if page_content is missing."""
-    with pytest.raises(TypeError):
-        _HashedDocument(  # type: ignore[call-arg]
-            metadata={"key": "value"},
-        )
-
-
-def test_uid_auto_assigned_to_hash() -> None:
-    """Test uid is auto-assigned to the hashed_document hash."""
-    hashed_document = _HashedDocument(  # type: ignore[call-arg]
-        page_content="Lorem ipsum dolor sit amet", metadata={"key": "value"}
-    )
-    assert hashed_document.uid == hashed_document.hash_
-
-
-def test_to_document() -> None:
-    """Test to_document method."""
-    hashed_document = _HashedDocument(  # type: ignore[call-arg]
-        page_content="Lorem ipsum dolor sit amet", metadata={"key": "value"}
-    )
-    doc = hashed_document.to_document()
-    assert isinstance(doc, Document)
-    assert doc.page_content == "Lorem ipsum dolor sit amet"
-    assert doc.metadata == {"key": "value"}
-
-
-def test_from_document() -> None:
-    """Test from document class method."""
-    document = Document(
-        page_content="Lorem ipsum dolor sit amet", metadata={"key": "value"}
-    )
-
-    hashed_document = _HashedDocument.from_document(document)
-    # hash should be deterministic
-    assert hashed_document.hash_ == "fd1dc827-051b-537d-a1fe-1fa043e8b276"
-    assert hashed_document.uid == hashed_document.hash_
@@ -11,10 +11,10 @@ import pytest_asyncio
 from langchain_core.document_loaders import BaseLoader
 from langchain_core.documents import Document
 from langchain_core.embeddings import Embeddings
+from langchain_core.indexing.api import _abatch, _get_document_with_hash
 from langchain_core.vectorstores import VST, VectorStore
 
 from langchain.indexes import aindex, index
-from langchain.indexes._api import _abatch, _HashedDocument
 from langchain.indexes._sql_record_manager import SQLRecordManager
 
 
@@ -1374,11 +1374,17 @@ def test_indexing_custom_batch_size(
             metadata={"source": "1"},
         ),
     ]
-    ids = [_HashedDocument.from_document(doc).uid for doc in docs]
+    ids = [_get_document_with_hash(doc, key_encoder="sha256").id for doc in docs]
 
     batch_size = 1
     with patch.object(vector_store, "add_documents") as mock_add_documents:
-        index(docs, record_manager, vector_store, batch_size=batch_size)
+        index(
+            docs,
+            record_manager,
+            vector_store,
+            batch_size=batch_size,
+            key_encoder="sha256",
+        )
     args, kwargs = mock_add_documents.call_args
     docs_with_id = [
         Document(
@@ -1402,11 +1408,17 @@ async def test_aindexing_custom_batch_size(
             metadata={"source": "1"},
         ),
     ]
-    ids = [_HashedDocument.from_document(doc).uid for doc in docs]
+    ids = [_get_document_with_hash(doc, key_encoder="sha256").id for doc in docs]
 
     batch_size = 1
     with patch.object(vector_store, "aadd_documents") as mock_add_documents:
-        await aindex(docs, arecord_manager, vector_store, batch_size=batch_size)
+        await aindex(
+            docs,
+            arecord_manager,
+            vector_store,
+            batch_size=batch_size,
+            key_encoder="sha256",
+        )
     args, kwargs = mock_add_documents.call_args
     docs_with_id = [
         Document(