Mirror of https://github.com/hwchase17/langchain.git (synced 2025-06-26 08:33:49 +00:00)
core[patch]: Add additional hashing options to indexing API, warn on SHA-1 (#31649)
Add additional hashing options to the indexing API, and warn on SHA-1.

Requires:
- Bumping langchain-core version
- Bumping min langchain-core in langchain

Co-authored-by: ccurme <chester.curme@gmail.com>

parent cc4f5269b1
commit 9164e6f906
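For orientation, here is a minimal, hedged usage sketch of the new parameter. The record manager, vector store, namespace, and documents below are stand-ins chosen to keep the example self-contained; they are not part of this commit.

from langchain_core.documents import Document
from langchain_core.embeddings import DeterministicFakeEmbedding
from langchain_core.indexing import InMemoryRecordManager, index
from langchain_core.vectorstores import InMemoryVectorStore

# Stand-in components; any VectorStore / RecordManager pair is used the same way.
record_manager = InMemoryRecordManager(namespace="demo")
vector_store = InMemoryVectorStore(DeterministicFakeEmbedding(size=16))
docs = [Document(page_content="hello", metadata={"source": "1"})]

# Selecting key_encoder="sha256" opts out of the default SHA-1 hashing,
# so the new one-time UserWarning is never emitted.
result = index(docs, record_manager, vector_store, key_encoder="sha256")
print(result)  # counts of added / updated / skipped / deleted documents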
@@ -5,6 +5,7 @@ from __future__ import annotations
import hashlib
import json
import uuid
import warnings
from collections.abc import AsyncIterable, AsyncIterator, Iterable, Iterator, Sequence
from itertools import islice
from typing import (

@@ -18,8 +19,6 @@ from typing import (
    cast,
)

from pydantic import model_validator

from langchain_core.document_loaders.base import BaseLoader
from langchain_core.documents import Document
from langchain_core.exceptions import LangChainException

@@ -35,94 +34,51 @@ NAMESPACE_UUID = uuid.UUID(int=1984)
T = TypeVar("T")


def _hash_string_to_uuid(input_string: str) -> uuid.UUID:
def _hash_string_to_uuid(input_string: str) -> str:
    """Hashes a string and returns the corresponding UUID."""
    hash_value = hashlib.sha1(
        input_string.encode("utf-8"), usedforsecurity=False
    ).hexdigest()
    return str(uuid.uuid5(NAMESPACE_UUID, hash_value))


_WARNED_ABOUT_SHA1: bool = False


def _warn_about_sha1() -> None:
    """Emit a one-time warning about SHA-1 collision weaknesses."""
    # Global variable OK in this case
    global _WARNED_ABOUT_SHA1  # noqa: PLW0603
    if not _WARNED_ABOUT_SHA1:
        warnings.warn(
            "Using SHA-1 for document hashing. SHA-1 is *not* "
            "collision-resistant; a motivated attacker can construct distinct inputs "
            "that map to the same fingerprint. If this matters in your "
            "threat model, switch to a stronger algorithm such "
            "as 'blake2b', 'sha256', or 'sha512' by specifying "
            "the `key_encoder` parameter in the `index` or `aindex` function.",
            category=UserWarning,
            stacklevel=2,
        )
        _WARNED_ABOUT_SHA1 = True


def _hash_string(
    input_string: str, *, algorithm: Literal["sha1", "sha256", "sha512", "blake2b"]
) -> uuid.UUID:
    """Hash *input_string* to a deterministic UUID using the configured algorithm."""
    if algorithm == "sha1":
        _warn_about_sha1()
    hash_value = _calculate_hash(input_string, algorithm)
    return uuid.uuid5(NAMESPACE_UUID, hash_value)


def _hash_nested_dict_to_uuid(data: dict[Any, Any]) -> uuid.UUID:
    """Hashes a nested dictionary and returns the corresponding UUID."""
def _hash_nested_dict(
    data: dict[Any, Any], *, algorithm: Literal["sha1", "sha256", "sha512", "blake2b"]
) -> uuid.UUID:
    """Hash a nested dictionary to a UUID using the configured algorithm."""
    serialized_data = json.dumps(data, sort_keys=True)
    hash_value = hashlib.sha1(
        serialized_data.encode("utf-8"), usedforsecurity=False
    ).hexdigest()
    return uuid.uuid5(NAMESPACE_UUID, hash_value)


class _HashedDocument(Document):
    """A hashed document with a unique ID."""

    uid: str
    hash_: str
    """The hash of the document including content and metadata."""
    content_hash: str
    """The hash of the document content."""
    metadata_hash: str
    """The hash of the document metadata."""

    @classmethod
    def is_lc_serializable(cls) -> bool:
        return False

    @model_validator(mode="before")
    @classmethod
    def calculate_hashes(cls, values: dict[str, Any]) -> Any:
        """Root validator to calculate content and metadata hash."""
        content = values.get("page_content", "")
        metadata = values.get("metadata", {})

        forbidden_keys = ("hash_", "content_hash", "metadata_hash")

        for key in forbidden_keys:
            if key in metadata:
                msg = (
                    f"Metadata cannot contain key {key} as it "
                    f"is reserved for internal use."
                )
                raise ValueError(msg)

        content_hash = str(_hash_string_to_uuid(content))

        try:
            metadata_hash = str(_hash_nested_dict_to_uuid(metadata))
        except Exception as e:
            msg = (
                f"Failed to hash metadata: {e}. "
                f"Please use a dict that can be serialized using json."
            )
            raise ValueError(msg) from e

        values["content_hash"] = content_hash
        values["metadata_hash"] = metadata_hash
        values["hash_"] = str(_hash_string_to_uuid(content_hash + metadata_hash))

        _uid = values.get("uid")

        if _uid is None:
            values["uid"] = values["hash_"]
        return values

    def to_document(self) -> Document:
        """Return a Document object."""
        return Document(
            id=self.uid,
            page_content=self.page_content,
            metadata=self.metadata,
        )

    @classmethod
    def from_document(
        cls, document: Document, *, uid: Optional[str] = None
    ) -> _HashedDocument:
        """Create a HashedDocument from a Document."""
        return cls(  # type: ignore[call-arg]
            uid=uid,  # type: ignore[arg-type]
            page_content=document.page_content,
            metadata=document.metadata,
        )
    return _hash_string(serialized_data, algorithm=algorithm)


def _batch(size: int, iterable: Iterable[T]) -> Iterator[list[T]]:

@@ -168,14 +124,16 @@ def _get_source_id_assigner(


def _deduplicate_in_order(
    hashed_documents: Iterable[_HashedDocument],
) -> Iterator[_HashedDocument]:
    hashed_documents: Iterable[Document],
) -> Iterator[Document]:
    """Deduplicate a list of hashed documents while preserving order."""
    seen: set[str] = set()

    for hashed_doc in hashed_documents:
        if hashed_doc.hash_ not in seen:
            seen.add(hashed_doc.hash_)
        if hashed_doc.id not in seen:
            # At this stage, the id is guaranteed to be a string.
            # Avoiding unnecessary run time checks.
            seen.add(cast("str", hashed_doc.id))
            yield hashed_doc


@@ -183,6 +141,94 @@ class IndexingException(LangChainException):
    """Raised when an indexing operation fails."""


def _calculate_hash(
    text: str, algorithm: Literal["sha1", "sha256", "sha512", "blake2b"]
) -> str:
    """Return a hexadecimal digest of *text* using *algorithm*."""
    if algorithm == "sha1":
        # Calculate the SHA-1 hash and return it as a UUID.
        digest = hashlib.sha1(text.encode("utf-8"), usedforsecurity=False).hexdigest()
        return str(uuid.uuid5(NAMESPACE_UUID, digest))
    if algorithm == "blake2b":
        return hashlib.blake2b(text.encode("utf-8")).hexdigest()
    if algorithm == "sha256":
        return hashlib.sha256(text.encode("utf-8")).hexdigest()
    if algorithm == "sha512":
        return hashlib.sha512(text.encode("utf-8")).hexdigest()
    msg = f"Unsupported hashing algorithm: {algorithm}"
    raise ValueError(msg)


def _get_document_with_hash(
    document: Document,
    *,
    key_encoder: Union[
        Callable[[Document], str], Literal["sha1", "sha256", "sha512", "blake2b"]
    ],
) -> Document:
    """Calculate a hash of the document, and assign it to the uid.

    When using one of the predefined hashing algorithms, the hash is calculated
    by hashing the content and the metadata of the document.

    Args:
        document: Document to hash.
        key_encoder: Hashing algorithm to use for hashing the document.
            If not provided, a default encoder using SHA-1 will be used.
            SHA-1 is not collision-resistant, and a motivated attacker
            could craft two different texts that hash to the
            same cache key.

            New applications should use one of the alternative encoders
            or provide a custom and strong key encoder function to avoid this risk.

            When changing the key encoder, you must change the
            index as well to avoid duplicated documents in the cache.

    Returns:
        Document with a unique identifier based on the hash of the content and metadata.
    """
    metadata: dict[str, Any] = dict(document.metadata or {})

    if callable(key_encoder):
        # If key_encoder is a callable, we use it to generate the hash.
        hash_ = key_encoder(document)
    else:
        # The hashes are calculated separately for the content and the metadata.
        content_hash = _calculate_hash(document.page_content, algorithm=key_encoder)
        try:
            serialized_meta = json.dumps(metadata, sort_keys=True)
        except Exception as e:
            msg = (
                f"Failed to hash metadata: {e}. "
                f"Please use a dict that can be serialized using json."
            )
            raise ValueError(msg) from e
        metadata_hash = _calculate_hash(serialized_meta, algorithm=key_encoder)
        hash_ = _calculate_hash(content_hash + metadata_hash, algorithm=key_encoder)

    return Document(
        # Assign a unique identifier based on the hash.
        id=hash_,
        page_content=document.page_content,
        metadata=document.metadata,
    )


# This internal abstraction was imported by the langchain package internally, so
# we keep it here for backwards compatibility.
class _HashedDocument:
    def __init__(self, *args: Any, **kwargs: Any) -> None:
        """Raise an error if this class is instantiated."""
        msg = (
            "_HashedDocument is an internal abstraction that was deprecated in "
            "langchain-core 0.3.63. This abstraction is marked as private and "
            "should not have been used directly. If you are seeing this error, please "
            "update your code appropriately."
        )
        raise NotImplementedError(msg)


def _delete(
    vector_store: Union[VectorStore, DocumentIndex],
    ids: list[str],

@@ -231,6 +277,9 @@ def index(
    source_id_key: Union[str, Callable[[Document], str], None] = None,
    cleanup_batch_size: int = 1_000,
    force_update: bool = False,
    key_encoder: Union[
        Literal["sha1", "sha256", "sha512", "blake2b"], Callable[[Document], str]
    ] = "sha1",
    upsert_kwargs: Optional[dict[str, Any]] = None,
) -> IndexingResult:
    """Index data from the loader into the vector store.

@@ -291,6 +340,23 @@ def index(
        force_update: Force update documents even if they are present in the
            record manager. Useful if you are re-indexing with updated embeddings.
            Default is False.
        key_encoder: Hashing algorithm to use for hashing the document content and
            metadata. Default is "sha1".
            Other options include "blake2b", "sha256", and "sha512".

            .. versionadded:: 0.3.66

        key_encoder: Hashing algorithm to use for hashing the document.
            If not provided, a default encoder using SHA-1 will be used.
            SHA-1 is not collision-resistant, and a motivated attacker
            could craft two different texts that hash to the
            same cache key.

            New applications should use one of the alternative encoders
            or provide a custom and strong key encoder function to avoid this risk.

            When changing the key encoder, you must change the
            index as well to avoid duplicated documents in the cache.
        upsert_kwargs: Additional keyword arguments to pass to the add_documents
            method of the VectorStore or the upsert method of the
            DocumentIndex. For example, you can use this to

@@ -313,6 +379,11 @@ def index(

        * Added `scoped_full` cleanup mode.
    """
    # Behavior is deprecated, but we keep it for backwards compatibility.
    # Warn only once per process.
    if key_encoder == "sha1":
        _warn_about_sha1()

    if cleanup not in {"incremental", "full", "scoped_full", None}:
        msg = (
            f"cleanup should be one of 'incremental', 'full', 'scoped_full' or None. "

@@ -375,12 +446,15 @@ def index(
    for doc_batch in _batch(batch_size, doc_iterator):
        hashed_docs = list(
            _deduplicate_in_order(
                [_HashedDocument.from_document(doc) for doc in doc_batch]
                [
                    _get_document_with_hash(doc, key_encoder=key_encoder)
                    for doc in doc_batch
                ]
            )
        )

        source_ids: Sequence[Optional[str]] = [
            source_id_assigner(doc) for doc in hashed_docs
            source_id_assigner(hashed_doc) for hashed_doc in hashed_docs
        ]

        if cleanup in {"incremental", "scoped_full"}:

@@ -391,8 +465,8 @@ def index(
                        f"Source ids are required when cleanup mode is "
                        f"incremental or scoped_full. "
                        f"Document that starts with "
                        f"content: {hashed_doc.page_content[:100]} was not assigned "
                        f"as source id."
                        f"content: {hashed_doc.page_content[:100]} "
                        f"was not assigned as source id."
                    )
                    raise ValueError(msg)
            if cleanup == "scoped_full":

@@ -400,7 +474,9 @@ def index(
            # source ids cannot be None after for loop above.
            source_ids = cast("Sequence[str]", source_ids)

        exists_batch = record_manager.exists([doc.uid for doc in hashed_docs])
        exists_batch = record_manager.exists(
            cast("Sequence[str]", [doc.id for doc in hashed_docs])
        )

        # Filter out documents that already exist in the record store.
        uids = []

@@ -408,14 +484,15 @@ def index(
        uids_to_refresh = []
        seen_docs: set[str] = set()
        for hashed_doc, doc_exists in zip(hashed_docs, exists_batch):
            hashed_id = cast("str", hashed_doc.id)
            if doc_exists:
                if force_update:
                    seen_docs.add(hashed_doc.uid)
                    seen_docs.add(hashed_id)
                else:
                    uids_to_refresh.append(hashed_doc.uid)
                    uids_to_refresh.append(hashed_id)
                continue
            uids.append(hashed_doc.uid)
            docs_to_index.append(hashed_doc.to_document())
            uids.append(hashed_id)
            docs_to_index.append(hashed_doc)

        # Update refresh timestamp
        if uids_to_refresh:

@@ -445,7 +522,7 @@ def index(
            # Update ALL records, even if they already exist since we want to refresh
            # their timestamp.
            record_manager.update(
                [doc.uid for doc in hashed_docs],
                cast("Sequence[str]", [doc.id for doc in hashed_docs]),
                group_ids=source_ids,
                time_at_least=index_start_dt,
            )

@@ -453,7 +530,6 @@ def index(
        # If source IDs are provided, we can do the deletion incrementally!
        if cleanup == "incremental":
            # Get the uids of the documents that were not returned by the loader.

            # mypy isn't good enough to determine that source ids cannot be None
            # here due to a check that's happening above, so we check again.
            for source_id in source_ids:

@@ -537,6 +613,9 @@ async def aindex(
    source_id_key: Union[str, Callable[[Document], str], None] = None,
    cleanup_batch_size: int = 1_000,
    force_update: bool = False,
    key_encoder: Union[
        Literal["sha1", "sha256", "sha512", "blake2b"], Callable[[Document], str]
    ] = "sha1",
    upsert_kwargs: Optional[dict[str, Any]] = None,
) -> IndexingResult:
    """Async index data from the loader into the vector store.

@@ -596,6 +675,17 @@ async def aindex(
        force_update: Force update documents even if they are present in the
            record manager. Useful if you are re-indexing with updated embeddings.
            Default is False.
        key_encoder: Hashing algorithm to use for hashing the document.
            If not provided, a default encoder using SHA-1 will be used.
            SHA-1 is not collision-resistant, and a motivated attacker
            could craft two different texts that hash to the
            same cache key.

            New applications should use one of the alternative encoders
            or provide a custom and strong key encoder function to avoid this risk.

            When changing the key encoder, you must change the
            index as well to avoid duplicated documents in the cache.
        upsert_kwargs: Additional keyword arguments to pass to the aadd_documents
            method of the VectorStore or the aupsert method of the
            DocumentIndex. For example, you can use this to

@@ -618,6 +708,11 @@ async def aindex(

        * Added `scoped_full` cleanup mode.
    """
    # Behavior is deprecated, but we keep it for backwards compatibility.
    # Warn only once per process.
    if key_encoder == "sha1":
        _warn_about_sha1()

    if cleanup not in {"incremental", "full", "scoped_full", None}:
        msg = (
            f"cleanup should be one of 'incremental', 'full', 'scoped_full' or None. "

@@ -691,7 +786,10 @@ async def aindex(
    async for doc_batch in _abatch(batch_size, async_doc_iterator):
        hashed_docs = list(
            _deduplicate_in_order(
                [_HashedDocument.from_document(doc) for doc in doc_batch]
                [
                    _get_document_with_hash(doc, key_encoder=key_encoder)
                    for doc in doc_batch
                ]
            )
        )

@@ -707,8 +805,8 @@ async def aindex(
                        f"Source ids are required when cleanup mode is "
                        f"incremental or scoped_full. "
                        f"Document that starts with "
                        f"content: {hashed_doc.page_content[:100]} was not assigned "
                        f"as source id."
                        f"content: {hashed_doc.page_content[:100]} "
                        f"was not assigned as source id."
                    )
                    raise ValueError(msg)
            if cleanup == "scoped_full":

@@ -716,7 +814,9 @@ async def aindex(
            # source ids cannot be None after for loop above.
            source_ids = cast("Sequence[str]", source_ids)

        exists_batch = await record_manager.aexists([doc.uid for doc in hashed_docs])
        exists_batch = await record_manager.aexists(
            cast("Sequence[str]", [doc.id for doc in hashed_docs])
        )

        # Filter out documents that already exist in the record store.
        uids: list[str] = []

@@ -724,14 +824,15 @@ async def aindex(
        uids_to_refresh = []
        seen_docs: set[str] = set()
        for hashed_doc, doc_exists in zip(hashed_docs, exists_batch):
            hashed_id = cast("str", hashed_doc.id)
            if doc_exists:
                if force_update:
                    seen_docs.add(hashed_doc.uid)
                    seen_docs.add(hashed_id)
                else:
                    uids_to_refresh.append(hashed_doc.uid)
                    uids_to_refresh.append(hashed_id)
                continue
            uids.append(hashed_doc.uid)
            docs_to_index.append(hashed_doc.to_document())
            uids.append(hashed_id)
            docs_to_index.append(hashed_doc)

        if uids_to_refresh:
            # Must be updated to refresh timestamp.

@@ -760,7 +861,7 @@ async def aindex(
            # Update ALL records, even if they already exist since we want to refresh
            # their timestamp.
            await record_manager.aupdate(
                [doc.uid for doc in hashed_docs],
                cast("Sequence[str]", [doc.id for doc in hashed_docs]),
                group_ids=source_ids,
                time_at_least=index_start_dt,
            )

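Besides the named algorithms, `key_encoder` also accepts a callable, as the tests below exercise. A hedged sketch of one possible custom encoder follows; the helper name and the reliance on a `source` metadata field are illustrative assumptions, not part of this commit.

import hashlib

from langchain_core.documents import Document


def source_prefixed_key_encoder(doc: Document) -> str:
    # Illustrative only: prefix a BLAKE2b digest of the content with the
    # document's source metadata. Any callable returning a str is accepted.
    digest = hashlib.blake2b(doc.page_content.encode("utf-8")).hexdigest()
    return f"{doc.metadata.get('source', 'unknown')}-{digest[:16]}"


# Used exactly like the string options:
# index(docs, record_manager, vector_store, key_encoder=source_prefixed_key_encoder)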
@@ -1,50 +1,65 @@
import pytest
from typing import Literal

from langchain_core.documents import Document
from langchain_core.indexing.api import _HashedDocument
from langchain_core.indexing.api import _get_document_with_hash


def test_hashed_document_hashing() -> None:
    hashed_document = _HashedDocument(  # type: ignore[call-arg]
    document = Document(
        uid="123", page_content="Lorem ipsum dolor sit amet", metadata={"key": "value"}
    )
    assert isinstance(hashed_document.hash_, str)


def test_hashing_with_missing_content() -> None:
    """Check that ValueError is raised if page_content is missing."""
    with pytest.raises(TypeError):
        _HashedDocument(
            metadata={"key": "value"},
        )  # type: ignore[call-arg]


def test_uid_auto_assigned_to_hash() -> None:
    """Test uid is auto-assigned to the hashed_document hash."""
    hashed_document = _HashedDocument(  # type: ignore[call-arg]
        page_content="Lorem ipsum dolor sit amet", metadata={"key": "value"}
    )
    assert hashed_document.uid == hashed_document.hash_
    hashed_document = _get_document_with_hash(document, key_encoder="sha1")
    assert isinstance(hashed_document.id, str)


def test_to_document() -> None:
    """Test to_document method."""
    hashed_document = _HashedDocument(  # type: ignore[call-arg]
    original_doc = Document(
        page_content="Lorem ipsum dolor sit amet", metadata={"key": "value"}
    )
    doc = hashed_document.to_document()
    assert isinstance(doc, Document)
    assert doc.page_content == "Lorem ipsum dolor sit amet"
    assert doc.metadata == {"key": "value"}
    hashed_doc = _get_document_with_hash(original_doc, key_encoder="sha1")
    assert isinstance(hashed_doc, Document)
    assert hashed_doc is not original_doc
    assert hashed_doc.page_content == "Lorem ipsum dolor sit amet"
    assert hashed_doc.metadata["key"] == "value"


def test_from_document() -> None:
def test_hashing() -> None:
    """Test from document class method."""
    document = Document(
        page_content="Lorem ipsum dolor sit amet", metadata={"key": "value"}
    )

    hashed_document = _HashedDocument.from_document(document)
    hashed_document = _get_document_with_hash(document, key_encoder="sha1")
    # hash should be deterministic
    assert hashed_document.hash_ == "fd1dc827-051b-537d-a1fe-1fa043e8b276"
    assert hashed_document.uid == hashed_document.hash_
    assert hashed_document.id == "fd1dc827-051b-537d-a1fe-1fa043e8b276"

    # Verify that hashing with sha1 is deterministic
    another_hashed_document = _get_document_with_hash(document, key_encoder="sha1")
    assert another_hashed_document.id == hashed_document.id

    # Verify that the result is different from SHA256, SHA512, blake2b
    values: list[Literal["sha256", "sha512", "blake2b"]] = [
        "sha256",
        "sha512",
        "blake2b",
    ]

    for key_encoder in values:
        different_hashed_document = _get_document_with_hash(
            document, key_encoder=key_encoder
        )
        assert different_hashed_document.id != hashed_document.id


def test_hashing_custom_key_encoder() -> None:
    """Test hashing with a custom key encoder."""

    def custom_key_encoder(doc: Document) -> str:
        return f"quack-{doc.metadata['key']}"

    document = Document(
        page_content="Lorem ipsum dolor sit amet", metadata={"key": "like a duck"}
    )
    hashed_document = _get_document_with_hash(document, key_encoder=custom_key_encoder)
    assert hashed_document.id == "quack-like a duck"
    assert isinstance(hashed_document.id, str)

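As a side note, here is a rough sketch (not part of this commit) of how the one-shot nature of the SHA-1 warning could be observed. It assumes it is acceptable to reset the private `_WARNED_ABOUT_SHA1` flag, which is an internal detail rather than a public API, and it reuses in-memory stand-in components.

import warnings

import langchain_core.indexing.api as indexing_api
from langchain_core.documents import Document
from langchain_core.embeddings import DeterministicFakeEmbedding
from langchain_core.indexing import InMemoryRecordManager, index
from langchain_core.vectorstores import InMemoryVectorStore


def test_sha1_default_warns_once() -> None:
    indexing_api._WARNED_ABOUT_SHA1 = False  # reset the module-level flag for the test
    record_manager = InMemoryRecordManager(namespace="demo")
    vector_store = InMemoryVectorStore(DeterministicFakeEmbedding(size=8))
    docs = [Document(page_content="hello", metadata={"source": "1"})]

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        index(docs, record_manager, vector_store)  # default key_encoder="sha1" warns
        index(docs, record_manager, vector_store)  # flag is set, so no second warning
    assert sum("SHA-1" in str(w.message) for w in caught) == 1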
@@ -13,7 +13,11 @@ from langchain_core.document_loaders.base import BaseLoader
from langchain_core.documents import Document
from langchain_core.embeddings import DeterministicFakeEmbedding
from langchain_core.indexing import InMemoryRecordManager, aindex, index
from langchain_core.indexing.api import IndexingException, _abatch, _HashedDocument
from langchain_core.indexing.api import (
    IndexingException,
    _abatch,
    _get_document_with_hash,
)
from langchain_core.indexing.in_memory import InMemoryDocumentIndex
from langchain_core.vectorstores import InMemoryVectorStore, VectorStore

@@ -2222,7 +2226,7 @@ def test_indexing_custom_batch_size(
            metadata={"source": "1"},
        ),
    ]
    ids = [_HashedDocument.from_document(doc).uid for doc in docs]
    ids = [_get_document_with_hash(doc, key_encoder="sha256").id for doc in docs]

    batch_size = 1

@@ -2232,7 +2236,13 @@ def test_indexing_custom_batch_size(
    mock_add_documents = MagicMock()
    vector_store.add_documents = mock_add_documents  # type: ignore[method-assign]

    index(docs, record_manager, vector_store, batch_size=batch_size)
    index(
        docs,
        record_manager,
        vector_store,
        batch_size=batch_size,
        key_encoder="sha256",
    )
    args, kwargs = mock_add_documents.call_args
    doc_with_id = Document(
        id=ids[0], page_content="This is a test document.", metadata={"source": "1"}

@@ -2253,7 +2263,7 @@ async def test_aindexing_custom_batch_size(
            metadata={"source": "1"},
        ),
    ]
    ids = [_HashedDocument.from_document(doc).uid for doc in docs]
    ids = [_get_document_with_hash(doc, key_encoder="sha256").id for doc in docs]

    batch_size = 1
    mock_add_documents = AsyncMock()

@@ -2261,7 +2271,9 @@ async def test_aindexing_custom_batch_size(
        id=ids[0], page_content="This is a test document.", metadata={"source": "1"}
    )
    vector_store.aadd_documents = mock_add_documents  # type: ignore[method-assign]
    await aindex(docs, arecord_manager, vector_store, batch_size=batch_size)
    await aindex(
        docs, arecord_manager, vector_store, batch_size=batch_size, key_encoder="sha256"
    )
    args, kwargs = mock_add_documents.call_args
    assert args == ([doc_with_id],)
    assert kwargs == {"ids": ids, "batch_size": batch_size}

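The docstring caveat about changing the key encoder is easy to see concretely: the same document hashes to different ids under different encoders, so switching encoders on an existing index makes every previously indexed document look new. A small sketch, mirroring the assertions in test_hashing earlier in the diff:

from langchain_core.documents import Document
from langchain_core.indexing.api import _get_document_with_hash

doc = Document(page_content="Lorem ipsum dolor sit amet", metadata={"key": "value"})

sha1_id = _get_document_with_hash(doc, key_encoder="sha1").id
sha256_id = _get_document_with_hash(doc, key_encoder="sha256").id

# Different encoders yield different ids for identical content and metadata,
# which is why a re-index with a new key_encoder must also use a fresh index.
assert sha1_id != sha256_id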
@@ -1,50 +0,0 @@
import pytest
from langchain_core.documents import Document

from langchain.indexes._api import _HashedDocument


def test_hashed_document_hashing() -> None:
    hashed_document = _HashedDocument(  # type: ignore[call-arg]
        uid="123", page_content="Lorem ipsum dolor sit amet", metadata={"key": "value"}
    )
    assert isinstance(hashed_document.hash_, str)


def test_hashing_with_missing_content() -> None:
    """Check that ValueError is raised if page_content is missing."""
    with pytest.raises(TypeError):
        _HashedDocument(  # type: ignore[call-arg]
            metadata={"key": "value"},
        )


def test_uid_auto_assigned_to_hash() -> None:
    """Test uid is auto-assigned to the hashed_document hash."""
    hashed_document = _HashedDocument(  # type: ignore[call-arg]
        page_content="Lorem ipsum dolor sit amet", metadata={"key": "value"}
    )
    assert hashed_document.uid == hashed_document.hash_


def test_to_document() -> None:
    """Test to_document method."""
    hashed_document = _HashedDocument(  # type: ignore[call-arg]
        page_content="Lorem ipsum dolor sit amet", metadata={"key": "value"}
    )
    doc = hashed_document.to_document()
    assert isinstance(doc, Document)
    assert doc.page_content == "Lorem ipsum dolor sit amet"
    assert doc.metadata == {"key": "value"}


def test_from_document() -> None:
    """Test from document class method."""
    document = Document(
        page_content="Lorem ipsum dolor sit amet", metadata={"key": "value"}
    )

    hashed_document = _HashedDocument.from_document(document)
    # hash should be deterministic
    assert hashed_document.hash_ == "fd1dc827-051b-537d-a1fe-1fa043e8b276"
    assert hashed_document.uid == hashed_document.hash_

@@ -11,10 +11,10 @@ import pytest_asyncio
from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.indexing.api import _abatch, _get_document_with_hash
from langchain_core.vectorstores import VST, VectorStore

from langchain.indexes import aindex, index
from langchain.indexes._api import _abatch, _HashedDocument
from langchain.indexes._sql_record_manager import SQLRecordManager


@@ -1374,11 +1374,17 @@ def test_indexing_custom_batch_size(
            metadata={"source": "1"},
        ),
    ]
    ids = [_HashedDocument.from_document(doc).uid for doc in docs]
    ids = [_get_document_with_hash(doc, key_encoder="sha256").id for doc in docs]

    batch_size = 1
    with patch.object(vector_store, "add_documents") as mock_add_documents:
        index(docs, record_manager, vector_store, batch_size=batch_size)
        index(
            docs,
            record_manager,
            vector_store,
            batch_size=batch_size,
            key_encoder="sha256",
        )
    args, kwargs = mock_add_documents.call_args
    docs_with_id = [
        Document(

@@ -1402,11 +1408,17 @@ async def test_aindexing_custom_batch_size(
            metadata={"source": "1"},
        ),
    ]
    ids = [_HashedDocument.from_document(doc).uid for doc in docs]
    ids = [_get_document_with_hash(doc, key_encoder="sha256").id for doc in docs]

    batch_size = 1
    with patch.object(vector_store, "aadd_documents") as mock_add_documents:
        await aindex(docs, arecord_manager, vector_store, batch_size=batch_size)
        await aindex(
            docs,
            arecord_manager,
            vector_store,
            batch_size=batch_size,
            key_encoder="sha256",
        )
    args, kwargs = mock_add_documents.call_args
    docs_with_id = [
        Document(