diff --git a/libs/core/langchain_core/indexing/api.py b/libs/core/langchain_core/indexing/api.py index 7e7b2c3debf..9e1fc237852 100644 --- a/libs/core/langchain_core/indexing/api.py +++ b/libs/core/langchain_core/indexing/api.py @@ -5,6 +5,7 @@ from __future__ import annotations import hashlib import json import uuid +import warnings from collections.abc import AsyncIterable, AsyncIterator, Iterable, Iterator, Sequence from itertools import islice from typing import ( @@ -18,8 +19,6 @@ from typing import ( cast, ) -from pydantic import model_validator - from langchain_core.document_loaders.base import BaseLoader from langchain_core.documents import Document from langchain_core.exceptions import LangChainException @@ -35,94 +34,51 @@ NAMESPACE_UUID = uuid.UUID(int=1984) T = TypeVar("T") -def _hash_string_to_uuid(input_string: str) -> uuid.UUID: +def _hash_string_to_uuid(input_string: str) -> str: """Hashes a string and returns the corresponding UUID.""" hash_value = hashlib.sha1( input_string.encode("utf-8"), usedforsecurity=False ).hexdigest() + return str(uuid.uuid5(NAMESPACE_UUID, hash_value)) + + +_WARNED_ABOUT_SHA1: bool = False + + +def _warn_about_sha1() -> None: + """Emit a one-time warning about SHA-1 collision weaknesses.""" + # Global variable OK in this case + global _WARNED_ABOUT_SHA1 # noqa: PLW0603 + if not _WARNED_ABOUT_SHA1: + warnings.warn( + "Using SHA-1 for document hashing. SHA-1 is *not* " + "collision-resistant; a motivated attacker can construct distinct inputs " + "that map to the same fingerprint. If this matters in your " + "threat model, switch to a stronger algorithm such " + "as 'blake2b', 'sha256', or 'sha512' by specifying " + " `key_encoder` parameter in the the `index` or `aindex` function. ", + category=UserWarning, + stacklevel=2, + ) + _WARNED_ABOUT_SHA1 = True + + +def _hash_string( + input_string: str, *, algorithm: Literal["sha1", "sha256", "sha512", "blake2b"] +) -> uuid.UUID: + """Hash *input_string* to a deterministic UUID using the configured algorithm.""" + if algorithm == "sha1": + _warn_about_sha1() + hash_value = _calculate_hash(input_string, algorithm) return uuid.uuid5(NAMESPACE_UUID, hash_value) -def _hash_nested_dict_to_uuid(data: dict[Any, Any]) -> uuid.UUID: - """Hashes a nested dictionary and returns the corresponding UUID.""" +def _hash_nested_dict( + data: dict[Any, Any], *, algorithm: Literal["sha1", "sha256", "sha512", "blake2b"] +) -> uuid.UUID: + """Hash a nested dictionary to a UUID using the configured algorithm.""" serialized_data = json.dumps(data, sort_keys=True) - hash_value = hashlib.sha1( - serialized_data.encode("utf-8"), usedforsecurity=False - ).hexdigest() - return uuid.uuid5(NAMESPACE_UUID, hash_value) - - -class _HashedDocument(Document): - """A hashed document with a unique ID.""" - - uid: str - hash_: str - """The hash of the document including content and metadata.""" - content_hash: str - """The hash of the document content.""" - metadata_hash: str - """The hash of the document metadata.""" - - @classmethod - def is_lc_serializable(cls) -> bool: - return False - - @model_validator(mode="before") - @classmethod - def calculate_hashes(cls, values: dict[str, Any]) -> Any: - """Root validator to calculate content and metadata hash.""" - content = values.get("page_content", "") - metadata = values.get("metadata", {}) - - forbidden_keys = ("hash_", "content_hash", "metadata_hash") - - for key in forbidden_keys: - if key in metadata: - msg = ( - f"Metadata cannot contain key {key} as it " - f"is reserved for internal 
use." - ) - raise ValueError(msg) - - content_hash = str(_hash_string_to_uuid(content)) - - try: - metadata_hash = str(_hash_nested_dict_to_uuid(metadata)) - except Exception as e: - msg = ( - f"Failed to hash metadata: {e}. " - f"Please use a dict that can be serialized using json." - ) - raise ValueError(msg) from e - - values["content_hash"] = content_hash - values["metadata_hash"] = metadata_hash - values["hash_"] = str(_hash_string_to_uuid(content_hash + metadata_hash)) - - _uid = values.get("uid") - - if _uid is None: - values["uid"] = values["hash_"] - return values - - def to_document(self) -> Document: - """Return a Document object.""" - return Document( - id=self.uid, - page_content=self.page_content, - metadata=self.metadata, - ) - - @classmethod - def from_document( - cls, document: Document, *, uid: Optional[str] = None - ) -> _HashedDocument: - """Create a HashedDocument from a Document.""" - return cls( # type: ignore[call-arg] - uid=uid, # type: ignore[arg-type] - page_content=document.page_content, - metadata=document.metadata, - ) + return _hash_string(serialized_data, algorithm=algorithm) def _batch(size: int, iterable: Iterable[T]) -> Iterator[list[T]]: @@ -168,14 +124,16 @@ def _get_source_id_assigner( def _deduplicate_in_order( - hashed_documents: Iterable[_HashedDocument], -) -> Iterator[_HashedDocument]: + hashed_documents: Iterable[Document], +) -> Iterator[Document]: """Deduplicate a list of hashed documents while preserving order.""" seen: set[str] = set() for hashed_doc in hashed_documents: - if hashed_doc.hash_ not in seen: - seen.add(hashed_doc.hash_) + if hashed_doc.id not in seen: + # At this stage, the id is guaranteed to be a string. + # Avoiding unnecessary run time checks. + seen.add(cast("str", hashed_doc.id)) yield hashed_doc @@ -183,6 +141,94 @@ class IndexingException(LangChainException): """Raised when an indexing operation fails.""" +def _calculate_hash( + text: str, algorithm: Literal["sha1", "sha256", "sha512", "blake2b"] +) -> str: + """Return a hexadecimal digest of *text* using *algorithm*.""" + if algorithm == "sha1": + # Calculate the SHA-1 hash and return it as a UUID. + digest = hashlib.sha1(text.encode("utf-8"), usedforsecurity=False).hexdigest() + return str(uuid.uuid5(NAMESPACE_UUID, digest)) + if algorithm == "blake2b": + return hashlib.blake2b(text.encode("utf-8")).hexdigest() + if algorithm == "sha256": + return hashlib.sha256(text.encode("utf-8")).hexdigest() + if algorithm == "sha512": + return hashlib.sha512(text.encode("utf-8")).hexdigest() + msg = f"Unsupported hashing algorithm: {algorithm}" + raise ValueError(msg) + + +def _get_document_with_hash( + document: Document, + *, + key_encoder: Union[ + Callable[[Document], str], Literal["sha1", "sha256", "sha512", "blake2b"] + ], +) -> Document: + """Calculate a hash of the document, and assign it to the uid. + + When using one of the predefined hashing algorithms, the hash is calculated + by hashing the content and the metadata of the document. + + Args: + document: Document to hash. + key_encoder: Hashing algorithm to use for hashing the document. + If not provided, a default encoder using SHA-1 will be used. + SHA-1 is not collision-resistant, and a motivated attacker + could craft two different texts that hash to the + same cache key. + + New applications should use one of the alternative encoders + or provide a custom and strong key encoder function to avoid this risk. 
+ + When changing the key encoder, you must change the + index as well to avoid duplicated documents in the cache. + + Returns: + Document with a unique identifier based on the hash of the content and metadata. + """ + metadata: dict[str, Any] = dict(document.metadata or {}) + + if callable(key_encoder): + # If key_encoder is a callable, we use it to generate the hash. + hash_ = key_encoder(document) + else: + # The hashes are calculated separate for the content and the metadata. + content_hash = _calculate_hash(document.page_content, algorithm=key_encoder) + try: + serialized_meta = json.dumps(metadata, sort_keys=True) + except Exception as e: + msg = ( + f"Failed to hash metadata: {e}. " + f"Please use a dict that can be serialized using json." + ) + raise ValueError(msg) from e + metadata_hash = _calculate_hash(serialized_meta, algorithm=key_encoder) + hash_ = _calculate_hash(content_hash + metadata_hash, algorithm=key_encoder) + + return Document( + # Assign a unique identifier based on the hash. + id=hash_, + page_content=document.page_content, + metadata=document.metadata, + ) + + +# This internal abstraction was imported by the langchain package internally, so +# we keep it here for backwards compatibility. +class _HashedDocument: + def __init__(self, *args: Any, **kwargs: Any) -> None: + """Raise an error if this class is instantiated.""" + msg = ( + "_HashedDocument is an internal abstraction that was deprecated in " + " langchain-core 0.3.63. This abstraction is marked as private and " + " should not have been used directly. If you are seeing this error, please " + " update your code appropriately." + ) + raise NotImplementedError(msg) + + def _delete( vector_store: Union[VectorStore, DocumentIndex], ids: list[str], @@ -231,6 +277,9 @@ def index( source_id_key: Union[str, Callable[[Document], str], None] = None, cleanup_batch_size: int = 1_000, force_update: bool = False, + key_encoder: Union[ + Literal["sha1", "sha256", "sha512", "blake2b"], Callable[[Document], str] + ] = "sha1", upsert_kwargs: Optional[dict[str, Any]] = None, ) -> IndexingResult: """Index data from the loader into the vector store. @@ -291,6 +340,23 @@ def index( force_update: Force update documents even if they are present in the record manager. Useful if you are re-indexing with updated embeddings. Default is False. + key_encoder: Hashing algorithm to use for hashing the document content and + metadata. Default is "sha1". + Other options include "blake2b", "sha256", and "sha512". + + .. versionadded:: 0.3.66 + + key_encoder: Hashing algorithm to use for hashing the document. + If not provided, a default encoder using SHA-1 will be used. + SHA-1 is not collision-resistant, and a motivated attacker + could craft two different texts that hash to the + same cache key. + + New applications should use one of the alternative encoders + or provide a custom and strong key encoder function to avoid this risk. + + When changing the key encoder, you must change the + index as well to avoid duplicated documents in the cache. upsert_kwargs: Additional keyword arguments to pass to the add_documents method of the VectorStore or the upsert method of the DocumentIndex. For example, you can use this to @@ -313,6 +379,11 @@ def index( * Added `scoped_full` cleanup mode. """ + # Behavior is deprecated, but we keep it for backwards compatibility. + # # Warn only once per process. 
+ if key_encoder == "sha1": + _warn_about_sha1() + if cleanup not in {"incremental", "full", "scoped_full", None}: msg = ( f"cleanup should be one of 'incremental', 'full', 'scoped_full' or None. " @@ -375,12 +446,15 @@ def index( for doc_batch in _batch(batch_size, doc_iterator): hashed_docs = list( _deduplicate_in_order( - [_HashedDocument.from_document(doc) for doc in doc_batch] + [ + _get_document_with_hash(doc, key_encoder=key_encoder) + for doc in doc_batch + ] ) ) source_ids: Sequence[Optional[str]] = [ - source_id_assigner(doc) for doc in hashed_docs + source_id_assigner(hashed_doc) for hashed_doc in hashed_docs ] if cleanup in {"incremental", "scoped_full"}: @@ -391,8 +465,8 @@ def index( f"Source ids are required when cleanup mode is " f"incremental or scoped_full. " f"Document that starts with " - f"content: {hashed_doc.page_content[:100]} was not assigned " - f"as source id." + f"content: {hashed_doc.page_content[:100]} " + f"was not assigned as source id." ) raise ValueError(msg) if cleanup == "scoped_full": @@ -400,7 +474,9 @@ def index( # source ids cannot be None after for loop above. source_ids = cast("Sequence[str]", source_ids) - exists_batch = record_manager.exists([doc.uid for doc in hashed_docs]) + exists_batch = record_manager.exists( + cast("Sequence[str]", [doc.id for doc in hashed_docs]) + ) # Filter out documents that already exist in the record store. uids = [] @@ -408,14 +484,15 @@ def index( uids_to_refresh = [] seen_docs: set[str] = set() for hashed_doc, doc_exists in zip(hashed_docs, exists_batch): + hashed_id = cast("str", hashed_doc.id) if doc_exists: if force_update: - seen_docs.add(hashed_doc.uid) + seen_docs.add(hashed_id) else: - uids_to_refresh.append(hashed_doc.uid) + uids_to_refresh.append(hashed_id) continue - uids.append(hashed_doc.uid) - docs_to_index.append(hashed_doc.to_document()) + uids.append(hashed_id) + docs_to_index.append(hashed_doc) # Update refresh timestamp if uids_to_refresh: @@ -445,7 +522,7 @@ def index( # Update ALL records, even if they already exist since we want to refresh # their timestamp. record_manager.update( - [doc.uid for doc in hashed_docs], + cast("Sequence[str]", [doc.id for doc in hashed_docs]), group_ids=source_ids, time_at_least=index_start_dt, ) @@ -453,7 +530,6 @@ def index( # If source IDs are provided, we can do the deletion incrementally! if cleanup == "incremental": # Get the uids of the documents that were not returned by the loader. - # mypy isn't good enough to determine that source ids cannot be None # here due to a check that's happening above, so we check again. for source_id in source_ids: @@ -537,6 +613,9 @@ async def aindex( source_id_key: Union[str, Callable[[Document], str], None] = None, cleanup_batch_size: int = 1_000, force_update: bool = False, + key_encoder: Union[ + Literal["sha1", "sha256", "sha512", "blake2b"], Callable[[Document], str] + ] = "sha1", upsert_kwargs: Optional[dict[str, Any]] = None, ) -> IndexingResult: """Async index data from the loader into the vector store. @@ -596,6 +675,17 @@ async def aindex( force_update: Force update documents even if they are present in the record manager. Useful if you are re-indexing with updated embeddings. Default is False. + key_encoder: Hashing algorithm to use for hashing the document. + If not provided, a default encoder using SHA-1 will be used. + SHA-1 is not collision-resistant, and a motivated attacker + could craft two different texts that hash to the + same cache key. 
+ + New applications should use one of the alternative encoders + or provide a custom and strong key encoder function to avoid this risk. + + When changing the key encoder, you must change the + index as well to avoid duplicated documents in the cache. upsert_kwargs: Additional keyword arguments to pass to the aadd_documents method of the VectorStore or the aupsert method of the DocumentIndex. For example, you can use this to @@ -618,6 +708,11 @@ async def aindex( * Added `scoped_full` cleanup mode. """ + # Behavior is deprecated, but we keep it for backwards compatibility. + # # Warn only once per process. + if key_encoder == "sha1": + _warn_about_sha1() + if cleanup not in {"incremental", "full", "scoped_full", None}: msg = ( f"cleanup should be one of 'incremental', 'full', 'scoped_full' or None. " @@ -691,7 +786,10 @@ async def aindex( async for doc_batch in _abatch(batch_size, async_doc_iterator): hashed_docs = list( _deduplicate_in_order( - [_HashedDocument.from_document(doc) for doc in doc_batch] + [ + _get_document_with_hash(doc, key_encoder=key_encoder) + for doc in doc_batch + ] ) ) @@ -707,8 +805,8 @@ async def aindex( f"Source ids are required when cleanup mode is " f"incremental or scoped_full. " f"Document that starts with " - f"content: {hashed_doc.page_content[:100]} was not assigned " - f"as source id." + f"content: {hashed_doc.page_content[:100]} " + f"was not assigned as source id." ) raise ValueError(msg) if cleanup == "scoped_full": @@ -716,7 +814,9 @@ async def aindex( # source ids cannot be None after for loop above. source_ids = cast("Sequence[str]", source_ids) - exists_batch = await record_manager.aexists([doc.uid for doc in hashed_docs]) + exists_batch = await record_manager.aexists( + cast("Sequence[str]", [doc.id for doc in hashed_docs]) + ) # Filter out documents that already exist in the record store. uids: list[str] = [] @@ -724,14 +824,15 @@ async def aindex( uids_to_refresh = [] seen_docs: set[str] = set() for hashed_doc, doc_exists in zip(hashed_docs, exists_batch): + hashed_id = cast("str", hashed_doc.id) if doc_exists: if force_update: - seen_docs.add(hashed_doc.uid) + seen_docs.add(hashed_id) else: - uids_to_refresh.append(hashed_doc.uid) + uids_to_refresh.append(hashed_id) continue - uids.append(hashed_doc.uid) - docs_to_index.append(hashed_doc.to_document()) + uids.append(hashed_id) + docs_to_index.append(hashed_doc) if uids_to_refresh: # Must be updated to refresh timestamp. @@ -760,7 +861,7 @@ async def aindex( # Update ALL records, even if they already exist since we want to refresh # their timestamp. 
await record_manager.aupdate( - [doc.uid for doc in hashed_docs], + cast("Sequence[str]", [doc.id for doc in hashed_docs]), group_ids=source_ids, time_at_least=index_start_dt, ) diff --git a/libs/core/tests/unit_tests/indexing/test_hashed_document.py b/libs/core/tests/unit_tests/indexing/test_hashed_document.py index a756e2d93a4..fd88391aa3a 100644 --- a/libs/core/tests/unit_tests/indexing/test_hashed_document.py +++ b/libs/core/tests/unit_tests/indexing/test_hashed_document.py @@ -1,50 +1,65 @@ -import pytest +from typing import Literal from langchain_core.documents import Document -from langchain_core.indexing.api import _HashedDocument +from langchain_core.indexing.api import _get_document_with_hash def test_hashed_document_hashing() -> None: - hashed_document = _HashedDocument( # type: ignore[call-arg] + document = Document( uid="123", page_content="Lorem ipsum dolor sit amet", metadata={"key": "value"} ) - assert isinstance(hashed_document.hash_, str) - - -def test_hashing_with_missing_content() -> None: - """Check that ValueError is raised if page_content is missing.""" - with pytest.raises(TypeError): - _HashedDocument( - metadata={"key": "value"}, - ) # type: ignore[call-arg] - - -def test_uid_auto_assigned_to_hash() -> None: - """Test uid is auto-assigned to the hashed_document hash.""" - hashed_document = _HashedDocument( # type: ignore[call-arg] - page_content="Lorem ipsum dolor sit amet", metadata={"key": "value"} - ) - assert hashed_document.uid == hashed_document.hash_ + hashed_document = _get_document_with_hash(document, key_encoder="sha1") + assert isinstance(hashed_document.id, str) def test_to_document() -> None: """Test to_document method.""" - hashed_document = _HashedDocument( # type: ignore[call-arg] + original_doc = Document( page_content="Lorem ipsum dolor sit amet", metadata={"key": "value"} ) - doc = hashed_document.to_document() - assert isinstance(doc, Document) - assert doc.page_content == "Lorem ipsum dolor sit amet" - assert doc.metadata == {"key": "value"} + hashed_doc = _get_document_with_hash(original_doc, key_encoder="sha1") + assert isinstance(hashed_doc, Document) + assert hashed_doc is not original_doc + assert hashed_doc.page_content == "Lorem ipsum dolor sit amet" + assert hashed_doc.metadata["key"] == "value" -def test_from_document() -> None: +def test_hashing() -> None: """Test from document class method.""" document = Document( page_content="Lorem ipsum dolor sit amet", metadata={"key": "value"} ) - - hashed_document = _HashedDocument.from_document(document) + hashed_document = _get_document_with_hash(document, key_encoder="sha1") # hash should be deterministic - assert hashed_document.hash_ == "fd1dc827-051b-537d-a1fe-1fa043e8b276" - assert hashed_document.uid == hashed_document.hash_ + assert hashed_document.id == "fd1dc827-051b-537d-a1fe-1fa043e8b276" + + # Verify that hashing with sha1 is determinstic + another_hashed_document = _get_document_with_hash(document, key_encoder="sha1") + assert another_hashed_document.id == hashed_document.id + + # Verify that the result is different from SHA256, SHA512, blake2b + values: list[Literal["sha256", "sha512", "blake2b"]] = [ + "sha256", + "sha512", + "blake2b", + ] + + for key_encoder in values: + different_hashed_document = _get_document_with_hash( + document, key_encoder=key_encoder + ) + assert different_hashed_document.id != hashed_document.id + + +def test_hashing_custom_key_encoder() -> None: + """Test hashing with a custom key encoder.""" + + def custom_key_encoder(doc: Document) -> str: + 
return f"quack-{doc.metadata['key']}" + + document = Document( + page_content="Lorem ipsum dolor sit amet", metadata={"key": "like a duck"} + ) + hashed_document = _get_document_with_hash(document, key_encoder=custom_key_encoder) + assert hashed_document.id == "quack-like a duck" + assert isinstance(hashed_document.id, str) diff --git a/libs/core/tests/unit_tests/indexing/test_indexing.py b/libs/core/tests/unit_tests/indexing/test_indexing.py index 78500719228..1f6ca8671e9 100644 --- a/libs/core/tests/unit_tests/indexing/test_indexing.py +++ b/libs/core/tests/unit_tests/indexing/test_indexing.py @@ -13,7 +13,11 @@ from langchain_core.document_loaders.base import BaseLoader from langchain_core.documents import Document from langchain_core.embeddings import DeterministicFakeEmbedding from langchain_core.indexing import InMemoryRecordManager, aindex, index -from langchain_core.indexing.api import IndexingException, _abatch, _HashedDocument +from langchain_core.indexing.api import ( + IndexingException, + _abatch, + _get_document_with_hash, +) from langchain_core.indexing.in_memory import InMemoryDocumentIndex from langchain_core.vectorstores import InMemoryVectorStore, VectorStore @@ -2222,7 +2226,7 @@ def test_indexing_custom_batch_size( metadata={"source": "1"}, ), ] - ids = [_HashedDocument.from_document(doc).uid for doc in docs] + ids = [_get_document_with_hash(doc, key_encoder="sha256").id for doc in docs] batch_size = 1 @@ -2232,7 +2236,13 @@ def test_indexing_custom_batch_size( mock_add_documents = MagicMock() vector_store.add_documents = mock_add_documents # type: ignore[method-assign] - index(docs, record_manager, vector_store, batch_size=batch_size) + index( + docs, + record_manager, + vector_store, + batch_size=batch_size, + key_encoder="sha256", + ) args, kwargs = mock_add_documents.call_args doc_with_id = Document( id=ids[0], page_content="This is a test document.", metadata={"source": "1"} @@ -2253,7 +2263,7 @@ async def test_aindexing_custom_batch_size( metadata={"source": "1"}, ), ] - ids = [_HashedDocument.from_document(doc).uid for doc in docs] + ids = [_get_document_with_hash(doc, key_encoder="sha256").id for doc in docs] batch_size = 1 mock_add_documents = AsyncMock() @@ -2261,7 +2271,9 @@ async def test_aindexing_custom_batch_size( id=ids[0], page_content="This is a test document.", metadata={"source": "1"} ) vector_store.aadd_documents = mock_add_documents # type: ignore[method-assign] - await aindex(docs, arecord_manager, vector_store, batch_size=batch_size) + await aindex( + docs, arecord_manager, vector_store, batch_size=batch_size, key_encoder="sha256" + ) args, kwargs = mock_add_documents.call_args assert args == ([doc_with_id],) assert kwargs == {"ids": ids, "batch_size": batch_size} diff --git a/libs/langchain/tests/unit_tests/indexes/test_hashed_document.py b/libs/langchain/tests/unit_tests/indexes/test_hashed_document.py deleted file mode 100644 index 27ffb32b306..00000000000 --- a/libs/langchain/tests/unit_tests/indexes/test_hashed_document.py +++ /dev/null @@ -1,50 +0,0 @@ -import pytest -from langchain_core.documents import Document - -from langchain.indexes._api import _HashedDocument - - -def test_hashed_document_hashing() -> None: - hashed_document = _HashedDocument( # type: ignore[call-arg] - uid="123", page_content="Lorem ipsum dolor sit amet", metadata={"key": "value"} - ) - assert isinstance(hashed_document.hash_, str) - - -def test_hashing_with_missing_content() -> None: - """Check that ValueError is raised if page_content is missing.""" - with 
pytest.raises(TypeError): - _HashedDocument( # type: ignore[call-arg] - metadata={"key": "value"}, - ) - - -def test_uid_auto_assigned_to_hash() -> None: - """Test uid is auto-assigned to the hashed_document hash.""" - hashed_document = _HashedDocument( # type: ignore[call-arg] - page_content="Lorem ipsum dolor sit amet", metadata={"key": "value"} - ) - assert hashed_document.uid == hashed_document.hash_ - - -def test_to_document() -> None: - """Test to_document method.""" - hashed_document = _HashedDocument( # type: ignore[call-arg] - page_content="Lorem ipsum dolor sit amet", metadata={"key": "value"} - ) - doc = hashed_document.to_document() - assert isinstance(doc, Document) - assert doc.page_content == "Lorem ipsum dolor sit amet" - assert doc.metadata == {"key": "value"} - - -def test_from_document() -> None: - """Test from document class method.""" - document = Document( - page_content="Lorem ipsum dolor sit amet", metadata={"key": "value"} - ) - - hashed_document = _HashedDocument.from_document(document) - # hash should be deterministic - assert hashed_document.hash_ == "fd1dc827-051b-537d-a1fe-1fa043e8b276" - assert hashed_document.uid == hashed_document.hash_ diff --git a/libs/langchain/tests/unit_tests/indexes/test_indexing.py b/libs/langchain/tests/unit_tests/indexes/test_indexing.py index 1e9df94e698..b7695d51259 100644 --- a/libs/langchain/tests/unit_tests/indexes/test_indexing.py +++ b/libs/langchain/tests/unit_tests/indexes/test_indexing.py @@ -11,10 +11,10 @@ import pytest_asyncio from langchain_core.document_loaders import BaseLoader from langchain_core.documents import Document from langchain_core.embeddings import Embeddings +from langchain_core.indexing.api import _abatch, _get_document_with_hash from langchain_core.vectorstores import VST, VectorStore from langchain.indexes import aindex, index -from langchain.indexes._api import _abatch, _HashedDocument from langchain.indexes._sql_record_manager import SQLRecordManager @@ -1374,11 +1374,17 @@ def test_indexing_custom_batch_size( metadata={"source": "1"}, ), ] - ids = [_HashedDocument.from_document(doc).uid for doc in docs] + ids = [_get_document_with_hash(doc, key_encoder="sha256").id for doc in docs] batch_size = 1 with patch.object(vector_store, "add_documents") as mock_add_documents: - index(docs, record_manager, vector_store, batch_size=batch_size) + index( + docs, + record_manager, + vector_store, + batch_size=batch_size, + key_encoder="sha256", + ) args, kwargs = mock_add_documents.call_args docs_with_id = [ Document( @@ -1402,11 +1408,17 @@ async def test_aindexing_custom_batch_size( metadata={"source": "1"}, ), ] - ids = [_HashedDocument.from_document(doc).uid for doc in docs] + ids = [_get_document_with_hash(doc, key_encoder="sha256").id for doc in docs] batch_size = 1 with patch.object(vector_store, "aadd_documents") as mock_add_documents: - await aindex(docs, arecord_manager, vector_store, batch_size=batch_size) + await aindex( + docs, + arecord_manager, + vector_store, + batch_size=batch_size, + key_encoder="sha256", + ) args, kwargs = mock_add_documents.call_args docs_with_id = [ Document(
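
For reviewers, here is a minimal usage sketch of the new `key_encoder` parameter introduced in this diff. It only uses APIs that appear in the patch and its tests (`index`, `InMemoryRecordManager`, `InMemoryVectorStore`, `DeterministicFakeEmbedding`, and a `Callable[[Document], str]` encoder); the namespace string, document contents, and the custom hash scheme are illustrative assumptions, and the `create_schema()` setup follows the existing test fixtures rather than anything added here.

```python
import hashlib

from langchain_core.documents import Document
from langchain_core.embeddings import DeterministicFakeEmbedding
from langchain_core.indexing import InMemoryRecordManager, index
from langchain_core.vectorstores import InMemoryVectorStore

docs = [
    Document(page_content="This is a test document.", metadata={"source": "1"}),
    Document(page_content="Another test document.", metadata={"source": "2"}),
]

# Opting into a stronger hash avoids the one-time SHA-1 warning added in this patch.
record_manager = InMemoryRecordManager(namespace="example/docs")  # namespace is illustrative
record_manager.create_schema()
vector_store = InMemoryVectorStore(DeterministicFakeEmbedding(size=8))

result = index(
    docs,
    record_manager,
    vector_store,
    cleanup="incremental",
    source_id_key="source",
    key_encoder="sha256",
)
# `result` is an IndexingResult with num_added / num_updated / num_skipped / num_deleted.


# A custom encoder is any Callable[[Document], str]. This particular scheme
# (blake2b over content plus the source field) is only an example, not part
# of the change itself.
def blake2b_of_content_and_source(doc: Document) -> str:
    raw = doc.page_content + str(doc.metadata.get("source", ""))
    return hashlib.blake2b(raw.encode("utf-8")).hexdigest()


# Per the docstring's warning, switching encoders changes every document id,
# so a fresh record manager / vector store is used here to avoid duplicates.
fresh_record_manager = InMemoryRecordManager(namespace="example/docs-blake2b")
fresh_record_manager.create_schema()
fresh_vector_store = InMemoryVectorStore(DeterministicFakeEmbedding(size=8))

index(
    docs,
    fresh_record_manager,
    fresh_vector_store,
    cleanup="incremental",
    source_id_key="source",
    key_encoder=blake2b_of_content_and_source,
)
```

As the sketch above suggests, the practical consequence of this change for existing deployments is that `key_encoder` should only be changed together with the underlying index (or a new collection), since the document ids are derived from the chosen hash and existing records would otherwise be duplicated.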