core[patch]: Add additional hashing options to indexing API, warn on SHA-1 (#31649)
Add additional hashing options to the indexing API, warn on SHA-1.

Requires:
- Bumping langchain-core version
- Bumping min langchain-core in langchain

Co-authored-by: ccurme <chester.curme@gmail.com>
This commit is contained in:
parent cc4f5269b1
commit 9164e6f906
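For orientation, a minimal usage sketch of the new `key_encoder` parameter (not part of the commit; it assumes a langchain-core release that includes this change, e.g. 0.3.66 or later, and uses the in-memory record manager, vector store, and fake embeddings purely as stand-ins):

from langchain_core.documents import Document
from langchain_core.embeddings import DeterministicFakeEmbedding
from langchain_core.indexing import InMemoryRecordManager, index
from langchain_core.vectorstores import InMemoryVectorStore

record_manager = InMemoryRecordManager(namespace="demo")
record_manager.create_schema()
vector_store = InMemoryVectorStore(DeterministicFakeEmbedding(size=8))

docs = [Document(page_content="hello world", metadata={"source": "1"})]

# "sha256", "sha512", or "blake2b" avoids the new SHA-1 warning; a callable
# taking a Document and returning a string may also be passed.
result = index(docs, record_manager, vector_store, key_encoder="sha256")
print(result)  # IndexingResult: counts of added / updated / skipped / deleted docs

Note that, per the docstrings added below, changing `key_encoder` changes every document fingerprint, so the index should be rebuilt (or a fresh collection used) to avoid duplicated documents.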
@@ -5,6 +5,7 @@ from __future__ import annotations
 import hashlib
 import json
 import uuid
+import warnings
 from collections.abc import AsyncIterable, AsyncIterator, Iterable, Iterator, Sequence
 from itertools import islice
 from typing import (
@@ -18,8 +19,6 @@ from typing import (
     cast,
 )
 
-from pydantic import model_validator
-
 from langchain_core.document_loaders.base import BaseLoader
 from langchain_core.documents import Document
 from langchain_core.exceptions import LangChainException
@@ -35,94 +34,51 @@ NAMESPACE_UUID = uuid.UUID(int=1984)
 T = TypeVar("T")
 
 
-def _hash_string_to_uuid(input_string: str) -> uuid.UUID:
+def _hash_string_to_uuid(input_string: str) -> str:
     """Hashes a string and returns the corresponding UUID."""
     hash_value = hashlib.sha1(
         input_string.encode("utf-8"), usedforsecurity=False
     ).hexdigest()
-    return uuid.uuid5(NAMESPACE_UUID, hash_value)
-
-
-def _hash_nested_dict_to_uuid(data: dict[Any, Any]) -> uuid.UUID:
-    """Hashes a nested dictionary and returns the corresponding UUID."""
-    serialized_data = json.dumps(data, sort_keys=True)
-    hash_value = hashlib.sha1(
-        serialized_data.encode("utf-8"), usedforsecurity=False
-    ).hexdigest()
-    return uuid.uuid5(NAMESPACE_UUID, hash_value)
-
-
-class _HashedDocument(Document):
-    """A hashed document with a unique ID."""
-
-    uid: str
-    hash_: str
-    """The hash of the document including content and metadata."""
-    content_hash: str
-    """The hash of the document content."""
-    metadata_hash: str
-    """The hash of the document metadata."""
-
-    @classmethod
-    def is_lc_serializable(cls) -> bool:
-        return False
-
-    @model_validator(mode="before")
-    @classmethod
-    def calculate_hashes(cls, values: dict[str, Any]) -> Any:
-        """Root validator to calculate content and metadata hash."""
-        content = values.get("page_content", "")
-        metadata = values.get("metadata", {})
-
-        forbidden_keys = ("hash_", "content_hash", "metadata_hash")
-
-        for key in forbidden_keys:
-            if key in metadata:
-                msg = (
-                    f"Metadata cannot contain key {key} as it "
-                    f"is reserved for internal use."
-                )
-                raise ValueError(msg)
-
-        content_hash = str(_hash_string_to_uuid(content))
-
-        try:
-            metadata_hash = str(_hash_nested_dict_to_uuid(metadata))
-        except Exception as e:
-            msg = (
-                f"Failed to hash metadata: {e}. "
-                f"Please use a dict that can be serialized using json."
-            )
-            raise ValueError(msg) from e
-
-        values["content_hash"] = content_hash
-        values["metadata_hash"] = metadata_hash
-        values["hash_"] = str(_hash_string_to_uuid(content_hash + metadata_hash))
-
-        _uid = values.get("uid")
-
-        if _uid is None:
-            values["uid"] = values["hash_"]
-        return values
-
-    def to_document(self) -> Document:
-        """Return a Document object."""
-        return Document(
-            id=self.uid,
-            page_content=self.page_content,
-            metadata=self.metadata,
-        )
-
-    @classmethod
-    def from_document(
-        cls, document: Document, *, uid: Optional[str] = None
-    ) -> _HashedDocument:
-        """Create a HashedDocument from a Document."""
-        return cls(  # type: ignore[call-arg]
-            uid=uid,  # type: ignore[arg-type]
-            page_content=document.page_content,
-            metadata=document.metadata,
-        )
+    return str(uuid.uuid5(NAMESPACE_UUID, hash_value))
+
+
+_WARNED_ABOUT_SHA1: bool = False
+
+
+def _warn_about_sha1() -> None:
+    """Emit a one-time warning about SHA-1 collision weaknesses."""
+    # Global variable OK in this case
+    global _WARNED_ABOUT_SHA1  # noqa: PLW0603
+    if not _WARNED_ABOUT_SHA1:
+        warnings.warn(
+            "Using SHA-1 for document hashing. SHA-1 is *not* "
+            "collision-resistant; a motivated attacker can construct distinct inputs "
+            "that map to the same fingerprint. If this matters in your "
+            "threat model, switch to a stronger algorithm such "
+            "as 'blake2b', 'sha256', or 'sha512' by specifying "
+            "the `key_encoder` parameter in the `index` or `aindex` function.",
+            category=UserWarning,
+            stacklevel=2,
+        )
+        _WARNED_ABOUT_SHA1 = True
+
+
+def _hash_string(
+    input_string: str, *, algorithm: Literal["sha1", "sha256", "sha512", "blake2b"]
+) -> uuid.UUID:
+    """Hash *input_string* to a deterministic UUID using the configured algorithm."""
+    if algorithm == "sha1":
+        _warn_about_sha1()
+    hash_value = _calculate_hash(input_string, algorithm)
+    return uuid.uuid5(NAMESPACE_UUID, hash_value)
+
+
+def _hash_nested_dict(
+    data: dict[Any, Any], *, algorithm: Literal["sha1", "sha256", "sha512", "blake2b"]
+) -> uuid.UUID:
+    """Hash a nested dictionary to a UUID using the configured algorithm."""
+    serialized_data = json.dumps(data, sort_keys=True)
+    return _hash_string(serialized_data, algorithm=algorithm)
 
 
 def _batch(size: int, iterable: Iterable[T]) -> Iterator[list[T]]:
@@ -168,14 +124,16 @@ def _get_source_id_assigner(
 
 
 def _deduplicate_in_order(
-    hashed_documents: Iterable[_HashedDocument],
-) -> Iterator[_HashedDocument]:
+    hashed_documents: Iterable[Document],
+) -> Iterator[Document]:
     """Deduplicate a list of hashed documents while preserving order."""
     seen: set[str] = set()
 
     for hashed_doc in hashed_documents:
-        if hashed_doc.hash_ not in seen:
-            seen.add(hashed_doc.hash_)
+        if hashed_doc.id not in seen:
+            # At this stage, the id is guaranteed to be a string.
+            # Avoiding unnecessary run time checks.
+            seen.add(cast("str", hashed_doc.id))
             yield hashed_doc
 
 
@@ -183,6 +141,94 @@ class IndexingException(LangChainException):
     """Raised when an indexing operation fails."""
 
 
+def _calculate_hash(
+    text: str, algorithm: Literal["sha1", "sha256", "sha512", "blake2b"]
+) -> str:
+    """Return a hexadecimal digest of *text* using *algorithm*."""
+    if algorithm == "sha1":
+        # Calculate the SHA-1 hash and return it as a UUID.
+        digest = hashlib.sha1(text.encode("utf-8"), usedforsecurity=False).hexdigest()
+        return str(uuid.uuid5(NAMESPACE_UUID, digest))
+    if algorithm == "blake2b":
+        return hashlib.blake2b(text.encode("utf-8")).hexdigest()
+    if algorithm == "sha256":
+        return hashlib.sha256(text.encode("utf-8")).hexdigest()
+    if algorithm == "sha512":
+        return hashlib.sha512(text.encode("utf-8")).hexdigest()
+    msg = f"Unsupported hashing algorithm: {algorithm}"
+    raise ValueError(msg)
+
+
+def _get_document_with_hash(
+    document: Document,
+    *,
+    key_encoder: Union[
+        Callable[[Document], str], Literal["sha1", "sha256", "sha512", "blake2b"]
+    ],
+) -> Document:
+    """Calculate a hash of the document, and assign it to the uid.
+
+    When using one of the predefined hashing algorithms, the hash is calculated
+    by hashing the content and the metadata of the document.
+
+    Args:
+        document: Document to hash.
+        key_encoder: Hashing algorithm to use for hashing the document.
+            If not provided, a default encoder using SHA-1 will be used.
+            SHA-1 is not collision-resistant, and a motivated attacker
+            could craft two different texts that hash to the
+            same cache key.
+
+            New applications should use one of the alternative encoders
+            or provide a custom and strong key encoder function to avoid this risk.
+
+            When changing the key encoder, you must change the
+            index as well to avoid duplicated documents in the cache.
+
+    Returns:
+        Document with a unique identifier based on the hash of the content and metadata.
+    """
+    metadata: dict[str, Any] = dict(document.metadata or {})
+
+    if callable(key_encoder):
+        # If key_encoder is a callable, we use it to generate the hash.
+        hash_ = key_encoder(document)
+    else:
+        # The hashes are calculated separately for the content and the metadata.
+        content_hash = _calculate_hash(document.page_content, algorithm=key_encoder)
+        try:
+            serialized_meta = json.dumps(metadata, sort_keys=True)
+        except Exception as e:
+            msg = (
+                f"Failed to hash metadata: {e}. "
+                f"Please use a dict that can be serialized using json."
+            )
+            raise ValueError(msg) from e
+        metadata_hash = _calculate_hash(serialized_meta, algorithm=key_encoder)
+        hash_ = _calculate_hash(content_hash + metadata_hash, algorithm=key_encoder)
+
+    return Document(
+        # Assign a unique identifier based on the hash.
+        id=hash_,
+        page_content=document.page_content,
+        metadata=document.metadata,
+    )
+
+
+# This internal abstraction was imported by the langchain package internally, so
+# we keep it here for backwards compatibility.
+class _HashedDocument:
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        """Raise an error if this class is instantiated."""
+        msg = (
+            "_HashedDocument is an internal abstraction that was deprecated in "
+            "langchain-core 0.3.63. This abstraction is marked as private and "
+            "should not have been used directly. If you are seeing this error, "
+            "please update your code appropriately."
+        )
+        raise NotImplementedError(msg)
+
+
 def _delete(
     vector_store: Union[VectorStore, DocumentIndex],
     ids: list[str],
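As a rough illustration (not part of the diff) of how the predefined encoders above compose: the page content and the JSON-serialized, key-sorted metadata are hashed separately, then the two digests are concatenated and hashed again to form the document id. The helper name below is hypothetical; for "sha1" the real code additionally maps the digest to a UUIDv5, as shown above.

import hashlib
import json

def fingerprint(page_content: str, metadata: dict, algorithm: str = "sha256") -> str:
    # Mirrors _get_document_with_hash for the non-SHA-1 algorithms.
    hasher = getattr(hashlib, algorithm)  # e.g. hashlib.sha256, hashlib.blake2b
    content_hash = hasher(page_content.encode("utf-8")).hexdigest()
    metadata_hash = hasher(json.dumps(metadata, sort_keys=True).encode("utf-8")).hexdigest()
    # The combined digest becomes the deterministic document id.
    return hasher((content_hash + metadata_hash).encode("utf-8")).hexdigest()

print(fingerprint("Lorem ipsum dolor sit amet", {"key": "value"}))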
@@ -231,6 +277,9 @@ def index(
     source_id_key: Union[str, Callable[[Document], str], None] = None,
     cleanup_batch_size: int = 1_000,
     force_update: bool = False,
+    key_encoder: Union[
+        Literal["sha1", "sha256", "sha512", "blake2b"], Callable[[Document], str]
+    ] = "sha1",
     upsert_kwargs: Optional[dict[str, Any]] = None,
 ) -> IndexingResult:
     """Index data from the loader into the vector store.
@@ -291,6 +340,23 @@ def index(
         force_update: Force update documents even if they are present in the
             record manager. Useful if you are re-indexing with updated embeddings.
             Default is False.
+        key_encoder: Hashing algorithm to use for hashing the document content and
+            metadata. Default is "sha1".
+            Other options include "blake2b", "sha256", and "sha512".
+
+            .. versionadded:: 0.3.66
+
+        key_encoder: Hashing algorithm to use for hashing the document.
+            If not provided, a default encoder using SHA-1 will be used.
+            SHA-1 is not collision-resistant, and a motivated attacker
+            could craft two different texts that hash to the
+            same cache key.
+
+            New applications should use one of the alternative encoders
+            or provide a custom and strong key encoder function to avoid this risk.
+
+            When changing the key encoder, you must change the
+            index as well to avoid duplicated documents in the cache.
         upsert_kwargs: Additional keyword arguments to pass to the add_documents
             method of the VectorStore or the upsert method of the
             DocumentIndex. For example, you can use this to
@@ -313,6 +379,11 @@ def index(
 
         * Added `scoped_full` cleanup mode.
     """
+    # Behavior is deprecated, but we keep it for backwards compatibility.
+    # Warn only once per process.
+    if key_encoder == "sha1":
+        _warn_about_sha1()
+
     if cleanup not in {"incremental", "full", "scoped_full", None}:
         msg = (
             f"cleanup should be one of 'incremental', 'full', 'scoped_full' or None. "
@@ -375,12 +446,15 @@ def index(
     for doc_batch in _batch(batch_size, doc_iterator):
         hashed_docs = list(
             _deduplicate_in_order(
-                [_HashedDocument.from_document(doc) for doc in doc_batch]
+                [
+                    _get_document_with_hash(doc, key_encoder=key_encoder)
+                    for doc in doc_batch
+                ]
             )
         )
 
         source_ids: Sequence[Optional[str]] = [
-            source_id_assigner(doc) for doc in hashed_docs
+            source_id_assigner(hashed_doc) for hashed_doc in hashed_docs
         ]
 
         if cleanup in {"incremental", "scoped_full"}:
@@ -391,8 +465,8 @@ def index(
                         f"Source ids are required when cleanup mode is "
                         f"incremental or scoped_full. "
                         f"Document that starts with "
-                        f"content: {hashed_doc.page_content[:100]} was not assigned "
-                        f"as source id."
+                        f"content: {hashed_doc.page_content[:100]} "
+                        f"was not assigned as source id."
                     )
                     raise ValueError(msg)
             if cleanup == "scoped_full":
@@ -400,7 +474,9 @@ def index(
                 # source ids cannot be None after for loop above.
                 source_ids = cast("Sequence[str]", source_ids)
 
-        exists_batch = record_manager.exists([doc.uid for doc in hashed_docs])
+        exists_batch = record_manager.exists(
+            cast("Sequence[str]", [doc.id for doc in hashed_docs])
+        )
 
         # Filter out documents that already exist in the record store.
         uids = []
@@ -408,14 +484,15 @@ def index(
         uids_to_refresh = []
         seen_docs: set[str] = set()
         for hashed_doc, doc_exists in zip(hashed_docs, exists_batch):
+            hashed_id = cast("str", hashed_doc.id)
             if doc_exists:
                 if force_update:
-                    seen_docs.add(hashed_doc.uid)
+                    seen_docs.add(hashed_id)
                 else:
-                    uids_to_refresh.append(hashed_doc.uid)
+                    uids_to_refresh.append(hashed_id)
                     continue
-            uids.append(hashed_doc.uid)
-            docs_to_index.append(hashed_doc.to_document())
+            uids.append(hashed_id)
+            docs_to_index.append(hashed_doc)
 
         # Update refresh timestamp
         if uids_to_refresh:
@@ -445,7 +522,7 @@ def index(
         # Update ALL records, even if they already exist since we want to refresh
         # their timestamp.
         record_manager.update(
-            [doc.uid for doc in hashed_docs],
+            cast("Sequence[str]", [doc.id for doc in hashed_docs]),
             group_ids=source_ids,
             time_at_least=index_start_dt,
         )
@@ -453,7 +530,6 @@ def index(
         # If source IDs are provided, we can do the deletion incrementally!
         if cleanup == "incremental":
             # Get the uids of the documents that were not returned by the loader.
-
             # mypy isn't good enough to determine that source ids cannot be None
             # here due to a check that's happening above, so we check again.
             for source_id in source_ids:
@@ -537,6 +613,9 @@ async def aindex(
     source_id_key: Union[str, Callable[[Document], str], None] = None,
     cleanup_batch_size: int = 1_000,
     force_update: bool = False,
+    key_encoder: Union[
+        Literal["sha1", "sha256", "sha512", "blake2b"], Callable[[Document], str]
+    ] = "sha1",
     upsert_kwargs: Optional[dict[str, Any]] = None,
 ) -> IndexingResult:
     """Async index data from the loader into the vector store.
@@ -596,6 +675,17 @@ async def aindex(
         force_update: Force update documents even if they are present in the
             record manager. Useful if you are re-indexing with updated embeddings.
             Default is False.
+        key_encoder: Hashing algorithm to use for hashing the document.
+            If not provided, a default encoder using SHA-1 will be used.
+            SHA-1 is not collision-resistant, and a motivated attacker
+            could craft two different texts that hash to the
+            same cache key.
+
+            New applications should use one of the alternative encoders
+            or provide a custom and strong key encoder function to avoid this risk.
+
+            When changing the key encoder, you must change the
+            index as well to avoid duplicated documents in the cache.
         upsert_kwargs: Additional keyword arguments to pass to the aadd_documents
             method of the VectorStore or the aupsert method of the
             DocumentIndex. For example, you can use this to
@@ -618,6 +708,11 @@ async def aindex(
 
         * Added `scoped_full` cleanup mode.
     """
+    # Behavior is deprecated, but we keep it for backwards compatibility.
+    # Warn only once per process.
+    if key_encoder == "sha1":
+        _warn_about_sha1()
+
     if cleanup not in {"incremental", "full", "scoped_full", None}:
         msg = (
             f"cleanup should be one of 'incremental', 'full', 'scoped_full' or None. "
@@ -691,7 +786,10 @@ async def aindex(
     async for doc_batch in _abatch(batch_size, async_doc_iterator):
         hashed_docs = list(
             _deduplicate_in_order(
-                [_HashedDocument.from_document(doc) for doc in doc_batch]
+                [
+                    _get_document_with_hash(doc, key_encoder=key_encoder)
+                    for doc in doc_batch
+                ]
             )
         )
 
@@ -707,8 +805,8 @@ async def aindex(
                         f"Source ids are required when cleanup mode is "
                         f"incremental or scoped_full. "
                         f"Document that starts with "
-                        f"content: {hashed_doc.page_content[:100]} was not assigned "
-                        f"as source id."
+                        f"content: {hashed_doc.page_content[:100]} "
+                        f"was not assigned as source id."
                     )
                     raise ValueError(msg)
             if cleanup == "scoped_full":
@@ -716,7 +814,9 @@ async def aindex(
                 # source ids cannot be None after for loop above.
                 source_ids = cast("Sequence[str]", source_ids)
 
-        exists_batch = await record_manager.aexists([doc.uid for doc in hashed_docs])
+        exists_batch = await record_manager.aexists(
+            cast("Sequence[str]", [doc.id for doc in hashed_docs])
+        )
 
         # Filter out documents that already exist in the record store.
         uids: list[str] = []
@@ -724,14 +824,15 @@ async def aindex(
         uids_to_refresh = []
         seen_docs: set[str] = set()
         for hashed_doc, doc_exists in zip(hashed_docs, exists_batch):
+            hashed_id = cast("str", hashed_doc.id)
             if doc_exists:
                 if force_update:
-                    seen_docs.add(hashed_doc.uid)
+                    seen_docs.add(hashed_id)
                 else:
-                    uids_to_refresh.append(hashed_doc.uid)
+                    uids_to_refresh.append(hashed_id)
                     continue
-            uids.append(hashed_doc.uid)
-            docs_to_index.append(hashed_doc.to_document())
+            uids.append(hashed_id)
+            docs_to_index.append(hashed_doc)
 
         if uids_to_refresh:
             # Must be updated to refresh timestamp.
@@ -760,7 +861,7 @@ async def aindex(
         # Update ALL records, even if they already exist since we want to refresh
         # their timestamp.
         await record_manager.aupdate(
-            [doc.uid for doc in hashed_docs],
+            cast("Sequence[str]", [doc.id for doc in hashed_docs]),
             group_ids=source_ids,
             time_at_least=index_start_dt,
         )
@@ -1,50 +1,65 @@
-import pytest
+from typing import Literal
 
 from langchain_core.documents import Document
-from langchain_core.indexing.api import _HashedDocument
+from langchain_core.indexing.api import _get_document_with_hash
 
 
 def test_hashed_document_hashing() -> None:
-    hashed_document = _HashedDocument(  # type: ignore[call-arg]
+    document = Document(
         uid="123", page_content="Lorem ipsum dolor sit amet", metadata={"key": "value"}
     )
-    assert isinstance(hashed_document.hash_, str)
-
-
-def test_hashing_with_missing_content() -> None:
-    """Check that ValueError is raised if page_content is missing."""
-    with pytest.raises(TypeError):
-        _HashedDocument(
-            metadata={"key": "value"},
-        )  # type: ignore[call-arg]
-
-
-def test_uid_auto_assigned_to_hash() -> None:
-    """Test uid is auto-assigned to the hashed_document hash."""
-    hashed_document = _HashedDocument(  # type: ignore[call-arg]
-        page_content="Lorem ipsum dolor sit amet", metadata={"key": "value"}
-    )
-    assert hashed_document.uid == hashed_document.hash_
+    hashed_document = _get_document_with_hash(document, key_encoder="sha1")
+    assert isinstance(hashed_document.id, str)
 
 
 def test_to_document() -> None:
     """Test to_document method."""
-    hashed_document = _HashedDocument(  # type: ignore[call-arg]
+    original_doc = Document(
         page_content="Lorem ipsum dolor sit amet", metadata={"key": "value"}
     )
-    doc = hashed_document.to_document()
-    assert isinstance(doc, Document)
-    assert doc.page_content == "Lorem ipsum dolor sit amet"
-    assert doc.metadata == {"key": "value"}
+    hashed_doc = _get_document_with_hash(original_doc, key_encoder="sha1")
+    assert isinstance(hashed_doc, Document)
+    assert hashed_doc is not original_doc
+    assert hashed_doc.page_content == "Lorem ipsum dolor sit amet"
+    assert hashed_doc.metadata["key"] == "value"
 
 
-def test_from_document() -> None:
+def test_hashing() -> None:
     """Test from document class method."""
     document = Document(
         page_content="Lorem ipsum dolor sit amet", metadata={"key": "value"}
     )
-
-    hashed_document = _HashedDocument.from_document(document)
+    hashed_document = _get_document_with_hash(document, key_encoder="sha1")
     # hash should be deterministic
-    assert hashed_document.hash_ == "fd1dc827-051b-537d-a1fe-1fa043e8b276"
-    assert hashed_document.uid == hashed_document.hash_
+    assert hashed_document.id == "fd1dc827-051b-537d-a1fe-1fa043e8b276"
+
+    # Verify that hashing with sha1 is deterministic
+    another_hashed_document = _get_document_with_hash(document, key_encoder="sha1")
+    assert another_hashed_document.id == hashed_document.id
+
+    # Verify that the result is different from SHA256, SHA512, blake2b
+    values: list[Literal["sha256", "sha512", "blake2b"]] = [
+        "sha256",
+        "sha512",
+        "blake2b",
+    ]
+
+    for key_encoder in values:
+        different_hashed_document = _get_document_with_hash(
+            document, key_encoder=key_encoder
+        )
+        assert different_hashed_document.id != hashed_document.id
+
+
+def test_hashing_custom_key_encoder() -> None:
+    """Test hashing with a custom key encoder."""
+
+    def custom_key_encoder(doc: Document) -> str:
+        return f"quack-{doc.metadata['key']}"
+
+    document = Document(
+        page_content="Lorem ipsum dolor sit amet", metadata={"key": "like a duck"}
+    )
+    hashed_document = _get_document_with_hash(document, key_encoder=custom_key_encoder)
+    assert hashed_document.id == "quack-like a duck"
+    assert isinstance(hashed_document.id, str)
@@ -13,7 +13,11 @@ from langchain_core.document_loaders.base import BaseLoader
 from langchain_core.documents import Document
 from langchain_core.embeddings import DeterministicFakeEmbedding
 from langchain_core.indexing import InMemoryRecordManager, aindex, index
-from langchain_core.indexing.api import IndexingException, _abatch, _HashedDocument
+from langchain_core.indexing.api import (
+    IndexingException,
+    _abatch,
+    _get_document_with_hash,
+)
 from langchain_core.indexing.in_memory import InMemoryDocumentIndex
 from langchain_core.vectorstores import InMemoryVectorStore, VectorStore
 
@@ -2222,7 +2226,7 @@ def test_indexing_custom_batch_size(
             metadata={"source": "1"},
         ),
     ]
-    ids = [_HashedDocument.from_document(doc).uid for doc in docs]
+    ids = [_get_document_with_hash(doc, key_encoder="sha256").id for doc in docs]
 
     batch_size = 1
 
@@ -2232,7 +2236,13 @@ def test_indexing_custom_batch_size(
     mock_add_documents = MagicMock()
     vector_store.add_documents = mock_add_documents  # type: ignore[method-assign]
 
-    index(docs, record_manager, vector_store, batch_size=batch_size)
+    index(
+        docs,
+        record_manager,
+        vector_store,
+        batch_size=batch_size,
+        key_encoder="sha256",
+    )
     args, kwargs = mock_add_documents.call_args
     doc_with_id = Document(
         id=ids[0], page_content="This is a test document.", metadata={"source": "1"}
@@ -2253,7 +2263,7 @@ async def test_aindexing_custom_batch_size(
             metadata={"source": "1"},
         ),
     ]
-    ids = [_HashedDocument.from_document(doc).uid for doc in docs]
+    ids = [_get_document_with_hash(doc, key_encoder="sha256").id for doc in docs]
 
     batch_size = 1
     mock_add_documents = AsyncMock()
@@ -2261,7 +2271,9 @@ async def test_aindexing_custom_batch_size(
         id=ids[0], page_content="This is a test document.", metadata={"source": "1"}
     )
     vector_store.aadd_documents = mock_add_documents  # type: ignore[method-assign]
-    await aindex(docs, arecord_manager, vector_store, batch_size=batch_size)
+    await aindex(
+        docs, arecord_manager, vector_store, batch_size=batch_size, key_encoder="sha256"
+    )
     args, kwargs = mock_add_documents.call_args
     assert args == ([doc_with_id],)
    assert kwargs == {"ids": ids, "batch_size": batch_size}
@@ -1,50 +0,0 @@
-import pytest
-from langchain_core.documents import Document
-
-from langchain.indexes._api import _HashedDocument
-
-
-def test_hashed_document_hashing() -> None:
-    hashed_document = _HashedDocument(  # type: ignore[call-arg]
-        uid="123", page_content="Lorem ipsum dolor sit amet", metadata={"key": "value"}
-    )
-    assert isinstance(hashed_document.hash_, str)
-
-
-def test_hashing_with_missing_content() -> None:
-    """Check that ValueError is raised if page_content is missing."""
-    with pytest.raises(TypeError):
-        _HashedDocument(  # type: ignore[call-arg]
-            metadata={"key": "value"},
-        )
-
-
-def test_uid_auto_assigned_to_hash() -> None:
-    """Test uid is auto-assigned to the hashed_document hash."""
-    hashed_document = _HashedDocument(  # type: ignore[call-arg]
-        page_content="Lorem ipsum dolor sit amet", metadata={"key": "value"}
-    )
-    assert hashed_document.uid == hashed_document.hash_
-
-
-def test_to_document() -> None:
-    """Test to_document method."""
-    hashed_document = _HashedDocument(  # type: ignore[call-arg]
-        page_content="Lorem ipsum dolor sit amet", metadata={"key": "value"}
-    )
-    doc = hashed_document.to_document()
-    assert isinstance(doc, Document)
-    assert doc.page_content == "Lorem ipsum dolor sit amet"
-    assert doc.metadata == {"key": "value"}
-
-
-def test_from_document() -> None:
-    """Test from document class method."""
-    document = Document(
-        page_content="Lorem ipsum dolor sit amet", metadata={"key": "value"}
-    )
-
-    hashed_document = _HashedDocument.from_document(document)
-    # hash should be deterministic
-    assert hashed_document.hash_ == "fd1dc827-051b-537d-a1fe-1fa043e8b276"
-    assert hashed_document.uid == hashed_document.hash_
@@ -11,10 +11,10 @@ import pytest_asyncio
 from langchain_core.document_loaders import BaseLoader
 from langchain_core.documents import Document
 from langchain_core.embeddings import Embeddings
+from langchain_core.indexing.api import _abatch, _get_document_with_hash
 from langchain_core.vectorstores import VST, VectorStore
 
 from langchain.indexes import aindex, index
-from langchain.indexes._api import _abatch, _HashedDocument
 from langchain.indexes._sql_record_manager import SQLRecordManager
 
 
@@ -1374,11 +1374,17 @@ def test_indexing_custom_batch_size(
             metadata={"source": "1"},
         ),
     ]
-    ids = [_HashedDocument.from_document(doc).uid for doc in docs]
+    ids = [_get_document_with_hash(doc, key_encoder="sha256").id for doc in docs]
 
     batch_size = 1
     with patch.object(vector_store, "add_documents") as mock_add_documents:
-        index(docs, record_manager, vector_store, batch_size=batch_size)
+        index(
+            docs,
+            record_manager,
+            vector_store,
+            batch_size=batch_size,
+            key_encoder="sha256",
+        )
     args, kwargs = mock_add_documents.call_args
     docs_with_id = [
         Document(
@@ -1402,11 +1408,17 @@ async def test_aindexing_custom_batch_size(
             metadata={"source": "1"},
         ),
     ]
-    ids = [_HashedDocument.from_document(doc).uid for doc in docs]
+    ids = [_get_document_with_hash(doc, key_encoder="sha256").id for doc in docs]
 
     batch_size = 1
     with patch.object(vector_store, "aadd_documents") as mock_add_documents:
-        await aindex(docs, arecord_manager, vector_store, batch_size=batch_size)
+        await aindex(
+            docs,
+            arecord_manager,
+            vector_store,
+            batch_size=batch_size,
+            key_encoder="sha256",
+        )
     args, kwargs = mock_add_documents.call_args
     docs_with_id = [
         Document(