Mirror of https://github.com/hwchase17/langchain.git (synced 2025-06-26 08:33:49 +00:00)
core[patch]: Add additional hashing options to indexing API, warn on SHA-1 (#31649)
Add additional hashing options to the indexing API, and warn on SHA-1.

Requires:
- Bumping langchain-core version
- Bumping min langchain-core in langchain

Co-authored-by: ccurme <chester.curme@gmail.com>

parent cc4f5269b1
commit 9164e6f906
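For orientation, here is a minimal, hedged usage sketch of the new parameter. The record manager, vector store, namespace, and documents below are stand-ins chosen to keep the example self-contained; they are not part of this commit.

from langchain_core.documents import Document
from langchain_core.embeddings import DeterministicFakeEmbedding
from langchain_core.indexing import InMemoryRecordManager, index
from langchain_core.vectorstores import InMemoryVectorStore

# Stand-in components; any VectorStore / RecordManager pair is used the same way.
record_manager = InMemoryRecordManager(namespace="demo")
vector_store = InMemoryVectorStore(DeterministicFakeEmbedding(size=16))
docs = [Document(page_content="hello", metadata={"source": "1"})]

# Selecting key_encoder="sha256" opts out of the default SHA-1 hashing,
# so the new one-time UserWarning is never emitted.
result = index(docs, record_manager, vector_store, key_encoder="sha256")
print(result)  # counts of added / updated / skipped / deleted documents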
@@ -5,6 +5,7 @@ from __future__ import annotations
import hashlib
import json
import uuid
import warnings
from collections.abc import AsyncIterable, AsyncIterator, Iterable, Iterator, Sequence
from itertools import islice
from typing import (

@@ -18,8 +19,6 @@ from typing import (
    cast,
)

from pydantic import model_validator

from langchain_core.document_loaders.base import BaseLoader
from langchain_core.documents import Document
from langchain_core.exceptions import LangChainException

@@ -35,94 +34,51 @@ NAMESPACE_UUID = uuid.UUID(int=1984)
T = TypeVar("T")


def _hash_string_to_uuid(input_string: str) -> uuid.UUID:
def _hash_string_to_uuid(input_string: str) -> str:
    """Hashes a string and returns the corresponding UUID."""
    hash_value = hashlib.sha1(
        input_string.encode("utf-8"), usedforsecurity=False
    ).hexdigest()
    return str(uuid.uuid5(NAMESPACE_UUID, hash_value))


_WARNED_ABOUT_SHA1: bool = False


def _warn_about_sha1() -> None:
    """Emit a one-time warning about SHA-1 collision weaknesses."""
    # Global variable OK in this case
    global _WARNED_ABOUT_SHA1  # noqa: PLW0603
    if not _WARNED_ABOUT_SHA1:
        warnings.warn(
            "Using SHA-1 for document hashing. SHA-1 is *not* "
            "collision-resistant; a motivated attacker can construct distinct inputs "
            "that map to the same fingerprint. If this matters in your "
            "threat model, switch to a stronger algorithm such "
            "as 'blake2b', 'sha256', or 'sha512' by specifying "
            "the `key_encoder` parameter in the `index` or `aindex` function.",
            category=UserWarning,
            stacklevel=2,
        )
        _WARNED_ABOUT_SHA1 = True


def _hash_string(
    input_string: str, *, algorithm: Literal["sha1", "sha256", "sha512", "blake2b"]
) -> uuid.UUID:
    """Hash *input_string* to a deterministic UUID using the configured algorithm."""
    if algorithm == "sha1":
        _warn_about_sha1()
    hash_value = _calculate_hash(input_string, algorithm)
    return uuid.uuid5(NAMESPACE_UUID, hash_value)


def _hash_nested_dict_to_uuid(data: dict[Any, Any]) -> uuid.UUID:
    """Hashes a nested dictionary and returns the corresponding UUID."""
def _hash_nested_dict(
    data: dict[Any, Any], *, algorithm: Literal["sha1", "sha256", "sha512", "blake2b"]
) -> uuid.UUID:
    """Hash a nested dictionary to a UUID using the configured algorithm."""
    serialized_data = json.dumps(data, sort_keys=True)
    hash_value = hashlib.sha1(
        serialized_data.encode("utf-8"), usedforsecurity=False
    ).hexdigest()
    return uuid.uuid5(NAMESPACE_UUID, hash_value)


class _HashedDocument(Document):
    """A hashed document with a unique ID."""

    uid: str
    hash_: str
    """The hash of the document including content and metadata."""
    content_hash: str
    """The hash of the document content."""
    metadata_hash: str
    """The hash of the document metadata."""

    @classmethod
    def is_lc_serializable(cls) -> bool:
        return False

    @model_validator(mode="before")
    @classmethod
    def calculate_hashes(cls, values: dict[str, Any]) -> Any:
        """Root validator to calculate content and metadata hash."""
        content = values.get("page_content", "")
        metadata = values.get("metadata", {})

        forbidden_keys = ("hash_", "content_hash", "metadata_hash")

        for key in forbidden_keys:
            if key in metadata:
                msg = (
                    f"Metadata cannot contain key {key} as it "
                    f"is reserved for internal use."
                )
                raise ValueError(msg)

        content_hash = str(_hash_string_to_uuid(content))

        try:
            metadata_hash = str(_hash_nested_dict_to_uuid(metadata))
        except Exception as e:
            msg = (
                f"Failed to hash metadata: {e}. "
                f"Please use a dict that can be serialized using json."
            )
            raise ValueError(msg) from e

        values["content_hash"] = content_hash
        values["metadata_hash"] = metadata_hash
        values["hash_"] = str(_hash_string_to_uuid(content_hash + metadata_hash))

        _uid = values.get("uid")

        if _uid is None:
            values["uid"] = values["hash_"]
        return values

    def to_document(self) -> Document:
        """Return a Document object."""
        return Document(
            id=self.uid,
            page_content=self.page_content,
            metadata=self.metadata,
        )

    @classmethod
    def from_document(
        cls, document: Document, *, uid: Optional[str] = None
    ) -> _HashedDocument:
        """Create a HashedDocument from a Document."""
        return cls(  # type: ignore[call-arg]
            uid=uid,  # type: ignore[arg-type]
            page_content=document.page_content,
            metadata=document.metadata,
        )
    return _hash_string(serialized_data, algorithm=algorithm)


def _batch(size: int, iterable: Iterable[T]) -> Iterator[list[T]]:

@@ -168,14 +124,16 @@ def _get_source_id_assigner(


def _deduplicate_in_order(
    hashed_documents: Iterable[_HashedDocument],
) -> Iterator[_HashedDocument]:
    hashed_documents: Iterable[Document],
) -> Iterator[Document]:
    """Deduplicate a list of hashed documents while preserving order."""
    seen: set[str] = set()

    for hashed_doc in hashed_documents:
        if hashed_doc.hash_ not in seen:
            seen.add(hashed_doc.hash_)
        if hashed_doc.id not in seen:
            # At this stage, the id is guaranteed to be a string.
            # Avoiding unnecessary run time checks.
            seen.add(cast("str", hashed_doc.id))
            yield hashed_doc


@@ -183,6 +141,94 @@ class IndexingException(LangChainException):
    """Raised when an indexing operation fails."""


def _calculate_hash(
    text: str, algorithm: Literal["sha1", "sha256", "sha512", "blake2b"]
) -> str:
    """Return a hexadecimal digest of *text* using *algorithm*."""
    if algorithm == "sha1":
        # Calculate the SHA-1 hash and return it as a UUID.
        digest = hashlib.sha1(text.encode("utf-8"), usedforsecurity=False).hexdigest()
        return str(uuid.uuid5(NAMESPACE_UUID, digest))
    if algorithm == "blake2b":
        return hashlib.blake2b(text.encode("utf-8")).hexdigest()
    if algorithm == "sha256":
        return hashlib.sha256(text.encode("utf-8")).hexdigest()
    if algorithm == "sha512":
        return hashlib.sha512(text.encode("utf-8")).hexdigest()
    msg = f"Unsupported hashing algorithm: {algorithm}"
    raise ValueError(msg)


def _get_document_with_hash(
    document: Document,
    *,
    key_encoder: Union[
        Callable[[Document], str], Literal["sha1", "sha256", "sha512", "blake2b"]
    ],
) -> Document:
    """Calculate a hash of the document, and assign it to the uid.

    When using one of the predefined hashing algorithms, the hash is calculated
    by hashing the content and the metadata of the document.

    Args:
        document: Document to hash.
        key_encoder: Hashing algorithm to use for hashing the document.
            If not provided, a default encoder using SHA-1 will be used.
            SHA-1 is not collision-resistant, and a motivated attacker
            could craft two different texts that hash to the
            same cache key.

            New applications should use one of the alternative encoders
            or provide a custom and strong key encoder function to avoid this risk.

            When changing the key encoder, you must change the
            index as well to avoid duplicated documents in the cache.

    Returns:
        Document with a unique identifier based on the hash of the content and metadata.
    """
    metadata: dict[str, Any] = dict(document.metadata or {})

    if callable(key_encoder):
        # If key_encoder is a callable, we use it to generate the hash.
        hash_ = key_encoder(document)
    else:
        # The hashes are calculated separately for the content and the metadata.
        content_hash = _calculate_hash(document.page_content, algorithm=key_encoder)
        try:
            serialized_meta = json.dumps(metadata, sort_keys=True)
        except Exception as e:
            msg = (
                f"Failed to hash metadata: {e}. "
                f"Please use a dict that can be serialized using json."
            )
            raise ValueError(msg) from e
        metadata_hash = _calculate_hash(serialized_meta, algorithm=key_encoder)
        hash_ = _calculate_hash(content_hash + metadata_hash, algorithm=key_encoder)

    return Document(
        # Assign a unique identifier based on the hash.
        id=hash_,
        page_content=document.page_content,
        metadata=document.metadata,
    )


# This internal abstraction was imported by the langchain package internally, so
# we keep it here for backwards compatibility.
class _HashedDocument:
    def __init__(self, *args: Any, **kwargs: Any) -> None:
        """Raise an error if this class is instantiated."""
        msg = (
            "_HashedDocument is an internal abstraction that was deprecated in "
            "langchain-core 0.3.63. This abstraction is marked as private and "
            "should not have been used directly. If you are seeing this error, please "
            "update your code appropriately."
        )
        raise NotImplementedError(msg)


def _delete(
    vector_store: Union[VectorStore, DocumentIndex],
    ids: list[str],

@@ -231,6 +277,9 @@ def index(
    source_id_key: Union[str, Callable[[Document], str], None] = None,
    cleanup_batch_size: int = 1_000,
    force_update: bool = False,
    key_encoder: Union[
        Literal["sha1", "sha256", "sha512", "blake2b"], Callable[[Document], str]
    ] = "sha1",
    upsert_kwargs: Optional[dict[str, Any]] = None,
) -> IndexingResult:
    """Index data from the loader into the vector store.

@@ -291,6 +340,23 @@ def index(
        force_update: Force update documents even if they are present in the
            record manager. Useful if you are re-indexing with updated embeddings.
            Default is False.
        key_encoder: Hashing algorithm to use for hashing the document content and
            metadata. Default is "sha1".
            Other options include "blake2b", "sha256", and "sha512".

            .. versionadded:: 0.3.66

        key_encoder: Hashing algorithm to use for hashing the document.
            If not provided, a default encoder using SHA-1 will be used.
            SHA-1 is not collision-resistant, and a motivated attacker
            could craft two different texts that hash to the
            same cache key.

            New applications should use one of the alternative encoders
            or provide a custom and strong key encoder function to avoid this risk.

            When changing the key encoder, you must change the
            index as well to avoid duplicated documents in the cache.
        upsert_kwargs: Additional keyword arguments to pass to the add_documents
            method of the VectorStore or the upsert method of the
            DocumentIndex. For example, you can use this to

@@ -313,6 +379,11 @@ def index(

        * Added `scoped_full` cleanup mode.
    """
    # Behavior is deprecated, but we keep it for backwards compatibility.
    # Warn only once per process.
    if key_encoder == "sha1":
        _warn_about_sha1()

    if cleanup not in {"incremental", "full", "scoped_full", None}:
        msg = (
            f"cleanup should be one of 'incremental', 'full', 'scoped_full' or None. "

@@ -375,12 +446,15 @@ def index(
    for doc_batch in _batch(batch_size, doc_iterator):
        hashed_docs = list(
            _deduplicate_in_order(
                [_HashedDocument.from_document(doc) for doc in doc_batch]
                [
                    _get_document_with_hash(doc, key_encoder=key_encoder)
                    for doc in doc_batch
                ]
            )
        )

        source_ids: Sequence[Optional[str]] = [
            source_id_assigner(doc) for doc in hashed_docs
            source_id_assigner(hashed_doc) for hashed_doc in hashed_docs
        ]

        if cleanup in {"incremental", "scoped_full"}:

@@ -391,8 +465,8 @@ def index(
                        f"Source ids are required when cleanup mode is "
                        f"incremental or scoped_full. "
                        f"Document that starts with "
                        f"content: {hashed_doc.page_content[:100]} was not assigned "
                        f"as source id."
                        f"content: {hashed_doc.page_content[:100]} "
                        f"was not assigned as source id."
                    )
                    raise ValueError(msg)
            if cleanup == "scoped_full":

@@ -400,7 +474,9 @@ def index(
            # source ids cannot be None after for loop above.
            source_ids = cast("Sequence[str]", source_ids)

        exists_batch = record_manager.exists([doc.uid for doc in hashed_docs])
        exists_batch = record_manager.exists(
            cast("Sequence[str]", [doc.id for doc in hashed_docs])
        )

        # Filter out documents that already exist in the record store.
        uids = []

@@ -408,14 +484,15 @@ def index(
        uids_to_refresh = []
        seen_docs: set[str] = set()
        for hashed_doc, doc_exists in zip(hashed_docs, exists_batch):
            hashed_id = cast("str", hashed_doc.id)
            if doc_exists:
                if force_update:
                    seen_docs.add(hashed_doc.uid)
                    seen_docs.add(hashed_id)
                else:
                    uids_to_refresh.append(hashed_doc.uid)
                    uids_to_refresh.append(hashed_id)
                continue
            uids.append(hashed_doc.uid)
            docs_to_index.append(hashed_doc.to_document())
            uids.append(hashed_id)
            docs_to_index.append(hashed_doc)

        # Update refresh timestamp
        if uids_to_refresh:

@@ -445,7 +522,7 @@ def index(
            # Update ALL records, even if they already exist since we want to refresh
            # their timestamp.
            record_manager.update(
                [doc.uid for doc in hashed_docs],
                cast("Sequence[str]", [doc.id for doc in hashed_docs]),
                group_ids=source_ids,
                time_at_least=index_start_dt,
            )

@@ -453,7 +530,6 @@ def index(
        # If source IDs are provided, we can do the deletion incrementally!
        if cleanup == "incremental":
            # Get the uids of the documents that were not returned by the loader.

            # mypy isn't good enough to determine that source ids cannot be None
            # here due to a check that's happening above, so we check again.
            for source_id in source_ids:

@@ -537,6 +613,9 @@ async def aindex(
    source_id_key: Union[str, Callable[[Document], str], None] = None,
    cleanup_batch_size: int = 1_000,
    force_update: bool = False,
    key_encoder: Union[
        Literal["sha1", "sha256", "sha512", "blake2b"], Callable[[Document], str]
    ] = "sha1",
    upsert_kwargs: Optional[dict[str, Any]] = None,
) -> IndexingResult:
    """Async index data from the loader into the vector store.

@@ -596,6 +675,17 @@ async def aindex(
        force_update: Force update documents even if they are present in the
            record manager. Useful if you are re-indexing with updated embeddings.
            Default is False.
        key_encoder: Hashing algorithm to use for hashing the document.
            If not provided, a default encoder using SHA-1 will be used.
            SHA-1 is not collision-resistant, and a motivated attacker
            could craft two different texts that hash to the
            same cache key.

            New applications should use one of the alternative encoders
            or provide a custom and strong key encoder function to avoid this risk.

            When changing the key encoder, you must change the
            index as well to avoid duplicated documents in the cache.
        upsert_kwargs: Additional keyword arguments to pass to the aadd_documents
            method of the VectorStore or the aupsert method of the
            DocumentIndex. For example, you can use this to

@@ -618,6 +708,11 @@ async def aindex(

        * Added `scoped_full` cleanup mode.
    """
    # Behavior is deprecated, but we keep it for backwards compatibility.
    # Warn only once per process.
    if key_encoder == "sha1":
        _warn_about_sha1()

    if cleanup not in {"incremental", "full", "scoped_full", None}:
        msg = (
            f"cleanup should be one of 'incremental', 'full', 'scoped_full' or None. "

@@ -691,7 +786,10 @@ async def aindex(
    async for doc_batch in _abatch(batch_size, async_doc_iterator):
        hashed_docs = list(
            _deduplicate_in_order(
                [_HashedDocument.from_document(doc) for doc in doc_batch]
                [
                    _get_document_with_hash(doc, key_encoder=key_encoder)
                    for doc in doc_batch
                ]
            )
        )

@@ -707,8 +805,8 @@ async def aindex(
                        f"Source ids are required when cleanup mode is "
                        f"incremental or scoped_full. "
                        f"Document that starts with "
                        f"content: {hashed_doc.page_content[:100]} was not assigned "
                        f"as source id."
                        f"content: {hashed_doc.page_content[:100]} "
                        f"was not assigned as source id."
                    )
                    raise ValueError(msg)
            if cleanup == "scoped_full":

@@ -716,7 +814,9 @@ async def aindex(
            # source ids cannot be None after for loop above.
            source_ids = cast("Sequence[str]", source_ids)

        exists_batch = await record_manager.aexists([doc.uid for doc in hashed_docs])
        exists_batch = await record_manager.aexists(
            cast("Sequence[str]", [doc.id for doc in hashed_docs])
        )

        # Filter out documents that already exist in the record store.
        uids: list[str] = []

@@ -724,14 +824,15 @@ async def aindex(
        uids_to_refresh = []
        seen_docs: set[str] = set()
        for hashed_doc, doc_exists in zip(hashed_docs, exists_batch):
            hashed_id = cast("str", hashed_doc.id)
            if doc_exists:
                if force_update:
                    seen_docs.add(hashed_doc.uid)
                    seen_docs.add(hashed_id)
                else:
                    uids_to_refresh.append(hashed_doc.uid)
                    uids_to_refresh.append(hashed_id)
                continue
            uids.append(hashed_doc.uid)
            docs_to_index.append(hashed_doc.to_document())
            uids.append(hashed_id)
            docs_to_index.append(hashed_doc)

        if uids_to_refresh:
            # Must be updated to refresh timestamp.

@@ -760,7 +861,7 @@ async def aindex(
            # Update ALL records, even if they already exist since we want to refresh
            # their timestamp.
            await record_manager.aupdate(
                [doc.uid for doc in hashed_docs],
                cast("Sequence[str]", [doc.id for doc in hashed_docs]),
                group_ids=source_ids,
                time_at_least=index_start_dt,
            )

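Besides the named algorithms, `key_encoder` also accepts a callable, as the tests below exercise. A hedged sketch of one possible custom encoder follows; the helper name and the reliance on a `source` metadata field are illustrative assumptions, not part of this commit.

import hashlib

from langchain_core.documents import Document


def source_prefixed_key_encoder(doc: Document) -> str:
    # Illustrative only: prefix a BLAKE2b digest of the content with the
    # document's source metadata. Any callable returning a str is accepted.
    digest = hashlib.blake2b(doc.page_content.encode("utf-8")).hexdigest()
    return f"{doc.metadata.get('source', 'unknown')}-{digest[:16]}"


# Used exactly like the string options:
# index(docs, record_manager, vector_store, key_encoder=source_prefixed_key_encoder)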
@@ -1,50 +1,65 @@
import pytest
from typing import Literal

from langchain_core.documents import Document
from langchain_core.indexing.api import _HashedDocument
from langchain_core.indexing.api import _get_document_with_hash


def test_hashed_document_hashing() -> None:
    hashed_document = _HashedDocument(  # type: ignore[call-arg]
    document = Document(
        uid="123", page_content="Lorem ipsum dolor sit amet", metadata={"key": "value"}
    )
    assert isinstance(hashed_document.hash_, str)


def test_hashing_with_missing_content() -> None:
    """Check that ValueError is raised if page_content is missing."""
    with pytest.raises(TypeError):
        _HashedDocument(
            metadata={"key": "value"},
        )  # type: ignore[call-arg]


def test_uid_auto_assigned_to_hash() -> None:
    """Test uid is auto-assigned to the hashed_document hash."""
    hashed_document = _HashedDocument(  # type: ignore[call-arg]
        page_content="Lorem ipsum dolor sit amet", metadata={"key": "value"}
    )
    assert hashed_document.uid == hashed_document.hash_
    hashed_document = _get_document_with_hash(document, key_encoder="sha1")
    assert isinstance(hashed_document.id, str)


def test_to_document() -> None:
    """Test to_document method."""
    hashed_document = _HashedDocument(  # type: ignore[call-arg]
    original_doc = Document(
        page_content="Lorem ipsum dolor sit amet", metadata={"key": "value"}
    )
    doc = hashed_document.to_document()
    assert isinstance(doc, Document)
    assert doc.page_content == "Lorem ipsum dolor sit amet"
    assert doc.metadata == {"key": "value"}
    hashed_doc = _get_document_with_hash(original_doc, key_encoder="sha1")
    assert isinstance(hashed_doc, Document)
    assert hashed_doc is not original_doc
    assert hashed_doc.page_content == "Lorem ipsum dolor sit amet"
    assert hashed_doc.metadata["key"] == "value"


def test_from_document() -> None:
def test_hashing() -> None:
    """Test from document class method."""
    document = Document(
        page_content="Lorem ipsum dolor sit amet", metadata={"key": "value"}
    )

    hashed_document = _HashedDocument.from_document(document)
    hashed_document = _get_document_with_hash(document, key_encoder="sha1")
    # hash should be deterministic
    assert hashed_document.hash_ == "fd1dc827-051b-537d-a1fe-1fa043e8b276"
    assert hashed_document.uid == hashed_document.hash_
    assert hashed_document.id == "fd1dc827-051b-537d-a1fe-1fa043e8b276"

    # Verify that hashing with sha1 is deterministic
    another_hashed_document = _get_document_with_hash(document, key_encoder="sha1")
    assert another_hashed_document.id == hashed_document.id

    # Verify that the result is different from SHA256, SHA512, blake2b
    values: list[Literal["sha256", "sha512", "blake2b"]] = [
        "sha256",
        "sha512",
        "blake2b",
    ]

    for key_encoder in values:
        different_hashed_document = _get_document_with_hash(
            document, key_encoder=key_encoder
        )
        assert different_hashed_document.id != hashed_document.id


def test_hashing_custom_key_encoder() -> None:
    """Test hashing with a custom key encoder."""

    def custom_key_encoder(doc: Document) -> str:
        return f"quack-{doc.metadata['key']}"

    document = Document(
        page_content="Lorem ipsum dolor sit amet", metadata={"key": "like a duck"}
    )
    hashed_document = _get_document_with_hash(document, key_encoder=custom_key_encoder)
    assert hashed_document.id == "quack-like a duck"
    assert isinstance(hashed_document.id, str)

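As a side note, here is a rough sketch (not part of this commit) of how the one-shot nature of the SHA-1 warning could be observed. It assumes it is acceptable to reset the private `_WARNED_ABOUT_SHA1` flag, which is an internal detail rather than a public API, and it reuses in-memory stand-in components.

import warnings

import langchain_core.indexing.api as indexing_api
from langchain_core.documents import Document
from langchain_core.embeddings import DeterministicFakeEmbedding
from langchain_core.indexing import InMemoryRecordManager, index
from langchain_core.vectorstores import InMemoryVectorStore


def test_sha1_default_warns_once() -> None:
    indexing_api._WARNED_ABOUT_SHA1 = False  # reset the module-level flag for the test
    record_manager = InMemoryRecordManager(namespace="demo")
    vector_store = InMemoryVectorStore(DeterministicFakeEmbedding(size=8))
    docs = [Document(page_content="hello", metadata={"source": "1"})]

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        index(docs, record_manager, vector_store)  # default key_encoder="sha1" warns
        index(docs, record_manager, vector_store)  # flag is set, so no second warning
    assert sum("SHA-1" in str(w.message) for w in caught) == 1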
@@ -13,7 +13,11 @@ from langchain_core.document_loaders.base import BaseLoader
from langchain_core.documents import Document
from langchain_core.embeddings import DeterministicFakeEmbedding
from langchain_core.indexing import InMemoryRecordManager, aindex, index
from langchain_core.indexing.api import IndexingException, _abatch, _HashedDocument
from langchain_core.indexing.api import (
    IndexingException,
    _abatch,
    _get_document_with_hash,
)
from langchain_core.indexing.in_memory import InMemoryDocumentIndex
from langchain_core.vectorstores import InMemoryVectorStore, VectorStore

@@ -2222,7 +2226,7 @@ def test_indexing_custom_batch_size(
            metadata={"source": "1"},
        ),
    ]
    ids = [_HashedDocument.from_document(doc).uid for doc in docs]
    ids = [_get_document_with_hash(doc, key_encoder="sha256").id for doc in docs]

    batch_size = 1

@@ -2232,7 +2236,13 @@ def test_indexing_custom_batch_size(
    mock_add_documents = MagicMock()
    vector_store.add_documents = mock_add_documents  # type: ignore[method-assign]

    index(docs, record_manager, vector_store, batch_size=batch_size)
    index(
        docs,
        record_manager,
        vector_store,
        batch_size=batch_size,
        key_encoder="sha256",
    )
    args, kwargs = mock_add_documents.call_args
    doc_with_id = Document(
        id=ids[0], page_content="This is a test document.", metadata={"source": "1"}

@@ -2253,7 +2263,7 @@ async def test_aindexing_custom_batch_size(
            metadata={"source": "1"},
        ),
    ]
    ids = [_HashedDocument.from_document(doc).uid for doc in docs]
    ids = [_get_document_with_hash(doc, key_encoder="sha256").id for doc in docs]

    batch_size = 1
    mock_add_documents = AsyncMock()

@@ -2261,7 +2271,9 @@ async def test_aindexing_custom_batch_size(
        id=ids[0], page_content="This is a test document.", metadata={"source": "1"}
    )
    vector_store.aadd_documents = mock_add_documents  # type: ignore[method-assign]
    await aindex(docs, arecord_manager, vector_store, batch_size=batch_size)
    await aindex(
        docs, arecord_manager, vector_store, batch_size=batch_size, key_encoder="sha256"
    )
    args, kwargs = mock_add_documents.call_args
    assert args == ([doc_with_id],)
    assert kwargs == {"ids": ids, "batch_size": batch_size}

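The docstring caveat about changing the key encoder is easy to see concretely: the same document hashes to different ids under different encoders, so switching encoders on an existing index makes every previously indexed document look new. A small sketch, mirroring the assertions in test_hashing earlier in the diff:

from langchain_core.documents import Document
from langchain_core.indexing.api import _get_document_with_hash

doc = Document(page_content="Lorem ipsum dolor sit amet", metadata={"key": "value"})

sha1_id = _get_document_with_hash(doc, key_encoder="sha1").id
sha256_id = _get_document_with_hash(doc, key_encoder="sha256").id

# Different encoders yield different ids for identical content and metadata,
# which is why a re-index with a new key_encoder must also use a fresh index.
assert sha1_id != sha256_id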
@@ -1,50 +0,0 @@
import pytest
from langchain_core.documents import Document

from langchain.indexes._api import _HashedDocument


def test_hashed_document_hashing() -> None:
    hashed_document = _HashedDocument(  # type: ignore[call-arg]
        uid="123", page_content="Lorem ipsum dolor sit amet", metadata={"key": "value"}
    )
    assert isinstance(hashed_document.hash_, str)


def test_hashing_with_missing_content() -> None:
    """Check that ValueError is raised if page_content is missing."""
    with pytest.raises(TypeError):
        _HashedDocument(  # type: ignore[call-arg]
            metadata={"key": "value"},
        )


def test_uid_auto_assigned_to_hash() -> None:
    """Test uid is auto-assigned to the hashed_document hash."""
    hashed_document = _HashedDocument(  # type: ignore[call-arg]
        page_content="Lorem ipsum dolor sit amet", metadata={"key": "value"}
    )
    assert hashed_document.uid == hashed_document.hash_


def test_to_document() -> None:
    """Test to_document method."""
    hashed_document = _HashedDocument(  # type: ignore[call-arg]
        page_content="Lorem ipsum dolor sit amet", metadata={"key": "value"}
    )
    doc = hashed_document.to_document()
    assert isinstance(doc, Document)
    assert doc.page_content == "Lorem ipsum dolor sit amet"
    assert doc.metadata == {"key": "value"}


def test_from_document() -> None:
    """Test from document class method."""
    document = Document(
        page_content="Lorem ipsum dolor sit amet", metadata={"key": "value"}
    )

    hashed_document = _HashedDocument.from_document(document)
    # hash should be deterministic
    assert hashed_document.hash_ == "fd1dc827-051b-537d-a1fe-1fa043e8b276"
    assert hashed_document.uid == hashed_document.hash_

@@ -11,10 +11,10 @@ import pytest_asyncio
from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.indexing.api import _abatch, _get_document_with_hash
from langchain_core.vectorstores import VST, VectorStore

from langchain.indexes import aindex, index
from langchain.indexes._api import _abatch, _HashedDocument
from langchain.indexes._sql_record_manager import SQLRecordManager


@@ -1374,11 +1374,17 @@ def test_indexing_custom_batch_size(
            metadata={"source": "1"},
        ),
    ]
    ids = [_HashedDocument.from_document(doc).uid for doc in docs]
    ids = [_get_document_with_hash(doc, key_encoder="sha256").id for doc in docs]

    batch_size = 1
    with patch.object(vector_store, "add_documents") as mock_add_documents:
        index(docs, record_manager, vector_store, batch_size=batch_size)
        index(
            docs,
            record_manager,
            vector_store,
            batch_size=batch_size,
            key_encoder="sha256",
        )
    args, kwargs = mock_add_documents.call_args
    docs_with_id = [
        Document(

@@ -1402,11 +1408,17 @@ async def test_aindexing_custom_batch_size(
            metadata={"source": "1"},
        ),
    ]
    ids = [_HashedDocument.from_document(doc).uid for doc in docs]
    ids = [_get_document_with_hash(doc, key_encoder="sha256").id for doc in docs]

    batch_size = 1
    with patch.object(vector_store, "aadd_documents") as mock_add_documents:
        await aindex(docs, arecord_manager, vector_store, batch_size=batch_size)
        await aindex(
            docs,
            arecord_manager,
            vector_store,
            batch_size=batch_size,
            key_encoder="sha256",
        )
    args, kwargs = mock_add_documents.call_args
    docs_with_id = [
        Document(