core[patch]: Add additional hashing options to indexing API, warn on SHA-1 (#31649)

Add additional hashing options (SHA-256, SHA-512, blake2b, and custom key encoders) to the indexing API, and warn on SHA-1.

Requires:

- Bumping the langchain-core version
- Bumping the minimum langchain-core version pinned in langchain

---------

Co-authored-by: ccurme <chester.curme@gmail.com>
Author: Eugene Yurtsev
Date: 2025-06-24 14:44:06 -04:00
Committed by: GitHub
Parent: cc4f5269b1
Commit: 9164e6f906

5 changed files with 286 additions and 196 deletions
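
The diffs below only touch the tests, but they show the shape of the new API: index, aindex, and the internal _get_document_with_hash helper now take a key_encoder argument ("sha1", "sha256", "sha512", "blake2b", or a custom callable). A rough usage sketch, assuming the in-memory components that the updated tests import; the namespace, embedding size, and sample document are made up for illustration and this snippet is not part of the commit:

# Illustrative sketch -- mirrors the updated tests, not code from this commit.
from langchain_core.documents import Document
from langchain_core.embeddings import DeterministicFakeEmbedding
from langchain_core.indexing import InMemoryRecordManager, index
from langchain_core.vectorstores import InMemoryVectorStore

record_manager = InMemoryRecordManager(namespace="demo")  # hypothetical namespace
record_manager.create_schema()
vector_store = InMemoryVectorStore(embedding=DeterministicFakeEmbedding(size=8))

docs = [Document(page_content="This is a test document.", metadata={"source": "1"})]

# Passing key_encoder="sha256" opts out of the SHA-1 default that this change warns about.
result = index(docs, record_manager, vector_store, key_encoder="sha256")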

View File

@@ -1,50 +1,65 @@
-import pytest
+from typing import Literal
 
 from langchain_core.documents import Document
-from langchain_core.indexing.api import _HashedDocument
+from langchain_core.indexing.api import _get_document_with_hash
 
 
 def test_hashed_document_hashing() -> None:
-    hashed_document = _HashedDocument(  # type: ignore[call-arg]
+    document = Document(
         uid="123", page_content="Lorem ipsum dolor sit amet", metadata={"key": "value"}
     )
-    assert isinstance(hashed_document.hash_, str)
-
-
-def test_hashing_with_missing_content() -> None:
-    """Check that ValueError is raised if page_content is missing."""
-    with pytest.raises(TypeError):
-        _HashedDocument(
-            metadata={"key": "value"},
-        )  # type: ignore[call-arg]
-
-
-def test_uid_auto_assigned_to_hash() -> None:
-    """Test uid is auto-assigned to the hashed_document hash."""
-    hashed_document = _HashedDocument(  # type: ignore[call-arg]
-        page_content="Lorem ipsum dolor sit amet", metadata={"key": "value"}
-    )
-    assert hashed_document.uid == hashed_document.hash_
+    hashed_document = _get_document_with_hash(document, key_encoder="sha1")
+    assert isinstance(hashed_document.id, str)
 
 
 def test_to_document() -> None:
     """Test to_document method."""
-    hashed_document = _HashedDocument(  # type: ignore[call-arg]
+    original_doc = Document(
         page_content="Lorem ipsum dolor sit amet", metadata={"key": "value"}
     )
-    doc = hashed_document.to_document()
-    assert isinstance(doc, Document)
-    assert doc.page_content == "Lorem ipsum dolor sit amet"
-    assert doc.metadata == {"key": "value"}
+    hashed_doc = _get_document_with_hash(original_doc, key_encoder="sha1")
+    assert isinstance(hashed_doc, Document)
+    assert hashed_doc is not original_doc
+    assert hashed_doc.page_content == "Lorem ipsum dolor sit amet"
+    assert hashed_doc.metadata["key"] == "value"
 
 
-def test_from_document() -> None:
+def test_hashing() -> None:
     """Test from document class method."""
     document = Document(
         page_content="Lorem ipsum dolor sit amet", metadata={"key": "value"}
     )
 
-    hashed_document = _HashedDocument.from_document(document)
+    hashed_document = _get_document_with_hash(document, key_encoder="sha1")
     # hash should be deterministic
-    assert hashed_document.hash_ == "fd1dc827-051b-537d-a1fe-1fa043e8b276"
-    assert hashed_document.uid == hashed_document.hash_
+    assert hashed_document.id == "fd1dc827-051b-537d-a1fe-1fa043e8b276"
+
+    # Verify that hashing with sha1 is determinstic
+    another_hashed_document = _get_document_with_hash(document, key_encoder="sha1")
+    assert another_hashed_document.id == hashed_document.id
+
+    # Verify that the result is different from SHA256, SHA512, blake2b
+    values: list[Literal["sha256", "sha512", "blake2b"]] = [
+        "sha256",
+        "sha512",
+        "blake2b",
+    ]
+
+    for key_encoder in values:
+        different_hashed_document = _get_document_with_hash(
+            document, key_encoder=key_encoder
+        )
+        assert different_hashed_document.id != hashed_document.id
+
+
+def test_hashing_custom_key_encoder() -> None:
+    """Test hashing with a custom key encoder."""
+    def custom_key_encoder(doc: Document) -> str:
+        return f"quack-{doc.metadata['key']}"
+
+    document = Document(
+        page_content="Lorem ipsum dolor sit amet", metadata={"key": "like a duck"}
+    )
+    hashed_document = _get_document_with_hash(document, key_encoder=custom_key_encoder)
+    assert hashed_document.id == "quack-like a duck"
+    assert isinstance(hashed_document.id, str)
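
As the last test above shows, key_encoder also accepts a plain callable mapping a Document to a string, and that string becomes the document's id. A content-derived encoder could be sketched as follows; content_sha256_key is an illustrative name, not something this commit adds:

import hashlib
import json

from langchain_core.documents import Document
from langchain_core.indexing.api import _get_document_with_hash


def content_sha256_key(doc: Document) -> str:
    # Serialize page_content plus metadata deterministically, then hash the result.
    # (Illustrative scheme only -- not the library's internal hashing logic.)
    payload = json.dumps(
        {"page_content": doc.page_content, "metadata": doc.metadata},
        sort_keys=True,
    )
    return hashlib.sha256(payload.encode("utf-8")).hexdigest()


doc = Document(page_content="Lorem ipsum dolor sit amet", metadata={"key": "value"})
hashed = _get_document_with_hash(doc, key_encoder=content_sha256_key)
assert hashed.id == content_sha256_key(doc)

As in the tests, this goes through the private _get_document_with_hash helper; in application code the same callable would be passed as key_encoder to index or aindex.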

View File

@@ -13,7 +13,11 @@ from langchain_core.document_loaders.base import BaseLoader
 from langchain_core.documents import Document
 from langchain_core.embeddings import DeterministicFakeEmbedding
 from langchain_core.indexing import InMemoryRecordManager, aindex, index
-from langchain_core.indexing.api import IndexingException, _abatch, _HashedDocument
+from langchain_core.indexing.api import (
+    IndexingException,
+    _abatch,
+    _get_document_with_hash,
+)
 from langchain_core.indexing.in_memory import InMemoryDocumentIndex
 from langchain_core.vectorstores import InMemoryVectorStore, VectorStore
 
@@ -2222,7 +2226,7 @@ def test_indexing_custom_batch_size(
             metadata={"source": "1"},
         ),
     ]
-    ids = [_HashedDocument.from_document(doc).uid for doc in docs]
+    ids = [_get_document_with_hash(doc, key_encoder="sha256").id for doc in docs]
 
     batch_size = 1
 
@@ -2232,7 +2236,13 @@ def test_indexing_custom_batch_size(
     mock_add_documents = MagicMock()
     vector_store.add_documents = mock_add_documents  # type: ignore[method-assign]
 
-    index(docs, record_manager, vector_store, batch_size=batch_size)
+    index(
+        docs,
+        record_manager,
+        vector_store,
+        batch_size=batch_size,
+        key_encoder="sha256",
+    )
     args, kwargs = mock_add_documents.call_args
     doc_with_id = Document(
         id=ids[0], page_content="This is a test document.", metadata={"source": "1"}
@@ -2253,7 +2263,7 @@ async def test_aindexing_custom_batch_size(
             metadata={"source": "1"},
         ),
     ]
-    ids = [_HashedDocument.from_document(doc).uid for doc in docs]
+    ids = [_get_document_with_hash(doc, key_encoder="sha256").id for doc in docs]
 
     batch_size = 1
     mock_add_documents = AsyncMock()
@@ -2261,7 +2271,9 @@ async def test_aindexing_custom_batch_size(
         id=ids[0], page_content="This is a test document.", metadata={"source": "1"}
     )
     vector_store.aadd_documents = mock_add_documents  # type: ignore[method-assign]
-    await aindex(docs, arecord_manager, vector_store, batch_size=batch_size)
+    await aindex(
+        docs, arecord_manager, vector_store, batch_size=batch_size, key_encoder="sha256"
+    )
     args, kwargs = mock_add_documents.call_args
     assert args == ([doc_with_id],)
     assert kwargs == {"ids": ids, "batch_size": batch_size}
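
The async test mirrors the sync one. A corresponding standalone sketch, again assuming the in-memory components from the test imports (the namespace and embedding size are made up, and this snippet is not part of the commit):

import asyncio

from langchain_core.documents import Document
from langchain_core.embeddings import DeterministicFakeEmbedding
from langchain_core.indexing import InMemoryRecordManager, aindex
from langchain_core.vectorstores import InMemoryVectorStore


async def main() -> None:
    record_manager = InMemoryRecordManager(namespace="demo-async")  # hypothetical namespace
    await record_manager.acreate_schema()
    vector_store = InMemoryVectorStore(embedding=DeterministicFakeEmbedding(size=8))
    docs = [Document(page_content="This is a test document.", metadata={"source": "1"})]

    # Same key_encoder knob as the sync path; batch_size matches the test above.
    result = await aindex(
        docs, record_manager, vector_store, batch_size=1, key_encoder="sha256"
    )
    print(result)  # indexing result counts (added / updated / skipped / deleted)


asyncio.run(main())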