mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-16 23:13:31 +00:00
core[patch]: Add additional hashing options to indexing API, warn on SHA-1 (#31649)
Add additional hashing options to the indexing API and warn on SHA-1.

Requires:
- Bumping langchain-core version
- Bumping min langchain-core in langchain

---------

Co-authored-by: ccurme <chester.curme@gmail.com>
This commit is contained in:
@@ -1,50 +1,65 @@
|
||||
import pytest
|
||||
from typing import Literal
|
||||
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.indexing.api import _HashedDocument
|
||||
from langchain_core.indexing.api import _get_document_with_hash
|
||||
|
||||
|
||||
def test_hashed_document_hashing() -> None:
|
||||
hashed_document = _HashedDocument( # type: ignore[call-arg]
|
||||
document = Document(
|
||||
uid="123", page_content="Lorem ipsum dolor sit amet", metadata={"key": "value"}
|
||||
)
|
||||
assert isinstance(hashed_document.hash_, str)
|
||||
|
||||
|
||||
def test_hashing_with_missing_content() -> None:
|
||||
"""Check that ValueError is raised if page_content is missing."""
|
||||
with pytest.raises(TypeError):
|
||||
_HashedDocument(
|
||||
metadata={"key": "value"},
|
||||
) # type: ignore[call-arg]
|
||||
|
||||
|
||||
def test_uid_auto_assigned_to_hash() -> None:
|
||||
"""Test uid is auto-assigned to the hashed_document hash."""
|
||||
hashed_document = _HashedDocument( # type: ignore[call-arg]
|
||||
page_content="Lorem ipsum dolor sit amet", metadata={"key": "value"}
|
||||
)
|
||||
assert hashed_document.uid == hashed_document.hash_
|
||||
hashed_document = _get_document_with_hash(document, key_encoder="sha1")
|
||||
assert isinstance(hashed_document.id, str)
|
||||
|
||||
|
||||
def test_to_document() -> None:
|
||||
"""Test to_document method."""
|
||||
hashed_document = _HashedDocument( # type: ignore[call-arg]
|
||||
original_doc = Document(
|
||||
page_content="Lorem ipsum dolor sit amet", metadata={"key": "value"}
|
||||
)
|
||||
doc = hashed_document.to_document()
|
||||
assert isinstance(doc, Document)
|
||||
assert doc.page_content == "Lorem ipsum dolor sit amet"
|
||||
assert doc.metadata == {"key": "value"}
|
||||
hashed_doc = _get_document_with_hash(original_doc, key_encoder="sha1")
|
||||
assert isinstance(hashed_doc, Document)
|
||||
assert hashed_doc is not original_doc
|
||||
assert hashed_doc.page_content == "Lorem ipsum dolor sit amet"
|
||||
assert hashed_doc.metadata["key"] == "value"
|
||||
|
||||
|
||||
def test_from_document() -> None:
|
||||
def test_hashing() -> None:
|
||||
"""Test from document class method."""
|
||||
document = Document(
|
||||
page_content="Lorem ipsum dolor sit amet", metadata={"key": "value"}
|
||||
)
|
||||
|
||||
hashed_document = _HashedDocument.from_document(document)
|
||||
hashed_document = _get_document_with_hash(document, key_encoder="sha1")
|
||||
# hash should be deterministic
|
||||
assert hashed_document.hash_ == "fd1dc827-051b-537d-a1fe-1fa043e8b276"
|
||||
assert hashed_document.uid == hashed_document.hash_
|
||||
assert hashed_document.id == "fd1dc827-051b-537d-a1fe-1fa043e8b276"
|
||||
|
||||
# Verify that hashing with sha1 is deterministic
|
||||
another_hashed_document = _get_document_with_hash(document, key_encoder="sha1")
|
||||
assert another_hashed_document.id == hashed_document.id
|
||||
|
||||
# Verify that the result is different from SHA256, SHA512, blake2b
|
||||
values: list[Literal["sha256", "sha512", "blake2b"]] = [
|
||||
"sha256",
|
||||
"sha512",
|
||||
"blake2b",
|
||||
]
|
||||
|
||||
for key_encoder in values:
|
||||
different_hashed_document = _get_document_with_hash(
|
||||
document, key_encoder=key_encoder
|
||||
)
|
||||
assert different_hashed_document.id != hashed_document.id
|
||||
|
||||
|
||||
def test_hashing_custom_key_encoder() -> None:
|
||||
"""Test hashing with a custom key encoder."""
|
||||
|
||||
def custom_key_encoder(doc: Document) -> str:
|
||||
return f"quack-{doc.metadata['key']}"
|
||||
|
||||
document = Document(
|
||||
page_content="Lorem ipsum dolor sit amet", metadata={"key": "like a duck"}
|
||||
)
|
||||
hashed_document = _get_document_with_hash(document, key_encoder=custom_key_encoder)
|
||||
assert hashed_document.id == "quack-like a duck"
|
||||
assert isinstance(hashed_document.id, str)
|
||||
|
@@ -13,7 +13,11 @@ from langchain_core.document_loaders.base import BaseLoader
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.embeddings import DeterministicFakeEmbedding
|
||||
from langchain_core.indexing import InMemoryRecordManager, aindex, index
|
||||
from langchain_core.indexing.api import IndexingException, _abatch, _HashedDocument
|
||||
from langchain_core.indexing.api import (
|
||||
IndexingException,
|
||||
_abatch,
|
||||
_get_document_with_hash,
|
||||
)
|
||||
from langchain_core.indexing.in_memory import InMemoryDocumentIndex
|
||||
from langchain_core.vectorstores import InMemoryVectorStore, VectorStore
|
||||
|
||||
@@ -2222,7 +2226,7 @@ def test_indexing_custom_batch_size(
|
||||
metadata={"source": "1"},
|
||||
),
|
||||
]
|
||||
ids = [_HashedDocument.from_document(doc).uid for doc in docs]
|
||||
ids = [_get_document_with_hash(doc, key_encoder="sha256").id for doc in docs]
|
||||
|
||||
batch_size = 1
|
||||
|
||||
@@ -2232,7 +2236,13 @@ def test_indexing_custom_batch_size(
|
||||
mock_add_documents = MagicMock()
|
||||
vector_store.add_documents = mock_add_documents # type: ignore[method-assign]
|
||||
|
||||
index(docs, record_manager, vector_store, batch_size=batch_size)
|
||||
index(
|
||||
docs,
|
||||
record_manager,
|
||||
vector_store,
|
||||
batch_size=batch_size,
|
||||
key_encoder="sha256",
|
||||
)
|
||||
args, kwargs = mock_add_documents.call_args
|
||||
doc_with_id = Document(
|
||||
id=ids[0], page_content="This is a test document.", metadata={"source": "1"}
|
||||
@@ -2253,7 +2263,7 @@ async def test_aindexing_custom_batch_size(
|
||||
metadata={"source": "1"},
|
||||
),
|
||||
]
|
||||
ids = [_HashedDocument.from_document(doc).uid for doc in docs]
|
||||
ids = [_get_document_with_hash(doc, key_encoder="sha256").id for doc in docs]
|
||||
|
||||
batch_size = 1
|
||||
mock_add_documents = AsyncMock()
|
||||
@@ -2261,7 +2271,9 @@ async def test_aindexing_custom_batch_size(
|
||||
id=ids[0], page_content="This is a test document.", metadata={"source": "1"}
|
||||
)
|
||||
vector_store.aadd_documents = mock_add_documents # type: ignore[method-assign]
|
||||
await aindex(docs, arecord_manager, vector_store, batch_size=batch_size)
|
||||
await aindex(
|
||||
docs, arecord_manager, vector_store, batch_size=batch_size, key_encoder="sha256"
|
||||
)
|
||||
args, kwargs = mock_add_documents.call_args
|
||||
assert args == ([doc_with_id],)
|
||||
assert kwargs == {"ids": ids, "batch_size": batch_size}
|
||||
|
Reference in New Issue
Block a user