Mirror of https://github.com/hwchase17/langchain.git

Compare commits: charlie/fi ... bagatur/de (3 commits)

Commits (SHA1): f46b8a4ed4, 481dc51a91, fd8a7f7ab1
libs/langchain/langchain/document_transformers/dedup.py (new file, 36 lines)
@@ -0,0 +1,36 @@
from typing import Any, Iterable, Iterator, Sequence, Union

from langchain.schema import BaseDocumentTransformer, Document
from langchain.schema.document import _deduplicate_in_order, _HashedDocument


class DedupDocumentTransformer(BaseDocumentTransformer):
    def __init__(
        self, by_content: bool = True, by_metadata: Union[bool, Sequence[str]] = False
    ) -> None:
        self.by_content = by_content
        self.by_metadata = by_metadata

    def _hashed_documents(
        self, documents: Iterable[Document]
    ) -> Iterator[_HashedDocument]:
        for doc in documents:
            page_content = doc.page_content if self.by_content else ""
            if isinstance(self.by_metadata, Sequence):
                metadata = {k: doc.metadata[k] for k in self.by_metadata}
            elif self.by_metadata:
                metadata = doc.metadata
            else:
                metadata = {}
            _doc = Document(page_content=page_content, metadata=metadata)
            yield _HashedDocument.from_document(_doc)

    def transform_documents(
        self, documents: Sequence[Document], **kwargs: Any
    ) -> Sequence[Document]:
        return list(_deduplicate_in_order(self._hashed_documents(documents)))

    async def atransform_documents(
        self, documents: Sequence[Document], **kwargs: Any
    ) -> Sequence[Document]:
        raise NotImplementedError
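A minimal usage sketch of the new transformer (illustrative only; it assumes this branch is installed, and the sample documents are invented):

from langchain.document_transformers.dedup import DedupDocumentTransformer
from langchain.schema import Document

docs = [
    Document(page_content="hello", metadata={"source": "a.txt"}),
    Document(page_content="hello", metadata={"source": "b.txt"}),
    Document(page_content="world", metadata={"source": "a.txt"}),
]

# Deduplicate by content only: the two "hello" documents hash identically,
# so only one of them survives.
dedup = DedupDocumentTransformer(by_content=True, by_metadata=False)
unique = dedup.transform_documents(docs)
print(len(unique))  # 2

Note that the transformer rebuilds documents from the selected fields before hashing, so with by_metadata=False the returned copies carry empty metadata.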
@@ -8,7 +8,6 @@ The text is hashed and the hash is used as the key in the cache.
 """
 from __future__ import annotations
 
-import hashlib
 import json
 import uuid
 from functools import partial
@@ -17,19 +16,14 @@ from typing import Callable, List, Sequence, Union, cast
 from langchain.schema import BaseStore
 from langchain.schema.embeddings import Embeddings
 from langchain.storage.encoder_backed import EncoderBackedStore
+from langchain.utils.hash import hash_string_to_uuid
 
 NAMESPACE_UUID = uuid.UUID(int=1985)
 
 
-def _hash_string_to_uuid(input_string: str) -> uuid.UUID:
-    """Hash a string and returns the corresponding UUID."""
-    hash_value = hashlib.sha1(input_string.encode("utf-8")).hexdigest()
-    return uuid.uuid5(NAMESPACE_UUID, hash_value)
-
-
 def _key_encoder(key: str, namespace: str) -> str:
     """Encode a key."""
-    return namespace + str(_hash_string_to_uuid(key))
+    return namespace + str(hash_string_to_uuid(key, NAMESPACE_UUID))
 
 
 def _create_key_encoder(namespace: str) -> Callable[[str], str]:
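To illustrate the refactor above, the cache key encoding can be reproduced with the shared helper (a sketch that assumes the new langchain.utils.hash module from this change set is available; the key and namespace strings are made up):

import uuid

from langchain.utils.hash import hash_string_to_uuid

NAMESPACE_UUID = uuid.UUID(int=1985)  # same constant as in the hunk above


def key_encoder(key: str, namespace: str) -> str:
    # Mirrors the updated _key_encoder: namespace prefix plus a deterministic UUID.
    return namespace + str(hash_string_to_uuid(key, NAMESPACE_UUID))


print(key_encoder("some text to embed", "openai-embeddings:"))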
@@ -1,14 +1,9 @@
 """Module contains logic for indexing documents into vector stores."""
 from __future__ import annotations
 
-import hashlib
-import json
-import uuid
 from itertools import islice
 from typing import (
     Any,
     Callable,
-    Dict,
     Iterable,
     Iterator,
     List,
@@ -22,92 +17,14 @@ from typing import (
 )
 
 from langchain.document_loaders.base import BaseLoader
-from langchain.indexes.base import NAMESPACE_UUID, RecordManager
-from langchain.pydantic_v1 import root_validator
+from langchain.indexes.base import RecordManager
 from langchain.schema import Document
+from langchain.schema.document import _deduplicate_in_order, _HashedDocument
 from langchain.schema.vectorstore import VectorStore
 
 T = TypeVar("T")
 
 
-def _hash_string_to_uuid(input_string: str) -> uuid.UUID:
-    """Hashes a string and returns the corresponding UUID."""
-    hash_value = hashlib.sha1(input_string.encode("utf-8")).hexdigest()
-    return uuid.uuid5(NAMESPACE_UUID, hash_value)
-
-
-def _hash_nested_dict_to_uuid(data: dict) -> uuid.UUID:
-    """Hashes a nested dictionary and returns the corresponding UUID."""
-    serialized_data = json.dumps(data, sort_keys=True)
-    hash_value = hashlib.sha1(serialized_data.encode("utf-8")).hexdigest()
-    return uuid.uuid5(NAMESPACE_UUID, hash_value)
-
-
-class _HashedDocument(Document):
-    """A hashed document with a unique ID."""
-
-    uid: str
-    hash_: str
-    """The hash of the document including content and metadata."""
-    content_hash: str
-    """The hash of the document content."""
-    metadata_hash: str
-    """The hash of the document metadata."""
-
-    @root_validator(pre=True)
-    def calculate_hashes(cls, values: Dict[str, Any]) -> Dict[str, Any]:
-        """Root validator to calculate content and metadata hash."""
-        content = values.get("page_content", "")
-        metadata = values.get("metadata", {})
-
-        forbidden_keys = ("hash_", "content_hash", "metadata_hash")
-
-        for key in forbidden_keys:
-            if key in metadata:
-                raise ValueError(
-                    f"Metadata cannot contain key {key} as it "
-                    f"is reserved for internal use."
-                )
-
-        content_hash = str(_hash_string_to_uuid(content))
-
-        try:
-            metadata_hash = str(_hash_nested_dict_to_uuid(metadata))
-        except Exception as e:
-            raise ValueError(
-                f"Failed to hash metadata: {e}. "
-                f"Please use a dict that can be serialized using json."
-            )
-
-        values["content_hash"] = content_hash
-        values["metadata_hash"] = metadata_hash
-        values["hash_"] = str(_hash_string_to_uuid(content_hash + metadata_hash))
-
-        _uid = values.get("uid", None)
-
-        if _uid is None:
-            values["uid"] = values["hash_"]
-        return values
-
-    def to_document(self) -> Document:
-        """Return a Document object."""
-        return Document(
-            page_content=self.page_content,
-            metadata=self.metadata,
-        )
-
-    @classmethod
-    def from_document(
-        cls, document: Document, *, uid: Optional[str] = None
-    ) -> _HashedDocument:
-        """Create a HashedDocument from a Document."""
-        return cls(
-            uid=uid,
-            page_content=document.page_content,
-            metadata=document.metadata,
-        )
-
-
 def _batch(size: int, iterable: Iterable[T]) -> Iterator[List[T]]:
     """Utility batching function."""
     it = iter(iterable)
@@ -135,18 +52,6 @@ def _get_source_id_assigner(
         )
 
 
-def _deduplicate_in_order(
-    hashed_documents: Iterable[_HashedDocument],
-) -> Iterator[_HashedDocument]:
-    """Deduplicate a list of hashed documents while preserving order."""
-    seen = set()
-
-    for hashed_doc in hashed_documents:
-        if hashed_doc.hash_ not in seen:
-            seen.add(hashed_doc.hash_)
-            yield hashed_doc
-
-
 # PUBLIC API
@@ -1,11 +1,8 @@
 from __future__ import annotations
 
-import uuid
 from abc import ABC, abstractmethod
 from typing import List, Optional, Sequence
 
-NAMESPACE_UUID = uuid.UUID(int=1984)
-
 
 class RecordManager(ABC):
     """An abstract base class representing the interface for a record manager."""
@@ -1,10 +1,14 @@
 from __future__ import annotations
 
+import uuid
 from abc import ABC, abstractmethod
-from typing import Any, Sequence
+from typing import Any, Dict, Iterable, Iterator, Optional, Sequence
 
 from langchain.load.serializable import Serializable
-from langchain.pydantic_v1 import Field
+from langchain.pydantic_v1 import Field, root_validator
+from langchain.utils.hash import hash_nested_dict_to_uuid, hash_string_to_uuid
 
+NAMESPACE_UUID = uuid.UUID(int=1984)
+
 
 class Document(Serializable):
@@ -84,3 +88,82 @@ class BaseDocumentTransformer(ABC):
         Returns:
             A list of transformed Documents.
         """
+
+
+class _HashedDocument(Document):
+    """A hashed document with a unique ID."""
+
+    uid: str
+    hash_: str
+    """The hash of the document including content and metadata."""
+    content_hash: str
+    """The hash of the document content."""
+    metadata_hash: str
+    """The hash of the document metadata."""
+
+    @root_validator(pre=True)
+    def calculate_hashes(cls, values: Dict[str, Any]) -> Dict[str, Any]:
+        """Root validator to calculate content and metadata hash."""
+        content = values.get("page_content", "")
+        metadata = values.get("metadata", {})
+
+        forbidden_keys = ("hash_", "content_hash", "metadata_hash")
+
+        for key in forbidden_keys:
+            if key in metadata:
+                raise ValueError(
+                    f"Metadata cannot contain key {key} as it "
+                    f"is reserved for internal use."
+                )
+
+        content_hash = str(hash_string_to_uuid(content, NAMESPACE_UUID))
+
+        try:
+            metadata_hash = str(hash_nested_dict_to_uuid(metadata, NAMESPACE_UUID))
+        except Exception as e:
+            raise ValueError(
+                f"Failed to hash metadata: {e}. "
+                f"Please use a dict that can be serialized using json."
+            )
+
+        values["content_hash"] = content_hash
+        values["metadata_hash"] = metadata_hash
+        values["hash_"] = str(
+            hash_string_to_uuid(content_hash + metadata_hash, NAMESPACE_UUID)
+        )
+
+        _uid = values.get("uid", None)
+
+        if _uid is None:
+            values["uid"] = values["hash_"]
+        return values
+
+    def to_document(self) -> Document:
+        """Return a Document object."""
+        return Document(
+            page_content=self.page_content,
+            metadata=self.metadata,
+        )
+
+    @classmethod
+    def from_document(
+        cls, document: Document, *, uid: Optional[str] = None
+    ) -> _HashedDocument:
+        """Create a HashedDocument from a Document."""
+        return cls(
+            uid=uid,
+            page_content=document.page_content,
+            metadata=document.metadata,
+        )
+
+
+def _deduplicate_in_order(
+    hashed_documents: Iterable[_HashedDocument],
+) -> Iterator[_HashedDocument]:
+    """Deduplicate a list of hashed documents while preserving order."""
+    seen = set()
+
+    for hashed_doc in hashed_documents:
+        if hashed_doc.hash_ not in seen:
+            seen.add(hashed_doc.hash_)
+            yield hashed_doc
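For reference, a small sketch of how the relocated helpers behave (it assumes this branch is installed; the example documents are invented):

from langchain.schema import Document
from langchain.schema.document import _HashedDocument, _deduplicate_in_order

docs = [
    Document(page_content="hello", metadata={"source": "a.txt"}),
    Document(page_content="hello", metadata={"source": "a.txt"}),  # exact duplicate
    Document(page_content="hello", metadata={"source": "b.txt"}),  # different metadata
]

hashed = [_HashedDocument.from_document(d) for d in docs]
# hash_ covers both content and metadata, so only the exact duplicate is dropped.
unique = list(_deduplicate_in_order(hashed))
print(len(unique))  # 2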
libs/langchain/langchain/utils/hash.py (new file, 18 lines)
@@ -0,0 +1,18 @@
from __future__ import annotations

import hashlib
import json
import uuid


def hash_string_to_uuid(input_string: str, namespace: uuid.UUID) -> uuid.UUID:
    """Hashes a string and returns the corresponding UUID."""
    hash_value = hashlib.sha1(input_string.encode("utf-8")).hexdigest()
    return uuid.uuid5(namespace, hash_value)


def hash_nested_dict_to_uuid(data: dict, namespace: uuid.UUID) -> uuid.UUID:
    """Hashes a nested dictionary and returns the corresponding UUID."""
    serialized_data = json.dumps(data, sort_keys=True)
    hash_value = hashlib.sha1(serialized_data.encode("utf-8")).hexdigest()
    return uuid.uuid5(namespace, hash_value)
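A quick sanity check of the new helpers (the namespace value below is arbitrary):

import uuid

from langchain.utils.hash import hash_nested_dict_to_uuid, hash_string_to_uuid

ns = uuid.UUID(int=1984)

# Same input and namespace always yield the same UUID (SHA-1 digest fed to uuid5).
assert hash_string_to_uuid("hello", ns) == hash_string_to_uuid("hello", ns)

# Key order does not matter because the dict is serialized with sort_keys=True.
assert hash_nested_dict_to_uuid({"a": 1, "b": {"c": 2}}, ns) == hash_nested_dict_to_uuid(
    {"b": {"c": 2}, "a": 1}, ns
)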
@@ -1,7 +1,7 @@
 import pytest
 
-from langchain.indexes._api import _HashedDocument
 from langchain.schema import Document
+from langchain.schema.document import _HashedDocument
 
 
 def test_hashed_document_hashing() -> None: