Compare commits

...

3 Commits

Author     SHA1          Message      Date
Bagatur    f46b8a4ed4    undo         2023-09-27 15:24:12 -07:00
Bagatur    481dc51a91    add files    2023-09-27 15:23:23 -07:00
Bagatur    fd8a7f7ab1    add          2023-09-27 15:23:08 -07:00
7 changed files with 144 additions and 111 deletions

View File

@@ -0,0 +1,36 @@
+from typing import Any, Iterable, Iterator, Sequence, Union
+
+from langchain.schema import BaseDocumentTransformer, Document
+from langchain.schema.document import _deduplicate_in_order, _HashedDocument
+
+
+class DedupDocumentTransformer(BaseDocumentTransformer):
+    def __init__(
+        self, by_content: bool = True, by_metadata: Union[bool, Sequence[str]] = False
+    ) -> None:
+        self.by_content = by_content
+        self.by_metadata = by_metadata
+
+    def _hashed_documents(
+        self, documents: Iterable[Document]
+    ) -> Iterator[_HashedDocument]:
+        for doc in documents:
+            page_content = doc.page_content if self.by_content else ""
+            if isinstance(self.by_metadata, Sequence):
+                metadata = {k: doc.metadata[k] for k in self.by_metadata}
+            elif self.by_metadata:
+                metadata = doc.metadata
+            else:
+                metadata = {}
+            _doc = Document(page_content=page_content, metadata=metadata)
+            yield _HashedDocument.from_document(_doc)
+
+    def transform_documents(
+        self, documents: Sequence[Document], **kwargs: Any
+    ) -> Sequence[Document]:
+        return list(_deduplicate_in_order(self._hashed_documents(documents)))
+
+    async def atransform_documents(
+        self, documents: Sequence[Document], **kwargs: Any
+    ) -> Sequence[Document]:
+        raise NotImplementedError
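
For reference, a minimal usage sketch of the transformer added above. The import path is hypothetical, since this view does not show where the new file lives:

from langchain.schema import Document

# Hypothetical import path; the diff view does not show the new module's
# location, so adjust to the actual file.
from langchain.document_transformers import DedupDocumentTransformer

docs = [
    Document(page_content="hello", metadata={"source": "a"}),
    Document(page_content="hello", metadata={"source": "b"}),
]

# By content only: both documents hash identically, so one survives.
dedup_by_content = DedupDocumentTransformer(by_content=True, by_metadata=False)
assert len(dedup_by_content.transform_documents(docs)) == 1

# By content plus the "source" metadata key: hashes differ, both survive.
dedup_by_source = DedupDocumentTransformer(by_content=True, by_metadata=["source"])
assert len(dedup_by_source.transform_documents(docs)) == 2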

View File

@@ -8,7 +8,6 @@ The text is hashed and the hash is used as the key in the cache.
"""
from __future__ import annotations
import hashlib
import json
import uuid
from functools import partial
@@ -17,19 +16,14 @@ from typing import Callable, List, Sequence, Union, cast
from langchain.schema import BaseStore
from langchain.schema.embeddings import Embeddings
from langchain.storage.encoder_backed import EncoderBackedStore
from langchain.utils.hash import hash_string_to_uuid
NAMESPACE_UUID = uuid.UUID(int=1985)
def _hash_string_to_uuid(input_string: str) -> uuid.UUID:
"""Hash a string and returns the corresponding UUID."""
hash_value = hashlib.sha1(input_string.encode("utf-8")).hexdigest()
return uuid.uuid5(NAMESPACE_UUID, hash_value)
def _key_encoder(key: str, namespace: str) -> str:
"""Encode a key."""
return namespace + str(_hash_string_to_uuid(key))
return namespace + str(hash_string_to_uuid(key, NAMESPACE_UUID))
def _create_key_encoder(namespace: str) -> Callable[[str], str]:
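
For reference, a stdlib-only sketch showing that the new parameterized helper reproduces the old private hash, so cache keys written before this refactor remain valid:

import hashlib
import uuid

NAMESPACE_UUID = uuid.UUID(int=1985)


def old_hash(s: str) -> uuid.UUID:
    # Inlined copy of the removed private _hash_string_to_uuid.
    return uuid.uuid5(NAMESPACE_UUID, hashlib.sha1(s.encode("utf-8")).hexdigest())


def new_hash(s: str, namespace: uuid.UUID) -> uuid.UUID:
    # Same logic as the new langchain.utils.hash.hash_string_to_uuid.
    return uuid.uuid5(namespace, hashlib.sha1(s.encode("utf-8")).hexdigest())


# Identical input and namespace yield the identical UUID, so a cache
# populated with the old encoder remains readable after the refactor.
assert old_hash("some text") == new_hash("some text", NAMESPACE_UUID)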

View File

@@ -1,14 +1,9 @@
"""Module contains logic for indexing documents into vector stores."""
from __future__ import annotations
import hashlib
import json
import uuid
from itertools import islice
from typing import (
Any,
Callable,
Dict,
Iterable,
Iterator,
List,
@@ -22,92 +17,14 @@ from typing import (
)
from langchain.document_loaders.base import BaseLoader
from langchain.indexes.base import NAMESPACE_UUID, RecordManager
from langchain.pydantic_v1 import root_validator
from langchain.indexes.base import RecordManager
from langchain.schema import Document
from langchain.schema.document import _deduplicate_in_order, _HashedDocument
from langchain.schema.vectorstore import VectorStore
T = TypeVar("T")
def _hash_string_to_uuid(input_string: str) -> uuid.UUID:
"""Hashes a string and returns the corresponding UUID."""
hash_value = hashlib.sha1(input_string.encode("utf-8")).hexdigest()
return uuid.uuid5(NAMESPACE_UUID, hash_value)
def _hash_nested_dict_to_uuid(data: dict) -> uuid.UUID:
"""Hashes a nested dictionary and returns the corresponding UUID."""
serialized_data = json.dumps(data, sort_keys=True)
hash_value = hashlib.sha1(serialized_data.encode("utf-8")).hexdigest()
return uuid.uuid5(NAMESPACE_UUID, hash_value)
class _HashedDocument(Document):
"""A hashed document with a unique ID."""
uid: str
hash_: str
"""The hash of the document including content and metadata."""
content_hash: str
"""The hash of the document content."""
metadata_hash: str
"""The hash of the document metadata."""
@root_validator(pre=True)
def calculate_hashes(cls, values: Dict[str, Any]) -> Dict[str, Any]:
"""Root validator to calculate content and metadata hash."""
content = values.get("page_content", "")
metadata = values.get("metadata", {})
forbidden_keys = ("hash_", "content_hash", "metadata_hash")
for key in forbidden_keys:
if key in metadata:
raise ValueError(
f"Metadata cannot contain key {key} as it "
f"is reserved for internal use."
)
content_hash = str(_hash_string_to_uuid(content))
try:
metadata_hash = str(_hash_nested_dict_to_uuid(metadata))
except Exception as e:
raise ValueError(
f"Failed to hash metadata: {e}. "
f"Please use a dict that can be serialized using json."
)
values["content_hash"] = content_hash
values["metadata_hash"] = metadata_hash
values["hash_"] = str(_hash_string_to_uuid(content_hash + metadata_hash))
_uid = values.get("uid", None)
if _uid is None:
values["uid"] = values["hash_"]
return values
def to_document(self) -> Document:
"""Return a Document object."""
return Document(
page_content=self.page_content,
metadata=self.metadata,
)
@classmethod
def from_document(
cls, document: Document, *, uid: Optional[str] = None
) -> _HashedDocument:
"""Create a HashedDocument from a Document."""
return cls(
uid=uid,
page_content=document.page_content,
metadata=document.metadata,
)
def _batch(size: int, iterable: Iterable[T]) -> Iterator[List[T]]:
"""Utility batching function."""
it = iter(iterable)
@@ -135,18 +52,6 @@ def _get_source_id_assigner(
)
def _deduplicate_in_order(
hashed_documents: Iterable[_HashedDocument],
) -> Iterator[_HashedDocument]:
"""Deduplicate a list of hashed documents while preserving order."""
seen = set()
for hashed_doc in hashed_documents:
if hashed_doc.hash_ not in seen:
seen.add(hashed_doc.hash_)
yield hashed_doc
# PUBLIC API

View File

@@ -1,11 +1,8 @@
 from __future__ import annotations
 
 import uuid
 from abc import ABC, abstractmethod
 from typing import List, Optional, Sequence
 
-NAMESPACE_UUID = uuid.UUID(int=1984)
-
-
 class RecordManager(ABC):
     """An abstract base class representing the interface for a record manager."""

View File

@@ -1,10 +1,14 @@
 from __future__ import annotations
 
+import uuid
 from abc import ABC, abstractmethod
-from typing import Any, Sequence
+from typing import Any, Dict, Iterable, Iterator, Optional, Sequence
 
 from langchain.load.serializable import Serializable
-from langchain.pydantic_v1 import Field
+from langchain.pydantic_v1 import Field, root_validator
+from langchain.utils.hash import hash_nested_dict_to_uuid, hash_string_to_uuid
+
+NAMESPACE_UUID = uuid.UUID(int=1984)
 
 
 class Document(Serializable):
@@ -84,3 +88,82 @@ class BaseDocumentTransformer(ABC):
         Returns:
             A list of transformed Documents.
         """
+
+
+class _HashedDocument(Document):
+    """A hashed document with a unique ID."""
+
+    uid: str
+    hash_: str
+    """The hash of the document including content and metadata."""
+    content_hash: str
+    """The hash of the document content."""
+    metadata_hash: str
+    """The hash of the document metadata."""
+
+    @root_validator(pre=True)
+    def calculate_hashes(cls, values: Dict[str, Any]) -> Dict[str, Any]:
+        """Root validator to calculate content and metadata hash."""
+        content = values.get("page_content", "")
+        metadata = values.get("metadata", {})
+
+        forbidden_keys = ("hash_", "content_hash", "metadata_hash")
+
+        for key in forbidden_keys:
+            if key in metadata:
+                raise ValueError(
+                    f"Metadata cannot contain key {key} as it "
+                    f"is reserved for internal use."
+                )
+
+        content_hash = str(hash_string_to_uuid(content, NAMESPACE_UUID))
+
+        try:
+            metadata_hash = str(hash_nested_dict_to_uuid(metadata, NAMESPACE_UUID))
+        except Exception as e:
+            raise ValueError(
+                f"Failed to hash metadata: {e}. "
+                f"Please use a dict that can be serialized using json."
+            )
+
+        values["content_hash"] = content_hash
+        values["metadata_hash"] = metadata_hash
+        values["hash_"] = str(
+            hash_string_to_uuid(content_hash + metadata_hash, NAMESPACE_UUID)
+        )
+
+        _uid = values.get("uid", None)
+
+        if _uid is None:
+            values["uid"] = values["hash_"]
+
+        return values
+
+    def to_document(self) -> Document:
+        """Return a Document object."""
+        return Document(
+            page_content=self.page_content,
+            metadata=self.metadata,
+        )
+
+    @classmethod
+    def from_document(
+        cls, document: Document, *, uid: Optional[str] = None
+    ) -> _HashedDocument:
+        """Create a HashedDocument from a Document."""
+        return cls(
+            uid=uid,
+            page_content=document.page_content,
+            metadata=document.metadata,
+        )
+
+
+def _deduplicate_in_order(
+    hashed_documents: Iterable[_HashedDocument],
+) -> Iterator[_HashedDocument]:
+    """Deduplicate a list of hashed documents while preserving order."""
+    seen = set()
+    for hashed_doc in hashed_documents:
+        if hashed_doc.hash_ not in seen:
+            seen.add(hashed_doc.hash_)
+            yield hashed_doc
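
For reference, a short sketch of how the relocated helpers compose. Both are underscore-prefixed internals, so this is illustrative rather than supported usage:

from langchain.schema import Document
from langchain.schema.document import _deduplicate_in_order, _HashedDocument

docs = [
    Document(page_content="a", metadata={"k": 1}),
    Document(page_content="a", metadata={"k": 1}),  # exact duplicate
    Document(page_content="a", metadata={"k": 2}),  # same content, new metadata
]

hashed = [_HashedDocument.from_document(d) for d in docs]

# hash_ covers content and metadata, so only the exact duplicate is dropped,
# and the order of first occurrences is preserved.
unique = list(_deduplicate_in_order(hashed))
assert len(unique) == 2
assert unique[0].content_hash == unique[1].content_hash
assert unique[0].metadata_hash != unique[1].metadata_hash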

View File

@@ -0,0 +1,18 @@
+from __future__ import annotations
+
+import hashlib
+import json
+import uuid
+
+
+def hash_string_to_uuid(input_string: str, namespace: uuid.UUID) -> uuid.UUID:
+    """Hashes a string and returns the corresponding UUID."""
+    hash_value = hashlib.sha1(input_string.encode("utf-8")).hexdigest()
+    return uuid.uuid5(namespace, hash_value)
+
+
+def hash_nested_dict_to_uuid(data: dict, namespace: uuid.UUID) -> uuid.UUID:
+    """Hashes a nested dictionary and returns the corresponding UUID."""
+    serialized_data = json.dumps(data, sort_keys=True)
+    hash_value = hashlib.sha1(serialized_data.encode("utf-8")).hexdigest()
+    return uuid.uuid5(namespace, hash_value)
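
For reference, a small sketch of the properties that make these helpers usable as stable keys (a deterministic SHA-1 digest fed into UUID5, with order-insensitive dict serialization):

import uuid

from langchain.utils.hash import hash_nested_dict_to_uuid, hash_string_to_uuid

ns = uuid.UUID(int=1984)

# Same input and namespace -> the same UUID on every run and machine.
assert hash_string_to_uuid("hello", ns) == hash_string_to_uuid("hello", ns)

# sort_keys=True in json.dumps makes the dict hash order-insensitive.
assert hash_nested_dict_to_uuid({"a": 1, "b": 2}, ns) == hash_nested_dict_to_uuid(
    {"b": 2, "a": 1}, ns
)

# Distinct namespaces partition keys: this changeset uses int=1984 for the
# schema/indexing side and int=1985 for the cache module.
assert hash_string_to_uuid("hello", ns) != hash_string_to_uuid(
    "hello", uuid.UUID(int=1985)
)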

View File

@@ -1,7 +1,7 @@
 import pytest
 
-from langchain.indexes._api import _HashedDocument
 from langchain.schema import Document
+from langchain.schema.document import _HashedDocument
 
 
 def test_hashed_document_hashing() -> None: