Compare commits

...

3 Commits

Author     SHA1          Message      Date
Bagatur    f46b8a4ed4    undo         2023-09-27 15:24:12 -07:00
Bagatur    481dc51a91    add files    2023-09-27 15:23:23 -07:00
Bagatur    fd8a7f7ab1    add          2023-09-27 15:23:08 -07:00
7 changed files with 144 additions and 111 deletions

View File

@@ -0,0 +1,36 @@
+from typing import Any, Iterable, Iterator, Sequence, Union
+
+from langchain.schema import BaseDocumentTransformer, Document
+from langchain.schema.document import _deduplicate_in_order, _HashedDocument
+
+
+class DedupDocumentTransformer(BaseDocumentTransformer):
+    def __init__(
+        self, by_content: bool = True, by_metadata: Union[bool, Sequence[str]] = False
+    ) -> None:
+        self.by_content = by_content
+        self.by_metadata = by_metadata
+
+    def _hashed_documents(
+        self, documents: Iterable[Document]
+    ) -> Iterator[_HashedDocument]:
+        for doc in documents:
+            page_content = doc.page_content if self.by_content else ""
+            if isinstance(self.by_metadata, Sequence):
+                metadata = {k: doc.metadata[k] for k in self.by_metadata}
+            elif self.by_metadata:
+                metadata = doc.metadata
+            else:
+                metadata = {}
+            _doc = Document(page_content=page_content, metadata=metadata)
+            yield _HashedDocument.from_document(_doc)
+
+    def transform_documents(
+        self, documents: Sequence[Document], **kwargs: Any
+    ) -> Sequence[Document]:
+        return list(_deduplicate_in_order(self._hashed_documents(documents)))
+
+    async def atransform_documents(
+        self, documents: Sequence[Document], **kwargs: Any
+    ) -> Sequence[Document]:
+        raise NotImplementedError
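
For reference, a minimal usage sketch of the transformer added above. The import path is hypothetical, since this view does not show where the new file lives:

from langchain.schema import Document

# Hypothetical import path; the diff view does not show the new module's
# location, so adjust to the actual file.
from langchain.document_transformers import DedupDocumentTransformer

docs = [
    Document(page_content="hello", metadata={"source": "a"}),
    Document(page_content="hello", metadata={"source": "b"}),
]

# By content only: both documents hash identically, so one survives.
dedup_by_content = DedupDocumentTransformer(by_content=True, by_metadata=False)
assert len(dedup_by_content.transform_documents(docs)) == 1

# By content plus the "source" metadata key: hashes differ, both survive.
dedup_by_source = DedupDocumentTransformer(by_content=True, by_metadata=["source"])
assert len(dedup_by_source.transform_documents(docs)) == 2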

View File

@@ -8,7 +8,6 @@ The text is hashed and the hash is used as the key in the cache.
"""
from __future__ import annotations
import hashlib
import json
import uuid
from functools import partial
@@ -17,19 +16,14 @@ from typing import Callable, List, Sequence, Union, cast
from langchain.schema import BaseStore
from langchain.schema.embeddings import Embeddings
from langchain.storage.encoder_backed import EncoderBackedStore
from langchain.utils.hash import hash_string_to_uuid
NAMESPACE_UUID = uuid.UUID(int=1985)
def _hash_string_to_uuid(input_string: str) -> uuid.UUID:
"""Hash a string and returns the corresponding UUID."""
hash_value = hashlib.sha1(input_string.encode("utf-8")).hexdigest()
return uuid.uuid5(NAMESPACE_UUID, hash_value)
def _key_encoder(key: str, namespace: str) -> str:
"""Encode a key."""
return namespace + str(_hash_string_to_uuid(key))
return namespace + str(hash_string_to_uuid(key, NAMESPACE_UUID))
def _create_key_encoder(namespace: str) -> Callable[[str], str]:
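
For reference, a stdlib-only sketch showing that the new parameterized helper reproduces the old private hash, so cache keys written before this refactor remain valid:

import hashlib
import uuid

NAMESPACE_UUID = uuid.UUID(int=1985)


def old_hash(s: str) -> uuid.UUID:
    # Inlined copy of the removed private _hash_string_to_uuid.
    return uuid.uuid5(NAMESPACE_UUID, hashlib.sha1(s.encode("utf-8")).hexdigest())


def new_hash(s: str, namespace: uuid.UUID) -> uuid.UUID:
    # Same logic as the new langchain.utils.hash.hash_string_to_uuid.
    return uuid.uuid5(namespace, hashlib.sha1(s.encode("utf-8")).hexdigest())


# Identical input and namespace yield the identical UUID, so a cache
# populated with the old encoder remains readable after the refactor.
assert old_hash("some text") == new_hash("some text", NAMESPACE_UUID)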

View File

@@ -1,14 +1,9 @@
"""Module contains logic for indexing documents into vector stores."""
from __future__ import annotations
import hashlib
import json
import uuid
from itertools import islice
from typing import (
Any,
Callable,
Dict,
Iterable,
Iterator,
List,
@@ -22,92 +17,14 @@ from typing import (
)
from langchain.document_loaders.base import BaseLoader
from langchain.indexes.base import NAMESPACE_UUID, RecordManager
from langchain.pydantic_v1 import root_validator
from langchain.indexes.base import RecordManager
from langchain.schema import Document
from langchain.schema.document import _deduplicate_in_order, _HashedDocument
from langchain.schema.vectorstore import VectorStore
T = TypeVar("T")
def _hash_string_to_uuid(input_string: str) -> uuid.UUID:
"""Hashes a string and returns the corresponding UUID."""
hash_value = hashlib.sha1(input_string.encode("utf-8")).hexdigest()
return uuid.uuid5(NAMESPACE_UUID, hash_value)
def _hash_nested_dict_to_uuid(data: dict) -> uuid.UUID:
"""Hashes a nested dictionary and returns the corresponding UUID."""
serialized_data = json.dumps(data, sort_keys=True)
hash_value = hashlib.sha1(serialized_data.encode("utf-8")).hexdigest()
return uuid.uuid5(NAMESPACE_UUID, hash_value)
class _HashedDocument(Document):
"""A hashed document with a unique ID."""
uid: str
hash_: str
"""The hash of the document including content and metadata."""
content_hash: str
"""The hash of the document content."""
metadata_hash: str
"""The hash of the document metadata."""
@root_validator(pre=True)
def calculate_hashes(cls, values: Dict[str, Any]) -> Dict[str, Any]:
"""Root validator to calculate content and metadata hash."""
content = values.get("page_content", "")
metadata = values.get("metadata", {})
forbidden_keys = ("hash_", "content_hash", "metadata_hash")
for key in forbidden_keys:
if key in metadata:
raise ValueError(
f"Metadata cannot contain key {key} as it "
f"is reserved for internal use."
)
content_hash = str(_hash_string_to_uuid(content))
try:
metadata_hash = str(_hash_nested_dict_to_uuid(metadata))
except Exception as e:
raise ValueError(
f"Failed to hash metadata: {e}. "
f"Please use a dict that can be serialized using json."
)
values["content_hash"] = content_hash
values["metadata_hash"] = metadata_hash
values["hash_"] = str(_hash_string_to_uuid(content_hash + metadata_hash))
_uid = values.get("uid", None)
if _uid is None:
values["uid"] = values["hash_"]
return values
def to_document(self) -> Document:
"""Return a Document object."""
return Document(
page_content=self.page_content,
metadata=self.metadata,
)
@classmethod
def from_document(
cls, document: Document, *, uid: Optional[str] = None
) -> _HashedDocument:
"""Create a HashedDocument from a Document."""
return cls(
uid=uid,
page_content=document.page_content,
metadata=document.metadata,
)
def _batch(size: int, iterable: Iterable[T]) -> Iterator[List[T]]:
"""Utility batching function."""
it = iter(iterable)
@@ -135,18 +52,6 @@ def _get_source_id_assigner(
)
def _deduplicate_in_order(
hashed_documents: Iterable[_HashedDocument],
) -> Iterator[_HashedDocument]:
"""Deduplicate a list of hashed documents while preserving order."""
seen = set()
for hashed_doc in hashed_documents:
if hashed_doc.hash_ not in seen:
seen.add(hashed_doc.hash_)
yield hashed_doc
# PUBLIC API

View File

@@ -1,11 +1,8 @@
 from __future__ import annotations
 
 import uuid
 from abc import ABC, abstractmethod
 from typing import List, Optional, Sequence
 
-NAMESPACE_UUID = uuid.UUID(int=1984)
-
-
 class RecordManager(ABC):
     """An abstract base class representing the interface for a record manager."""

View File

@@ -1,10 +1,14 @@
 from __future__ import annotations
 
+import uuid
 from abc import ABC, abstractmethod
-from typing import Any, Sequence
+from typing import Any, Dict, Iterable, Iterator, Optional, Sequence
 
 from langchain.load.serializable import Serializable
-from langchain.pydantic_v1 import Field
+from langchain.pydantic_v1 import Field, root_validator
+from langchain.utils.hash import hash_nested_dict_to_uuid, hash_string_to_uuid
+
+NAMESPACE_UUID = uuid.UUID(int=1984)
 
 
 class Document(Serializable):
@@ -84,3 +88,82 @@ class BaseDocumentTransformer(ABC):
         Returns:
             A list of transformed Documents.
         """
+
+
+class _HashedDocument(Document):
+    """A hashed document with a unique ID."""
+
+    uid: str
+    hash_: str
+    """The hash of the document including content and metadata."""
+    content_hash: str
+    """The hash of the document content."""
+    metadata_hash: str
+    """The hash of the document metadata."""
+
+    @root_validator(pre=True)
+    def calculate_hashes(cls, values: Dict[str, Any]) -> Dict[str, Any]:
+        """Root validator to calculate content and metadata hash."""
+        content = values.get("page_content", "")
+        metadata = values.get("metadata", {})
+
+        forbidden_keys = ("hash_", "content_hash", "metadata_hash")
+
+        for key in forbidden_keys:
+            if key in metadata:
+                raise ValueError(
+                    f"Metadata cannot contain key {key} as it "
+                    f"is reserved for internal use."
+                )
+
+        content_hash = str(hash_string_to_uuid(content, NAMESPACE_UUID))
+
+        try:
+            metadata_hash = str(hash_nested_dict_to_uuid(metadata, NAMESPACE_UUID))
+        except Exception as e:
+            raise ValueError(
+                f"Failed to hash metadata: {e}. "
+                f"Please use a dict that can be serialized using json."
+            )
+
+        values["content_hash"] = content_hash
+        values["metadata_hash"] = metadata_hash
+        values["hash_"] = str(
+            hash_string_to_uuid(content_hash + metadata_hash, NAMESPACE_UUID)
+        )
+
+        _uid = values.get("uid", None)
+
+        if _uid is None:
+            values["uid"] = values["hash_"]
+
+        return values
+
+    def to_document(self) -> Document:
+        """Return a Document object."""
+        return Document(
+            page_content=self.page_content,
+            metadata=self.metadata,
+        )
+
+    @classmethod
+    def from_document(
+        cls, document: Document, *, uid: Optional[str] = None
+    ) -> _HashedDocument:
+        """Create a HashedDocument from a Document."""
+        return cls(
+            uid=uid,
+            page_content=document.page_content,
+            metadata=document.metadata,
+        )
+
+
+def _deduplicate_in_order(
+    hashed_documents: Iterable[_HashedDocument],
+) -> Iterator[_HashedDocument]:
+    """Deduplicate a list of hashed documents while preserving order."""
+    seen = set()
+    for hashed_doc in hashed_documents:
+        if hashed_doc.hash_ not in seen:
+            seen.add(hashed_doc.hash_)
+            yield hashed_doc
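
For reference, a short sketch of how the relocated helpers compose. Both are underscore-prefixed internals, so this is illustrative rather than supported usage:

from langchain.schema import Document
from langchain.schema.document import _deduplicate_in_order, _HashedDocument

docs = [
    Document(page_content="a", metadata={"k": 1}),
    Document(page_content="a", metadata={"k": 1}),  # exact duplicate
    Document(page_content="a", metadata={"k": 2}),  # same content, new metadata
]

hashed = [_HashedDocument.from_document(d) for d in docs]

# hash_ covers content and metadata, so only the exact duplicate is dropped,
# and the order of first occurrences is preserved.
unique = list(_deduplicate_in_order(hashed))
assert len(unique) == 2
assert unique[0].content_hash == unique[1].content_hash
assert unique[0].metadata_hash != unique[1].metadata_hash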

View File

@@ -0,0 +1,18 @@
+from __future__ import annotations
+
+import hashlib
+import json
+import uuid
+
+
+def hash_string_to_uuid(input_string: str, namespace: uuid.UUID) -> uuid.UUID:
+    """Hashes a string and returns the corresponding UUID."""
+    hash_value = hashlib.sha1(input_string.encode("utf-8")).hexdigest()
+    return uuid.uuid5(namespace, hash_value)
+
+
+def hash_nested_dict_to_uuid(data: dict, namespace: uuid.UUID) -> uuid.UUID:
+    """Hashes a nested dictionary and returns the corresponding UUID."""
+    serialized_data = json.dumps(data, sort_keys=True)
+    hash_value = hashlib.sha1(serialized_data.encode("utf-8")).hexdigest()
+    return uuid.uuid5(namespace, hash_value)
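
For reference, a small sketch of the properties that make these helpers usable as stable keys (a deterministic SHA-1 digest fed into UUID5, with order-insensitive dict serialization):

import uuid

from langchain.utils.hash import hash_nested_dict_to_uuid, hash_string_to_uuid

ns = uuid.UUID(int=1984)

# Same input and namespace -> the same UUID on every run and machine.
assert hash_string_to_uuid("hello", ns) == hash_string_to_uuid("hello", ns)

# sort_keys=True in json.dumps makes the dict hash order-insensitive.
assert hash_nested_dict_to_uuid({"a": 1, "b": 2}, ns) == hash_nested_dict_to_uuid(
    {"b": 2, "a": 1}, ns
)

# Distinct namespaces partition keys: this changeset uses int=1984 for the
# schema/indexing side and int=1985 for the cache module.
assert hash_string_to_uuid("hello", ns) != hash_string_to_uuid(
    "hello", uuid.UUID(int=1985)
)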

View File

@@ -1,7 +1,7 @@
 import pytest
 
-from langchain.indexes._api import _HashedDocument
 from langchain.schema import Document
+from langchain.schema.document import _HashedDocument
 
 
 def test_hashed_document_hashing() -> None: