core[minor]: Create BaseMedia object (#23639)

This PR implements a BaseMedia object from which Document and Blob
objects will inherit proposed here:
https://github.com/langchain-ai/langchain/pull/23544

Alternative: Create a base object that only has an identifier and no
metadata.

For now decided against it, since that refactor can be done at a later
time. It also feels a bit odd since our IDs are optional at the moment.

---------

Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
Eugene Yurtsev 2024-07-01 15:07:30 -04:00 committed by GitHub
parent 04bc5f1a95
commit e800f6bb57
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 255 additions and 188 deletions

View File

@ -10,7 +10,7 @@ from langchain_core.runnables import run_in_executor
if TYPE_CHECKING: if TYPE_CHECKING:
from langchain_text_splitters import TextSplitter from langchain_text_splitters import TextSplitter
from langchain_core.document_loaders.blob_loaders import Blob from langchain_core.documents.base import Blob
class BaseLoader(ABC): class BaseLoader(ABC):

View File

@ -6,175 +6,12 @@ In addition, content loading code should provide a lazy loading interface by def
""" """
from __future__ import annotations from __future__ import annotations
import contextlib
import mimetypes
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from io import BufferedReader, BytesIO from typing import Iterable
from pathlib import PurePath
from typing import Any, Dict, Generator, Iterable, Mapping, Optional, Union, cast
from langchain_core.pydantic_v1 import BaseModel, Field, root_validator # Re-export Blob and PathLike for backwards compatibility
from langchain_core.documents.base import Blob as Blob
PathLike = Union[str, PurePath] from langchain_core.documents.base import PathLike as PathLike
class Blob(BaseModel):
    """Blob represents raw data by either reference or value.

    Provides an interface to materialize the blob in different representations, and
    help to decouple the development of data loaders from the downstream parsing of
    the raw data.

    Inspired by: https://developer.mozilla.org/en-US/docs/Web/API/Blob
    """

    data: Union[bytes, str, None]
    """Raw data associated with the blob."""
    mimetype: Optional[str] = None
    """MimeType not to be confused with a file extension."""
    encoding: str = "utf-8"
    """Encoding to use if decoding the bytes into a string.

    Use utf-8 as default encoding, if decoding to string.
    """
    path: Optional[PathLike] = None
    """Location where the original content was found."""
    metadata: Dict[str, Any] = Field(default_factory=dict)
    """Metadata about the blob (e.g., source)"""

    class Config:
        # path may be a PurePath, which pydantic v1 does not validate natively.
        arbitrary_types_allowed = True
        # Blobs are immutable once constructed.
        frozen = True

    @property
    def source(self) -> Optional[str]:
        """The source location of the blob as string if known otherwise none.

        If a path is associated with the blob, it will default to the path location.

        Unless explicitly set via a metadata field called "source", in which
        case that value will be used instead.
        """
        if self.metadata and "source" in self.metadata:
            return cast(Optional[str], self.metadata["source"])
        return str(self.path) if self.path else None

    @root_validator(pre=True)
    def check_blob_is_valid(cls, values: Mapping[str, Any]) -> Mapping[str, Any]:
        """Verify that either data or path is provided."""
        if "data" not in values and "path" not in values:
            raise ValueError("Either data or path must be provided")
        return values

    def as_string(self) -> str:
        """Read data as a string."""
        # data is None + path set means the blob is a lazy reference: read from disk.
        if self.data is None and self.path:
            with open(str(self.path), "r", encoding=self.encoding) as f:
                return f.read()
        elif isinstance(self.data, bytes):
            return self.data.decode(self.encoding)
        elif isinstance(self.data, str):
            return self.data
        else:
            raise ValueError(f"Unable to get string for blob {self}")

    def as_bytes(self) -> bytes:
        """Read data as bytes."""
        if isinstance(self.data, bytes):
            return self.data
        elif isinstance(self.data, str):
            return self.data.encode(self.encoding)
        elif self.data is None and self.path:
            with open(str(self.path), "rb") as f:
                return f.read()
        else:
            raise ValueError(f"Unable to get bytes for blob {self}")

    @contextlib.contextmanager
    def as_bytes_io(self) -> Generator[Union[BytesIO, BufferedReader], None, None]:
        """Read data as a byte stream."""
        if isinstance(self.data, bytes):
            yield BytesIO(self.data)
        elif self.data is None and self.path:
            # The file handle is only valid inside the with-block of the caller.
            with open(str(self.path), "rb") as f:
                yield f
        else:
            raise NotImplementedError(f"Unable to convert blob {self}")

    @classmethod
    def from_path(
        cls,
        path: PathLike,
        *,
        encoding: str = "utf-8",
        mime_type: Optional[str] = None,
        guess_type: bool = True,
        metadata: Optional[dict] = None,
    ) -> Blob:
        """Load the blob from a path like object.

        Args:
            path: path like object to file to be read
            encoding: Encoding to use if decoding the bytes into a string
            mime_type: if provided, will be set as the mime-type of the data
            guess_type: If True, the mimetype will be guessed from the file extension,
                if a mime-type was not provided
            metadata: Metadata to associate with the blob

        Returns:
            Blob instance
        """
        if mime_type is None and guess_type:
            # NOTE(review): the trailing `if guess_type else None` is redundant —
            # this branch is only reached when guess_type is already True.
            _mimetype = mimetypes.guess_type(path)[0] if guess_type else None
        else:
            _mimetype = mime_type
        # We do not load the data immediately, instead we treat the blob as a
        # reference to the underlying data.
        return cls(
            data=None,
            mimetype=_mimetype,
            encoding=encoding,
            path=path,
            metadata=metadata if metadata is not None else {},
        )

    @classmethod
    def from_data(
        cls,
        data: Union[str, bytes],
        *,
        encoding: str = "utf-8",
        mime_type: Optional[str] = None,
        path: Optional[str] = None,
        metadata: Optional[dict] = None,
    ) -> Blob:
        """Initialize the blob from in-memory data.

        Args:
            data: the in-memory data associated with the blob
            encoding: Encoding to use if decoding the bytes into a string
            mime_type: if provided, will be set as the mime-type of the data
            path: if provided, will be set as the source from which the data came
            metadata: Metadata to associate with the blob

        Returns:
            Blob instance
        """
        return cls(
            data=data,
            mimetype=mime_type,
            encoding=encoding,
            path=path,
            metadata=metadata if metadata is not None else {},
        )

    def __repr__(self) -> str:
        """Define the blob representation."""
        str_repr = f"Blob {id(self)}"
        if self.source:
            str_repr += f" {self.source}"
        return str_repr
class BlobLoader(ABC): class BlobLoader(ABC):
@ -193,3 +30,7 @@ class BlobLoader(ABC):
Returns: Returns:
A generator over blobs A generator over blobs
""" """
# Re-export Blob and PathLike for backwards compatibility
__all__ = ["Blob", "BlobLoader", "PathLike"]

View File

@ -1,12 +1,250 @@
from __future__ import annotations from __future__ import annotations
from typing import Any, List, Literal, Optional import contextlib
import mimetypes
from io import BufferedReader, BytesIO
from pathlib import PurePath
from typing import Any, Generator, List, Literal, Mapping, Optional, Union, cast
from langchain_core.load.serializable import Serializable from langchain_core.load.serializable import Serializable
from langchain_core.pydantic_v1 import Field from langchain_core.pydantic_v1 import Field, root_validator
PathLike = Union[str, PurePath]
class BaseMedia(Serializable):
    """Use to represent media content.

    Media objects can be used to represent raw data, such as text or binary data.

    LangChain Media objects allow associating metadata and an optional identifier
    with the content.

    The presence of an ID and metadata make it easier to store, index, and search
    over the content in a structured way.
    """

    # The ID field is optional at the moment.
    # It will likely become required in a future major release after
    # it has been adopted by enough vectorstore implementations.
    id: Optional[str] = None
    """An optional identifier for the document.

    Ideally this should be unique across the document collection and formatted
    as a UUID, but this will not be enforced.

    .. versionadded:: 0.2.11
    """

    metadata: dict = Field(default_factory=dict)
    """Arbitrary metadata associated with the content."""
class Blob(BaseMedia):
    """Blob represents raw data by either reference or value.

    Provides an interface to materialize the blob in different representations, and
    help to decouple the development of data loaders from the downstream parsing of
    the raw data.

    Inspired by: https://developer.mozilla.org/en-US/docs/Web/API/Blob

    Example: Initialize a blob from in-memory data

        .. code-block:: python

            from langchain_core.documents import Blob

            blob = Blob.from_data("Hello, world!")

            # Read the blob as a string
            print(blob.as_string())

            # Read the blob as bytes
            print(blob.as_bytes())

            # Read the blob as a byte stream
            with blob.as_bytes_io() as f:
                print(f.read())

    Example: Load from memory and specify mime-type and metadata

        .. code-block:: python

            from langchain_core.documents import Blob

            blob = Blob.from_data(
                data="Hello, world!",
                mime_type="text/plain",
                metadata={"source": "https://example.com"}
            )

    Example: Load the blob from a file

        .. code-block:: python

            from langchain_core.documents import Blob

            blob = Blob.from_path("path/to/file.txt")

            # Read the blob as a string
            print(blob.as_string())

            # Read the blob as bytes
            print(blob.as_bytes())

            # Read the blob as a byte stream
            with blob.as_bytes_io() as f:
                print(f.read())
    """

    data: Union[bytes, str, None]
    """Raw data associated with the blob."""
    mimetype: Optional[str] = None
    """MimeType not to be confused with a file extension."""
    encoding: str = "utf-8"
    """Encoding to use if decoding the bytes into a string.

    Use utf-8 as default encoding, if decoding to string.
    """
    path: Optional[PathLike] = None
    """Location where the original content was found."""

    class Config:
        # path may be a PurePath, which pydantic v1 does not validate natively.
        arbitrary_types_allowed = True
        # Blobs are immutable once constructed.
        frozen = True

    @property
    def source(self) -> Optional[str]:
        """The source location of the blob as string if known otherwise none.

        If a path is associated with the blob, it will default to the path location.

        Unless explicitly set via a metadata field called "source", in which
        case that value will be used instead.
        """
        if self.metadata and "source" in self.metadata:
            return cast(Optional[str], self.metadata["source"])
        return str(self.path) if self.path else None

    @root_validator(pre=True)
    def check_blob_is_valid(cls, values: Mapping[str, Any]) -> Mapping[str, Any]:
        """Verify that either data or path is provided."""
        if "data" not in values and "path" not in values:
            raise ValueError("Either data or path must be provided")
        return values

    def as_string(self) -> str:
        """Read data as a string."""
        # data is None + path set means the blob is a lazy reference: read from disk.
        if self.data is None and self.path:
            with open(str(self.path), "r", encoding=self.encoding) as f:
                return f.read()
        elif isinstance(self.data, bytes):
            return self.data.decode(self.encoding)
        elif isinstance(self.data, str):
            return self.data
        else:
            raise ValueError(f"Unable to get string for blob {self}")

    def as_bytes(self) -> bytes:
        """Read data as bytes."""
        if isinstance(self.data, bytes):
            return self.data
        elif isinstance(self.data, str):
            return self.data.encode(self.encoding)
        elif self.data is None and self.path:
            with open(str(self.path), "rb") as f:
                return f.read()
        else:
            raise ValueError(f"Unable to get bytes for blob {self}")

    @contextlib.contextmanager
    def as_bytes_io(self) -> Generator[Union[BytesIO, BufferedReader], None, None]:
        """Read data as a byte stream."""
        if isinstance(self.data, bytes):
            yield BytesIO(self.data)
        elif self.data is None and self.path:
            # The file handle is only valid inside the caller's with-block.
            with open(str(self.path), "rb") as f:
                yield f
        else:
            raise NotImplementedError(f"Unable to convert blob {self}")

    @classmethod
    def from_path(
        cls,
        path: PathLike,
        *,
        encoding: str = "utf-8",
        mime_type: Optional[str] = None,
        guess_type: bool = True,
        metadata: Optional[dict] = None,
    ) -> Blob:
        """Load the blob from a path like object.

        Args:
            path: path like object to file to be read
            encoding: Encoding to use if decoding the bytes into a string
            mime_type: if provided, will be set as the mime-type of the data
            guess_type: If True, the mimetype will be guessed from the file extension,
                if a mime-type was not provided
            metadata: Metadata to associate with the blob

        Returns:
            Blob instance
        """
        # Only guess when no explicit mime-type was supplied and guessing is
        # enabled. (The previous `... if guess_type else None` inside this branch
        # was dead code: the branch is only reached when guess_type is True.)
        if mime_type is None and guess_type:
            _mimetype = mimetypes.guess_type(path)[0]
        else:
            _mimetype = mime_type
        # We do not load the data immediately, instead we treat the blob as a
        # reference to the underlying data.
        return cls(
            data=None,
            mimetype=_mimetype,
            encoding=encoding,
            path=path,
            metadata=metadata if metadata is not None else {},
        )

    @classmethod
    def from_data(
        cls,
        data: Union[str, bytes],
        *,
        encoding: str = "utf-8",
        mime_type: Optional[str] = None,
        path: Optional[str] = None,
        metadata: Optional[dict] = None,
    ) -> Blob:
        """Initialize the blob from in-memory data.

        Args:
            data: the in-memory data associated with the blob
            encoding: Encoding to use if decoding the bytes into a string
            mime_type: if provided, will be set as the mime-type of the data
            path: if provided, will be set as the source from which the data came
            metadata: Metadata to associate with the blob

        Returns:
            Blob instance
        """
        return cls(
            data=data,
            mimetype=mime_type,
            encoding=encoding,
            path=path,
            metadata=metadata if metadata is not None else {},
        )

    def __repr__(self) -> str:
        """Define the blob representation."""
        str_repr = f"Blob {id(self)}"
        if self.source:
            str_repr += f" {self.source}"
        return str_repr
class Document(BaseMedia):
"""Class for storing a piece of text and associated metadata. """Class for storing a piece of text and associated metadata.
Example: Example:
@ -21,27 +259,15 @@ class Document(Serializable):
) )
""" """
# The ID field is optional at the moment.
# It will likely become required in a future major release after
# it has been adopted by enough vectorstore implementations.
id: Optional[str] = None
"""An optional identifier for the document.
Ideally this should be unique across the document collection and formatted
as a UUID, but this will not be enforced.
"""
page_content: str page_content: str
"""String text.""" """String text."""
metadata: dict = Field(default_factory=dict)
"""Arbitrary metadata about the page content (e.g., source, relationships to other
documents, etc.).
"""
type: Literal["Document"] = "Document" type: Literal["Document"] = "Document"
def __init__(self, page_content: str, **kwargs: Any) -> None: def __init__(self, page_content: str, **kwargs: Any) -> None:
"""Pass page_content in as positional or named arg.""" """Pass page_content in as positional or named arg."""
super().__init__(page_content=page_content, **kwargs) # my-py is complaining that page_content is not defined on the base class.
# Here, we're relying on pydantic base class to handle the validation.
super().__init__(page_content=page_content, **kwargs) # type: ignore[call-arg]
@classmethod @classmethod
def is_lc_serializable(cls) -> bool: def is_lc_serializable(cls) -> bool:

View File

@ -4,8 +4,8 @@ from typing import Iterator, List
import pytest import pytest
from langchain_core.document_loaders.base import BaseBlobParser, BaseLoader from langchain_core.document_loaders.base import BaseBlobParser, BaseLoader
from langchain_core.document_loaders.blob_loaders import Blob
from langchain_core.documents import Document from langchain_core.documents import Document
from langchain_core.documents.base import Blob
def test_base_blob_parser() -> None: def test_base_blob_parser() -> None:

View File

@ -16,5 +16,5 @@ def test_repr() -> None:
) )
assert ( assert (
repr(Document(page_content="Hello, World!", metadata={"a": 3})) repr(Document(page_content="Hello, World!", metadata={"a": 3}))
== "Document(page_content='Hello, World!', metadata={'a': 3})" == "Document(metadata={'a': 3}, page_content='Hello, World!')"
) )